├── assets
│   └── hedge.png
├── .gitignore
├── localbuild.sh
├── testdata
│   └── emuddl.sql
├── Dockerfile
├── LICENSE
├── .github
│   └── workflows
│       └── main.yml
├── hedge_test.go
├── deployment.yaml
├── go.mod
├── service.go
├── protocol.go
├── README.md
├── semaphore.go
├── example
│   └── demo
│       └── main.go
├── sos.go
└── hedge.go
/assets/hedge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flowerinthenight/hedge/HEAD/assets/hedge.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.exe
2 | *.exe~
3 | *.dll
4 | *.so
5 | *.dylib
6 | *.test
7 | *.out
8 | deployment-private.yaml
9 | example/demo/demo
10 | readdata
11 | readlocs
--------------------------------------------------------------------------------
/localbuild.sh:
--------------------------------------------------------------------------------
1 | kubectl delete -f deployment.yaml
2 | DOCKER_BUILDKIT=0 docker build --rm -t demo .
3 | DOCKER_BUILDKIT=0 docker tag demo asia.gcr.io/mobingi-main/hedge:$1
4 | DOCKER_BUILDKIT=0 docker push asia.gcr.io/mobingi-main/hedge:$1
5 | DOCKER_BUILDKIT=0 docker rmi $(docker images --filter "dangling=true" -q --no-trunc) -f
6 | [ -f deployment-private.yaml ] && sed -i -e 's/image\:\ asia.gcr.io\/mobingi\-main\/hedge[\:@].*$/image\:\ asia.gcr.io\/mobingi\-main\/hedge\:'$1'/g' deployment-private.yaml
7 |
--------------------------------------------------------------------------------
/testdata/emuddl.sql:
--------------------------------------------------------------------------------
1 | -- for spindle
2 | CREATE TABLE locktable (
3 | name STRING(MAX) NOT NULL,
4 | heartbeat TIMESTAMP OPTIONS (allow_commit_timestamp=true),
5 | token TIMESTAMP OPTIONS (allow_commit_timestamp=true),
6 | writer STRING(MAX)
7 | ) PRIMARY KEY (name);
8 |
9 | -- for hedge
10 | CREATE TABLE logtable (
11 | id STRING(MAX),
12 | key STRING(MAX),
13 | value STRING(MAX),
14 | leader STRING(MAX),
15 | timestamp TIMESTAMP OPTIONS (allow_commit_timestamp=true)
16 | ) PRIMARY KEY (key, id)
17 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM golang:1.25.1-trixie
2 | COPY . /go/src/github.com/flowerinthenight/hedge/
3 | WORKDIR /go/src/github.com/flowerinthenight/hedge/example/demo/
4 | RUN CGO_ENABLED=0 GOOS=linux go build -v -trimpath -installsuffix cgo -o hedge .
5 |
6 | FROM debian:stable-slim
7 | RUN set -x && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/*
8 | WORKDIR /app/
9 | COPY --from=0 /go/src/github.com/flowerinthenight/hedge/example/demo/hedge .
10 | ENTRYPOINT ["/app/hedge"]
11 | CMD ["-db=projects/{project}/instances/{instance}/databases/{database}"]
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 flowerinthenight
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: main
2 |
3 | on:
4 | push:
5 | branches: [ main ]
6 | tags:
7 | - '*'
8 | pull_request:
9 | branches: [ main ]
10 |
11 | jobs:
12 | codeberg:
13 | name: Codeberg
14 | runs-on: ubuntu-latest
15 | steps:
16 | - name: Checkout code
17 | uses: actions/checkout@v4
18 | with:
19 | fetch-depth: 0
20 |
21 | - name: Mirror to Codeberg
22 | uses: yesolutions/mirror-action@master
23 | with:
24 | REMOTE: "https://codeberg.org/flowerinthenight/hedge.git"
25 | GIT_USERNAME: flowerinthenight
26 | GIT_PASSWORD: ${{ secrets.GIT_PASSWORD }}
27 |
28 | build:
29 | name: Build
30 | if: "!contains(github.event.commits[0].message, 'ci skip')"
31 | runs-on: ubuntu-latest
32 | services:
33 | emulator:
34 | image: gcr.io/cloud-spanner-emulator/emulator
35 | ports:
36 | - 9010:9010
37 | - 9020:9020
38 |
39 | steps:
40 | - uses: actions/checkout@v4
41 |
42 | - uses: actions/setup-go@v4
43 | with:
44 | go-version: '1.25'
45 |
46 | - name: 'Test using emulator'
47 | run: |
48 | curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-536.0.1-linux-x86_64.tar.gz
49 | tar xvzf google-cloud-sdk-536.0.1-linux-x86_64.tar.gz && ./google-cloud-sdk/install.sh --quiet
50 | gcloud config configurations create emulator
51 | gcloud config set auth/disable_credentials true
52 | gcloud config set project test-project
53 | gcloud config set api_endpoint_overrides/spanner http://localhost:9020/
54 | gcloud spanner instances create test-instance --config=emulator-config --description="Test Instance" --nodes=1
55 | export SPANNER_EMULATOR_HOST=localhost:9010
56 | gcloud spanner databases create testdb --instance=test-instance --ddl-file=$PWD/testdata/emuddl.sql
57 | go test -v -run TestBasic
58 |
--------------------------------------------------------------------------------
/hedge_test.go:
--------------------------------------------------------------------------------
1 | package hedge
2 |
3 | import (
4 | "context"
5 | "log/slog"
6 | "testing"
7 | "time"
8 |
9 | "cloud.google.com/go/spanner"
10 | gaxv2 "github.com/googleapis/gax-go/v2"
11 | )
12 |
13 | const (
14 | db = "projects/test-project/instances/test-instance/databases/testdb"
15 | )
16 |
17 | func TestAny(t *testing.T) {
18 | // TODO:
19 | d := []byte{}
20 | d = append(d, []byte("hello")...)
21 | d = append(d, []byte("world")...)
22 | d = append(d, []byte("stats")...)
23 | d = append(d, []byte("one")...)
24 | slog.Info("next:", "val", d[0:5])
25 | slog.Info("next:", "val", d[5:10])
26 | slog.Info("next:", "val", d[10:15])
27 | slog.Info("next:", "val", d[15:20])
28 | }
29 |
30 | func TestBasic(t *testing.T) {
31 | ctx := context.Background()
32 | client, err := spanner.NewClient(ctx, db)
33 | if err != nil {
34 | t.Error(err)
35 | return
36 | }
37 |
38 | defer client.Close()
39 | op := New(client, ":8080", "locktable", "mylock", "logtable",
40 | WithLeaderHandler(
41 | nil,
42 | func(data any, msg []byte) ([]byte, error) {
43 | t.Log("[send] received:", string(msg))
44 | return []byte("send " + string(msg)), nil
45 | },
46 | ),
47 | WithBroadcastHandler(
48 | nil,
49 | func(data any, msg []byte) ([]byte, error) {
50 | t.Log("[broadcast/semaphore] received:", string(msg))
51 | return nil, nil
52 | },
53 | ),
54 | )
55 |
56 | done := make(chan error, 1)
57 | quit, cancel := context.WithCancel(ctx)
58 | go op.Run(quit, done)
59 |
60 | var cnt int
61 | bo := gaxv2.Backoff{
62 | Initial: time.Second,
63 | Max: time.Second * 30,
64 | Multiplier: 2,
65 | }
66 |
 67 | 	for {
 68 | 		cnt++
 69 | 		locked, _ := op.HasLock()
 70 | 		switch {
 71 | 		case locked:
 72 | 			t.Log("got lock")
 73 | 		default:
 74 | 			if cnt >= 10 {
 75 | 				t.Fatal("can't get lock") // give up after 10 attempts
 76 | 			}
 77 | 
 78 | 			t.Log("didn't get lock, retry")
 79 | 			time.Sleep(bo.Pause())
 80 | 			continue
 81 | 		}
 82 | 
 83 | 		// Lock acquired.
 84 | 		break
 85 | 	}
86 |
87 | cancel()
88 | <-done
89 | }
90 |
--------------------------------------------------------------------------------
/deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 | name: hedgedemo
5 | # TODO: (optional) Update {project} to your project id.
6 | annotations:
7 | iam.gke.io/gcp-service-account: hedgedemo@{project}.iam.gserviceaccount.com
8 |
9 | ---
10 |
11 | apiVersion: apps/v1
12 | kind: Deployment
13 | metadata:
14 | name: hedgedemo
15 | spec:
16 | selector:
17 | matchLabels:
18 | app: hedgedemo
19 | replicas: 3
20 | revisionHistoryLimit: 5
21 | template:
22 | metadata:
23 | labels:
24 | app: hedgedemo
25 | spec:
26 | # This sample uses GKE's Workload Identity to authenticate against GCP services.
 27 |       # The 'hedgedemo' service account here is mapped to a GCP IAM service account
28 | # that has access to Spanner and PubSub.
29 | # If you're not using Workload Identity, you can also use a service account key
30 | # and set the GOOGLE_APPLICATION_CREDENTIALS environment variable.
31 | serviceAccountName: hedgedemo
32 | initContainers:
33 | - image: gcr.io/google.com/cloudsdktool/cloud-sdk:363.0.0-alpine
34 | name: workload-identity-initcontainer
35 | command:
36 | - '/bin/bash'
37 | - '-c'
38 | - |
39 | curl -s -H 'Metadata-Flavor: Google' 'http://169.254.169.254/computeMetadata/v1/instance/service-accounts/default/token' --retry 30 --retry-connrefused --retry-max-time 30 > /dev/null || exit 1
40 | containers:
41 | - name: hedgedemo
42 | image: quay.io/flowerinthenight/hedgedemo:v1.10.5
43 | imagePullPolicy: Always
44 | # TODO: Update as needed.
45 | args: ["-db=projects/{project}/instances/{instance}/databases/{database}"]
46 | env:
47 | # Downward value to get pod IP. We'll use this as our hedge instance id.
48 | - name: K8S_POD_IP
49 | valueFrom:
50 | fieldRef:
51 | fieldPath: status.podIP
52 | - name: GET_HOSTS_FROM
53 | value: dns
54 | # - name: GOOGLE_APPLICATION_CREDENTIALS
55 | # value: /etc/svcacct.json
56 | ports:
57 | - containerPort: 8080
58 | # Uncomment the section below and the env variable above to use GOOGLE_APPLICATION_CREDENTIALS
59 | # for authentication. You can upload a service account JSON file thru:
60 | # kubectl create secret generic hedgedemo-keyfile --from-file svcacct.json
61 | #
62 | # volumeMounts:
63 | # - name: keyfile
64 | # mountPath: "/etc/hedgedemo"
65 | # readOnly: true
66 | # volumes:
67 | # - name: keyfile
68 | # secret:
69 | # secretName: hedgedemo-keyfile
70 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/flowerinthenight/hedge/v2
2 |
3 | go 1.24.0
4 |
5 | require (
6 | cloud.google.com/go/spanner v1.85.0
7 | github.com/apache/arrow/go/v17 v17.0.0
8 | github.com/cespare/xxhash/v2 v2.3.0
9 | github.com/flowerinthenight/hedge-proto v0.1.0
10 | github.com/flowerinthenight/spindle/v2 v2.2.0
11 | github.com/google/uuid v1.6.0
12 | github.com/googleapis/gax-go/v2 v2.15.0
13 | github.com/hashicorp/memberlist v0.5.1
14 | github.com/shirou/gopsutil/v4 v4.24.9
15 | golang.org/x/exp v0.0.0-20250128182459-e0ece0dbea4c
16 | golang.org/x/sync v0.16.0
17 | google.golang.org/api v0.248.0
18 | google.golang.org/grpc v1.75.0
19 | )
20 |
21 | require (
22 | cel.dev/expr v0.24.0 // indirect
23 | cloud.google.com/go v0.122.0 // indirect
24 | cloud.google.com/go/auth v0.16.5 // indirect
25 | cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
26 | cloud.google.com/go/compute/metadata v0.8.0 // indirect
27 | cloud.google.com/go/monitoring v1.24.2 // indirect
28 | github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.3 // indirect
29 | github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.29.0 // indirect
30 | github.com/armon/go-metrics v0.4.1 // indirect
31 | github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 // indirect
32 | github.com/ebitengine/purego v0.8.0 // indirect
33 | github.com/envoyproxy/go-control-plane/envoy v1.32.4 // indirect
34 | github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
35 | github.com/felixge/httpsnoop v1.0.4 // indirect
36 | github.com/go-jose/go-jose/v4 v4.1.2 // indirect
37 | github.com/go-logr/logr v1.4.3 // indirect
38 | github.com/go-logr/stdr v1.2.2 // indirect
39 | github.com/go-ole/go-ole v1.2.6 // indirect
40 | github.com/goccy/go-json v0.10.5 // indirect
41 | github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
42 | github.com/google/btree v1.1.3 // indirect
43 | github.com/google/flatbuffers v25.1.24+incompatible // indirect
44 | github.com/google/s2a-go v0.1.9 // indirect
45 | github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect
46 | github.com/hashicorp/errwrap v1.1.0 // indirect
47 | github.com/hashicorp/go-immutable-radix v1.3.1 // indirect
48 | github.com/hashicorp/go-msgpack/v2 v2.1.2 // indirect
49 | github.com/hashicorp/go-multierror v1.1.1 // indirect
50 | github.com/hashicorp/go-sockaddr v1.0.7 // indirect
51 | github.com/hashicorp/go-uuid v1.0.2 // indirect
52 | github.com/hashicorp/golang-lru v1.0.2 // indirect
53 | github.com/klauspost/compress v1.17.11 // indirect
54 | github.com/klauspost/cpuid/v2 v2.2.9 // indirect
55 | github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
56 | github.com/miekg/dns v1.1.62 // indirect
57 | github.com/pierrec/lz4/v4 v4.1.22 // indirect
58 | github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
59 | github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
60 | github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect
61 | github.com/spiffe/go-spiffe/v2 v2.6.0 // indirect
62 | github.com/tklauser/go-sysconf v0.3.12 // indirect
63 | github.com/tklauser/numcpus v0.6.1 // indirect
64 | github.com/yusufpapurcu/wmi v1.2.4 // indirect
65 | github.com/zeebo/xxh3 v1.0.2 // indirect
66 | go.opencensus.io v0.24.0 // indirect
67 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect
68 | go.opentelemetry.io/contrib/detectors/gcp v1.38.0 // indirect
69 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 // indirect
70 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
71 | go.opentelemetry.io/otel v1.38.0 // indirect
72 | go.opentelemetry.io/otel/metric v1.38.0 // indirect
73 | go.opentelemetry.io/otel/sdk v1.38.0 // indirect
74 | go.opentelemetry.io/otel/sdk/metric v1.38.0 // indirect
75 | go.opentelemetry.io/otel/trace v1.38.0 // indirect
76 | golang.org/x/crypto v0.41.0 // indirect
77 | golang.org/x/mod v0.26.0 // indirect
78 | golang.org/x/net v0.43.0 // indirect
79 | golang.org/x/oauth2 v0.30.0 // indirect
80 | golang.org/x/sys v0.35.0 // indirect
81 | golang.org/x/text v0.28.0 // indirect
82 | golang.org/x/time v0.12.0 // indirect
83 | golang.org/x/tools v0.35.0 // indirect
84 | golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect
85 | google.golang.org/genproto v0.0.0-20250826171959-ef028d996bc1 // indirect
86 | google.golang.org/genproto/googleapis/api v0.0.0-20250826171959-ef028d996bc1 // indirect
87 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250826171959-ef028d996bc1 // indirect
88 | google.golang.org/protobuf v1.36.8 // indirect
89 | )
90 |
--------------------------------------------------------------------------------
/service.go:
--------------------------------------------------------------------------------
1 | package hedge
2 |
3 | import (
4 | "context"
5 | "io"
6 | "strconv"
7 |
8 | pb "github.com/flowerinthenight/hedge-proto"
9 | "golang.org/x/sync/errgroup"
10 | )
11 |
12 | type service struct {
13 | op *Op
14 |
15 | pb.UnimplementedHedgeServer
16 | }
17 |
18 | func (s *service) Send(hs pb.Hedge_SendServer) error {
19 | ctx := hs.Context()
20 | g := new(errgroup.Group)
21 | g.Go(func() error {
22 | for {
23 | select {
24 | case <-ctx.Done():
25 | return nil
26 | default:
27 | }
28 |
29 | in, err := hs.Recv()
30 | if err == io.EOF {
31 | return nil
32 | }
33 |
34 | if err != nil {
35 | s.op.logger.Println("Recv failed:", err)
36 | return err
37 | }
38 |
39 | s.op.leaderStreamIn <- &StreamMessage{
40 | Payload: in,
41 | }
42 | }
43 | })
44 |
45 | g.Go(func() error {
46 | for {
47 | select {
48 | case <-ctx.Done():
49 | return nil
50 | default:
51 | }
52 |
53 | out := <-s.op.leaderStreamOut
54 | if out == nil {
55 | return nil
56 | }
57 |
58 | hs.Send(out.Payload)
59 | }
60 | })
61 |
62 | return g.Wait()
63 | }
64 |
65 | func (s *service) Broadcast(hs pb.Hedge_BroadcastServer) error {
66 | ctx := hs.Context()
67 | g := new(errgroup.Group)
68 | g.Go(func() error {
69 | for {
70 | select {
71 | case <-ctx.Done():
72 | return nil
73 | default:
74 | }
75 |
76 | in, err := hs.Recv()
77 | if err == io.EOF {
78 | return nil
79 | }
80 |
81 | if err != nil {
82 | s.op.logger.Println("Recv failed:", err)
83 | return err
84 | }
85 |
86 | s.op.broadcastStreamIn <- &StreamMessage{
87 | Payload: in,
88 | }
89 | }
90 | })
91 |
92 | g.Go(func() error {
93 | for {
94 | select {
95 | case <-ctx.Done():
96 | return nil
97 | default:
98 | }
99 |
100 | out := <-s.op.broadcastStreamOut
101 | if out == nil {
102 | return nil
103 | }
104 |
105 | hs.Send(out.Payload)
106 | }
107 | })
108 |
109 | return g.Wait()
110 | }
111 |
112 | func (s *service) SoSWrite(hs pb.Hedge_SoSWriteServer) error {
113 | var err error
114 | ctx := hs.Context()
115 | var writer *Writer
116 |
117 | loop:
118 | for {
119 | select {
120 | case <-ctx.Done():
121 | err = ctx.Err()
122 | break loop
123 | default:
124 | }
125 |
126 | in, err := hs.Recv()
127 | if err == io.EOF {
128 | break
129 | }
130 |
131 | if err != nil {
132 | s.op.logger.Println("Recv failed:", err)
133 | break
134 | }
135 |
136 | name := in.Meta[metaName]
137 | if _, ok := s.op.soss[name]; !ok {
138 | mlimit, _ := strconv.ParseUint(in.Meta[metaMemLimit], 10, 64)
139 | dlimit, _ := strconv.ParseUint(in.Meta[metaDiskLimit], 10, 64)
140 | age, _ := strconv.ParseInt(in.Meta[metaExpire], 10, 64)
141 | s.op.soss[name] = s.op.NewSoS(name, &SoSOptions{
142 | MemLimit: mlimit,
143 | DiskLimit: dlimit,
144 | Expiration: age,
145 | })
146 | }
147 |
148 | if writer == nil {
149 | writer, _ = s.op.soss[name].Writer(&writerOptions{
150 | LocalOnly: true,
151 | })
152 | }
153 |
154 | writer.Write(in.Data)
155 | }
156 |
157 | if writer != nil {
158 | writer.Close()
159 | }
160 |
161 | return err
162 | }
163 |
164 | func (s *service) SoSRead(hs pb.Hedge_SoSReadServer) error {
165 | var err error
166 | in, err := hs.Recv()
167 | if err == io.EOF {
168 | return nil
169 | }
170 |
171 | if err != nil {
172 | s.op.logger.Println("Recv failed:", err)
173 | return nil
174 | }
175 |
176 | name := in.Meta[metaName]
177 | if _, ok := s.op.soss[name]; !ok {
178 | mlimit, _ := strconv.ParseUint(in.Meta[metaMemLimit], 10, 64)
179 | dlimit, _ := strconv.ParseUint(in.Meta[metaDiskLimit], 10, 64)
180 | age, _ := strconv.ParseInt(in.Meta[metaExpire], 10, 64)
181 | s.op.soss[name] = s.op.NewSoS(name, &SoSOptions{
182 | MemLimit: mlimit,
183 | DiskLimit: dlimit,
184 | Expiration: age,
185 | })
186 | }
187 |
188 | reader, _ := s.op.soss[name].Reader(&readerOptions{LocalOnly: true})
189 | out := make(chan []byte)
190 | eg := new(errgroup.Group)
191 | eg.Go(func() error {
192 | for d := range out {
193 | err = hs.Send(&pb.Payload{Data: d})
194 | if err != nil {
195 | s.op.logger.Println("Send failed:", err)
196 | }
197 | }
198 |
199 | return nil
200 | })
201 |
202 | reader.Read(out)
203 | eg.Wait()
204 |
205 | if reader != nil {
206 | reader.Close()
207 | }
208 |
209 | return nil
210 | }
211 |
212 | func (s *service) SoSClose(ctx context.Context, in *pb.Payload) (*pb.Payload, error) {
213 | name := in.Meta[metaName]
214 | s.op.soss[name].Close()
215 | return &pb.Payload{}, nil
216 | }
217 |
--------------------------------------------------------------------------------
/protocol.go:
--------------------------------------------------------------------------------
1 | package hedge
2 |
3 | import (
4 | "context"
5 | "encoding/base64"
6 | "encoding/json"
7 | "fmt"
8 | "net"
9 | "strconv"
10 | "strings"
11 | )
12 |
13 | func doConfirmLeader(ctx context.Context, op *Op, conn net.Conn, _ string) {
14 | var sb strings.Builder
15 | sb.WriteString(op.buildAckReply(nil))
16 | if hl, _ := op.HasLock(); !hl {
17 | sb.Reset()
18 | sb.WriteString("\n")
19 | }
20 |
21 | b := []byte(sb.String())
22 | conn.Write(b)
23 | }
24 |
25 | func doWrite(ctx context.Context, op *Op, conn net.Conn, msg string) {
26 | var sb strings.Builder
27 | if hl, _ := op.HasLock(); hl {
28 | ss := strings.Split(msg, " ")
29 | payload := ss[1]
30 | var noappend bool
31 | if len(ss) >= 3 {
32 | if ss[2] == FlagNoAppend {
33 | noappend = true
34 | }
35 | }
36 |
37 | decoded, _ := base64.StdEncoding.DecodeString(payload)
38 | var kv KeyValue
39 | err := json.Unmarshal(decoded, &kv)
40 | if err != nil {
41 | sb.WriteString(op.buildAckReply(err))
42 | } else {
43 | sb.WriteString(op.buildAckReply(op.Put(ctx, kv, PutOptions{
44 | DirectWrite: true,
45 | NoAppend: noappend,
46 | })))
47 | }
48 | } else {
49 | sb.WriteString("\n") // not leader, possible even if previously confirmed
50 | }
51 |
52 | b := []byte(sb.String())
53 | conn.Write(b)
54 | }
55 |
56 | func doSend(ctx context.Context, op *Op, conn net.Conn, msg string) {
57 | var sb strings.Builder
58 | serr := base64.StdEncoding.EncodeToString([]byte(ErrNoLeader.Error()))
59 | fmt.Fprintf(&sb, "%s\n", serr)
60 | if hl, _ := op.HasLock(); hl {
61 | sb.Reset()
62 | serr := base64.StdEncoding.EncodeToString([]byte(ErrNoHandler.Error()))
63 | fmt.Fprintf(&sb, "%s\n", serr)
64 | if op.fnLeader != nil {
65 | payload := strings.Split(msg, " ")[1]
66 | decoded, _ := base64.StdEncoding.DecodeString(payload)
67 | data := op.fnLdrData
68 | if data == nil {
69 | data = op
70 | }
71 |
72 | r, e := op.fnLeader(data, decoded) // call leader handler
73 | if e != nil {
74 | sb.Reset()
75 | serr := base64.StdEncoding.EncodeToString([]byte(e.Error()))
76 | fmt.Fprintf(&sb, "%s\n", serr)
77 | } else {
78 | br := base64.StdEncoding.EncodeToString([]byte(""))
79 | if r != nil {
80 | br = base64.StdEncoding.EncodeToString(r)
81 | }
82 |
83 | sb.Reset()
84 | fmt.Fprintf(&sb, "%s %s\n", CmdAck, br)
85 | }
86 | }
87 | }
88 |
89 | b := []byte(sb.String())
90 | conn.Write(b)
91 | }
92 |
93 | func doBroadcast(ctx context.Context, op *Op, conn net.Conn, msg string) {
94 | var sb strings.Builder
95 | serr := base64.StdEncoding.EncodeToString([]byte(ErrNoHandler.Error()))
96 | fmt.Fprintf(&sb, "%s\n", serr)
97 | if op.fnBroadcast != nil {
98 | payload := strings.Split(msg, " ")[1]
99 | decoded, _ := base64.StdEncoding.DecodeString(payload)
100 | data := op.fnBcData
101 | if data == nil {
102 | data = op
103 | }
104 |
105 | r, e := op.fnBroadcast(data, decoded) // call broadcast handler
106 | if e != nil {
107 | sb.Reset()
108 | serr := base64.StdEncoding.EncodeToString([]byte(e.Error()))
109 | fmt.Fprintf(&sb, "%s\n", serr)
110 | } else {
111 | br := base64.StdEncoding.EncodeToString([]byte(""))
112 | if r != nil {
113 | br = base64.StdEncoding.EncodeToString(r)
114 | }
115 |
116 | sb.Reset()
117 | fmt.Fprintf(&sb, "%s %s\n", CmdAck, br)
118 | }
119 | }
120 |
121 | b := []byte(sb.String())
122 | conn.Write(b)
123 | }
124 |
125 | func doHeartbeat(ctx context.Context, op *Op, conn net.Conn, msg string) {
126 | var sb strings.Builder
127 | oldallm := op.getMembers()
128 | op.addMember(strings.Split(msg, " ")[1])
129 | fmt.Fprintf(&sb, "%s\n", op.encodeMembers())
130 | conn.Write([]byte(sb.String()))
131 | newallm := op.getMembers()
132 | if len(oldallm) != len(newallm) && op.fnMemberChanged != nil {
133 | diff := len(newallm) - len(oldallm)
134 | op.fnMemberChanged(op.fnMemChangedData, []byte(fmt.Sprintf("%v", diff)))
135 | }
136 | }
137 |
138 | func doMembers(ctx context.Context, op *Op, conn net.Conn, msg string) {
139 | payload := strings.Split(msg, " ")[1]
140 | decoded, _ := base64.StdEncoding.DecodeString(payload)
141 | var m map[string]struct{}
142 | json.Unmarshal(decoded, &m)
143 | m[op.hostPort] = struct{}{} // just to be sure
144 | op.setMembers(m) // then replace my records
145 | members := op.getMembers()
146 | mlist := []string{}
147 | for k := range members {
148 | mlist = append(mlist, k)
149 | }
150 |
151 | op.logger.Printf("%v member(s) tracked", len(op.getMembers()))
152 | reply := op.buildAckReply(nil)
153 | conn.Write([]byte(reply))
154 | }
155 |
156 | func doCreateSemaphore(ctx context.Context, op *Op, conn net.Conn, msg string) {
157 | reply := op.buildAckReply(nil)
158 | func() {
159 | op.mtxSem.Lock()
160 | defer op.mtxSem.Unlock()
161 | ss := strings.Split(msg, " ")
162 | name, slimit, caller := ss[1], ss[2], ss[3]
163 | limit, err := strconv.Atoi(slimit)
164 | if err != nil {
165 | reply = op.buildAckReply(err)
166 | return
167 | }
168 |
169 | // See if this semaphore already exists.
170 | s, err := readSemaphoreEntry(ctx, op, name)
171 | if err != nil {
172 | err = createSemaphoreEntry(ctx, op, name, caller, limit)
173 | if err != nil {
174 | reply = op.buildAckReply(err)
175 | return
176 | }
177 |
178 | // Read again after create.
179 | s, err = readSemaphoreEntry(ctx, op, name)
180 | if err != nil {
181 | reply = op.buildAckReply(err)
182 | return
183 | }
184 | }
185 |
186 | slmt, _ := strconv.Atoi(strings.Split(s.Id, "=")[1])
187 | if slmt != limit {
188 | err = fmt.Errorf("semaphore already exists with a different limit")
189 | reply = op.buildAckReply(err)
190 | return
191 | }
192 | }()
193 |
194 | b := []byte(reply)
195 | conn.Write(b)
196 | }
197 |
198 | func doAcquireSemaphore(ctx context.Context, op *Op, conn net.Conn, msg string) {
199 | reply := op.buildAckReply(nil)
200 | func() {
201 | op.mtxSem.Lock()
202 | defer op.mtxSem.Unlock()
203 | ss := strings.Split(msg, " ")
204 | name, caller := ss[1], ss[2]
205 | go ensureLiveness(ctx, op)
206 | op.ensureCh <- name
207 | s, err := readSemaphoreEntry(ctx, op, name) // to get the current limit
208 | if err != nil {
209 | err = fmt.Errorf("0:%v", err) // final
210 | reply = op.buildAckReply(err)
211 | return
212 | }
213 |
214 | limit, _ := strconv.Atoi(strings.Split(s.Id, "=")[1])
215 | retry, err := createAcquireSemaphoreEntry(ctx, op, name, caller, limit)
216 | if err != nil {
217 | switch {
218 | case retry:
219 | err = fmt.Errorf("1:%v", err) // can retry
220 | default:
221 | err = fmt.Errorf("0:%v", err) // final
222 | }
223 |
224 | reply = op.buildAckReply(err)
225 | return
226 | }
227 | }()
228 |
229 | b := []byte(reply)
230 | conn.Write(b)
231 | }
232 |
233 | func doReleaseSemaphore(ctx context.Context, op *Op, conn net.Conn, msg string) {
234 | reply := op.buildAckReply(nil)
235 | func() {
236 | op.mtxSem.Lock()
237 | defer op.mtxSem.Unlock()
238 | ss := strings.Split(msg, " ")
239 | name, caller := ss[1], ss[2]
240 | s, err := readSemaphoreEntry(ctx, op, name) // to get the current limit
241 | if err != nil {
242 | reply = op.buildAckReply(err)
243 | return
244 | }
245 |
246 | limit, _ := strconv.Atoi(strings.Split(s.Id, "=")[1])
247 | err = releaseSemaphore(ctx, op, name, caller, s.Value, limit)
248 | if err != nil {
249 | reply = op.buildAckReply(err)
250 | return
251 | }
252 | }()
253 |
254 | b := []byte(reply)
255 | conn.Write(b)
256 | }
257 |
258 | func handleMsg(ctx context.Context, op *Op, conn net.Conn) {
259 | defer conn.Close()
260 | fns := map[string]func(ctx context.Context, op *Op, conn net.Conn, msg string){
261 | CmdLeader: doConfirmLeader, // confirm leader only
262 | CmdWrite + " ": doWrite, // actual write
263 | CmdSend + " ": doSend, // Send() API
264 | CmdBroadcast + " ": doBroadcast, // Broadcast() API
265 | CmdPing + " ": doHeartbeat, // heartbeat
266 | CmdMembers + " ": doMembers, // broadcast online members
267 | CmdSemaphore + " ": doCreateSemaphore, // create semaphore (we are leader)
268 | CmdSemAcquire + " ": doAcquireSemaphore, // acquire semaphore (we are leader)
269 | CmdSemRelease + " ": doReleaseSemaphore, // release semaphore (we are leader)
270 | }
271 |
272 | addSpace := func(s string) string {
273 | var sb strings.Builder
274 | fmt.Fprintf(&sb, "%s ", s)
275 | return sb.String()
276 | }
277 |
278 | for {
279 | var prefix string
280 | msg, err := op.recv(conn)
281 | if err != nil || ctx.Err() != nil {
282 | return
283 | }
284 |
285 | switch {
286 | case msg == CmdPing: // leader asking if we are online (msg has no prefix)
287 | reply := op.buildAckReply(nil)
288 | conn.Write([]byte(reply))
289 | return
290 | case strings.HasPrefix(msg, CmdLeader):
291 | prefix = CmdLeader
292 | case strings.HasPrefix(msg, addSpace(CmdWrite)):
293 | prefix = addSpace(CmdWrite)
294 | case strings.HasPrefix(msg, addSpace(CmdSend)):
295 | prefix = addSpace(CmdSend)
296 | case strings.HasPrefix(msg, addSpace(CmdBroadcast)):
297 | prefix = addSpace(CmdBroadcast)
298 | case strings.HasPrefix(msg, addSpace(CmdPing)):
299 | prefix = addSpace(CmdPing)
300 | case strings.HasPrefix(msg, addSpace(CmdMembers)):
301 | prefix = addSpace(CmdMembers)
302 | case strings.HasPrefix(msg, addSpace(CmdSemaphore)):
303 | prefix = addSpace(CmdSemaphore)
304 | case strings.HasPrefix(msg, addSpace(CmdSemAcquire)):
305 | prefix = addSpace(CmdSemAcquire)
306 | case strings.HasPrefix(msg, addSpace(CmdSemRelease)):
307 | prefix = addSpace(CmdSemRelease)
308 | default:
309 | return // do nothing
310 | }
311 |
312 | fns[prefix](ctx, op, conn, msg)
313 | }
314 | }
315 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://github.com/flowerinthenight/hedge/actions/workflows/main.yml)
2 | [](https://quay.io/repository/flowerinthenight/hedgedemo)
3 | [](https://pkg.go.dev/github.com/flowerinthenight/hedge)
4 |
5 | (This repo is mirrored to [https://codeberg.org/flowerinthenight/hedge](https://codeberg.org/flowerinthenight/hedge)).
6 |
7 | ## hedge
8 | A [Go](https://go.dev/) cluster membership management library built on [spindle](https://github.com/flowerinthenight/spindle) and [Cloud Spanner](https://cloud.google.com/spanner) that provides rudimentary distributed computing facilities to Kubernetes [Deployments](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/). Features include:
9 |
10 | * a consistent, append-only, Spanner-backed distributed key/value storage,
11 | * a distributed locking/leader election mechanism through spindle,
12 | * a simple member-to-leader communication channel,
13 | * a broadcast (send-to-all) mechanism,
14 | * memory spill-over, ad-hoc storage, and
15 | * a distributed semaphore.
16 |
17 | It also works even on single-pod deployments.
18 |
19 |
20 |
21 |
22 |
23 | **hedge** is heavily used in [Alphaus](https://www.alphaus.cloud/) production with services that scale from single digit pods to hundreds.
24 |
25 | Ports:
26 |
 27 | * [hedge-cb](https://github.com/flowerinthenight/hedge-cb) - a trimmed-down, AWS-native version for cluster membership; relies on [spindle-cb](https://github.com/flowerinthenight/spindle-cb).
 28 | * [hedge-rs](https://github.com/flowerinthenight/hedge-rs) - a trimmed-down version written in Rust.
29 |
30 | ## Why?
 31 | First, I wanted a cluster coordinator that can work within k8s Deployments as a library, not as an external service (like [ZooKeeper](https://zookeeper.apache.org/) or [etcd](https://etcd.io/)). So far, our efforts to make [Raft](https://raft.github.io/) play well, as a library, with bursty deployments that frequently scale up and down haven't produced something reliable yet (though we also have an ongoing multi-[Paxos](https://en.wikipedia.org/wiki/Paxos_(computer_science))-based experiment [here](https://github.com/alphauslabs/juno)). I also wanted an easily accessible storage that is a bit decoupled from the code (easier to query, edit, debug, back up, etc.). We are already heavy Spanner users, and [spindle](https://github.com/flowerinthenight/spindle/) has been in our production for many years now: these two should be able to do it, and StatefulSets or DaemonSets shouldn't be a requirement. Since then, additional features have been added, such as the `Send()` API.
32 |
33 | ## What does it do?
34 | Leader election is handled by [spindle](https://github.com/flowerinthenight/spindle). Two APIs are provided for storage: `Put()` and `Get()`. All pods can serve the `Get()` calls, while only the leader handles the `Put()` API. If a non-leader pod calls `Put()`, that call is forwarded to the leader, who will do the actual write. All `Put()`'s are append-only.
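
Something like the sketch below (abridged; the `KeyValue` fields and the exact `Put()`/`Get()` signatures here are assumptions based on the calls visible in `protocol.go` and the comments further down):

```go
// Minimal sketch; assumes op is a running *hedge.Op (see "How to use" below).
err := op.Put(ctx, hedge.KeyValue{
	Key:   "samplekey",
	Value: "samplevalue",
}) // if this pod isn't the leader, the write is forwarded to the leader

if err != nil {
	log.Println("Put failed:", err)
}

// Any pod can serve Get(); the returned slice shape is an assumption.
items, _ := op.Get(ctx, "samplekey")
for _, kv := range items {
	log.Println(kv.Key, kv.Value)
}
```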
35 |
36 | Spindle's `HasLock()` function is also available for distributed locking due to struct embedding, although you can use spindle separately for that, if you prefer.
37 |
38 | A `Send()` API is also provided for members to be able to send simple request/reply-type messages to the current leader at any time. A streaming equivalent (gRPC) is also available.
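
Something like (a sketch only; the exact `Send()` signature shown is an assumption):

```go
// Minimal sketch; assumes op is a running *hedge.Op. The signature shown,
// Send(ctx, []byte) ([]byte, error), is an assumption.
reply, err := op.Send(ctx, []byte("hello"))
if err != nil {
	log.Println("Send failed:", err)
	return
}

// The reply comes from the leader's WithLeaderHandler callback.
log.Println("leader replied:", string(reply))
```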
39 |
40 | A `Broadcast()` API is also available for all pods. Note that due to the nature of k8s deployments (pods come and go) and the internal heartbeat delays, some pods might not receive the broadcast message at call time, although all pods will have the complete broadcast target list eventually. Hedge uses a combination of heartbeats and broadcasts to propagate member information to all pods; non-leaders send liveness heartbeats to the leader while the leader broadcasts active members to all pods. A streaming equivalent (gRPC) is also available.
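
Something like the sketch below (the output type and its fields are assumptions; every pod, including the caller, handles the message in its `WithBroadcastHandler` callback):

```go
// Minimal sketch; assumes op is a running *hedge.Op and that Broadcast()
// returns one output per known member (the output fields are assumptions).
outs := op.Broadcast(ctx, []byte("hello-all"))
for _, out := range outs {
	if out.Error != nil {
		log.Println("broadcast to", out.Id, "failed:", out.Error)
		continue
	}

	log.Println(out.Id, "replied:", string(out.Reply))
}
```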
41 |
 42 | An experimental spill-over store (**SoS**) is also supported. It's currently used for bursty, quick load-process-discard types of data processing. A **SoS** in hedge is simply a combined memory area and disk area shared between pods. For example, a pod can define a SoS of 1GB memory and 1GB disk. If there are 100 pods running under hedge, that's a combined storage of (1GB + 1GB) * 100. During writes and subsequent reads, hedge handles the data distribution and assembly across local RAM, local disk, and the other pods. It uses [Arrow](https://arrow.apache.org/) and memory-mapped files as backing stores. You can check out this [blog post](https://flowerinthenight.com/blog/2024-07-24-spillover-store/) for more information.
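
Roughly, usage looks like the sketch below (heavily abridged; the `SoSOptions` fields mirror `service.go`, but the exported `Writer()`/`Reader()` signatures shown here are assumptions):

```go
// Minimal sketch; names other than NewSoS/SoSOptions are assumptions.
sos := op.NewSoS("scratch", &hedge.SoSOptions{
	MemLimit:  1 << 30, // local RAM budget (~1GB) before spilling over
	DiskLimit: 1 << 30, // local disk budget (~1GB)
})

writer, _ := sos.Writer()
writer.Write([]byte("some payload"))
writer.Close()

out := make(chan []byte)
done := make(chan struct{})
go func() {
	for d := range out { // assumption: Read() closes 'out' when done
		log.Println("got", len(d), "bytes")
	}

	close(done)
}()

reader, _ := sos.Reader()
reader.Read(out)
<-done
reader.Close()

sos.Close() // free the SoS once done
```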
43 |
44 | Finally, a distributed semaphore is also provided through the `NewSemaphore()`, `[Try]Acquire()`, and `Release()` APIs.
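
Something like (the calls mirror `semaphore.go` and the usage comments in "How to use" below):

```go
// Minimal sketch: a semaphore named "jobq" with at most 2 concurrent holders.
// "jobq" and the limit are just example values.
sem, err := op.NewSemaphore(ctx, "jobq", 2)
if err != nil {
	log.Println("NewSemaphore failed:", err)
	return
}

if err := sem.Acquire(ctx); err != nil { // or sem.TryAcquire(ctx) to fail fast
	log.Println("Acquire failed:", err)
	return
}

// ... do the limited-concurrency work here ...

sem.Release(ctx) // best-effort; the leader eventually cleans up dead holders
```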
45 |
46 | ## Prerequisites
47 | * All pods within a cluster should be able to contact each other via TCP (address:port).
 48 | * Each hedge instance's id should be set to the pod's cluster IP address:port. You can use the [downward API](https://kubernetes.io/docs/concepts/workloads/pods/downward-api/) to get the pod's IP address, or you can use the ":port" format, in which case the IP address will be resolved internally.
49 | * For now, spindle's lock table and hedge's log table are within the same database.
50 | * Tables for spindle and hedge need to be created beforehand. See [here](https://github.com/flowerinthenight/spindle#usage) for spindle's DDL. For hedge, see below:
51 |
52 | ```sql
53 | -- 'logtable' name is just an example
54 | CREATE TABLE logtable (
55 | id STRING(MAX),
56 | key STRING(MAX),
57 | value STRING(MAX),
58 | leader STRING(MAX),
 59 |     timestamp TIMESTAMP OPTIONS (allow_commit_timestamp=true)
60 | ) PRIMARY KEY (key, id)
61 | ```
62 |
63 | * This library will use the input key/value table (`logtable` in the example above) for its semaphore-related operations with the following reserved keywords:
64 | ```
65 | column=key, value=__hedge/semaphore/{name}
66 | column=key, value=__caller={ip:port}
67 | column=id, value=__hedge/semaphore/{name}
68 | column=id, value=limit={num}
69 | ```
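
For example, a semaphore created via `op.NewSemaphore(ctx, "jobq", 2)` and currently held by a pod at `10.0.0.12:8080` would produce rows roughly like the following (illustrative values only, following the format strings in `semaphore.go`):

```
-- the semaphore entry itself:
column=key, value=__hedge/semaphore/jobq
column=id, value=limit=2

-- one entry per current holder:
column=key, value=__caller=10.0.0.12:8080
column=id, value=__hedge/semaphore/jobq
```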
70 |
71 | ## How to use
72 | Something like:
73 | ```go
74 | ctx := context.Background()
75 | client, _ := spanner.NewClient(ctx, "your/spanner/database")
76 | defer client.Close()
77 |
78 | op := hedge.New(
79 | client,
80 | ":8080", // addr will be resolved internally
81 | "locktable",
82 | "myspindlelock",
83 | "logtable",
84 | hedge.WithLeaderHandler( // if leader only, handles Send()
85 | nil,
86 | func(data interface{}, msg []byte) ([]byte, error) {
87 | log.Println("[send] received:", string(msg))
88 | return []byte("hello " + string(msg)), nil
89 | },
90 | ),
91 | hedge.WithBroadcastHandler( // handles Broadcast()
92 | nil,
93 | func(data interface{}, msg []byte) ([]byte, error) {
94 | log.Println("[broadcast] received:", string(msg))
95 | return []byte("broadcast " + string(msg)), nil
96 | },
97 | ),
 98 | )
99 |
100 | ctx, cancel := context.WithCancel(ctx)
101 | done := make(chan error, 1) // optional wait
102 | go op.Run(ctx, done)
103 |
104 | // For storage, any pod should be able to call op.Put(...) or op.Get(...) here.
105 | //
106 | // Any pod can call HasLock() here at any given time to know whether they are
107 | // leader or not.
108 | //
109 | // hl, _ := op.HasLock()
110 | // if hl {
111 | // log.Println("leader here!")
112 | // }
113 | //
114 | // To send a message to the current leader, any pod can call op.Send(...) and
115 | // the leader will handle it through the WithLeaderHandler callback. A wrapper
116 | // SendToLeader() helper function is also available for calling op.Send() with
117 | // retries+backoff.
118 | //
119 | // For broadcast, any pod can call op.Broadcast(...) here which will be handled
120 | // by each pod's WithBroadcastHandler callback, including the caller.
121 | //
122 | // For distributed semaphore, any pod can call the following:
123 | //
124 | // sem, _ := op.NewSemaphore(ctx, "semaphore-name", 2)
125 | // sem.Acquire(ctx)
126 | // ...
127 | // sem.Release(ctx)
128 |
129 | cancel()
130 | <-done
131 | ```
132 |
 133 | A sample [deployment](./deployment.yaml) file for GKE is provided, although it needs a fair bit of editing (for auth) to be usable. It uses [Workload Identity Federation](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) for authentication, though you can update it to use other authentication methods as well. The service account needs to have Spanner permissions.
134 |
135 | Once deployed, you can do the following tests while checking the logs. We will use [kubepfm](https://github.com/flowerinthenight/kubepfm) to port-forward our test commands to the server.
136 |
137 | Test the `Put()` API:
138 |
139 | ```sh
140 | # Open a terminal and run:
141 | $ kubepfm --target deployment/hedgedemo:9090:9090
142 |
143 | # Open another terminal and run:
144 | $ curl localhost:9090/put -d "samplekey samplevalue"
145 |
146 | # To ensure a non-leader sender, you can also specify a
147 | # non-leader pod for the kubepfm command above:
148 | $ kubepfm --target hedgedemo-6b5bcd4998-n95n7:9090:9090
149 | ```
150 |
151 | Test the `Get()` API:
152 |
153 | ```sh
154 | # While kubepfm is running on a different terminal, run:
155 | $ curl localhost:9090/get -d "samplekey"
156 | ```
157 |
158 | Test the `Send()` API:
159 |
160 | ```sh
161 | # While kubepfm is running on a different terminal, run:
162 | $ curl localhost:9090/send -d "hello-world"
163 | ```
164 |
165 | Test the `Broadcast()` API:
166 |
167 | ```sh
168 | # While kubepfm is running on a different terminal, run:
169 | $ curl localhost:9090/broadcast -d "hello-all"
170 | ```
171 |
--------------------------------------------------------------------------------
/semaphore.go:
--------------------------------------------------------------------------------
1 | package hedge
2 |
3 | import (
4 | "context"
5 | "encoding/base64"
6 | "fmt"
7 | "net"
8 | "strings"
9 | "sync"
10 | "sync/atomic"
11 | "time"
12 |
13 | "cloud.google.com/go/spanner"
14 | "google.golang.org/api/iterator"
15 | )
16 |
17 | const (
18 | semNamef = "__hedge/semaphore/%v"
19 | semCallerf = "__caller=%v"
20 | semLimitf = "limit=%v"
21 | markDel = "delete-on-empty"
22 | )
23 |
24 | var (
25 | ErrSemFull = fmt.Errorf("hedge/semaphore: semaphore full")
26 | )
27 |
28 | // Semaphore represents a distributed semaphore object.
29 | type Semaphore struct {
30 | name string
31 | limit int
32 | op *Op
33 | }
34 |
 35 | // Acquire acquires a semaphore. This call blocks until the semaphore is acquired;
 36 | // it keeps retrying internally (potentially forever) until it succeeds, or until
 37 | // ctx expires or is cancelled.
38 | func (s *Semaphore) Acquire(ctx context.Context) error { return s.acquire(ctx, false) }
39 |
 40 | // TryAcquire is like Acquire() but does not block. It attempts to acquire the
 41 | // semaphore once and returns immediately on either success or failure, or when
 42 | // ctx expires or is cancelled.
43 | func (s *Semaphore) TryAcquire(ctx context.Context) error { return s.acquire(ctx, true) }
44 |
45 | func (s *Semaphore) acquire(ctx context.Context, noretry bool) error {
46 | subctx := context.WithValue(ctx, struct{}{}, nil)
47 | first := make(chan struct{}, 1)
48 | first <- struct{}{} // immediately the first time
49 | ticker := time.NewTicker(time.Second * 1)
50 | defer ticker.Stop()
51 |
52 | var active atomic.Int32
53 | acquire := func() (bool, error) { // true means okay to retry
54 | active.Store(1)
55 | defer active.Store(0)
56 | conn, err := s.op.getLeaderConn(ctx)
57 | if err != nil {
58 | return true, err
59 | }
60 |
61 | defer conn.Close()
62 | var sb strings.Builder
63 | fmt.Fprintf(&sb, "%s %s %s\n", CmdSemAcquire, s.name, s.op.hostPort)
64 | reply, err := s.op.send(conn, sb.String())
65 | if err != nil {
66 | return false, err
67 | }
68 |
69 | switch {
70 | case strings.HasPrefix(reply, CmdAck):
71 | ss := strings.Split(reply, " ")
72 | if len(ss) > 1 { // failed
73 | dec, _ := base64.StdEncoding.DecodeString(ss[1])
74 | switch {
75 | case strings.HasPrefix(string(dec), "0:"):
76 | serr := strings.Replace(string(dec), "0:", "", 1)
77 | return false, fmt.Errorf("%v", serr)
78 | case strings.HasPrefix(string(dec), "1:"):
79 | serr := strings.Replace(string(dec), "1:", "", 1)
80 | return true, fmt.Errorf("%v", serr)
81 | default: // shouldn't be the case, hopefully
82 | return false, fmt.Errorf("%v", string(dec))
83 | }
84 | }
85 | default:
86 | return false, ErrNotSupported
87 | }
88 |
89 | return false, nil
90 | }
91 |
92 | for {
93 | select {
94 | case <-subctx.Done():
95 | return context.Canceled
96 | case <-first:
97 | case <-ticker.C:
98 | }
99 |
100 | if active.Load() == 1 {
101 | continue
102 | }
103 |
104 | type acq_t struct {
105 | retry bool
106 | err error
107 | }
108 |
109 | ch := make(chan acq_t, 1)
110 | go func() {
111 | r, e := acquire()
112 | ch <- acq_t{r, e}
113 | }()
114 |
115 | ret := <-ch
116 | switch {
117 | case ret.err == nil:
118 | return nil
119 | default:
120 | if noretry {
121 | return ret.err
122 | } else {
123 | if ret.retry {
124 | continue
125 | } else {
126 | return ret.err
127 | }
128 | }
129 | }
130 | }
131 | }
132 |
 133 | // Release releases a semaphore. Although it is recommended to release all acquired semaphores,
 134 | // this is still a best-effort release since any caller could disappear/crash while holding a
 135 | // semaphore. To remedy this, the current leader will attempt to track all semaphore owners and
 136 | // remove the non-responsive ones after some delay. The downside of not releasing properly is
 137 | // that other acquirers will block a bit longer while the leader does the cleanup; releasing
 138 | // properly frees up the slot immediately so other acquirers don't have to wait as long.
139 | func (s *Semaphore) Release(ctx context.Context) error {
140 | conn, err := s.op.getLeaderConn(ctx)
141 | if err != nil {
142 | return err
143 | }
144 |
145 | defer conn.Close()
146 | var sb strings.Builder
147 | fmt.Fprintf(&sb, "%s %s %s\n", CmdSemRelease, s.name, s.op.hostPort)
148 | reply, err := s.op.send(conn, sb.String())
149 | if err != nil {
150 | return err
151 | }
152 |
153 | switch {
154 | case strings.HasPrefix(reply, CmdAck):
155 | ss := strings.Split(reply, " ")
156 | if len(ss) > 1 { // failed
157 | dec, _ := base64.StdEncoding.DecodeString(ss[1])
158 | return fmt.Errorf("%v", string(dec))
159 | }
160 | }
161 |
162 | return nil
163 | }
164 |
165 | // We will use the current logTable as our semaphore storage.
166 | // Naming convention(s):
167 | //
 168 | //	key="__hedge/semaphore/{name}", id="limit={v}", value={caller}
169 | func createSemaphoreEntry(ctx context.Context, op *Op, name, caller string, limit int) error {
170 | _, err := op.spannerClient.ReadWriteTransaction(ctx,
171 | func(ctx context.Context, txn *spanner.ReadWriteTransaction) error {
172 | var q strings.Builder
173 | fmt.Fprintf(&q, "insert %s ", op.logTable)
174 | fmt.Fprintf(&q, "(key, id, value, leader, timestamp) values (")
175 | fmt.Fprintf(&q, "'%s', ", fmt.Sprintf(semNamef, name))
176 | fmt.Fprintf(&q, "'%s', ", fmt.Sprintf(semLimitf, limit))
177 | fmt.Fprintf(&q, "'%s', ", caller)
178 | fmt.Fprintf(&q, "'%s', ", op.hostPort)
179 | fmt.Fprintf(&q, "PENDING_COMMIT_TIMESTAMP())")
180 |
181 | _, err := txn.Update(ctx, spanner.Statement{SQL: q.String()})
182 | return err
183 | },
184 | )
185 |
186 | return err
187 | }
188 |
189 | func readSemaphoreEntry(ctx context.Context, op *Op, name string) (*LogItem, error) {
190 | var q strings.Builder
191 | fmt.Fprintf(&q, "select key, id, value, leader, timestamp ")
192 | fmt.Fprintf(&q, "from %s ", op.logTable)
193 | fmt.Fprintf(&q, "where key = @name")
194 |
195 | stmt := spanner.Statement{
196 | SQL: q.String(),
197 | Params: map[string]any{
198 | "name": fmt.Sprintf(semNamef, name),
199 | },
200 | }
201 |
202 | iter := op.spannerClient.Single().Query(ctx, stmt)
203 | defer iter.Stop()
204 | for {
205 | row, err := iter.Next()
206 | if err == iterator.Done {
207 | break
208 | }
209 |
210 | if err != nil {
211 | return nil, err
212 | }
213 |
214 | var v LogItem
215 | err = row.ToStruct(&v)
216 | if err != nil {
217 | return nil, err
218 | }
219 |
220 | // Should only be one item.
221 | return &v, nil
222 | }
223 |
224 | return nil, fmt.Errorf("%v not found", name)
225 | }
226 |
227 | func createAcquireSemaphoreEntry(ctx context.Context, op *Op, name, caller string, limit int) (bool, error) {
228 | // First, see if caller already acquired this semaphore.
229 | var q strings.Builder
230 | fmt.Fprintf(&q, "select key, id ")
231 | fmt.Fprintf(&q, "from %s ", op.logTable)
232 | fmt.Fprintf(&q, "where key = @key and id = @id")
233 |
234 | stmt := spanner.Statement{
235 | SQL: q.String(),
236 | Params: map[string]any{
237 | "key": fmt.Sprintf(semCallerf, caller),
238 | "id": fmt.Sprintf(semNamef, name),
239 | },
240 | }
241 |
242 | var cnt int
243 | iter := op.spannerClient.Single().Query(ctx, stmt)
244 | defer iter.Stop()
245 | for {
246 | row, err := iter.Next()
247 | if err == iterator.Done || err != nil {
248 | break
249 | }
250 |
251 | var v LogItem
252 | err = row.ToStruct(&v)
253 | if err != nil {
254 | break
255 | }
256 |
257 | if v.Key != "" && v.Id != "" {
258 | cnt++
259 | }
260 | }
261 |
262 | if cnt > 0 {
263 | return false, fmt.Errorf("already acquired")
264 | }
265 |
266 | var free bool
267 | _, err := op.spannerClient.ReadWriteTransaction(ctx,
268 | func(ctx context.Context, txn *spanner.ReadWriteTransaction) error {
269 | getEntriesCount := func() int64 {
270 | var q strings.Builder
271 | fmt.Fprintf(&q, "select count(id) id ")
272 | fmt.Fprintf(&q, "from %s ", op.logTable)
273 | fmt.Fprintf(&q, "where id = @id")
274 |
275 | stmt := spanner.Statement{
276 | SQL: q.String(),
277 | Params: map[string]any{
278 | "id": fmt.Sprintf(semNamef, name),
279 | },
280 | }
281 |
282 | var cnt int64
283 | iter := txn.Query(ctx, stmt)
284 | defer iter.Stop()
285 | for {
286 | row, err := iter.Next()
287 | if err == iterator.Done || err != nil {
288 | break
289 | }
290 |
291 | if err := row.Columns(&cnt); err != nil {
292 | break
293 | }
294 | }
295 |
296 | return cnt
297 | }
298 |
299 | // Next, see if there is still semaphore space.
300 | free = getEntriesCount() < int64(limit)
301 | if !free {
302 | return ErrSemFull
303 | }
304 |
305 | // Finally, create the acquire semaphore entry.
306 | var q strings.Builder
307 | fmt.Fprintf(&q, "insert %s", op.logTable)
308 | fmt.Fprintf(&q, "(key, id, value, leader, timestamp) values (")
309 | fmt.Fprintf(&q, "'%s', ", fmt.Sprintf(semCallerf, caller))
310 | fmt.Fprintf(&q, "'%s', ", fmt.Sprintf(semNamef, name))
311 | fmt.Fprintf(&q, "'%s', ", caller)
312 | fmt.Fprintf(&q, "'%s', ", op.hostPort)
313 | fmt.Fprintf(&q, "PENDING_COMMIT_TIMESTAMP())")
314 |
315 | _, err := txn.Update(ctx, spanner.Statement{SQL: q.String()})
316 | if err != nil {
317 | return err
318 | }
319 |
320 | // Finally, we mark this semaphore as full (once). Will be used in release later.
321 | if getEntriesCount() >= int64(limit) {
322 | var q strings.Builder
323 | fmt.Fprintf(&q, "update %s ", op.logTable)
324 | fmt.Fprintf(&q, "set value = @val where key = @name")
325 |
326 | txn.Update(ctx, spanner.Statement{
327 | SQL: q.String(),
328 | Params: map[string]any{
329 | "val": markDel,
330 | "name": fmt.Sprintf(semNamef, name),
331 | },
332 | })
333 | }
334 |
335 | return nil
336 | },
337 | )
338 |
339 | switch {
340 | case err != nil && !free:
341 | return true, err
342 | default:
343 | return false, err
344 | }
345 | }
346 |
347 | func releaseSemaphore(ctx context.Context, op *Op, name, caller, value string, limit int) error {
348 | _ = limit
349 | _, err := op.spannerClient.ReadWriteTransaction(ctx,
350 | func(ctx context.Context, txn *spanner.ReadWriteTransaction) error {
351 | // First, attempt to remove the calling entry.
352 | var q strings.Builder
353 | fmt.Fprintf(&q, "delete from %s ", op.logTable)
354 | fmt.Fprintf(&q, "where key = @key and id = @id")
355 |
356 | txn.Update(ctx, spanner.Statement{ // best-effort, could fail
357 | SQL: q.String(),
358 | Params: map[string]any{
359 | "key": fmt.Sprintf(semCallerf, caller),
360 | "id": fmt.Sprintf(semNamef, name),
361 | },
362 | })
363 |
364 | // Next, see if there are no more entries.
365 | q.Reset()
366 | fmt.Fprintf(&q, "select count(id) id ")
367 | fmt.Fprintf(&q, "from %s ", op.logTable)
368 | fmt.Fprintf(&q, "where id = @id")
369 |
370 | stmt := spanner.Statement{
371 | SQL: q.String(),
372 | Params: map[string]any{"id": fmt.Sprintf(semNamef, name)},
373 | }
374 |
375 | var cnt int64
376 | iter := txn.Query(ctx, stmt)
377 | defer iter.Stop()
378 | for {
379 | row, err := iter.Next()
380 | if err == iterator.Done || err != nil {
381 | break
382 | }
383 |
384 | if err := row.Columns(&cnt); err != nil {
385 | break
386 | }
387 | }
388 |
389 | if cnt != 0 {
390 | return nil
391 | }
392 |
393 | if value == markDel {
394 | // Finally, if no more entries, let's remove the actual semaphore entry
395 | // so we can reuse this name, perhaps with a different limit.
396 | q.Reset()
397 | fmt.Fprintf(&q, "delete from %s ", op.logTable)
398 | fmt.Fprintf(&q, "where key = @key")
399 |
400 | txn.Update(ctx, spanner.Statement{
401 | SQL: q.String(),
402 | Params: map[string]any{"key": fmt.Sprintf(semNamef, name)},
403 | })
404 | }
405 |
406 | return nil
407 | },
408 | )
409 |
410 | return err
411 | }
412 |
413 | type ensureT struct {
414 | sync.Mutex
415 | m map[string]struct{}
416 | }
417 |
418 | func ensureLock() *ensureT { return &ensureT{m: make(map[string]struct{})} }
419 |
420 | func (e *ensureT) add(name string) {
421 | e.Lock()
422 | defer e.Unlock()
423 | e.m[name] = struct{}{}
424 | }
425 |
426 | func (e *ensureT) del(name string) {
427 | e.Lock()
428 | defer e.Unlock()
429 | delete(e.m, name)
430 | }
431 |
432 | func (e *ensureT) exists(name string) bool {
433 | e.Lock()
434 | defer e.Unlock()
435 | _, ok := e.m[name]
436 | return ok
437 | }
438 |
439 | // Triggered during semaphore acquisition; meaning, this is only called when we are leader.
440 | func ensureLiveness(ctx context.Context, op *Op) {
441 | if op.ensureOn.Load() == 1 {
442 | return // one checker per leader
443 | }
444 |
445 | op.ensureOn.Store(1)
446 | defer op.ensureOn.Store(0)
447 | op.ensureCtx, op.ensureCancel = context.WithCancel(ctx)
448 |
449 | enlock := ensureLock()
450 | ensure := func(name string) {
451 | enlock.add(name)
452 | defer enlock.del(name)
453 |
454 | var q strings.Builder
455 | fmt.Fprintf(&q, "select key from %s ", op.logTable)
456 | fmt.Fprintf(&q, "where id = @id")
457 |
458 | stmt := spanner.Statement{
459 | SQL: q.String(),
460 | Params: map[string]any{
461 | "id": fmt.Sprintf(semNamef, name),
462 | },
463 | }
464 |
465 | ids := []string{}
466 | iter := op.spannerClient.Single().Query(ctx, stmt)
467 | defer iter.Stop()
468 | for {
469 | row, err := iter.Next()
470 | if err == iterator.Done {
471 | break
472 | }
473 |
474 | if err != nil {
475 | break
476 | }
477 |
478 | var v LogItem
479 | err = row.ToStruct(&v)
480 | if err != nil {
481 | continue
482 | }
483 |
484 | ids = append(ids, v.Key)
485 | }
486 |
487 | if len(ids) > 0 {
488 | todel := make(chan string, len(ids))
489 | var w sync.WaitGroup
490 | for _, id := range ids {
491 | w.Add(1)
492 | go func(t string) {
493 | var rmId string
494 | defer func(rm *string) {
495 | todel <- *rm
496 | w.Done()
497 | }(&rmId)
498 |
499 | timeout := time.Second * 5
500 | caller := strings.Split(t, "=")[1]
501 | conn, err := net.DialTimeout("tcp", caller, timeout)
502 | if err != nil {
503 | rmId = t // delete this
504 | return
505 | }
506 |
507 | var sb strings.Builder
508 | fmt.Fprintf(&sb, "%s\n", CmdPing)
509 | r, err := op.send(conn, sb.String())
510 | if err != nil {
511 | rmId = t // delete this
512 | return
513 | }
514 |
515 | if r != CmdAck {
516 | rmId = t // delete this
517 | }
518 | }(id)
519 | }
520 |
521 | w.Wait()
522 | rms := []string{}
523 | for range ids {
524 | rm := <-todel
525 | if rm != "" {
526 | rms = append(rms, rm)
527 | }
528 | }
529 |
530 | if len(rms) > 0 {
531 | op.logger.Printf("[ensure/sem] delete: %v", rms)
532 | op.spannerClient.ReadWriteTransaction(ctx,
533 | func(ctx context.Context, txn *spanner.ReadWriteTransaction) error {
534 | q.Reset()
535 | fmt.Fprintf(&q, "delete from %s ", op.logTable)
536 | fmt.Fprintf(&q, "where key in ('%s')", strings.Join(rms, "','"))
537 | _, err := txn.Update(ctx, spanner.Statement{SQL: q.String()})
538 | return err
539 | },
540 | )
541 | }
542 |
543 | time.Sleep(time.Second * 5)
544 | }
545 | }
546 |
547 | for {
548 | var name string
549 | select {
550 | case <-op.ensureCtx.Done():
551 | op.ensureDone <- struct{}{}
552 | return
553 | case name = <-op.ensureCh:
554 | }
555 |
556 | if enlock.exists(name) {
557 | continue
558 | }
559 |
560 | go ensure(name)
561 | }
562 | }
563 |
--------------------------------------------------------------------------------
/example/demo/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "flag"
7 | "fmt"
8 | "io"
9 | "log"
10 | "log/slog"
11 | "net/http"
12 | "os"
13 | "os/signal"
14 | "strconv"
15 | "strings"
16 | "sync"
17 | "syscall"
18 | "time"
19 |
20 | "cloud.google.com/go/spanner"
21 | pb "github.com/flowerinthenight/hedge-proto"
22 | "github.com/flowerinthenight/hedge/v2"
23 | "github.com/google/uuid"
24 | "golang.org/x/exp/mmap"
25 | "golang.org/x/sync/errgroup"
26 | )
27 |
28 | var (
29 | dbstr = flag.String("db", "", "fmt: projects/{v}/instances/{v}/databases/{v}")
30 | lockName = flag.String("lockname", "hedge-demo-group", "lock name, common to all instances")
31 | spindleTable = flag.String("spindletable", "testlease", "see https://github.com/flowerinthenight/spindle for more info")
32 | logTable = flag.String("logtable", "", "the table for our log data (optional)")
33 | )
34 |
35 | func main() {
36 | flag.Parse()
37 | ctx, cancel := context.WithCancel(context.Background())
38 | client, err := spanner.NewClient(ctx, *dbstr)
39 | if err != nil {
40 | slog.Error("NewClient failed:", "err", err)
41 | return
42 | }
43 |
44 | defer client.Close()
45 | ldrIn := make(chan *hedge.StreamMessage)
46 | ldrOut := make(chan *hedge.StreamMessage)
47 | go func(_ctx context.Context) {
48 | for {
49 | select {
50 | case <-_ctx.Done():
51 | return
52 | case m := <-ldrIn:
53 | b, _ := json.Marshal(m)
54 | slog.Info("input stream:", "val", string(b))
55 | ldrOut <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("one")}}
56 | ldrOut <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("two")}}
57 | ldrOut <- nil // end
58 | }
59 | }
60 | }(context.WithValue(ctx, struct{}{}, nil))
61 |
62 | bcastIn := make(chan *hedge.StreamMessage)
63 | bcastOut := make(chan *hedge.StreamMessage)
64 | host, _ := os.Hostname()
65 | go func(_ctx context.Context) {
66 | for {
67 | select {
68 | case <-_ctx.Done():
69 | return
70 | case m := <-bcastIn:
71 | slog.Info("input stream:", "val", string(m.Payload.Data))
72 | bcastOut <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("1_" + host)}}
73 | bcastOut <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("2_" + host)}}
74 | bcastOut <- nil // end
75 | }
76 | }
77 | }(context.WithValue(ctx, struct{}{}, nil))
78 |
79 | op := hedge.New(client, ":8080", *spindleTable, *lockName, *logTable,
80 | hedge.WithGroupSyncInterval(time.Second*5),
81 | hedge.WithLeaderCallback(nil, func(d any, m []byte) {
82 | log.Println("callback:", string(m))
83 | }),
84 | hedge.WithLeaderHandler(
 85 | 			nil, // nil here: the handler's 'data' arg will be the *Op object itself
86 | func(data any, msg []byte) ([]byte, error) {
87 | op := data.(*hedge.Op)
88 | hostname, _ := os.Hostname()
89 | name := fmt.Sprintf("%v/%v", hostname, op.Name())
90 | log.Println("[send] received:", string(msg))
91 | reply := fmt.Sprintf("leader [%v] received the message [%v] on %v",
92 | name, string(msg), time.Now().Format(time.RFC3339))
93 | return []byte(reply), nil
94 | },
95 | ),
96 | hedge.WithBroadcastHandler(
 97 | 			nil, // nil here: the handler's 'data' arg will be the *Op object itself
98 | func(data any, msg []byte) ([]byte, error) {
99 | op := data.(*hedge.Op)
100 | hostname, _ := os.Hostname()
101 | name := fmt.Sprintf("%v/%v", hostname, op.Name())
102 | log.Println("[broadcast] received:", string(msg))
103 | reply := fmt.Sprintf("node [%v] received the broadcast message [%v] on %v",
104 | name, string(msg), time.Now().Format(time.RFC3339))
105 | return []byte(reply), nil
106 |
107 | // log.Println("[broadcast/semaphore] received:", string(msg))
108 | // ss := strings.Split(string(msg), " ")
109 | // name, slimit := ss[0], ss[1]
110 | // limit, err := strconv.Atoi(slimit)
111 | // if err != nil {
112 | // log.Println("invalid limit:", err)
113 | // return nil, err
114 | // }
115 |
116 | // go func() {
117 | // op := data.(*hedge.Op)
118 | // min, max := 10, 30
119 | // tm := rand.Intn(max-min+1) + min
120 | // s, err := op.NewSemaphore(context.Background(), name, limit)
121 | // if err != nil {
122 | // log.Println("NewSemaphore failed:", err)
123 | // return
124 | // }
125 |
126 | // err = s.Acquire(context.Background())
127 | // if err != nil {
128 | // log.Println("Acquire failed:", err)
129 | // return
130 | // }
131 |
132 | // log.Printf("semaphore acquired! simulate work for %vs, id=%v", tm, op.HostPort())
133 | // time.Sleep(time.Second * time.Duration(tm))
134 |
135 | // log.Printf("release semaphore, id=%v", op.HostPort())
136 | // s.Release(context.Background())
137 | // }()
138 |
139 | // return nil, nil
140 | },
141 | ),
142 | hedge.WithLeaderStreamChannels(ldrIn, ldrOut),
143 | hedge.WithBroadcastStreamChannels(bcastIn, bcastOut),
144 | )
145 |
146 | log.Println(op)
147 | done := make(chan error, 1)
148 | go op.Run(ctx, done)
149 |
150 | mux := http.NewServeMux()
151 | mux.HandleFunc("/put", func(w http.ResponseWriter, r *http.Request) {
152 | hostname, _ := os.Hostname()
153 | var key, value string
154 |
155 | // For /put, we expect a fmt: "key value"
156 | b, _ := io.ReadAll(r.Body)
157 | defer r.Body.Close()
158 | if len(string(b)) > 0 {
159 | ss := strings.Split(string(b), " ")
160 | if len(ss) < 2 {
161 | w.Write([]byte("invalid msg format"))
162 | return
163 | }
164 |
165 | key = ss[0]
166 | value = strings.Join(ss[1:], " ")
167 | }
168 |
169 | if key == "" || value == "" {
170 | w.Write([]byte("invalid msg format"))
171 | return
172 | }
173 |
174 | err := op.Put(ctx, hedge.KeyValue{Key: key, Value: value})
175 | if err != nil {
176 | w.Write([]byte(err.Error()))
177 | return
178 | }
179 |
180 | out := fmt.Sprintf("put: sender=%v, key=%v, value=%v", hostname, key, value)
181 | w.Write([]byte(out))
182 | })
183 |
184 | mux.HandleFunc("/get", func(w http.ResponseWriter, r *http.Request) {
185 | hostname, _ := os.Hostname()
186 | b, _ := io.ReadAll(r.Body)
187 | defer r.Body.Close()
188 | v, err := op.Get(ctx, string(b))
189 | if err != nil {
190 | w.Write([]byte(err.Error()))
191 | return
192 | }
193 |
194 | out := fmt.Sprintf("get: sender=%v, key=%v, value=%+v", hostname, string(b), v)
195 | w.Write([]byte(out))
196 | })
197 |
198 | mux.HandleFunc("/send", func(w http.ResponseWriter, r *http.Request) {
199 | hostname, _ := os.Hostname()
200 | msg := "hello" // default
201 | b, _ := io.ReadAll(r.Body)
202 | defer r.Body.Close()
203 | if len(string(b)) > 0 {
204 | msg = string(b)
205 | }
206 |
207 | log.Printf("sending %q msg to leader...", msg)
208 | v, err := hedge.SendToLeader(context.Background(), op, []byte(msg))
209 | if err != nil {
210 | w.Write([]byte(err.Error()))
211 | return
212 | }
213 |
214 | log.Printf("reply: %v", string(v))
215 | out := fmt.Sprintf("sender=%v, reply=%v", hostname, string(v))
216 | w.Write([]byte(out))
217 | })
218 |
219 | mux.HandleFunc("/streamsend", func(w http.ResponseWriter, r *http.Request) {
220 | ret, err := op.StreamToLeader(context.Background())
221 | if err != nil {
222 | w.Write([]byte(err.Error()))
223 | return
224 | }
225 |
226 | ret.In <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("test")}}
227 | close(ret.In) // we're done with input
228 | for m := range ret.Out {
229 | slog.Info("reply:", "out", string(m.Payload.Data))
230 | }
231 |
232 | w.Write([]byte("OK"))
233 | })
234 |
235 | mux.HandleFunc("/broadcast", func(w http.ResponseWriter, r *http.Request) {
236 | hostname, _ := os.Hostname()
237 | msg := "hello" // default
238 | b, _ := io.ReadAll(r.Body)
239 | defer r.Body.Close()
240 | if len(string(b)) > 0 {
241 | msg = string(b)
242 | }
243 |
244 | outs := []string{}
245 | log.Printf("broadcast %q msg to all...", msg)
246 | stream := false
247 | if stream {
248 | ch := make(chan hedge.BroadcastOutput)
249 | go op.Broadcast(context.Background(), []byte(msg), hedge.BroadcastArgs{Out: ch})
250 | for v := range ch {
251 | if v.Error != nil {
252 | out := fmt.Sprintf("broadcast: sender=%v, reply=%v", hostname, v.Error.Error())
253 | outs = append(outs, out)
254 | } else {
255 | out := fmt.Sprintf("broadcast: sender=%v, reply=%v", hostname, string(v.Reply))
256 | outs = append(outs, out)
257 | }
258 | }
259 | } else {
260 | vv := op.Broadcast(context.Background(), []byte(msg))
261 | for _, v := range vv {
262 | if v.Error != nil {
263 | out := fmt.Sprintf("broadcast: sender=%v, reply=%v", hostname, v.Error.Error())
264 | outs = append(outs, out)
265 | } else {
266 | out := fmt.Sprintf("broadcast: sender=%v, reply=%v", hostname, string(v.Reply))
267 | outs = append(outs, out)
268 | }
269 | }
270 | }
271 |
272 | w.Write([]byte(strings.Join(outs, "\n")))
273 | })
274 |
275 | mux.HandleFunc("/streambroadcast", func(w http.ResponseWriter, r *http.Request) {
276 | ret, err := op.StreamBroadcast(context.Background())
277 | if err != nil {
278 | w.Write([]byte(err.Error()))
279 | return
280 | }
281 |
282 | ret.In <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("test")}}
283 | close(ret.In) // we're done with input
284 |
285 | var wg sync.WaitGroup
286 | for k, v := range ret.Outs {
287 | wg.Add(1)
288 | go func(node string, ch chan *hedge.StreamMessage) {
289 | defer wg.Done()
290 | for m := range ch {
291 | slog.Info("reply:", "node", node, "data", string(m.Payload.Data))
292 | }
293 | }(k, v)
294 | }
295 |
296 | wg.Wait()
297 | w.Write([]byte("OK"))
298 | })
299 |
300 | mux.HandleFunc("/sos", func(w http.ResponseWriter, r *http.Request) {
301 | defer func(start time.Time) {
302 | slog.Info("distmem:", "duration", time.Since(start))
303 | }(time.Now())
304 |
305 | name := "distmem_" + time.Now().Format(time.RFC3339)
306 | rname := r.URL.Query().Get("name")
307 | if rname != "" {
308 | name = rname
309 | }
310 |
311 | slog.Info("start distmem:", "name", name)
312 | limit := 14_000 // 4 pods, all
313 | // limit := 2_500
314 |
315 | sos := func() *hedge.SoS {
316 | sos := op.NewSoS(name, &hedge.SoSOptions{
317 | MemLimit: 150_000,
318 | DiskLimit: 120_000,
319 | Expiration: 5,
320 | })
321 |
322 | writer, err := sos.Writer()
323 | if err != nil {
324 | slog.Error("Writer failed:", "err", err)
325 | return nil
326 | }
327 |
328 | defer writer.Close()
329 | var n int
330 | for i := 0; i < limit; i++ {
331 | data := fmt.Sprintf("2_%v_%v", uuid.NewString(), time.Now().Format(time.RFC3339))
332 | n += len([]byte(data))
333 | writer.Write([]byte(data))
334 | }
335 |
336 | slog.Info("write_dm:", "i", limit, "n", n, "write_err", writer.Err())
337 | return sos
338 | }()
339 |
340 | if sos == nil {
341 | slog.Error("failed in creating SoS object")
342 | return
343 | }
344 |
345 | // reader_1
346 | func() {
347 | reader, err := sos.Reader()
348 | if err != nil {
349 | slog.Error(err.Error())
350 | return
351 | }
352 |
353 | out := make(chan []byte)
354 | eg := new(errgroup.Group)
355 | eg.Go(func() error {
356 | var i, n, total int
357 | for d := range out {
358 | ss := strings.Split(string(d), "_")
359 | if len(ss) != 3 {
360 | slog.Error("bad fmt:", "len", len(ss))
361 | continue
362 | }
363 |
364 | t, err := strconv.Atoi(ss[0])
365 | if err != nil {
366 | slog.Error("Atoi failed:", "err", err)
367 | continue
368 | }
369 |
370 | total += t
371 | _, err = time.Parse(time.RFC3339, ss[2])
372 | if err != nil {
373 | slog.Error("Parse failed:", "err", err)
374 | continue
375 | }
376 |
377 | n += len(d)
378 | i++
379 | }
380 |
381 | slog.Info("read_dm:", "i", i, "n", n, "total", total)
382 | return nil
383 | })
384 |
385 | reader.Read(out)
386 | eg.Wait()
387 | reader.Close()
388 | slog.Info("read_dm:", "read_err", reader.Err())
389 | }()
390 |
391 | // reader_2
392 | func() {
393 | reader, err := sos.Reader()
394 | if err != nil {
395 | slog.Error(err.Error())
396 | return
397 | }
398 |
399 | out := make(chan []byte)
400 | eg := new(errgroup.Group)
401 | eg.Go(func() error {
402 | var i, n, total int
403 | for d := range out {
404 | ss := strings.Split(string(d), "_")
405 | if len(ss) != 3 {
406 | slog.Error("bad fmt:", "len", len(ss))
407 | continue
408 | }
409 |
410 | t, err := strconv.Atoi(ss[0])
411 | if err != nil {
412 | slog.Error("Atoi failed:", "err", err)
413 | continue
414 | }
415 |
416 | total += t
417 | _, err = time.Parse(time.RFC3339, ss[2])
418 | if err != nil {
419 | slog.Error("Parse failed:", "err", err)
420 | continue
421 | }
422 |
423 | n += len(d)
424 | i++
425 | }
426 |
427 | slog.Info("read_dm:", "i", i, "n", n, "total", total)
428 | return nil
429 | })
430 |
431 | reader.Read(out)
432 | eg.Wait()
433 | reader.Close()
434 | slog.Info("read_dm:", "read_err", reader.Err())
435 | }()
436 |
437 | sos.Close()
438 | w.Write([]byte("OK"))
439 | })
440 |
441 | 	// NOTE: Used only in my local environment.
442 | mux.HandleFunc("/soslocal", func(w http.ResponseWriter, r *http.Request) {
443 | defer func(start time.Time) {
444 | slog.Info("distmem:", "duration", time.Since(start))
445 | }(time.Now())
446 |
447 | type kcT struct {
448 | Key string `json:"key"`
449 | TrueUnblended float64 `json:"trueUnblended"`
450 | Unblended float64 `json:"unblended"`
451 | Usage float64 `json:"usage"`
452 | }
453 |
454 | // See $HOME/tmp/
455 | locs, _ := os.ReadFile("readlocs")
456 | ss := strings.Split(string(locs), " ")
457 |
458 | // See $HOME/tmp/
459 | ra, err := mmap.Open("readdata")
460 | if err != nil {
461 | slog.Error(err.Error())
462 | return
463 | }
464 |
465 | defer ra.Close()
466 |
467 | name := "distmem_" + time.Now().Format(time.RFC3339)
468 | rname := r.URL.Query().Get("name")
469 | if rname != "" {
470 | name = rname
471 | }
472 |
473 | slog.Info("start distmem:", "name", name)
474 |
475 | sos := func() *hedge.SoS {
476 | sos := op.NewSoS(name, &hedge.SoSOptions{
477 | MemLimit: 10_000_000,
478 | DiskLimit: 10_000_000,
479 | Expiration: 5,
480 | })
481 |
482 | writer, err := sos.Writer()
483 | if err != nil {
484 | slog.Error("Writer failed:", "err", err)
485 | return nil
486 | }
487 |
488 | var i, wt int
489 | var off int64
490 | locs := []int{}
491 | for _, sloc := range ss {
492 | i++
493 | loc, _ := strconv.ParseInt(sloc, 10, 64)
494 | locs = append(locs, int(loc))
495 | b := make([]byte, loc)
496 | n, err := ra.ReadAt(b, off)
497 | if err != nil {
498 | slog.Error(err.Error())
499 | break
500 | }
501 |
502 | var kc kcT
503 | err = json.Unmarshal(b, &kc)
504 | if err != nil {
505 | slog.Error(err.Error())
506 | break
507 | }
508 |
509 | if int64(n) != loc {
510 | slog.Error("not equal:", "n", n, "loc", loc)
511 | }
512 |
513 | off = off + int64(n)
514 | wt += n
515 | writer.Write(b)
516 | }
517 |
518 | writer.Close()
519 | slog.Info("total_write:",
520 | "count", i,
521 | "val", wt,
522 | "err", writer.Err(),
523 | )
524 |
525 | return sos
526 | }()
527 |
528 | func() {
529 | reader, _ := sos.Reader()
530 | out := make(chan []byte)
531 | eg := new(errgroup.Group)
532 | eg.Go(func() error {
533 | var print int
534 | var i, rt int
535 | for d := range out {
536 | i++
537 | var kc kcT
538 | err = json.Unmarshal(d, &kc)
539 | if err != nil {
540 | if print < 2 {
541 | slog.Error(err.Error(), "i", i, "raw", string(d))
542 | print++
543 | }
544 |
545 | continue
546 | }
547 |
548 | rt += len(d)
549 | }
550 |
551 | slog.Info("total_read:", "count", i, "val", rt)
552 | return nil
553 | })
554 |
555 | reader.Read(out)
556 | eg.Wait()
557 | reader.Close()
558 | slog.Info("read_dm:", "read_err", reader.Err())
559 | }()
560 |
561 | sos.Close()
562 | w.Write([]byte("OK"))
563 | })
564 |
565 | s := &http.Server{Addr: ":9090", Handler: mux}
566 | go s.ListenAndServe()
567 |
568 | // Interrupt handler.
569 | go func() {
570 | sigch := make(chan os.Signal, 1)
571 | signal.Notify(sigch, syscall.SIGINT, syscall.SIGTERM)
572 | <-sigch
573 | cancel()
574 | }()
575 |
576 | 	<-done // wait for op.Run to return (Ctrl+C cancels ctx via the handler above)
577 | s.Shutdown(ctx)
578 | }
579 |
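// Editor's note (not part of the original source): a quick way to exercise the
// demo above once it's running, assuming the HTTP server is reachable on
// localhost:9090; keys, values, and messages below are arbitrary examples.
//
//	curl localhost:9090/put -d "greeting hello world"
//	curl localhost:9090/get -d "greeting"
//	curl localhost:9090/send -d "hi leader"
//	curl localhost:9090/broadcast -d "hi all"
//	curl localhost:9090/streamsend
//	curl localhost:9090/streambroadcast
//	curl "localhost:9090/sos?name=run1"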
--------------------------------------------------------------------------------
/sos.go:
--------------------------------------------------------------------------------
1 | package hedge
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "io"
7 | "net"
8 | "os"
9 | "strconv"
10 | "sync"
11 | "sync/atomic"
12 | "time"
13 |
14 | "github.com/apache/arrow/go/v17/arrow"
15 | "github.com/apache/arrow/go/v17/arrow/array"
16 | "github.com/apache/arrow/go/v17/arrow/memory"
17 | "github.com/cespare/xxhash/v2"
18 | pb "github.com/flowerinthenight/hedge-proto"
19 | "github.com/shirou/gopsutil/v4/mem"
20 | "golang.org/x/exp/mmap"
21 | "golang.org/x/sync/errgroup"
22 | "google.golang.org/grpc"
23 | "google.golang.org/grpc/credentials/insecure"
24 | )
25 |
26 | const (
27 | metaName = "name"
28 | metaMemLimit = "mlimit"
29 | metaDiskLimit = "dlimit"
30 | metaExpire = "expire"
31 | )
32 |
33 | var (
34 | errNoInit = fmt.Errorf("sos: not properly initialized")
35 | )
36 |
37 | type metaT struct {
38 | msize atomic.Uint64
39 | dsize atomic.Uint64
40 | grpc atomic.Int32
41 | conn *grpc.ClientConn
42 | client pb.HedgeClient
43 | writer pb.Hedge_SoSWriteClient
44 | reader pb.Hedge_SoSReadClient
45 | }
46 |
47 | type SoSOptions struct {
48 | // MemLimit sets the memory limit in bytes to be used per node.
49 | MemLimit uint64
50 |
51 | // DiskLimit sets the disk limit in bytes to be used per node.
52 | DiskLimit uint64
53 |
54 | // Expiration sets the TTL (time-to-live) of the backing storage.
55 | // If not set, the default is 30s.
56 | Expiration int64
57 | }
58 |
59 | type memT struct {
60 | mem *memory.GoAllocator
61 | bb *array.BinaryBuilder
62 | bufs *array.Binary
63 | }
64 |
65 | // SoS (Spillover-Store) represents an object for spill-over (or stitched)
66 | // storage. Useful for load-process-discard types of data processing. The
67 | // order of storage priority is local memory, local disk, other pod's
68 | // memory, other pod's disk, and so on.
69 | //
70 | // Limitation: At the moment, a name cannot be reused for an SoS object
71 | // once it has been used and closed within hedge's lifetime.
72 | type SoS struct {
73 | sync.Mutex
74 |
75 | Name string // the name of this instance
76 |
77 | op *Op // cluster coordinator
78 | nodes []uint64 // 0=local, 1..n=network
79 | meta map[uint64]*metaT // per-node metadata, key=node
80 | mlimit atomic.Uint64 // mem limit
81 | dlimit atomic.Uint64 // disk limit
82 | 	data   map[uint64]*memT  // mem data, key=node
83 | dlocs []int // disk offsets
84 | mlock *sync.Mutex // local mem lock
85 | dlock *sync.Mutex // local file lock
86 | wmtx *sync.Mutex // one active writer only
87 | writer *Writer // writer object
88 | refs atomic.Int64 // self reference count
89 | wrefs atomic.Int64 // writer reference count
90 | rrefs atomic.Int64 // reader reference count
91 | on atomic.Int32 // 1 = active
92 |
93 | ttl time.Duration // ttl to cleanup
94 | age time.Time // started
95 | }
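// A caller-side sketch of a full SoS cycle (illustrative only, not part of the
// original source), condensed from example/demo/main.go; the name "tmpdata"
// and the limits are placeholders.
//
//	sos := op.NewSoS("tmpdata", &hedge.SoSOptions{
//		MemLimit:   1 << 20, // ~1MB in local memory first
//		DiskLimit:  1 << 20, // then ~1MB on local disk, then other nodes
//		Expiration: 30,      // seconds
//	})
//
//	writer, _ := sos.Writer()
//	for i := 0; i < 100; i++ {
//		writer.Write([]byte(fmt.Sprintf("payload-%d", i)))
//	}
//	writer.Close() // close the writer before reading
//
//	reader, _ := sos.Reader()
//	out := make(chan []byte)
//	go func() {
//		for d := range out { // Read closes 'out' when done
//			_ = d // process each chunk here
//		}
//	}()
//	reader.Read(out) // blocks until all stored data has been streamed
//	reader.Close()
//	sos.Close()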
96 |
97 | type Writer struct {
98 | sync.Mutex
99 | lo bool // local write only
100 | sos *SoS
101 | ch chan []byte
102 | on atomic.Int32
103 | err error
104 | done chan struct{}
105 | }
106 |
107 | // Err returns the last recorded error during the write operation.
108 | func (w *Writer) Err() error {
109 | w.Lock()
110 | defer w.Unlock()
111 | return w.err
112 | }
113 |
114 | // Write writes data to the underlying storage.
115 | func (w *Writer) Write(data []byte) { w.ch <- data }
116 |
117 | // Close closes the writer object.
118 | func (w *Writer) Close() {
119 | if w.on.Load() == 0 {
120 | return
121 | }
122 |
123 | close(w.ch)
124 | <-w.done // wait for start()
125 | w.on.Store(0)
126 | w.sos.wrefs.Add(-1)
127 | w.sos.wmtx.Unlock()
128 | }
129 |
130 | func (w *Writer) start() {
131 | defer func() { w.done <- struct{}{} }()
132 | w.on.Store(1)
133 | ctx := context.Background()
134 | node := w.sos.nodes[0]
135 | var file *os.File
136 |
137 | var allCount int
138 | var memCount int
139 | var diskCount int
140 | var netCount int
141 | var failCount int
142 |
143 | var mlock bool
144 | var dlock bool
145 | unlock := func(b bool, l *sync.Mutex) {
146 | if b {
147 | l.Unlock()
148 | }
149 | }
150 |
151 | for data := range w.ch {
152 | allCount++
153 | var err error
154 | var nextName string
155 | msize := w.sos.meta[node].msize.Load()
156 | mlimit := w.sos.mlimit.Load()
157 | dsize := w.sos.meta[node].dsize.Load()
158 | dlimit := w.sos.dlimit.Load()
159 |
160 | // Local (or next hop) is full. Go to the next node.
161 | if !w.lo && ((msize + dsize) >= (mlimit + dlimit)) {
162 | nextName, node = w.sos.nextNode()
163 | if nextName == "" {
164 | failCount++
165 | w.Lock()
166 | w.err = fmt.Errorf("cannot find next node")
167 | w.Unlock()
168 | continue
169 | }
170 |
171 | if w.sos.meta[node].grpc.Load() == 0 {
172 | err = func() error {
173 | host, port, _ := net.SplitHostPort(nextName)
174 | pi, _ := strconv.Atoi(port)
175 | nextName = net.JoinHostPort(host, fmt.Sprintf("%v", pi+1))
176 |
177 | var opts []grpc.DialOption
178 | opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials()))
179 | w.sos.meta[node].conn, err = grpc.NewClient(nextName, opts...)
180 | if err != nil {
181 | return fmt.Errorf("NewClient (%v) failed: %w", nextName, err)
182 | }
183 |
184 | w.sos.meta[node].client = pb.NewHedgeClient(w.sos.meta[node].conn)
185 | w.sos.meta[node].writer, err = w.sos.meta[node].client.SoSWrite(ctx)
186 | if err != nil {
187 | return fmt.Errorf("SoSWrite (%v) failed: %w", nextName, err)
188 | }
189 |
190 | w.sos.meta[node].grpc.Add(1)
191 | return nil
192 | }()
193 |
194 | if err != nil {
195 | w.Lock()
196 | w.err = err
197 | w.Unlock()
198 | }
199 | }
200 | }
201 |
202 | switch {
203 | case !w.lo && node != w.sos.me():
204 | netCount++
205 | err := w.sos.meta[node].writer.Send(&pb.Payload{
206 | Meta: map[string]string{
207 | metaName: w.sos.Name,
208 | metaMemLimit: fmt.Sprintf("%v", w.sos.mlimit.Load()),
209 | metaDiskLimit: fmt.Sprintf("%v", w.sos.dlimit.Load()),
210 | metaExpire: fmt.Sprintf("%v", int64(w.sos.ttl.Seconds())),
211 | },
212 | Data: data,
213 | })
214 |
215 | if err != nil {
216 | w.Lock()
217 | w.err = fmt.Errorf("Send failed: %w", err)
218 | w.Unlock()
219 | }
220 |
221 | w.sos.meta[node].msize.Add(uint64(len(data)))
222 | default:
223 | if msize < mlimit {
224 | memCount++
225 | if !mlock {
226 | w.sos.mlock.Lock()
227 | mlock = true
228 | }
229 |
230 | if _, ok := w.sos.data[node]; !ok {
231 | w.sos.data[node] = &memT{}
232 | }
233 |
234 | if w.sos.data[node].bb == nil {
235 | w.sos.data[node].mem = memory.NewGoAllocator()
236 | w.sos.data[node].bb = array.NewBinaryBuilder(
237 | w.sos.data[node].mem,
238 | &arrow.BinaryType{},
239 | )
240 | }
241 |
242 | w.sos.data[node].bb.Append(data)
243 | w.sos.meta[node].msize.Add(uint64(len(data)))
244 | } else {
245 | diskCount++
246 | if !dlock {
247 | w.sos.dlock.Lock()
248 | dlock = true
249 | }
250 |
251 | if file == nil {
252 | flag := os.O_WRONLY | os.O_CREATE | os.O_TRUNC
253 | file, err = os.OpenFile(w.sos.localFile(), flag, 0644)
254 | if err != nil {
255 | w.sos.op.logger.Println("OpenFile failed:", err)
256 | }
257 | }
258 |
259 | n, err := file.Write(data)
260 | if err != nil {
261 | w.Lock()
262 | w.err = fmt.Errorf("Write failed: %w", err)
263 | w.Unlock()
264 | } else {
265 | w.sos.dlocs = append(w.sos.dlocs, n)
266 | w.sos.meta[node].dsize.Add(uint64(n))
267 | }
268 | }
269 | }
270 | }
271 |
272 | // slog.Info(
273 | // "write:",
274 | // "all", allCount,
275 | // "add", memCount+diskCount+netCount+failCount,
276 | // "mem", memCount,
277 | // "disk", diskCount,
278 | // "net", netCount,
279 | // "fail", failCount,
280 | // "nodes", w.sos.nodes,
281 | // )
282 |
283 | nodes := []uint64{}
284 | for k := range w.sos.meta {
285 | nodes = append(nodes, k)
286 | }
287 |
288 | for _, n := range nodes {
289 | if w.sos.data[n].bb != nil {
290 | w.sos.data[n].bufs = w.sos.data[n].bb.NewBinaryArray()
291 | w.sos.data[n].bb.Release()
292 | w.sos.data[n].bb = nil
293 | // slog.Info("arrow: release(bb):", "node", n)
294 | }
295 | }
296 |
297 | unlock(mlock, w.sos.mlock)
298 |
299 | file.Sync()
300 | file.Close()
301 | unlock(dlock, w.sos.dlock)
302 |
303 | for _, n := range nodes {
304 | if w.sos.meta[n].writer != nil {
305 | w.sos.meta[n].writer.CloseSend()
306 | }
307 | }
308 | }
309 |
310 | type writerOptions struct {
311 | LocalOnly bool
312 | }
313 |
314 | // Writer returns a writer object for writing data to SoS. The
315 | // caller needs to call writer.Close() after use. The opts
316 | // parameter is only used internally; it's not exposed to callers.
317 | func (sos *SoS) Writer(opts ...*writerOptions) (*Writer, error) {
318 | if sos.on.Load() == 0 {
319 | return nil, errNoInit
320 | }
321 |
322 | sos.wmtx.Lock()
323 | var localOnly bool
324 | if len(opts) > 0 {
325 | localOnly = opts[0].LocalOnly
326 | }
327 |
328 | sos.writer = &Writer{
329 | lo: localOnly,
330 | sos: sos,
331 | ch: make(chan []byte),
332 | done: make(chan struct{}, 1),
333 | }
334 |
335 | go sos.writer.start()
336 | sos.wrefs.Add(1)
337 | return sos.writer, nil
338 | }
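// Caller-side note (illustrative, not part of the original source): only one
// writer can be active at a time (Writer() takes an internal write lock), so
// always call Close(), then check Err() for any recorded write error.
//
//	writer, err := sos.Writer()
//	if err != nil {
//		return err
//	}
//	writer.Write([]byte("some data"))
//	writer.Close()
//	if werr := writer.Err(); werr != nil {
//		log.Println("write failed:", werr)
//	}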
339 |
340 | type Reader struct {
341 | sync.Mutex
342 | lo bool // local read only
343 | sos *SoS
344 | on atomic.Int32
345 | err error
346 | done chan struct{}
347 | }
348 |
349 | // Read reads the underlying data and streams them to the `out` channel.
350 | func (r *Reader) Read(out chan []byte) {
351 | eg := new(errgroup.Group)
352 | eg.Go(func() error {
353 | r.on.Store(1)
354 | ctx := context.Background()
355 | for _, node := range r.sos.nodes {
356 | var err error
357 | switch {
358 | case !r.lo && node != r.sos.me():
359 | func() {
360 | r.sos.meta[node].reader, err = r.sos.meta[node].client.SoSRead(ctx)
361 | if err != nil {
362 | r.Lock()
363 | r.err = fmt.Errorf("SoSRead failed: %v", err)
364 | r.Unlock()
365 | return
366 | }
367 | }()
368 |
369 | err = r.sos.meta[node].reader.Send(&pb.Payload{
370 | Meta: map[string]string{
371 | metaName: r.sos.Name,
372 | metaMemLimit: fmt.Sprintf("%v", r.sos.mlimit.Load()),
373 | metaDiskLimit: fmt.Sprintf("%v", r.sos.dlimit.Load()),
374 | metaExpire: fmt.Sprintf("%v", int64(r.sos.ttl.Seconds())),
375 | },
376 | })
377 |
378 | if err != nil {
379 | r.Lock()
380 | r.err = fmt.Errorf("Send failed: %v", err)
381 | r.Unlock()
382 | continue
383 | }
384 |
385 | for {
386 | in, err := r.sos.meta[node].reader.Recv()
387 | if err == io.EOF {
388 | break
389 | }
390 |
391 | if err != nil {
392 | r.Lock()
393 | r.err = fmt.Errorf("Recv failed: %v", err)
394 | r.Unlock()
395 | break
396 | }
397 |
398 | out <- in.Data
399 | }
400 | default:
401 | func() {
402 | r.sos.mlock.Lock()
403 | defer r.sos.mlock.Unlock()
404 | if _, ok := r.sos.data[node]; !ok {
405 | return
406 | }
407 |
408 | if r.sos.data[node].bufs == nil {
409 | return
410 | }
411 |
412 | for i := 0; i < r.sos.data[node].bufs.Len(); i++ {
413 | out <- r.sos.data[node].bufs.Value(i)
414 | }
415 | }()
416 |
417 | func() {
418 | r.sos.dlock.Lock()
419 | defer r.sos.dlock.Unlock()
420 | if len(r.sos.dlocs) == 0 {
421 | return
422 | }
423 |
424 | ra, err := mmap.Open(r.sos.localFile())
425 | if err != nil {
426 | r.Lock()
427 | r.err = fmt.Errorf("Open failed: %v", err)
428 | r.Unlock()
429 | return
430 | }
431 |
432 | defer ra.Close()
433 | var off int64
434 | for _, loc := range r.sos.dlocs {
435 | buf := make([]byte, loc)
436 | n, err := ra.ReadAt(buf, off)
437 | if err != nil {
438 | r.Lock()
439 | r.err = fmt.Errorf("ReadAt failed: %v", err)
440 | r.Unlock()
441 | }
442 |
443 | out <- buf
444 | off = off + int64(n)
445 | }
446 | }()
447 | }
448 | }
449 |
450 | return nil
451 | })
452 |
453 | eg.Wait()
454 | close(out)
455 | r.done <- struct{}{}
456 | }
457 |
458 | // Err returns the last recorded error, if any, during the read operation.
459 | func (r *Reader) Err() error {
460 | r.Lock()
461 | defer r.Unlock()
462 | return r.err
463 | }
464 |
465 | // Close closes the reader object.
466 | func (r *Reader) Close() {
467 | if r.on.Load() == 0 {
468 | return
469 | }
470 |
471 | <-r.done // wait for loop
472 | r.sos.rrefs.Add(-1)
473 | r.on.Store(0)
474 | }
475 |
476 | type readerOptions struct {
477 | LocalOnly bool
478 | }
479 |
480 | // Reader returns a reader object for reading data from SoS. The
481 | // caller needs to call reader.Close() after use. The opts parameter
482 | // is only used internally; it's not exposed to callers.
483 | func (sos *SoS) Reader(opts ...*readerOptions) (*Reader, error) {
484 | if sos.on.Load() == 0 {
485 | return nil, errNoInit
486 | }
487 |
488 | var localOnly bool
489 | if len(opts) > 0 {
490 | localOnly = opts[0].LocalOnly
491 | }
492 |
493 | reader := &Reader{
494 | lo: localOnly,
495 | sos: sos,
496 | done: make(chan struct{}, 1),
497 | }
498 |
499 | sos.rrefs.Add(1)
500 | return reader, nil
501 | }
502 |
503 | // Close closes the SoS object.
504 | func (sos *SoS) Close() {
505 | if sos.on.Load() == 0 {
506 | return
507 | }
508 |
509 | sos.Lock()
510 | defer sos.Unlock()
511 | nodes := []uint64{}
512 | for k := range sos.meta {
513 | nodes = append(nodes, k)
514 | }
515 |
516 | ctx := context.Background()
517 | for _, n := range nodes {
518 | if sos.meta[n].conn != nil {
519 | sos.meta[n].client.SoSClose(ctx, &pb.Payload{
520 | Meta: map[string]string{metaName: sos.Name},
521 | })
522 | }
523 | }
524 |
525 | sos.refs.Add(-1)
526 | sos.on.Store(0)
527 | }
528 |
529 | func (sos *SoS) nextNode() (string, uint64) {
530 | var mb string
531 | members := sos.op.Members()
532 | for _, member := range members {
533 | nn := xxhash.Sum64String(member)
534 | if nn == sos.me() {
535 | continue
536 | }
537 |
538 | if _, ok := sos.data[nn]; ok {
539 | continue
540 | }
541 |
542 | mb = member
543 | sos.nodes = append(sos.nodes, nn)
544 | sos.meta[nn] = &metaT{}
545 | sos.data[nn] = &memT{}
546 | break
547 | }
548 |
549 | return mb, sos.nodes[len(sos.nodes)-1]
550 | }
551 |
552 | func (sos *SoS) me() uint64 { return xxhash.Sum64String(sos.op.Name()) }
553 |
554 | func (sos *SoS) localFile() string {
555 | name1 := fmt.Sprintf("%v", sos.me())
556 | name2 := xxhash.Sum64String(sos.Name)
557 | return fmt.Sprintf("%v_%v.dat", name1, name2)
558 | }
559 |
560 | func (sos *SoS) cleaner() {
561 | eg := new(errgroup.Group)
562 | eg.Go(func() error {
563 | started := sos.age
564 | for {
565 | time.Sleep(time.Second * 1)
566 | refs := sos.refs.Load()
567 | wrefs := sos.wrefs.Load()
568 | rrefs := sos.rrefs.Load()
569 | if (refs + wrefs + rrefs) > 0 {
570 | started = time.Now()
571 | continue
572 | }
573 |
574 | if time.Since(started) > sos.ttl {
575 | func() {
576 | // Cleanup memory area:
577 | sos.op.soss[sos.Name].mlock.Lock()
578 | defer sos.op.soss[sos.Name].mlock.Unlock()
579 | for _, node := range sos.op.soss[sos.Name].nodes {
580 | if sos.data[node].bufs != nil {
581 | sos.data[node].bufs.Release()
582 | sos.data[node].bufs = nil
583 | // slog.Info("arrow: release(buf):", "node", node)
584 | }
585 | }
586 | }()
587 |
588 | // Cleanup disk area:
589 | sos.op.soss[sos.Name].dlock.Lock()
590 | os.Remove(sos.localFile())
591 | sos.op.soss[sos.Name].dlock.Unlock()
592 |
593 | // Remove the main map entry:
594 | sos.op.sosLock.Lock()
595 | delete(sos.op.soss, sos.Name)
596 | sos.op.sosLock.Unlock()
597 | break
598 | }
599 | }
600 |
601 | return nil
602 | })
603 |
604 | eg.Wait()
605 | }
606 |
607 | func newSoS(name string, op *Op, opts ...*SoSOptions) *SoS {
608 | sos := &SoS{
609 | Name: name,
610 | op: op,
611 | meta: make(map[uint64]*metaT),
612 | data: map[uint64]*memT{},
613 | dlocs: []int{},
614 | mlock: &sync.Mutex{},
615 | dlock: &sync.Mutex{},
616 | wmtx: &sync.Mutex{},
617 | }
618 |
619 | sos.on.Store(1)
620 | sos.nodes = []uint64{sos.me()}
621 | sos.meta[sos.me()] = &metaT{}
622 | sos.data[sos.me()] = &memT{}
623 |
624 | if len(opts) > 0 {
625 | sos.mlimit.Store(opts[0].MemLimit)
626 | sos.dlimit.Store(opts[0].DiskLimit)
627 | if opts[0].Expiration > 0 {
628 | sos.ttl = time.Second * time.Duration(opts[0].Expiration)
629 | }
630 | }
631 |
632 | if sos.mlimit.Load() == 0 {
633 | vm, _ := mem.VirtualMemory()
634 | sos.mlimit.Store(vm.Available / 2) // half of free mem
635 | }
636 |
637 | if sos.dlimit.Load() == 0 {
638 | sos.dlimit.Store(1 << 30) // 1GB by default
639 | }
640 |
641 | if sos.ttl == 0 {
642 | sos.ttl = time.Second * 30
643 | }
644 |
645 | sos.refs.Add(1)
646 | sos.age = time.Now()
647 | go sos.cleaner()
648 | return sos
649 | }
650 |
--------------------------------------------------------------------------------
/hedge.go:
--------------------------------------------------------------------------------
1 | package hedge
2 |
3 | import (
4 | "bufio"
5 | "context"
6 | "encoding/base64"
7 | "encoding/json"
8 | "fmt"
9 | "io"
10 | "log"
11 | "maps"
12 | "net"
13 | "os"
14 | "strconv"
15 | "strings"
16 | "sync"
17 | "sync/atomic"
18 | "time"
19 |
20 | "cloud.google.com/go/spanner"
21 | pb "github.com/flowerinthenight/hedge-proto"
22 | "github.com/flowerinthenight/spindle/v2"
23 | "github.com/google/uuid"
24 | gaxv2 "github.com/googleapis/gax-go/v2"
25 | "github.com/hashicorp/memberlist"
26 | "google.golang.org/api/iterator"
27 | "google.golang.org/grpc"
28 | "google.golang.org/grpc/credentials/insecure"
29 | "google.golang.org/grpc/reflection"
30 | )
31 |
32 | const (
33 | CmdLeader = "LDR" // for leader confirmation, reply="ACK"
34 | 	CmdWrite      = "PUT" // write key/value, fmt="PUT base64(JSON(kv)) [noappend]"
35 | 	CmdSend       = "SND" // member to leader, fmt="SND base64(msg)"
36 | CmdPing = "HEY" // heartbeat to indicate availability, fmt="HEY [id]"
37 | CmdMembers = "MEM" // members info from leader to all, fmt="MEM base64(JSON(members))"
38 | CmdBroadcast = "ALL" // broadcast to all, fmt="ALL base64(payload)"
39 | CmdAck = "ACK" // generic reply, fmt="ACK"|"ACK base64(err)"|"ACK base64(JSON(members))"
40 | 	CmdSemaphore  = "SEM" // create semaphore, fmt="SEM {name} {limit} {caller}", reply="ACK"
41 | CmdSemAcquire = "SEA" // acquire semaphore, fmt="SEA {name} {caller}", reply="ACK[ base64([0:|1:]err)]" (0=final,1=retry)
42 | CmdSemRelease = "SER" // release semaphore, fmt="SER {name} {caller}"
43 |
44 | FlagNoAppend = "noappend"
45 | )
46 |
47 | var (
48 | ErrNotRunning = fmt.Errorf("hedge: not running")
49 | ErrNoLeader = fmt.Errorf("hedge: no leader available")
50 | ErrNoHandler = fmt.Errorf("hedge: no message handler")
51 | ErrNotSupported = fmt.Errorf("hedge: not supported")
52 | ErrInvalidConn = fmt.Errorf("hedge: invalid connection")
53 |
54 | cctx = func(ctx context.Context) context.Context {
55 | return context.WithValue(ctx, struct{}{}, nil)
56 | }
57 | )
58 |
59 | type FnMsgHandler func(data any, msg []byte) ([]byte, error)
60 |
61 | // KeyValue is for Put()/Get() callers.
62 | type KeyValue struct {
63 | Key string `json:"key"`
64 | Value string `json:"value"`
65 | Timestamp time.Time `json:"timestamp"` // read-only, populated when Get()
66 | }
67 |
68 | // LogItem represents an item in our log.
69 | type LogItem struct {
70 | Id string
71 | Key string
72 | Value string
73 | Leader string
74 | Timestamp time.Time
75 | }
76 |
77 | type Option interface {
78 | Apply(*Op)
79 | }
80 |
81 | type withDuration int64
82 |
83 | func (w withDuration) Apply(op *Op) { op.lockTimeout = int64(w) }
84 |
85 | // WithDuration sets Op's internal spindle object's lease duration in milliseconds.
86 | // Defaults to 30000ms (30s) when not set. Minimum value is 2000ms (2s).
87 | func WithDuration(v int64) Option { return withDuration(v) }
88 |
89 | type withGroupSyncInterval time.Duration
90 |
91 | func (w withGroupSyncInterval) Apply(op *Op) { op.syncInterval = time.Duration(w) }
92 |
93 | // WithGroupSyncInterval sets the internal interval for syncing membership
94 | // within the group. If not set, defaults to 30s. Minimum value is 2s.
95 | func WithGroupSyncInterval(v time.Duration) Option { return withGroupSyncInterval(v) }
96 |
97 | type withLeaderCallback struct {
98 | d any
99 | f spindle.FnLeaderCallback
100 | }
101 |
102 | func (w withLeaderCallback) Apply(op *Op) {
103 | op.cbLeaderData = w.d
104 | op.cbLeader = w.f
105 | }
106 |
107 | // WithLeaderCallback sets the node's callback function that will be called
108 | // when a leader node is selected (or deselected). The msg arg for f will be
109 | // set to either 0 or 1.
110 | func WithLeaderCallback(d any, f spindle.FnLeaderCallback) Option {
111 | return withLeaderCallback{d, f}
112 | }
113 |
114 | type withLeaderHandler struct {
115 | d any
116 | h FnMsgHandler
117 | }
118 |
119 | func (w withLeaderHandler) Apply(op *Op) {
120 | op.fnLdrData = w.d
121 | op.fnLeader = w.h
122 | }
123 |
124 | // WithLeaderHandler sets the node's callback function when it is the current
125 | // leader and when members send messages to it using the Send(...) API. Any
126 | // arbitrary data represented by d will be passed to the callback h every
127 | // time it is called. If d is nil, the default callback data will be the *Op
128 | // object itself. The handler's returned []byte will serve as the reply.
129 | //
130 | // Typical flow would be:
131 | // 1. Any node (including the leader) calls the Send(...) API.
132 | // 2. The current leader handles the call by reading the input.
133 | // 3. Leader will then call FnLeaderHandler, passing the arbitrary data
134 | // along with the message.
135 | // 4. FnLeaderHandler will process the data as leader, then returns the
136 | // reply to the calling member.
137 | func WithLeaderHandler(d any, h FnMsgHandler) Option {
138 | return withLeaderHandler{d, h}
139 | }
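// A caller-side sketch of the flow above (illustrative only, not part of the
// original source); table, lock, and address values are placeholders. See
// example/demo/main.go for the full version.
//
//	op := hedge.New(client, ":8080", "locktable", "hedge-group", "logtable",
//		hedge.WithLeaderHandler(nil, func(data any, msg []byte) ([]byte, error) {
//			// 'data' is the *Op itself since nil was passed above.
//			return []byte("ack from leader"), nil
//		}),
//	)
//	go op.Run(ctx, make(chan error, 1))
//
//	// Any member, including the leader itself, can then do:
//	reply, err := op.Send(ctx, []byte("hello leader"))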
140 |
141 | type withBroadcastHandler struct {
142 | d any
143 | h FnMsgHandler
144 | }
145 |
146 | func (w withBroadcastHandler) Apply(op *Op) {
147 | op.fnBcData = w.d
148 | op.fnBroadcast = w.h
149 | }
150 |
151 | // WithMembersChangedHandler sets the leader node's callback function for any member changes.
152 | func WithMembersChangedHandler(d any, h FnMsgHandler) Option {
153 | return withMembersChangedHandler{d, h}
154 | }
155 |
156 | type withMembersChangedHandler struct {
157 | d any
158 | h FnMsgHandler
159 | }
160 |
161 | func (w withMembersChangedHandler) Apply(op *Op) {
162 | op.fnMemChangedData = w.d
163 | op.fnMemberChanged = w.h
164 | }
165 |
166 | // WithBroadcastHandler sets the node's callback function for broadcast messages
167 | // from anyone in the group using the Broadcast(...) API. Any arbitrary data
168 | // represented by d will be passed to the callback h every time it is called.
169 | // If d is nil, the default callback data will be the *Op object itself. The
170 | // handler's returned []byte will serve as the reply.
171 | //
172 | // A nil broadcast handler disables the internal heartbeat function.
173 | func WithBroadcastHandler(d any, h FnMsgHandler) Option {
174 | return withBroadcastHandler{d, h}
175 | }
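// A caller-side sketch (illustrative only, not part of the original source),
// mirroring example/demo/main.go:
//
//	hedge.WithBroadcastHandler(nil, func(data any, msg []byte) ([]byte, error) {
//		op := data.(*hedge.Op) // *Op, since nil was passed as d
//		return []byte("received by " + op.Name()), nil
//	})
//
//	// Later, from any node:
//	for _, v := range op.Broadcast(ctx, []byte("hello all")) {
//		if v.Error == nil {
//			log.Println(v.Id, string(v.Reply))
//		}
//	}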
176 |
177 | type withGrpcHostPort string
178 |
179 | func (w withGrpcHostPort) Apply(op *Op) { op.grpcHostPort = string(w) }
180 |
181 | // WithGrpcHostPort sets Op's internal grpc host/port address.
182 | // Defaults to the internal TCP host:port+1.
183 | func WithGrpcHostPort(v string) Option { return withGrpcHostPort(v) }
184 |
185 | type StreamMessage struct {
186 | Payload *pb.Payload `json:"payload"`
187 | Error error `json:"error"`
188 | }
189 |
190 | type withLeaderStreamChannels struct {
191 | in chan *StreamMessage
192 | out chan *StreamMessage
193 | }
194 |
195 | func (w withLeaderStreamChannels) Apply(op *Op) {
196 | op.leaderStreamIn = w.in
197 | op.leaderStreamOut = w.out
198 | }
199 |
200 | // WithLeaderStreamChannels sets the streaming input and output channels for sending
201 | // streaming messages to the leader. All incoming stream messages to the leader will
202 | // be sent to the `in` channel. A nil message indicates the end of the streaming data.
203 | // After sending all messages to `in`, the handler will then listen to the `out` channel
204 | // for reply messages. A nil message indicates the end of the reply stream.
205 | func WithLeaderStreamChannels(in chan *StreamMessage, out chan *StreamMessage) Option {
206 | return withLeaderStreamChannels{in, out}
207 | }
208 |
209 | type withBroadcastStreamChannels struct {
210 | in chan *StreamMessage
211 | out chan *StreamMessage
212 | }
213 |
214 | func (w withBroadcastStreamChannels) Apply(op *Op) {
215 | op.broadcastStreamIn = w.in
216 | op.broadcastStreamOut = w.out
217 | }
218 |
219 | // WithBroadcastStreamChannels sets the streaming input and output channels for broadcasting
220 | // messages to all nodes. All incoming stream messages will be sent to the `in` channel. A
221 | // nil message indicates the end of the streaming data. After sending all messages to `in`,
222 | // the handler will then listen to the `out` channel for reply messages. A nil message
223 | // indicates the end of the reply stream.
224 | func WithBroadcastStreamChannels(in chan *StreamMessage, out chan *StreamMessage) Option {
225 | return withBroadcastStreamChannels{in, out}
226 | }
227 |
228 | type withLogger struct{ l *log.Logger }
229 |
230 | func (w withLogger) Apply(op *Op) { op.logger = w.l }
231 |
232 | // WithLogger sets Op's logger object. Can be silenced by setting v to:
233 | //
234 | // log.New(io.Discard, "", 0)
235 | func WithLogger(v *log.Logger) Option { return withLogger{v} }
236 |
237 | // Op is our main instance for hedge operations.
238 | type Op struct {
239 | hostPort string // this instance's id; address:port
240 | grpcHostPort string // default is host:port+1 (from `hostPort`)
241 | spannerClient *spanner.Client // both for spindle and hedge
242 | lockTable string // spindle lock table
243 | lockName string // spindle lock name
244 | lockTimeout int64 // spindle's lock lease duration in ms
245 | logTable string // append-only log table
246 |
247 | cbLeader spindle.FnLeaderCallback
248 | cbLeaderData any
249 | fnLeader FnMsgHandler // leader message handler
250 | fnLdrData any // arbitrary data passed to fnLeader
251 | fnBroadcast FnMsgHandler // broadcast message handler
252 | fnBcData any // arbitrary data passed to fnBroadcast
253 | fnMemberChanged FnMsgHandler // member changes message handler
254 | fnMemChangedData any // arbitrary data passed to fnMemberChanged
255 | leaderStreamIn chan *StreamMessage
256 | leaderStreamOut chan *StreamMessage
257 | broadcastStreamIn chan *StreamMessage
258 | broadcastStreamOut chan *StreamMessage
259 |
260 | sosLock *sync.Mutex
261 | soss map[string]*SoS // distributed memory
262 |
263 | *spindle.Lock // handles our distributed lock
264 | members map[string]struct{} // key=id
265 | syncInterval time.Duration // ensure membership
266 | mtx sync.Mutex // local mutex
267 | mtxSem sync.Mutex // semaphore mutex
268 | ensureOn atomic.Int32 // 1=semaphore checker running
269 | ensureCh chan string // please check this id
270 | ensureCtx context.Context
271 | ensureCancel context.CancelFunc
272 | ensureDone chan struct{}
273 | active atomic.Int32 // 1=running, 0=off
274 | logger *log.Logger // internal logger
275 | }
276 |
277 | // String implements the Stringer interface.
278 | func (op *Op) String() string {
279 | return fmt.Sprintf("hostport:%s;spindle:%v;%v;%v",
280 | op.hostPort,
281 | op.spannerClient.DatabaseName(),
282 | op.lockTable,
283 | op.logTable,
284 | )
285 | }
286 |
287 | // HostPort returns the host:port (or name) of this instance.
288 | func (op *Op) HostPort() string { return op.hostPort }
289 |
290 | // Name is the same as HostPort.
291 | func (op *Op) Name() string { return op.hostPort }
292 |
293 | // IsRunning returns true if Op is already running.
294 | func (op *Op) IsRunning() bool { return op.active.Load() == 1 }
295 |
296 | // Run starts the main handler. It blocks until ctx is cancelled,
297 | // optionally sending an error message to done when finished.
298 | func (op *Op) Run(ctx context.Context, done ...chan error) error {
299 | var err error
300 | defer func(e *error) {
301 | if len(done) > 0 {
302 | done[0] <- *e
303 | }
304 | }(&err)
305 |
306 | // Some housekeeping.
307 | if op.spannerClient == nil {
308 | err = fmt.Errorf("hedge: Spanner client cannot be nil")
309 | return err
310 | }
311 |
312 | for _, v := range []struct {
313 | name string
314 | val string
315 | }{
316 | {"SpindleTable", op.lockTable},
317 | {"SpindleLockName", op.lockName},
318 | } {
319 | if v.val == "" {
320 | err = fmt.Errorf("hedge: %v cannot be empty", v.name)
321 | return err
322 | }
323 | }
324 |
325 | // Setup our server for our internal protocol.
326 | addr, err := net.ResolveTCPAddr("tcp4", op.hostPort)
327 | if err != nil {
328 | return err
329 | }
330 |
331 | var exitedTCP atomic.Int32
332 | doneTCP := make(chan error, 1)
333 |
334 | // This connection will be closed upon context termination.
335 | tl, err := net.ListenTCP("tcp", addr)
336 | if err != nil {
337 | return err
338 | }
339 |
340 | op.logger.Printf("tcp: listen on %v", op.hostPort)
341 |
342 | go func() {
343 | defer func() { doneTCP <- nil }()
344 | for {
345 | conn, err := tl.Accept()
346 | if exitedTCP.Load() == 1 {
347 | return
348 | }
349 |
350 | if err != nil {
351 | op.logger.Printf("Accept failed: %v", err)
352 | return
353 | }
354 |
355 | if ctx.Err() != nil {
356 | op.logger.Printf("cancelled: %v", ctx.Err())
357 | return
358 | }
359 |
360 | go handleMsg(ctx, op, conn)
361 | }
362 | }()
363 |
364 | gl, err := net.Listen("tcp", op.grpcHostPort)
365 | if err != nil {
366 | return err
367 | }
368 |
369 | defer gl.Close()
370 | op.logger.Printf("grpc: listen on %v", op.grpcHostPort)
371 |
372 | gs := grpc.NewServer()
373 | svc := &service{op: op}
374 | pb.RegisterHedgeServer(gs, svc)
375 | reflection.Register(gs) // register reflection service
376 | go gs.Serve(gl)
377 |
378 | // Setup and start our internal spindle object.
379 | op.Lock = spindle.New(
380 | op.spannerClient,
381 | op.lockTable,
382 | fmt.Sprintf("hedge/spindle/lockname/%v", op.lockName),
383 | spindle.WithDuration(op.lockTimeout),
384 | spindle.WithId(op.hostPort),
385 | spindle.WithLeaderCallback(op.cbLeaderData, func(data any, msg []byte) {
386 | if op.cbLeader != nil {
387 | m := fmt.Sprintf("%v %v", string(msg), op.Name())
388 | op.cbLeader(data, []byte(m))
389 | }
390 | }),
391 | spindle.WithLogger(op.logger),
392 | )
393 |
394 | spindleDone := make(chan error, 1)
395 | ctxSpindle, cancel := context.WithCancel(context.Background())
396 | op.Lock.Run(ctxSpindle, spindleDone)
397 | defer func() {
398 | cancel() // stop spindle;
399 | <-spindleDone // and wait
400 | }()
401 |
402 | // Start tracking online members.
403 | op.members[op.hostPort] = struct{}{}
404 | membersDone := make(chan error, 1)
405 | ctxMembers := cctx(ctx)
406 | first := make(chan struct{}, 1)
407 | first <- struct{}{} // immediately the first time
408 | ticker := time.NewTicker(op.syncInterval)
409 | defer func() {
410 | ticker.Stop()
411 | <-membersDone
412 | }()
413 |
414 | go func() {
415 | var active atomic.Int32
416 | fnEnsureMembers := func() {
417 | active.Store(1)
418 | defer active.Store(0)
419 | ch := make(chan *string)
420 | emdone := make(chan struct{}, 1)
421 | todel := []string{}
422 | go func() {
423 | for {
424 | m := <-ch
425 | switch m {
426 | case nil:
427 | emdone <- struct{}{}
428 | return
429 | default:
430 | todel = append(todel, *m)
431 | }
432 | }
433 | }()
434 |
435 | var w sync.WaitGroup
436 | allm := op.getMembers()
437 | oldallm := make(map[string]struct{})
438 | maps.Copy(oldallm, allm)
439 |
440 | for k := range allm {
441 | w.Add(1)
442 | go func(id string) {
443 | defer func() { w.Done() }()
444 | timeout := time.Second * 5
445 | conn, err := net.DialTimeout("tcp", id, timeout)
446 | if err != nil {
447 | ch <- &id // delete this
448 | return
449 | }
450 |
451 | var sb strings.Builder
452 | fmt.Fprintf(&sb, "%s\n", CmdPing)
453 | r, err := op.send(conn, sb.String())
454 | if err != nil {
455 | ch <- &id // delete this
456 | return
457 | }
458 |
459 | if r != CmdAck {
460 | ch <- &id // delete this
461 | }
462 | }(k)
463 | }
464 |
465 | w.Wait()
466 | ch <- nil // close;
467 | <-emdone // and wait
468 | for _, rm := range todel {
469 | if rm != "" {
470 | op.logger.Printf("[leader] delete %v", rm)
471 | op.delMember(rm)
472 | }
473 | }
474 |
475 | newallm := op.getMembers()
476 | if len(oldallm) != len(newallm) && op.fnMemberChanged != nil {
477 | diff := len(newallm) - len(oldallm)
478 | op.fnMemberChanged(op.fnMemChangedData, []byte(fmt.Sprintf("%v", diff)))
479 | }
480 |
481 | // Broadcast active members to all.
482 | for k := range newallm {
483 | w.Add(1)
484 | go func(id string) {
485 | defer w.Done()
486 | timeout := time.Second * 5
487 | conn, err := net.DialTimeout("tcp", id, timeout)
488 | if err != nil {
489 | return
490 | }
491 |
492 | defer conn.Close()
493 | var sb strings.Builder
494 | fmt.Fprintf(&sb, "%s %s\n", CmdMembers, op.encodeMembers())
495 | op.send(conn, sb.String())
496 | }(k)
497 | }
498 |
499 | w.Wait()
500 | }
501 |
502 | var hbactive atomic.Int32
503 | fnHeartbeat := func() {
504 | hbactive.Store(1)
505 | defer hbactive.Store(0)
506 | lconn, err := op.getLeaderConn(ctx)
507 | if err != nil {
508 | return
509 | }
510 |
511 | if lconn != nil {
512 | defer lconn.Close()
513 | }
514 |
515 | var sb strings.Builder
516 | fmt.Fprintf(&sb, "%s %s\n", CmdPing, op.hostPort)
517 | r, err := op.send(lconn, sb.String())
518 | if err != nil {
519 | return
520 | }
521 |
522 | b, _ := base64.StdEncoding.DecodeString(r)
523 | var allm map[string]struct{}
524 | json.Unmarshal(b, &allm)
525 | op.setMembers(allm)
526 | }
527 |
528 | for {
529 | select {
530 | case <-ctxMembers.Done():
531 | membersDone <- nil
532 | return
533 | case <-first:
534 | case <-ticker.C:
535 | }
536 |
537 | if op.fnBroadcast == nil {
538 | op.logger.Println("no broadcast support")
539 | membersDone <- nil
540 | return
541 | }
542 |
543 | if hbactive.Load() == 0 {
544 | go fnHeartbeat() // tell leader we're online
545 | }
546 |
547 | if hl, _ := op.HasLock(); !hl {
548 | continue
549 | }
550 |
551 | if active.Load() == 0 {
552 | go fnEnsureMembers() // leader only
553 | }
554 | }
555 | }()
556 |
557 | op.active.Store(1)
558 | defer op.active.Store(0)
559 |
560 | <-ctx.Done() // wait for termination
561 |
562 | exitedTCP.Store(1) // don't print err in tl.Accept
563 | tl.Close() // will cause tl.Accept to fail
564 |
565 | gs.GracefulStop() // stop grpc server
566 | if op.ensureOn.Load() == 1 {
567 | op.ensureCancel() // stop semaphore checker;
568 | <-op.ensureDone // and wait
569 | }
570 |
571 | return nil
572 | }
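// A minimal startup sketch (illustrative only, not part of the original
// source), condensed from example/demo/main.go; table and lock names are
// placeholders.
//
//	ctx, cancel := context.WithCancel(context.Background())
//	op := hedge.New(client, ":8080", "locktable", "hedge-group", "logtable")
//	done := make(chan error, 1)
//	go op.Run(ctx, done)
//
//	// ...
//	cancel() // stop hedge
//	<-done   // wait for Run to finish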
573 |
574 | // NewSemaphore returns a distributed semaphore object.
575 | func (op *Op) NewSemaphore(ctx context.Context, name string, limit int) (*Semaphore, error) {
576 | if op.logTable == "" {
577 | return nil, ErrNotSupported
578 | }
579 |
580 | if op.active.Load() != 1 {
581 | return nil, ErrNotRunning
582 | }
583 |
584 | if strings.Contains(name, " ") {
585 | return nil, fmt.Errorf("name cannot have whitespace(s)")
586 | }
587 |
588 | conn, err := op.getLeaderConn(ctx)
589 | if err != nil {
590 | return nil, err
591 | }
592 |
593 | if conn != nil {
594 | defer conn.Close()
595 | }
596 |
597 | var sb strings.Builder
598 | fmt.Fprintf(&sb, "%s %s %d %s\n", CmdSemaphore, name, limit, op.hostPort)
599 | reply, err := op.send(conn, sb.String())
600 | if err != nil {
601 | return nil, err
602 | }
603 |
604 | switch {
605 | case strings.HasPrefix(reply, CmdAck):
606 | ss := strings.Split(reply, " ")
607 | if len(ss) > 1 { // failed
608 | dec, _ := base64.StdEncoding.DecodeString(ss[1])
609 | return nil, fmt.Errorf("%v", string(dec))
610 | }
611 | default:
612 | return nil, ErrNotSupported
613 | }
614 |
615 | return &Semaphore{name, limit, op}, nil
616 | }
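// A caller-side sketch (illustrative only, not part of the original source),
// based on the commented-out semaphore code in example/demo/main.go; the name
// "job-slots" and the limit are placeholders.
//
//	sem, err := op.NewSemaphore(ctx, "job-slots", 3)
//	if err != nil {
//		return err
//	}
//	if err = sem.Acquire(ctx); err != nil {
//		return err
//	}
//	defer sem.Release(ctx)
//	// ...at most 3 holders across the cluster get here concurrently...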
617 |
618 | // NewSoS returns an object for writing data to spill-over
619 | // storage across the cluster. The order of writing is local
620 | // memory, local disk, other pod's memory, other pod's disk,
621 | // and so on.
622 | func (op *Op) NewSoS(name string, opts ...*SoSOptions) *SoS {
623 | op.sosLock.Lock()
624 | defer op.sosLock.Unlock()
625 | if _, ok := op.soss[name]; ok {
626 | return op.soss[name]
627 | }
628 |
629 | op.soss[name] = newSoS(name, op, opts...)
630 | return op.soss[name]
631 | }
632 |
633 | // Get reads a key (or keys) from Op.
634 | // The values of limit are:
635 | //
636 | // limit = 0 --> (default) latest only
637 | // limit = -1 --> all (latest to oldest, [0]=latest)
638 | // limit = -2 --> oldest version only
639 | // limit > 0 --> items behind latest; 3 means latest + 2 versions behind, [0]=latest
640 | func (op *Op) Get(ctx context.Context, key string, limit ...int64) ([]KeyValue, error) {
641 | if op.logTable == "" {
642 | return nil, ErrNotSupported
643 | }
644 |
645 | ret := []KeyValue{}
646 | var q strings.Builder
647 | fmt.Fprintf(&q, "select key, value, timestamp ")
648 | fmt.Fprintf(&q, "from %s ", op.logTable)
649 | fmt.Fprintf(&q, "where key = @key and timestamp is not null ")
650 | fmt.Fprintf(&q, "order by timestamp desc limit 1")
651 |
652 | if len(limit) > 0 {
653 | switch {
654 | case limit[0] > 0:
655 | q.Reset()
656 | fmt.Fprintf(&q, "select key, value, timestamp ")
657 | fmt.Fprintf(&q, "from %s ", op.logTable)
658 | fmt.Fprintf(&q, "where key = @key and timestamp is not null ")
659 | fmt.Fprintf(&q, "order by timestamp desc limit %v", limit[0])
660 | case limit[0] == -1:
661 | q.Reset()
662 | fmt.Fprintf(&q, "select key, value, timestamp ")
663 | fmt.Fprintf(&q, "from %s ", op.logTable)
664 | fmt.Fprintf(&q, "where key = @key and timestamp is not null ")
665 | fmt.Fprintf(&q, "order by timestamp desc")
666 | case limit[0] == -2:
667 | q.Reset()
668 | fmt.Fprintf(&q, "select key, value, timestamp ")
669 | fmt.Fprintf(&q, "from %s ", op.logTable)
670 | fmt.Fprintf(&q, "where key = @key and timestamp is not null ")
671 | fmt.Fprintf(&q, "order by timestamp limit 1")
672 | }
673 | }
674 |
675 | stmt := spanner.Statement{SQL: q.String(), Params: map[string]any{"key": key}}
676 | iter := op.spannerClient.Single().Query(ctx, stmt)
677 | defer iter.Stop()
678 | for {
679 | row, err := iter.Next()
680 | if err == iterator.Done {
681 | break
682 | }
683 |
684 | if err != nil {
685 | return ret, err
686 | }
687 |
688 | var li LogItem
689 | err = row.ToStruct(&li)
690 | if err != nil {
691 | return ret, err
692 | }
693 |
694 | ret = append(ret, KeyValue{
695 | Key: li.Key,
696 | Value: li.Value,
697 | Timestamp: li.Timestamp,
698 | })
699 | }
700 |
701 | return ret, nil
702 | }
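// Illustrative calls for the limit values documented above (not part of the
// original source); "config" is a placeholder key.
//
//	latest, _ := op.Get(ctx, "config")     // latest only
//	last3, _ := op.Get(ctx, "config", 3)   // latest + 2 older versions, [0]=latest
//	all, _ := op.Get(ctx, "config", -1)    // full history, [0]=latest
//	oldest, _ := op.Get(ctx, "config", -2) // oldest version only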
703 |
704 | type PutOptions struct {
705 | // If true, do a direct write, no need to fwd to leader.
706 | DirectWrite bool
707 |
708 | 	// If true, don't do an append-write; overwrite the latest value instead.
709 | 	// Note that even if you set this to true, a subsequent Put with this field
710 | 	// set to false (the default) appends again, making the previous write just
711 | 	// another entry in the history.
712 | NoAppend bool
713 | }
714 |
715 | // Put saves a key/value to Op. This call will block for roughly up to
716 | // spindle's timeout, waiting for the leader to become available to do the
717 | // actual write, before returning.
718 | func (op *Op) Put(ctx context.Context, kv KeyValue, po ...PutOptions) error {
719 | if op.logTable == "" {
720 | return ErrNotSupported
721 | }
722 |
723 | var err error
724 | var direct, noappend, hl bool
725 | if len(po) > 0 {
726 | direct = po[0].DirectWrite
727 | noappend = po[0].NoAppend
728 | } else {
729 | hl, _ = op.HasLock()
730 | }
731 |
732 | id := uuid.NewString()
733 | if noappend {
734 | id = "-"
735 | }
736 |
737 | if direct || hl {
738 | b, _ := json.Marshal(kv)
739 | op.logger.Printf("[Put] leader: direct write: %v", string(b))
740 | _, err := op.spannerClient.Apply(ctx, []*spanner.Mutation{
741 | spanner.InsertOrUpdate(op.logTable,
742 | []string{"id", "key", "value", "leader", "timestamp"},
743 | []any{id, kv.Key, kv.Value, op.hostPort, spanner.CommitTimestamp},
744 | ),
745 | })
746 |
747 | return err
748 | }
749 |
750 | 	// For non-leaders, we confirm the leader via spindle, then ask the leader to do
751 | 	// the actual write for us. We do a couple of retries, up to spindle's timeout.
752 | conn, err := op.getLeaderConn(ctx)
753 | if err != nil {
754 | return err
755 | }
756 |
757 | if conn != nil {
758 | defer conn.Close()
759 | }
760 |
761 | b, _ := json.Marshal(kv)
762 | enc := base64.StdEncoding.EncodeToString(b)
763 | var sb strings.Builder
764 | fmt.Fprintf(&sb, "%s %s\n", CmdWrite, enc)
765 | if noappend {
766 | sb.Reset()
767 | fmt.Fprintf(&sb, "%s %s %s\n", CmdWrite, enc, FlagNoAppend)
768 | }
769 |
770 | reply, err := op.send(conn, sb.String())
771 | if err != nil {
772 | return err
773 | }
774 |
775 | switch {
776 | case strings.HasPrefix(reply, CmdAck):
777 | ss := strings.Split(reply, " ")
778 | if len(ss) > 1 { // failed
779 | dec, _ := base64.StdEncoding.DecodeString(ss[1])
780 | return fmt.Errorf("%v", string(dec))
781 | }
782 | default:
783 | return ErrNoLeader
784 | }
785 |
786 | return nil
787 | }
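// Illustrative calls (not part of the original source); key and values are
// placeholders.
//
//	// Normal append-write; forwarded to the leader if this node isn't it:
//	err := op.Put(ctx, hedge.KeyValue{Key: "config", Value: "v1"})
//
//	// Overwrite the latest version instead of appending a new one:
//	err = op.Put(ctx, hedge.KeyValue{Key: "config", Value: "v2"},
//		hedge.PutOptions{NoAppend: true})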
788 |
789 | // Send sends msg to the current leader. Any node can send messages,
790 | // including the leader itself (send to self). It also blocks until
791 | // it receives the reply from the leader's message handler.
792 | func (op *Op) Send(ctx context.Context, msg []byte) ([]byte, error) {
793 | conn, err := op.getLeaderConn(ctx)
794 | if err != nil {
795 | return nil, err
796 | }
797 |
798 | if conn != nil {
799 | defer conn.Close()
800 | }
801 |
802 | enc := base64.StdEncoding.EncodeToString(msg)
803 | var sb strings.Builder
804 | fmt.Fprintf(&sb, "%s %s\n", CmdSend, enc)
805 | reply, err := op.send(conn, sb.String())
806 | if err != nil {
807 | return nil, err
808 | }
809 |
810 | switch {
811 | case strings.HasPrefix(reply, CmdAck): // expect "ACK base64(reply)"
812 | ss := strings.Split(reply, " ")
813 | if len(ss) > 1 {
814 | return base64.StdEncoding.DecodeString(ss[1])
815 | }
816 | }
817 |
818 | // If not ACK, then the whole reply is an error string.
819 | b, _ := base64.StdEncoding.DecodeString(reply)
820 | return nil, fmt.Errorf("%v", string(b))
821 | }
822 |
823 | type StreamToLeaderOutput struct {
824 | In chan *StreamMessage `json:"in"`
825 | Out chan *StreamMessage `json:"out"`
826 | }
827 |
828 | // StreamToLeader returns input and output channels for streaming to the leader.
829 | // To use the channels, send your request message(s) to the input channel, close
830 | // it (i.e. close(input)), then read the replies from the output channel. This
831 | // function will close the output channel when done.
832 | //
833 | // StreamToLeader is sequential in the sense that you need to send all your input
834 | // messages first before getting any response from the leader.
835 | func (op *Op) StreamToLeader(ctx context.Context) (*StreamToLeaderOutput, error) {
836 | if op.leaderStreamIn == nil || op.leaderStreamOut == nil {
837 | return nil, fmt.Errorf("hedge: input/output channel(s) cannot be nil")
838 | }
839 |
840 | conn, err := op.getLeaderGrpcConn(ctx)
841 | if err != nil {
842 | return nil, err
843 | }
844 |
845 | client := pb.NewHedgeClient(conn)
846 | stream, err := client.Send(ctx)
847 | if err != nil {
848 | return nil, err
849 | }
850 |
851 | keyId := "id"
852 | id := uuid.NewString()
853 | reply := make(chan error)
854 | ret := StreamToLeaderOutput{
855 | In: make(chan *StreamMessage),
856 | Out: make(chan *StreamMessage),
857 | }
858 |
859 | // Exit only when input channel is closed by the caller.
860 | // We don't wait for this goroutine.
861 | go func() {
862 | var err error
863 | for m := range ret.In {
864 | if m.Payload.Meta == nil {
865 | m.Payload.Meta = map[string]string{keyId: id}
866 | } else {
867 | if _, ok := m.Payload.Meta[keyId]; !ok {
868 | m.Payload.Meta[keyId] = id
869 | }
870 | }
871 |
872 | err = stream.Send(m.Payload)
873 | if err != nil {
874 | break
875 | }
876 | }
877 |
878 | stream.CloseSend()
879 | reply <- err
880 | }()
881 |
882 | // Exit only when streaming response is done.
883 | // We don't wait for this goroutine.
884 | go func() {
885 | defer func() {
886 | close(ret.Out)
887 | conn.Close()
888 | }()
889 |
890 | err := <-reply
891 | if err != nil {
892 | ret.Out <- &StreamMessage{Error: err}
893 | return
894 | }
895 |
896 | for {
897 | resp, err := stream.Recv()
898 | if err == io.EOF {
899 | return
900 | }
901 |
902 | ret.Out <- &StreamMessage{Payload: resp}
903 | }
904 | }()
905 |
906 | return &ret, nil
907 | }
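// A minimal usage sketch: write all inputs, close the input channel, then drain the
// replies. It assumes the leader stream channels were configured through the
// corresponding option when op was created, and that the protobuf Payload message
// carries a Data field (see the proto definition for the actual layout).
func exampleStreamToLeader(ctx context.Context, op *Op) error {
    out, err := op.StreamToLeader(ctx)
    if err != nil {
        return err
    }

    for i := 0; i < 3; i++ {
        out.In <- &StreamMessage{Payload: &pb.Payload{Data: []byte(fmt.Sprintf("msg-%d", i))}}
    }

    close(out.In) // all inputs sent; replies will now stream in

    for m := range out.Out {
        if m.Error != nil {
            return m.Error
        }

        log.Printf("reply: %v", string(m.Payload.Data))
    }

    return nil
}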
908 |
909 | type BroadcastOutput struct {
910 | Id string `json:"id,omitempty"`
911 | Reply []byte `json:"reply,omitempty"`
912 | Error error `json:"error,omitempty"`
913 | }
914 |
915 | type BroadcastArgs struct {
916 | SkipSelf bool // if true, skip broadcasting to self
917 | Out chan BroadcastOutput
918 | OnlySendTo []string // if set, only send to these member/s
919 | }
920 |
921 | // Broadcast sends msg to all nodes (send to all). Any node can broadcast messages, including
922 | // the leader itself. Note that this is on a best-effort basis only; by the time you call this
923 | // API, the handler might not have all the active members on record yet, as is usual with k8s
924 | // deployments, where pods come and go and our internal heartbeat protocol hasn't completed
925 | // yet. This call also blocks until it receives the replies from all nodes' broadcast
926 | // handlers.
927 | //
928 | // If args[0].Out is set, the output will be streamed to that channel instead. Useful if you
929 | // prefer a streamed output (as each reply comes in) instead of waiting for all replies before
930 | // returning. If set, the returned slice will be empty. Also, close() will be called on the Out
931 | // channel to indicate the end of streaming.
932 | func (op *Op) Broadcast(ctx context.Context, msg []byte, args ...BroadcastArgs) []BroadcastOutput {
933 | if op.active.Load() != 1 || op.fnBroadcast == nil {
934 | return nil // not running or no broadcast support
935 | }
936 |
937 | var stream bool
938 | outs := []BroadcastOutput{}
939 | var w sync.WaitGroup
940 | var outch chan BroadcastOutput
941 | members := op.getMembers()
942 | if len(args) > 0 && args[0].SkipSelf {
943 | delete(members, op.Name())
944 | }
945 |
946 | if len(args) > 0 && len(args[0].OnlySendTo) > 0 {
947 | filtered := make(map[string]struct{})
948 | for _, v := range args[0].OnlySendTo {
949 | if _, ok := members[v]; ok {
950 | filtered[v] = struct{}{}
951 | }
952 | }
953 | members = filtered
954 | }
955 |
956 | switch {
957 | case len(args) > 0 && args[0].Out != nil:
958 | outch = args[0].Out
959 | stream = true
960 | default:
961 | outch = make(chan BroadcastOutput, len(members))
962 | }
963 |
964 | for k := range members {
965 | w.Add(1)
966 | go func(id string) {
967 | defer w.Done()
968 | timeout := time.Second * 5
969 | conn, err := net.DialTimeout("tcp", id, timeout)
970 | if err != nil {
971 | outch <- BroadcastOutput{Id: id, Error: err}
972 | return
973 | }
974 |
975 | defer conn.Close()
976 | enc := base64.StdEncoding.EncodeToString(msg)
977 | var sb strings.Builder
978 | fmt.Fprintf(&sb, "%s %s\n", CmdBroadcast, enc)
979 | reply, err := op.send(conn, sb.String())
980 | if err != nil {
981 | outch <- BroadcastOutput{Id: id, Error: err}
982 | return
983 | }
984 |
985 | switch {
986 | case strings.HasPrefix(reply, CmdAck): // expect "ACK base64(reply)"
987 | ss := strings.Split(reply, " ")
988 | if len(ss) > 1 {
989 | r, e := base64.StdEncoding.DecodeString(ss[1])
990 | outch <- BroadcastOutput{Id: id, Reply: r, Error: e}
991 | return
992 | }
993 | }
994 |
995 | // If not ACK, then the whole reply is an error string.
996 | r, _ := base64.StdEncoding.DecodeString(reply)
997 | outch <- BroadcastOutput{Id: id, Error: fmt.Errorf("%v", string(r))}
998 | }(k)
999 | }
1000 |
1001 | w.Wait()
1002 | switch {
1003 | case stream:
1004 | close(args[0].Out)
1005 | default:
1006 | for range members {
1007 | outs = append(outs, <-outch)
1008 | }
1009 | }
1010 |
1011 | return outs
1012 | }
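// A minimal usage sketch (assuming op is already running): broadcast a message to all
// known members and inspect each reply, then the streamed variant using BroadcastArgs.Out.
func exampleBroadcast(ctx context.Context, op *Op) {
    for _, v := range op.Broadcast(ctx, []byte("ping")) {
        if v.Error != nil {
            log.Printf("%v failed: %v", v.Id, v.Error)
            continue
        }

        log.Printf("%v replied: %v", v.Id, string(v.Reply))
    }

    // Streamed variant: replies arrive on the channel as they come in, and
    // Broadcast closes the channel when all nodes have replied.
    outch := make(chan BroadcastOutput)
    done := make(chan struct{})
    go func() {
        defer close(done)
        for o := range outch {
            log.Printf("streamed reply from %v: %v", o.Id, string(o.Reply))
        }
    }()

    op.Broadcast(ctx, []byte("ping"), BroadcastArgs{Out: outch})
    <-done
}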
1013 |
1014 | type StreamBroadcastArgs struct {
1015 | SkipSelf bool // if true, skip broadcasting to self
1016 | }
1017 |
1018 | type StreamBroadcastOutput struct {
1019 | In chan *StreamMessage
1020 | Outs map[string]chan *StreamMessage
1021 | }
1022 |
1023 | // StreamBroadcast returns input and output channels for doing streaming broadcasts. Any node can broadcast messages,
1024 | // including the leader itself. Note that this is on a best-effort basis only; by the time you call this API, the handler
1025 | // might not have all the active members on record yet, as is usual with k8s deployments, where pods come and go
1026 | // and our internal heartbeat protocol hasn't completed yet. This call also blocks until it receives the replies
1027 | // from all nodes' broadcast handlers.
1028 | //
1029 | // To use the channels, send your request message(s) to the input channel, close it (i.e. close(input)), then read
1030 | // the replies from the output channels. This function will close all output channels when done.
1031 | //
1032 | // StreamBroadcast is sequential in the sense that you need to send all your input messages first before getting
1033 | // any response from all the nodes.
1034 | func (op *Op) StreamBroadcast(ctx context.Context, args ...StreamBroadcastArgs) (*StreamBroadcastOutput, error) {
1035 | if op.active.Load() != 1 {
1036 | return nil, nil // not running
1037 | }
1038 |
1039 | members := op.getMembers()
1040 | if len(args) > 0 && args[0].SkipSelf {
1041 | delete(members, op.Name())
1042 | }
1043 |
1044 | ret := StreamBroadcastOutput{
1045 | In: make(chan *StreamMessage),
1046 | Outs: make(map[string]chan *StreamMessage),
1047 | }
1048 |
1049 | _, gp, _ := net.SplitHostPort(op.grpcHostPort)
1050 | conns := make(map[string]*grpc.ClientConn)
1051 | streams := make(map[string]pb.Hedge_BroadcastClient)
1052 | for k := range members {
1053 | h, _, _ := net.SplitHostPort(k)
1054 | gHostPort := net.JoinHostPort(h, gp)
1055 |
1056 | var opts []grpc.DialOption
1057 | opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials()))
1058 | lconn, err := grpc.NewClient(gHostPort, opts...)
1059 | if err != nil {
1060 | return nil, err
1061 | }
1062 |
1063 | conns[k] = lconn
1064 | client := pb.NewHedgeClient(lconn)
1065 | stream, err := client.Broadcast(ctx)
1066 | if err != nil {
1067 | continue
1068 | }
1069 |
1070 | streams[gHostPort] = stream
1071 | ret.Outs[gHostPort] = make(chan *StreamMessage)
1072 | }
1073 |
1074 | keyId := "id"
1075 | id := uuid.NewString()
1076 | reply := make(chan error)
1077 |
1078 | // Exit only when input channel is closed by the caller.
1079 | // We don't wait for this goroutine.
1080 | go func() {
1081 | for m := range ret.In {
1082 | if m.Payload.Meta == nil {
1083 | m.Payload.Meta = map[string]string{keyId: id}
1084 | } else {
1085 | if _, ok := m.Payload.Meta[keyId]; !ok {
1086 | m.Payload.Meta[keyId] = id
1087 | }
1088 | }
1089 |
1090 | for _, v := range streams {
1091 | v.Send(m.Payload)
1092 | }
1093 | }
1094 |
1095 | for _, v := range streams {
1096 | v.CloseSend()
1097 | }
1098 |
1099 | reply <- nil
1100 | }()
1101 |
1102 | // Exit only when all streaming responses from all nodes are done.
1103 | // We don't wait for this goroutine.
1104 | go func() {
1105 | defer func() {
1106 | for _, v := range ret.Outs {
1107 | close(v)
1108 | }
1109 |
1110 | for _, v := range conns {
1111 | v.Close()
1112 | }
1113 | }()
1114 |
1115 | <-reply // input done
1116 |
1117 | var w sync.WaitGroup
1118 | for k, v := range streams {
1119 | w.Add(1)
1120 | go func(node string, stream pb.Hedge_BroadcastClient) {
1121 | defer w.Done()
1122 | for {
1123 | resp, err := stream.Recv()
1124 | if err == io.EOF {
1125 | return
1126 | }
1127 |
1128 | ret.Outs[node] <- &StreamMessage{Payload: resp}
1129 | }
1130 | }(k, v)
1131 | }
1132 |
1133 | w.Wait()
1134 | }()
1135 |
1136 | return &ret, nil
1137 | }
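// A minimal usage sketch: stream inputs to all members, close the input channel, then
// drain every member's output channel. As above, the Payload's Data field is assumed;
// see the proto definition for the actual message layout.
func exampleStreamBroadcast(ctx context.Context, op *Op) error {
    out, err := op.StreamBroadcast(ctx)
    if err != nil || out == nil {
        return err
    }

    out.In <- &StreamMessage{Payload: &pb.Payload{Data: []byte("hello-all")}}
    close(out.In) // all inputs sent; replies will now stream in per member

    var wg sync.WaitGroup
    for node, ch := range out.Outs {
        wg.Add(1)
        go func(node string, ch chan *StreamMessage) {
            defer wg.Done()
            for m := range ch {
                log.Printf("%v replied: %v", node, string(m.Payload.Data))
            }
        }(node, ch)
    }

    wg.Wait()
    return nil
}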
1138 |
1139 | // Members returns a list of members in the cluster/group.
1140 | func (op *Op) Members() []string {
1141 | members := []string{}
1142 | m := op.getMembers()
1143 | for k := range m {
1144 | members = append(members, k)
1145 | }
1146 |
1147 | return members
1148 | }
1149 |
1150 | func (op *Op) send(conn net.Conn, msg string) (string, error) {
1151 | if conn == nil {
1152 | return "", ErrInvalidConn
1153 | }
1154 |
1155 | _, err := conn.Write([]byte(msg))
1156 | if err != nil {
1157 | return "", err
1158 | }
1159 |
1160 | return op.recv(conn)
1161 | }
1162 |
1163 | func (op *Op) recv(conn net.Conn) (string, error) {
1164 | if conn == nil {
1165 | return "", ErrInvalidConn
1166 | }
1167 |
1168 | buffer, err := bufio.NewReader(conn).ReadString('\n')
1169 | if err != nil {
1170 | return "", err
1171 | }
1172 |
1173 | var reply string
1174 | if buffer != "" {
1175 | reply = buffer[:len(buffer)-1]
1176 | }
1177 |
1178 | return reply, nil
1179 | }
1180 |
1181 | func (op *Op) buildAckReply(err error) string {
1182 | 	var sb strings.Builder
1183 | 	if err != nil {
1184 | 		ee := base64.StdEncoding.EncodeToString([]byte(err.Error()))
1185 | 		fmt.Fprintf(&sb, "%s %s\n", CmdAck, ee)
1186 | 		return sb.String()
1187 | 	}
1188 |
1189 | 	fmt.Fprintf(&sb, "%s\n", CmdAck)
1190 | 	return sb.String()
1191 | }
1192 |
1193 | func (op *Op) getLeaderConn(ctx context.Context) (net.Conn, error) {
1194 | var conn net.Conn
1195 | var err error
1196 | subctx := context.WithValue(ctx, struct{}{}, nil)
1197 | first := make(chan struct{}, 1)
1198 | first <- struct{}{} // immediately the first time
1199 | tcnt, tlimit := int64(0), (op.lockTimeout/2000)*2
1200 | ticker := time.NewTicker(time.Second * 2) // processing can be more than this
1201 | defer ticker.Stop()
1202 |
1203 | var active atomic.Int32
1204 | getConn := func() (net.Conn, error) {
1205 | active.Store(1)
1206 | defer active.Store(0)
1207 | timeout := time.Second * 5
1208 | leader, err := op.Leader()
1209 | if err != nil {
1210 | return nil, err
1211 | }
1212 |
1213 | if leader == "" {
1214 | return nil, ErrNoLeader
1215 | }
1216 |
1217 | lconn, err := net.DialTimeout("tcp", leader, timeout)
1218 | if err != nil {
1219 | return nil, err
1220 | }
1221 |
1222 | defer lconn.Close()
1223 | var sb strings.Builder
1224 | fmt.Fprintf(&sb, "%s\n", CmdLeader)
1225 | reply, err := op.send(lconn, sb.String())
1226 | if err != nil {
1227 | return nil, err
1228 | }
1229 |
1230 | if !strings.HasPrefix(reply, CmdAck) {
1231 | return nil, ErrNoLeader
1232 | }
1233 |
1234 | // Create a new connection to the confirmed leader.
1235 | return net.DialTimeout("tcp", leader, timeout)
1236 | }
1237 |
1238 | type connT struct {
1239 | conn net.Conn
1240 | err error
1241 | }
1242 |
1243 | for {
1244 | select {
1245 | case <-subctx.Done():
1246 | return nil, context.Canceled
1247 | case <-first:
1248 | case <-ticker.C:
1249 | }
1250 |
1251 | if active.Load() == 1 {
1252 | continue
1253 | }
1254 |
1255 | ch := make(chan connT, 1)
1256 | go func() {
1257 | c, e := getConn()
1258 | ch <- connT{c, e}
1259 | }()
1260 |
1261 | res := <-ch
1262 | conn = res.conn
1263 | err = res.err
1264 |
1265 | tcnt++
1266 | if err == nil || (tcnt > tlimit) {
1267 | break
1268 | }
1269 | }
1270 |
1271 | 	return conn, err // surface the last attempt's error if retries were exhausted
1272 | }
1273 |
1274 | // Don't forget to close the returned connection.
1275 | func (op *Op) getLeaderGrpcConn(ctx context.Context) (*grpc.ClientConn, error) {
1276 | var conn *grpc.ClientConn
1277 | var err error
1278 | subctx := context.WithValue(ctx, struct{}{}, nil)
1279 | first := make(chan struct{}, 1)
1280 | first <- struct{}{} // immediately the first time
1281 | tcnt, tlimit := int64(0), (op.lockTimeout/2000)*2
1282 | ticker := time.NewTicker(time.Second * 2) // processing can be more than this
1283 | defer ticker.Stop()
1284 |
1285 | var active atomic.Int32
1286 | getConn := func() (*grpc.ClientConn, error) {
1287 | active.Store(1)
1288 | defer active.Store(0)
1289 | leader, err := op.Leader()
1290 | if err != nil {
1291 | return nil, err
1292 | }
1293 |
1294 | if leader == "" {
1295 | return nil, ErrNoLeader
1296 | }
1297 |
1298 | // Get the gRPC host:port.
1299 | h, _, _ := net.SplitHostPort(leader)
1300 | _, gp, _ := net.SplitHostPort(op.grpcHostPort)
1301 | gleader := net.JoinHostPort(h, gp)
1302 |
1303 | var opts []grpc.DialOption
1304 | opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials()))
1305 | lconn, err := grpc.NewClient(gleader, opts...)
1306 | if err != nil {
1307 | return nil, err
1308 | }
1309 |
1310 | return lconn, nil
1311 | }
1312 |
1313 | type connT struct {
1314 | conn *grpc.ClientConn
1315 | err error
1316 | }
1317 |
1318 | for {
1319 | select {
1320 | case <-subctx.Done():
1321 | return nil, context.Canceled
1322 | case <-first:
1323 | case <-ticker.C:
1324 | }
1325 |
1326 | if active.Load() == 1 {
1327 | continue
1328 | }
1329 |
1330 | ch := make(chan connT, 1)
1331 | go func() {
1332 | c, e := getConn()
1333 | ch <- connT{c, e}
1334 | }()
1335 |
1336 | res := <-ch
1337 | conn = res.conn
1338 | err = res.err
1339 |
1340 | tcnt++
1341 | if err == nil || (tcnt > tlimit) {
1342 | break
1343 | }
1344 | }
1345 |
1346 | 	return conn, err // surface the last attempt's error if retries were exhausted
1347 | }
1348 |
1349 | func (op *Op) getMembers() map[string]struct{} {
1350 | 	op.mtx.Lock()
1351 | 	m := make(map[string]struct{}) // avoid shadowing the builtin copy
1352 | 	maps.Copy(m, op.members)
1353 | 	op.mtx.Unlock()
1354 | 	return m
1355 | }
1356 |
1357 | func (op *Op) encodeMembers() string {
1358 | op.mtx.Lock()
1359 | defer op.mtx.Unlock()
1360 | b, _ := json.Marshal(op.members)
1361 | return base64.StdEncoding.EncodeToString(b)
1362 | }
1363 |
1364 | func (op *Op) setMembers(m map[string]struct{}) {
1365 | op.mtx.Lock()
1366 | defer op.mtx.Unlock()
1367 | op.members = m
1368 | }
1369 |
1370 | func (op *Op) addMember(id string) {
1371 | op.mtx.Lock()
1372 | defer op.mtx.Unlock()
1373 | op.members[id] = struct{}{}
1374 | }
1375 |
1376 | func (op *Op) delMember(id string) {
1377 | op.mtx.Lock()
1378 | defer op.mtx.Unlock()
1379 | delete(op.members, id)
1380 | }
1381 |
1382 | // New creates an instance of Op. hostPort can be in "ip:port" format, or ":port" format, in which case
1383 | // the IP part will be resolved internally, or empty, in which case port 8080 will be used. The internal
1384 | // spindle object's lock table name will be lockTable, and lockName is the lock name. logTable will
1385 | // serve as our append-only, distributed key/value storage table. If logTable is empty, Put, Get, and
1386 | // Semaphore features will be disabled.
1387 | func New(client *spanner.Client, hostPort, lockTable, lockName, logTable string, opts ...Option) *Op {
1388 | op := &Op{
1389 | hostPort: hostPort,
1390 | spannerClient: client,
1391 | lockTable: lockTable,
1392 | lockName: lockName,
1393 | logTable: logTable,
1394 | members: make(map[string]struct{}),
1395 | ensureCh: make(chan string),
1396 | ensureDone: make(chan struct{}, 1),
1397 | sosLock: &sync.Mutex{},
1398 | soss: map[string]*SoS{},
1399 | Lock: &spindle.Lock{}, // init later
1400 | }
1401 |
1402 | for _, opt := range opts {
1403 | opt.Apply(op)
1404 | }
1405 |
1406 | host, port, _ := net.SplitHostPort(op.hostPort)
1407 | switch {
1408 | case host == "" && port != "":
1409 | // We will use memberlist for IP resolution.
1410 | list, _ := memberlist.Create(memberlist.DefaultLANConfig())
1411 | localNode := list.LocalNode()
1412 | lh, _, _ := net.SplitHostPort(localNode.Address())
1413 | op.hostPort = net.JoinHostPort(lh, port)
1414 | list.Shutdown()
1415 | case host == "" && port == "":
1416 | // We will use memberlist for IP resolution.
1417 | list, _ := memberlist.Create(memberlist.DefaultLANConfig())
1418 | localNode := list.LocalNode()
1419 | lh, _, _ := net.SplitHostPort(localNode.Address())
1420 | op.hostPort = net.JoinHostPort(lh, "8080")
1421 | list.Shutdown()
1422 | }
1423 |
1424 | // Our gRPC host:port by default is set to host:port+1.
1425 | if op.grpcHostPort == "" {
1426 | host, port, _ := net.SplitHostPort(op.hostPort)
1427 | pi, _ := strconv.Atoi(port)
1428 | op.grpcHostPort = net.JoinHostPort(host, fmt.Sprintf("%v", pi+1))
1429 | }
1430 |
1431 | switch {
1432 | case op.lockTimeout == 0:
1433 | op.lockTimeout = 30000 // default 30s
1434 | case op.lockTimeout < 2000:
1435 | op.lockTimeout = 2000 // minimum 2s
1436 | }
1437 |
1438 | switch {
1439 | case op.syncInterval == 0:
1440 | op.syncInterval = time.Second * 30 // default
1441 | case op.syncInterval < (time.Second * 2):
1442 | op.syncInterval = time.Second * 2 // minimum
1443 | }
1444 |
1445 | if op.logger == nil {
1446 | prefix := fmt.Sprintf("[hedge/%v] ", op.hostPort)
1447 | op.logger = log.New(os.Stdout, prefix, log.LstdFlags)
1448 | }
1449 |
1450 | return op
1451 | }
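// A minimal usage sketch: a New call with an existing Spanner client. The table and
// lock names here are placeholders, and the returned Op still has to be started (see
// the rest of this file) before calling Send, Broadcast, Put, and friends.
func exampleNew(client *spanner.Client) *Op {
    return New(
        client,
        ":8080",      // IP resolved internally; gRPC defaults to port+1 (8081)
        "locktable",  // spindle lock table name
        "hedge-demo", // spindle lock name
        "logtable",   // append-only key/value table; empty disables Put, Get, and Semaphore
    )
}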
1452 |
1453 | type SendToLeaderArgs struct {
1454 | // Number of retry attempts to contact the leader.
1455 | // Defaults to 10. If set to a negative number, it
1456 | // will retry forever.
1457 | Retries int
1458 | }
1459 |
1460 | // SendToLeader is a wrapper around Op.Send() with a built-in retry mechanism.
1461 | func SendToLeader(ctx context.Context, op *Op, m []byte, args ...*SendToLeaderArgs) ([]byte, error) {
1462 | if op == nil {
1463 | return nil, fmt.Errorf("hedge: op cannot be nil")
1464 | }
1465 |
1466 | retries := 10
1467 | if len(args) > 0 {
1468 | retries = args[0].Retries
1469 | }
1470 |
1471 | if retries == 0 {
1472 | retries = 10
1473 | }
1474 |
1475 | result := make(chan []byte, 1)
1476 | done := make(chan error, 1)
1477 | go func() {
1478 | var err error
1479 | var res []byte
1480 | defer func(b *[]byte, e *error) {
1481 | result <- *b
1482 | done <- *e
1483 | }(&res, &err)
1484 |
1485 | bo := gaxv2.Backoff{
1486 | Max: time.Minute,
1487 | }
1488 |
1489 | 		var i int
1490 | 		// Wait (with backoff) for this node to be running, honoring the retry
1491 | 		// count unless retries is negative (retry forever).
1492 | 		for !op.IsRunning() {
1493 | 			if i >= retries && retries >= 0 {
1494 | 				break
1495 | 			}
1496 |
1497 | 			time.Sleep(bo.Pause())
1498 |
1499 | 			if retries >= 0 {
1500 | 				i++
1501 | 			}
1502 | 		}
1503 |
1504 | i = 0
1505 | for {
1506 | if i >= retries && retries >= 0 {
1507 | err = fmt.Errorf("hedge: retries exhausted")
1508 | return
1509 | }
1510 |
1511 | var r []byte
1512 | r, err = op.Send(ctx, m)
1513 | if err != nil {
1514 | time.Sleep(bo.Pause())
1515 | } else {
1516 | res = r // to outside
1517 | return
1518 | }
1519 |
1520 | if retries >= 0 {
1521 | i++
1522 | }
1523 | }
1524 | }()
1525 |
1526 | for {
1527 | select {
1528 | case e := <-done:
1529 | return <-result, e
1530 | case <-ctx.Done():
1531 | return nil, context.Canceled
1532 | }
1533 | }
1534 | }
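// A minimal usage sketch: SendToLeader retries the underlying Send, which helps right
// after startup when leader election may not have settled yet.
func exampleSendToLeaderWithRetries(ctx context.Context, op *Op) {
    reply, err := SendToLeader(ctx, op, []byte("work-item"), &SendToLeaderArgs{Retries: 20})
    if err != nil {
        log.Printf("SendToLeader failed: %v", err)
        return
    }

    log.Printf("leader replied: %v", string(reply))
}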
1535 |
--------------------------------------------------------------------------------