├── assets └── hedge.png ├── .gitignore ├── localbuild.sh ├── testdata └── emuddl.sql ├── Dockerfile ├── LICENSE ├── .github └── workflows │ └── main.yml ├── hedge_test.go ├── deployment.yaml ├── go.mod ├── service.go ├── protocol.go ├── README.md ├── semaphore.go ├── example └── demo │ └── main.go ├── sos.go └── hedge.go /assets/hedge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flowerinthenight/hedge/HEAD/assets/hedge.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.exe 2 | *.exe~ 3 | *.dll 4 | *.so 5 | *.dylib 6 | *.test 7 | *.out 8 | deployment-private.yaml 9 | example/demo/demo 10 | readdata 11 | readlocs -------------------------------------------------------------------------------- /localbuild.sh: -------------------------------------------------------------------------------- 1 | kubectl delete -f deployment.yaml 2 | DOCKER_BUILDKIT=0 docker build --rm -t demo . 3 | DOCKER_BUILDKIT=0 docker tag demo asia.gcr.io/mobingi-main/hedge:$1 4 | DOCKER_BUILDKIT=0 docker push asia.gcr.io/mobingi-main/hedge:$1 5 | DOCKER_BUILDKIT=0 docker rmi $(docker images --filter "dangling=true" -q --no-trunc) -f 6 | [ -f deployment-private.yaml ] && sed -i -e 's/image\:\ asia.gcr.io\/mobingi\-main\/hedge[\:@].*$/image\:\ asia.gcr.io\/mobingi\-main\/hedge\:'$1'/g' deployment-private.yaml 7 | -------------------------------------------------------------------------------- /testdata/emuddl.sql: -------------------------------------------------------------------------------- 1 | -- for spindle 2 | CREATE TABLE locktable ( 3 | name STRING(MAX) NOT NULL, 4 | heartbeat TIMESTAMP OPTIONS (allow_commit_timestamp=true), 5 | token TIMESTAMP OPTIONS (allow_commit_timestamp=true), 6 | writer STRING(MAX) 7 | ) PRIMARY KEY (name); 8 | 9 | -- for hedge 10 | CREATE TABLE logtable ( 11 | id STRING(MAX), 12 | key STRING(MAX), 13 | value STRING(MAX), 14 | leader STRING(MAX), 15 | timestamp TIMESTAMP OPTIONS (allow_commit_timestamp=true) 16 | ) PRIMARY KEY (key, id) 17 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.25.1-trixie 2 | COPY . /go/src/github.com/flowerinthenight/hedge/ 3 | WORKDIR /go/src/github.com/flowerinthenight/hedge/example/demo/ 4 | RUN CGO_ENABLED=0 GOOS=linux go build -v -trimpath -installsuffix cgo -o hedge . 5 | 6 | FROM debian:stable-slim 7 | RUN set -x && apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* 8 | WORKDIR /app/ 9 | COPY --from=0 /go/src/github.com/flowerinthenight/hedge/example/demo/hedge . 
10 | ENTRYPOINT ["/app/hedge"] 11 | CMD ["-db=projects/{project}/instances/{instance}/databases/{database}"] 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 flowerinthenight 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: main 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | tags: 7 | - '*' 8 | pull_request: 9 | branches: [ main ] 10 | 11 | jobs: 12 | codeberg: 13 | name: Codeberg 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | 21 | - name: Mirror to Codeberg 22 | uses: yesolutions/mirror-action@master 23 | with: 24 | REMOTE: "https://codeberg.org/flowerinthenight/hedge.git" 25 | GIT_USERNAME: flowerinthenight 26 | GIT_PASSWORD: ${{ secrets.GIT_PASSWORD }} 27 | 28 | build: 29 | name: Build 30 | if: "!contains(github.event.commits[0].message, 'ci skip')" 31 | runs-on: ubuntu-latest 32 | services: 33 | emulator: 34 | image: gcr.io/cloud-spanner-emulator/emulator 35 | ports: 36 | - 9010:9010 37 | - 9020:9020 38 | 39 | steps: 40 | - uses: actions/checkout@v4 41 | 42 | - uses: actions/setup-go@v4 43 | with: 44 | go-version: '1.25' 45 | 46 | - name: 'Test using emulator' 47 | run: | 48 | curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-536.0.1-linux-x86_64.tar.gz 49 | tar xvzf google-cloud-sdk-536.0.1-linux-x86_64.tar.gz && ./google-cloud-sdk/install.sh --quiet 50 | gcloud config configurations create emulator 51 | gcloud config set auth/disable_credentials true 52 | gcloud config set project test-project 53 | gcloud config set api_endpoint_overrides/spanner http://localhost:9020/ 54 | gcloud spanner instances create test-instance --config=emulator-config --description="Test Instance" --nodes=1 55 | export SPANNER_EMULATOR_HOST=localhost:9010 56 | gcloud spanner databases create testdb --instance=test-instance --ddl-file=$PWD/testdata/emuddl.sql 57 | go test -v -run TestBasic 58 | -------------------------------------------------------------------------------- /hedge_test.go: -------------------------------------------------------------------------------- 1 | package 
hedge 2 | 3 | import ( 4 | "context" 5 | "log/slog" 6 | "testing" 7 | "time" 8 | 9 | "cloud.google.com/go/spanner" 10 | gaxv2 "github.com/googleapis/gax-go/v2" 11 | ) 12 | 13 | const ( 14 | db = "projects/test-project/instances/test-instance/databases/testdb" 15 | ) 16 | 17 | func TestAny(t *testing.T) { 18 | // TODO: 19 | d := []byte{} 20 | d = append(d, []byte("hello")...) 21 | d = append(d, []byte("world")...) 22 | d = append(d, []byte("stats")...) 23 | d = append(d, []byte("one")...) 24 | slog.Info("next:", "val", d[0:5]) 25 | slog.Info("next:", "val", d[5:10]) 26 | slog.Info("next:", "val", d[10:15]) 27 | slog.Info("next:", "val", d[15:20]) 28 | } 29 | 30 | func TestBasic(t *testing.T) { 31 | ctx := context.Background() 32 | client, err := spanner.NewClient(ctx, db) 33 | if err != nil { 34 | t.Error(err) 35 | return 36 | } 37 | 38 | defer client.Close() 39 | op := New(client, ":8080", "locktable", "mylock", "logtable", 40 | WithLeaderHandler( 41 | nil, 42 | func(data any, msg []byte) ([]byte, error) { 43 | t.Log("[send] received:", string(msg)) 44 | return []byte("send " + string(msg)), nil 45 | }, 46 | ), 47 | WithBroadcastHandler( 48 | nil, 49 | func(data any, msg []byte) ([]byte, error) { 50 | t.Log("[broadcast/semaphore] received:", string(msg)) 51 | return nil, nil 52 | }, 53 | ), 54 | ) 55 | 56 | done := make(chan error, 1) 57 | quit, cancel := context.WithCancel(ctx) 58 | go op.Run(quit, done) 59 | 60 | var cnt int 61 | bo := gaxv2.Backoff{ 62 | Initial: time.Second, 63 | Max: time.Second * 30, 64 | Multiplier: 2, 65 | } 66 | 67 | for { 68 | cnt++ 69 | locked, _ := op.HasLock() 70 | switch { 71 | case locked: 72 | t.Log("got lock") 73 | break 74 | default: 75 | t.Log("didn't get lock, retry") 76 | time.Sleep(bo.Pause()) 77 | continue 78 | } 79 | 80 | if cnt >= 10 { 81 | t.Fatalf("can't get lock") 82 | } 83 | 84 | break 85 | } 86 | 87 | cancel() 88 | <-done 89 | } 90 | -------------------------------------------------------------------------------- /deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: hedgedemo 5 | # TODO: (optional) Update {project} to your project id. 6 | annotations: 7 | iam.gke.io/gcp-service-account: hedgedemo@{project}.iam.gserviceaccount.com 8 | 9 | --- 10 | 11 | apiVersion: apps/v1 12 | kind: Deployment 13 | metadata: 14 | name: hedgedemo 15 | spec: 16 | selector: 17 | matchLabels: 18 | app: hedgedemo 19 | replicas: 3 20 | revisionHistoryLimit: 5 21 | template: 22 | metadata: 23 | labels: 24 | app: hedgedemo 25 | spec: 26 | # This sample uses GKE's Workload Identity to authenticate against GCP services. 27 | # 'hedgedemo' service account here is mapped to a GCP's IAM service account 28 | # that has access to Spanner and PubSub. 29 | # If you're not using Workload Identity, you can also use a service account key 30 | # and set the GOOGLE_APPLICATION_CREDENTIALS environment variable. 
31 | serviceAccountName: hedgedemo 32 | initContainers: 33 | - image: gcr.io/google.com/cloudsdktool/cloud-sdk:363.0.0-alpine 34 | name: workload-identity-initcontainer 35 | command: 36 | - '/bin/bash' 37 | - '-c' 38 | - | 39 | curl -s -H 'Metadata-Flavor: Google' 'http://169.254.169.254/computeMetadata/v1/instance/service-accounts/default/token' --retry 30 --retry-connrefused --retry-max-time 30 > /dev/null || exit 1 40 | containers: 41 | - name: hedgedemo 42 | image: quay.io/flowerinthenight/hedgedemo:v1.10.5 43 | imagePullPolicy: Always 44 | # TODO: Update as needed. 45 | args: ["-db=projects/{project}/instances/{instance}/databases/{database}"] 46 | env: 47 | # Downward value to get pod IP. We'll use this as our hedge instance id. 48 | - name: K8S_POD_IP 49 | valueFrom: 50 | fieldRef: 51 | fieldPath: status.podIP 52 | - name: GET_HOSTS_FROM 53 | value: dns 54 | # - name: GOOGLE_APPLICATION_CREDENTIALS 55 | # value: /etc/svcacct.json 56 | ports: 57 | - containerPort: 8080 58 | # Uncomment the section below and the env variable above to use GOOGLE_APPLICATION_CREDENTIALS 59 | # for authentication. You can upload a service account JSON file thru: 60 | # kubectl create secret generic hedgedemo-keyfile --from-file svcacct.json 61 | # 62 | # volumeMounts: 63 | # - name: keyfile 64 | # mountPath: "/etc/hedgedemo" 65 | # readOnly: true 66 | # volumes: 67 | # - name: keyfile 68 | # secret: 69 | # secretName: hedgedemo-keyfile 70 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/flowerinthenight/hedge/v2 2 | 3 | go 1.24.0 4 | 5 | require ( 6 | cloud.google.com/go/spanner v1.85.0 7 | github.com/apache/arrow/go/v17 v17.0.0 8 | github.com/cespare/xxhash/v2 v2.3.0 9 | github.com/flowerinthenight/hedge-proto v0.1.0 10 | github.com/flowerinthenight/spindle/v2 v2.2.0 11 | github.com/google/uuid v1.6.0 12 | github.com/googleapis/gax-go/v2 v2.15.0 13 | github.com/hashicorp/memberlist v0.5.1 14 | github.com/shirou/gopsutil/v4 v4.24.9 15 | golang.org/x/exp v0.0.0-20250128182459-e0ece0dbea4c 16 | golang.org/x/sync v0.16.0 17 | google.golang.org/api v0.248.0 18 | google.golang.org/grpc v1.75.0 19 | ) 20 | 21 | require ( 22 | cel.dev/expr v0.24.0 // indirect 23 | cloud.google.com/go v0.122.0 // indirect 24 | cloud.google.com/go/auth v0.16.5 // indirect 25 | cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect 26 | cloud.google.com/go/compute/metadata v0.8.0 // indirect 27 | cloud.google.com/go/monitoring v1.24.2 // indirect 28 | github.com/GoogleCloudPlatform/grpc-gcp-go/grpcgcp v1.5.3 // indirect 29 | github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.29.0 // indirect 30 | github.com/armon/go-metrics v0.4.1 // indirect 31 | github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 // indirect 32 | github.com/ebitengine/purego v0.8.0 // indirect 33 | github.com/envoyproxy/go-control-plane/envoy v1.32.4 // indirect 34 | github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect 35 | github.com/felixge/httpsnoop v1.0.4 // indirect 36 | github.com/go-jose/go-jose/v4 v4.1.2 // indirect 37 | github.com/go-logr/logr v1.4.3 // indirect 38 | github.com/go-logr/stdr v1.2.2 // indirect 39 | github.com/go-ole/go-ole v1.2.6 // indirect 40 | github.com/goccy/go-json v0.10.5 // indirect 41 | github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect 42 | github.com/google/btree v1.1.3 // indirect 43 | github.com/google/flatbuffers 
v25.1.24+incompatible // indirect 44 | github.com/google/s2a-go v0.1.9 // indirect 45 | github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect 46 | github.com/hashicorp/errwrap v1.1.0 // indirect 47 | github.com/hashicorp/go-immutable-radix v1.3.1 // indirect 48 | github.com/hashicorp/go-msgpack/v2 v2.1.2 // indirect 49 | github.com/hashicorp/go-multierror v1.1.1 // indirect 50 | github.com/hashicorp/go-sockaddr v1.0.7 // indirect 51 | github.com/hashicorp/go-uuid v1.0.2 // indirect 52 | github.com/hashicorp/golang-lru v1.0.2 // indirect 53 | github.com/klauspost/compress v1.17.11 // indirect 54 | github.com/klauspost/cpuid/v2 v2.2.9 // indirect 55 | github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect 56 | github.com/miekg/dns v1.1.62 // indirect 57 | github.com/pierrec/lz4/v4 v4.1.22 // indirect 58 | github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect 59 | github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect 60 | github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect 61 | github.com/spiffe/go-spiffe/v2 v2.6.0 // indirect 62 | github.com/tklauser/go-sysconf v0.3.12 // indirect 63 | github.com/tklauser/numcpus v0.6.1 // indirect 64 | github.com/yusufpapurcu/wmi v1.2.4 // indirect 65 | github.com/zeebo/xxh3 v1.0.2 // indirect 66 | go.opencensus.io v0.24.0 // indirect 67 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 68 | go.opentelemetry.io/contrib/detectors/gcp v1.38.0 // indirect 69 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 // indirect 70 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect 71 | go.opentelemetry.io/otel v1.38.0 // indirect 72 | go.opentelemetry.io/otel/metric v1.38.0 // indirect 73 | go.opentelemetry.io/otel/sdk v1.38.0 // indirect 74 | go.opentelemetry.io/otel/sdk/metric v1.38.0 // indirect 75 | go.opentelemetry.io/otel/trace v1.38.0 // indirect 76 | golang.org/x/crypto v0.41.0 // indirect 77 | golang.org/x/mod v0.26.0 // indirect 78 | golang.org/x/net v0.43.0 // indirect 79 | golang.org/x/oauth2 v0.30.0 // indirect 80 | golang.org/x/sys v0.35.0 // indirect 81 | golang.org/x/text v0.28.0 // indirect 82 | golang.org/x/time v0.12.0 // indirect 83 | golang.org/x/tools v0.35.0 // indirect 84 | golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect 85 | google.golang.org/genproto v0.0.0-20250826171959-ef028d996bc1 // indirect 86 | google.golang.org/genproto/googleapis/api v0.0.0-20250826171959-ef028d996bc1 // indirect 87 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250826171959-ef028d996bc1 // indirect 88 | google.golang.org/protobuf v1.36.8 // indirect 89 | ) 90 | -------------------------------------------------------------------------------- /service.go: -------------------------------------------------------------------------------- 1 | package hedge 2 | 3 | import ( 4 | "context" 5 | "io" 6 | "strconv" 7 | 8 | pb "github.com/flowerinthenight/hedge-proto" 9 | "golang.org/x/sync/errgroup" 10 | ) 11 | 12 | type service struct { 13 | op *Op 14 | 15 | pb.UnimplementedHedgeServer 16 | } 17 | 18 | func (s *service) Send(hs pb.Hedge_SendServer) error { 19 | ctx := hs.Context() 20 | g := new(errgroup.Group) 21 | g.Go(func() error { 22 | for { 23 | select { 24 | case <-ctx.Done(): 25 | return nil 26 | default: 27 | } 28 | 29 | in, err := hs.Recv() 30 | if err == io.EOF { 31 | return nil 32 | } 33 | 34 | if err != nil { 35 | s.op.logger.Println("Recv failed:", err) 36 | return 
err 37 | } 38 | 39 | s.op.leaderStreamIn <- &StreamMessage{ 40 | Payload: in, 41 | } 42 | } 43 | }) 44 | 45 | g.Go(func() error { 46 | for { 47 | select { 48 | case <-ctx.Done(): 49 | return nil 50 | default: 51 | } 52 | 53 | out := <-s.op.leaderStreamOut 54 | if out == nil { 55 | return nil 56 | } 57 | 58 | hs.Send(out.Payload) 59 | } 60 | }) 61 | 62 | return g.Wait() 63 | } 64 | 65 | func (s *service) Broadcast(hs pb.Hedge_BroadcastServer) error { 66 | ctx := hs.Context() 67 | g := new(errgroup.Group) 68 | g.Go(func() error { 69 | for { 70 | select { 71 | case <-ctx.Done(): 72 | return nil 73 | default: 74 | } 75 | 76 | in, err := hs.Recv() 77 | if err == io.EOF { 78 | return nil 79 | } 80 | 81 | if err != nil { 82 | s.op.logger.Println("Recv failed:", err) 83 | return err 84 | } 85 | 86 | s.op.broadcastStreamIn <- &StreamMessage{ 87 | Payload: in, 88 | } 89 | } 90 | }) 91 | 92 | g.Go(func() error { 93 | for { 94 | select { 95 | case <-ctx.Done(): 96 | return nil 97 | default: 98 | } 99 | 100 | out := <-s.op.broadcastStreamOut 101 | if out == nil { 102 | return nil 103 | } 104 | 105 | hs.Send(out.Payload) 106 | } 107 | }) 108 | 109 | return g.Wait() 110 | } 111 | 112 | func (s *service) SoSWrite(hs pb.Hedge_SoSWriteServer) error { 113 | var err error 114 | ctx := hs.Context() 115 | var writer *Writer 116 | 117 | loop: 118 | for { 119 | select { 120 | case <-ctx.Done(): 121 | err = ctx.Err() 122 | break loop 123 | default: 124 | } 125 | 126 | in, err := hs.Recv() 127 | if err == io.EOF { 128 | break 129 | } 130 | 131 | if err != nil { 132 | s.op.logger.Println("Recv failed:", err) 133 | break 134 | } 135 | 136 | name := in.Meta[metaName] 137 | if _, ok := s.op.soss[name]; !ok { 138 | mlimit, _ := strconv.ParseUint(in.Meta[metaMemLimit], 10, 64) 139 | dlimit, _ := strconv.ParseUint(in.Meta[metaDiskLimit], 10, 64) 140 | age, _ := strconv.ParseInt(in.Meta[metaExpire], 10, 64) 141 | s.op.soss[name] = s.op.NewSoS(name, &SoSOptions{ 142 | MemLimit: mlimit, 143 | DiskLimit: dlimit, 144 | Expiration: age, 145 | }) 146 | } 147 | 148 | if writer == nil { 149 | writer, _ = s.op.soss[name].Writer(&writerOptions{ 150 | LocalOnly: true, 151 | }) 152 | } 153 | 154 | writer.Write(in.Data) 155 | } 156 | 157 | if writer != nil { 158 | writer.Close() 159 | } 160 | 161 | return err 162 | } 163 | 164 | func (s *service) SoSRead(hs pb.Hedge_SoSReadServer) error { 165 | var err error 166 | in, err := hs.Recv() 167 | if err == io.EOF { 168 | return nil 169 | } 170 | 171 | if err != nil { 172 | s.op.logger.Println("Recv failed:", err) 173 | return nil 174 | } 175 | 176 | name := in.Meta[metaName] 177 | if _, ok := s.op.soss[name]; !ok { 178 | mlimit, _ := strconv.ParseUint(in.Meta[metaMemLimit], 10, 64) 179 | dlimit, _ := strconv.ParseUint(in.Meta[metaDiskLimit], 10, 64) 180 | age, _ := strconv.ParseInt(in.Meta[metaExpire], 10, 64) 181 | s.op.soss[name] = s.op.NewSoS(name, &SoSOptions{ 182 | MemLimit: mlimit, 183 | DiskLimit: dlimit, 184 | Expiration: age, 185 | }) 186 | } 187 | 188 | reader, _ := s.op.soss[name].Reader(&readerOptions{LocalOnly: true}) 189 | out := make(chan []byte) 190 | eg := new(errgroup.Group) 191 | eg.Go(func() error { 192 | for d := range out { 193 | err = hs.Send(&pb.Payload{Data: d}) 194 | if err != nil { 195 | s.op.logger.Println("Send failed:", err) 196 | } 197 | } 198 | 199 | return nil 200 | }) 201 | 202 | reader.Read(out) 203 | eg.Wait() 204 | 205 | if reader != nil { 206 | reader.Close() 207 | } 208 | 209 | return nil 210 | } 211 | 212 | func (s *service) SoSClose(ctx 
context.Context, in *pb.Payload) (*pb.Payload, error) { 213 | name := in.Meta[metaName] 214 | s.op.soss[name].Close() 215 | return &pb.Payload{}, nil 216 | } 217 | -------------------------------------------------------------------------------- /protocol.go: -------------------------------------------------------------------------------- 1 | package hedge 2 | 3 | import ( 4 | "context" 5 | "encoding/base64" 6 | "encoding/json" 7 | "fmt" 8 | "net" 9 | "strconv" 10 | "strings" 11 | ) 12 | 13 | func doConfirmLeader(ctx context.Context, op *Op, conn net.Conn, _ string) { 14 | var sb strings.Builder 15 | sb.WriteString(op.buildAckReply(nil)) 16 | if hl, _ := op.HasLock(); !hl { 17 | sb.Reset() 18 | sb.WriteString("\n") 19 | } 20 | 21 | b := []byte(sb.String()) 22 | conn.Write(b) 23 | } 24 | 25 | func doWrite(ctx context.Context, op *Op, conn net.Conn, msg string) { 26 | var sb strings.Builder 27 | if hl, _ := op.HasLock(); hl { 28 | ss := strings.Split(msg, " ") 29 | payload := ss[1] 30 | var noappend bool 31 | if len(ss) >= 3 { 32 | if ss[2] == FlagNoAppend { 33 | noappend = true 34 | } 35 | } 36 | 37 | decoded, _ := base64.StdEncoding.DecodeString(payload) 38 | var kv KeyValue 39 | err := json.Unmarshal(decoded, &kv) 40 | if err != nil { 41 | sb.WriteString(op.buildAckReply(err)) 42 | } else { 43 | sb.WriteString(op.buildAckReply(op.Put(ctx, kv, PutOptions{ 44 | DirectWrite: true, 45 | NoAppend: noappend, 46 | }))) 47 | } 48 | } else { 49 | sb.WriteString("\n") // not leader, possible even if previously confirmed 50 | } 51 | 52 | b := []byte(sb.String()) 53 | conn.Write(b) 54 | } 55 | 56 | func doSend(ctx context.Context, op *Op, conn net.Conn, msg string) { 57 | var sb strings.Builder 58 | serr := base64.StdEncoding.EncodeToString([]byte(ErrNoLeader.Error())) 59 | fmt.Fprintf(&sb, "%s\n", serr) 60 | if hl, _ := op.HasLock(); hl { 61 | sb.Reset() 62 | serr := base64.StdEncoding.EncodeToString([]byte(ErrNoHandler.Error())) 63 | fmt.Fprintf(&sb, "%s\n", serr) 64 | if op.fnLeader != nil { 65 | payload := strings.Split(msg, " ")[1] 66 | decoded, _ := base64.StdEncoding.DecodeString(payload) 67 | data := op.fnLdrData 68 | if data == nil { 69 | data = op 70 | } 71 | 72 | r, e := op.fnLeader(data, decoded) // call leader handler 73 | if e != nil { 74 | sb.Reset() 75 | serr := base64.StdEncoding.EncodeToString([]byte(e.Error())) 76 | fmt.Fprintf(&sb, "%s\n", serr) 77 | } else { 78 | br := base64.StdEncoding.EncodeToString([]byte("")) 79 | if r != nil { 80 | br = base64.StdEncoding.EncodeToString(r) 81 | } 82 | 83 | sb.Reset() 84 | fmt.Fprintf(&sb, "%s %s\n", CmdAck, br) 85 | } 86 | } 87 | } 88 | 89 | b := []byte(sb.String()) 90 | conn.Write(b) 91 | } 92 | 93 | func doBroadcast(ctx context.Context, op *Op, conn net.Conn, msg string) { 94 | var sb strings.Builder 95 | serr := base64.StdEncoding.EncodeToString([]byte(ErrNoHandler.Error())) 96 | fmt.Fprintf(&sb, "%s\n", serr) 97 | if op.fnBroadcast != nil { 98 | payload := strings.Split(msg, " ")[1] 99 | decoded, _ := base64.StdEncoding.DecodeString(payload) 100 | data := op.fnBcData 101 | if data == nil { 102 | data = op 103 | } 104 | 105 | r, e := op.fnBroadcast(data, decoded) // call broadcast handler 106 | if e != nil { 107 | sb.Reset() 108 | serr := base64.StdEncoding.EncodeToString([]byte(e.Error())) 109 | fmt.Fprintf(&sb, "%s\n", serr) 110 | } else { 111 | br := base64.StdEncoding.EncodeToString([]byte("")) 112 | if r != nil { 113 | br = base64.StdEncoding.EncodeToString(r) 114 | } 115 | 116 | sb.Reset() 117 | fmt.Fprintf(&sb, "%s %s\n", CmdAck, 
br) 118 | } 119 | } 120 | 121 | b := []byte(sb.String()) 122 | conn.Write(b) 123 | } 124 | 125 | func doHeartbeat(ctx context.Context, op *Op, conn net.Conn, msg string) { 126 | var sb strings.Builder 127 | oldallm := op.getMembers() 128 | op.addMember(strings.Split(msg, " ")[1]) 129 | fmt.Fprintf(&sb, "%s\n", op.encodeMembers()) 130 | conn.Write([]byte(sb.String())) 131 | newallm := op.getMembers() 132 | if len(oldallm) != len(newallm) && op.fnMemberChanged != nil { 133 | diff := len(newallm) - len(oldallm) 134 | op.fnMemberChanged(op.fnMemChangedData, []byte(fmt.Sprintf("%v", diff))) 135 | } 136 | } 137 | 138 | func doMembers(ctx context.Context, op *Op, conn net.Conn, msg string) { 139 | payload := strings.Split(msg, " ")[1] 140 | decoded, _ := base64.StdEncoding.DecodeString(payload) 141 | var m map[string]struct{} 142 | json.Unmarshal(decoded, &m) 143 | m[op.hostPort] = struct{}{} // just to be sure 144 | op.setMembers(m) // then replace my records 145 | members := op.getMembers() 146 | mlist := []string{} 147 | for k := range members { 148 | mlist = append(mlist, k) 149 | } 150 | 151 | op.logger.Printf("%v member(s) tracked", len(op.getMembers())) 152 | reply := op.buildAckReply(nil) 153 | conn.Write([]byte(reply)) 154 | } 155 | 156 | func doCreateSemaphore(ctx context.Context, op *Op, conn net.Conn, msg string) { 157 | reply := op.buildAckReply(nil) 158 | func() { 159 | op.mtxSem.Lock() 160 | defer op.mtxSem.Unlock() 161 | ss := strings.Split(msg, " ") 162 | name, slimit, caller := ss[1], ss[2], ss[3] 163 | limit, err := strconv.Atoi(slimit) 164 | if err != nil { 165 | reply = op.buildAckReply(err) 166 | return 167 | } 168 | 169 | // See if this semaphore already exists. 170 | s, err := readSemaphoreEntry(ctx, op, name) 171 | if err != nil { 172 | err = createSemaphoreEntry(ctx, op, name, caller, limit) 173 | if err != nil { 174 | reply = op.buildAckReply(err) 175 | return 176 | } 177 | 178 | // Read again after create. 
179 | s, err = readSemaphoreEntry(ctx, op, name) 180 | if err != nil { 181 | reply = op.buildAckReply(err) 182 | return 183 | } 184 | } 185 | 186 | slmt, _ := strconv.Atoi(strings.Split(s.Id, "=")[1]) 187 | if slmt != limit { 188 | err = fmt.Errorf("semaphore already exists with a different limit") 189 | reply = op.buildAckReply(err) 190 | return 191 | } 192 | }() 193 | 194 | b := []byte(reply) 195 | conn.Write(b) 196 | } 197 | 198 | func doAcquireSemaphore(ctx context.Context, op *Op, conn net.Conn, msg string) { 199 | reply := op.buildAckReply(nil) 200 | func() { 201 | op.mtxSem.Lock() 202 | defer op.mtxSem.Unlock() 203 | ss := strings.Split(msg, " ") 204 | name, caller := ss[1], ss[2] 205 | go ensureLiveness(ctx, op) 206 | op.ensureCh <- name 207 | s, err := readSemaphoreEntry(ctx, op, name) // to get the current limit 208 | if err != nil { 209 | err = fmt.Errorf("0:%v", err) // final 210 | reply = op.buildAckReply(err) 211 | return 212 | } 213 | 214 | limit, _ := strconv.Atoi(strings.Split(s.Id, "=")[1]) 215 | retry, err := createAcquireSemaphoreEntry(ctx, op, name, caller, limit) 216 | if err != nil { 217 | switch { 218 | case retry: 219 | err = fmt.Errorf("1:%v", err) // can retry 220 | default: 221 | err = fmt.Errorf("0:%v", err) // final 222 | } 223 | 224 | reply = op.buildAckReply(err) 225 | return 226 | } 227 | }() 228 | 229 | b := []byte(reply) 230 | conn.Write(b) 231 | } 232 | 233 | func doReleaseSemaphore(ctx context.Context, op *Op, conn net.Conn, msg string) { 234 | reply := op.buildAckReply(nil) 235 | func() { 236 | op.mtxSem.Lock() 237 | defer op.mtxSem.Unlock() 238 | ss := strings.Split(msg, " ") 239 | name, caller := ss[1], ss[2] 240 | s, err := readSemaphoreEntry(ctx, op, name) // to get the current limit 241 | if err != nil { 242 | reply = op.buildAckReply(err) 243 | return 244 | } 245 | 246 | limit, _ := strconv.Atoi(strings.Split(s.Id, "=")[1]) 247 | err = releaseSemaphore(ctx, op, name, caller, s.Value, limit) 248 | if err != nil { 249 | reply = op.buildAckReply(err) 250 | return 251 | } 252 | }() 253 | 254 | b := []byte(reply) 255 | conn.Write(b) 256 | } 257 | 258 | func handleMsg(ctx context.Context, op *Op, conn net.Conn) { 259 | defer conn.Close() 260 | fns := map[string]func(ctx context.Context, op *Op, conn net.Conn, msg string){ 261 | CmdLeader: doConfirmLeader, // confirm leader only 262 | CmdWrite + " ": doWrite, // actual write 263 | CmdSend + " ": doSend, // Send() API 264 | CmdBroadcast + " ": doBroadcast, // Broadcast() API 265 | CmdPing + " ": doHeartbeat, // heartbeat 266 | CmdMembers + " ": doMembers, // broadcast online members 267 | CmdSemaphore + " ": doCreateSemaphore, // create semaphore (we are leader) 268 | CmdSemAcquire + " ": doAcquireSemaphore, // acquire semaphore (we are leader) 269 | CmdSemRelease + " ": doReleaseSemaphore, // release semaphore (we are leader) 270 | } 271 | 272 | addSpace := func(s string) string { 273 | var sb strings.Builder 274 | fmt.Fprintf(&sb, "%s ", s) 275 | return sb.String() 276 | } 277 | 278 | for { 279 | var prefix string 280 | msg, err := op.recv(conn) 281 | if err != nil || ctx.Err() != nil { 282 | return 283 | } 284 | 285 | switch { 286 | case msg == CmdPing: // leader asking if we are online (msg has no prefix) 287 | reply := op.buildAckReply(nil) 288 | conn.Write([]byte(reply)) 289 | return 290 | case strings.HasPrefix(msg, CmdLeader): 291 | prefix = CmdLeader 292 | case strings.HasPrefix(msg, addSpace(CmdWrite)): 293 | prefix = addSpace(CmdWrite) 294 | case strings.HasPrefix(msg, addSpace(CmdSend)): 295 
| prefix = addSpace(CmdSend) 296 | case strings.HasPrefix(msg, addSpace(CmdBroadcast)): 297 | prefix = addSpace(CmdBroadcast) 298 | case strings.HasPrefix(msg, addSpace(CmdPing)): 299 | prefix = addSpace(CmdPing) 300 | case strings.HasPrefix(msg, addSpace(CmdMembers)): 301 | prefix = addSpace(CmdMembers) 302 | case strings.HasPrefix(msg, addSpace(CmdSemaphore)): 303 | prefix = addSpace(CmdSemaphore) 304 | case strings.HasPrefix(msg, addSpace(CmdSemAcquire)): 305 | prefix = addSpace(CmdSemAcquire) 306 | case strings.HasPrefix(msg, addSpace(CmdSemRelease)): 307 | prefix = addSpace(CmdSemRelease) 308 | default: 309 | return // do nothing 310 | } 311 | 312 | fns[prefix](ctx, op, conn, msg) 313 | } 314 | } 315 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![main](https://github.com/flowerinthenight/hedge/actions/workflows/main.yml/badge.svg)](https://github.com/flowerinthenight/hedge/actions/workflows/main.yml) 2 | [![Docker Repository on Quay](https://quay.io/repository/flowerinthenight/hedgedemo/status "Docker Repository on Quay")](https://quay.io/repository/flowerinthenight/hedgedemo) 3 | [![Go Reference](https://pkg.go.dev/badge/github.com/flowerinthenight/hedge.svg)](https://pkg.go.dev/github.com/flowerinthenight/hedge) 4 | 5 | (This repo is mirrored to [https://codeberg.org/flowerinthenight/hedge](https://codeberg.org/flowerinthenight/hedge)). 6 | 7 | ## hedge 8 | A [Go](https://go.dev/) cluster membership management library built on [spindle](https://github.com/flowerinthenight/spindle) and [Cloud Spanner](https://cloud.google.com/spanner) that provides rudimentary distributed computing facilities to Kubernetes [Deployments](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/). Features include: 9 | 10 | * a consistent, append-only, Spanner-backed distributed key/value storage, 11 | * a distributed locking/leader election mechanism through spindle, 12 | * a simple member-to-leader communication channel, 13 | * a broadcast (send-to-all) mechanism, 14 | * memory spill-over, ad-hoc storage, and 15 | * a distributed semaphore. 16 | 17 | It also works even on single-pod deployments. 18 | 19 |

20 | 21 |

22 | 23 | **hedge** is heavily used in [Alphaus](https://www.alphaus.cloud/) production with services that scale from single-digit pods to hundreds. 24 | 25 | Ports: 26 | 27 | * [hedge-cb](https://github.com/flowerinthenight/hedge-cb) - trimmed-down version for cluster membership, AWS-native, relies on [spindle-cb](https://github.com/flowerinthenight/spindle-cb). 28 | * [hedge-rs](https://github.com/flowerinthenight/hedge-rs) - trimmed-down version written in Rust. 29 | 30 | ## Why? 31 | First, I wanted a cluster coordinator that can work within k8s Deployments as a library, not as an external service (like [ZooKeeper](https://zookeeper.apache.org/) or [etcd](https://etcd.io/)). So far, our efforts in making [Raft](https://raft.github.io/) play well with bursty, frequently scaling up/down deployments as a library have not been that reliable yet (though we have an ongoing multi-[Paxos](https://en.wikipedia.org/wiki/Paxos_(computer_science))-based experiment [here](https://github.com/alphauslabs/juno) as well). I also wanted easily accessible storage that is a bit decoupled from the code (easier to query, edit, debug, back up, etc.). We are already a heavy Spanner user, and [spindle](https://github.com/flowerinthenight/spindle/) has been in our production for many years now: these two should be able to do it; StatefulSets or DaemonSets shouldn't be a requirement. Since then, additional features have been added, such as the `Send()` API. 32 | 33 | ## What does it do? 34 | Leader election is handled by [spindle](https://github.com/flowerinthenight/spindle). Two APIs are provided for storage: `Put()` and `Get()`. All pods can serve `Get()` calls, while only the leader handles the `Put()` API. If a non-leader pod calls `Put()`, that call is forwarded to the leader, which does the actual write. All `Put()`'s are append-only. 35 | 36 | Spindle's `HasLock()` function is also available for distributed locking due to struct embedding, although you can use spindle separately for that if you prefer. 37 | 38 | A `Send()` API is also provided so members can send simple request/reply-type messages to the current leader at any time. A streaming equivalent (gRPC) is also available. 39 | 40 | A `Broadcast()` API is also available for all pods. Note that due to the nature of k8s deployments (pods come and go) and the internal heartbeat delays, some pods might not receive the broadcast message at call time, although all pods will have the complete broadcast target list eventually. Hedge uses a combination of heartbeats and broadcasts to propagate member information to all pods; non-leaders send liveness heartbeats to the leader while the leader broadcasts active members to all pods. A streaming equivalent (gRPC) is also available. 41 | 42 | An experimental spill-over store (**SoS**) is also supported. It's currently used in bursty, quick load-process-discard types of data processing. A **SoS** in hedge is simply a combined memory area and disk area spread across pods. For example, a pod can define a SoS of 1GB memory and 1GB disk. If there are 100 pods running under hedge, that's a combined storage of (1GB + 1GB) * 100. During writes and subsequent reads, hedge handles the data distribution and assembly across local RAM, local disk, and other pods. It uses [Arrow](https://arrow.apache.org/) and memory-mapped files as backing stores. You can check out this [blog post](https://flowerinthenight.com/blog/2024-07-24-spillover-store/) for more information.
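The snippet below is a minimal sketch of that SoS flow, adapted from `example/demo/main.go` and `service.go` in this repo. It assumes `op` is an already-running `*hedge.Op` (see the "How to use" section below) plus the usual imports (`fmt`, `log`, `golang.org/x/sync/errgroup`); the name and option values are placeholders borrowed from the demo, not recommendations.

```go
// Create (or attach to) a named spill-over store. The limits are per pod;
// hedge distributes the data across local RAM, local disk, and other pods
// based on these budgets.
sos := op.NewSoS("sos-sample", &hedge.SoSOptions{
    MemLimit:   150_000, // local memory budget (value borrowed from example/demo)
    DiskLimit:  120_000, // local disk budget (value borrowed from example/demo)
    Expiration: 5,       // expiration setting, same value as example/demo
})

// Write phase: stream data in; hedge decides where each chunk lands.
writer, err := sos.Writer()
if err != nil {
    log.Fatal(err)
}

for i := 0; i < 10; i++ {
    writer.Write([]byte(fmt.Sprintf("payload-%d", i)))
}

writer.Close() // finish the write phase before reading

// Read phase: data is re-assembled from local RAM/disk and other pods, then
// streamed back through the provided channel (same pattern as service.go).
reader, err := sos.Reader()
if err != nil {
    log.Fatal(err)
}

out := make(chan []byte)
eg := new(errgroup.Group)
eg.Go(func() error {
    for d := range out {
        log.Println("read back:", string(d))
    }
    return nil
})

reader.Read(out) // feeds 'out' with all written data, then returns
eg.Wait()
reader.Close()
sos.Close() // done with this SoS; release its storage
```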
43 | 44 | Finally, a distributed semaphore is also provided through the `NewSemaphore()`, `[Try]Acquire()`, and `Release()` APIs. 45 | 46 | ## Prerequisites 47 | * All pods within a cluster should be able to contact each other via TCP (address:port). 48 | * Each hedge instance's id should be set using the pod's cluster IP address:port. You can use the [downward API](https://kubernetes.io/docs/concepts/workloads/pods/downward-api/) to get the pod's IP address, or you can use the ":port" format, in which case the IP address will be resolved internally. 49 | * For now, spindle's lock table and hedge's log table are within the same database. 50 | * Tables for spindle and hedge need to be created beforehand. See [here](https://github.com/flowerinthenight/spindle#usage) for spindle's DDL. For hedge, see below: 51 | 52 | ```sql 53 | -- 'logtable' name is just an example 54 | CREATE TABLE logtable ( 55 | id STRING(MAX), 56 | key STRING(MAX), 57 | value STRING(MAX), 58 | leader STRING(MAX), 59 | timestamp TIMESTAMP OPTIONS (allow_commit_timestamp=true), 60 | ) PRIMARY KEY (key, id) 61 | ``` 62 | 63 | * This library will use the input key/value table (`logtable` in the example above) for its semaphore-related operations with the following reserved keywords: 64 | ``` 65 | column=key, value=__hedge/semaphore/{name} 66 | column=key, value=__caller={ip:port} 67 | column=id, value=__hedge/semaphore/{name} 68 | column=id, value=limit={num} 69 | ``` 70 | 71 | ## How to use 72 | Something like: 73 | ```go 74 | ctx := context.Background() 75 | client, _ := spanner.NewClient(ctx, "your/spanner/database") 76 | defer client.Close() 77 | 78 | op := hedge.New( 79 | client, 80 | ":8080", // addr will be resolved internally 81 | "locktable", 82 | "myspindlelock", 83 | "logtable", 84 | hedge.WithLeaderHandler( // leader only; handles Send() 85 | nil, 86 | func(data interface{}, msg []byte) ([]byte, error) { 87 | log.Println("[send] received:", string(msg)) 88 | return []byte("hello " + string(msg)), nil 89 | }, 90 | ), 91 | hedge.WithBroadcastHandler( // handles Broadcast() 92 | nil, 93 | func(data interface{}, msg []byte) ([]byte, error) { 94 | log.Println("[broadcast] received:", string(msg)) 95 | return []byte("broadcast " + string(msg)), nil 96 | }, 97 | ), 98 | ) 99 | 100 | ctx, cancel := context.WithCancel(ctx) 101 | done := make(chan error, 1) // optional wait 102 | go op.Run(ctx, done) 103 | 104 | // For storage, any pod should be able to call op.Put(...) or op.Get(...) here. 105 | // 106 | // Any pod can call HasLock() here at any given time to know whether they are 107 | // leader or not. 108 | // 109 | // hl, _ := op.HasLock() 110 | // if hl { 111 | // log.Println("leader here!") 112 | // } 113 | // 114 | // To send a message to the current leader, any pod can call op.Send(...) and 115 | // the leader will handle it through the WithLeaderHandler callback. A wrapper 116 | // SendToLeader() helper function is also available for calling op.Send() with 117 | // retries+backoff. 118 | // 119 | // For broadcast, any pod can call op.Broadcast(...) here, which will be handled 120 | // by each pod's WithBroadcastHandler callback, including the caller. 121 | // 122 | // For distributed semaphore, any pod can call the following: 123 | // 124 | // sem, _ := op.NewSemaphore(ctx, "semaphore-name", 2) 125 | // sem.Acquire(ctx) 126 | // ... 
127 | // sem.Release(ctx) 128 | 129 | cancel() 130 | <-done 131 | ``` 132 | 133 | A sample [deployment](./deployment.yaml) file for GKE is provided, although it needs a fair bit of editing (for auth) to be usable. It uses [Workload Identity Federation](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) for authentication although you can update it to use other authentication methods as well. The service account needs to have Spanner permissions. 134 | 135 | Once deployed, you can do the following tests while checking the logs. We will use [kubepfm](https://github.com/flowerinthenight/kubepfm) to port-forward our test commands to the server. 136 | 137 | Test the `Put()` API: 138 | 139 | ```sh 140 | # Open a terminal and run: 141 | $ kubepfm --target deployment/hedgedemo:9090:9090 142 | 143 | # Open another terminal and run: 144 | $ curl localhost:9090/put -d "samplekey samplevalue" 145 | 146 | # To ensure a non-leader sender, you can also specify a 147 | # non-leader pod for the kubepfm command above: 148 | $ kubepfm --target hedgedemo-6b5bcd4998-n95n7:9090:9090 149 | ``` 150 | 151 | Test the `Get()` API: 152 | 153 | ```sh 154 | # While kubepfm is running on a different terminal, run: 155 | $ curl localhost:9090/get -d "samplekey" 156 | ``` 157 | 158 | Test the `Send()` API: 159 | 160 | ```sh 161 | # While kubepfm is running on a different terminal, run: 162 | $ curl localhost:9090/send -d "hello-world" 163 | ``` 164 | 165 | Test the `Broadcast()` API: 166 | 167 | ```sh 168 | # While kubepfm is running on a different terminal, run: 169 | $ curl localhost:9090/broadcast -d "hello-all" 170 | ``` 171 | -------------------------------------------------------------------------------- /semaphore.go: -------------------------------------------------------------------------------- 1 | package hedge 2 | 3 | import ( 4 | "context" 5 | "encoding/base64" 6 | "fmt" 7 | "net" 8 | "strings" 9 | "sync" 10 | "sync/atomic" 11 | "time" 12 | 13 | "cloud.google.com/go/spanner" 14 | "google.golang.org/api/iterator" 15 | ) 16 | 17 | const ( 18 | semNamef = "__hedge/semaphore/%v" 19 | semCallerf = "__caller=%v" 20 | semLimitf = "limit=%v" 21 | markDel = "delete-on-empty" 22 | ) 23 | 24 | var ( 25 | ErrSemFull = fmt.Errorf("hedge/semaphore: semaphore full") 26 | ) 27 | 28 | // Semaphore represents a distributed semaphore object. 29 | type Semaphore struct { 30 | name string 31 | limit int 32 | op *Op 33 | } 34 | 35 | // Acquire acquires a semaphore. This call will block until the semaphore is acquired. 36 | // By default, this call will basically block forever until the semaphore is acquired 37 | // or until ctx expires or is cancelled. 38 | func (s *Semaphore) Acquire(ctx context.Context) error { return s.acquire(ctx, false) } 39 | 40 | // TryAcquire is like Acquire() but will not block until the semaphore is acquired. 41 | // It will only attempt to acquire the semaphore and will return immediately on either 42 | // success or failure, or until ctx expires or is cancelled. 
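//
// A minimal usage sketch (illustration only; assumes 'op' is a running *Op
// and 'ctx' is a context.Context):
//
//	sem, err := op.NewSemaphore(ctx, "sample-sem", 2)
//	if err == nil {
//		if err := sem.TryAcquire(ctx); err != nil {
//			// semaphore full (or other error); caller decides whether to retry
//		} else {
//			defer sem.Release(ctx)
//			// ... do limited-concurrency work ...
//		}
//	}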
43 | func (s *Semaphore) TryAcquire(ctx context.Context) error { return s.acquire(ctx, true) } 44 | 45 | func (s *Semaphore) acquire(ctx context.Context, noretry bool) error { 46 | subctx := context.WithValue(ctx, struct{}{}, nil) 47 | first := make(chan struct{}, 1) 48 | first <- struct{}{} // immediately the first time 49 | ticker := time.NewTicker(time.Second * 1) 50 | defer ticker.Stop() 51 | 52 | var active atomic.Int32 53 | acquire := func() (bool, error) { // true means okay to retry 54 | active.Store(1) 55 | defer active.Store(0) 56 | conn, err := s.op.getLeaderConn(ctx) 57 | if err != nil { 58 | return true, err 59 | } 60 | 61 | defer conn.Close() 62 | var sb strings.Builder 63 | fmt.Fprintf(&sb, "%s %s %s\n", CmdSemAcquire, s.name, s.op.hostPort) 64 | reply, err := s.op.send(conn, sb.String()) 65 | if err != nil { 66 | return false, err 67 | } 68 | 69 | switch { 70 | case strings.HasPrefix(reply, CmdAck): 71 | ss := strings.Split(reply, " ") 72 | if len(ss) > 1 { // failed 73 | dec, _ := base64.StdEncoding.DecodeString(ss[1]) 74 | switch { 75 | case strings.HasPrefix(string(dec), "0:"): 76 | serr := strings.Replace(string(dec), "0:", "", 1) 77 | return false, fmt.Errorf("%v", serr) 78 | case strings.HasPrefix(string(dec), "1:"): 79 | serr := strings.Replace(string(dec), "1:", "", 1) 80 | return true, fmt.Errorf("%v", serr) 81 | default: // shouldn't be the case, hopefully 82 | return false, fmt.Errorf("%v", string(dec)) 83 | } 84 | } 85 | default: 86 | return false, ErrNotSupported 87 | } 88 | 89 | return false, nil 90 | } 91 | 92 | for { 93 | select { 94 | case <-subctx.Done(): 95 | return context.Canceled 96 | case <-first: 97 | case <-ticker.C: 98 | } 99 | 100 | if active.Load() == 1 { 101 | continue 102 | } 103 | 104 | type acq_t struct { 105 | retry bool 106 | err error 107 | } 108 | 109 | ch := make(chan acq_t, 1) 110 | go func() { 111 | r, e := acquire() 112 | ch <- acq_t{r, e} 113 | }() 114 | 115 | ret := <-ch 116 | switch { 117 | case ret.err == nil: 118 | return nil 119 | default: 120 | if noretry { 121 | return ret.err 122 | } else { 123 | if ret.retry { 124 | continue 125 | } else { 126 | return ret.err 127 | } 128 | } 129 | } 130 | } 131 | } 132 | 133 | // Release releases a semaphore. Although recommended to release all acquired semaphores, this is still 134 | // a best-effort release as any caller could disappear/crash while holding a semaphore. To remedy this, 135 | // the current leader will attempt to track all semaphore owners and remove the non-responsive ones after 136 | // some delay. A downside of not calling release properly will cause other semaphore acquirers to block 137 | // just a bit longer while leader does the cleanup, whereas calling release will free up space immediately 138 | // allowing other semaphore acquirers to not wait that long. 
139 | func (s *Semaphore) Release(ctx context.Context) error { 140 | conn, err := s.op.getLeaderConn(ctx) 141 | if err != nil { 142 | return err 143 | } 144 | 145 | defer conn.Close() 146 | var sb strings.Builder 147 | fmt.Fprintf(&sb, "%s %s %s\n", CmdSemRelease, s.name, s.op.hostPort) 148 | reply, err := s.op.send(conn, sb.String()) 149 | if err != nil { 150 | return err 151 | } 152 | 153 | switch { 154 | case strings.HasPrefix(reply, CmdAck): 155 | ss := strings.Split(reply, " ") 156 | if len(ss) > 1 { // failed 157 | dec, _ := base64.StdEncoding.DecodeString(ss[1]) 158 | return fmt.Errorf("%v", string(dec)) 159 | } 160 | } 161 | 162 | return nil 163 | } 164 | 165 | // We will use the current logTable as our semaphore storage. 166 | // Naming convention(s): 167 | // 168 | // key="hedge/semaphore/{name}", id="limit={v}", value={caller} 169 | func createSemaphoreEntry(ctx context.Context, op *Op, name, caller string, limit int) error { 170 | _, err := op.spannerClient.ReadWriteTransaction(ctx, 171 | func(ctx context.Context, txn *spanner.ReadWriteTransaction) error { 172 | var q strings.Builder 173 | fmt.Fprintf(&q, "insert %s ", op.logTable) 174 | fmt.Fprintf(&q, "(key, id, value, leader, timestamp) values (") 175 | fmt.Fprintf(&q, "'%s', ", fmt.Sprintf(semNamef, name)) 176 | fmt.Fprintf(&q, "'%s', ", fmt.Sprintf(semLimitf, limit)) 177 | fmt.Fprintf(&q, "'%s', ", caller) 178 | fmt.Fprintf(&q, "'%s', ", op.hostPort) 179 | fmt.Fprintf(&q, "PENDING_COMMIT_TIMESTAMP())") 180 | 181 | _, err := txn.Update(ctx, spanner.Statement{SQL: q.String()}) 182 | return err 183 | }, 184 | ) 185 | 186 | return err 187 | } 188 | 189 | func readSemaphoreEntry(ctx context.Context, op *Op, name string) (*LogItem, error) { 190 | var q strings.Builder 191 | fmt.Fprintf(&q, "select key, id, value, leader, timestamp ") 192 | fmt.Fprintf(&q, "from %s ", op.logTable) 193 | fmt.Fprintf(&q, "where key = @name") 194 | 195 | stmt := spanner.Statement{ 196 | SQL: q.String(), 197 | Params: map[string]any{ 198 | "name": fmt.Sprintf(semNamef, name), 199 | }, 200 | } 201 | 202 | iter := op.spannerClient.Single().Query(ctx, stmt) 203 | defer iter.Stop() 204 | for { 205 | row, err := iter.Next() 206 | if err == iterator.Done { 207 | break 208 | } 209 | 210 | if err != nil { 211 | return nil, err 212 | } 213 | 214 | var v LogItem 215 | err = row.ToStruct(&v) 216 | if err != nil { 217 | return nil, err 218 | } 219 | 220 | // Should only be one item. 221 | return &v, nil 222 | } 223 | 224 | return nil, fmt.Errorf("%v not found", name) 225 | } 226 | 227 | func createAcquireSemaphoreEntry(ctx context.Context, op *Op, name, caller string, limit int) (bool, error) { 228 | // First, see if caller already acquired this semaphore. 
229 | var q strings.Builder 230 | fmt.Fprintf(&q, "select key, id ") 231 | fmt.Fprintf(&q, "from %s ", op.logTable) 232 | fmt.Fprintf(&q, "where key = @key and id = @id") 233 | 234 | stmt := spanner.Statement{ 235 | SQL: q.String(), 236 | Params: map[string]any{ 237 | "key": fmt.Sprintf(semCallerf, caller), 238 | "id": fmt.Sprintf(semNamef, name), 239 | }, 240 | } 241 | 242 | var cnt int 243 | iter := op.spannerClient.Single().Query(ctx, stmt) 244 | defer iter.Stop() 245 | for { 246 | row, err := iter.Next() 247 | if err == iterator.Done || err != nil { 248 | break 249 | } 250 | 251 | var v LogItem 252 | err = row.ToStruct(&v) 253 | if err != nil { 254 | break 255 | } 256 | 257 | if v.Key != "" && v.Id != "" { 258 | cnt++ 259 | } 260 | } 261 | 262 | if cnt > 0 { 263 | return false, fmt.Errorf("already acquired") 264 | } 265 | 266 | var free bool 267 | _, err := op.spannerClient.ReadWriteTransaction(ctx, 268 | func(ctx context.Context, txn *spanner.ReadWriteTransaction) error { 269 | getEntriesCount := func() int64 { 270 | var q strings.Builder 271 | fmt.Fprintf(&q, "select count(id) id ") 272 | fmt.Fprintf(&q, "from %s ", op.logTable) 273 | fmt.Fprintf(&q, "where id = @id") 274 | 275 | stmt := spanner.Statement{ 276 | SQL: q.String(), 277 | Params: map[string]any{ 278 | "id": fmt.Sprintf(semNamef, name), 279 | }, 280 | } 281 | 282 | var cnt int64 283 | iter := txn.Query(ctx, stmt) 284 | defer iter.Stop() 285 | for { 286 | row, err := iter.Next() 287 | if err == iterator.Done || err != nil { 288 | break 289 | } 290 | 291 | if err := row.Columns(&cnt); err != nil { 292 | break 293 | } 294 | } 295 | 296 | return cnt 297 | } 298 | 299 | // Next, see if there is still semaphore space. 300 | free = getEntriesCount() < int64(limit) 301 | if !free { 302 | return ErrSemFull 303 | } 304 | 305 | // Finally, create the acquire semaphore entry. 306 | var q strings.Builder 307 | fmt.Fprintf(&q, "insert %s", op.logTable) 308 | fmt.Fprintf(&q, "(key, id, value, leader, timestamp) values (") 309 | fmt.Fprintf(&q, "'%s', ", fmt.Sprintf(semCallerf, caller)) 310 | fmt.Fprintf(&q, "'%s', ", fmt.Sprintf(semNamef, name)) 311 | fmt.Fprintf(&q, "'%s', ", caller) 312 | fmt.Fprintf(&q, "'%s', ", op.hostPort) 313 | fmt.Fprintf(&q, "PENDING_COMMIT_TIMESTAMP())") 314 | 315 | _, err := txn.Update(ctx, spanner.Statement{SQL: q.String()}) 316 | if err != nil { 317 | return err 318 | } 319 | 320 | // Finally, we mark this semaphore as full (once). Will be used in release later. 321 | if getEntriesCount() >= int64(limit) { 322 | var q strings.Builder 323 | fmt.Fprintf(&q, "update %s ", op.logTable) 324 | fmt.Fprintf(&q, "set value = @val where key = @name") 325 | 326 | txn.Update(ctx, spanner.Statement{ 327 | SQL: q.String(), 328 | Params: map[string]any{ 329 | "val": markDel, 330 | "name": fmt.Sprintf(semNamef, name), 331 | }, 332 | }) 333 | } 334 | 335 | return nil 336 | }, 337 | ) 338 | 339 | switch { 340 | case err != nil && !free: 341 | return true, err 342 | default: 343 | return false, err 344 | } 345 | } 346 | 347 | func releaseSemaphore(ctx context.Context, op *Op, name, caller, value string, limit int) error { 348 | _ = limit 349 | _, err := op.spannerClient.ReadWriteTransaction(ctx, 350 | func(ctx context.Context, txn *spanner.ReadWriteTransaction) error { 351 | // First, attempt to remove the calling entry. 
352 | var q strings.Builder 353 | fmt.Fprintf(&q, "delete from %s ", op.logTable) 354 | fmt.Fprintf(&q, "where key = @key and id = @id") 355 | 356 | txn.Update(ctx, spanner.Statement{ // best-effort, could fail 357 | SQL: q.String(), 358 | Params: map[string]any{ 359 | "key": fmt.Sprintf(semCallerf, caller), 360 | "id": fmt.Sprintf(semNamef, name), 361 | }, 362 | }) 363 | 364 | // Next, see if there are no more entries. 365 | q.Reset() 366 | fmt.Fprintf(&q, "select count(id) id ") 367 | fmt.Fprintf(&q, "from %s ", op.logTable) 368 | fmt.Fprintf(&q, "where id = @id") 369 | 370 | stmt := spanner.Statement{ 371 | SQL: q.String(), 372 | Params: map[string]any{"id": fmt.Sprintf(semNamef, name)}, 373 | } 374 | 375 | var cnt int64 376 | iter := txn.Query(ctx, stmt) 377 | defer iter.Stop() 378 | for { 379 | row, err := iter.Next() 380 | if err == iterator.Done || err != nil { 381 | break 382 | } 383 | 384 | if err := row.Columns(&cnt); err != nil { 385 | break 386 | } 387 | } 388 | 389 | if cnt != 0 { 390 | return nil 391 | } 392 | 393 | if value == markDel { 394 | // Finally, if no more entries, let's remove the actual semaphore entry 395 | // so we can reuse this name, perhaps with a different limit. 396 | q.Reset() 397 | fmt.Fprintf(&q, "delete from %s ", op.logTable) 398 | fmt.Fprintf(&q, "where key = @key") 399 | 400 | txn.Update(ctx, spanner.Statement{ 401 | SQL: q.String(), 402 | Params: map[string]any{"key": fmt.Sprintf(semNamef, name)}, 403 | }) 404 | } 405 | 406 | return nil 407 | }, 408 | ) 409 | 410 | return err 411 | } 412 | 413 | type ensureT struct { 414 | sync.Mutex 415 | m map[string]struct{} 416 | } 417 | 418 | func ensureLock() *ensureT { return &ensureT{m: make(map[string]struct{})} } 419 | 420 | func (e *ensureT) add(name string) { 421 | e.Lock() 422 | defer e.Unlock() 423 | e.m[name] = struct{}{} 424 | } 425 | 426 | func (e *ensureT) del(name string) { 427 | e.Lock() 428 | defer e.Unlock() 429 | delete(e.m, name) 430 | } 431 | 432 | func (e *ensureT) exists(name string) bool { 433 | e.Lock() 434 | defer e.Unlock() 435 | _, ok := e.m[name] 436 | return ok 437 | } 438 | 439 | // Triggered during semaphore acquisition; meaning, this is only called when we are leader. 
440 | func ensureLiveness(ctx context.Context, op *Op) { 441 | if op.ensureOn.Load() == 1 { 442 | return // one checker per leader 443 | } 444 | 445 | op.ensureOn.Store(1) 446 | defer op.ensureOn.Store(0) 447 | op.ensureCtx, op.ensureCancel = context.WithCancel(ctx) 448 | 449 | enlock := ensureLock() 450 | ensure := func(name string) { 451 | enlock.add(name) 452 | defer enlock.del(name) 453 | 454 | var q strings.Builder 455 | fmt.Fprintf(&q, "select key from %s ", op.logTable) 456 | fmt.Fprintf(&q, "where id = @id") 457 | 458 | stmt := spanner.Statement{ 459 | SQL: q.String(), 460 | Params: map[string]any{ 461 | "id": fmt.Sprintf(semNamef, name), 462 | }, 463 | } 464 | 465 | ids := []string{} 466 | iter := op.spannerClient.Single().Query(ctx, stmt) 467 | defer iter.Stop() 468 | for { 469 | row, err := iter.Next() 470 | if err == iterator.Done { 471 | break 472 | } 473 | 474 | if err != nil { 475 | break 476 | } 477 | 478 | var v LogItem 479 | err = row.ToStruct(&v) 480 | if err != nil { 481 | continue 482 | } 483 | 484 | ids = append(ids, v.Key) 485 | } 486 | 487 | if len(ids) > 0 { 488 | todel := make(chan string, len(ids)) 489 | var w sync.WaitGroup 490 | for _, id := range ids { 491 | w.Add(1) 492 | go func(t string) { 493 | var rmId string 494 | defer func(rm *string) { 495 | todel <- *rm 496 | w.Done() 497 | }(&rmId) 498 | 499 | timeout := time.Second * 5 500 | caller := strings.Split(t, "=")[1] 501 | conn, err := net.DialTimeout("tcp", caller, timeout) 502 | if err != nil { 503 | rmId = t // delete this 504 | return 505 | } 506 | 507 | var sb strings.Builder 508 | fmt.Fprintf(&sb, "%s\n", CmdPing) 509 | r, err := op.send(conn, sb.String()) 510 | if err != nil { 511 | rmId = t // delete this 512 | return 513 | } 514 | 515 | if r != CmdAck { 516 | rmId = t // delete this 517 | } 518 | }(id) 519 | } 520 | 521 | w.Wait() 522 | rms := []string{} 523 | for range ids { 524 | rm := <-todel 525 | if rm != "" { 526 | rms = append(rms, rm) 527 | } 528 | } 529 | 530 | if len(rms) > 0 { 531 | op.logger.Printf("[ensure/sem] delete: %v", rms) 532 | op.spannerClient.ReadWriteTransaction(ctx, 533 | func(ctx context.Context, txn *spanner.ReadWriteTransaction) error { 534 | q.Reset() 535 | fmt.Fprintf(&q, "delete from %s ", op.logTable) 536 | fmt.Fprintf(&q, "where key in ('%s')", strings.Join(rms, "','")) 537 | _, err := txn.Update(ctx, spanner.Statement{SQL: q.String()}) 538 | return err 539 | }, 540 | ) 541 | } 542 | 543 | time.Sleep(time.Second * 5) 544 | } 545 | } 546 | 547 | for { 548 | var name string 549 | select { 550 | case <-op.ensureCtx.Done(): 551 | op.ensureDone <- struct{}{} 552 | return 553 | case name = <-op.ensureCh: 554 | } 555 | 556 | if enlock.exists(name) { 557 | continue 558 | } 559 | 560 | go ensure(name) 561 | } 562 | } 563 | -------------------------------------------------------------------------------- /example/demo/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "flag" 7 | "fmt" 8 | "io" 9 | "log" 10 | "log/slog" 11 | "net/http" 12 | "os" 13 | "os/signal" 14 | "strconv" 15 | "strings" 16 | "sync" 17 | "syscall" 18 | "time" 19 | 20 | "cloud.google.com/go/spanner" 21 | pb "github.com/flowerinthenight/hedge-proto" 22 | "github.com/flowerinthenight/hedge/v2" 23 | "github.com/google/uuid" 24 | "golang.org/x/exp/mmap" 25 | "golang.org/x/sync/errgroup" 26 | ) 27 | 28 | var ( 29 | dbstr = flag.String("db", "", "fmt: projects/{v}/instances/{v}/databases/{v}") 30 | 
lockName = flag.String("lockname", "hedge-demo-group", "lock name, common to all instances") 31 | spindleTable = flag.String("spindletable", "testlease", "see https://github.com/flowerinthenight/spindle for more info") 32 | logTable = flag.String("logtable", "", "the table for our log data (optional)") 33 | ) 34 | 35 | func main() { 36 | flag.Parse() 37 | ctx, cancel := context.WithCancel(context.Background()) 38 | client, err := spanner.NewClient(ctx, *dbstr) 39 | if err != nil { 40 | slog.Error("NewClient failed:", "err", err) 41 | return 42 | } 43 | 44 | defer client.Close() 45 | ldrIn := make(chan *hedge.StreamMessage) 46 | ldrOut := make(chan *hedge.StreamMessage) 47 | go func(_ctx context.Context) { 48 | for { 49 | select { 50 | case <-_ctx.Done(): 51 | return 52 | case m := <-ldrIn: 53 | b, _ := json.Marshal(m) 54 | slog.Info("input stream:", "val", string(b)) 55 | ldrOut <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("one")}} 56 | ldrOut <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("two")}} 57 | ldrOut <- nil // end 58 | } 59 | } 60 | }(context.WithValue(ctx, struct{}{}, nil)) 61 | 62 | bcastIn := make(chan *hedge.StreamMessage) 63 | bcastOut := make(chan *hedge.StreamMessage) 64 | host, _ := os.Hostname() 65 | go func(_ctx context.Context) { 66 | for { 67 | select { 68 | case <-_ctx.Done(): 69 | return 70 | case m := <-bcastIn: 71 | slog.Info("input stream:", "val", string(m.Payload.Data)) 72 | bcastOut <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("1_" + host)}} 73 | bcastOut <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("2_" + host)}} 74 | bcastOut <- nil // end 75 | } 76 | } 77 | }(context.WithValue(ctx, struct{}{}, nil)) 78 | 79 | op := hedge.New(client, ":8080", *spindleTable, *lockName, *logTable, 80 | hedge.WithGroupSyncInterval(time.Second*5), 81 | hedge.WithLeaderCallback(nil, func(d any, m []byte) { 82 | log.Println("callback:", string(m)) 83 | }), 84 | hedge.WithLeaderHandler( 85 | nil, // since this is nil, 'data' should be 'op' 86 | func(data any, msg []byte) ([]byte, error) { 87 | op := data.(*hedge.Op) 88 | hostname, _ := os.Hostname() 89 | name := fmt.Sprintf("%v/%v", hostname, op.Name()) 90 | log.Println("[send] received:", string(msg)) 91 | reply := fmt.Sprintf("leader [%v] received the message [%v] on %v", 92 | name, string(msg), time.Now().Format(time.RFC3339)) 93 | return []byte(reply), nil 94 | }, 95 | ), 96 | hedge.WithBroadcastHandler( 97 | nil, // since this is nil, 'data' should be 'op' 98 | func(data any, msg []byte) ([]byte, error) { 99 | op := data.(*hedge.Op) 100 | hostname, _ := os.Hostname() 101 | name := fmt.Sprintf("%v/%v", hostname, op.Name()) 102 | log.Println("[broadcast] received:", string(msg)) 103 | reply := fmt.Sprintf("node [%v] received the broadcast message [%v] on %v", 104 | name, string(msg), time.Now().Format(time.RFC3339)) 105 | return []byte(reply), nil 106 | 107 | // log.Println("[broadcast/semaphore] received:", string(msg)) 108 | // ss := strings.Split(string(msg), " ") 109 | // name, slimit := ss[0], ss[1] 110 | // limit, err := strconv.Atoi(slimit) 111 | // if err != nil { 112 | // log.Println("invalid limit:", err) 113 | // return nil, err 114 | // } 115 | 116 | // go func() { 117 | // op := data.(*hedge.Op) 118 | // min, max := 10, 30 119 | // tm := rand.Intn(max-min+1) + min 120 | // s, err := op.NewSemaphore(context.Background(), name, limit) 121 | // if err != nil { 122 | // log.Println("NewSemaphore failed:", err) 123 | // return 124 | // } 125 | 126 | // err = 
s.Acquire(context.Background()) 127 | // if err != nil { 128 | // log.Println("Acquire failed:", err) 129 | // return 130 | // } 131 | 132 | // log.Printf("semaphore acquired! simulate work for %vs, id=%v", tm, op.HostPort()) 133 | // time.Sleep(time.Second * time.Duration(tm)) 134 | 135 | // log.Printf("release semaphore, id=%v", op.HostPort()) 136 | // s.Release(context.Background()) 137 | // }() 138 | 139 | // return nil, nil 140 | }, 141 | ), 142 | hedge.WithLeaderStreamChannels(ldrIn, ldrOut), 143 | hedge.WithBroadcastStreamChannels(bcastIn, bcastOut), 144 | ) 145 | 146 | log.Println(op) 147 | done := make(chan error, 1) 148 | go op.Run(ctx, done) 149 | 150 | mux := http.NewServeMux() 151 | mux.HandleFunc("/put", func(w http.ResponseWriter, r *http.Request) { 152 | hostname, _ := os.Hostname() 153 | var key, value string 154 | 155 | // For /put, we expect a fmt: "key value" 156 | b, _ := io.ReadAll(r.Body) 157 | defer r.Body.Close() 158 | if len(string(b)) > 0 { 159 | ss := strings.Split(string(b), " ") 160 | if len(ss) < 2 { 161 | w.Write([]byte("invalid msg format")) 162 | return 163 | } 164 | 165 | key = ss[0] 166 | value = strings.Join(ss[1:], " ") 167 | } 168 | 169 | if key == "" || value == "" { 170 | w.Write([]byte("invalid msg format")) 171 | return 172 | } 173 | 174 | err := op.Put(ctx, hedge.KeyValue{Key: key, Value: value}) 175 | if err != nil { 176 | w.Write([]byte(err.Error())) 177 | return 178 | } 179 | 180 | out := fmt.Sprintf("put: sender=%v, key=%v, value=%v", hostname, key, value) 181 | w.Write([]byte(out)) 182 | }) 183 | 184 | mux.HandleFunc("/get", func(w http.ResponseWriter, r *http.Request) { 185 | hostname, _ := os.Hostname() 186 | b, _ := io.ReadAll(r.Body) 187 | defer r.Body.Close() 188 | v, err := op.Get(ctx, string(b)) 189 | if err != nil { 190 | w.Write([]byte(err.Error())) 191 | return 192 | } 193 | 194 | out := fmt.Sprintf("get: sender=%v, key=%v, value=%+v", hostname, string(b), v) 195 | w.Write([]byte(out)) 196 | }) 197 | 198 | mux.HandleFunc("/send", func(w http.ResponseWriter, r *http.Request) { 199 | hostname, _ := os.Hostname() 200 | msg := "hello" // default 201 | b, _ := io.ReadAll(r.Body) 202 | defer r.Body.Close() 203 | if len(string(b)) > 0 { 204 | msg = string(b) 205 | } 206 | 207 | log.Printf("sending %q msg to leader...", msg) 208 | v, err := hedge.SendToLeader(context.Background(), op, []byte(msg)) 209 | if err != nil { 210 | w.Write([]byte(err.Error())) 211 | return 212 | } 213 | 214 | log.Printf("reply: %v", string(v)) 215 | out := fmt.Sprintf("sender=%v, reply=%v", hostname, string(v)) 216 | w.Write([]byte(out)) 217 | }) 218 | 219 | mux.HandleFunc("/streamsend", func(w http.ResponseWriter, r *http.Request) { 220 | ret, err := op.StreamToLeader(context.Background()) 221 | if err != nil { 222 | w.Write([]byte(err.Error())) 223 | return 224 | } 225 | 226 | ret.In <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("test")}} 227 | close(ret.In) // we're done with input 228 | for m := range ret.Out { 229 | slog.Info("reply:", "out", string(m.Payload.Data)) 230 | } 231 | 232 | w.Write([]byte("OK")) 233 | }) 234 | 235 | mux.HandleFunc("/broadcast", func(w http.ResponseWriter, r *http.Request) { 236 | hostname, _ := os.Hostname() 237 | msg := "hello" // default 238 | b, _ := io.ReadAll(r.Body) 239 | defer r.Body.Close() 240 | if len(string(b)) > 0 { 241 | msg = string(b) 242 | } 243 | 244 | outs := []string{} 245 | log.Printf("broadcast %q msg to all...", msg) 246 | stream := false 247 | if stream { 248 | ch := make(chan 
hedge.BroadcastOutput) 249 | go op.Broadcast(context.Background(), []byte(msg), hedge.BroadcastArgs{Out: ch}) 250 | for v := range ch { 251 | if v.Error != nil { 252 | out := fmt.Sprintf("broadcast: sender=%v, reply=%v", hostname, v.Error.Error()) 253 | outs = append(outs, out) 254 | } else { 255 | out := fmt.Sprintf("broadcast: sender=%v, reply=%v", hostname, string(v.Reply)) 256 | outs = append(outs, out) 257 | } 258 | } 259 | } else { 260 | vv := op.Broadcast(context.Background(), []byte(msg)) 261 | for _, v := range vv { 262 | if v.Error != nil { 263 | out := fmt.Sprintf("broadcast: sender=%v, reply=%v", hostname, v.Error.Error()) 264 | outs = append(outs, out) 265 | } else { 266 | out := fmt.Sprintf("broadcast: sender=%v, reply=%v", hostname, string(v.Reply)) 267 | outs = append(outs, out) 268 | } 269 | } 270 | } 271 | 272 | w.Write([]byte(strings.Join(outs, "\n"))) 273 | }) 274 | 275 | mux.HandleFunc("/streambroadcast", func(w http.ResponseWriter, r *http.Request) { 276 | ret, err := op.StreamBroadcast(context.Background()) 277 | if err != nil { 278 | w.Write([]byte(err.Error())) 279 | return 280 | } 281 | 282 | ret.In <- &hedge.StreamMessage{Payload: &pb.Payload{Data: []byte("test")}} 283 | close(ret.In) // we're done with input 284 | 285 | var wg sync.WaitGroup 286 | for k, v := range ret.Outs { 287 | wg.Add(1) 288 | go func(node string, ch chan *hedge.StreamMessage) { 289 | defer wg.Done() 290 | for m := range ch { 291 | slog.Info("reply:", "node", node, "data", string(m.Payload.Data)) 292 | } 293 | }(k, v) 294 | } 295 | 296 | wg.Wait() 297 | w.Write([]byte("OK")) 298 | }) 299 | 300 | mux.HandleFunc("/sos", func(w http.ResponseWriter, r *http.Request) { 301 | defer func(start time.Time) { 302 | slog.Info("distmem:", "duration", time.Since(start)) 303 | }(time.Now()) 304 | 305 | name := "distmem_" + time.Now().Format(time.RFC3339) 306 | rname := r.URL.Query().Get("name") 307 | if rname != "" { 308 | name = rname 309 | } 310 | 311 | slog.Info("start distmem:", "name", name) 312 | limit := 14_000 // 4 pods, all 313 | // limit := 2_500 314 | 315 | sos := func() *hedge.SoS { 316 | sos := op.NewSoS(name, &hedge.SoSOptions{ 317 | MemLimit: 150_000, 318 | DiskLimit: 120_000, 319 | Expiration: 5, 320 | }) 321 | 322 | writer, err := sos.Writer() 323 | if err != nil { 324 | slog.Error("Writer failed:", "err", err) 325 | return nil 326 | } 327 | 328 | defer writer.Close() 329 | var n int 330 | for i := 0; i < limit; i++ { 331 | data := fmt.Sprintf("2_%v_%v", uuid.NewString(), time.Now().Format(time.RFC3339)) 332 | n += len([]byte(data)) 333 | writer.Write([]byte(data)) 334 | } 335 | 336 | slog.Info("write_dm:", "i", limit, "n", n, "write_err", writer.Err()) 337 | return sos 338 | }() 339 | 340 | if sos == nil { 341 | slog.Error("failed in creating SoS object") 342 | return 343 | } 344 | 345 | // reader_1 346 | func() { 347 | reader, err := sos.Reader() 348 | if err != nil { 349 | slog.Error(err.Error()) 350 | return 351 | } 352 | 353 | out := make(chan []byte) 354 | eg := new(errgroup.Group) 355 | eg.Go(func() error { 356 | var i, n, total int 357 | for d := range out { 358 | ss := strings.Split(string(d), "_") 359 | if len(ss) != 3 { 360 | slog.Error("bad fmt:", "len", len(ss)) 361 | continue 362 | } 363 | 364 | t, err := strconv.Atoi(ss[0]) 365 | if err != nil { 366 | slog.Error("Atoi failed:", "err", err) 367 | continue 368 | } 369 | 370 | total += t 371 | _, err = time.Parse(time.RFC3339, ss[2]) 372 | if err != nil { 373 | slog.Error("Parse failed:", "err", err) 374 | continue 375 | } 376 
| 377 | n += len(d) 378 | i++ 379 | } 380 | 381 | slog.Info("read_dm:", "i", i, "n", n, "total", total) 382 | return nil 383 | }) 384 | 385 | reader.Read(out) 386 | eg.Wait() 387 | reader.Close() 388 | slog.Info("read_dm:", "read_err", reader.Err()) 389 | }() 390 | 391 | // reader_2 392 | func() { 393 | reader, err := sos.Reader() 394 | if err != nil { 395 | slog.Error(err.Error()) 396 | return 397 | } 398 | 399 | out := make(chan []byte) 400 | eg := new(errgroup.Group) 401 | eg.Go(func() error { 402 | var i, n, total int 403 | for d := range out { 404 | ss := strings.Split(string(d), "_") 405 | if len(ss) != 3 { 406 | slog.Error("bad fmt:", "len", len(ss)) 407 | continue 408 | } 409 | 410 | t, err := strconv.Atoi(ss[0]) 411 | if err != nil { 412 | slog.Error("Atoi failed:", "err", err) 413 | continue 414 | } 415 | 416 | total += t 417 | _, err = time.Parse(time.RFC3339, ss[2]) 418 | if err != nil { 419 | slog.Error("Parse failed:", "err", err) 420 | continue 421 | } 422 | 423 | n += len(d) 424 | i++ 425 | } 426 | 427 | slog.Info("read_dm:", "i", i, "n", n, "total", total) 428 | return nil 429 | }) 430 | 431 | reader.Read(out) 432 | eg.Wait() 433 | reader.Close() 434 | slog.Info("read_dm:", "read_err", reader.Err()) 435 | }() 436 | 437 | sos.Close() 438 | w.Write([]byte("OK")) 439 | }) 440 | 441 | // NOTE: Used only on my local environment. 442 | mux.HandleFunc("/soslocal", func(w http.ResponseWriter, r *http.Request) { 443 | defer func(start time.Time) { 444 | slog.Info("distmem:", "duration", time.Since(start)) 445 | }(time.Now()) 446 | 447 | type kcT struct { 448 | Key string `json:"key"` 449 | TrueUnblended float64 `json:"trueUnblended"` 450 | Unblended float64 `json:"unblended"` 451 | Usage float64 `json:"usage"` 452 | } 453 | 454 | // See $HOME/tmp/ 455 | locs, _ := os.ReadFile("readlocs") 456 | ss := strings.Split(string(locs), " ") 457 | 458 | // See $HOME/tmp/ 459 | ra, err := mmap.Open("readdata") 460 | if err != nil { 461 | slog.Error(err.Error()) 462 | return 463 | } 464 | 465 | defer ra.Close() 466 | 467 | name := "distmem_" + time.Now().Format(time.RFC3339) 468 | rname := r.URL.Query().Get("name") 469 | if rname != "" { 470 | name = rname 471 | } 472 | 473 | slog.Info("start distmem:", "name", name) 474 | 475 | sos := func() *hedge.SoS { 476 | sos := op.NewSoS(name, &hedge.SoSOptions{ 477 | MemLimit: 10_000_000, 478 | DiskLimit: 10_000_000, 479 | Expiration: 5, 480 | }) 481 | 482 | writer, err := sos.Writer() 483 | if err != nil { 484 | slog.Error("Writer failed:", "err", err) 485 | return nil 486 | } 487 | 488 | var i, wt int 489 | var off int64 490 | locs := []int{} 491 | for _, sloc := range ss { 492 | i++ 493 | loc, _ := strconv.ParseInt(sloc, 10, 64) 494 | locs = append(locs, int(loc)) 495 | b := make([]byte, loc) 496 | n, err := ra.ReadAt(b, off) 497 | if err != nil { 498 | slog.Error(err.Error()) 499 | break 500 | } 501 | 502 | var kc kcT 503 | err = json.Unmarshal(b, &kc) 504 | if err != nil { 505 | slog.Error(err.Error()) 506 | break 507 | } 508 | 509 | if int64(n) != loc { 510 | slog.Error("not equal:", "n", n, "loc", loc) 511 | } 512 | 513 | off = off + int64(n) 514 | wt += n 515 | writer.Write(b) 516 | } 517 | 518 | writer.Close() 519 | slog.Info("total_write:", 520 | "count", i, 521 | "val", wt, 522 | "err", writer.Err(), 523 | ) 524 | 525 | return sos 526 | }() 527 | 528 | func() { 529 | reader, _ := sos.Reader() 530 | out := make(chan []byte) 531 | eg := new(errgroup.Group) 532 | eg.Go(func() error { 533 | var print int 534 | var i, rt int 535 | for d := range 
out { 536 | i++ 537 | var kc kcT 538 | err = json.Unmarshal(d, &kc) 539 | if err != nil { 540 | if print < 2 { 541 | slog.Error(err.Error(), "i", i, "raw", string(d)) 542 | print++ 543 | } 544 | 545 | continue 546 | } 547 | 548 | rt += len(d) 549 | } 550 | 551 | slog.Info("total_read:", "count", i, "val", rt) 552 | return nil 553 | }) 554 | 555 | reader.Read(out) 556 | eg.Wait() 557 | reader.Close() 558 | slog.Info("read_dm:", "read_err", reader.Err()) 559 | }() 560 | 561 | sos.Close() 562 | w.Write([]byte("OK")) 563 | }) 564 | 565 | s := &http.Server{Addr: ":9090", Handler: mux} 566 | go s.ListenAndServe() 567 | 568 | // Interrupt handler. 569 | go func() { 570 | sigch := make(chan os.Signal, 1) 571 | signal.Notify(sigch, syscall.SIGINT, syscall.SIGTERM) 572 | <-sigch 573 | cancel() 574 | }() 575 | 576 | <-done // wait ctrl+c 577 | s.Shutdown(ctx) 578 | } 579 | -------------------------------------------------------------------------------- /sos.go: -------------------------------------------------------------------------------- 1 | package hedge 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "net" 8 | "os" 9 | "strconv" 10 | "sync" 11 | "sync/atomic" 12 | "time" 13 | 14 | "github.com/apache/arrow/go/v17/arrow" 15 | "github.com/apache/arrow/go/v17/arrow/array" 16 | "github.com/apache/arrow/go/v17/arrow/memory" 17 | "github.com/cespare/xxhash/v2" 18 | pb "github.com/flowerinthenight/hedge-proto" 19 | "github.com/shirou/gopsutil/v4/mem" 20 | "golang.org/x/exp/mmap" 21 | "golang.org/x/sync/errgroup" 22 | "google.golang.org/grpc" 23 | "google.golang.org/grpc/credentials/insecure" 24 | ) 25 | 26 | const ( 27 | metaName = "name" 28 | metaMemLimit = "mlimit" 29 | metaDiskLimit = "dlimit" 30 | metaExpire = "expire" 31 | ) 32 | 33 | var ( 34 | errNoInit = fmt.Errorf("sos: not properly initialized") 35 | ) 36 | 37 | type metaT struct { 38 | msize atomic.Uint64 39 | dsize atomic.Uint64 40 | grpc atomic.Int32 41 | conn *grpc.ClientConn 42 | client pb.HedgeClient 43 | writer pb.Hedge_SoSWriteClient 44 | reader pb.Hedge_SoSReadClient 45 | } 46 | 47 | type SoSOptions struct { 48 | // MemLimit sets the memory limit in bytes to be used per node. 49 | MemLimit uint64 50 | 51 | // DiskLimit sets the disk limit in bytes to be used per node. 52 | DiskLimit uint64 53 | 54 | // Expiration sets the TTL (time-to-live) of the backing storage. 55 | // If not set, the default is 30s. 56 | Expiration int64 57 | } 58 | 59 | type memT struct { 60 | mem *memory.GoAllocator 61 | bb *array.BinaryBuilder 62 | bufs *array.Binary 63 | } 64 | 65 | // SoS (Spillover-Store) represents an object for spill-over (or stitched) 66 | // storage. Useful for load-process-discard types of data processing. The 67 | // order of storage priority is local memory, local disk, other pod's 68 | // memory, other pod's disk, and so on. 69 | // 70 | // Limitation: At the moment, it's not allowed to reuse a name for SOS 71 | // once it's used and closed within hedge's lifetime. 
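// A minimal write-then-read sketch, condensed from example/demo/main.go
// (the *Op value `op`, the name, and the limits are illustrative):
//
//	sos := op.NewSoS("scratch", &SoSOptions{MemLimit: 1 << 20, DiskLimit: 1 << 20, Expiration: 5})
//
//	writer, err := sos.Writer()
//	if err == nil {
//		writer.Write([]byte("hello"))
//		writer.Close() // close the writer before reading
//	}
//
//	reader, err := sos.Reader()
//	if err == nil {
//		out := make(chan []byte)
//		go func() {
//			for data := range out { // Read closes `out` when done
//				_ = data
//			}
//		}()
//		reader.Read(out) // blocks until all stored data has been streamed
//		reader.Close()
//	}
//
//	sos.Close()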
72 | type SoS struct { 73 | sync.Mutex 74 | 75 | Name string // the name of this instance 76 | 77 | op *Op // cluster coordinator 78 | nodes []uint64 // 0=local, 1..n=network 79 | meta map[uint64]*metaT // per-node metadata, key=node 80 | mlimit atomic.Uint64 // mem limit 81 | dlimit atomic.Uint64 // disk limit 82 | data map[uint64]*memT // mem data , key=node 83 | dlocs []int // disk offsets 84 | mlock *sync.Mutex // local mem lock 85 | dlock *sync.Mutex // local file lock 86 | wmtx *sync.Mutex // one active writer only 87 | writer *Writer // writer object 88 | refs atomic.Int64 // self reference count 89 | wrefs atomic.Int64 // writer reference count 90 | rrefs atomic.Int64 // reader reference count 91 | on atomic.Int32 // 1 = active 92 | 93 | ttl time.Duration // ttl to cleanup 94 | age time.Time // started 95 | } 96 | 97 | type Writer struct { 98 | sync.Mutex 99 | lo bool // local write only 100 | sos *SoS 101 | ch chan []byte 102 | on atomic.Int32 103 | err error 104 | done chan struct{} 105 | } 106 | 107 | // Err returns the last recorded error during the write operation. 108 | func (w *Writer) Err() error { 109 | w.Lock() 110 | defer w.Unlock() 111 | return w.err 112 | } 113 | 114 | // Write writes data to the underlying storage. 115 | func (w *Writer) Write(data []byte) { w.ch <- data } 116 | 117 | // Close closes the writer object. 118 | func (w *Writer) Close() { 119 | if w.on.Load() == 0 { 120 | return 121 | } 122 | 123 | close(w.ch) 124 | <-w.done // wait for start() 125 | w.on.Store(0) 126 | w.sos.wrefs.Add(-1) 127 | w.sos.wmtx.Unlock() 128 | } 129 | 130 | func (w *Writer) start() { 131 | defer func() { w.done <- struct{}{} }() 132 | w.on.Store(1) 133 | ctx := context.Background() 134 | node := w.sos.nodes[0] 135 | var file *os.File 136 | 137 | var allCount int 138 | var memCount int 139 | var diskCount int 140 | var netCount int 141 | var failCount int 142 | 143 | var mlock bool 144 | var dlock bool 145 | unlock := func(b bool, l *sync.Mutex) { 146 | if b { 147 | l.Unlock() 148 | } 149 | } 150 | 151 | for data := range w.ch { 152 | allCount++ 153 | var err error 154 | var nextName string 155 | msize := w.sos.meta[node].msize.Load() 156 | mlimit := w.sos.mlimit.Load() 157 | dsize := w.sos.meta[node].dsize.Load() 158 | dlimit := w.sos.dlimit.Load() 159 | 160 | // Local (or next hop) is full. Go to the next node. 161 | if !w.lo && ((msize + dsize) >= (mlimit + dlimit)) { 162 | nextName, node = w.sos.nextNode() 163 | if nextName == "" { 164 | failCount++ 165 | w.Lock() 166 | w.err = fmt.Errorf("cannot find next node") 167 | w.Unlock() 168 | continue 169 | } 170 | 171 | if w.sos.meta[node].grpc.Load() == 0 { 172 | err = func() error { 173 | host, port, _ := net.SplitHostPort(nextName) 174 | pi, _ := strconv.Atoi(port) 175 | nextName = net.JoinHostPort(host, fmt.Sprintf("%v", pi+1)) 176 | 177 | var opts []grpc.DialOption 178 | opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials())) 179 | w.sos.meta[node].conn, err = grpc.NewClient(nextName, opts...) 
180 | if err != nil { 181 | return fmt.Errorf("NewClient (%v) failed: %w", nextName, err) 182 | } 183 | 184 | w.sos.meta[node].client = pb.NewHedgeClient(w.sos.meta[node].conn) 185 | w.sos.meta[node].writer, err = w.sos.meta[node].client.SoSWrite(ctx) 186 | if err != nil { 187 | return fmt.Errorf("SoSWrite (%v) failed: %w", nextName, err) 188 | } 189 | 190 | w.sos.meta[node].grpc.Add(1) 191 | return nil 192 | }() 193 | 194 | if err != nil { 195 | w.Lock() 196 | w.err = err 197 | w.Unlock() 198 | } 199 | } 200 | } 201 | 202 | switch { 203 | case !w.lo && node != w.sos.me(): 204 | netCount++ 205 | err := w.sos.meta[node].writer.Send(&pb.Payload{ 206 | Meta: map[string]string{ 207 | metaName: w.sos.Name, 208 | metaMemLimit: fmt.Sprintf("%v", w.sos.mlimit.Load()), 209 | metaDiskLimit: fmt.Sprintf("%v", w.sos.dlimit.Load()), 210 | metaExpire: fmt.Sprintf("%v", int64(w.sos.ttl.Seconds())), 211 | }, 212 | Data: data, 213 | }) 214 | 215 | if err != nil { 216 | w.Lock() 217 | w.err = fmt.Errorf("Send failed: %w", err) 218 | w.Unlock() 219 | } 220 | 221 | w.sos.meta[node].msize.Add(uint64(len(data))) 222 | default: 223 | if msize < mlimit { 224 | memCount++ 225 | if !mlock { 226 | w.sos.mlock.Lock() 227 | mlock = true 228 | } 229 | 230 | if _, ok := w.sos.data[node]; !ok { 231 | w.sos.data[node] = &memT{} 232 | } 233 | 234 | if w.sos.data[node].bb == nil { 235 | w.sos.data[node].mem = memory.NewGoAllocator() 236 | w.sos.data[node].bb = array.NewBinaryBuilder( 237 | w.sos.data[node].mem, 238 | &arrow.BinaryType{}, 239 | ) 240 | } 241 | 242 | w.sos.data[node].bb.Append(data) 243 | w.sos.meta[node].msize.Add(uint64(len(data))) 244 | } else { 245 | diskCount++ 246 | if !dlock { 247 | w.sos.dlock.Lock() 248 | dlock = true 249 | } 250 | 251 | if file == nil { 252 | flag := os.O_WRONLY | os.O_CREATE | os.O_TRUNC 253 | file, err = os.OpenFile(w.sos.localFile(), flag, 0644) 254 | if err != nil { 255 | w.sos.op.logger.Println("OpenFile failed:", err) 256 | } 257 | } 258 | 259 | n, err := file.Write(data) 260 | if err != nil { 261 | w.Lock() 262 | w.err = fmt.Errorf("Write failed: %w", err) 263 | w.Unlock() 264 | } else { 265 | w.sos.dlocs = append(w.sos.dlocs, n) 266 | w.sos.meta[node].dsize.Add(uint64(n)) 267 | } 268 | } 269 | } 270 | } 271 | 272 | // slog.Info( 273 | // "write:", 274 | // "all", allCount, 275 | // "add", memCount+diskCount+netCount+failCount, 276 | // "mem", memCount, 277 | // "disk", diskCount, 278 | // "net", netCount, 279 | // "fail", failCount, 280 | // "nodes", w.sos.nodes, 281 | // ) 282 | 283 | nodes := []uint64{} 284 | for k := range w.sos.meta { 285 | nodes = append(nodes, k) 286 | } 287 | 288 | for _, n := range nodes { 289 | if w.sos.data[n].bb != nil { 290 | w.sos.data[n].bufs = w.sos.data[n].bb.NewBinaryArray() 291 | w.sos.data[n].bb.Release() 292 | w.sos.data[n].bb = nil 293 | // slog.Info("arrow: release(bb):", "node", n) 294 | } 295 | } 296 | 297 | unlock(mlock, w.sos.mlock) 298 | 299 | file.Sync() 300 | file.Close() 301 | unlock(dlock, w.sos.dlock) 302 | 303 | for _, n := range nodes { 304 | if w.sos.meta[n].writer != nil { 305 | w.sos.meta[n].writer.CloseSend() 306 | } 307 | } 308 | } 309 | 310 | type writerOptions struct { 311 | LocalOnly bool 312 | } 313 | 314 | // Writer returns a writer object for writing data to SoS. The 315 | // caller needs to call writer.Close() after use. Options is 316 | // only used internally, not exposed to callers. 
317 | func (sos *SoS) Writer(opts ...*writerOptions) (*Writer, error) { 318 | if sos.on.Load() == 0 { 319 | return nil, errNoInit 320 | } 321 | 322 | sos.wmtx.Lock() 323 | var localOnly bool 324 | if len(opts) > 0 { 325 | localOnly = opts[0].LocalOnly 326 | } 327 | 328 | sos.writer = &Writer{ 329 | lo: localOnly, 330 | sos: sos, 331 | ch: make(chan []byte), 332 | done: make(chan struct{}, 1), 333 | } 334 | 335 | go sos.writer.start() 336 | sos.wrefs.Add(1) 337 | return sos.writer, nil 338 | } 339 | 340 | type Reader struct { 341 | sync.Mutex 342 | lo bool // local read only 343 | sos *SoS 344 | on atomic.Int32 345 | err error 346 | done chan struct{} 347 | } 348 | 349 | // Read reads the underlying data and streams them to the `out` channel. 350 | func (r *Reader) Read(out chan []byte) { 351 | eg := new(errgroup.Group) 352 | eg.Go(func() error { 353 | r.on.Store(1) 354 | ctx := context.Background() 355 | for _, node := range r.sos.nodes { 356 | var err error 357 | switch { 358 | case !r.lo && node != r.sos.me(): 359 | func() { 360 | r.sos.meta[node].reader, err = r.sos.meta[node].client.SoSRead(ctx) 361 | if err != nil { 362 | r.Lock() 363 | r.err = fmt.Errorf("SoSRead failed: %v", err) 364 | r.Unlock() 365 | return 366 | } 367 | }() 368 | 369 | err = r.sos.meta[node].reader.Send(&pb.Payload{ 370 | Meta: map[string]string{ 371 | metaName: r.sos.Name, 372 | metaMemLimit: fmt.Sprintf("%v", r.sos.mlimit.Load()), 373 | metaDiskLimit: fmt.Sprintf("%v", r.sos.dlimit.Load()), 374 | metaExpire: fmt.Sprintf("%v", int64(r.sos.ttl.Seconds())), 375 | }, 376 | }) 377 | 378 | if err != nil { 379 | r.Lock() 380 | r.err = fmt.Errorf("Send failed: %v", err) 381 | r.Unlock() 382 | continue 383 | } 384 | 385 | for { 386 | in, err := r.sos.meta[node].reader.Recv() 387 | if err == io.EOF { 388 | break 389 | } 390 | 391 | if err != nil { 392 | r.Lock() 393 | r.err = fmt.Errorf("Recv failed: %v", err) 394 | r.Unlock() 395 | break 396 | } 397 | 398 | out <- in.Data 399 | } 400 | default: 401 | func() { 402 | r.sos.mlock.Lock() 403 | defer r.sos.mlock.Unlock() 404 | if _, ok := r.sos.data[node]; !ok { 405 | return 406 | } 407 | 408 | if r.sos.data[node].bufs == nil { 409 | return 410 | } 411 | 412 | for i := 0; i < r.sos.data[node].bufs.Len(); i++ { 413 | out <- r.sos.data[node].bufs.Value(i) 414 | } 415 | }() 416 | 417 | func() { 418 | r.sos.dlock.Lock() 419 | defer r.sos.dlock.Unlock() 420 | if len(r.sos.dlocs) == 0 { 421 | return 422 | } 423 | 424 | ra, err := mmap.Open(r.sos.localFile()) 425 | if err != nil { 426 | r.Lock() 427 | r.err = fmt.Errorf("Open failed: %v", err) 428 | r.Unlock() 429 | return 430 | } 431 | 432 | defer ra.Close() 433 | var off int64 434 | for _, loc := range r.sos.dlocs { 435 | buf := make([]byte, loc) 436 | n, err := ra.ReadAt(buf, off) 437 | if err != nil { 438 | r.Lock() 439 | r.err = fmt.Errorf("ReadAt failed: %v", err) 440 | r.Unlock() 441 | } 442 | 443 | out <- buf 444 | off = off + int64(n) 445 | } 446 | }() 447 | } 448 | } 449 | 450 | return nil 451 | }) 452 | 453 | eg.Wait() 454 | close(out) 455 | r.done <- struct{}{} 456 | } 457 | 458 | // Err returns the last recorded error, if any, during the read operation. 459 | func (r *Reader) Err() error { 460 | r.Lock() 461 | defer r.Unlock() 462 | return r.err 463 | } 464 | 465 | // Close closes the reader object. 
466 | func (r *Reader) Close() { 467 | if r.on.Load() == 0 { 468 | return 469 | } 470 | 471 | <-r.done // wait for loop 472 | r.sos.rrefs.Add(-1) 473 | r.on.Store(0) 474 | } 475 | 476 | type readerOptions struct { 477 | LocalOnly bool 478 | } 479 | 480 | // Reader returns a reader object for reading data from SoS. The 481 | // caller needs to call reader.Close() after use. Options is only 482 | // used internally, not exposed to callers. 483 | func (sos *SoS) Reader(opts ...*readerOptions) (*Reader, error) { 484 | if sos.on.Load() == 0 { 485 | return nil, errNoInit 486 | } 487 | 488 | var localOnly bool 489 | if len(opts) > 0 { 490 | localOnly = opts[0].LocalOnly 491 | } 492 | 493 | reader := &Reader{ 494 | lo: localOnly, 495 | sos: sos, 496 | done: make(chan struct{}, 1), 497 | } 498 | 499 | sos.rrefs.Add(1) 500 | return reader, nil 501 | } 502 | 503 | // Close closes the SoS object. 504 | func (sos *SoS) Close() { 505 | if sos.on.Load() == 0 { 506 | return 507 | } 508 | 509 | sos.Lock() 510 | defer sos.Unlock() 511 | nodes := []uint64{} 512 | for k := range sos.meta { 513 | nodes = append(nodes, k) 514 | } 515 | 516 | ctx := context.Background() 517 | for _, n := range nodes { 518 | if sos.meta[n].conn != nil { 519 | sos.meta[n].client.SoSClose(ctx, &pb.Payload{ 520 | Meta: map[string]string{metaName: sos.Name}, 521 | }) 522 | } 523 | } 524 | 525 | sos.refs.Add(-1) 526 | sos.on.Store(0) 527 | } 528 | 529 | func (sos *SoS) nextNode() (string, uint64) { 530 | var mb string 531 | members := sos.op.Members() 532 | for _, member := range members { 533 | nn := xxhash.Sum64String(member) 534 | if nn == sos.me() { 535 | continue 536 | } 537 | 538 | if _, ok := sos.data[nn]; ok { 539 | continue 540 | } 541 | 542 | mb = member 543 | sos.nodes = append(sos.nodes, nn) 544 | sos.meta[nn] = &metaT{} 545 | sos.data[nn] = &memT{} 546 | break 547 | } 548 | 549 | return mb, sos.nodes[len(sos.nodes)-1] 550 | } 551 | 552 | func (sos *SoS) me() uint64 { return xxhash.Sum64String(sos.op.Name()) } 553 | 554 | func (sos *SoS) localFile() string { 555 | name1 := fmt.Sprintf("%v", sos.me()) 556 | name2 := xxhash.Sum64String(sos.Name) 557 | return fmt.Sprintf("%v_%v.dat", name1, name2) 558 | } 559 | 560 | func (sos *SoS) cleaner() { 561 | eg := new(errgroup.Group) 562 | eg.Go(func() error { 563 | started := sos.age 564 | for { 565 | time.Sleep(time.Second * 1) 566 | refs := sos.refs.Load() 567 | wrefs := sos.wrefs.Load() 568 | rrefs := sos.rrefs.Load() 569 | if (refs + wrefs + rrefs) > 0 { 570 | started = time.Now() 571 | continue 572 | } 573 | 574 | if time.Since(started) > sos.ttl { 575 | func() { 576 | // Cleanup memory area: 577 | sos.op.soss[sos.Name].mlock.Lock() 578 | defer sos.op.soss[sos.Name].mlock.Unlock() 579 | for _, node := range sos.op.soss[sos.Name].nodes { 580 | if sos.data[node].bufs != nil { 581 | sos.data[node].bufs.Release() 582 | sos.data[node].bufs = nil 583 | // slog.Info("arrow: release(buf):", "node", node) 584 | } 585 | } 586 | }() 587 | 588 | // Cleanup disk area: 589 | sos.op.soss[sos.Name].dlock.Lock() 590 | os.Remove(sos.localFile()) 591 | sos.op.soss[sos.Name].dlock.Unlock() 592 | 593 | // Remove the main map entry: 594 | sos.op.sosLock.Lock() 595 | delete(sos.op.soss, sos.Name) 596 | sos.op.sosLock.Unlock() 597 | break 598 | } 599 | } 600 | 601 | return nil 602 | }) 603 | 604 | eg.Wait() 605 | } 606 | 607 | func newSoS(name string, op *Op, opts ...*SoSOptions) *SoS { 608 | sos := &SoS{ 609 | Name: name, 610 | op: op, 611 | meta: make(map[uint64]*metaT), 612 | data: 
map[uint64]*memT{}, 613 | dlocs: []int{}, 614 | mlock: &sync.Mutex{}, 615 | dlock: &sync.Mutex{}, 616 | wmtx: &sync.Mutex{}, 617 | } 618 | 619 | sos.on.Store(1) 620 | sos.nodes = []uint64{sos.me()} 621 | sos.meta[sos.me()] = &metaT{} 622 | sos.data[sos.me()] = &memT{} 623 | 624 | if len(opts) > 0 { 625 | sos.mlimit.Store(opts[0].MemLimit) 626 | sos.dlimit.Store(opts[0].DiskLimit) 627 | if opts[0].Expiration > 0 { 628 | sos.ttl = time.Second * time.Duration(opts[0].Expiration) 629 | } 630 | } 631 | 632 | if sos.mlimit.Load() == 0 { 633 | vm, _ := mem.VirtualMemory() 634 | sos.mlimit.Store(vm.Available / 2) // half of free mem 635 | } 636 | 637 | if sos.dlimit.Load() == 0 { 638 | sos.dlimit.Store(1 << 30) // 1GB by default 639 | } 640 | 641 | if sos.ttl == 0 { 642 | sos.ttl = time.Second * 30 643 | } 644 | 645 | sos.refs.Add(1) 646 | sos.age = time.Now() 647 | go sos.cleaner() 648 | return sos 649 | } 650 | -------------------------------------------------------------------------------- /hedge.go: -------------------------------------------------------------------------------- 1 | package hedge 2 | 3 | import ( 4 | "bufio" 5 | "context" 6 | "encoding/base64" 7 | "encoding/json" 8 | "fmt" 9 | "io" 10 | "log" 11 | "maps" 12 | "net" 13 | "os" 14 | "strconv" 15 | "strings" 16 | "sync" 17 | "sync/atomic" 18 | "time" 19 | 20 | "cloud.google.com/go/spanner" 21 | pb "github.com/flowerinthenight/hedge-proto" 22 | "github.com/flowerinthenight/spindle/v2" 23 | "github.com/google/uuid" 24 | gaxv2 "github.com/googleapis/gax-go/v2" 25 | "github.com/hashicorp/memberlist" 26 | "google.golang.org/api/iterator" 27 | "google.golang.org/grpc" 28 | "google.golang.org/grpc/credentials/insecure" 29 | "google.golang.org/grpc/reflection" 30 | ) 31 | 32 | const ( 33 | CmdLeader = "LDR" // for leader confirmation, reply="ACK" 34 | CmdWrite = "PUT" // write key/value, fmt="PUT [noappend]" 35 | CmdSend = "SND" // member to leader, fmt="SND " 36 | CmdPing = "HEY" // heartbeat to indicate availability, fmt="HEY [id]" 37 | CmdMembers = "MEM" // members info from leader to all, fmt="MEM base64(JSON(members))" 38 | CmdBroadcast = "ALL" // broadcast to all, fmt="ALL base64(payload)" 39 | CmdAck = "ACK" // generic reply, fmt="ACK"|"ACK base64(err)"|"ACK base64(JSON(members))" 40 | CmdSemaphore = "SEM" // create semaphore, fmt="SEM {name} {limit} {caller}, reply="ACK" 41 | CmdSemAcquire = "SEA" // acquire semaphore, fmt="SEA {name} {caller}", reply="ACK[ base64([0:|1:]err)]" (0=final,1=retry) 42 | CmdSemRelease = "SER" // release semaphore, fmt="SER {name} {caller}" 43 | 44 | FlagNoAppend = "noappend" 45 | ) 46 | 47 | var ( 48 | ErrNotRunning = fmt.Errorf("hedge: not running") 49 | ErrNoLeader = fmt.Errorf("hedge: no leader available") 50 | ErrNoHandler = fmt.Errorf("hedge: no message handler") 51 | ErrNotSupported = fmt.Errorf("hedge: not supported") 52 | ErrInvalidConn = fmt.Errorf("hedge: invalid connection") 53 | 54 | cctx = func(ctx context.Context) context.Context { 55 | return context.WithValue(ctx, struct{}{}, nil) 56 | } 57 | ) 58 | 59 | type FnMsgHandler func(data any, msg []byte) ([]byte, error) 60 | 61 | // KeyValue is for Put()/Get() callers. 62 | type KeyValue struct { 63 | Key string `json:"key"` 64 | Value string `json:"value"` 65 | Timestamp time.Time `json:"timestamp"` // read-only, populated when Get() 66 | } 67 | 68 | // LogItem represents an item in our log. 
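// The fields map one-to-one to the columns written by Put's InsertOrUpdate
// mutation (id, key, value, leader, timestamp), with Timestamp coming from
// Spanner's commit timestamp.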
69 | type LogItem struct { 70 | Id string 71 | Key string 72 | Value string 73 | Leader string 74 | Timestamp time.Time 75 | } 76 | 77 | type Option interface { 78 | Apply(*Op) 79 | } 80 | 81 | type withDuration int64 82 | 83 | func (w withDuration) Apply(op *Op) { op.lockTimeout = int64(w) } 84 | 85 | // WithDuration sets Op's internal spindle object's lease duration in milliseconds. 86 | // Defaults to 30000ms (30s) when not set. Minimum value is 2000ms (2s). 87 | func WithDuration(v int64) Option { return withDuration(v) } 88 | 89 | type withGroupSyncInterval time.Duration 90 | 91 | func (w withGroupSyncInterval) Apply(op *Op) { op.syncInterval = time.Duration(w) } 92 | 93 | // WithGroupSyncInterval sets the internal interval timeout to sync membership 94 | // within the group in seconds. If not set, defaults to 30s. Minimum value is 2s. 95 | func WithGroupSyncInterval(v time.Duration) Option { return withGroupSyncInterval(v) } 96 | 97 | type withLeaderCallback struct { 98 | d any 99 | f spindle.FnLeaderCallback 100 | } 101 | 102 | func (w withLeaderCallback) Apply(op *Op) { 103 | op.cbLeaderData = w.d 104 | op.cbLeader = w.f 105 | } 106 | 107 | // WithLeaderCallback sets the node's callback function that will be called 108 | // when a leader node selected (or deselected). The msg arg for f will be 109 | // set to either 0 or 1. 110 | func WithLeaderCallback(d any, f spindle.FnLeaderCallback) Option { 111 | return withLeaderCallback{d, f} 112 | } 113 | 114 | type withLeaderHandler struct { 115 | d any 116 | h FnMsgHandler 117 | } 118 | 119 | func (w withLeaderHandler) Apply(op *Op) { 120 | op.fnLdrData = w.d 121 | op.fnLeader = w.h 122 | } 123 | 124 | // WithLeaderHandler sets the node's callback function when it is the current 125 | // leader and when members send messages to it using the Send(...) API. Any 126 | // arbitrary data represented by d will be passed to the callback h every 127 | // time it is called. If d is nil, the default callback data will be the *Op 128 | // object itself. The handler's returning []byte will serve as reply. 129 | // 130 | // Typical flow would be: 131 | // 1. Any node (including the leader) calls the Send(...) API. 132 | // 2. The current leader handles the call by reading the input. 133 | // 3. Leader will then call FnLeaderHandler, passing the arbitrary data 134 | // along with the message. 135 | // 4. FnLeaderHandler will process the data as leader, then returns the 136 | // reply to the calling member. 137 | func WithLeaderHandler(d any, h FnMsgHandler) Option { 138 | return withLeaderHandler{d, h} 139 | } 140 | 141 | type withBroadcastHandler struct { 142 | d any 143 | h FnMsgHandler 144 | } 145 | 146 | func (w withBroadcastHandler) Apply(op *Op) { 147 | op.fnBcData = w.d 148 | op.fnBroadcast = w.h 149 | } 150 | 151 | // WithMembersChangedHandler sets the leader node's callback function for any member changes 152 | func WithMembersChangedHandler(d any, h FnMsgHandler) Option { 153 | return withMembersChangedHandler{d, h} 154 | } 155 | 156 | type withMembersChangedHandler struct { 157 | d any 158 | h FnMsgHandler 159 | } 160 | 161 | func (w withMembersChangedHandler) Apply(op *Op) { 162 | op.fnMemChangedData = w.d 163 | op.fnMemberChanged = w.h 164 | } 165 | 166 | // WithBroadcastHandler sets the node's callback function for broadcast messages 167 | // from anyone in the group using the Broadcast(...) API. Any arbitrary data 168 | // represented by d will be passed to the callback h every time it is called. 
169 | // If d is nil, the default callback data will be the *Op object itself. The 170 | // handler's returning []byte will serve as reply. 171 | // 172 | // A nil broadcast handler disables the internal heartbeat function. 173 | func WithBroadcastHandler(d any, h FnMsgHandler) Option { 174 | return withBroadcastHandler{d, h} 175 | } 176 | 177 | type withGrpcHostPort string 178 | 179 | func (w withGrpcHostPort) Apply(op *Op) { op.grpcHostPort = string(w) } 180 | 181 | // WithGrpcHostPort sets Op's internal grpc host/port address. 182 | // Defaults to the internal TCP host:port+1. 183 | func WithGrpcHostPort(v string) Option { return withGrpcHostPort(v) } 184 | 185 | type StreamMessage struct { 186 | Payload *pb.Payload `json:"payload"` 187 | Error error `json:"error"` 188 | } 189 | 190 | type withLeaderStreamChannels struct { 191 | in chan *StreamMessage 192 | out chan *StreamMessage 193 | } 194 | 195 | func (w withLeaderStreamChannels) Apply(op *Op) { 196 | op.leaderStreamIn = w.in 197 | op.leaderStreamOut = w.out 198 | } 199 | 200 | // WithLeaderStreamChannels sets the streaming input and output channels for sending 201 | // streaming messages to the leader. All incoming stream messages to the leader will 202 | // be sent to the `in` channel. A nil message indicates the end of the streaming data. 203 | // After sending all messages to `in`, the handler will then listen to the `out` channel 204 | // for reply messages. A nil message indicates the end of the reply stream. 205 | func WithLeaderStreamChannels(in chan *StreamMessage, out chan *StreamMessage) Option { 206 | return withLeaderStreamChannels{in, out} 207 | } 208 | 209 | type withBroadcastStreamChannels struct { 210 | in chan *StreamMessage 211 | out chan *StreamMessage 212 | } 213 | 214 | func (w withBroadcastStreamChannels) Apply(op *Op) { 215 | op.broadcastStreamIn = w.in 216 | op.broadcastStreamOut = w.out 217 | } 218 | 219 | // WithBroadcastStreamChannels sets the streaming input and output channels for broadcasting 220 | // messages to all nodes. All incoming stream messages will be sent to the `in` channel. A 221 | // nil message indicates the end of the streaming data. After sending all messages to `in`, 222 | // the handler will then listen to the `out` channel for reply messages. A nil message 223 | // indicates the end of the reply stream. 224 | func WithBroadcastStreamChannels(in chan *StreamMessage, out chan *StreamMessage) Option { 225 | return withBroadcastStreamChannels{in, out} 226 | } 227 | 228 | type withLogger struct{ l *log.Logger } 229 | 230 | func (w withLogger) Apply(op *Op) { op.logger = w.l } 231 | 232 | // WithLogger sets Op's logger object. Can be silenced by setting v to: 233 | // 234 | // log.New(io.Discard, "", 0) 235 | func WithLogger(v *log.Logger) Option { return withLogger{v} } 236 | 237 | // Op is our main instance for hedge operations. 
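// A typical setup, condensed from example/demo/main.go (the Spanner database
// string, table names, lock name, and handler below are illustrative):
//
//	client, _ := spanner.NewClient(ctx, "projects/p/instances/i/databases/db")
//	op := New(client, ":8080", "locktable", "hedge-demo-group", "logtable",
//		WithGroupSyncInterval(time.Second*5),
//		WithLeaderHandler(nil, func(data any, msg []byte) ([]byte, error) {
//			return []byte("reply from leader"), nil
//		}),
//	)
//
//	done := make(chan error, 1)
//	go op.Run(ctx, done)
//
//	// ...use op.Put, op.Get, op.Send, op.Broadcast, etc...
//
//	<-done // Run returns after ctx is cancelled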
238 | type Op struct { 239 | hostPort string // this instance's id; address:port 240 | grpcHostPort string // default is host:port+1 (from `hostPort`) 241 | spannerClient *spanner.Client // both for spindle and hedge 242 | lockTable string // spindle lock table 243 | lockName string // spindle lock name 244 | lockTimeout int64 // spindle's lock lease duration in ms 245 | logTable string // append-only log table 246 | 247 | cbLeader spindle.FnLeaderCallback 248 | cbLeaderData any 249 | fnLeader FnMsgHandler // leader message handler 250 | fnLdrData any // arbitrary data passed to fnLeader 251 | fnBroadcast FnMsgHandler // broadcast message handler 252 | fnBcData any // arbitrary data passed to fnBroadcast 253 | fnMemberChanged FnMsgHandler // member changes message handler 254 | fnMemChangedData any // arbitrary data passed to fnMemberChanged 255 | leaderStreamIn chan *StreamMessage 256 | leaderStreamOut chan *StreamMessage 257 | broadcastStreamIn chan *StreamMessage 258 | broadcastStreamOut chan *StreamMessage 259 | 260 | sosLock *sync.Mutex 261 | soss map[string]*SoS // distributed memory 262 | 263 | *spindle.Lock // handles our distributed lock 264 | members map[string]struct{} // key=id 265 | syncInterval time.Duration // ensure membership 266 | mtx sync.Mutex // local mutex 267 | mtxSem sync.Mutex // semaphore mutex 268 | ensureOn atomic.Int32 // 1=semaphore checker running 269 | ensureCh chan string // please check this id 270 | ensureCtx context.Context 271 | ensureCancel context.CancelFunc 272 | ensureDone chan struct{} 273 | active atomic.Int32 // 1=running, 0=off 274 | logger *log.Logger // internal logger 275 | } 276 | 277 | // String implements the Stringer interface. 278 | func (op *Op) String() string { 279 | return fmt.Sprintf("hostport:%s;spindle:%v;%v;%v", 280 | op.hostPort, 281 | op.spannerClient.DatabaseName(), 282 | op.lockTable, 283 | op.logTable, 284 | ) 285 | } 286 | 287 | // HostPort returns the host:port (or name) of this instance. 288 | func (op *Op) HostPort() string { return op.hostPort } 289 | 290 | // Name is the same as HostPort. 291 | func (op *Op) Name() string { return op.hostPort } 292 | 293 | // IsRunning returns true if Op is already running. 294 | func (op *Op) IsRunning() bool { return op.active.Load() == 1 } 295 | 296 | // Run starts the main handler. It blocks until ctx is cancelled, 297 | // optionally sending an error message to done when finished. 298 | func (op *Op) Run(ctx context.Context, done ...chan error) error { 299 | var err error 300 | defer func(e *error) { 301 | if len(done) > 0 { 302 | done[0] <- *e 303 | } 304 | }(&err) 305 | 306 | // Some housekeeping. 307 | if op.spannerClient == nil { 308 | err = fmt.Errorf("hedge: Spanner client cannot be nil") 309 | return err 310 | } 311 | 312 | for _, v := range []struct { 313 | name string 314 | val string 315 | }{ 316 | {"SpindleTable", op.lockTable}, 317 | {"SpindleLockName", op.lockName}, 318 | } { 319 | if v.val == "" { 320 | err = fmt.Errorf("hedge: %v cannot be empty", v.name) 321 | return err 322 | } 323 | } 324 | 325 | // Setup our server for our internal protocol. 326 | addr, err := net.ResolveTCPAddr("tcp4", op.hostPort) 327 | if err != nil { 328 | return err 329 | } 330 | 331 | var exitedTCP atomic.Int32 332 | doneTCP := make(chan error, 1) 333 | 334 | // This connection will be closed upon context termination. 
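// (At shutdown, exitedTCP is set before tl.Close() so the Accept error that
// unblocks the loop below is not reported as a failure.)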
335 | tl, err := net.ListenTCP("tcp", addr) 336 | if err != nil { 337 | return err 338 | } 339 | 340 | op.logger.Printf("tcp: listen on %v", op.hostPort) 341 | 342 | go func() { 343 | defer func() { doneTCP <- nil }() 344 | for { 345 | conn, err := tl.Accept() 346 | if exitedTCP.Load() == 1 { 347 | return 348 | } 349 | 350 | if err != nil { 351 | op.logger.Printf("Accept failed: %v", err) 352 | return 353 | } 354 | 355 | if ctx.Err() != nil { 356 | op.logger.Printf("cancelled: %v", ctx.Err()) 357 | return 358 | } 359 | 360 | go handleMsg(ctx, op, conn) 361 | } 362 | }() 363 | 364 | gl, err := net.Listen("tcp", op.grpcHostPort) 365 | if err != nil { 366 | return err 367 | } 368 | 369 | defer gl.Close() 370 | op.logger.Printf("grpc: listen on %v", op.grpcHostPort) 371 | 372 | gs := grpc.NewServer() 373 | svc := &service{op: op} 374 | pb.RegisterHedgeServer(gs, svc) 375 | reflection.Register(gs) // register reflection service 376 | go gs.Serve(gl) 377 | 378 | // Setup and start our internal spindle object. 379 | op.Lock = spindle.New( 380 | op.spannerClient, 381 | op.lockTable, 382 | fmt.Sprintf("hedge/spindle/lockname/%v", op.lockName), 383 | spindle.WithDuration(op.lockTimeout), 384 | spindle.WithId(op.hostPort), 385 | spindle.WithLeaderCallback(op.cbLeaderData, func(data any, msg []byte) { 386 | if op.cbLeader != nil { 387 | m := fmt.Sprintf("%v %v", string(msg), op.Name()) 388 | op.cbLeader(data, []byte(m)) 389 | } 390 | }), 391 | spindle.WithLogger(op.logger), 392 | ) 393 | 394 | spindleDone := make(chan error, 1) 395 | ctxSpindle, cancel := context.WithCancel(context.Background()) 396 | op.Lock.Run(ctxSpindle, spindleDone) 397 | defer func() { 398 | cancel() // stop spindle; 399 | <-spindleDone // and wait 400 | }() 401 | 402 | // Start tracking online members. 
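// The goroutine below runs on every node: on each tick (and once immediately),
// the node reports itself to the leader with "HEY <host:port>" and refreshes
// its local member list from the leader's base64(JSON) reply. The leader
// additionally pings all known members, removes the unreachable ones, and
// pushes the updated list to everyone via the MEM command. A nil broadcast
// handler disables this loop.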
403 | op.members[op.hostPort] = struct{}{} 404 | membersDone := make(chan error, 1) 405 | ctxMembers := cctx(ctx) 406 | first := make(chan struct{}, 1) 407 | first <- struct{}{} // immediately the first time 408 | ticker := time.NewTicker(op.syncInterval) 409 | defer func() { 410 | ticker.Stop() 411 | <-membersDone 412 | }() 413 | 414 | go func() { 415 | var active atomic.Int32 416 | fnEnsureMembers := func() { 417 | active.Store(1) 418 | defer active.Store(0) 419 | ch := make(chan *string) 420 | emdone := make(chan struct{}, 1) 421 | todel := []string{} 422 | go func() { 423 | for { 424 | m := <-ch 425 | switch m { 426 | case nil: 427 | emdone <- struct{}{} 428 | return 429 | default: 430 | todel = append(todel, *m) 431 | } 432 | } 433 | }() 434 | 435 | var w sync.WaitGroup 436 | allm := op.getMembers() 437 | oldallm := make(map[string]struct{}) 438 | maps.Copy(oldallm, allm) 439 | 440 | for k := range allm { 441 | w.Add(1) 442 | go func(id string) { 443 | defer func() { w.Done() }() 444 | timeout := time.Second * 5 445 | conn, err := net.DialTimeout("tcp", id, timeout) 446 | if err != nil { 447 | ch <- &id // delete this 448 | return 449 | } 450 | 451 | var sb strings.Builder 452 | fmt.Fprintf(&sb, "%s\n", CmdPing) 453 | r, err := op.send(conn, sb.String()) 454 | if err != nil { 455 | ch <- &id // delete this 456 | return 457 | } 458 | 459 | if r != CmdAck { 460 | ch <- &id // delete this 461 | } 462 | }(k) 463 | } 464 | 465 | w.Wait() 466 | ch <- nil // close; 467 | <-emdone // and wait 468 | for _, rm := range todel { 469 | if rm != "" { 470 | op.logger.Printf("[leader] delete %v", rm) 471 | op.delMember(rm) 472 | } 473 | } 474 | 475 | newallm := op.getMembers() 476 | if len(oldallm) != len(newallm) && op.fnMemberChanged != nil { 477 | diff := len(newallm) - len(oldallm) 478 | op.fnMemberChanged(op.fnMemChangedData, []byte(fmt.Sprintf("%v", diff))) 479 | } 480 | 481 | // Broadcast active members to all. 
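// The payload sent to each member is "MEM base64(JSON(members))" (see
// CmdMembers), carrying the leader's current view of the group.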
482 | for k := range newallm { 483 | w.Add(1) 484 | go func(id string) { 485 | defer w.Done() 486 | timeout := time.Second * 5 487 | conn, err := net.DialTimeout("tcp", id, timeout) 488 | if err != nil { 489 | return 490 | } 491 | 492 | defer conn.Close() 493 | var sb strings.Builder 494 | fmt.Fprintf(&sb, "%s %s\n", CmdMembers, op.encodeMembers()) 495 | op.send(conn, sb.String()) 496 | }(k) 497 | } 498 | 499 | w.Wait() 500 | } 501 | 502 | var hbactive atomic.Int32 503 | fnHeartbeat := func() { 504 | hbactive.Store(1) 505 | defer hbactive.Store(0) 506 | lconn, err := op.getLeaderConn(ctx) 507 | if err != nil { 508 | return 509 | } 510 | 511 | if lconn != nil { 512 | defer lconn.Close() 513 | } 514 | 515 | var sb strings.Builder 516 | fmt.Fprintf(&sb, "%s %s\n", CmdPing, op.hostPort) 517 | r, err := op.send(lconn, sb.String()) 518 | if err != nil { 519 | return 520 | } 521 | 522 | b, _ := base64.StdEncoding.DecodeString(r) 523 | var allm map[string]struct{} 524 | json.Unmarshal(b, &allm) 525 | op.setMembers(allm) 526 | } 527 | 528 | for { 529 | select { 530 | case <-ctxMembers.Done(): 531 | membersDone <- nil 532 | return 533 | case <-first: 534 | case <-ticker.C: 535 | } 536 | 537 | if op.fnBroadcast == nil { 538 | op.logger.Println("no broadcast support") 539 | membersDone <- nil 540 | return 541 | } 542 | 543 | if hbactive.Load() == 0 { 544 | go fnHeartbeat() // tell leader we're online 545 | } 546 | 547 | if hl, _ := op.HasLock(); !hl { 548 | continue 549 | } 550 | 551 | if active.Load() == 0 { 552 | go fnEnsureMembers() // leader only 553 | } 554 | } 555 | }() 556 | 557 | op.active.Store(1) 558 | defer op.active.Store(0) 559 | 560 | <-ctx.Done() // wait for termination 561 | 562 | exitedTCP.Store(1) // don't print err in tl.Accept 563 | tl.Close() // will cause tl.Accept to fail 564 | 565 | gs.GracefulStop() // stop grpc server 566 | if op.ensureOn.Load() == 1 { 567 | op.ensureCancel() // stop semaphore checker; 568 | <-op.ensureDone // and wait 569 | } 570 | 571 | return nil 572 | } 573 | 574 | // NewSemaphore returns a distributed semaphore object. 575 | func (op *Op) NewSemaphore(ctx context.Context, name string, limit int) (*Semaphore, error) { 576 | if op.logTable == "" { 577 | return nil, ErrNotSupported 578 | } 579 | 580 | if op.active.Load() != 1 { 581 | return nil, ErrNotRunning 582 | } 583 | 584 | if strings.Contains(name, " ") { 585 | return nil, fmt.Errorf("name cannot have whitespace(s)") 586 | } 587 | 588 | conn, err := op.getLeaderConn(ctx) 589 | if err != nil { 590 | return nil, err 591 | } 592 | 593 | if conn != nil { 594 | defer conn.Close() 595 | } 596 | 597 | var sb strings.Builder 598 | fmt.Fprintf(&sb, "%s %s %d %s\n", CmdSemaphore, name, limit, op.hostPort) 599 | reply, err := op.send(conn, sb.String()) 600 | if err != nil { 601 | return nil, err 602 | } 603 | 604 | switch { 605 | case strings.HasPrefix(reply, CmdAck): 606 | ss := strings.Split(reply, " ") 607 | if len(ss) > 1 { // failed 608 | dec, _ := base64.StdEncoding.DecodeString(ss[1]) 609 | return nil, fmt.Errorf("%v", string(dec)) 610 | } 611 | default: 612 | return nil, ErrNotSupported 613 | } 614 | 615 | return &Semaphore{name, limit, op}, nil 616 | } 617 | 618 | // NewSoS returns an object for writing data to spill-over 619 | // storage across the cluster. The order of writing is local 620 | // memory, local disk, other pod's memory, other pod's disk, 621 | // and so on. 
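// Calling NewSoS again with a name that is still in use returns the existing
// instance. When opts is omitted, the limits default to half of the available
// memory, 1GB of disk, and a 30s TTL. For example:
//
//	sos := op.NewSoS("scratch")  // defaults
//	same := op.NewSoS("scratch") // returns the instance above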
622 | func (op *Op) NewSoS(name string, opts ...*SoSOptions) *SoS { 623 | op.sosLock.Lock() 624 | defer op.sosLock.Unlock() 625 | if _, ok := op.soss[name]; ok { 626 | return op.soss[name] 627 | } 628 | 629 | op.soss[name] = newSoS(name, op, opts...) 630 | return op.soss[name] 631 | } 632 | 633 | // Get reads a key (or keys) from Op. 634 | // The values of limit are: 635 | // 636 | // limit = 0 --> (default) latest only 637 | // limit = -1 --> all (latest to oldest, [0]=latest) 638 | // limit = -2 --> oldest version only 639 | // limit > 0 --> items behind latest; 3 means latest + 2 versions behind, [0]=latest 640 | func (op *Op) Get(ctx context.Context, key string, limit ...int64) ([]KeyValue, error) { 641 | if op.logTable == "" { 642 | return nil, ErrNotSupported 643 | } 644 | 645 | ret := []KeyValue{} 646 | var q strings.Builder 647 | fmt.Fprintf(&q, "select key, value, timestamp ") 648 | fmt.Fprintf(&q, "from %s ", op.logTable) 649 | fmt.Fprintf(&q, "where key = @key and timestamp is not null ") 650 | fmt.Fprintf(&q, "order by timestamp desc limit 1") 651 | 652 | if len(limit) > 0 { 653 | switch { 654 | case limit[0] > 0: 655 | q.Reset() 656 | fmt.Fprintf(&q, "select key, value, timestamp ") 657 | fmt.Fprintf(&q, "from %s ", op.logTable) 658 | fmt.Fprintf(&q, "where key = @key and timestamp is not null ") 659 | fmt.Fprintf(&q, "order by timestamp desc limit %v", limit[0]) 660 | case limit[0] == -1: 661 | q.Reset() 662 | fmt.Fprintf(&q, "select key, value, timestamp ") 663 | fmt.Fprintf(&q, "from %s ", op.logTable) 664 | fmt.Fprintf(&q, "where key = @key and timestamp is not null ") 665 | fmt.Fprintf(&q, "order by timestamp desc") 666 | case limit[0] == -2: 667 | q.Reset() 668 | fmt.Fprintf(&q, "select key, value, timestamp ") 669 | fmt.Fprintf(&q, "from %s ", op.logTable) 670 | fmt.Fprintf(&q, "where key = @key and timestamp is not null ") 671 | fmt.Fprintf(&q, "order by timestamp limit 1") 672 | } 673 | } 674 | 675 | stmt := spanner.Statement{SQL: q.String(), Params: map[string]any{"key": key}} 676 | iter := op.spannerClient.Single().Query(ctx, stmt) 677 | defer iter.Stop() 678 | for { 679 | row, err := iter.Next() 680 | if err == iterator.Done { 681 | break 682 | } 683 | 684 | if err != nil { 685 | return ret, err 686 | } 687 | 688 | var li LogItem 689 | err = row.ToStruct(&li) 690 | if err != nil { 691 | return ret, err 692 | } 693 | 694 | ret = append(ret, KeyValue{ 695 | Key: li.Key, 696 | Value: li.Value, 697 | Timestamp: li.Timestamp, 698 | }) 699 | } 700 | 701 | return ret, nil 702 | } 703 | 704 | type PutOptions struct { 705 | // If true, do a direct write, no need to fwd to leader. 706 | DirectWrite bool 707 | 708 | // If true, don't do an append-write; overwrite the latest. Note that even 709 | // if you set this to true, if you do another Put the next time with this 710 | // field set as false (default), the previous write will now be gone, or 711 | // will now be part of the history. 712 | NoAppend bool 713 | } 714 | 715 | // Put saves a key/value to Op. This call will try to block, at least roughly 716 | // until spindle's timeout, to wait for the leader's availability to do actual 717 | // writes before returning. 
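// A short sketch (keys and values are illustrative):
//
//	// Append a new version of the key; non-leaders forward the write to the leader.
//	err := op.Put(ctx, KeyValue{Key: "color", Value: "red"})
//
//	// Overwrite the latest version directly from this node instead of appending.
//	err = op.Put(ctx, KeyValue{Key: "color", Value: "blue"},
//		PutOptions{DirectWrite: true, NoAppend: true})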
718 | func (op *Op) Put(ctx context.Context, kv KeyValue, po ...PutOptions) error { 719 | if op.logTable == "" { 720 | return ErrNotSupported 721 | } 722 | 723 | var err error 724 | var direct, noappend, hl bool 725 | if len(po) > 0 { 726 | direct = po[0].DirectWrite 727 | noappend = po[0].NoAppend 728 | } else { 729 | hl, _ = op.HasLock() 730 | } 731 | 732 | id := uuid.NewString() 733 | if noappend { 734 | id = "-" 735 | } 736 | 737 | if direct || hl { 738 | b, _ := json.Marshal(kv) 739 | op.logger.Printf("[Put] leader: direct write: %v", string(b)) 740 | _, err := op.spannerClient.Apply(ctx, []*spanner.Mutation{ 741 | spanner.InsertOrUpdate(op.logTable, 742 | []string{"id", "key", "value", "leader", "timestamp"}, 743 | []any{id, kv.Key, kv.Value, op.hostPort, spanner.CommitTimestamp}, 744 | ), 745 | }) 746 | 747 | return err 748 | } 749 | 750 | // For non-leaders, we confirm the leader via spindle, and if so, ask leader to do the 751 | // actual write for us. Let's do a couple retries up to spindle's timeout. 752 | conn, err := op.getLeaderConn(ctx) 753 | if err != nil { 754 | return err 755 | } 756 | 757 | if conn != nil { 758 | defer conn.Close() 759 | } 760 | 761 | b, _ := json.Marshal(kv) 762 | enc := base64.StdEncoding.EncodeToString(b) 763 | var sb strings.Builder 764 | fmt.Fprintf(&sb, "%s %s\n", CmdWrite, enc) 765 | if noappend { 766 | sb.Reset() 767 | fmt.Fprintf(&sb, "%s %s %s\n", CmdWrite, enc, FlagNoAppend) 768 | } 769 | 770 | reply, err := op.send(conn, sb.String()) 771 | if err != nil { 772 | return err 773 | } 774 | 775 | switch { 776 | case strings.HasPrefix(reply, CmdAck): 777 | ss := strings.Split(reply, " ") 778 | if len(ss) > 1 { // failed 779 | dec, _ := base64.StdEncoding.DecodeString(ss[1]) 780 | return fmt.Errorf("%v", string(dec)) 781 | } 782 | default: 783 | return ErrNoLeader 784 | } 785 | 786 | return nil 787 | } 788 | 789 | // Send sends msg to the current leader. Any node can send messages, 790 | // including the leader itself (send to self). It also blocks until 791 | // it receives the reply from the leader's message handler. 792 | func (op *Op) Send(ctx context.Context, msg []byte) ([]byte, error) { 793 | conn, err := op.getLeaderConn(ctx) 794 | if err != nil { 795 | return nil, err 796 | } 797 | 798 | if conn != nil { 799 | defer conn.Close() 800 | } 801 | 802 | enc := base64.StdEncoding.EncodeToString(msg) 803 | var sb strings.Builder 804 | fmt.Fprintf(&sb, "%s %s\n", CmdSend, enc) 805 | reply, err := op.send(conn, sb.String()) 806 | if err != nil { 807 | return nil, err 808 | } 809 | 810 | switch { 811 | case strings.HasPrefix(reply, CmdAck): // expect "ACK base64(reply)" 812 | ss := strings.Split(reply, " ") 813 | if len(ss) > 1 { 814 | return base64.StdEncoding.DecodeString(ss[1]) 815 | } 816 | } 817 | 818 | // If not ACK, then the whole reply is an error string. 819 | b, _ := base64.StdEncoding.DecodeString(reply) 820 | return nil, fmt.Errorf("%v", string(b)) 821 | } 822 | 823 | type StreamToLeaderOutput struct { 824 | In chan *StreamMessage `json:"in"` 825 | Out chan *StreamMessage `json:"out"` 826 | } 827 | 828 | // StreamToLeader returns an input and output channels for streaming to leader. 829 | // To use the channels, send your request message(s) to the input channel, close 830 | // it (i.e. close(input)), then read the replies from the output channel. This 831 | // function will close the output channel when done. 
832 | // 833 | // StreamToLeader is sequential in the sense that you need to send all your input 834 | // messages first before getting any response from the leader. 835 | func (op *Op) StreamToLeader(ctx context.Context) (*StreamToLeaderOutput, error) { 836 | if op.leaderStreamIn == nil || op.leaderStreamOut == nil { 837 | return nil, fmt.Errorf("hedge: input/output channel(s) cannot be nil") 838 | } 839 | 840 | conn, err := op.getLeaderGrpcConn(ctx) 841 | if err != nil { 842 | return nil, err 843 | } 844 | 845 | client := pb.NewHedgeClient(conn) 846 | stream, err := client.Send(ctx) 847 | if err != nil { 848 | return nil, err 849 | } 850 | 851 | keyId := "id" 852 | id := uuid.NewString() 853 | reply := make(chan error) 854 | ret := StreamToLeaderOutput{ 855 | In: make(chan *StreamMessage), 856 | Out: make(chan *StreamMessage), 857 | } 858 | 859 | // Exit only when input channel is closed by the caller. 860 | // We don't wait for this goroutine. 861 | go func() { 862 | var err error 863 | for m := range ret.In { 864 | if m.Payload.Meta == nil { 865 | m.Payload.Meta = map[string]string{keyId: id} 866 | } else { 867 | if _, ok := m.Payload.Meta[keyId]; !ok { 868 | m.Payload.Meta[keyId] = id 869 | } 870 | } 871 | 872 | err = stream.Send(m.Payload) 873 | if err != nil { 874 | break 875 | } 876 | } 877 | 878 | stream.CloseSend() 879 | reply <- err 880 | }() 881 | 882 | // Exit only when streaming response is done. 883 | // We don't wait for this goroutine. 884 | go func() { 885 | defer func() { 886 | close(ret.Out) 887 | conn.Close() 888 | }() 889 | 890 | err := <-reply 891 | if err != nil { 892 | ret.Out <- &StreamMessage{Error: err} 893 | return 894 | } 895 | 896 | for { 897 | resp, err := stream.Recv() 898 | if err == io.EOF { 899 | return 900 | } 901 | 902 | ret.Out <- &StreamMessage{Payload: resp} 903 | } 904 | }() 905 | 906 | return &ret, nil 907 | } 908 | 909 | type BroadcastOutput struct { 910 | Id string `json:"id,omitempty"` 911 | Reply []byte `json:"reply,omitempty"` 912 | Error error `json:"error,omitempty"` 913 | } 914 | 915 | type BroadcastArgs struct { 916 | SkipSelf bool // if true, skip broadcasting to self 917 | Out chan BroadcastOutput 918 | OnlySendTo []string // if set, only send to these member/s 919 | } 920 | 921 | // Broadcast sends msg to all nodes (send to all). Any node can broadcast messages, including the 922 | // leader itself. Note that this is best-effort basis only; by the time you call this API, the 923 | // handler might not have all the active members in record yet, as is the usual situation with 924 | // k8s deployments, where pods come and go, and our internal heartbeat protocol hasn't been 925 | // completed yet. This call will also block until it receives all the reply from all nodes' 926 | // broadcast handlers. 927 | // 928 | // If args[].Out is set, the output will be streamed to that channel instead. Useful if you prefer 929 | // a streamed output (as reply comes) instead of waiting for all replies before returning. If set, 930 | // the return value (output slice) will be set to empty []. Also, close() will be called on the 931 | // Out channel to indicate streaming end. 
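// Both forms below are condensed from example/demo/main.go (the message is
// illustrative):
//
//	// Collect all replies before returning:
//	outs := op.Broadcast(ctx, []byte("hello"))
//	for _, v := range outs {
//		if v.Error != nil {
//			// handle the per-node error
//		}
//	}
//
//	// Or stream replies as they arrive:
//	ch := make(chan BroadcastOutput)
//	go op.Broadcast(ctx, []byte("hello"), BroadcastArgs{Out: ch})
//	for v := range ch { // closed by Broadcast when done
//		_ = v.Reply
//	}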
932 | func (op *Op) Broadcast(ctx context.Context, msg []byte, args ...BroadcastArgs) []BroadcastOutput {
933 |     if op.active.Load() != 1 || op.fnBroadcast == nil {
934 |         return nil // not running or no broadcast support
935 |     }
936 | 
937 |     var stream bool
938 |     outs := []BroadcastOutput{}
939 |     var w sync.WaitGroup
940 |     var outch chan BroadcastOutput
941 |     members := op.getMembers()
942 |     if len(args) > 0 && args[0].SkipSelf {
943 |         delete(members, op.Name())
944 |     }
945 | 
946 |     if len(args) > 0 && len(args[0].OnlySendTo) > 0 {
947 |         filtered := make(map[string]struct{})
948 |         for _, v := range args[0].OnlySendTo {
949 |             if _, ok := members[v]; ok {
950 |                 filtered[v] = struct{}{}
951 |             }
952 |         }
953 |         members = filtered
954 |     }
955 | 
956 |     switch {
957 |     case len(args) > 0 && args[0].Out != nil:
958 |         outch = args[0].Out
959 |         stream = true
960 |     default:
961 |         outch = make(chan BroadcastOutput, len(members))
962 |     }
963 | 
964 |     for k := range members {
965 |         w.Add(1)
966 |         go func(id string) {
967 |             defer w.Done()
968 |             timeout := time.Second * 5
969 |             conn, err := net.DialTimeout("tcp", id, timeout)
970 |             if err != nil {
971 |                 outch <- BroadcastOutput{Id: id, Error: err}
972 |                 return
973 |             }
974 | 
975 |             defer conn.Close()
976 |             enc := base64.StdEncoding.EncodeToString(msg)
977 |             var sb strings.Builder
978 |             fmt.Fprintf(&sb, "%s %s\n", CmdBroadcast, enc)
979 |             reply, err := op.send(conn, sb.String())
980 |             if err != nil {
981 |                 outch <- BroadcastOutput{Id: id, Error: err}
982 |                 return
983 |             }
984 | 
985 |             switch {
986 |             case strings.HasPrefix(reply, CmdAck): // expect "ACK base64(reply)"
987 |                 ss := strings.Split(reply, " ")
988 |                 if len(ss) > 1 {
989 |                     r, e := base64.StdEncoding.DecodeString(ss[1])
990 |                     outch <- BroadcastOutput{Id: id, Reply: r, Error: e}
991 |                     return
992 |                 }
993 |             }
994 | 
995 |             // If not ACK, then the whole reply is an error string.
996 |             r, _ := base64.StdEncoding.DecodeString(reply)
997 |             outch <- BroadcastOutput{Id: id, Error: fmt.Errorf("%v", string(r))}
998 |         }(k)
999 |     }
1000 | 
1001 |     w.Wait()
1002 |     switch {
1003 |     case stream:
1004 |         close(args[0].Out)
1005 |     default:
1006 |         for range members {
1007 |             outs = append(outs, <-outch)
1008 |         }
1009 |     }
1010 | 
1011 |     return outs
1012 | }
1013 | 
1014 | type StreamBroadcastArgs struct {
1015 |     SkipSelf bool // if true, skip broadcasting to self
1016 | }
1017 | 
1018 | type StreamBroadcastOutput struct {
1019 |     In   chan *StreamMessage
1020 |     Outs map[string]chan *StreamMessage
1021 | }
1022 | 
1023 | // StreamBroadcast returns input and output channels for doing streaming broadcasts. Any node can broadcast messages,
1024 | // including the leader itself. Note that this is on a best-effort basis only; by the time you call this API, the handler
1025 | // might not have all the active members on record yet, as is the usual situation with k8s deployments, where pods
1026 | // come and go and our internal heartbeat protocol hasn't completed yet. This call will also block until it
1027 | // receives all the replies from all nodes' broadcast handlers.
1028 | //
1029 | // To use the channels, send your request message(s) to the input channel, close it (i.e. close(input)), then read
1030 | // the replies from the output channels. This function will close all output channels when done.
1031 | //
1032 | // StreamBroadcast is sequential in the sense that you need to send all your input messages first before getting
1033 | // any response from all the nodes.
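//
// A minimal usage sketch (assuming the gRPC payload's Data field carries the
// application message; adjust to your own payload layout):
//
//	out, err := op.StreamBroadcast(ctx)
//	if err != nil {
//		return err
//	}
//
//	// Send all inputs first, then close the input channel.
//	out.In <- &StreamMessage{Payload: &pb.Payload{Data: []byte("ping")}}
//	close(out.In)
//
//	// Drain every node's output channel; all of them are closed when done.
//	var wg sync.WaitGroup
//	for node, ch := range out.Outs {
//		wg.Add(1)
//		go func(node string, ch chan *StreamMessage) {
//			defer wg.Done()
//			for m := range ch {
//				log.Printf("reply from %v: %+v", node, m.Payload)
//			}
//		}(node, ch)
//	}
//
//	wg.Wait()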
1034 | func (op *Op) StreamBroadcast(ctx context.Context, args ...StreamBroadcastArgs) (*StreamBroadcastOutput, error) { 1035 | if op.active.Load() != 1 { 1036 | return nil, nil // not running 1037 | } 1038 | 1039 | members := op.getMembers() 1040 | if len(args) > 0 && args[0].SkipSelf { 1041 | delete(members, op.Name()) 1042 | } 1043 | 1044 | ret := StreamBroadcastOutput{ 1045 | In: make(chan *StreamMessage), 1046 | Outs: make(map[string]chan *StreamMessage), 1047 | } 1048 | 1049 | _, gp, _ := net.SplitHostPort(op.grpcHostPort) 1050 | conns := make(map[string]*grpc.ClientConn) 1051 | streams := make(map[string]pb.Hedge_BroadcastClient) 1052 | for k := range members { 1053 | h, _, _ := net.SplitHostPort(k) 1054 | gHostPort := net.JoinHostPort(h, gp) 1055 | 1056 | var opts []grpc.DialOption 1057 | opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials())) 1058 | lconn, err := grpc.NewClient(gHostPort, opts...) 1059 | if err != nil { 1060 | return nil, err 1061 | } 1062 | 1063 | conns[k] = lconn 1064 | client := pb.NewHedgeClient(lconn) 1065 | stream, err := client.Broadcast(ctx) 1066 | if err != nil { 1067 | continue 1068 | } 1069 | 1070 | streams[gHostPort] = stream 1071 | ret.Outs[gHostPort] = make(chan *StreamMessage) 1072 | } 1073 | 1074 | keyId := "id" 1075 | id := uuid.NewString() 1076 | reply := make(chan error) 1077 | 1078 | // Exit only when input channel is closed by the caller. 1079 | // We don't wait for this goroutine. 1080 | go func() { 1081 | for m := range ret.In { 1082 | if m.Payload.Meta == nil { 1083 | m.Payload.Meta = map[string]string{keyId: id} 1084 | } else { 1085 | if _, ok := m.Payload.Meta[keyId]; !ok { 1086 | m.Payload.Meta[keyId] = id 1087 | } 1088 | } 1089 | 1090 | for _, v := range streams { 1091 | v.Send(m.Payload) 1092 | } 1093 | } 1094 | 1095 | for _, v := range streams { 1096 | v.CloseSend() 1097 | } 1098 | 1099 | reply <- nil 1100 | }() 1101 | 1102 | // Exit only when all streaming responses from all nodes are done. 1103 | // We don't wait for this goroutine. 1104 | go func() { 1105 | defer func() { 1106 | for _, v := range ret.Outs { 1107 | close(v) 1108 | } 1109 | 1110 | for _, v := range conns { 1111 | v.Close() 1112 | } 1113 | }() 1114 | 1115 | <-reply // input done 1116 | 1117 | var w sync.WaitGroup 1118 | for k, v := range streams { 1119 | w.Add(1) 1120 | go func(node string, stream pb.Hedge_BroadcastClient) { 1121 | defer w.Done() 1122 | for { 1123 | resp, err := stream.Recv() 1124 | if err == io.EOF { 1125 | return 1126 | } 1127 | 1128 | ret.Outs[node] <- &StreamMessage{Payload: resp} 1129 | } 1130 | }(k, v) 1131 | } 1132 | 1133 | w.Wait() 1134 | }() 1135 | 1136 | return &ret, nil 1137 | } 1138 | 1139 | // Members returns a list of members in the cluster/group. 
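//
// For example:
//
//	for _, m := range op.Members() {
//		log.Println("member:", m) // each entry is the member's "host:port"
//	}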
1140 | func (op *Op) Members() []string { 1141 | members := []string{} 1142 | m := op.getMembers() 1143 | for k := range m { 1144 | members = append(members, k) 1145 | } 1146 | 1147 | return members 1148 | } 1149 | 1150 | func (op *Op) send(conn net.Conn, msg string) (string, error) { 1151 | if conn == nil { 1152 | return "", ErrInvalidConn 1153 | } 1154 | 1155 | _, err := conn.Write([]byte(msg)) 1156 | if err != nil { 1157 | return "", err 1158 | } 1159 | 1160 | return op.recv(conn) 1161 | } 1162 | 1163 | func (op *Op) recv(conn net.Conn) (string, error) { 1164 | if conn == nil { 1165 | return "", ErrInvalidConn 1166 | } 1167 | 1168 | buffer, err := bufio.NewReader(conn).ReadString('\n') 1169 | if err != nil { 1170 | return "", err 1171 | } 1172 | 1173 | var reply string 1174 | if buffer != "" { 1175 | reply = buffer[:len(buffer)-1] 1176 | } 1177 | 1178 | return reply, nil 1179 | } 1180 | 1181 | func (op *Op) buildAckReply(err error) string { 1182 | var sb strings.Builder 1183 | if err != nil { 1184 | ee := base64.StdEncoding.EncodeToString([]byte(err.Error())) 1185 | fmt.Fprintf(&sb, "%s %s\n", CmdAck, ee) 1186 | return sb.String() 1187 | } else { 1188 | fmt.Fprintf(&sb, "%s\n", CmdAck) 1189 | return sb.String() 1190 | } 1191 | } 1192 | 1193 | func (op *Op) getLeaderConn(ctx context.Context) (net.Conn, error) { 1194 | var conn net.Conn 1195 | var err error 1196 | subctx := context.WithValue(ctx, struct{}{}, nil) 1197 | first := make(chan struct{}, 1) 1198 | first <- struct{}{} // immediately the first time 1199 | tcnt, tlimit := int64(0), (op.lockTimeout/2000)*2 1200 | ticker := time.NewTicker(time.Second * 2) // processing can be more than this 1201 | defer ticker.Stop() 1202 | 1203 | var active atomic.Int32 1204 | getConn := func() (net.Conn, error) { 1205 | active.Store(1) 1206 | defer active.Store(0) 1207 | timeout := time.Second * 5 1208 | leader, err := op.Leader() 1209 | if err != nil { 1210 | return nil, err 1211 | } 1212 | 1213 | if leader == "" { 1214 | return nil, ErrNoLeader 1215 | } 1216 | 1217 | lconn, err := net.DialTimeout("tcp", leader, timeout) 1218 | if err != nil { 1219 | return nil, err 1220 | } 1221 | 1222 | defer lconn.Close() 1223 | var sb strings.Builder 1224 | fmt.Fprintf(&sb, "%s\n", CmdLeader) 1225 | reply, err := op.send(lconn, sb.String()) 1226 | if err != nil { 1227 | return nil, err 1228 | } 1229 | 1230 | if !strings.HasPrefix(reply, CmdAck) { 1231 | return nil, ErrNoLeader 1232 | } 1233 | 1234 | // Create a new connection to the confirmed leader. 1235 | return net.DialTimeout("tcp", leader, timeout) 1236 | } 1237 | 1238 | type connT struct { 1239 | conn net.Conn 1240 | err error 1241 | } 1242 | 1243 | for { 1244 | select { 1245 | case <-subctx.Done(): 1246 | return nil, context.Canceled 1247 | case <-first: 1248 | case <-ticker.C: 1249 | } 1250 | 1251 | if active.Load() == 1 { 1252 | continue 1253 | } 1254 | 1255 | ch := make(chan connT, 1) 1256 | go func() { 1257 | c, e := getConn() 1258 | ch <- connT{c, e} 1259 | }() 1260 | 1261 | res := <-ch 1262 | conn = res.conn 1263 | err = res.err 1264 | 1265 | tcnt++ 1266 | if err == nil || (tcnt > tlimit) { 1267 | break 1268 | } 1269 | } 1270 | 1271 | return conn, nil 1272 | } 1273 | 1274 | // Don't forget to close the returned connection. 
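//
// Typical caller pattern (sketch):
//
//	conn, err := op.getLeaderGrpcConn(ctx)
//	if err != nil {
//		return err
//	}
//
//	defer conn.Close()
//	client := pb.NewHedgeClient(conn)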
1275 | func (op *Op) getLeaderGrpcConn(ctx context.Context) (*grpc.ClientConn, error) { 1276 | var conn *grpc.ClientConn 1277 | var err error 1278 | subctx := context.WithValue(ctx, struct{}{}, nil) 1279 | first := make(chan struct{}, 1) 1280 | first <- struct{}{} // immediately the first time 1281 | tcnt, tlimit := int64(0), (op.lockTimeout/2000)*2 1282 | ticker := time.NewTicker(time.Second * 2) // processing can be more than this 1283 | defer ticker.Stop() 1284 | 1285 | var active atomic.Int32 1286 | getConn := func() (*grpc.ClientConn, error) { 1287 | active.Store(1) 1288 | defer active.Store(0) 1289 | leader, err := op.Leader() 1290 | if err != nil { 1291 | return nil, err 1292 | } 1293 | 1294 | if leader == "" { 1295 | return nil, ErrNoLeader 1296 | } 1297 | 1298 | // Get the gRPC host:port. 1299 | h, _, _ := net.SplitHostPort(leader) 1300 | _, gp, _ := net.SplitHostPort(op.grpcHostPort) 1301 | gleader := net.JoinHostPort(h, gp) 1302 | 1303 | var opts []grpc.DialOption 1304 | opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials())) 1305 | lconn, err := grpc.NewClient(gleader, opts...) 1306 | if err != nil { 1307 | return nil, err 1308 | } 1309 | 1310 | return lconn, nil 1311 | } 1312 | 1313 | type connT struct { 1314 | conn *grpc.ClientConn 1315 | err error 1316 | } 1317 | 1318 | for { 1319 | select { 1320 | case <-subctx.Done(): 1321 | return nil, context.Canceled 1322 | case <-first: 1323 | case <-ticker.C: 1324 | } 1325 | 1326 | if active.Load() == 1 { 1327 | continue 1328 | } 1329 | 1330 | ch := make(chan connT, 1) 1331 | go func() { 1332 | c, e := getConn() 1333 | ch <- connT{c, e} 1334 | }() 1335 | 1336 | res := <-ch 1337 | conn = res.conn 1338 | err = res.err 1339 | 1340 | tcnt++ 1341 | if err == nil || (tcnt > tlimit) { 1342 | break 1343 | } 1344 | } 1345 | 1346 | return conn, nil 1347 | } 1348 | 1349 | func (op *Op) getMembers() map[string]struct{} { 1350 | op.mtx.Lock() 1351 | copy := make(map[string]struct{}) 1352 | maps.Copy(copy, op.members) 1353 | op.mtx.Unlock() 1354 | return copy 1355 | } 1356 | 1357 | func (op *Op) encodeMembers() string { 1358 | op.mtx.Lock() 1359 | defer op.mtx.Unlock() 1360 | b, _ := json.Marshal(op.members) 1361 | return base64.StdEncoding.EncodeToString(b) 1362 | } 1363 | 1364 | func (op *Op) setMembers(m map[string]struct{}) { 1365 | op.mtx.Lock() 1366 | defer op.mtx.Unlock() 1367 | op.members = m 1368 | } 1369 | 1370 | func (op *Op) addMember(id string) { 1371 | op.mtx.Lock() 1372 | defer op.mtx.Unlock() 1373 | op.members[id] = struct{}{} 1374 | } 1375 | 1376 | func (op *Op) delMember(id string) { 1377 | op.mtx.Lock() 1378 | defer op.mtx.Unlock() 1379 | delete(op.members, id) 1380 | } 1381 | 1382 | // New creates an instance of Op. hostPort can be in "ip:port" format, or ":port" format, in which case 1383 | // the IP part will be resolved internally, or empty, in which case port 8080 will be used. The internal 1384 | // spindle object's lock table name will be lockTable, and lockName is the lock name. logTable will 1385 | // serve as our append-only, distributed key/value storage table. If logTable is empty, Put, Get, and 1386 | // Semaphore features will be disabled. 
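//
// A minimal setup sketch (the Spanner database path and table names below are
// placeholders; the returned Op still needs to be started, typically via its
// Run method, which is not shown here):
//
//	ctx := context.Background()
//	client, err := spanner.NewClient(ctx, "projects/p/instances/i/databases/db")
//	if err != nil {
//		log.Fatal(err)
//	}
//
//	defer client.Close()
//	op := New(client, ":8080", "locktable", "mylock", "logtable")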
1387 | func New(client *spanner.Client, hostPort, lockTable, lockName, logTable string, opts ...Option) *Op { 1388 | op := &Op{ 1389 | hostPort: hostPort, 1390 | spannerClient: client, 1391 | lockTable: lockTable, 1392 | lockName: lockName, 1393 | logTable: logTable, 1394 | members: make(map[string]struct{}), 1395 | ensureCh: make(chan string), 1396 | ensureDone: make(chan struct{}, 1), 1397 | sosLock: &sync.Mutex{}, 1398 | soss: map[string]*SoS{}, 1399 | Lock: &spindle.Lock{}, // init later 1400 | } 1401 | 1402 | for _, opt := range opts { 1403 | opt.Apply(op) 1404 | } 1405 | 1406 | host, port, _ := net.SplitHostPort(op.hostPort) 1407 | switch { 1408 | case host == "" && port != "": 1409 | // We will use memberlist for IP resolution. 1410 | list, _ := memberlist.Create(memberlist.DefaultLANConfig()) 1411 | localNode := list.LocalNode() 1412 | lh, _, _ := net.SplitHostPort(localNode.Address()) 1413 | op.hostPort = net.JoinHostPort(lh, port) 1414 | list.Shutdown() 1415 | case host == "" && port == "": 1416 | // We will use memberlist for IP resolution. 1417 | list, _ := memberlist.Create(memberlist.DefaultLANConfig()) 1418 | localNode := list.LocalNode() 1419 | lh, _, _ := net.SplitHostPort(localNode.Address()) 1420 | op.hostPort = net.JoinHostPort(lh, "8080") 1421 | list.Shutdown() 1422 | } 1423 | 1424 | // Our gRPC host:port by default is set to host:port+1. 1425 | if op.grpcHostPort == "" { 1426 | host, port, _ := net.SplitHostPort(op.hostPort) 1427 | pi, _ := strconv.Atoi(port) 1428 | op.grpcHostPort = net.JoinHostPort(host, fmt.Sprintf("%v", pi+1)) 1429 | } 1430 | 1431 | switch { 1432 | case op.lockTimeout == 0: 1433 | op.lockTimeout = 30000 // default 30s 1434 | case op.lockTimeout < 2000: 1435 | op.lockTimeout = 2000 // minimum 2s 1436 | } 1437 | 1438 | switch { 1439 | case op.syncInterval == 0: 1440 | op.syncInterval = time.Second * 30 // default 1441 | case op.syncInterval < (time.Second * 2): 1442 | op.syncInterval = time.Second * 2 // minimum 1443 | } 1444 | 1445 | if op.logger == nil { 1446 | prefix := fmt.Sprintf("[hedge/%v] ", op.hostPort) 1447 | op.logger = log.New(os.Stdout, prefix, log.LstdFlags) 1448 | } 1449 | 1450 | return op 1451 | } 1452 | 1453 | type SendToLeaderArgs struct { 1454 | // Number of retry attempts to contact the leader. 1455 | // Defaults to 10. If set to a negative number, it 1456 | // will retry forever. 1457 | Retries int 1458 | } 1459 | 1460 | // SendToLeader is a wrapper to hedge.Send() with builtin retry mechanisms. 
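//
// A minimal sketch (the message format is whatever your leader handler expects;
// op is a previously created and running Op):
//
//	reply, err := SendToLeader(ctx, op, []byte("work:123"), &SendToLeaderArgs{Retries: 20})
//	if err != nil {
//		return err
//	}
//
//	log.Printf("leader replied: %v", string(reply))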
1461 | func SendToLeader(ctx context.Context, op *Op, m []byte, args ...*SendToLeaderArgs) ([]byte, error) { 1462 | if op == nil { 1463 | return nil, fmt.Errorf("hedge: op cannot be nil") 1464 | } 1465 | 1466 | retries := 10 1467 | if len(args) > 0 { 1468 | retries = args[0].Retries 1469 | } 1470 | 1471 | if retries == 0 { 1472 | retries = 10 1473 | } 1474 | 1475 | result := make(chan []byte, 1) 1476 | done := make(chan error, 1) 1477 | go func() { 1478 | var err error 1479 | var res []byte 1480 | defer func(b *[]byte, e *error) { 1481 | result <- *b 1482 | done <- *e 1483 | }(&res, &err) 1484 | 1485 | bo := gaxv2.Backoff{ 1486 | Max: time.Minute, 1487 | } 1488 | 1489 | var i int 1490 | for { 1491 | if i >= retries && retries >= 0 { 1492 | break 1493 | } 1494 | 1495 | if !op.IsRunning() { 1496 | time.Sleep(bo.Pause()) 1497 | } 1498 | 1499 | if retries >= 0 { 1500 | i++ 1501 | } 1502 | } 1503 | 1504 | i = 0 1505 | for { 1506 | if i >= retries && retries >= 0 { 1507 | err = fmt.Errorf("hedge: retries exhausted") 1508 | return 1509 | } 1510 | 1511 | var r []byte 1512 | r, err = op.Send(ctx, m) 1513 | if err != nil { 1514 | time.Sleep(bo.Pause()) 1515 | } else { 1516 | res = r // to outside 1517 | return 1518 | } 1519 | 1520 | if retries >= 0 { 1521 | i++ 1522 | } 1523 | } 1524 | }() 1525 | 1526 | for { 1527 | select { 1528 | case e := <-done: 1529 | return <-result, e 1530 | case <-ctx.Done(): 1531 | return nil, context.Canceled 1532 | } 1533 | } 1534 | } 1535 | --------------------------------------------------------------------------------