├── assets └── logo.JPG ├── go.mod ├── internal ├── mathutil │ └── mathutil.go └── httpcache │ ├── index.go │ └── index_test.go ├── .gitignore ├── benchmarks ├── cluster │ ├── runner │ │ └── Dockerfile │ ├── direct │ │ ├── Dockerfile │ │ └── main.go │ ├── node │ │ ├── Dockerfile │ │ └── main.go │ ├── README.md │ └── docker-compose.yml ├── go.mod ├── go.sum ├── benchmark_runner.go └── README.md ├── go.sum ├── cluster ├── limiter_test.go ├── codec.go ├── rendezvous_test.go ├── weights.go ├── bufpool_test.go ├── hlc_test.go ├── bf_msg.go ├── membership_test.go ├── weights_test.go ├── limiter.go ├── errors.go ├── lease_test.go ├── codec_test.go ├── bufpool.go ├── keycodec.go ├── migrate.go ├── keycodec_test.go ├── lease.go ├── hlc.go ├── membership.go ├── wire.go ├── adapter.go ├── heat.go ├── bf_rpc.go ├── rendezvous.go ├── config.go ├── replication.go ├── bf_join.go └── transport.go ├── fnv.go ├── errors.go ├── .github └── workflows │ └── test.yml ├── Makefile ├── _examples ├── basic │ └── main.go └── advanced │ └── main.go ├── snapshot.go ├── shard.go ├── hash_test.go ├── lfu.go ├── manager.go ├── hash.go ├── CHANGELOG.md ├── README.md ├── eviction.go ├── fnv_test.go └── CLUSTER.md /assets/logo.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unkn0wn-root/kioshun/HEAD/assets/logo.JPG -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/unkn0wn-root/kioshun 2 | 3 | go 1.24 4 | 5 | require ( 6 | github.com/cespare/xxhash/v2 v2.3.0 7 | github.com/fxamacker/cbor/v2 v2.6.0 8 | ) 9 | 10 | require github.com/x448/float16 v0.8.4 // indirect 11 | -------------------------------------------------------------------------------- /internal/mathutil/mathutil.go: -------------------------------------------------------------------------------- 1 | package mathutil 2 | 3 | import "math/bits" 4 | 5 | // NextPowerOf2 returns the next power of 2 greater than or equal to n. 6 | func NextPowerOf2(n int) int { 7 | if n <= 1 { 8 | return 1 9 | } 10 | return 1 << bits.Len(uint(n-1)) 11 | } 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *.dylib 3 | *.test 4 | *.out 5 | 6 | go.work 7 | 8 | *.tmp 9 | *.temp 10 | *.log 11 | 12 | /dist/ 13 | /build/ 14 | /bin/ 15 | 16 | coverage.out 17 | coverage.html 18 | 19 | *.prof 20 | 21 | .env 22 | .env.local 23 | .env.*.local 24 | 25 | debug 26 | debug.test 27 | __debug_bin* 28 | -------------------------------------------------------------------------------- /benchmarks/cluster/runner/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.22-alpine AS build 2 | WORKDIR /src 3 | COPY . . 
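# NOTE: the cache mounts on the RUN below are BuildKit features; build with
# BuildKit enabled (e.g. DOCKER_BUILDKIT=1 or docker buildx), otherwise the
# classic builder rejects the --mount flags.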
4 | RUN --mount=type=cache,target=/go/pkg/mod --mount=type=cache,target=/root/.cache/go-build \ 5 | set -eux; \ 6 | cd _benchmarks; \ 7 | go mod tidy; \ 8 | cd cluster/runner && go build -o /out/runner ./ 9 | 10 | FROM alpine:3.19 11 | RUN apk add --no-cache ca-certificates && adduser -D app 12 | USER app 13 | WORKDIR /app 14 | COPY --from=build /out/runner /app/runner 15 | ENTRYPOINT ["/app/runner"] 16 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= 2 | github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 3 | github.com/fxamacker/cbor/v2 v2.6.0 h1:sU6J2usfADwWlYDAFhZBQ6TnLFBHxgesMrQfQgk1tWA= 4 | github.com/fxamacker/cbor/v2 v2.6.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= 5 | github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= 6 | github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= 7 | -------------------------------------------------------------------------------- /benchmarks/cluster/direct/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.22-alpine AS build 2 | WORKDIR /src 3 | COPY . . 4 | RUN --mount=type=cache,target=/go/pkg/mod --mount=type=cache,target=/root/.cache/go-build \ 5 | set -eux; \ 6 | cd _benchmarks; \ 7 | go mod tidy; \ 8 | cd cluster/direct && go build -o /out/direct-runner ./ 9 | 10 | FROM alpine:3.19 11 | RUN apk add --no-cache ca-certificates && adduser -D app 12 | USER app 13 | WORKDIR /app 14 | COPY --from=build /out/direct-runner /app/runner 15 | ENTRYPOINT ["/app/runner"] 16 | 17 | -------------------------------------------------------------------------------- /cluster/limiter_test.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestRateLimiterTokensAndRefill(t *testing.T) { 9 | rl := newRateLimiter(2, 50*time.Millisecond) 10 | defer rl.Stop() 11 | 12 | if !rl.Allow() || !rl.Allow() { 13 | t.Fatalf("expected first two allows to pass") 14 | } 15 | 16 | if rl.Allow() { 17 | t.Fatalf("expected third allow to be rate-limited") 18 | } 19 | 20 | time.Sleep(60 * time.Millisecond) 21 | if !rl.Allow() { 22 | t.Fatalf("expected allow after refill") 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /benchmarks/cluster/node/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.22-alpine AS build 2 | WORKDIR /src 3 | COPY . . 
4 | RUN --mount=type=cache,target=/go/pkg/mod --mount=type=cache,target=/root/.cache/go-build \ 5 | set -eux; \ 6 | cd _benchmarks; \ 7 | go mod tidy; \ 8 | cd cluster/node && go build -o /out/meshnode ./ 9 | 10 | FROM alpine:3.19 11 | RUN apk add --no-cache wget && adduser -D app 12 | USER app 13 | WORKDIR /app 14 | COPY --from=build /out/meshnode /app/meshnode 15 | EXPOSE 8081 8082 8083 5011 5012 5013 16 | ENTRYPOINT ["/app/meshnode"] 17 | -------------------------------------------------------------------------------- /benchmarks/go.mod: -------------------------------------------------------------------------------- 1 | module benchmark 2 | 3 | go 1.21 4 | 5 | require ( 6 | github.com/allegro/bigcache/v3 v3.1.0 7 | github.com/coocood/freecache v1.2.4 8 | github.com/dgraph-io/ristretto v0.1.1 9 | github.com/patrickmn/go-cache v2.1.0+incompatible 10 | github.com/unkn0wn-root/kioshun v0.0.3 11 | github.com/redis/go-redis/v9 v9.5.2 12 | ) 13 | 14 | require ( 15 | github.com/cespare/xxhash/v2 v2.1.2 // indirect 16 | github.com/dustin/go-humanize v1.0.0 // indirect 17 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b // indirect 18 | github.com/pkg/errors v0.9.1 // indirect 19 | golang.org/x/sys v0.0.0-20221010170243-090e33056c14 // indirect 20 | ) 21 | 22 | replace github.com/unkn0wn-root/kioshun => ../ 23 | -------------------------------------------------------------------------------- /fnv.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | const ( 4 | // FNV-1a 5 | fnvOffset64 = 14695981039346656037 6 | fnvPrime64 = 1099511628211 7 | ) 8 | 9 | // fnvHash64 implements FNV-1a hash algorithm with XOR-folding. 10 | // 11 | // Standard FNV-1a algorithm: 12 | // 1. Initialize hash with FNV offset basis 13 | // 2. For each byte: XOR byte with current hash, then multiply by FNV prime 14 | // 3. XOR-before-multiply order distinguishes FNV-1a from FNV-1 15 | // 16 | // Kioshun XOR-folding: 17 | // - Combines upper and lower 32 bits via h ^ (h >> 32) 18 | // - Better hash distribution for shard selection 19 | // - Reduces clustering when using power-of-2 table sizes 20 | func fnvHash64(s string) uint64 { 21 | h := uint64(fnvOffset64) 22 | for i := 0; i < len(s); i++ { 23 | h ^= uint64(s[i]) 24 | h *= fnvPrime64 25 | } 26 | return h ^ (h >> 32) 27 | } 28 | -------------------------------------------------------------------------------- /cluster/codec.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | cbor "github.com/fxamacker/cbor/v2" 5 | ) 6 | 7 | // Codec abstracts value encoding for the wire. Must be 8 | // deterministic and stable across nodes to allow backfill/replication. 9 | type Codec[V any] interface { 10 | Encode(V) ([]byte, error) 11 | Decode([]byte) (V, error) 12 | } 13 | 14 | // BytesCodec: pass-through []byte (no copy on Encode; Decode returns a copy). 
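//
// A minimal usage sketch (illustrative; the Node wiring that supplies a codec
// lives elsewhere):
//
//	var c Codec[[]byte] = BytesCodec{}
//	enc, _ := c.Encode([]byte("v")) // pass-through: enc aliases the input
//	dec, _ := c.Decode(enc)         // fresh copy, safe to retain
//	_ = dec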
15 | type BytesCodec struct{} 16 | 17 | func (BytesCodec) Encode(v []byte) ([]byte, error) { return v, nil } 18 | func (BytesCodec) Decode(b []byte) ([]byte, error) { out := append([]byte(nil), b...); return out, nil } 19 | 20 | type CBORCodec[V any] struct{} 21 | 22 | func (CBORCodec[V]) Encode(v V) ([]byte, error) { return cbor.Marshal(v) } 23 | func (CBORCodec[V]) Decode(b []byte) (V, error) { 24 | var v V 25 | err := cbor.Unmarshal(b, &v) 26 | return v, err 27 | } 28 | -------------------------------------------------------------------------------- /cluster/rendezvous_test.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "sync/atomic" 5 | "testing" 6 | ) 7 | 8 | func TestRingOwnersWeightedFirstIsHeaviest(t *testing.T) { 9 | r := newRing(2) 10 | a := newMeta(NodeID("A"), "a") 11 | b := newMeta(NodeID("B"), "b") 12 | c := newMeta(NodeID("C"), "c") 13 | 14 | // make A overwhelmingly heavy so it always wins top rank. 15 | atomic.StoreUint64(&a.weight, 1_000_000) 16 | atomic.StoreUint64(&b.weight, 1) 17 | atomic.StoreUint64(&c.weight, 1) 18 | r.nodes = []*nodeMeta{a, b, c} 19 | 20 | owners := r.ownersFromKeyHash(12345) 21 | if len(owners) == 0 || owners[0] != a { 22 | t.Fatalf("expected A as first owner, got %#v", owners) 23 | } 24 | 25 | top := r.ownersTopNFromKeyHash(12345, 3) 26 | if len(top) != 3 { 27 | t.Fatalf("expected 3 candidates, got %d", len(top)) 28 | } 29 | if top[0] != a { 30 | t.Fatalf("expected A as first candidate, got %#v", top) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /cluster/weights.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import "math" 4 | 5 | const ( 6 | weightMax = 1_000_000 7 | memRef = 8 << 30 // 8 GiB reference for normalization 8 | ) 9 | 10 | // computeWeight converts node load into a rendezvous weight (1..1_000_000). 11 | // Higher free memory and lower CPU/evictions/size produce larger weights. 
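//
// Worked example (illustrative; exact result subject to float rounding):
//
//	FreeMemBytes = 8 GiB     -> fm  = 1.0
//	CPUu16 = 5000 (50% busy) -> cpu = 0.5
//	Evictions = 0 -> ev = 1.0; Size = 0 -> sz = 1.0
//	s = 0.35*1.0 + 0.35*0.5 + 0.2*1.0 + 0.1*1.0 = 0.825 -> w ≈ 825_000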
12 | func computeWeight(load NodeLoad) uint64 { 13 | var fm float64 = 0.5 14 | if load.FreeMemBytes > 0 { 15 | fm = clamp(float64(load.FreeMemBytes) / float64(memRef)) 16 | } 17 | 18 | cpu := 0.5 19 | if load.CPUu16 > 0 { 20 | cpu = 1.0 - clamp(float64(load.CPUu16)/10000.0) 21 | } 22 | 23 | ev := 1.0 / (1.0 + float64(load.Evictions)) 24 | sz := 1.0 / (1.0 + clamp(float64(load.Size)/1_000_000.0)) 25 | s := fm*0.35 + cpu*0.35 + ev*0.2 + sz*0.1 26 | w := uint64(s * weightMax) 27 | if w < 1 { 28 | w = 1 29 | } 30 | return w 31 | } 32 | func clamp(v float64) float64 { 33 | return math.Max(0, math.Min(1, v)) 34 | } 35 | -------------------------------------------------------------------------------- /cluster/bufpool_test.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import "testing" 4 | 5 | func TestBufPoolGetPut(t *testing.T) { 6 | bp := newBufPool([]int{64, 128}) 7 | 8 | // small buffer uses 64 bucket 9 | b := bp.get(50) 10 | if len(b) != 50 || cap(b) != 64 { 11 | t.Fatalf("unexpected buf: len=%d cap=%d", len(b), cap(b)) 12 | } 13 | bp.put(b) 14 | 15 | // large buffer gets exact allocation (> largest bucket) 16 | big := bp.get(256) 17 | if len(big) != 256 || cap(big) != 256 { 18 | t.Fatalf("unexpected big buf: len=%d cap=%d", len(big), cap(big)) 19 | } 20 | } 21 | 22 | func TestBufPoolClass(t *testing.T) { 23 | bp := newBufPool([]int{64, 128}) 24 | if got := bp.class(1); got != 0 { 25 | t.Fatalf("class(1)=%d", got) 26 | } 27 | if got := bp.class(64); got != 0 { 28 | t.Fatalf("class(64)=%d", got) 29 | } 30 | if got := bp.class(65); got != 1 { 31 | t.Fatalf("class(65)=%d", got) 32 | } 33 | if got := bp.class(129); got != -1 { 34 | t.Fatalf("class(129)=%d", got) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /cluster/hlc_test.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestHLCNextMonotonic(t *testing.T) { 9 | h := newHLC(1) 10 | a := h.Next() 11 | b := h.Next() 12 | if b <= a { 13 | t.Fatalf("Next not monotonic: a=%d b=%d", a, b) 14 | } 15 | } 16 | 17 | func TestHLCObserveRemoteAhead(t *testing.T) { 18 | h := newHLC(1) 19 | _ = h.Next() // initialize 20 | rp := time.Now().UnixMilli() + 5 21 | remote := packHLC(rp, 3, 0) 22 | h.Observe(remote) 23 | after := h.Next() 24 | ap, _ := unpackHLC(after) 25 | if ap < rp { // should catch up to remote physical time 26 | t.Fatalf("did not catch up: ap=%d rp=%d", ap, rp) 27 | } 28 | } 29 | 30 | func TestHLCObserveRemoteBehindNoRegression(t *testing.T) { 31 | h := newHLC(1) 32 | a := h.Next() 33 | ap, _ := unpackHLC(a) 34 | // remote behind current physical time 35 | remote := packHLC(ap-10, 0, 0) 36 | h.Observe(remote) 37 | b := h.Next() 38 | if b <= a { 39 | t.Fatalf("regressed: a=%d b=%d", a, b) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /errors.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | ) 7 | 8 | var ( 9 | ErrCacheExists = errors.New("cache already exists") 10 | ErrCacheNotFound = errors.New("cache not found") 11 | ErrTypeMismatch = errors.New("cache type mismatch") 12 | ErrInvalidConfig = errors.New("invalid cache configuration") 13 | ErrCacheClosed = errors.New("cache is closed") 14 | ) 15 | 16 | type CacheError struct { 17 | Op string 18 | Name string 19 | 
Cause error 20 | } 21 | 22 | func (e *CacheError) Error() string { 23 | if e.Name != "" { 24 | return fmt.Sprintf("cache %s %s: %v", e.Op, e.Name, e.Cause) 25 | } 26 | return fmt.Sprintf("cache %s: %v", e.Op, e.Cause) 27 | } 28 | 29 | func (e *CacheError) Unwrap() error { 30 | return e.Cause 31 | } 32 | 33 | func newCacheError(op, name string, cause error) *CacheError { 34 | return &CacheError{ 35 | Op: op, 36 | Name: name, 37 | Cause: cause, 38 | } 39 | } 40 | 41 | func wrapError(op string, err error) *CacheError { 42 | return &CacheError{ 43 | Op: op, 44 | Cause: err, 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /cluster/bf_msg.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | // Digest for a key-hash prefix bucket; used to detect divergent buckets 4 | // without transferring all keys. 5 | type BucketDigest struct { 6 | Prefix []byte `cbor:"p"` // first Depth bytes of key-hash (big-endian) 7 | Count uint32 `cbor:"c"` 8 | Hash64 uint64 `cbor:"h"` 9 | } 10 | 11 | type MsgBackfillDigestReq struct { 12 | Base 13 | TargetID string `cbor:"tid"` // joiner ID 14 | Depth uint8 `cbor:"d"` // bytes of prefix (1..8) 15 | } 16 | 17 | type MsgBackfillDigestResp struct { 18 | Base 19 | Depth uint8 `cbor:"d"` 20 | Buckets []BucketDigest `cbor:"b"` 21 | NotInRing bool `cbor:"nr,omitempty"` 22 | } 23 | 24 | type MsgBackfillKeysReq struct { 25 | Base 26 | TargetID string `cbor:"tid"` // joiner ID 27 | Prefix []byte `cbor:"p"` // len == Depth 28 | Limit int `cbor:"l"` // page size 29 | Cursor []byte `cbor:"u"` // last 8B key-hash (big-endian) for pagination 30 | } 31 | 32 | type MsgBackfillKeysResp struct { 33 | Base 34 | Items []KV `cbor:"i"` 35 | NextCursor []byte `cbor:"u"` // nil when done 36 | Done bool `cbor:"o"` 37 | NotInRing bool `cbor:"nr,omitempty"` 38 | } 39 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-latest, macos-latest] 15 | go-version: ['1.21', '1.22'] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Set up Go 21 | uses: actions/setup-go@v4 22 | with: 23 | go-version: ${{ matrix.go-version }} 24 | 25 | - name: Cache Go modules 26 | uses: actions/cache@v4 27 | with: 28 | path: | 29 | ~/.cache/go-build 30 | ~/go/pkg/mod 31 | key: ${{ runner.os }}-go-${{ matrix.go-version }}-${{ hashFiles('**/go.sum') }} 32 | restore-keys: | 33 | ${{ runner.os }}-go-${{ matrix.go-version }}- 34 | 35 | - name: Format check 36 | run: | 37 | if [ "$(gofmt -s -l . | wc -l)" -gt 0 ]; then 38 | echo "Code is not formatted:" 39 | gofmt -s -l . 40 | exit 1 41 | fi 42 | 43 | - name: Lint 44 | run: go vet ./... 45 | 46 | - name: Run tests 47 | run: go test -v -race ./... 48 | -------------------------------------------------------------------------------- /cluster/membership_test.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestMembershipIntegrateAlivePrune(t *testing.T) { 9 | m := newMembership() 10 | now := time.Now().UnixNano() 11 | 12 | // integrate gossip from A, referencing B as known peer. 
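// (args: from, addr, relayed peers, remote seen map keyed by peer ID, epoch, now)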
13 | m.integrate(NodeID("A"), "A", []PeerInfo{{ID: "B", Addr: "B"}}, map[string]int64{"B": now}, 10, now) 14 | 15 | if m.epoch != 10 { 16 | t.Fatalf("epoch not updated: %d", m.epoch) 17 | } 18 | 19 | al := m.alive(now, 1*time.Second) 20 | got := make(map[NodeID]bool) 21 | for _, nm := range al { 22 | got[nm.ID] = true 23 | } 24 | 25 | if !got[NodeID("A")] || !got[NodeID("B")] { 26 | t.Fatalf("alive missing A or B: %+v", got) 27 | } 28 | 29 | // lower epoch should not decrease stored epoch. 30 | m.integrate(NodeID("A"), "A", nil, nil, 5, now) 31 | if m.epoch != 10 { 32 | t.Fatalf("epoch regressed: %d", m.epoch) 33 | } 34 | 35 | // make B stale and prune tombstones. 36 | m.mu.Lock() 37 | m.seen[NodeID("B")] = now - int64(10*time.Second) 38 | m.mu.Unlock() 39 | m.pruneTombstones(now, 5*time.Second) 40 | 41 | m.mu.RLock() 42 | _, okB := m.peers[NodeID("B")] 43 | m.mu.RUnlock() 44 | if okB { 45 | t.Fatalf("expected B to be pruned") 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /cluster/weights_test.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import "testing" 4 | 5 | func TestComputeWeightBounds(t *testing.T) { 6 | w := computeWeight(NodeLoad{}) 7 | if w < 1 || w > weightMax { 8 | t.Fatalf("weight out of bounds: %d", w) 9 | } 10 | } 11 | 12 | func TestComputeWeightSensitivity(t *testing.T) { 13 | base := NodeLoad{Size: 1_000_000, Evictions: 0, FreeMemBytes: 4 << 30, CPUu16: 5000} 14 | w1 := computeWeight(base) 15 | 16 | // more free memory => higher weight 17 | w2 := computeWeight(NodeLoad{Size: base.Size, Evictions: base.Evictions, FreeMemBytes: 8 << 30, CPUu16: base.CPUu16}) 18 | if w2 <= w1 { 19 | t.Fatalf("expected weight to increase with free memory: %d -> %d", w1, w2) 20 | } 21 | 22 | // higher CPU usage => lower weight 23 | w3 := computeWeight(NodeLoad{Size: base.Size, Evictions: base.Evictions, FreeMemBytes: base.FreeMemBytes, CPUu16: 9000}) 24 | if w3 >= w1 { 25 | t.Fatalf("expected weight to decrease with CPU load: %d -> %d", w1, w3) 26 | } 27 | 28 | // more evictions => lower weight 29 | w4 := computeWeight(NodeLoad{Size: base.Size, Evictions: 100, FreeMemBytes: base.FreeMemBytes, CPUu16: base.CPUu16}) 30 | if w4 >= w1 { 31 | t.Fatalf("expected weight to decrease with evictions: %d -> %d", w1, w4) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /cluster/limiter.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | ) 7 | 8 | type rateLimiter struct { 9 | mu sync.Mutex 10 | max int 11 | tokens int 12 | interval time.Duration 13 | stopCh chan struct{} 14 | } 15 | 16 | // newRateLimiter implements a simple token bucket with fixed window refill. 17 | func newRateLimiter(max int, interval time.Duration) *rateLimiter { 18 | rl := &rateLimiter{ 19 | max: max, 20 | tokens: max, 21 | interval: interval, 22 | stopCh: make(chan struct{}), 23 | } 24 | go rl.refill() 25 | return rl 26 | } 27 | 28 | // refill resets the available tokens to max at fixed intervals. 29 | func (r *rateLimiter) refill() { 30 | t := time.NewTicker(r.interval) 31 | defer t.Stop() 32 | for { 33 | select { 34 | case <-t.C: 35 | r.mu.Lock() 36 | r.tokens = r.max 37 | r.mu.Unlock() 38 | case <-r.stopCh: 39 | return 40 | } 41 | } 42 | } 43 | 44 | // Allow consumes a token if available; returns false when rate-limited. 
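//
// Usage sketch (mirrors limiter_test.go):
//
//	rl := newRateLimiter(2, 50*time.Millisecond)
//	defer rl.Stop()
//	rl.Allow() // true
//	rl.Allow() // true
//	rl.Allow() // false until the next 50ms refill resets the bucket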
45 | func (r *rateLimiter) Allow() bool { 46 | r.mu.Lock() 47 | defer r.mu.Unlock() 48 | if r.tokens <= 0 { 49 | return false 50 | } 51 | r.tokens-- 52 | return true 53 | } 54 | 55 | // Stop terminates the refill goroutine. 56 | func (r *rateLimiter) Stop() { close(r.stopCh) } 57 | -------------------------------------------------------------------------------- /cluster/errors.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "errors" 5 | "io" 6 | "net" 7 | "syscall" 8 | ) 9 | 10 | const ErrNotFound = "notfound" 11 | 12 | var ( 13 | ErrNoOwner = errors.New("no owner for key") 14 | ErrTimeout = errors.New("timeout") 15 | ErrClosed = errors.New("cluster closed") 16 | ErrBadPeer = errors.New("bad peer response") 17 | ErrNoLoader = errors.New("no loader configured on primary") 18 | ErrLeaseTimeout = errors.New("lease timeout") 19 | ErrPeerClosed = errors.New("peer closed") 20 | ) 21 | 22 | // isFatalTransport reports whether an error indicates a broken or unusable 23 | // transport that should trigger a peer reset/redial. 24 | // Timeouts and application errors are considered non-fatal. 25 | func isFatalTransport(err error) bool { 26 | if err == nil { 27 | return false 28 | } 29 | 30 | if errors.Is(err, ErrTimeout) { 31 | return false 32 | } 33 | 34 | if errors.Is(err, ErrPeerClosed) || errors.Is(err, net.ErrClosed) || errors.Is(err, io.EOF) { 35 | return true 36 | } 37 | 38 | var nerr net.Error 39 | if errors.As(err, &nerr) { 40 | return !nerr.Timeout() 41 | } 42 | 43 | if errors.Is(err, syscall.ECONNRESET) || errors.Is(err, syscall.EPIPE) || errors.Is(err, syscall.ECONNABORTED) { 44 | return true 45 | } 46 | return false 47 | } 48 | -------------------------------------------------------------------------------- /cluster/lease_test.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestLeaseTableAcquireReleaseWait(t *testing.T) { 10 | lt := newLeaseTable(200 * time.Millisecond) 11 | defer lt.Stop() 12 | 13 | f1, acq1 := lt.acquire("k") 14 | if !acq1 || f1 == nil { 15 | t.Fatalf("expected first acquire to create inflight") 16 | } 17 | 18 | f2, acq2 := lt.acquire("k") 19 | if acq2 || f2 == nil || f2 != f1 { 20 | t.Fatalf("expected second acquire to wait on same inflight") 21 | } 22 | 23 | done := make(chan error, 1) 24 | go func() { 25 | done <- lt.wait(context.Background(), "k") 26 | }() 27 | 28 | time.Sleep(20 * time.Millisecond) 29 | lt.release("k", nil) 30 | select { 31 | case err := <-done: 32 | if err != nil { 33 | t.Fatalf("unexpected wait error: %v", err) 34 | } 35 | case <-time.After(1 * time.Second): 36 | t.Fatalf("wait timed out") 37 | } 38 | } 39 | 40 | func TestLeaseTableTimeout(t *testing.T) { 41 | lt := newLeaseTable(50 * time.Millisecond) 42 | defer lt.Stop() 43 | 44 | _, acq := lt.acquire("x") 45 | if !acq { 46 | t.Fatalf("expected acquire") 47 | } 48 | 49 | ctx, cancel := context.WithTimeout(context.Background(), time.Second) 50 | defer cancel() 51 | err := lt.wait(ctx, "x") 52 | if err == nil { 53 | t.Fatalf("expected timeout error") 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /cluster/codec_test.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func TestBytesCodecPassAndCopyOnDecode(t *testing.T) { 9 | var 
bc BytesCodec 10 | 11 | // encode should be pass-through 12 | v := []byte{1, 2, 3} 13 | enc, err := bc.Encode(v) 14 | if err != nil { 15 | t.Fatalf("encode error: %v", err) 16 | } 17 | v[0] = 9 18 | if enc[0] != 9 { 19 | t.Fatalf("encode not pass-through: got %v", enc) 20 | } 21 | 22 | // decode should return a copy detached from input. 23 | in := []byte{4, 5, 6} 24 | out, err := bc.Decode(in) 25 | if err != nil { 26 | t.Fatalf("decode error: %v", err) 27 | } 28 | if !reflect.DeepEqual(out, in) { 29 | t.Fatalf("decode mismatch: got %v want %v", out, in) 30 | } 31 | in[0] = 7 32 | if out[0] == in[0] { 33 | t.Fatalf("decode did not copy. Out mutated: %v vs %v", out, in) 34 | } 35 | } 36 | 37 | func TestCBORCodecRoundTrip(t *testing.T) { 38 | type S struct { 39 | A int 40 | B string 41 | } 42 | var c CBORCodec[S] 43 | orig := S{A: 42, B: "x"} 44 | b, err := c.Encode(orig) 45 | if err != nil { 46 | t.Fatalf("encode error: %v", err) 47 | } 48 | got, err := c.Decode(b) 49 | if err != nil { 50 | t.Fatalf("decode error: %v", err) 51 | } 52 | if !reflect.DeepEqual(got, orig) { 53 | t.Fatalf("round-trip mismatch: got %+v want %+v", got, orig) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /cluster/bufpool.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import "sync" 4 | 5 | type bufPool struct { 6 | sizes []int 7 | pools []sync.Pool 8 | indexBySize map[int]int 9 | } 10 | 11 | // newBufPool creates fixed-size byte slice pools for a small set of common 12 | // buffer sizes to reduce allocations on hot paths (framed I/O). 13 | func newBufPool(sizes []int) *bufPool { 14 | bp := &bufPool{ 15 | sizes: sizes, 16 | pools: make([]sync.Pool, len(sizes)), 17 | indexBySize: make(map[int]int, len(sizes)), 18 | } 19 | for i, sz := range sizes { 20 | size := sz 21 | bp.pools[i].New = func() any { 22 | b := make([]byte, size) 23 | return b 24 | } 25 | bp.indexBySize[sz] = i 26 | } 27 | return bp 28 | } 29 | 30 | // class returns the index of the first bucket that can hold n bytes. 31 | func (bp *bufPool) class(n int) int { 32 | for i, sz := range bp.sizes { 33 | if n <= sz { 34 | return i 35 | } 36 | } 37 | return -1 38 | } 39 | 40 | // get returns a slice of length n from an appropriate bucket (or an exact 41 | // allocation if n exceeds the largest bucket size). 42 | func (bp *bufPool) get(n int) []byte { 43 | if i := bp.class(n); i >= 0 { 44 | b := bp.pools[i].Get().([]byte) 45 | // return a slice of length n, capacity bucket size. 46 | return b[:n] 47 | } 48 | // big frame (> largest bucket): allocate exact. 49 | return make([]byte, n) 50 | } 51 | 52 | // put returns a buffer to the matching bucket by capacity. non-pooled sizes 53 | // are dropped on the floor to avoid unbounded pool growth. 54 | func (bp *bufPool) put(b []byte) { 55 | if i, ok := bp.indexBySize[cap(b)]; ok { 56 | // restore to full capacity before putting back. 
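// (callers may put() a shortened slice; normalizing keeps every pooled
// buffer's length equal to its bucket size before reuse)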
57 | b = b[:bp.sizes[i]] 58 | bp.pools[i].Put(b) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GOCMD=go 2 | GOBUILD=$(GOCMD) build 3 | GOCLEAN=$(GOCMD) clean 4 | GOTEST=$(GOCMD) test 5 | GOGET=$(GOCMD) get 6 | GOMOD=$(GOCMD) mod 7 | GOFMT=$(GOCMD) fmt 8 | GOVET=$(GOCMD) vet 9 | GOLINT=golint 10 | 11 | BINARY_NAME=kioshun 12 | PACKAGE_NAME=github.com/unkn0wn-root/kioshun 13 | 14 | .PHONY: all 15 | all: test build 16 | 17 | .PHONY: build 18 | build: 19 | $(GOBUILD) -v ./... 20 | 21 | .PHONY: test 22 | test: 23 | $(GOTEST) -v -race -coverprofile=coverage.out ./... 24 | 25 | .PHONY: bench-deps 26 | bench-deps: 27 | cd _benchmarks && $(GOMOD) tidy && $(GOMOD) download 28 | 29 | .PHONY: bench-runner 30 | bench-runner: bench-deps 31 | cd _benchmarks && timeout 600 go run benchmark_runner.go 32 | 33 | .PHONY: bench 34 | bench: bench-deps 35 | cd _benchmarks && $(GOTEST) -bench=. -benchmem -run=^$$ ./... 36 | 37 | .PHONY: bench-full 38 | bench-full: bench-deps 39 | cd _benchmarks && $(GOTEST) -bench=. -benchmem -benchtime=10s -run=^$$ ./... 40 | 41 | .PHONY: bench-compare 42 | bench-compare: bench-deps 43 | @echo "Running performance comparison..." 44 | cd _benchmarks && $(GOTEST) -bench=BenchmarkCacheShardComparison -benchmem -run=^$$ ./... 45 | cd _benchmarks && $(GOTEST) -bench=BenchmarkCacheEvictionPolicyComparison -benchmem -run=^$$ ./... 46 | 47 | .PHONY: lint 48 | lint: 49 | $(GOVET) ./... 50 | $(GOLINT) ./... 51 | 52 | .PHONY: fmt 53 | fmt: 54 | $(GOFMT) ./... 55 | 56 | .PHONY: clean 57 | clean: 58 | $(GOCLEAN) 59 | rm -f $(BINARY_NAME) 60 | 61 | .PHONY: tidy 62 | tidy: 63 | $(GOMOD) tidy 64 | 65 | .PHONY: deps 66 | deps: 67 | $(GOMOD) download 68 | 69 | .PHONY: check 70 | check: fmt lint test 71 | 72 | .PHONY: stress-test 73 | stress-test: bench-deps 74 | @echo "Running stress test..." 75 | cd _benchmarks && $(GOTEST) -bench=BenchmarkCacheScalability -benchmem -benchtime=30s -run=^$$ ./... 76 | 77 | .PHONY: mem-analysis 78 | mem-analysis: bench-deps 79 | @echo "Running memory usage analysis..." 80 | cd _benchmarks && $(GOTEST) -bench=BenchmarkCacheMemoryUsage -benchmem -run=^$$ ./... 81 | 82 | .PHONY: install-tools 83 | install-tools: 84 | $(GOGET) -u golang.org/x/lint/golint 85 | $(GOGET) -u golang.org/x/tools/cmd/goimports 86 | $(GOGET) -u github.com/kisielk/errcheck 87 | -------------------------------------------------------------------------------- /_examples/basic/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/unkn0wn-root/kioshun" 8 | ) 9 | 10 | func main() { 11 | fmt.Println("=== Basic Cache Usage ===") 12 | 13 | // Create a cache with default configuration 14 | cache := cache.NewWithDefaults[string, string]() 15 | defer cache.Close() 16 | 17 | // Basic Set and Get operations 18 | fmt.Println("\n1. Basic Set and Get ops:") 19 | cache.Set("user:123", "David Kier", 5*time.Minute) 20 | cache.Set("user:456", "Michael Ballack", 5*time.Minute) 21 | cache.Set("user:789", "Cristiano Bombaldo", 5*time.Minute) 22 | 23 | if value, found := cache.Get("user:123"); found { 24 | fmt.Printf("Found user: %s\n", value) 25 | } 26 | 27 | // Get with TTL information 28 | fmt.Println("\n2. 
Get with TTL:") 29 | if value, ttl, found := cache.GetWithTTL("user:123"); found { 30 | fmt.Printf("User: %s, TTL remaining: %s\n", value, ttl) 31 | } 32 | 33 | // Check existence without updating access time 34 | fmt.Println("\n3. Check existence:") 35 | if cache.Exists("user:123") { 36 | fmt.Println("User 123 exists in cache") 37 | } 38 | 39 | // Delete operation 40 | fmt.Println("\n4. Delete ops:") 41 | if cache.Delete("user:456") { 42 | fmt.Println("User 456 deleted from cache") 43 | } 44 | 45 | // Check size 46 | fmt.Println("\n5. Cache size:") 47 | fmt.Printf("Current cache size: %d\n", cache.Size()) 48 | 49 | // Get all keys 50 | fmt.Println("\n6. All keys:") 51 | keys := cache.Keys() 52 | for _, key := range keys { 53 | fmt.Printf("Key: %s\n", key) 54 | } 55 | 56 | // Cache statistics 57 | fmt.Println("\n7. Cache statistics:") 58 | stats := cache.Stats() 59 | fmt.Printf("Hits: %d, Misses: %d, Size: %d, Hit Ratio: %.2f%%\n", 60 | stats.Hits, stats.Misses, stats.Size, stats.HitRatio*100) 61 | 62 | // Set with callback on expiration 63 | fmt.Println("\n8. Set with expiration callback:") 64 | cache.SetWithCallback("temp:data", "temporary value", 2*time.Second, func(key string, value string) { 65 | fmt.Printf("Key %s expired with value: %s\n", key, value) 66 | }) 67 | 68 | // Wait for expiration 69 | time.Sleep(3 * time.Second) 70 | 71 | // Clear all 72 | fmt.Println("\n9. Clear all:") 73 | cache.Clear() 74 | fmt.Printf("Cache size after clear: %d\n", cache.Size()) 75 | 76 | fmt.Println("\n=== Example completed ===") 77 | } 78 | -------------------------------------------------------------------------------- /cluster/keycodec.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "encoding/binary" 5 | "errors" 6 | 7 | xxhash "github.com/cespare/xxhash/v2" 8 | ) 9 | 10 | // KeyCodec maps K <-> []byte for wire/hashing. Should be 11 | // stable across nodes. Optional KeyHasher allows zero-copy hash fast-paths. 12 | type KeyCodec[K any] interface { 13 | EncodeKey(K) []byte 14 | DecodeKey([]byte) (K, error) 15 | } 16 | 17 | // KeyHasher optional fast-path (zero-copy hash of K). 18 | type KeyHasher[K any] interface { 19 | Hash64(K) uint64 20 | } 21 | 22 | // String keys: encode to raw bytes; xxhash for hashing. 23 | type StringKeyCodec[K ~string] struct{} 24 | 25 | func (StringKeyCodec[K]) EncodeKey(k K) []byte { return []byte(string(k)) } 26 | func (StringKeyCodec[K]) DecodeKey(b []byte) (K, error) { return K(string(b)), nil } 27 | func (StringKeyCodec[K]) Hash64(k K) uint64 { return xxhash.Sum64String(string(k)) } 28 | 29 | // Bytes keys: returns underlying slice. Decode copies to detach from caller. 
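//
// Round-trip sketch (illustrative):
//
//	var kc BytesKeyCodec[[]byte]
//	b := kc.EncodeKey([]byte("k1")) // aliases the caller's slice
//	k, _ := kc.DecodeKey(b)         // detached copy of b
//	_ = kc.Hash64(k)                // xxhash over the raw bytes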
30 | type BytesKeyCodec[K ~[]byte] struct{} 31 | 32 | func (BytesKeyCodec[K]) EncodeKey(k K) []byte { return []byte(k) } 33 | func (BytesKeyCodec[K]) DecodeKey(b []byte) (K, error) { return K(append([]byte(nil), b...)), nil } 34 | func (BytesKeyCodec[K]) Hash64(k K) uint64 { return xxhash.Sum64([]byte(k)) } 35 | 36 | type Int64KeyCodec[K ~int64] struct{} 37 | 38 | func (Int64KeyCodec[K]) EncodeKey(k K) []byte { 39 | var buf [8]byte 40 | binary.BigEndian.PutUint64(buf[:], uint64(k)) 41 | return buf[:] 42 | } 43 | 44 | func (Int64KeyCodec[K]) DecodeKey(b []byte) (K, error) { 45 | if len(b) != 8 { 46 | return *new(K), errors.New("invalid int64 key length") 47 | } 48 | return K(int64(binary.BigEndian.Uint64(b))), nil 49 | } 50 | 51 | func (Int64KeyCodec[K]) Hash64(k K) uint64 { 52 | return mix64(uint64(k)) 53 | } 54 | 55 | type Uint64KeyCodec[K ~uint64] struct{} 56 | 57 | func (Uint64KeyCodec[K]) EncodeKey(k K) []byte { 58 | var buf [8]byte 59 | binary.BigEndian.PutUint64(buf[:], uint64(k)) 60 | return buf[:] 61 | } 62 | 63 | func (Uint64KeyCodec[K]) DecodeKey(b []byte) (K, error) { 64 | if len(b) != 8 { 65 | return *new(K), errors.New("invalid uint64 key length") 66 | } 67 | return K(binary.BigEndian.Uint64(b)), nil 68 | } 69 | 70 | func (Uint64KeyCodec[K]) Hash64(k K) uint64 { 71 | return mix64(uint64(k)) 72 | } 73 | -------------------------------------------------------------------------------- /cluster/migrate.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "time" 5 | 6 | cbor "github.com/fxamacker/cbor/v2" 7 | ) 8 | 9 | // rebalancerLoop periodically runs a bounded rebalance pass that migrates 10 | // locally owned-but-not-primary keys to their current primary owner. 11 | func (n *Node[K, V]) rebalancerLoop() { 12 | iv := n.cfg.RebalanceInterval 13 | if iv <= 0 { 14 | return 15 | } 16 | 17 | t := time.NewTicker(iv) 18 | defer t.Stop() 19 | for { 20 | select { 21 | case <-t.C: 22 | n.rebalanceOnce() 23 | case <-n.stop: 24 | return 25 | } 26 | } 27 | } 28 | 29 | // rebalanceOnce scans up to RebalanceLimit local keys and, for keys whose 30 | // primary owner moved away, pushes their latest value to the new primary and 31 | // deletes the local copy on success. 32 | func (n *Node[K, V]) rebalanceOnce() { 33 | keys := n.local.Keys() 34 | if len(keys) == 0 { 35 | return 36 | } 37 | 38 | limit := n.cfg.RebalanceLimit 39 | if limit <= 0 || limit > len(keys) { 40 | limit = len(keys) 41 | } 42 | 43 | for i := 0; i < limit; i++ { 44 | k := keys[i] 45 | owners := n.ownersFor(k) 46 | if len(owners) == 0 { 47 | continue 48 | } 49 | 50 | primary := owners[0] 51 | if primary.ID == n.cfg.ID { 52 | continue 53 | } 54 | 55 | v, ttl, ok := n.local.GetWithTTL(k) 56 | if !ok { 57 | continue 58 | } 59 | 60 | // Encode + (maybe) compress once. 61 | vb, err := n.codec.Encode(v) 62 | if err != nil { 63 | continue 64 | } 65 | vb, cp := n.maybeCompress(vb) 66 | 67 | bk := n.kc.EncodeKey(k) 68 | exp := absExpiry(ttl) 69 | pc := n.getPeer(primary.ID) 70 | if pc == nil || pc.penalized() { 71 | // Let next pass try again; we keep local until success. 
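// (the key stays readable from this node in the meantime; only a
// confirmed MsgSetResp.OK below triggers the local delete)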
72 | continue 73 | } 74 | 75 | id := n.nextReqID() 76 | ver := n.clock.Next() 77 | msg := &MsgSet{ 78 | Base: Base{ 79 | T: MTSet, 80 | ID: id, 81 | }, 82 | Key: bk, 83 | Val: vb, 84 | Exp: exp, 85 | Ver: ver, 86 | Cp: cp, 87 | } 88 | 89 | raw, err := pc.request(msg, id, n.cfg.Sec.WriteTimeout) 90 | if err != nil { 91 | if isFatalTransport(err) { 92 | n.resetPeer(primary.ID) 93 | } 94 | continue 95 | } 96 | 97 | var resp MsgSetResp 98 | if e := cbor.Unmarshal(raw, &resp); e != nil { 99 | n.resetPeer(primary.ID) 100 | continue 101 | } 102 | if !resp.OK { 103 | continue 104 | } 105 | n.local.Delete(k) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /cluster/keycodec_test.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "encoding/binary" 5 | "testing" 6 | 7 | xxhash "github.com/cespare/xxhash/v2" 8 | ) 9 | 10 | func TestStringKeyCodec(t *testing.T) { 11 | var kc StringKeyCodec[string] 12 | k := "hello" 13 | b := kc.EncodeKey(k) 14 | if string(b) != k { 15 | t.Fatalf("encode mismatch: %q", string(b)) 16 | } 17 | 18 | dk, err := kc.DecodeKey(b) 19 | if err != nil { 20 | t.Fatalf("decode error: %v", err) 21 | } 22 | if dk != k { 23 | t.Fatalf("decode mismatch: %q != %q", dk, k) 24 | } 25 | if kc.Hash64(k) != xxhash.Sum64String(k) { 26 | t.Fatalf("hash mismatch") 27 | } 28 | } 29 | 30 | func TestBytesKeyCodec(t *testing.T) { 31 | var kc BytesKeyCodec[[]byte] 32 | in := []byte{1, 2, 3} 33 | b := kc.EncodeKey(in) 34 | if string(b) != string(in) { 35 | t.Fatalf("encode mismatch") 36 | } 37 | 38 | dk, err := kc.DecodeKey(b) 39 | if err != nil { 40 | t.Fatalf("decode error: %v", err) 41 | } 42 | 43 | // decode should copy to detach from caller buffer. 
44 | b[0] = 9 45 | if dk[0] == b[0] { 46 | t.Fatalf("decode did not copy") 47 | } 48 | 49 | if kc.Hash64(in) != xxhash.Sum64(in) { 50 | t.Fatalf("hash mismatch") 51 | } 52 | } 53 | 54 | func TestInt64KeyCodec(t *testing.T) { 55 | var kc Int64KeyCodec[int64] 56 | k := int64(-1234567890) 57 | b := kc.EncodeKey(k) 58 | if len(b) != 8 { 59 | t.Fatalf("encode length: %d", len(b)) 60 | } 61 | got, err := kc.DecodeKey(b) 62 | if err != nil { 63 | t.Fatalf("decode error: %v", err) 64 | } 65 | if got != k { 66 | t.Fatalf("round-trip mismatch: %d != %d", got, k) 67 | } 68 | 69 | // check big-endian layout 70 | if binary.BigEndian.Uint64(b) != uint64(k) { 71 | t.Fatalf("big-endian mismatch") 72 | } 73 | 74 | if _, err := kc.DecodeKey([]byte{1, 2}); err == nil { 75 | t.Fatalf("expected length error") 76 | } 77 | 78 | if kc.Hash64(k) != mix64(uint64(k)) { 79 | t.Fatalf("hash mismatch") 80 | } 81 | } 82 | 83 | func TestUint64KeyCodec(t *testing.T) { 84 | var kc Uint64KeyCodec[uint64] 85 | k := uint64(0xdeadbeefcafebabe) 86 | b := kc.EncodeKey(k) 87 | if len(b) != 8 { 88 | t.Fatalf("encode length: %d", len(b)) 89 | } 90 | 91 | got, err := kc.DecodeKey(b) 92 | if err != nil { 93 | t.Fatalf("decode error: %v", err) 94 | } 95 | if got != k { 96 | t.Fatalf("round-trip mismatch: %d != %d", got, k) 97 | } 98 | 99 | if _, err := kc.DecodeKey([]byte{1, 2, 3}); err == nil { 100 | t.Fatalf("expected length error") 101 | } 102 | 103 | if kc.Hash64(k) != mix64(k) { 104 | t.Fatalf("hash mismatch") 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /snapshot.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "sync/atomic" 5 | "time" 6 | ) 7 | 8 | // Item is a wire-friendly export with absolute expiry. 9 | // NOTE: 10 | // - Version here is NOT the cluster LWW/HLC version. Export() currently uses a 11 | // placeholder (frequency) which is suitable for application-level cache dumps 12 | // and warm starts, but not for cluster snapshots. 13 | // - Cluster replication/backfill paths supply their own authoritative versions 14 | // and call InMemoryCache.Import directly with those values. 15 | type Item[K comparable, V any] struct { 16 | Key K 17 | Val V 18 | ExpireAbs int64 // 0 = no expiration 19 | Version uint64 // reserved for LWW if you add a real version later 20 | } 21 | 22 | // Export up to max items for which selectFn(key) is true. 23 | // Intended for application-level dump/restore. Not used by cluster state 24 | // transfer, because it does not carry the cluster's LWW versions. 25 | func (c *InMemoryCache[K, V]) Export(selectFn func(K) bool, mx int) []Item[K, V] { 26 | out := make([]Item[K, V], 0, mx) 27 | outer: 28 | for _, s := range c.shards { 29 | s.mu.RLock() 30 | now := time.Now().UnixNano() 31 | for k, it := range s.data { 32 | if mx > 0 && len(out) >= mx { 33 | s.mu.RUnlock() 34 | break outer 35 | } 36 | if it.expireTime > 0 && now > it.expireTime { 37 | continue 38 | } 39 | if !selectFn(k) { 40 | continue 41 | } 42 | out = append(out, Item[K, V]{ 43 | Key: k, 44 | Val: it.value, 45 | ExpireAbs: it.expireTime, 46 | Version: uint64(it.frequency), // placeholder 47 | }) 48 | } 49 | s.mu.RUnlock() 50 | } 51 | return out 52 | } 53 | 54 | // Import inserts/overwrites with absolute expiry. Cluster replication, 55 | // backfill, and rebalancing use this to apply authoritative state (including 56 | // LWW versions) without admission/eviction decisions. 
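//
// Warm-start sketch (illustrative; assumes string keys and two cache
// instances named old and fresh):
//
//	items := old.Export(func(k string) bool { return true }, 0) // mx <= 0 exports everything
//	fresh.Import(items) // absolute expiries preserved; Version fields are placeholders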
57 | func (c *InMemoryCache[K, V]) Import(items []Item[K, V]) { 58 | now := time.Now().UnixNano() 59 | for _, it := range items { 60 | s := c.getShard(it.Key) 61 | s.mu.Lock() 62 | ex, ok := s.data[it.Key] 63 | if !ok { 64 | ex = c.itemPool.Get().(*cacheItem[V]) 65 | s.data[it.Key] = ex 66 | s.addToLRUHead(ex) 67 | atomic.AddInt64(&s.size, 1) 68 | } else if c.config.EvictionPolicy == LFU { 69 | s.lfuList.remove(ex) 70 | } 71 | ex.key, ex.value = it.Key, it.Val 72 | ex.lastAccess = now 73 | ex.expireTime = it.ExpireAbs 74 | switch c.config.EvictionPolicy { 75 | case LRU: 76 | s.moveToLRUHead(ex) 77 | case LFU: 78 | ex.frequency = 1 79 | s.lfuList.add(ex) 80 | case AdmissionLFU: 81 | ex.frequency = 1 82 | } 83 | s.mu.Unlock() 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /benchmarks/go.sum: -------------------------------------------------------------------------------- 1 | github.com/allegro/bigcache/v3 v3.1.0 h1:H2Vp8VOvxcrB91o86fUSVJFqeuz8kpyyB02eH3bSzwk= 2 | github.com/allegro/bigcache/v3 v3.1.0/go.mod h1:aPyh7jEvrog9zAwx5N7+JUQX5dZTSGpxF1LAR4dr35I= 3 | github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 4 | github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= 5 | github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 6 | github.com/coocood/freecache v1.2.4 h1:UdR6Yz/X1HW4fZOuH0Z94KwG851GWOSknua5VUbb/5M= 7 | github.com/coocood/freecache v1.2.4/go.mod h1:RBUWa/Cy+OHdfTGFEhEuE1pMCMX51Ncizj7rthiQ3vk= 8 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 9 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 10 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 11 | github.com/dgraph-io/ristretto v0.1.1 h1:6CWw5tJNgpegArSHpNHJKldNeq03FQCwYvfMVWajOK8= 12 | github.com/dgraph-io/ristretto v0.1.1/go.mod h1:S1GPSBCYCIhmVNfcth17y2zZtQT6wzkzgwUve0VDWWA= 13 | github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 h1:tdlZCpZ/P9DhczCTSixgIKmwPv6+wP5DGjqLYw5SUiA= 14 | github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= 15 | github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= 16 | github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= 17 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58= 18 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= 19 | github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc= 20 | github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ= 21 | github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= 22 | github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 23 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 24 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 25 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 26 | github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= 27 | github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= 28 | 
golang.org/x/sys v0.0.0-20221010170243-090e33056c14 h1:k5II8e6QD8mITdi+okbbmR/cIyEbeXLBhy5Ha4nevyc= 29 | golang.org/x/sys v0.0.0-20221010170243-090e33056c14/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 30 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 31 | gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= 32 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 33 | -------------------------------------------------------------------------------- /benchmarks/cluster/README.md: -------------------------------------------------------------------------------- 1 | Mesh Bench: 3‑Node Kioshun Cluster Load Test 2 | 3 | Overview 4 | - Spins up a 3‑node Kioshun mesh cluster (no DB) and a separate runner that issues massive concurrent GET/SET operations. 5 | - Measures p50/p95/p99 latencies for GET and SET, logs HIT/MISS (local vs remote), and performs integrity checks to ensure the cluster never returns the wrong object for a key. 6 | 7 | Quick Start 8 | - docker compose -f _benchmarks/cluster/docker-compose.yml up --build 9 | 10 | Services 11 | - node1/node2/node3: Minimal HTTP wrappers around a Kioshun cluster node. 12 | - Endpoints: 13 | - GET /get?k=KEY → 200 with value on hit, 404 on miss. Headers: X-Cache=HIT_LOCAL|HIT_REMOTE|MISS 14 | - POST /set JSON {"k":"KEY","v":"VALUE","ttl_ms":0} → 200 15 | - GET /stats → local shard stats 16 | - runner: Generates high concurrency load and prints percentile latencies and hit ratios. 17 | - direct-runner: Starts 3 kioshun nodes in-process and drives load via the Node API (no HTTP); useful to compare protocol-only performance vs Redis. 18 | 19 | Runner Env Vars 20 | - TARGETS: Comma-separated list of node URLs (default: http://node1:8081,http://node2:8082,http://node3:8083) 21 | - DURATION: Test duration (default: 60s) 22 | - CONCURRENCY: Number of goroutines (default: 512) 23 | - KEYS: Key space size (default: 50000) 24 | - SET_RATIO: Percentage of SET ops (0..100, default: 10) 25 | - LOG_EVERY: Log every N ops per worker (default: 0 = disable) 26 | - STATS_EVERY: Print aggregated node /stats every interval (e.g., 10s). Empty disables. 27 | - SET_TTL_MS: TTL used for all SETs. Use -1 for no expiration across all replicas; positive ms for fixed TTL; avoid 0 if you want consistent TTL across owners. 
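Example invocation (illustrative values; assumes the compose service is named runner as listed under Services):
- docker compose -f _benchmarks/cluster/docker-compose.yml run --rm -e DURATION=30s -e CONCURRENCY=1024 -e SET_RATIO=20 -e SET_TTL_MS=-1 runner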
28 | 29 | Failure Injection (optional) 30 | - KILL_MODE: none | random | target (default: none) 31 | - KILL_AFTER: when to kill (duration, e.g., 45s) 32 | - KILL_TARGET: base URL of node to kill when mode=target (e.g., http://node2:8082) 33 | - KILL_TOKEN: shared token passed to /kill to authorize 34 | 35 | Node Env Vars 36 | - ALLOW_KILL: enable /kill endpoint (default: false) 37 | - KILL_TOKEN: required token to authorize /kill (optional, recommended) 38 | 39 | Read/Write/Failure Tuning (per node) 40 | - REPLICATION_FACTOR: owners per key (default 3) 41 | - WRITE_CONCERN: acks required (default 2) 42 | - READ_MAX_FANOUT: max parallel read legs (default 2) 43 | - READ_PER_TRY_MS: per-leg timeout (ms) 44 | - READ_HEDGE_DELAY_MS: delay before spinning hedges (ms) 45 | - READ_HEDGE_INTERVAL_MS: spacing between hedges (ms) 46 | - WRITE_TIMEOUT_MS: write timeout (ms) 47 | - READ_TIMEOUT_MS: read timeout (ms) 48 | - SUSPICION_AFTER_MS: suspect peer after (ms) 49 | - WEIGHT_UPDATE_MS: ring weight refresh interval (ms) 50 | - GOSSIP_INTERVAL_MS: gossip interval (ms) 51 | 52 | Output 53 | - Prints total ops, hit/miss counts (local/remote), and p50/p95/p99 for GET/SET. 54 | - Performs periodic consistency checks across all nodes and flags mismatches. 55 | - Optionally prints aggregated node stats during the run and at the end. 56 | 57 | Stopping 58 | - The runner traps SIGINT/SIGTERM and will always print the summary on exit. Use Ctrl+C safely. 59 | 60 | Direct Runner 61 | - Build/run: docker compose -f _benchmarks/cluster/docker-compose.yml up --build direct 62 | - Env knobs (same as node tuning + workload): DURATION, CONCURRENCY, KEYS, SET_RATIO, SET_TTL_MS, KILL_AFTER, CACHE_AUTH, READ_* and *_MS vars. 63 | -------------------------------------------------------------------------------- /cluster/lease.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "context" 5 | "sync" 6 | "time" 7 | ) 8 | 9 | type inflight struct { 10 | ch chan struct{} 11 | err error 12 | exp int64 13 | done bool 14 | } 15 | 16 | // leaseTable provides per-key single-flight semantics with a TTL. The first 17 | // goroutine acquires a lease and performs the work, others wait on the channel 18 | // until the lease is released or times out. 19 | type leaseTable struct { 20 | mu sync.Mutex 21 | m map[string]*inflight 22 | ttl time.Duration 23 | stopCh chan struct{} 24 | } 25 | 26 | // newLeaseTable creates a per-key lease table with an optional TTL to break 27 | // stuck leases. A background sweeper closes expired leases when ttl>0. 28 | func newLeaseTable(ttl time.Duration) *leaseTable { 29 | t := &leaseTable{ 30 | m: make(map[string]*inflight), 31 | ttl: ttl, 32 | stopCh: make(chan struct{}), 33 | } 34 | if ttl > 0 { 35 | go t.sweeper() 36 | } 37 | return t 38 | } 39 | 40 | // acquire obtains a lease for key if none exists and returns (lease, true). 41 | // When a lease already exists, returns the existing lease and false. 42 | func (t *leaseTable) acquire(key string) (*inflight, bool) { 43 | t.mu.Lock() 44 | if f, ok := t.m[key]; ok { 45 | t.mu.Unlock() 46 | return f, false 47 | } 48 | f := &inflight{ch: make(chan struct{})} 49 | if t.ttl > 0 { 50 | f.exp = time.Now().Add(t.ttl).UnixNano() 51 | } 52 | t.m[key] = f 53 | t.mu.Unlock() 54 | return f, true 55 | } 56 | 57 | // release removes the lease and notifies waiters with the provided error. 
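//
// Single-flight pattern (sketch; load is a hypothetical loader returning error):
//
//	if _, acquired := lt.acquire(key); acquired {
//	    lt.release(key, load(key)) // run the work once, wake all waiters
//	} else if err := lt.wait(ctx, key); err != nil {
//	    // holder failed, ctx expired, or the lease TTL fired (ErrLeaseTimeout)
//	}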
58 | func (t *leaseTable) release(key string, err error) { 59 | t.mu.Lock() 60 | f, ok := t.m[key] 61 | if ok { 62 | delete(t.m, key) 63 | } 64 | // close channel under the lock to avoid double-close with sweeper 65 | if ok && !f.done { 66 | f.err = err 67 | f.done = true 68 | close(f.ch) 69 | } 70 | t.mu.Unlock() 71 | } 72 | 73 | // wait blocks until the lease for key completes or ctx is done, returning 74 | // the terminal error set by the releaser (nil on success). 75 | func (t *leaseTable) wait(ctx context.Context, key string) error { 76 | t.mu.Lock() 77 | f := t.m[key] 78 | t.mu.Unlock() 79 | if f == nil { 80 | return nil 81 | } 82 | select { 83 | case <-ctx.Done(): 84 | return ctx.Err() 85 | case <-f.ch: 86 | return f.err 87 | } 88 | } 89 | 90 | // sweeper periodically scans for and force-closes expired leases to prevent 91 | // indefinite blocking when holders crash or hang. 92 | func (t *leaseTable) sweeper() { 93 | period := t.ttl / 2 94 | if period <= 0 { 95 | period = 10 * time.Millisecond 96 | } 97 | tick := time.NewTicker(period) 98 | defer tick.Stop() 99 | for { 100 | select { 101 | case <-tick.C: 102 | now := time.Now().UnixNano() 103 | t.mu.Lock() 104 | for k, f := range t.m { 105 | if f.exp > 0 && now >= f.exp { 106 | delete(t.m, k) 107 | if !f.done { 108 | f.err = ErrLeaseTimeout 109 | f.done = true 110 | close(f.ch) 111 | } 112 | } 113 | } 114 | t.mu.Unlock() 115 | case <-t.stopCh: 116 | return 117 | } 118 | } 119 | } 120 | 121 | // Stop shuts down the sweeper goroutine. 122 | func (t *leaseTable) Stop() { 123 | select { 124 | case <-t.stopCh: 125 | return 126 | default: 127 | close(t.stopCh) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /cluster/hlc.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "sync" 5 | "time" 6 | ) 7 | 8 | const ( 9 | hlcLogicalBits = 16 // total logical bits (low end of the 64-bit HLC) 10 | hlcNodeBits = 8 // low bits reserved for nodeID (0..255) 11 | hlcSeqBits = 16 - 8 // remaining logical bits for per-ms sequence 12 | hlcNodeMask = (1 << hlcNodeBits) - 1 13 | hlcSeqMask = (1 << hlcSeqBits) - 1 14 | ) 15 | 16 | // hlc is a 64-bit Hybrid Logical Clock: 17 | // layout: 18 | // [48 bits physical millis][hlcSeqBits seq][hlcNodeBits nodeID]. 19 | type hlc struct { 20 | mu sync.Mutex 21 | physMS int64 22 | seq uint16 23 | nodeID uint16 24 | } 25 | 26 | // newHLC constructs an HLC that embeds a per-node ID in the low logical bits. 27 | // nodeID is masked to hlcNodeBits (e.g., 8 bits -> 0..255). 28 | func newHLC(nodeID uint16) *hlc { 29 | return &hlc{nodeID: nodeID & hlcNodeMask} 30 | } 31 | 32 | // Next returns a strictly monotonic timestamp for local events. 33 | // The result also serves as the LWW version (Ver) for writes that originate locally. 34 | func (h *hlc) Next() uint64 { 35 | now := time.Now().UnixMilli() 36 | 37 | h.mu.Lock() 38 | defer h.mu.Unlock() 39 | 40 | if now > h.physMS { 41 | h.physMS = now 42 | h.seq = 0 43 | } else { 44 | if h.seq < hlcSeqMask { 45 | h.seq++ 46 | } else { 47 | h.physMS++ 48 | h.seq = 0 49 | } 50 | } 51 | v := packHLC(h.physMS, h.seq, h.nodeID) 52 | return v 53 | } 54 | 55 | // Observe incorporates a remote HLC into our state to avoid regressions. 56 | // After Observe(remote), a subsequent Next() will be strictly > remote (monotonic). 
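//
// Layout example (illustrative): packHLC(physMS=0x2, seq=0x3, nodeID=0x4)
// = 0x2<<16 | 0x3<<8 | 0x4 = 0x020304; unpackHLC returns (0x2, 0x0304),
// and splitLogical(0x0304) yields (seq=0x3, nodeID=0x4).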
57 | func (h *hlc) Observe(remote uint64) { 58 | rp, rlog := unpackHLC(remote) 59 | rseq, _ := splitLogical(rlog) 60 | now := time.Now().UnixMilli() 61 | 62 | h.mu.Lock() 63 | defer h.mu.Unlock() 64 | 65 | phys := maxOf(h.physMS, now, rp) 66 | 67 | switch { 68 | case phys == rp && phys == h.physMS: 69 | target := h.seq 70 | if rseq > target { 71 | target = rseq 72 | } 73 | 74 | newSeq := target + 1 75 | if newSeq > hlcSeqMask { 76 | h.physMS = phys + 1 77 | h.seq = 0 78 | } else { 79 | h.physMS = phys 80 | h.seq = newSeq 81 | } 82 | case phys == rp && phys > h.physMS: 83 | newSeq := rseq + 1 84 | if newSeq > hlcSeqMask { 85 | h.physMS = phys + 1 86 | h.seq = 0 87 | } else { 88 | h.physMS = phys 89 | h.seq = newSeq 90 | } 91 | case phys == h.physMS && phys > rp: 92 | if h.seq < hlcSeqMask { 93 | h.seq++ 94 | } else { 95 | h.physMS++ 96 | h.seq = 0 97 | } 98 | 99 | default: 100 | h.physMS = phys 101 | h.seq = 0 102 | } 103 | 104 | } 105 | 106 | // packHLC encodes physical milliseconds and a combined (seq,nodeID) into a 64-bit HLC. 107 | func packHLC(physMS int64, seq uint16, nodeID uint16) uint64 { 108 | logical := ((seq & hlcSeqMask) << hlcNodeBits) | (nodeID & hlcNodeMask) 109 | return (uint64(physMS) << hlcLogicalBits) | uint64(logical) 110 | } 111 | 112 | // unpackHLC decodes a 64-bit HLC into physical milliseconds and the 16-bit logical. 113 | func unpackHLC(ts uint64) (physMS int64, logical uint16) { 114 | return int64(ts >> hlcLogicalBits), uint16(ts & ((1 << hlcLogicalBits) - 1)) 115 | } 116 | 117 | // splitLogical splits the 16-bit logical field into (seq, nodeID). 118 | func splitLogical(logical uint16) (seq uint16, nodeID uint16) { 119 | seq = (logical >> hlcNodeBits) & hlcSeqMask 120 | nodeID = logical & hlcNodeMask 121 | return 122 | } 123 | 124 | func maxOf(a, b, c int64) int64 { 125 | if b > a { 126 | a = b 127 | } 128 | if c > a { 129 | a = c 130 | } 131 | return a 132 | } 133 | -------------------------------------------------------------------------------- /cluster/membership.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "sync" 5 | "sync/atomic" 6 | "time" 7 | ) 8 | 9 | type membership struct { 10 | mu sync.RWMutex 11 | peers map[NodeID]*nodeMeta 12 | seen map[NodeID]int64 13 | epoch uint64 14 | } 15 | 16 | // newMembership creates an empty membership view with per-node metadata and 17 | // last-seen timestamps used for liveness and ring construction. 18 | func newMembership() *membership { 19 | return &membership{ 20 | peers: make(map[NodeID]*nodeMeta), 21 | seen: make(map[NodeID]int64), 22 | } 23 | } 24 | 25 | // snapshot returns copies of peers and seen maps along with the current epoch 26 | // so callers can take a consistent view without holding locks. 27 | func (m *membership) snapshot() (map[NodeID]*nodeMeta, map[NodeID]int64, uint64) { 28 | m.mu.RLock() 29 | defer m.mu.RUnlock() 30 | p := make(map[NodeID]*nodeMeta, len(m.peers)) 31 | for k, v := range m.peers { 32 | p[k] = v 33 | } 34 | 35 | s := make(map[NodeID]int64, len(m.seen)) 36 | for k, v := range m.seen { 37 | s[k] = v 38 | } 39 | return p, s, m.epoch 40 | } 41 | 42 | // integrate merges gossip from a peer: updates address, seen timestamps, 43 | // and tracks the highest epoch to detect cluster resyncs. 
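// Merge rules (summary): epoch only ratchets upward; the sender's Addr and
// seen[from] always refresh; for relayed peers the freshest last-seen
// timestamp wins, so stale gossip never makes a node look older.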
44 | func (m *membership) integrate(from NodeID, addr string, peers []PeerInfo, seen map[string]int64, epoch uint64, now int64) { 45 | m.mu.Lock() 46 | defer m.mu.Unlock() 47 | if epoch > m.epoch { 48 | m.epoch = epoch 49 | } 50 | 51 | if _, ok := m.peers[from]; !ok { 52 | m.peers[from] = newMeta(from, addr) 53 | } else { 54 | m.peers[from].Addr = addr 55 | } 56 | 57 | m.seen[from] = now 58 | 59 | for _, p := range peers { 60 | id := NodeID(p.ID) 61 | if _, ok := m.peers[id]; !ok { 62 | m.peers[id] = newMeta(id, p.Addr) 63 | } else { 64 | m.peers[id].Addr = p.Addr 65 | } 66 | } 67 | 68 | // merge remote observations: keep the freshest timestamp per node. 69 | for k, ts := range seen { 70 | id := NodeID(k) 71 | if old, ok := m.seen[id]; !ok || ts > old { 72 | m.seen[id] = ts 73 | } 74 | } 75 | } 76 | 77 | // alive returns nodes not suspected within the given timeframe. 78 | func (m *membership) alive(now int64, suspicionAfter time.Duration) []*nodeMeta { 79 | m.mu.RLock() 80 | defer m.mu.RUnlock() 81 | 82 | out := make([]*nodeMeta, 0, len(m.peers)) 83 | threshold := now - suspicionAfter.Nanoseconds() 84 | for id, meta := range m.peers { 85 | if m.seen[id] >= threshold { 86 | out = append(out, meta) 87 | } 88 | } 89 | return out 90 | } 91 | 92 | // pruneTombstones removes nodes that have not been seen for tombstoneAfter. 93 | func (m *membership) pruneTombstones(now int64, tombstoneAfter time.Duration) { 94 | m.mu.Lock() 95 | defer m.mu.Unlock() 96 | threshold := now - tombstoneAfter.Nanoseconds() 97 | for id := range m.peers { 98 | if ts, ok := m.seen[id]; ok && ts < threshold { 99 | delete(m.peers, id) 100 | delete(m.seen, id) 101 | } 102 | } 103 | } 104 | 105 | // ensure ensures a node entry exists and bumps its seen timestamp to now. 106 | func (m *membership) ensure(id NodeID, addr string) { 107 | m.mu.Lock() 108 | defer m.mu.Unlock() 109 | if _, ok := m.peers[id]; !ok { 110 | m.peers[id] = newMeta(id, addr) 111 | } 112 | m.seen[id] = time.Now().UnixNano() 113 | } 114 | 115 | // bumpEpoch increments the membership epoch to signal a topology change. 116 | func (m *membership) bumpEpoch() uint64 { 117 | m.mu.Lock() 118 | m.epoch++ 119 | e := m.epoch 120 | m.mu.Unlock() 121 | return e 122 | } 123 | 124 | // setWeight updates the rendezvous weight for a peer. 125 | func (m *membership) setWeight(id NodeID, weight uint64) { 126 | m.mu.RLock() 127 | if meta, ok := m.peers[id]; ok { 128 | atomic.StoreUint64(&meta.weight, weight) 129 | } 130 | m.mu.RUnlock() 131 | } 132 | -------------------------------------------------------------------------------- /cluster/wire.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | // CBOR-based wire protocol: frames carry a CBOR-encoded Base{T,ID} header 4 | // followed by message-specific fields. Keys/values are byte slices; values 5 | // may be gzip-compressed (Cp=true). LWW uses Ver and HLC. 
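// Illustrative request/response pairing (editorial sketch; nextID is a
// hypothetical request-ID generator, not part of this file):
//
//	req := MsgGet{Base: Base{T: MTGet, ID: nextID()}, Key: keyBytes}
//	// the peer answers with a MsgGetResp carrying the same Base.ID,
//	// which is how the caller correlates responses on one connection.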
6 | 7 | type MsgType uint8 8 | 9 | const ( 10 | MTHello MsgType = iota + 1 11 | MTHelloResp 12 | MTGet 13 | MTGetResp 14 | MTGetBulk 15 | MTGetBulkResp 16 | MTSet 17 | MTSetResp 18 | MTSetBulk 19 | MTSetBulkResp 20 | MTDelete 21 | MTDeleteResp 22 | MTLeaseLoad 23 | MTLeaseLoadResp 24 | MTMigratePull 25 | MTGossip 26 | MTBackfillDigestReq MsgType = 200 27 | MTBackfillDigestResp MsgType = 201 28 | MTBackfillKeysReq MsgType = 202 29 | MTBackfillKeysResp MsgType = 203 30 | ) 31 | 32 | // PeerInfo advertises identity + current dialable address 33 | type PeerInfo struct { 34 | ID string `cbor:"i"` 35 | Addr string `cbor:"a"` 36 | } 37 | 38 | type Base struct { 39 | T MsgType `cbor:"t"` 40 | ID uint64 `cbor:"id"` 41 | } 42 | 43 | type MsgHello struct { 44 | Base 45 | FromID string `cbor:"fi"` 46 | FromAddr string `cbor:"fa"` 47 | Token string `cbor:"tok"` 48 | } 49 | 50 | type MsgHelloResp struct { 51 | Base 52 | OK bool `cbor:"ok"` 53 | PeerID string `cbor:"pi"` 54 | Err string `cbor:"err,omitempty"` 55 | } 56 | 57 | type MsgGet struct { 58 | Base 59 | Key []byte `cbor:"k"` 60 | } 61 | type MsgGetResp struct { 62 | Base 63 | Found bool `cbor:"f"` 64 | Val []byte `cbor:"v"` 65 | Exp int64 `cbor:"e"` 66 | Cp bool `cbor:"cp"` 67 | Err string `cbor:"err,omitempty"` 68 | } 69 | 70 | type MsgGetBulk struct { 71 | Base 72 | Keys [][]byte `cbor:"ks"` 73 | } 74 | 75 | type MsgGetBulkResp struct { 76 | Base 77 | Hits []bool `cbor:"h"` 78 | Vals [][]byte `cbor:"vs"` 79 | Exps []int64 `cbor:"es"` 80 | Cps []bool `cbor:"cps"` 81 | Err string `cbor:"err,omitempty"` 82 | } 83 | 84 | type MsgSet struct { 85 | Base 86 | Key []byte `cbor:"k"` 87 | Val []byte `cbor:"v"` 88 | Exp int64 `cbor:"e"` 89 | Ver uint64 `cbor:"ver"` 90 | Cp bool `cbor:"cp"` 91 | } 92 | 93 | type MsgSetResp struct { 94 | Base 95 | OK bool `cbor:"ok"` 96 | Err string `cbor:"err,omitempty"` 97 | } 98 | 99 | type KV struct { 100 | K []byte `cbor:"k"` 101 | V []byte `cbor:"v"` 102 | E int64 `cbor:"e"` 103 | Ver uint64 `cbor:"ver"` 104 | Cp bool `cbor:"cp"` 105 | } 106 | 107 | type MsgSetBulk struct { 108 | Base 109 | Items []KV `cbor:"items"` 110 | } 111 | 112 | type MsgSetBulkResp struct { 113 | Base 114 | OK bool `cbor:"ok"` 115 | Err string `cbor:"err,omitempty"` 116 | } 117 | 118 | type MsgDel struct { 119 | Base 120 | Key []byte `cbor:"k"` 121 | Ver uint64 `cbor:"ver"` 122 | } 123 | 124 | type MsgDelResp struct { 125 | Base 126 | OK bool `cbor:"ok"` 127 | Err string `cbor:"err,omitempty"` 128 | } 129 | 130 | type MsgLeaseLoad struct { 131 | Base 132 | Key []byte `cbor:"k"` 133 | } 134 | 135 | type MsgLeaseLoadResp struct { 136 | Base 137 | Found bool `cbor:"f"` 138 | Val []byte `cbor:"v"` 139 | Exp int64 `cbor:"e"` 140 | Cp bool `cbor:"cp"` 141 | Err string `cbor:"err,omitempty"` 142 | } 143 | 144 | type MsgGossip struct { 145 | Base 146 | FromID string `cbor:"fi"` 147 | FromAddr string `cbor:"fa"` 148 | Seen map[string]int64 `cbor:"sn"` // keys are peer IDs 149 | Peers []PeerInfo `cbor:"pe"` // ID + current address 150 | Load NodeLoad `cbor:"ld"` 151 | TopK []HotKey `cbor:"hh"` 152 | Epoch uint64 `cbor:"ep"` 153 | } 154 | 155 | type NodeLoad struct { 156 | Size int64 `cbor:"sz"` 157 | Evictions int64 `cbor:"ev"` 158 | FreeMemBytes uint64 `cbor:"fm"` 159 | CPUu16 uint16 `cbor:"cpu"` 160 | } 161 | 162 | type HotKey struct { 163 | K []byte `cbor:"k"` 164 | C uint64 `cbor:"c"` 165 | } 166 | -------------------------------------------------------------------------------- /shard.go: 
-------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "sync" 5 | "sync/atomic" 6 | ) 7 | 8 | // shard is a per-partition structure that confines contention; map/list mutations under mu, counters via atomics. 9 | type shard[K comparable, V any] struct { 10 | mu sync.RWMutex 11 | data map[K]*cacheItem[V] 12 | 13 | // Intrusive LRU list sentinels (head.next = MRU, tail.prev = LRU). 14 | // Invariant: head.prev == nil, tail.next == nil, and head↔…↔tail forms the chain. 15 | head *cacheItem[V] 16 | tail *cacheItem[V] 17 | 18 | lfuList *lfuList[K, V] // Allocated only for pure LFU policy. 19 | 20 | size int64 // live items (atomic) 21 | hits int64 // per-shard hits (atomic) 22 | misses int64 // per-shard misses (atomic) 23 | evictions int64 // per-shard evictions (atomic) 24 | expirations int64 // per-shard TTL expirations (atomic) 25 | 26 | // AdmissionLFU-only: shard-local adaptive admission filter. 27 | admission *adaptiveAdmissionFilter 28 | 29 | // Observability: frequency of last evicted victim (AdmissionLFU). 30 | lastVictimFrequency uint64 31 | } 32 | 33 | // initLRU sets up an empty LRU list with head/tail sentinels (no nil checks on operations). 34 | func (s *shard[K, V]) initLRU() { 35 | s.head = &cacheItem[V]{} 36 | s.tail = &cacheItem[V]{} 37 | // head <-> tail (empty) 38 | s.head.next = s.tail 39 | s.tail.prev = s.head 40 | } 41 | 42 | // addToLRUHead inserts item as MRU directly after head (O(1)). 43 | func (s *shard[K, V]) addToLRUHead(item *cacheItem[V]) { 44 | oldNext := s.head.next 45 | // head -> item -> oldNext 46 | s.head.next = item 47 | item.next = oldNext 48 | // head <- item <- oldNext 49 | item.prev = s.head 50 | oldNext.prev = item 51 | } 52 | 53 | // removeFromLRU unlinks item from the list by splicing neighbors; clears item links. 54 | func (s *shard[K, V]) removeFromLRU(item *cacheItem[V]) { 55 | // prev -> next (skip item) 56 | if item.prev != nil { 57 | item.prev.next = item.next 58 | } 59 | if item.next != nil { 60 | item.next.prev = item.prev 61 | } 62 | item.prev = nil 63 | item.next = nil 64 | } 65 | 66 | // moveToLRUHead promotes item to MRU unless already MRU (unlink then insert after head). 67 | func (s *shard[K, V]) moveToLRUHead(item *cacheItem[V]) { 68 | if s.head.next == item { 69 | return 70 | } 71 | // Unlink from current position. 72 | if item.prev != nil { 73 | item.prev.next = item.next 74 | } 75 | if item.next != nil { 76 | item.next.prev = item.prev 77 | } 78 | 79 | // Insert right after head. 80 | oldNext := s.head.next 81 | s.head.next = item 82 | item.prev = s.head 83 | item.next = oldNext 84 | oldNext.prev = item 85 | } 86 | 87 | // cleanup removes expired items and halves per-item frequency for AdmissionLFU (cheap aging). 88 | // Phase 1: collect expired keys and apply in-place aging under write lock. 89 | // Phase 2: delete collected keys, unlink from structures, recycle nodes, update stats. 90 | func (s *shard[K, V]) cleanup(now int64, evictionPolicy EvictionPolicy, itemPool *sync.Pool, statsEnabled bool) { 91 | s.mu.Lock() 92 | defer s.mu.Unlock() 93 | 94 | var keysToDelete []K 95 | for key, item := range s.data { 96 | if item.expireTime > 0 && now > item.expireTime { 97 | keysToDelete = append(keysToDelete, key) 98 | continue 99 | } 100 | // Lightweight aging (AdmissionLFU only) to prevent stale, permanently high frequencies. 
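// For example, an item at frequency 8 decays 8 -> 4 -> 2 -> 1 over
// successive cleanup passes and then stays at 1.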
101 | if evictionPolicy == AdmissionLFU && item.frequency > 1 { 102 | item.frequency >>= 1 103 | } 104 | } 105 | 106 | // Destructive pass over collected keys (re-check existence under lock). 107 | for _, key := range keysToDelete { 108 | if item, exists := s.data[key]; exists { 109 | delete(s.data, key) 110 | s.removeFromLRU(item) 111 | if evictionPolicy == LFU { 112 | s.lfuList.remove(item) 113 | } 114 | itemPool.Put(item) 115 | atomic.AddInt64(&s.size, -1) 116 | if statsEnabled { 117 | atomic.AddInt64(&s.expirations, 1) 118 | } 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /hash_test.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | var testStrings = []string{ 9 | "a", // 1 byte 10 | "test", // 4 bytes 11 | "testkey", // 7 bytes 12 | "testkey1", // 8 bytes 13 | "testkey12", // 9 bytes 14 | "user:profile:12345", // 18 bytes 15 | "cache:session:user:1234567890:data", // 34 bytes 16 | "this:is:a:very:long:cache:key:that:represents:typical:usage:in:high:performance:systems", // 89 bytes 17 | } 18 | 19 | // Test that hash function produces consistent results 20 | func TestHashConsistency(t *testing.T) { 21 | h := newHasher[string]() 22 | 23 | for _, str := range testStrings { 24 | hash1 := h.hash(str) 25 | hash2 := h.hash(str) 26 | 27 | if hash1 != hash2 { 28 | t.Errorf("Hash function not consistent for string %q: got %v and %v", str, hash1, hash2) 29 | } 30 | } 31 | } 32 | 33 | // Test that different strings produce different hashes (basic collision test) 34 | func TestHashDistribution(t *testing.T) { 35 | h := newHasher[string]() 36 | hashes := make(map[uint64]string) 37 | 38 | for _, str := range testStrings { 39 | hash := h.hash(str) 40 | if existing, exists := hashes[hash]; exists { 41 | t.Errorf("Hash collision: %q and %q both hash to %v", str, existing, hash) 42 | } 43 | hashes[hash] = str 44 | } 45 | } 46 | 47 | // Test integer hashing 48 | func TestIntegerHashing(t *testing.T) { 49 | h := newHasher[int]() 50 | 51 | testInts := []int{0, 1, 42, 1000, -1, -42} 52 | hashes := make(map[uint64]int) 53 | 54 | for _, num := range testInts { 55 | hash := h.hash(num) 56 | if existing, exists := hashes[hash]; exists { 57 | t.Errorf("Hash collision: %d and %d both hash to %v", num, existing, hash) 58 | } 59 | hashes[hash] = num 60 | } 61 | } 62 | 63 | func TestHybridThreshold(t *testing.T) { 64 | h := newHasher[string]() 65 | 66 | shortString := "short" // 5 bytes - should use FNV 67 | longString := "this_is_a_very_long_string_that_exceeds_the_threshold_length" // >32 bytes - should use xxHash 68 | 69 | shortHash := h.hash(shortString) 70 | longHash := h.hash(longString) 71 | 72 | if shortHash == 0 || longHash == 0 { 73 | t.Error("Hash functions should not produce zero hashes for non-empty strings") 74 | } 75 | 76 | if shortHash == longHash { 77 | t.Error("Different strings should produce different hashes") 78 | } 79 | } 80 | 81 | func BenchmarkHasherString(t *testing.B) { 82 | h := newHasher[string]() 83 | 84 | for _, str := range testStrings { 85 | t.Run(fmt.Sprintf("len_%d", len(str)), func(b *testing.B) { 86 | b.ResetTimer() 87 | for i := 0; i < b.N; i++ { 88 | _ = h.hash(str) 89 | } 90 | }) 91 | } 92 | } 93 | 94 | func BenchmarkRealisticWorkload(t *testing.B) { 95 | h := newHasher[string]() 96 | 97 | workloadKeys := []string{ 98 | "u:1", // Very short user ID 99 | "user:1234", // Short user key 100 | 
"session:abc123def456", // Medium session key 101 | "cache:user:profile:1234567890", // Long structured key 102 | "api:v1:endpoint:users:get:with:filters:and:pagination:page:1:limit:50", // Very long API key 103 | } 104 | 105 | t.ResetTimer() 106 | for i := 0; i < t.N; i++ { 107 | key := workloadKeys[i%len(workloadKeys)] 108 | _ = h.hash(key) 109 | } 110 | } 111 | 112 | func BenchmarkHashDistribution(t *testing.B) { 113 | h := newHasher[string]() 114 | 115 | // Generate keys with common prefixes to test collision resistance 116 | keys := make([]string, 1000) 117 | for i := range keys { 118 | keys[i] = fmt.Sprintf("user:session:id:%d:data", i) 119 | } 120 | 121 | t.ResetTimer() 122 | collisions := make(map[uint64]int) 123 | for i := 0; i < t.N && i < len(keys); i++ { 124 | hash := h.hash(keys[i]) 125 | collisions[hash]++ 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /cluster/adapter.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "context" 5 | "time" 6 | 7 | cache "github.com/unkn0wn-root/kioshun" 8 | ) 9 | 10 | // DistributedCache adapts a running Node to the cache.Cache interface so 11 | // existing code using kioshun's single-node Cache can switch to the clustered 12 | // backend without invasive changes. Methods that cannot be expressed 13 | // cluster‑wide (e.g., Clear, Size, Stats) operate on the local shard only. 14 | // 15 | // This adapter exposes two usage styles: 16 | // 1. Compatibility methods without context (Set/Get/Delete) that internally 17 | // use Node's configured timeouts (ReadTimeout/WriteTimeout). 18 | // 2. Context-aware methods (SetCtx/GetCtx/DeleteCtx/GetOrLoadCtx) that accept 19 | // a caller-provided context and surface errors. 20 | type DistributedCache[K comparable, V any] struct { 21 | n *Node[K, V] 22 | } 23 | 24 | // NewDistributedCache wraps a started Node and returns a cache.Cache 25 | // compatible adapter. Call node.Start() before using the adapter, and Stop() 26 | // (or Close() on the adapter) during shutdown. 27 | func NewDistributedCache[K comparable, V any](n *Node[K, V]) *DistributedCache[K, V] { 28 | return &DistributedCache[K, V]{n: n} 29 | } 30 | 31 | // Alias for NewDistributedCache 32 | func NewClient[K comparable, V any](n *Node[K, V]) *DistributedCache[K, V] { 33 | return NewDistributedCache[K, V](n) 34 | } 35 | 36 | // getCtx returns a context with timeout derived from node security settings. 37 | func (a *DistributedCache[K, V]) getCtx(write bool) (context.Context, context.CancelFunc) { 38 | to := a.n.cfg.Sec.ReadTimeout 39 | if write { 40 | to = a.n.cfg.Sec.WriteTimeout 41 | } 42 | if to <= 0 { 43 | to = 3 * time.Second 44 | } 45 | return context.WithTimeout(context.Background(), to) 46 | } 47 | 48 | // Set forwards to Node.Set with the configured write timeout. 49 | func (a *DistributedCache[K, V]) Set(key K, value V, ttl time.Duration) error { 50 | ctx, cancel := a.getCtx(true) 51 | defer cancel() 52 | return a.n.Set(ctx, key, value, ttl) 53 | } 54 | 55 | // SetCtx forwards to Node.Set using the provided context. 56 | func (a *DistributedCache[K, V]) SetCtx(ctx context.Context, key K, value V, ttl time.Duration) error { 57 | return a.n.Set(ctx, key, value, ttl) 58 | } 59 | 60 | // Get forwards to Node.Get with the configured read timeout. 
61 | func (a *DistributedCache[K, V]) Get(key K) (V, bool) { 62 | ctx, cancel := a.getCtx(false) 63 | defer cancel() 64 | v, ok, err := a.n.Get(ctx, key) 65 | if err != nil { 66 | var zero V 67 | return zero, false 68 | } 69 | return v, ok 70 | } 71 | 72 | func (a *DistributedCache[K, V]) GetCtx(ctx context.Context, key K) (V, bool, error) { 73 | return a.n.Get(ctx, key) 74 | } 75 | 76 | func (a *DistributedCache[K, V]) Delete(key K) bool { 77 | ctx, cancel := a.getCtx(true) 78 | defer cancel() 79 | return a.n.Delete(ctx, key) == nil 80 | } 81 | 82 | func (a *DistributedCache[K, V]) DeleteCtx(ctx context.Context, key K) error { 83 | return a.n.Delete(ctx, key) 84 | } 85 | 86 | // GetOrLoadCtx delegates to Node.GetOrLoad, enabling single-flight loading via 87 | // the Node's Lease table on the primary owner. This is the preferred interface 88 | // for read-through caching at the application layer. 89 | func (a *DistributedCache[K, V]) GetOrLoadCtx(ctx context.Context, key K, loader func(context.Context) (V, time.Duration, error)) (V, error) { 90 | return a.n.GetOrLoad(ctx, key, loader) 91 | } 92 | 93 | // Clear clears only the local in-memory shard. 94 | // This does not broadcast a cluster-wide clear. 95 | // Callers requiring global invalidation should implement an explicit protocol at a higher layer. 96 | func (a *DistributedCache[K, V]) Clear() { a.n.local.Clear() } 97 | 98 | // Size returns the size of the local shard only. 99 | func (a *DistributedCache[K, V]) Size() int64 { return a.n.local.Size() } 100 | 101 | // Stats returns statistics from the local shard only. 102 | func (a *DistributedCache[K, V]) Stats() cache.Stats { return a.n.local.Stats() } 103 | 104 | // Close stops the node and returns nil. 105 | func (a *DistributedCache[K, V]) Close() error { a.n.Stop(); return nil } 106 | -------------------------------------------------------------------------------- /benchmarks/benchmark_runner.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "os/exec" 7 | "strings" 8 | "time" 9 | ) 10 | 11 | func main() { 12 | fmt.Println("=== KIOSHUN Cache Benchmark Suite ===") 13 | fmt.Println("Running benchmarks against popular Go caches") 14 | fmt.Println() 15 | 16 | benchmarks := []struct { 17 | name string 18 | pattern string 19 | description string 20 | benchtime string 21 | }{ 22 | { 23 | name: "Comparison - Set Operations", 24 | pattern: "BenchmarkCacheComparison_Set", 25 | description: "Pure write performance comparison", 26 | benchtime: "5s", 27 | }, 28 | { 29 | name: "Comparison - Get Operations", 30 | pattern: "BenchmarkCacheComparison_Get", 31 | description: "Pure read performance comparison", 32 | benchtime: "5s", 33 | }, 34 | { 35 | name: "Comparison - Mixed Operations", 36 | pattern: "BenchmarkCacheComparison_Mixed", 37 | description: "Mixed read/write workload comparison", 38 | benchtime: "5s", 39 | }, 40 | { 41 | name: "Comparison - High Contention", 42 | pattern: "BenchmarkCacheComparison_HighContention", 43 | description: "High contention scenario comparison", 44 | benchtime: "5s", 45 | }, 46 | { 47 | name: "Comparison - Read Heavy", 48 | pattern: "BenchmarkCacheComparison_ReadHeavy", 49 | description: "Read-heavy workload comparison", 50 | benchtime: "3s", 51 | }, 52 | { 53 | name: "Comparison - Write Heavy", 54 | pattern: "BenchmarkCacheComparison_WriteHeavy", 55 | description: "Write-heavy workload comparison", 56 | benchtime: "3s", 57 | }, 58 | { 59 | name: "Comparison - Close to 
Real World", 60 | pattern: "BenchmarkCacheComparison_RealWorldWorkload", 61 | description: "Realistic workload patterns", 62 | benchtime: "3s", 63 | }, 64 | { 65 | name: "Heavy Load Tests", 66 | pattern: "BenchmarkCacheHeavyLoad", 67 | description: "Extreme load scenarios for kioshun", 68 | benchtime: "3s", 69 | }, 70 | { 71 | name: "Contention Stress", 72 | pattern: "BenchmarkCacheContentionStress", 73 | description: "High contention stress test for kioshun", 74 | benchtime: "3s", 75 | }, 76 | { 77 | name: "Eviction Stress", 78 | pattern: "BenchmarkCacheEvictionStress", 79 | description: "Heavy eviction testing for kioshun", 80 | benchtime: "3s", 81 | }, 82 | { 83 | name: "Memory Pressure", 84 | pattern: "BenchmarkCacheMemoryPressure", 85 | description: "Memory pressure testing for kioshun", 86 | benchtime: "3s", 87 | }, 88 | { 89 | name: "Sharding Efficiency", 90 | pattern: "BenchmarkCacheShardingEfficiency", 91 | description: "Sharding performance analysis for kioshun", 92 | benchtime: "3s", 93 | }, 94 | } 95 | 96 | totalStart := time.Now() 97 | 98 | for i, bench := range benchmarks { 99 | fmt.Printf("[%d/%d] %s\n", i+1, len(benchmarks), bench.name) 100 | fmt.Printf("Description: %s\n", bench.description) 101 | fmt.Printf("Running: go test -bench=%s -benchmem -benchtime=%s\n", bench.pattern, bench.benchtime) 102 | fmt.Println(strings.Repeat("-", 80)) 103 | 104 | start := time.Now() 105 | 106 | cmd := exec.Command("go", "test", "-bench="+bench.pattern, "-benchmem", "-benchtime="+bench.benchtime, ".") 107 | cmd.Stdout = os.Stdout 108 | cmd.Stderr = os.Stderr 109 | 110 | err := cmd.Run() 111 | 112 | duration := time.Since(start) 113 | 114 | if err != nil { 115 | fmt.Printf(" - Benchmark failed: %v\n", err) 116 | } else { 117 | fmt.Printf(" + Benchmark completed in %v\n", duration) 118 | } 119 | 120 | fmt.Println() 121 | } 122 | 123 | totalDuration := time.Since(totalStart) 124 | fmt.Printf("🏁 All benchmarks completed in %v\n", totalDuration) 125 | fmt.Println() 126 | fmt.Println("=== Summary ===") 127 | fmt.Println("The benchmarks compare kioshun cache against:") 128 | fmt.Println("- Ristretto (by Dgraph)") 129 | fmt.Println("- BigCache (by Allegro)") 130 | fmt.Println("- FreeCache (by Coocood)") 131 | fmt.Println("- Go-cache (by PatrickMN)") 132 | fmt.Println() 133 | fmt.Println("Key performance areas tested:") 134 | fmt.Println("- Pure read/write performance") 135 | fmt.Println("- Mixed workload scenarios") 136 | fmt.Println("- High contention handling") 137 | fmt.Println("- Memory efficiency") 138 | fmt.Println("- Eviction policy performance") 139 | fmt.Println("- Sharding effectiveness") 140 | fmt.Println("- Scalability under load") 141 | } 142 | -------------------------------------------------------------------------------- /lfu.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | // freqNode is a doubly-linked bucket for one exact frequency. 4 | // non-sentinel empty buckets are removed eagerly so head.next is the current min. 5 | type freqNode[K comparable, V any] struct { 6 | freq int64 // exact frequency (>= 0); head sentinel uses 0 7 | items map[*cacheItem[V]]struct{} // set of items at this frequency 8 | prev *freqNode[K, V] 9 | next *freqNode[K, V] 10 | } 11 | 12 | // lfuList is an LFU index of ascending-frequency buckets (sentinel head at freq==0). 13 | // O(1) add/increment/remove; used under the shard lock. 
14 | type lfuList[K comparable, V any] struct { 15 | head *freqNode[K, V] 16 | freqMap map[int64]*freqNode[K, V] // freq → bucket 17 | itemFreq map[*cacheItem[V]]*freqNode[K, V] // item → bucket 18 | } 19 | 20 | // newLFUList creates a circular list with a freq==0 sentinel; sentinel holds no real items. 21 | func newLFUList[K comparable, V any]() *lfuList[K, V] { 22 | list := &lfuList[K, V]{ 23 | head: &freqNode[K, V]{freq: 0, items: make(map[*cacheItem[V]]struct{})}, 24 | freqMap: make(map[int64]*freqNode[K, V]), 25 | itemFreq: make(map[*cacheItem[V]]*freqNode[K, V]), 26 | } 27 | list.head.next = list.head 28 | list.head.prev = list.head 29 | return list 30 | } 31 | 32 | // add inserts item with frequency=1 and indexes it. 33 | func (l *lfuList[K, V]) add(item *cacheItem[V]) { 34 | freq := int64(1) 35 | item.frequency = freq 36 | 37 | node := l.getOrCreateFreqNode(freq) 38 | node.items[item] = struct{}{} 39 | l.itemFreq[item] = node 40 | } 41 | 42 | // increment bumps item's frequency by 1, moves it to the correct bucket, and removes an empty old bucket. 43 | func (l *lfuList[K, V]) increment(item *cacheItem[V]) { 44 | cur := l.itemFreq[item] 45 | if cur == nil { 46 | // Item not indexed yet (defensive); treat as new with freq=1. 47 | l.add(item) 48 | return 49 | } 50 | 51 | newFreq := cur.freq + 1 52 | delete(cur.items, item) 53 | 54 | nxt := cur.next 55 | var target *freqNode[K, V] 56 | if nxt != l.head && nxt.freq == newFreq { 57 | // Fast path: the next bucket already has the desired frequency. 58 | target = nxt 59 | } else { 60 | // Create or find the exact bucket at newFreq right after 'cur'. 61 | target = l.ensureIndex(cur, newFreq) 62 | } 63 | target.items[item] = struct{}{} 64 | l.itemFreq[item] = target 65 | item.frequency = newFreq 66 | 67 | // Drop the old bucket if it is now empty (sentinel node is never removed). 68 | if len(cur.items) == 0 && cur.freq != 0 { 69 | l.removeFreqNode(cur) 70 | } 71 | } 72 | 73 | // removeLFU removes and returns one item from the minimum-frequency bucket 74 | // unlinks the bucket if it becomes empty. 75 | func (l *lfuList[K, V]) removeLFU() *cacheItem[V] { 76 | node := l.head.next 77 | if node == l.head { 78 | return nil // list is empty 79 | } 80 | // By invariant, non-sentinel buckets are never empty. 81 | var victim *cacheItem[V] 82 | for it := range node.items { 83 | victim = it 84 | break 85 | } 86 | 87 | delete(node.items, victim) 88 | delete(l.itemFreq, victim) 89 | if len(node.items) == 0 { 90 | l.removeFreqNode(node) 91 | } 92 | return victim 93 | } 94 | 95 | // remove deletes a specific item from its bucket and removes the bucket if it becomes empty (non-sentinel). 96 | func (l *lfuList[K, V]) remove(item *cacheItem[V]) { 97 | node := l.itemFreq[item] 98 | if node == nil { 99 | return // item not tracked 100 | } 101 | 102 | delete(node.items, item) 103 | delete(l.itemFreq, item) 104 | 105 | if len(node.items) == 0 && node.freq != 0 { 106 | l.removeFreqNode(node) 107 | } 108 | } 109 | 110 | // ensureIndex returns the bucket for freq, inserting a new bucket immediately after prev to keep ascending order. 111 | func (l *lfuList[K, V]) ensureIndex(prev *freqNode[K, V], freq int64) *freqNode[K, V] { 112 | // Exact-hit fast path via freqMap. 
113 | if node, ok := l.freqMap[freq]; ok { 114 | return node 115 | } 116 | 117 | newNode := &freqNode[K, V]{ 118 | freq: freq, 119 | items: make(map[*cacheItem[V]]struct{}), 120 | } 121 | 122 | nxt := prev.next 123 | prev.next = newNode 124 | newNode.prev = prev 125 | newNode.next = nxt 126 | nxt.prev = newNode 127 | 128 | l.freqMap[freq] = newNode 129 | return newNode 130 | } 131 | 132 | // getOrCreateFreqNode returns the bucket for freq, inserting after freq-1 (or after head for freq==1). 133 | func (l *lfuList[K, V]) getOrCreateFreqNode(freq int64) *freqNode[K, V] { 134 | if freq == 1 { 135 | return l.ensureIndex(l.head, 1) 136 | } 137 | // Insert immediately after the previous frequency bucket. 138 | prev := l.freqMap[freq-1] 139 | return l.ensureIndex(prev, freq) 140 | } 141 | 142 | // removeFreqNode unlinks an empty non-sentinel bucket and drops its freq map entry. 143 | func (l *lfuList[K, V]) removeFreqNode(node *freqNode[K, V]) { 144 | node.prev.next = node.next 145 | node.next.prev = node.prev 146 | delete(l.freqMap, node.freq) 147 | } 148 | -------------------------------------------------------------------------------- /manager.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | ) 7 | 8 | // GlobalManager holds global cache instances 9 | var GlobalManager = NewManager() 10 | 11 | // Manager manages multiple named cache instances with different configurations 12 | type Manager struct { 13 | caches sync.Map // map of cache instances by name 14 | configs map[string]Config 15 | configMu sync.RWMutex 16 | } 17 | 18 | // NewManager creates a new cache manager instance 19 | func NewManager() *Manager { 20 | return &Manager{ 21 | configs: make(map[string]Config), 22 | } 23 | } 24 | 25 | // RegisterCache registers a configuration for a named cache. 26 | // Returns an error if a configuration with the same name already exists. 27 | func (m *Manager) RegisterCache(name string, config Config) error { 28 | m.configMu.Lock() 29 | defer m.configMu.Unlock() 30 | 31 | if _, exists := m.configs[name]; exists { 32 | return newCacheError("register", name, ErrCacheExists) 33 | } 34 | 35 | m.configs[name] = config 36 | return nil 37 | } 38 | 39 | // GetCache retrieves an existing cache or creates a new one with the registered 40 | // configuration. If no configuration is registered, uses DefaultConfig(). 
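// Illustrative usage (editorial sketch; "sessions" is a hypothetical name):
//
//	m := NewManager()
//	_ = m.RegisterCache("sessions", DefaultConfig())
//	c, err := GetCache[string, int](m, "sessions")
//	if err == nil {
//		_ = c // ready-to-use *InMemoryCache[string, int]
//	}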
41 | func GetCache[K comparable, V any](m *Manager, name string) (*InMemoryCache[K, V], error) { 42 | // Fast path: return existing cache if found (most common case) 43 | if cached, ok := m.caches.Load(name); ok { 44 | if cache, ok := cached.(*InMemoryCache[K, V]); ok { 45 | return cache, nil 46 | } 47 | return nil, newCacheError("get", name, ErrTypeMismatch) 48 | } 49 | 50 | m.configMu.RLock() 51 | config, exists := m.configs[name] 52 | m.configMu.RUnlock() 53 | 54 | if !exists { 55 | config = DefaultConfig() 56 | } 57 | 58 | // Slow path: create new cache 59 | cache := New[K, V](config) 60 | // Atomic LoadOrStore handles race condition where multiple goroutines 61 | // attempt to create the same cache simultaneously 62 | if actual, loaded := m.caches.LoadOrStore(name, cache); loaded { 63 | // Another goroutine created the cache first 64 | cache.Close() 65 | // Return the winner's cache if types match 66 | if existingCache, ok := actual.(*InMemoryCache[K, V]); ok { 67 | return existingCache, nil 68 | } 69 | return nil, newCacheError("get", name, ErrTypeMismatch) 70 | } 71 | 72 | return cache, nil 73 | } 74 | 75 | // GetCacheStats returns performance statistics for all managed caches. 76 | func (m *Manager) GetCacheStats() map[string]Stats { 77 | stats := make(map[string]Stats) 78 | 79 | m.caches.Range(func(key, value any) bool { 80 | if name, ok := key.(string); ok { 81 | if cache, ok := value.(interface{ Stats() Stats }); ok { 82 | stats[name] = cache.Stats() 83 | } 84 | } 85 | return true 86 | }) 87 | 88 | return stats 89 | } 90 | 91 | // CloseAll closes all managed cache instances and returns any errors encountered. 92 | // Uses two-phase approach: first close all caches, then clear the registry. 93 | func (m *Manager) CloseAll() error { 94 | var closeErrors []error 95 | 96 | // Phase 1: Close all cache instances 97 | m.caches.Range(func(key, value any) bool { 98 | if cache, ok := value.(interface{ Close() error }); ok { 99 | if err := cache.Close(); err != nil { 100 | if name, ok := key.(string); ok { 101 | closeErrors = append(closeErrors, newCacheError("close", name, err)) 102 | } else { 103 | closeErrors = append(closeErrors, wrapError("close", err)) 104 | } 105 | } 106 | } 107 | return true 108 | }) 109 | 110 | // Phase 2: Clear all registry entries 111 | m.caches.Range(func(key, _ any) bool { 112 | m.caches.Delete(key) 113 | return true 114 | }) 115 | 116 | if len(closeErrors) > 0 { 117 | return fmt.Errorf("errors closing caches: %v", closeErrors) 118 | } 119 | 120 | return nil 121 | } 122 | 123 | // RemoveCache removes and closes the named cache instance. 124 | // Removes from registry and cleans up both runtime and configuration state. 125 | func (m *Manager) RemoveCache(name string) error { 126 | if cached, ok := m.caches.LoadAndDelete(name); ok { 127 | if cache, ok := cached.(interface{ Close() error }); ok { 128 | return cache.Close() 129 | } 130 | } 131 | 132 | m.configMu.Lock() 133 | delete(m.configs, name) 134 | m.configMu.Unlock() 135 | 136 | return nil 137 | } 138 | 139 | // RegisterGlobalCache registers a configuration in the global manager. 140 | func RegisterGlobalCache(name string, config Config) error { 141 | return GlobalManager.RegisterCache(name, config) 142 | } 143 | 144 | // GetGlobalCache retrieves or creates a cache from the global manager. 
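// Illustrative one-liner (editorial sketch with a hypothetical name):
//
//	c, err := GetGlobalCache[string, []byte]("responses")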
145 | func GetGlobalCache[K comparable, V any](name string) (*InMemoryCache[K, V], error) { 146 | return GetCache[K, V](GlobalManager, name) 147 | } 148 | 149 | // GetGlobalCacheStats returns stats for all caches in the global manager. 150 | func GetGlobalCacheStats() map[string]Stats { 151 | return GlobalManager.GetCacheStats() 152 | } 153 | 154 | // CloseAllGlobalCaches closes all caches in the global manager. 155 | func CloseAllGlobalCaches() error { 156 | return GlobalManager.CloseAll() 157 | } 158 | -------------------------------------------------------------------------------- /benchmarks/cluster/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | networks: 4 | mesh: 5 | driver: bridge 6 | 7 | services: 8 | node1: 9 | build: 10 | context: ../.. 11 | dockerfile: _benchmarks/cluster/node/Dockerfile 12 | container_name: kioshun-mesh-node1 13 | environment: 14 | - PORT=8081 15 | - CACHE_BIND=:5011 16 | - CACHE_PUBLIC=node1:5011 17 | - CACHE_SEEDS=node1:5011,node2:5012,node3:5013 18 | - CACHE_AUTH=supersecret 19 | - ALLOW_KILL=true 20 | - KILL_TOKEN=letmein 21 | # Optional tuning (uncomment/tune as needed) 22 | # - REPLICATION_FACTOR=3 23 | # - WRITE_CONCERN=2 24 | - READ_MAX_FANOUT=1 25 | # - READ_PER_TRY_MS=100 26 | - READ_HEDGE_DELAY_MS=2 27 | - READ_HEDGE_INTERVAL_MS=2 28 | - WRITE_TIMEOUT_MS=2000 29 | # - READ_TIMEOUT_MS=3000 30 | # - SUSPICION_AFTER_MS=1500 31 | - WEIGHT_UPDATE_MS=500 32 | - GOSSIP_INTERVAL_MS=300 33 | networks: [mesh] 34 | healthcheck: 35 | test: ["CMD-SHELL", "wget -qO- http://localhost:8081/ready >/dev/null 2>&1 || exit 1"] 36 | interval: 2s 37 | timeout: 1s 38 | retries: 30 39 | restart: unless-stopped 40 | 41 | node2: 42 | build: 43 | context: ../.. 44 | dockerfile: _benchmarks/cluster/node/Dockerfile 45 | container_name: kioshun-mesh-node2 46 | environment: 47 | - PORT=8082 48 | - CACHE_BIND=:5012 49 | - CACHE_PUBLIC=node2:5012 50 | - CACHE_SEEDS=node1:5011,node2:5012,node3:5013 51 | - CACHE_AUTH=supersecret 52 | - ALLOW_KILL=true 53 | - KILL_TOKEN=letmein 54 | # Optional tuning (uncomment/tune as needed) 55 | # - REPLICATION_FACTOR=3 56 | # - WRITE_CONCERN=2 57 | - READ_MAX_FANOUT=1 58 | # - READ_PER_TRY_MS=100 59 | - READ_HEDGE_DELAY_MS=2 60 | - READ_HEDGE_INTERVAL_MS=2 61 | - WRITE_TIMEOUT_MS=2000 62 | # - READ_TIMEOUT_MS=3000 63 | # - SUSPICION_AFTER_MS=1500 64 | - WEIGHT_UPDATE_MS=500 65 | - GOSSIP_INTERVAL_MS=300 66 | networks: [mesh] 67 | healthcheck: 68 | test: ["CMD-SHELL", "wget -qO- http://localhost:8082/ready >/dev/null 2>&1 || exit 1"] 69 | interval: 2s 70 | timeout: 1s 71 | retries: 30 72 | restart: unless-stopped 73 | 74 | node3: 75 | build: 76 | context: ../.. 
77 | dockerfile: _benchmarks/cluster/node/Dockerfile 78 | container_name: kioshun-mesh-node3 79 | environment: 80 | - PORT=8083 81 | - CACHE_BIND=:5013 82 | - CACHE_PUBLIC=node3:5013 83 | - CACHE_SEEDS=node1:5011,node2:5012,node3:5013 84 | - CACHE_AUTH=supersecret 85 | - ALLOW_KILL=true 86 | - KILL_TOKEN=letmein 87 | # Optional tuning (uncomment/tune as needed) 88 | # - REPLICATION_FACTOR=3 89 | # - WRITE_CONCERN=2 90 | - READ_MAX_FANOUT=1 91 | # - READ_PER_TRY_MS=100 92 | - READ_HEDGE_DELAY_MS=2 93 | - READ_HEDGE_INTERVAL_MS=2 94 | - WRITE_TIMEOUT_MS=2000 95 | # - READ_TIMEOUT_MS=3000 96 | # - SUSPICION_AFTER_MS=1500 97 | - WEIGHT_UPDATE_MS=500 98 | - GOSSIP_INTERVAL_MS=300 99 | networks: [mesh] 100 | healthcheck: 101 | test: ["CMD-SHELL", "wget -qO- http://localhost:8083/ready >/dev/null 2>&1 || exit 1"] 102 | interval: 2s 103 | timeout: 1s 104 | retries: 30 105 | restart: unless-stopped 106 | 107 | runner: 108 | build: 109 | context: ../.. 110 | dockerfile: _benchmarks/cluster/runner/Dockerfile 111 | container_name: kioshun-mesh-runner 112 | environment: 113 | - TARGETS=http://node1:8081,http://node2:8082,http://node3:8083 114 | - DURATION=120s 115 | - CONCURRENCY=256 116 | - KEYS=50000 117 | - SET_RATIO=30 118 | - LOG_EVERY=0 119 | - STATS_EVERY=10s 120 | - SET_TTL_MS=-1 121 | - KILL_MODE=none # default: clean run; set to "random" or "target" to enable failures 122 | # - KILL_AFTER=45s 123 | # - KILL_TARGET=http://node2:8082 124 | # - KILL_TOKEN=letmein 125 | depends_on: 126 | node1: 127 | condition: service_healthy 128 | node2: 129 | condition: service_healthy 130 | node3: 131 | condition: service_healthy 132 | networks: [mesh] 133 | restart: "no" 134 | 135 | direct: 136 | build: 137 | context: ../.. 138 | dockerfile: _benchmarks/cluster/direct/Dockerfile 139 | container_name: kioshun-direct-runner 140 | environment: 141 | - DURATION=120s 142 | - CONCURRENCY=256 143 | - KEYS=50000 144 | - SET_RATIO=30 145 | - SET_TTL_MS=-1 146 | # Optional: node tuning 147 | # - READ_MAX_FANOUT=1 148 | # - READ_PER_TRY_MS=100 149 | # - READ_HEDGE_DELAY_MS=2 150 | # - READ_HEDGE_INTERVAL_MS=2 151 | # - WRITE_TIMEOUT_MS=2000 152 | # - SUSPICION_AFTER_MS=1500 153 | # - WEIGHT_UPDATE_MS=500 154 | # - GOSSIP_INTERVAL_MS=300 155 | # Failure injection 156 | # - KILL_AFTER=45s 157 | networks: [mesh] 158 | restart: "no" 159 | -------------------------------------------------------------------------------- /cluster/heat.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "hash/maphash" 5 | "sync" 6 | "sync/atomic" 7 | ) 8 | 9 | type cmSketch struct { 10 | rows [][]uint32 11 | seeds []maphash.Seed 12 | width int 13 | } 14 | 15 | // newCMS constructs a Count-Min sketch with the given rows and width. 16 | // Collisions are acceptable. It serves as a lightweight frequency estimator. 17 | func newCMS(rows, width int) *cmSketch { 18 | s := &cmSketch{ 19 | rows: make([][]uint32, rows), 20 | seeds: make([]maphash.Seed, rows), 21 | width: width, 22 | } 23 | 24 | for i := 0; i < rows; i++ { 25 | s.rows[i] = make([]uint32, width) 26 | s.seeds[i] = maphash.MakeSeed() 27 | } 28 | return s 29 | } 30 | 31 | // add increments counters for the given key across all rows. 
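// Illustrative property (editorial sketch): counters only ever
// overestimate, so after
//
//	s := newCMS(4, 1<<16)
//	s.add([]byte("k"), 3)
//
// the minimum of "k"'s four row counters is >= 3, with equality unless
// some other key collided with "k" in every row.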
32 | func (c *cmSketch) add(key []byte, n uint32) { 33 | for i := range c.rows { 34 | var h maphash.Hash 35 | h.SetSeed(c.seeds[i]) 36 | h.Write(key) 37 | idx := h.Sum64() % uint64(c.width) 38 | c.rows[i][idx] += n 39 | } 40 | } 41 | 42 | type ssEntry struct { 43 | K string 44 | C uint64 45 | } 46 | 47 | type spaceSaving struct { 48 | mu sync.Mutex 49 | cap int // capacity k 50 | h []*ssEntry // min-heap by C 51 | idx map[string]int // key -> index in heap 52 | } 53 | 54 | // newSpaceSaving builds a Space-Saving top-k structure storing at most k keys. 55 | func newSpaceSaving(k int) *spaceSaving { 56 | if k < 1 { 57 | k = 1 58 | } 59 | return &spaceSaving{ 60 | cap: k, 61 | idx: make(map[string]int, k), 62 | } 63 | } 64 | 65 | // add updates the estimated frequency of key using Space-Saving rules. When 66 | // full, it replaces the current minimum counter with the new key. 67 | func (s *spaceSaving) add(k []byte, inc uint64) { 68 | key := string(k) 69 | s.mu.Lock() 70 | defer s.mu.Unlock() 71 | 72 | // existing key: increment and fix heap. 73 | if i, ok := s.idx[key]; ok { 74 | e := s.h[i] 75 | e.C = addSat64(e.C, inc) 76 | s.siftDown(i) // count increased; min-heap needs siftDown 77 | return 78 | } 79 | 80 | // room available: insert new node. 81 | if len(s.h) < s.cap { 82 | e := &ssEntry{K: key, C: inc} 83 | s.h = append(s.h, e) 84 | i := len(s.h) - 1 85 | s.idx[key] = i 86 | s.siftUp(i) 87 | return 88 | } 89 | 90 | // full: replace current min with (key, minC + inc). Reuse the min node (no alloc). 91 | minIdx := 0 92 | minNode := s.h[minIdx] 93 | oldKey := minNode.K 94 | delete(s.idx, oldKey) 95 | 96 | minNode.K = key 97 | minNode.C = addSat64(minNode.C, inc) // space-Saving rule: newC = minC + inc 98 | s.idx[key] = minIdx 99 | s.siftDown(minIdx) 100 | } 101 | 102 | // export returns the current top-k keys and approximate counts. 103 | func (s *spaceSaving) export() []HotKey { 104 | s.mu.Lock() 105 | defer s.mu.Unlock() 106 | out := make([]HotKey, 0, len(s.h)) 107 | for _, e := range s.h { 108 | out = append(out, HotKey{K: []byte(e.K), C: e.C}) 109 | } 110 | return out 111 | } 112 | 113 | // siftUp restores the min-heap property by moving the node at i up toward the 114 | // root while its count is less than its parent. 115 | func (s *spaceSaving) siftUp(i int) { 116 | for i > 0 { 117 | p := (i - 1) / 2 118 | if s.h[p].C <= s.h[i].C { 119 | break 120 | } 121 | s.swap(i, p) 122 | i = p 123 | } 124 | } 125 | 126 | // siftDown restores the min-heap property by moving the node at i down to the 127 | // smallest child while its count is greater than that child. 128 | func (s *spaceSaving) siftDown(i int) { 129 | n := len(s.h) 130 | for { 131 | l := 2*i + 1 132 | if l >= n { 133 | return 134 | } 135 | 136 | small := l 137 | r := l + 1 138 | if r < n && s.h[r].C < s.h[l].C { 139 | small = r 140 | } 141 | 142 | if s.h[i].C <= s.h[small].C { 143 | return 144 | } 145 | s.swap(i, small) 146 | i = small 147 | } 148 | } 149 | 150 | // swap exchanges two heap nodes and updates their indices in the map. 151 | func (s *spaceSaving) swap(i, j int) { 152 | s.h[i], s.h[j] = s.h[j], s.h[i] 153 | s.idx[s.h[i].K] = i 154 | s.idx[s.h[j].K] = j 155 | } 156 | 157 | // addSat64 adds two uint64 numbers with saturation at max uint64 on overflow. 
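// For example: addSat64(2, 3) == 5, while addSat64(^uint64(0)-1, 5)
// clamps to ^uint64(0) instead of wrapping.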
158 | func addSat64(a, b uint64) uint64 { 159 | c := a + b 160 | if c < a { 161 | return ^uint64(0) 162 | } 163 | return c 164 | } 165 | 166 | type heat struct { 167 | cms *cmSketch 168 | ss *spaceSaving 169 | sampleN uint32 170 | ctr uint32 171 | } 172 | 173 | // newHeat ties CM-sketch and Space-Saving together and supports sampling to 174 | // reduce overhead under very high request rates. 175 | func newHeat(rows, width, k, sampleN int) *heat { 176 | return &heat{ 177 | cms: newCMS(rows, width), 178 | ss: newSpaceSaving(k), 179 | sampleN: uint32(sampleN), 180 | } 181 | } 182 | 183 | // sample records a key access at a reduced rate (1/sampleN). When sampleN=1, 184 | // every key is recorded. 185 | func (h *heat) sample(key []byte) { 186 | if h.sampleN <= 1 { 187 | h.cms.add(key, 1) 188 | h.ss.add(key, 1) 189 | return 190 | } 191 | if atomic.AddUint32(&h.ctr, 1)%h.sampleN == 0 { 192 | h.cms.add(key, 1) 193 | h.ss.add(key, 1) 194 | } 195 | } 196 | 197 | func (h *heat) exportTopK() []HotKey { 198 | return h.ss.export() 199 | } 200 | -------------------------------------------------------------------------------- /cluster/bf_rpc.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "sort" 7 | "time" 8 | ) 9 | 10 | func absExpiryAt(base time.Time, ttl time.Duration) int64 { 11 | if ttl <= 0 { 12 | return 0 13 | } 14 | return base.Add(ttl).UnixNano() 15 | } 16 | 17 | // rpcBackfillDigest builds digests for the requested prefix depth considering 18 | // only keys that the target node should own (according to this donor's ring). 19 | // It returns per-bucket counts and XOR(hash^version) so the joiner can detect 20 | // which buckets differ and page only those keys. 21 | func (n *Node[K, V]) rpcBackfillDigest(req MsgBackfillDigestReq) MsgBackfillDigestResp { 22 | depth := int(req.Depth) 23 | if depth <= 0 || depth > 8 { 24 | depth = 2 25 | } 26 | 27 | r := n.ring.Load().(*ring) 28 | 29 | targetID := NodeID(req.TargetID) 30 | if !r.hasID(targetID) { 31 | return MsgBackfillDigestResp{ 32 | Base: Base{T: MTBackfillDigestResp, ID: req.ID}, 33 | Depth: uint8(depth), 34 | NotInRing: true, 35 | } 36 | } 37 | 38 | type agg struct { 39 | c uint32 40 | h uint64 41 | } 42 | 43 | // Aggregate per-bucket count and XOR(hash^version) to detect 44 | // differences between donor and joiner without shipping all keys. 
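// e.g. with Depth=2 the top two bytes of the big-endian key hash pick
// the bucket, so at most 65536 digests summarize the donor's keyspace.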
45 | buckets := make(map[string]agg, 1<<12) 46 | 47 | keys := n.local.Keys() 48 | for _, k := range keys { 49 | h64 := n.hash64Of(k) 50 | if !r.ownsHash(targetID, h64) { 51 | continue 52 | } 53 | 54 | var ver uint64 55 | if n.cfg.LWWEnabled { 56 | kb := n.kc.EncodeKey(k) 57 | n.verMu.RLock() 58 | ver = n.version[string(kb)] 59 | n.verMu.RUnlock() 60 | } 61 | 62 | var hb [8]byte 63 | binary.BigEndian.PutUint64(hb[:], h64) 64 | prefix := string(hb[:depth]) 65 | 66 | a := buckets[prefix] 67 | a.c++ 68 | a.h ^= (h64 ^ ver) 69 | buckets[prefix] = a 70 | } 71 | 72 | out := make([]BucketDigest, 0, len(buckets)) 73 | for p, a := range buckets { 74 | out = append(out, BucketDigest{Prefix: []byte(p), Count: a.c, Hash64: a.h}) 75 | } 76 | sort.Slice(out, func(i, j int) bool { return bytes.Compare(out[i].Prefix, out[j].Prefix) < 0 }) 77 | return MsgBackfillDigestResp{ 78 | Base: Base{T: MTBackfillDigestResp, ID: req.ID}, 79 | Depth: uint8(depth), 80 | Buckets: out, 81 | } 82 | } 83 | 84 | // rpcBackfillKeys returns the next page of keys within a given hash-prefix 85 | // bucket that the target should own, ordered by 64-bit key hash. Pagination is 86 | // driven by the last 8-byte hash cursor provided by the caller. Values may be 87 | // compressed, and expirations are converted to absolute nanoseconds. 88 | func (n *Node[K, V]) rpcBackfillKeys(req MsgBackfillKeysReq) MsgBackfillKeysResp { 89 | prefix := req.Prefix 90 | depth := len(prefix) 91 | if depth <= 0 || depth > 8 { 92 | return MsgBackfillKeysResp{Base: Base{T: MTBackfillKeysResp, ID: req.ID}, Done: true} 93 | } 94 | 95 | r := n.ring.Load().(*ring) 96 | 97 | targetID := NodeID(req.TargetID) 98 | if !r.hasID(targetID) { 99 | return MsgBackfillKeysResp{ 100 | Base: Base{T: MTBackfillKeysResp, ID: req.ID}, 101 | Done: true, 102 | NotInRing: true, 103 | } 104 | } 105 | 106 | limit := req.Limit 107 | if limit <= 0 || limit > 4096 { 108 | limit = 1024 109 | } 110 | 111 | // decode cursor (last key-hash). The donor walks keys by hash order 112 | // inside a bucket to provide consistent pagination. 113 | var after uint64 114 | if len(req.Cursor) == 8 { 115 | after = binary.BigEndian.Uint64(req.Cursor) 116 | } 117 | 118 | type row struct { 119 | h uint64 120 | k K 121 | kb []byte 122 | } 123 | rows := make([]row, 0, limit*2) 124 | 125 | keys := n.local.Keys() 126 | for _, k := range keys { 127 | h64 := n.hash64Of(k) 128 | 129 | var hb [8]byte 130 | binary.BigEndian.PutUint64(hb[:], h64) 131 | if !bytes.Equal(hb[:depth], prefix) { 132 | continue 133 | } 134 | 135 | if !r.ownsHash(targetID, h64) || h64 <= after { 136 | continue 137 | } 138 | rows = append(rows, row{h: h64, k: k, kb: n.kc.EncodeKey(k)}) 139 | } 140 | 141 | // sort by key-hash to respect the cursor pagination. 142 | sort.Slice(rows, func(i, j int) bool { return rows[i].h < rows[j].h }) 143 | if len(rows) > limit { 144 | rows = rows[:limit] 145 | } 146 | 147 | items := make([]KV, 0, len(rows)) 148 | now := time.Now() 149 | for _, r := range rows { 150 | v, ttl, ok := n.local.GetWithTTL(r.k) 151 | if !ok { 152 | continue 153 | } 154 | 155 | bv, _ := n.codec.Encode(v) 156 | b2, cp := n.maybeCompress(bv) 157 | 158 | var ver uint64 159 | if n.cfg.LWWEnabled { 160 | n.verMu.RLock() 161 | ver = n.version[string(r.kb)] 162 | n.verMu.RUnlock() 163 | } 164 | 165 | // 0 means no expiration. 
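// e.g. ttl <= 0 yields abs == 0 (never expires); ttl == 5m yields
// now+5m as absolute UnixNano.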
166 | abs := absExpiryAt(now, ttl) 167 | items = append(items, KV{ 168 | K: append([]byte(nil), r.kb...), 169 | V: append([]byte(nil), b2...), 170 | E: abs, 171 | Ver: ver, 172 | Cp: cp, 173 | }) 174 | } 175 | 176 | resp := MsgBackfillKeysResp{ 177 | Base: Base{T: MTBackfillKeysResp, ID: req.ID}, 178 | Items: items, 179 | Done: len(items) == 0, 180 | } 181 | if len(rows) > 0 { 182 | var next [8]byte 183 | binary.BigEndian.PutUint64(next[:], rows[len(rows)-1].h) 184 | resp.NextCursor = append([]byte(nil), next[:]...) 185 | } 186 | return resp 187 | } 188 | -------------------------------------------------------------------------------- /cluster/rendezvous.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "math/bits" 5 | "sort" 6 | "sync/atomic" 7 | 8 | "github.com/cespare/xxhash/v2" 9 | ) 10 | 11 | type nodeMeta struct { 12 | ID NodeID 13 | Addr string 14 | weight uint64 // scaled 0..1_000_000 15 | salt uint64 // per-node salt (pre-hashed ID) 16 | } 17 | 18 | // newMeta initializes per-node rendezvous metadata with a default weight and 19 | // a precomputed salt derived from the node ID. 20 | func newMeta(id NodeID, addr string) *nodeMeta { 21 | return &nodeMeta{ 22 | ID: id, Addr: addr, 23 | weight: 500_000, 24 | salt: xxhash.Sum64String(string(id)), 25 | } 26 | } 27 | 28 | // Weight returns the current scaled weight as a [0,1] float. 29 | func (n *nodeMeta) Weight() float64 { 30 | return float64(atomic.LoadUint64(&n.weight)) / 1_000_000.0 31 | } 32 | 33 | type ring struct { 34 | nodes []*nodeMeta 35 | rf int 36 | } 37 | 38 | func newRing(rf int) *ring { return &ring{rf: rf} } 39 | 40 | // ownersFromKeyHash returns the top rf owners for a 64-bit key hash using 41 | // weighted rendezvous hashing. Node salt keeps per-node independence. 42 | func (r *ring) ownersFromKeyHash(keyHash uint64) []*nodeMeta { 43 | type pair struct { 44 | s uint64 // rendezvous score 45 | w uint64 // scaled weight (0..1_000_000) 46 | n *nodeMeta 47 | } 48 | arr := make([]pair, 0, len(r.nodes)) 49 | for _, nm := range r.nodes { 50 | arr = append(arr, pair{ 51 | s: mix64(keyHash ^ nm.salt), 52 | w: atomic.LoadUint64(&nm.weight), // snapshot once 53 | n: nm, 54 | }) 55 | } 56 | 57 | less := func(i, j int) bool { 58 | hi1, lo1 := bits.Mul64(arr[i].s, arr[i].w) 59 | hi2, lo2 := bits.Mul64(arr[j].s, arr[j].w) 60 | if hi1 != hi2 { 61 | return hi1 > hi2 // higher product first 62 | } 63 | if lo1 != lo2 { 64 | return lo1 > lo2 65 | } 66 | return arr[i].n.ID < arr[j].n.ID // tie-break 67 | } 68 | sort.Slice(arr, less) 69 | 70 | n := r.rf 71 | if n > len(arr) { 72 | n = len(arr) 73 | } 74 | out := make([]*nodeMeta, n) 75 | for i := 0; i < n; i++ { 76 | out[i] = arr[i].n 77 | } 78 | return out 79 | } 80 | 81 | // ownersTopNFromKeyHash returns the top N candidates by weighted rendezvous 82 | // score. Used for hot-key shadowing beyond rf. 83 | func (r *ring) ownersTopNFromKeyHash(keyHash uint64, n int) []*nodeMeta { 84 | // variant that returns the top-N candidates for hot-key shadowing. 85 | // Use the same integer 128-bit ranking as ownersFromKeyHash for 86 | // consistent ordering and tie-breaking. 
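// e.g. for equal scores s, bits.Mul64(s, 1_000_000) outranks
// bits.Mul64(s, 500_000), so higher-weight nodes win the slot.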
87 | type pair struct { 88 | s uint64 // rendezvous score 89 | w uint64 // scaled weight (0..1_000_000) 90 | n *nodeMeta 91 | } 92 | 93 | arr := make([]pair, 0, len(r.nodes)) 94 | for _, nm := range r.nodes { 95 | arr = append(arr, pair{ 96 | s: mix64(keyHash ^ nm.salt), 97 | w: atomic.LoadUint64(&nm.weight), 98 | n: nm, 99 | }) 100 | } 101 | 102 | less := func(i, j int) bool { 103 | hi1, lo1 := bits.Mul64(arr[i].s, arr[i].w) 104 | hi2, lo2 := bits.Mul64(arr[j].s, arr[j].w) 105 | if hi1 != hi2 { 106 | return hi1 > hi2 107 | } 108 | if lo1 != lo2 { 109 | return lo1 > lo2 110 | } 111 | return arr[i].n.ID < arr[j].n.ID 112 | } 113 | sort.Slice(arr, less) 114 | 115 | if n > len(arr) { 116 | n = len(arr) 117 | } 118 | 119 | out := make([]*nodeMeta, n) 120 | for i := 0; i < n; i++ { 121 | out[i] = arr[i].n 122 | } 123 | return out 124 | } 125 | 126 | // ownsHash reports whether selfID is among the top-rf owners for keyHash. 127 | func (r *ring) ownsHash(selfID NodeID, keyHash uint64) bool { 128 | if len(r.nodes) == 0 || r.rf <= 0 { 129 | return false 130 | } 131 | top := r.rf 132 | if top > len(r.nodes) { 133 | top = len(r.nodes) 134 | } 135 | 136 | type slot struct { 137 | hi, lo uint64 // 128-bit product of (score * weight) 138 | n *nodeMeta 139 | } 140 | best := make([]slot, 0, top) 141 | worst := 0 142 | 143 | worse := func(a slot, b slot) bool { 144 | if a.hi != b.hi { 145 | return a.hi < b.hi 146 | } 147 | if a.lo != b.lo { 148 | return a.lo < b.lo 149 | } 150 | return a.n.ID > b.n.ID 151 | } 152 | 153 | for _, nm := range r.nodes { 154 | s := mix64(keyHash ^ nm.salt) 155 | w := atomic.LoadUint64(&nm.weight) 156 | hi, lo := bits.Mul64(s, w) 157 | 158 | if len(best) < top { 159 | best = append(best, slot{hi: hi, lo: lo, n: nm}) 160 | if len(best) == 1 || worse(best[len(best)-1], best[worst]) { 161 | worst = len(best) - 1 162 | } 163 | continue 164 | } 165 | 166 | if !worse(slot{hi, lo, nm}, best[worst]) { 167 | best[worst] = slot{hi: hi, lo: lo, n: nm} 168 | // recompute worst 169 | worst = 0 170 | for i := 1; i < len(best); i++ { 171 | if worse(best[i], best[worst]) { 172 | worst = i 173 | } 174 | } 175 | } 176 | } 177 | 178 | for _, sl := range best { 179 | if sl.n.ID == selfID { 180 | return true 181 | } 182 | } 183 | return false 184 | } 185 | 186 | // mix64: fast 64-bit mixer (SplitMix64 finalizer). 187 | func mix64(x uint64) uint64 { 188 | x ^= x >> 30 189 | x *= 0xbf58476d1ce4e5b9 190 | x ^= x >> 27 191 | x *= 0x94d049bb133111eb 192 | x ^= x >> 31 193 | return x 194 | } 195 | 196 | // hasID returns true when the ID participates in this ring view. 197 | func (r *ring) hasID(id NodeID) bool { 198 | for _, nm := range r.nodes { 199 | if nm.ID == id { 200 | return true 201 | } 202 | } 203 | return false 204 | } 205 | -------------------------------------------------------------------------------- /internal/httpcache/index.go: -------------------------------------------------------------------------------- 1 | package httpcache 2 | 3 | import ( 4 | "strings" 5 | "sync" 6 | ) 7 | 8 | const ( 9 | rootPath = "/" 10 | pathSeparator = "/" 11 | wildcardChar = "*" 12 | ) 13 | 14 | // PatternNode represents a single node in the path tree. 15 | type PatternNode struct { 16 | children map[string]*PatternNode 17 | keys map[string]bool 18 | } 19 | 20 | // PatternIndex maintains a tree structure that maps URL paths to cache keys. 
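// Illustrative usage (editorial sketch with hypothetical keys):
//
//	pi := NewPatternIndex()
//	pi.AddKey("/api/users", "GET:/api/users")
//	pi.AddKey("/api/users/42", "GET:/api/users/42")
//	keys := pi.GetMatchingKeys("/api/*") // returns both keys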
21 | type PatternIndex struct { 22 | mu sync.RWMutex 23 | root *PatternNode 24 | } 25 | 26 | func NewPatternIndex() *PatternIndex { 27 | return &PatternIndex{root: newPatternNode()} 28 | } 29 | 30 | func newPatternNode() *PatternNode { 31 | return &PatternNode{ 32 | children: make(map[string]*PatternNode), 33 | keys: make(map[string]bool), 34 | } 35 | } 36 | 37 | // DefaultPathExtractor returns an empty string; override to map keys to paths. 38 | func DefaultPathExtractor(key string) string { return "" } 39 | 40 | // NormalizePath converts a URL path into a normalized slice of path segments 41 | // Empty path handling: 42 | // - Empty string converts to root path ("/") 43 | // - Ensures all paths have a canonical representation 44 | // 45 | // Path cleaning process: 46 | // 1. Trim leading and trailing slashes to remove "/path/" -> "path" 47 | // 2. After trimming, empty result indicates root path (returns empty slice) 48 | // 3. Split remaining path by separator into individual segments 49 | // 4. Filter out empty segments caused by double slashes ("//") or malformed paths 50 | // 51 | // Examples: 52 | // - "" -> [] 53 | // - "/" -> [] 54 | // - "/api/v1/" -> ["api", "v1"] 55 | // - "//api//v1//" -> ["api", "v1"] 56 | // - "api/v1" -> ["api", "v1"] 57 | // 58 | // This normalization ensures that equivalent paths (with different slash patterns) 59 | // map to the same tree location, preventing duplicate entries 60 | func normalizePath(path string) []string { 61 | if path == "" { 62 | path = rootPath 63 | } 64 | 65 | trimmed := strings.Trim(path, pathSeparator) 66 | if trimmed == "" { 67 | return []string{} 68 | } 69 | 70 | segments := strings.Split(trimmed, pathSeparator) 71 | result := make([]string, 0, len(segments)) 72 | for _, seg := range segments { 73 | if seg != "" { 74 | result = append(result, seg) 75 | } 76 | } 77 | return result 78 | } 79 | 80 | // AddKey associates a cache key with a specific path in the trie. 81 | func (pi *PatternIndex) AddKey(path, key string) { 82 | pi.mu.Lock() 83 | defer pi.mu.Unlock() 84 | 85 | node := pi.root 86 | segments := normalizePath(path) 87 | for _, s := range segments { 88 | if node.children[s] == nil { 89 | node.children[s] = newPatternNode() 90 | } 91 | node = node.children[s] 92 | } 93 | node.keys[key] = true 94 | } 95 | 96 | // RemoveKey removes a cache key from the specified path. 97 | func (pi *PatternIndex) RemoveKey(path, key string) { 98 | pi.mu.Lock() 99 | defer pi.mu.Unlock() 100 | 101 | segments := normalizePath(path) 102 | node := pi.findNode(segments) 103 | if node != nil { 104 | delete(node.keys, key) 105 | } 106 | } 107 | 108 | // GetMatchingKeys returns all cache keys that match the given pattern 109 | // 110 | // Exact path matching: 111 | // - Pattern without '*' suffix matches only keys stored at that exact path 112 | // - Uses findNode() to locate the specific tree node 113 | // - Collects only keys stored directly at the target node 114 | // - Example: "/api/users" matches keys at exactly "/api/users" 115 | // 116 | // Wildcard pattern matching: 117 | // - Pattern ending with '*' enables prefix-based subtree matching 118 | // - Strips the '*' suffix and finds the base path node 119 | // - Recursively collects keys from the base node and all descendant nodes 120 | // - Example: "/api/*" matches keys at "/api", "/api/users", "/api/users/123", etc. 
121 | func (pi *PatternIndex) GetMatchingKeys(pattern string) []string { 122 | pi.mu.RLock() 123 | defer pi.mu.RUnlock() 124 | 125 | if pattern == "" { 126 | pattern = rootPath 127 | } 128 | 129 | isWildcard := strings.HasSuffix(pattern, wildcardChar) 130 | if isWildcard { 131 | pattern = strings.TrimSuffix(pattern, wildcardChar) 132 | } 133 | 134 | segments := normalizePath(pattern) 135 | node := pi.findNode(segments) 136 | if node == nil { 137 | return nil 138 | } 139 | if isWildcard { 140 | return pi.collectAllKeys(node) 141 | } 142 | return pi.collectDirectKeys(node) 143 | } 144 | 145 | func (pi *PatternIndex) findNode(segments []string) *PatternNode { 146 | node := pi.root 147 | for _, s := range segments { 148 | next := node.children[s] 149 | if next == nil { 150 | return nil 151 | } 152 | node = next 153 | } 154 | return node 155 | } 156 | 157 | func (pi *PatternIndex) collectDirectKeys(node *PatternNode) []string { 158 | if len(node.keys) == 0 { 159 | return nil 160 | } 161 | 162 | keys := make([]string, 0, len(node.keys)) 163 | for k := range node.keys { 164 | keys = append(keys, k) 165 | } 166 | return keys 167 | } 168 | 169 | func (pi *PatternIndex) collectAllKeys(node *PatternNode) []string { 170 | var keys []string 171 | for k := range node.keys { 172 | keys = append(keys, k) 173 | } 174 | for _, c := range node.children { 175 | keys = append(keys, pi.collectAllKeys(c)...) 176 | } 177 | return keys 178 | } 179 | 180 | func (pi *PatternIndex) Clear() { 181 | pi.mu.Lock() 182 | pi.root = newPatternNode() 183 | pi.mu.Unlock() 184 | } 185 | -------------------------------------------------------------------------------- /cluster/config.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "crypto/tls" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/cespare/xxhash/v2" 9 | ) 10 | 11 | type NodeID string 12 | 13 | type TLSMode struct { 14 | Enable bool 15 | CertFile string 16 | KeyFile string 17 | CAFile string 18 | RequireClientCert bool 19 | MinVersion uint16 20 | PreferServerCipherSuites bool 21 | CipherSuites []uint16 22 | CurvePreferences []tls.CurveID 23 | } 24 | 25 | type Security struct { 26 | AuthToken string 27 | TLS TLSMode 28 | MaxFrameSize int 29 | MaxKeySize int 30 | MaxValueSize int 31 | ReadTimeout time.Duration 32 | WriteTimeout time.Duration 33 | IdleTimeout time.Duration 34 | MaxInflightPerPeer int 35 | CompressionThreshold int 36 | LeaseLoadQPS int 37 | ReadBufSize int 38 | WriteBufSize int 39 | AllowUnauthenticatedClients bool 40 | // MaxConcurrentHandshakes caps simultaneous TLS handshakes. 
41 | // > 0 : fixed cap 42 | // = 0 : auto => max(64, 32*GOMAXPROCS) 43 | // < 0 : disabled (no gating) 44 | MaxConcurrentHandshakes int 45 | } 46 | 47 | type DropPolicy uint8 48 | 49 | const ( 50 | DropOldest DropPolicy = iota 51 | DropNewest 52 | DropNone 53 | ) 54 | 55 | type HandoffConfig struct { 56 | Enable *bool 57 | Pause bool 58 | MaxItems int 59 | MaxBytes int64 60 | PerPeerCap int 61 | PerPeerBytes int64 62 | TTL time.Duration 63 | ReplayRPS int 64 | DropPolicy DropPolicy 65 | AutopauseItems int 66 | AutopauseBytes int64 67 | } 68 | 69 | func (h *HandoffConfig) IsEnabled() bool { 70 | return h.Enable == nil || *h.Enable 71 | } 72 | 73 | func (h *HandoffConfig) FillDefaults() { 74 | if h.Enable == nil { 75 | b := true 76 | h.Enable = &b 77 | } 78 | if !*h.Enable { 79 | return 80 | } 81 | if h.DropPolicy == 0 { 82 | h.DropPolicy = DropOldest 83 | } 84 | if h.MaxItems == 0 { 85 | h.MaxItems = 500_000 86 | } 87 | if h.MaxBytes == 0 { 88 | h.MaxBytes = 2 << 30 // ~2 GiB 89 | } 90 | if h.PerPeerCap == 0 { 91 | h.PerPeerCap = 50_000 92 | } 93 | if h.PerPeerBytes == 0 { 94 | h.PerPeerBytes = 512 << 20 // ~512 MiB 95 | } 96 | if h.TTL <= 0 { 97 | h.TTL = 10 * time.Minute 98 | } 99 | if h.ReplayRPS <= 0 { 100 | h.ReplayRPS = 20_000 101 | } 102 | if h.AutopauseItems == 0 && h.MaxItems > 0 { 103 | h.AutopauseItems = h.MaxItems * 9 / 10 104 | } 105 | if h.AutopauseBytes == 0 && h.MaxBytes > 0 { 106 | h.AutopauseBytes = int64(h.MaxBytes * 9 / 10) 107 | } 108 | } 109 | 110 | func BoolPtr(b bool) *bool { return &b } 111 | 112 | type Config struct { 113 | ID NodeID 114 | BindAddr string 115 | PublicURL string 116 | Seeds []string 117 | ReplicationFactor int 118 | WriteConcern int 119 | // Client read tuning 120 | ReadMaxFanout int 121 | ReadHedgeDelay time.Duration 122 | ReadHedgeInterval time.Duration 123 | ReadPerTryTimeout time.Duration 124 | GossipInterval time.Duration 125 | SuspicionAfter time.Duration 126 | TombstoneAfter time.Duration 127 | WeightUpdate time.Duration 128 | HotsetPeriod time.Duration 129 | HotsetSize int 130 | MirrorTTL time.Duration 131 | LeaseTTL time.Duration 132 | RebalanceInterval time.Duration 133 | BackfillInterval time.Duration 134 | RebalanceLimit int 135 | Sec Security 136 | LWWEnabled bool 137 | PerConnWorkers int 138 | PerConnQueue int 139 | 140 | Handoff HandoffConfig 141 | } 142 | 143 | func Default() Config { 144 | return Config{ 145 | ReplicationFactor: 2, 146 | WriteConcern: 1, 147 | ReadMaxFanout: 2, 148 | ReadHedgeDelay: 3 * time.Millisecond, 149 | ReadHedgeInterval: 3 * time.Millisecond, 150 | ReadPerTryTimeout: 200 * time.Millisecond, 151 | GossipInterval: 500 * time.Millisecond, 152 | SuspicionAfter: 2 * time.Second, 153 | TombstoneAfter: 30 * time.Second, 154 | WeightUpdate: 1 * time.Second, 155 | HotsetPeriod: 2 * time.Second, 156 | HotsetSize: 1024, 157 | MirrorTTL: 30 * time.Second, 158 | LeaseTTL: 300 * time.Millisecond, 159 | RebalanceInterval: 2 * time.Second, 160 | BackfillInterval: 30 * time.Second, 161 | RebalanceLimit: 500, 162 | Sec: Security{ 163 | MaxFrameSize: 4 << 20, 164 | MaxKeySize: 128 << 10, 165 | MaxValueSize: 2 << 20, 166 | ReadTimeout: 3 * time.Second, 167 | WriteTimeout: 3 * time.Second, 168 | IdleTimeout: 10 * time.Second, 169 | MaxInflightPerPeer: 256, 170 | CompressionThreshold: 64 << 10, 171 | LeaseLoadQPS: 0, 172 | ReadBufSize: 32 << 10, 173 | WriteBufSize: 32 << 10, 174 | TLS: TLSMode{ 175 | PreferServerCipherSuites: true, 176 | }, 177 | AllowUnauthenticatedClients: true, 178 | MaxConcurrentHandshakes: 0, 179 | }, 180 | 
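// Per-connection dispatch defaults below: 64 workers draining a 128-entry queue.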
PerConnWorkers: 64, 181 | PerConnQueue: 128, 182 | LWWEnabled: true, 183 | 184 | Handoff: HandoffConfig{ 185 | Enable: BoolPtr(true), 186 | Pause: false, 187 | MaxItems: 500_000, 188 | MaxBytes: 2 << 30, 189 | PerPeerCap: 50_000, 190 | PerPeerBytes: 512 << 20, 191 | TTL: 10 * time.Minute, 192 | ReplayRPS: 20_000, 193 | DropPolicy: DropOldest, 194 | AutopauseItems: 500_000 * 9 / 10, 195 | AutopauseBytes: int64((2 << 30) * 9 / 10), 196 | }, 197 | } 198 | } 199 | 200 | // EnsureID assigns a stable ID when not provided. 201 | // Default: 16-hex digest of PublicURL. 202 | func (c *Config) EnsureID() { 203 | if c.ID != "" { 204 | return 205 | } 206 | sum := xxhash.Sum64String(c.PublicURL) 207 | c.ID = NodeID(fmt.Sprintf("%016x", sum)) 208 | } 209 | -------------------------------------------------------------------------------- /hash.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "encoding/binary" 5 | "fmt" 6 | "math/bits" 7 | ) 8 | 9 | // xxHash64 (seed=0) tuned for cache hot paths: 10 | // preserves spec mixing, uses FNV-1a for tiny strings, and applies avalanche-only for ints. 11 | const ( 12 | // xxHash64 primes (spec). 13 | prime64_1 = 0x9E3779B185EBCA87 14 | prime64_2 = 0xC2B2AE3D27D4EB4F 15 | prime64_3 = 0x165667B19E3779F9 16 | prime64_4 = 0x85EBCA77C2B2AE63 17 | prime64_5 = 0x27D4EB2F165667C5 18 | 19 | // Precomputed seeds for seed=0 (v1 and v4 initial values per spec). 20 | seed64_1 = 0x60EA27EEADC0B5D6 // prime64_1 + prime64_2 21 | seed64_4 = 0x61C8864E7A143579 // -prime64_1 (two's complement) 22 | 23 | // Size/rotation params (spec). 24 | largeInputThreshold = 32 25 | 26 | roundRotation = 31 27 | mergeRotation = 27 28 | smallRotation = 23 29 | tinyRotation = 11 30 | 31 | // Avalanche xor-shifts (order matters). 32 | avalancheShift1 = 33 33 | avalancheShift2 = 29 34 | avalancheShift3 = 32 35 | 36 | // Lane-combine rotations (spec). 37 | v1Rotation = 1 38 | v2Rotation = 7 39 | v3Rotation = 12 40 | v4Rotation = 18 41 | ) 42 | 43 | // Strategy heuristic for string keys: ≤8B → FNV-1a, >8B → xxHash64. 44 | const ( 45 | stringByteLength = 8 46 | ) 47 | 48 | // hasher provides type-specialized hashing without reflection (stateless, goroutine-safe). 49 | type hasher[K comparable] struct{} 50 | 51 | // newHasher returns a tiny value-type hasher for K (no captured state). 52 | func newHasher[K comparable]() hasher[K] { 53 | return hasher[K]{} 54 | } 55 | 56 | // hash routes by key type: 57 | // ints → avalanche-only, strings → FNV/xxHash by length, 58 | // others → formatted string then string hashing. 59 | func (h hasher[K]) hash(key K) uint64 { 60 | switch k := any(key).(type) { 61 | case string: 62 | return h.hashString(k) 63 | case int: 64 | return xxHash64Avalanche(uint64(k)) 65 | case int32: 66 | return xxHash64Avalanche(uint64(k)) 67 | case int64: 68 | return xxHash64Avalanche(uint64(k)) 69 | case uint: 70 | return xxHash64Avalanche(uint64(k)) 71 | case uint32: 72 | return xxHash64Avalanche(uint64(k)) 73 | case uint64: 74 | return xxHash64Avalanche(k) 75 | default: 76 | // Fallback allocates; prefer native K types to avoid it. 77 | return h.hashString(fmt.Sprintf("%v", k)) 78 | } 79 | } 80 | 81 | // hashString selects FNV-1a for very short strings, xxHash64 otherwise. 
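// For example, a 7-byte key like "user:42" takes the fnvHash64 path, while a
// 14-byte key like "session:abcdef" goes through xxHash64 (keys illustrative).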
82 | func (h hasher[K]) hashString(s string) uint64 { 83 | if len(s) <= stringByteLength { 84 | return fnvHash64(s) 85 | } 86 | return xxHash64(s) 87 | } 88 | 89 | // xxHash64 computes xxHash64(seed=0) for a string; 90 | // ≥32B uses 4-lane path, smaller uses small-input path, then avalanche. 91 | func xxHash64(input string) uint64 { 92 | data := []byte(input) 93 | length := len(data) 94 | 95 | var h64 uint64 96 | if length >= largeInputThreshold { 97 | h64 = xxHash64Large(data, uint64(length)) 98 | } else { 99 | h64 = prime64_5 + uint64(length) // small-input init 100 | h64 = xxHash64Small(data, h64) 101 | } 102 | 103 | return xxHash64Avalanche(h64) 104 | } 105 | 106 | // xxHash64Large processes 32B blocks with four accumulators, combines lanes, then finalizes the tail. 107 | func xxHash64Large(data []byte, length uint64) uint64 { 108 | // Seed accumulators for seed=0. 109 | v1 := uint64(seed64_1) 110 | v2 := uint64(prime64_2) 111 | v3 := uint64(0) 112 | v4 := uint64(seed64_4) 113 | 114 | // 32B per iteration (8B per lane). 115 | for len(data) >= largeInputThreshold { 116 | v1 = xxHash64Round(v1, binary.LittleEndian.Uint64(data[0:8])) 117 | v2 = xxHash64Round(v2, binary.LittleEndian.Uint64(data[8:16])) 118 | v3 = xxHash64Round(v3, binary.LittleEndian.Uint64(data[16:24])) 119 | v4 = xxHash64Round(v4, binary.LittleEndian.Uint64(data[24:32])) 120 | data = data[largeInputThreshold:] 121 | } 122 | 123 | // Combine lanes with distinct rotations, then merge rounds. 124 | h64 := bits.RotateLeft64(v1, v1Rotation) + 125 | bits.RotateLeft64(v2, v2Rotation) + 126 | bits.RotateLeft64(v3, v3Rotation) + 127 | bits.RotateLeft64(v4, v4Rotation) 128 | 129 | h64 = xxHash64MergeRound(h64, v1) 130 | h64 = xxHash64MergeRound(h64, v2) 131 | h64 = xxHash64MergeRound(h64, v3) 132 | h64 = xxHash64MergeRound(h64, v4) 133 | 134 | h64 += length 135 | return xxHash64Finalize(data, h64) 136 | } 137 | 138 | // xxHash64Small forwards small inputs directly to finalization. 139 | func xxHash64Small(data []byte, h64 uint64) uint64 { 140 | return xxHash64Finalize(data, h64) 141 | } 142 | 143 | // xxHash64Round is one per-lane round: (acc + input*prime2) → rot(31) → *prime1. 144 | func xxHash64Round(acc, input uint64) uint64 { 145 | acc += input * prime64_2 146 | acc = bits.RotateLeft64(acc, roundRotation) 147 | acc *= prime64_1 148 | return acc 149 | } 150 | 151 | // xxHash64MergeRound folds a lane into the main hash during lane combination. 152 | func xxHash64MergeRound(h64, val uint64) uint64 { 153 | val = xxHash64Round(0, val) 154 | h64 ^= val 155 | h64 = h64*prime64_1 + prime64_4 156 | return h64 157 | } 158 | 159 | // xxHash64Finalize folds tail bytes (8B → 4B → 1B) and applies the final avalanche. 160 | func xxHash64Finalize(data []byte, h64 uint64) uint64 { 161 | for len(data) >= 8 { 162 | k1 := binary.LittleEndian.Uint64(data[0:8]) 163 | k1 = xxHash64Round(0, k1) 164 | h64 ^= k1 165 | h64 = bits.RotateLeft64(h64, mergeRotation)*prime64_1 + prime64_4 166 | data = data[8:] 167 | } 168 | 169 | if len(data) >= 4 { 170 | k1 := uint64(binary.LittleEndian.Uint32(data[0:4])) 171 | h64 ^= k1 * prime64_1 172 | h64 = bits.RotateLeft64(h64, smallRotation)*prime64_2 + prime64_3 173 | data = data[4:] 174 | } 175 | 176 | // Final 0..3 bytes. 
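// Each leftover byte is scaled by prime64_5 and XORed in, then the hash is
// rotated by 11 bits (tinyRotation) and multiplied by prime64_1 — the spec's
// 1-byte tail step.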
177 | for len(data) > 0 { 178 | k1 := uint64(data[0]) 179 | h64 ^= k1 * prime64_5 180 | h64 = bits.RotateLeft64(h64, tinyRotation) * prime64_1 181 | data = data[1:] 182 | } 183 | 184 | return h64 185 | } 186 | 187 | // xxHash64Avalanche performs the final xor-shift/multiply chain to enforce avalanche behavior. 188 | func xxHash64Avalanche(h64 uint64) uint64 { 189 | h64 ^= h64 >> avalancheShift1 190 | h64 *= prime64_2 191 | h64 ^= h64 >> avalancheShift2 192 | h64 *= prime64_3 193 | h64 ^= h64 >> avalancheShift3 194 | return h64 195 | } 196 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines. 4 | 5 | ### [0.3.15](https://github.com/unkn0wn-root/kioshun/compare/v0.3.14...v0.3.15) (2025-10-30) 6 | 7 | ### [0.3.14](https://github.com/unkn0wn-root/kioshun/compare/v0.3.13...v0.3.14) (2025-10-29) 8 | 9 | ### [0.3.13](https://github.com/unkn0wn-root/kioshun/compare/v0.3.12...v0.3.13) (2025-10-29) 10 | 11 | 12 | ### Bug Fixes 13 | 14 | * expiration race when upgrading shard lock during get ([dcbc8f0](https://github.com/unkn0wn-root/kioshun/commit/dcbc8f08b883f60f6617519db8929991e7d7a42c)) 15 | 16 | ### [0.3.12](https://github.com/unkn0wn-root/kioshun/compare/v0.3.11...v0.3.12) (2025-10-29) 17 | 18 | ### [0.3.11](https://github.com/unkn0wn-root/kioshun/compare/v0.3.10...v0.3.11) (2025-09-07) 19 | 20 | 21 | ### Features 22 | 23 | * add MaxConcurrentHandshakes and resoect container limits with GOMAXPROCS ([bfbcc78](https://github.com/unkn0wn-root/kioshun/commit/bfbcc7842fc3fd0dcf3c4db654ee6fbadf406f96)) 24 | * **cluster:** use ID instead of PublicURL ([e5e9bb3](https://github.com/unkn0wn-root/kioshun/commit/e5e9bb39b201fc434293800b385dc0a5f04bfc2c)) 25 | 26 | 27 | ### Bug Fixes 28 | 29 | * tests after publicurl to node id change ([d9e7f83](https://github.com/unkn0wn-root/kioshun/commit/d9e7f837ff264a4db6b32c4caed9bcb3e18e2667)) 30 | 31 | ### [0.3.10](https://github.com/unkn0wn-root/kioshun/compare/v0.3.9...v0.3.10) (2025-09-05) 32 | 33 | 34 | ### Bug Fixes 35 | 36 | * **node:** make close idempotent ([4c183b5](https://github.com/unkn0wn-root/kioshun/commit/4c183b54db016e836074017f9f8b476594e171d3)) 37 | 38 | ### [0.3.9](https://github.com/unkn0wn-root/kioshun/compare/v0.3.8...v0.3.9) (2025-09-05) 39 | 40 | 41 | ### Features 42 | 43 | * **admission:** drop atomics since we hold lock in cache ([1667b84](https://github.com/unkn0wn-root/kioshun/commit/1667b8492a7102ff7d434e2032e9963a3b51dc9e)) 44 | * **client:** if local is primary and key exists, serve locally ([6adb9e2](https://github.com/unkn0wn-root/kioshun/commit/6adb9e2f74ddc5de4b2f08056fb2eef8ab2a20ca)) 45 | 46 | ### [0.3.8](https://github.com/unkn0wn-root/kioshun/compare/v0.3.7...v0.3.8) (2025-09-05) 47 | 48 | 49 | ### Features 50 | 51 | * add NotInRing to indicate that donor is not ready yet ([0fd11f5](https://github.com/unkn0wn-root/kioshun/commit/0fd11f5fd7db04edb9d15782f8063771dd25f0c3)) 52 | 53 | ### [0.3.7](https://github.com/unkn0wn-root/kioshun/compare/v0.3.6...v0.3.7) (2025-09-05) 54 | 55 | 56 | ### Bug Fixes 57 | 58 | * **node:** add itself to ring ([cc96554](https://github.com/unkn0wn-root/kioshun/commit/cc965545264e6be40e99720a037fd283710344f3)) 59 | 60 | ### 
[0.3.6](https://github.com/unkn0wn-root/kioshun/compare/v0.3.5...v0.3.6) (2025-09-05) 61 | 62 | 63 | ### Features 64 | 65 | * micro opt. and fixes ([d0322de](https://github.com/unkn0wn-root/kioshun/commit/d0322de4fa3dc35f3844afa16c6474b7afc65f9f)) 66 | 67 | ### [0.3.5](https://github.com/unkn0wn-root/kioshun/compare/v0.3.4...v0.3.5) (2025-09-03) 68 | 69 | ### [0.3.4](https://github.com/unkn0wn-root/kioshun/compare/v0.3.3...v0.3.4) (2025-09-03) 70 | 71 | ### [0.3.3](https://github.com/unkn0wn-root/kioshun/compare/v0.3.2...v0.3.3) (2025-09-03) 72 | 73 | 74 | ### Features 75 | 76 | * added new cluster benchmarks - http via wrapper and direct - via inter claster RPC ([287cedd](https://github.com/unkn0wn-root/kioshun/commit/287cedde636154d340a05bfcc6c8c5cec304f854)) 77 | 78 | 79 | ### Bug Fixes 80 | 81 | * make eviction test more robust after admission change ([8d4a759](https://github.com/unkn0wn-root/kioshun/commit/8d4a75903ebe059b1c78a73f87563bf941362c82)) 82 | 83 | ### [0.3.2](https://github.com/unkn0wn-root/kioshun/compare/v0.3.1...v0.3.2) (2025-09-03) 84 | 85 | 86 | ### Features 87 | 88 | * add cluster tests ([dd1ab76](https://github.com/unkn0wn-root/kioshun/commit/dd1ab764ef6e8d4f535e325358a2a96ce0ce8775)) 89 | 90 | 91 | ### Bug Fixes 92 | 93 | * reset peer only on fatal err; fix example api paths ([409fd08](https://github.com/unkn0wn-root/kioshun/commit/409fd08e3c46da6e526572d70963abc09e86f65f)) 94 | 95 | ### [0.3.1](https://github.com/unkn0wn-root/kioshun/compare/v0.3.0...v0.3.1) (2025-09-03) 96 | 97 | 98 | ### Features 99 | 100 | * add clustered api to examples dir ([90047df](https://github.com/unkn0wn-root/kioshun/commit/90047dfe759fa1188da4ba083be2cca9ec7fad4e)) 101 | 102 | ## [0.3.0](https://github.com/unkn0wn-root/kioshun/compare/v0.2.3...v0.3.0) (2025-09-02) 103 | 104 | 105 | ### Features 106 | 107 | * add dockerfile(cmd) and docker-compose (_examples) ([5e777a6](https://github.com/unkn0wn-root/kioshun/commit/5e777a62fd56047dd4aa906d2ceb97d8d79598f5)) 108 | * add kioshun adapter ([48a79af](https://github.com/unkn0wn-root/kioshun/commit/48a79afad83b1bc04abc650cfecc052311dcc306)) 109 | * add kioshun node starter to cmd ([c6df4e9](https://github.com/unkn0wn-root/kioshun/commit/c6df4e9c3db3f29db28fb5687abc3feb0820acc6)) 110 | * add snapshot bridge between kioshun and cluster ([a8cd204](https://github.com/unkn0wn-root/kioshun/commit/a8cd20442b85eb505896bc21fe7f7cf000f141f5)) 111 | * add timeout-only exponential backoff ([ca3fbc1](https://github.com/unkn0wn-root/kioshun/commit/ca3fbc130643b73b9f7105b764ad1eca28d3047c)) 112 | * alias distributed cache as client and accept context ([888f5cb](https://github.com/unkn0wn-root/kioshun/commit/888f5cb4c9381855e6f44848d0edc4dd8c39793a)) 113 | * get from other replicas and add panelize node ([7f4803a](https://github.com/unkn0wn-root/kioshun/commit/7f4803ad1f69210773b0695d0ce6485c78d2c807)) 114 | * move trie and util to internal and fix adapter type ([7ff9dac](https://github.com/unkn0wn-root/kioshun/commit/7ff9dac93ee5b71c991e6d56708d257197bdc7be)) 115 | * reset peer on peer closed ([b17ead0](https://github.com/unkn0wn-root/kioshun/commit/b17ead0f492d7b47e532a0b0e306361e40064b77)) 116 | 117 | 118 | ### Bug Fixes 119 | 120 | * adapter type ([98d3842](https://github.com/unkn0wn-root/kioshun/commit/98d38429c08b1477e3c176397f2292d3e2cc4146)) 121 | 122 | ### [0.2.3](https://github.com/unkn0wn-root/kioshun/compare/v0.2.2...v0.2.3) (2025-08-29) 123 | 124 | ### [0.2.2](https://github.com/unkn0wn-root/kioshun/compare/v0.2.1...v0.2.2) (2025-08-29) 125 | 126 
| ### [0.2.1](https://github.com/unkn0wn-root/kioshun/compare/v0.2.0...v0.2.1) (2025-08-28) 127 | -------------------------------------------------------------------------------- /_examples/advanced/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | "time" 7 | 8 | "github.com/unkn0wn-root/kioshun" 9 | ) 10 | 11 | type User struct { 12 | ID string `json:"id"` 13 | Name string `json:"name"` 14 | Email string `json:"email"` 15 | CreatedAt time.Time `json:"created_at"` 16 | } 17 | 18 | func main() { 19 | fmt.Println("=== Advanced Cache Usage Example ===") 20 | 21 | config := cache.Config{ 22 | MaxSize: 10000, 23 | ShardCount: 16, 24 | CleanupInterval: 1 * time.Minute, 25 | DefaultTTL: 30 * time.Minute, 26 | EvictionPolicy: cache.LRU, 27 | StatsEnabled: true, 28 | } 29 | 30 | userCache := cache.New[string, User](config) 31 | defer userCache.Close() 32 | 33 | fmt.Println("\n1. Operations with complex data types:") 34 | users := []User{ 35 | {ID: "1", Name: "Alice Johnson", Email: "alice@example.com", CreatedAt: time.Now()}, 36 | {ID: "2", Name: "Bob Smith", Email: "bob@example.com", CreatedAt: time.Now()}, 37 | {ID: "3", Name: "Charlie Brown", Email: "charlie@example.com", CreatedAt: time.Now()}, 38 | } 39 | 40 | for _, user := range users { 41 | userCache.Set(user.ID, user, time.Duration(30+len(user.Name))*time.Second) 42 | } 43 | 44 | fmt.Println("\n2. Concurrent access:") 45 | 46 | var wg sync.WaitGroup 47 | numWorkers := 10 48 | operationsPerWorker := 100 49 | 50 | for i := 0; i < numWorkers; i++ { 51 | wg.Add(1) 52 | go func(workerID int) { 53 | defer wg.Done() 54 | 55 | for j := 0; j < operationsPerWorker; j++ { 56 | key := fmt.Sprintf("user:%d:%d", workerID, j) 57 | user := User{ 58 | ID: key, 59 | Name: fmt.Sprintf("User %d-%d", workerID, j), 60 | Email: fmt.Sprintf("user%d_%d@example.com", workerID, j), 61 | CreatedAt: time.Now(), 62 | } 63 | 64 | switch j % 4 { 65 | case 0: // Set 66 | userCache.Set(key, user, 1*time.Hour) 67 | case 1: // Get 68 | if u, found := userCache.Get(key); found { 69 | _ = u.Name // Use the value 70 | } 71 | case 2: // GetWithTTL 72 | if u, ttl, found := userCache.GetWithTTL(key); found { 73 | _ = u.Name 74 | _ = ttl 75 | } 76 | case 3: // Exists 77 | userCache.Exists(key) 78 | } 79 | } 80 | }(i) 81 | } 82 | 83 | wg.Wait() 84 | fmt.Printf("Completed %d concurrent ops\n", numWorkers*operationsPerWorker) 85 | 86 | fmt.Println("\n3. Cache manager for multiple cache instances:") 87 | 88 | manager := cache.NewManager() 89 | defer manager.CloseAll() 90 | 91 | manager.RegisterCache("users", cache.UserCacheConfig()) 92 | manager.RegisterCache("sessions", cache.SessionCacheConfig()) 93 | manager.RegisterCache("api_responses", cache.APICacheConfig()) 94 | 95 | userManagedCache, _ := cache.GetCache[string, User](manager, "users") 96 | sessionCache, _ := cache.GetCache[string, string](manager, "sessions") 97 | apiCache, _ := cache.GetCache[string, []byte](manager, "api_responses") 98 | 99 | userManagedCache.Set("managed_user", users[0], 1*time.Hour) 100 | sessionCache.Set("session_123", "user_session_token", 2*time.Hour) 101 | apiCache.Set("api_response_1", []byte(`{"status": "success"}`), 15*time.Minute) 102 | 103 | fmt.Println("\n4. 
Global cache usage:") 104 | 105 | cache.RegisterGlobalCache("global_users", cache.UserCacheConfig()) 106 | cache.RegisterGlobalCache("global_sessions", cache.SessionCacheConfig()) 107 | 108 | globalUserCache, _ := cache.GetGlobalCache[string, User]("global_users") 109 | globalSessionCache, _ := cache.GetGlobalCache[string, string]("global_sessions") 110 | 111 | globalUserCache.Set("global_user_1", users[0], 1*time.Hour) 112 | globalSessionCache.Set("global_session_1", "global_token", 2*time.Hour) 113 | 114 | fmt.Println("\n5. Performance monitoring:") 115 | 116 | // Generate some activity 117 | for i := 0; i < 1000; i++ { 118 | key := fmt.Sprintf("perf_test_%d", i) 119 | userCache.Set(key, users[i%len(users)], 1*time.Hour) 120 | 121 | // Mix reads and writes 122 | if i%3 == 0 { 123 | userCache.Get(key) 124 | } 125 | } 126 | 127 | stats := userCache.Stats() 128 | fmt.Printf("Performance Statistics:\n") 129 | fmt.Printf(" Total Operations: %d\n", stats.Hits+stats.Misses) 130 | fmt.Printf(" Hits: %d\n", stats.Hits) 131 | fmt.Printf(" Misses: %d\n", stats.Misses) 132 | fmt.Printf(" Hit Ratio: %.2f%%\n", stats.HitRatio*100) 133 | fmt.Printf(" Evictions: %d\n", stats.Evictions) 134 | fmt.Printf(" Expirations: %d\n", stats.Expirations) 135 | fmt.Printf(" Current Size: %d\n", stats.Size) 136 | fmt.Printf(" Max Capacity: %d\n", stats.Capacity) 137 | fmt.Printf(" Shards: %d\n", stats.Shards) 138 | 139 | fmt.Println("\n6. TTL and expiration handling:") 140 | 141 | // short TTL 142 | shortTTLCache := cache.NewWithDefaults[string, string]() 143 | defer shortTTLCache.Close() 144 | 145 | shortTTLCache.Set("short_lived_1", "expires_soon", 1*time.Second) 146 | shortTTLCache.Set("short_lived_2", "expires_later", 3*time.Second) 147 | 148 | fmt.Printf("Initial size: %d\n", shortTTLCache.Size()) 149 | 150 | time.Sleep(2 * time.Second) 151 | fmt.Printf("After 2 seconds: %d\n", shortTTLCache.Size()) 152 | 153 | if _, found := shortTTLCache.Get("short_lived_1"); !found { 154 | fmt.Println("short_lived_1 has expired") 155 | } 156 | if _, found := shortTTLCache.Get("short_lived_2"); found { 157 | fmt.Println("short_lived_2 still exists") 158 | } 159 | 160 | fmt.Println("\n7. Manual cleanup:") 161 | 162 | userCache.TriggerCleanup() 163 | fmt.Println("Manual cleanup triggered") 164 | 165 | fmt.Println("\n8. Batch operations:") 166 | 167 | batchCache := cache.NewWithDefaults[string, string]() 168 | defer batchCache.Close() 169 | 170 | // simulate batch insert 171 | start := time.Now() 172 | for i := 0; i < 10000; i++ { 173 | batchCache.Set(fmt.Sprintf("batch_key_%d", i), fmt.Sprintf("batch_value_%d", i), 1*time.Hour) 174 | } 175 | insertDuration := time.Since(start) 176 | 177 | // batch read 178 | start = time.Now() 179 | for i := 0; i < 10000; i++ { 180 | batchCache.Get(fmt.Sprintf("batch_key_%d", i)) 181 | } 182 | readDuration := time.Since(start) 183 | 184 | fmt.Printf("Batch insert (10,000 items): %v\n", insertDuration) 185 | fmt.Printf("Batch read (10,000 items): %v\n", readDuration) 186 | fmt.Printf("Insert rate: %.0f ops/sec\n", 10000/insertDuration.Seconds()) 187 | fmt.Printf("Read rate: %.0f ops/sec\n", 10000/readDuration.Seconds()) 188 | 189 | fmt.Println("\n=== Example completed ===") 190 | } 191 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | ![Kioshun Logo](assets/logo.JPG) 3 | 4 | # Kioshun - In-Memory Cache for Go 5 | 6 | *"kee-oh-shoon" /kiːoʊʃuːn/* 7 | 8 | [![Go Version](https://img.shields.io/badge/Go-1.24+-blue.svg)](https://golang.org) 9 | [![License](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE) 10 | [![CI](https://github.com/unkn0wn-root/kioshun/actions/workflows/test.yml/badge.svg)](https://github.com/unkn0wn-root/kioshun/actions) 11 | 12 | 13 | *Thread-safe, sharded in-memory cache for Go - with an optional peer-to-peer cluster backend*
15 | 16 | ## Index 17 | 18 | - [What is Kioshun?](#what-is-kioshun) 19 | - [Cluster (Overview)](#cluster-overview) 20 | - [Internals](INTERNALS.md) 21 | - [Installation](#installation) 22 | - [Quick Start](#quick-start) 23 | - [Configuration](#configuration) 24 | - [API](#api) 25 | - [HTTP Middleware](MIDDLEWARE.md) 26 | - [Benchmark Results](#benchmark-results) 27 | 28 | ## What is Kioshun? 29 | 30 | Kioshun is a thread-safe (and fast!), in-memory cache for Go. You can **run it as a local cache** just like any other in-memory caches, or turn on the **peer-to-peer cluster** when you want replicas across hosts. 31 | 32 | If you want to know more about Kioshun internals and how it works under the hood - see [Kioshun Internals](INTERNALS.md) 33 | 34 | ## Cluster Overview 35 | 36 | > [!NOTE] 37 | > Clustering is fully **optional**. If you don’t enable the cluster, Kioshun runs as a standalone, in‑memory cache. 38 | 39 | Kioshun’s cluster turns every service instance into a **small, self-managing peer-to-peer cache**. You just point each one at a few reachable *Seeds* and it discovers the rest, builds a weighted rendezvous and replicates writes with configurable RF/WC so hot data stays local. Gossip keeps the peer list fresh, hinted handoff plus backfill repair all gaps, and reads go straight to the primary owner while read-through uses single-flight leases. 40 | 41 | ``` 42 | ┌─────────────┐ Gossip + Weights ┌─────────────┐ 43 | │ Service A │◀──────────────────────────▶│ Service B │ 44 | │ + Node │◀───────────▶◀───────────▶ │ + Node │ 45 | └──────┬──────┘ └──────┬──────┘ 46 | │ Owner‑routed Get/Set (RF) │ 47 | └──────────────▶◀──────────────────────────┘ 48 | Service C + Node 49 | ``` 50 | 51 | Small multinode example: 52 | 53 | ```bash 54 | # on each server 55 | CACHE_BIND=:4443 56 | CACHE_PUBLIC=srv-a:4443 # srv-b / srv-c on others 57 | CACHE_SEEDS=srv-a:4443,srv-b:4443,srv-c:4443 58 | CACHE_AUTH=supersecret 59 | ``` 60 | 61 | ```go 62 | // in code 63 | local := cache.NewWithDefaults[string, []byte]() 64 | 65 | cfg := cluster.Default() 66 | cfg.BindAddr = os.Getenv("CACHE_BIND") 67 | cfg.PublicURL = os.Getenv("CACHE_PUBLIC") 68 | cfg.Seeds = strings.Split(os.Getenv("CACHE_SEEDS"), ",") 69 | cfg.ReplicationFactor = 3; cfg.WriteConcern = 2 70 | cfg.Sec.AuthToken = os.Getenv("CACHE_AUTH") 71 | 72 | node := cluster.NewNode[string, []byte](cfg, cluster.StringKeyCodec[string]{}, local, cluster.BytesCodec{}) 73 | if err := node.Start(); err != nil { 74 | panic(err) 75 | } 76 | 77 | dc := cluster.NewDistributedCache[string, []byte](node) 78 | ``` 79 | 80 | Only a subset of nodes need to appear in `CACHE_SEEDS`. The list is purely for bootstrap - include a few stable peers so new processes can reach at least one live seed, then gossip distributes the rest of the membership automatically, whether you run 3 caches or 20. 81 | 82 | > See **CLUSTER.md** for more details. 
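Once the node is running, reads and writes are context-aware. A minimal usage sketch against the `node` built above - the key, value, and TTL are illustrative:

```go
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()

// Routed to the key's owners and replicated per ReplicationFactor;
// returns once WriteConcern acknowledgements arrive.
if err := node.Set(ctx, "user:123", []byte("profile"), 5*time.Minute); err != nil {
	log.Printf("set failed: %v", err)
}

// Served locally when this node is the primary owner,
// otherwise fetched from a replica.
if v, found, err := node.Get(ctx, "user:123"); err == nil && found {
	fmt.Printf("user:123 = %s\n", v)
}
```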
83 | 84 | ## Installation 85 | 86 | ```bash 87 | go get github.com/unkn0wn-root/kioshun 88 | ``` 89 | 90 | ## Quick Start 91 | 92 | ```go 93 | package main 94 | 95 | import ( 96 | "fmt" 97 | "time" 98 | 99 | cache "github.com/unkn0wn-root/kioshun" 100 | ) 101 | 102 | func main() { 103 | // Create cache with default configuration 104 | c := cache.NewWithDefaults[string, string]() 105 | defer c.Close() 106 | 107 | // Set with default TTL (30 min) 108 | c.Set("user:123", "David Nice 1", cache.DefaultExpiration) 109 | 110 | // Set with no expiration 111 | c.Set("user:123", "David Nice 2", cache.NoExpiration) 112 | 113 | // Set value with custom TTL 114 | c.Set("user:123", "David Nice 3", 5*time.Minute) 115 | 116 | // Get value 117 | if value, found := c.Get("user:123"); found { 118 | fmt.Printf("User: %s\n", value) 119 | } 120 | 121 | // Get cache statistics 122 | stats := c.Stats() 123 | fmt.Printf("Hit ratio: %.2f%%\n", stats.HitRatio*100) 124 | } 125 | ``` 126 | 127 | ## Configuration 128 | 129 | ### Basic Configuration 130 | 131 | ```go 132 | config := cache.Config{ 133 | MaxSize: 100000, // Maximum number of items 134 | ShardCount: 16, // Number of shards (0 = auto-detect) 135 | CleanupInterval: 5 * time.Minute, // Cleanup frequency 136 | DefaultTTL: 30 * time.Minute, // Default expiration time 137 | EvictionPolicy: cache.AdmissionLFU, // Eviction algorithm (default) 138 | StatsEnabled: true, // Enable statistics collection 139 | } 140 | 141 | cache := cache.New[string, any](config) 142 | ``` 143 | 144 | ## API 145 | 146 | ```go 147 | cache.Set(key, value, ttl time.Duration) error 148 | cache.SetWithCallback(key, value, ttl, callback func(key, value)) error 149 | cache.Get(key) (value, found bool) 150 | cache.GetWithTTL(key) (value, ttl time.Duration, found bool) 151 | cache.Keys() []K 152 | cache.Clear() 153 | cache.Delete(key) bool 154 | cache.Exists(key) bool 155 | cache.Size() int64 156 | cache.Stats() Stats 157 | cache.TriggerCleanup() 158 | cache.Close() error 159 | ``` 160 | 161 | ### Statistics 162 | 163 | ```go 164 | type Stats struct { 165 | Hits int64 166 | Misses int64 167 | Evictions int64 168 | Expirations int64 169 | Size int64 170 | Capacity int64 171 | HitRatio float64 172 | Shards int 173 | } 174 | ``` 175 | 176 | ## HTTP Middleware 177 | 178 | Kioshun provides HTTP middleware out-of-the-box. 179 | 180 | ```go 181 | config := cache.DefaultMiddlewareConfig() 182 | config.DefaultTTL = 5 * time.Minute 183 | config.MaxSize = 100000 184 | 185 | middleware := cache.NewHTTPCacheMiddleware(config) 186 | defer middleware.Close() 187 | 188 | http.Handle("/api/users", middleware.Middleware(usersHandler)) 189 | ``` 190 | > See **[MIDDLEWARE.md](MIDDLEWARE.md)** for complete documentation, examples, and advanced configuration. 
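For context, a minimal end-to-end sketch of the same wiring - the handler body, route, and port are illustrative, and `Middleware` is assumed to accept an `http.Handler`:

```go
func usersHandler(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "application/json")
	_, _ = w.Write([]byte(`[{"id":"123","name":"David"}]`))
}

func main() {
	config := cache.DefaultMiddlewareConfig()
	config.DefaultTTL = 5 * time.Minute

	middleware := cache.NewHTTPCacheMiddleware(config)
	defer middleware.Close()

	// Identical requests within the TTL are served from the cache.
	http.Handle("/api/users", middleware.Middleware(http.HandlerFunc(usersHandler)))
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```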
191 | 192 | ## Benchmark Results 193 | 194 | Latest benchmark run (Apple M4 Max, Go 1.24.7): 195 | - `SET`: 100,000,000 ops/sec · 75.55 ns/op · 41 B/op · 3 allocs/op 196 | - `GET`: 231,967,180 ops/sec · 25.87 ns/op · 31 B/op · 2 allocs/op 197 | - `Real-World`: 52,742,550 ops/sec · 65.25 ns/op · 48 B/op · 3 allocs/op 198 | 199 | Full suite: [_benchmarks/README.md](_benchmarks/README.md) 200 | -------------------------------------------------------------------------------- /cluster/replication.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "sync/atomic" 7 | "time" 8 | 9 | cbor "github.com/fxamacker/cbor/v2" 10 | ) 11 | 12 | type replicator[K comparable, V any] struct { 13 | node *Node[K, V] 14 | } 15 | 16 | // replicateSet sends a write to all owners and waits for WC acknowledgements. 17 | // - Pre-compresses the value once per request to avoid per-peer work. 18 | // - Enqueues hinted-handoff on any peer failure so a recovering node can catch up. 19 | // - Fast path: if local commit already satisfies WC, fire-and-forget to peers and 20 | // rely on hinted handoff to close any gaps, the caller is unblocked. 21 | func (r *replicator[K, V]) replicateSet(ctx context.Context, key []byte, val []byte, exp int64, ver uint64, owners []*nodeMeta) error { 22 | required := r.node.cfg.WriteConcern 23 | if required < 1 { 24 | required = 1 25 | } 26 | 27 | acks := 0 28 | if len(owners) > 0 && owners[0].ID == r.node.cfg.ID { 29 | acks++ 30 | } 31 | want := required - acks 32 | 33 | // pre-compress once for all peers. 34 | b2, cp := r.node.maybeCompress(val) 35 | 36 | // helper to send and enqueue hint on failure 37 | sendOne := func(pid NodeID, pc *peerConn) { 38 | if pc == nil { 39 | if r.node.hh != nil { 40 | r.node.hh.enqueueSet(pid, key, b2, exp, ver, cp) 41 | } 42 | return 43 | } 44 | 45 | reqID := r.node.nextReqID() 46 | msg := &MsgSet{Base: Base{T: MTSet, ID: reqID}, Key: key, Val: b2, Exp: exp, Ver: ver, Cp: cp} 47 | raw, err := pc.request(msg, reqID, r.node.cfg.Sec.WriteTimeout) 48 | if err != nil { 49 | if r.node.hh != nil { 50 | r.node.hh.enqueueSet(pid, key, b2, exp, ver, cp) 51 | } 52 | return 53 | } 54 | 55 | var resp MsgSetResp 56 | if e := cbor.Unmarshal(raw, &resp); e != nil || !resp.OK { 57 | if r.node.hh != nil { 58 | r.node.hh.enqueueSet(pid, key, b2, exp, ver, cp) 59 | } 60 | return 61 | } 62 | } 63 | 64 | // Fast path: local already satisfies WC. Fire and forget to remaining owners, 65 | // still capturing failures into hinted handoff without blocking the caller. 66 | if want <= 0 { 67 | for _, own := range owners { 68 | if own.ID == r.node.cfg.ID { 69 | continue 70 | } 71 | pc := r.node.getPeer(own.ID) 72 | go sendOne(own.ID, pc) 73 | } 74 | return nil 75 | } 76 | 77 | // slow path: need acknowledgements from peers to meet write concern. 
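// One goroutine per remote owner sends the MTSet and reports into errCh; each
// success decrements `remaining`, and the goroutine that drives it to zero sends
// nil to unblock the caller. Failures enqueue a handoff hint; the select loop
// below keeps waiting for a success until the timer fires.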
78 | remaining := int32(want) 79 | timer := time.NewTimer(r.node.cfg.Sec.WriteTimeout + time.Second) 80 | defer timer.Stop() 81 | 82 | errCh := make(chan error, len(owners)) 83 | for _, own := range owners { 84 | if own.ID == r.node.cfg.ID { 85 | continue 86 | } 87 | pc := r.node.getPeer(own.ID) 88 | if pc == nil { 89 | // we know this one will miss; enqueue and continue 90 | if r.node.hh != nil { 91 | r.node.hh.enqueueSet(own.ID, key, b2, exp, ver, cp) 92 | } 93 | continue 94 | } 95 | 96 | go func(pid NodeID, p *peerConn) { 97 | reqID := r.node.nextReqID() 98 | msg := &MsgSet{Base: Base{T: MTSet, ID: reqID}, Key: key, Val: b2, Exp: exp, Ver: ver, Cp: cp} 99 | raw, err := p.request(msg, reqID, r.node.cfg.Sec.WriteTimeout) 100 | if err != nil { 101 | // enqueue and return an error 102 | if r.node.hh != nil { 103 | r.node.hh.enqueueSet(pid, key, b2, exp, ver, cp) 104 | } 105 | errCh <- err 106 | return 107 | } 108 | 109 | var resp MsgSetResp 110 | if e := cbor.Unmarshal(raw, &resp); e != nil || !resp.OK { 111 | if r.node.hh != nil { 112 | r.node.hh.enqueueSet(pid, key, b2, exp, ver, cp) 113 | } 114 | if e == nil && resp.Err != "" { 115 | errCh <- errors.New(resp.Err) 116 | } else { 117 | errCh <- errors.New("set not ok") 118 | } 119 | return 120 | } 121 | // success 122 | if atomic.AddInt32(&remaining, -1) <= 0 { 123 | errCh <- nil 124 | } 125 | }(own.ID, pc) 126 | } 127 | 128 | for { 129 | select { 130 | case err := <-errCh: 131 | if err == nil { 132 | return nil // met write concern 133 | } 134 | // keep waiting for success until timeout; errors alone don't abort early 135 | case <-timer.C: 136 | return ErrTimeout 137 | case <-ctx.Done(): 138 | return ctx.Err() 139 | } 140 | } 141 | } 142 | 143 | func (r *replicator[K, V]) replicateDelete(ctx context.Context, key []byte, owners []*nodeMeta, ver uint64) error { 144 | required := r.node.cfg.WriteConcern 145 | if required < 1 { 146 | required = 1 147 | } 148 | acks := 0 149 | if len(owners) > 0 && owners[0].ID == r.node.cfg.ID { 150 | acks++ 151 | } 152 | want := required - acks 153 | 154 | sendOne := func(pid NodeID, pc *peerConn) { 155 | if pc == nil { 156 | if r.node.hh != nil { 157 | r.node.hh.enqueueDel(pid, key, ver) 158 | } 159 | return 160 | } 161 | 162 | reqID := r.node.nextReqID() 163 | msg := &MsgDel{Base: Base{T: MTDelete, ID: reqID}, Key: key, Ver: ver} 164 | raw, err := pc.request(msg, reqID, r.node.cfg.Sec.WriteTimeout) 165 | if err != nil { 166 | if r.node.hh != nil { 167 | r.node.hh.enqueueDel(pid, key, ver) 168 | } 169 | return 170 | } 171 | 172 | var resp MsgDelResp 173 | if e := cbor.Unmarshal(raw, &resp); e != nil || !resp.OK { 174 | if r.node.hh != nil { 175 | r.node.hh.enqueueDel(pid, key, ver) 176 | } 177 | return 178 | } 179 | } 180 | 181 | if want <= 0 { 182 | for _, own := range owners { 183 | if own.ID == r.node.cfg.ID { 184 | continue 185 | } 186 | pc := r.node.getPeer(own.ID) 187 | go sendOne(own.ID, pc) 188 | } 189 | return nil 190 | } 191 | 192 | remaining := int32(want) 193 | timer := time.NewTimer(r.node.cfg.Sec.WriteTimeout + time.Second) 194 | defer timer.Stop() 195 | errCh := make(chan error, len(owners)) 196 | 197 | for _, own := range owners { 198 | if own.ID == r.node.cfg.ID { 199 | continue 200 | } 201 | pc := r.node.getPeer(own.ID) 202 | if pc == nil { 203 | if r.node.hh != nil { 204 | r.node.hh.enqueueDel(own.ID, key, ver) 205 | } 206 | continue 207 | } 208 | 209 | go func(pid NodeID, p *peerConn) { 210 | reqID := r.node.nextReqID() 211 | msg := &MsgDel{Base: Base{T: MTDelete, ID: reqID}, Key: key, 
Ver: ver} 212 | raw, err := p.request(msg, reqID, r.node.cfg.Sec.WriteTimeout) 213 | if err != nil { 214 | if r.node.hh != nil { 215 | r.node.hh.enqueueDel(pid, key, ver) 216 | } 217 | errCh <- err 218 | return 219 | } 220 | 221 | var resp MsgDelResp 222 | if e := cbor.Unmarshal(raw, &resp); e != nil || !resp.OK { 223 | if r.node.hh != nil { 224 | r.node.hh.enqueueDel(pid, key, ver) 225 | } 226 | if e == nil && resp.Err != "" { 227 | errCh <- errors.New(resp.Err) 228 | } else { 229 | errCh <- errors.New("delete not ok") 230 | } 231 | return 232 | } 233 | if atomic.AddInt32(&remaining, -1) <= 0 { 234 | errCh <- nil 235 | } 236 | }(own.ID, pc) 237 | } 238 | 239 | for { 240 | select { 241 | case err := <-errCh: 242 | if err == nil { 243 | return nil 244 | } 245 | case <-timer.C: 246 | return ErrTimeout 247 | case <-ctx.Done(): 248 | return ctx.Err() 249 | } 250 | } 251 | } 252 | -------------------------------------------------------------------------------- /eviction.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "sync" 5 | "sync/atomic" 6 | ) 7 | 8 | const ( 9 | // AdmissionLFU sampling: take a small randomized sample and pick the least-frequent (tie: oldest). 10 | defaultAdmissionLFUSampleSize = 5 11 | maxAdmissionLFUSampleSize = 20 12 | ) 13 | 14 | // evictor defines a policy that evicts exactly one item when a shard is full (called under shard write lock). 15 | type evictor[K comparable, V any] interface { 16 | evict(s *shard[K, V], itemPool *sync.Pool, statsEnabled bool) bool 17 | } 18 | 19 | // lruEvictor removes the least-recently-used item using the shard's intrusive LRU list. 20 | type lruEvictor[K comparable, V any] struct{} 21 | 22 | // evict unlinks and recycles the tail.prev (LRU) item; updates size/stats; O(1). 23 | func (e lruEvictor[K, V]) evict(s *shard[K, V], itemPool *sync.Pool, statsEnabled bool) bool { 24 | // Empty list check: only sentinels present. 25 | if s.tail.prev == s.head { 26 | return false 27 | } 28 | 29 | // Victim is the node just before the tail sentinel. 30 | lru := s.tail.prev 31 | if lru != nil && lru.key != nil { 32 | // cacheItem.key is stored as 'any' to allow deletion without recomputing the hash. 33 | // We assert to K here; inserts always set key with the correct type. 34 | if key, ok := lru.key.(K); ok { 35 | delete(s.data, key) 36 | } 37 | 38 | s.removeFromLRU(lru) // O(1) unlink from intrusive list 39 | itemPool.Put(lru) // recycle to reduce GC churn 40 | atomic.AddInt64(&s.size, -1) 41 | 42 | if statsEnabled { 43 | atomic.AddInt64(&s.evictions, 1) 44 | } 45 | return true 46 | } 47 | return false 48 | } 49 | 50 | // lfuEvictor removes the global least-frequent item via the O(1) LFU bucket list. 51 | type lfuEvictor[K comparable, V any] struct{} 52 | 53 | // evict pops the LFU item, unlinks from LRU too, recycles, and updates size/stats; O(1). 54 | func (e lfuEvictor[K, V]) evict(s *shard[K, V], itemPool *sync.Pool, statsEnabled bool) bool { 55 | lfu := s.lfuList.removeLFU() 56 | if lfu == nil { 57 | return false 58 | } 59 | 60 | if lfu.key != nil { 61 | if key, ok := lfu.key.(K); ok { 62 | delete(s.data, key) 63 | } 64 | 65 | // We maintain the LRU list for uniform unlinking/cleanup across policies. 
66 | s.removeFromLRU(lfu) 67 | itemPool.Put(lfu) 68 | atomic.AddInt64(&s.size, -1) 69 | 70 | if statsEnabled { 71 | atomic.AddInt64(&s.evictions, 1) 72 | } 73 | return true 74 | } 75 | return false 76 | } 77 | 78 | // fifoEvictor treats the LRU list as insertion order and removes the oldest item. 79 | type fifoEvictor[K comparable, V any] struct{} 80 | 81 | // evict deletes the earliest inserted (tail.prev), unlinks from optional LFU, and updates stats; O(1). 82 | func (e fifoEvictor[K, V]) evict(s *shard[K, V], itemPool *sync.Pool, statsEnabled bool) bool { 83 | if s.tail.prev == s.head { 84 | return false 85 | } 86 | 87 | oldest := s.tail.prev 88 | if oldest != nil && oldest.key != nil { 89 | if key, ok := oldest.key.(K); ok { 90 | delete(s.data, key) 91 | } 92 | s.removeFromLRU(oldest) 93 | 94 | if s.lfuList != nil { 95 | s.lfuList.remove(oldest) 96 | } 97 | 98 | itemPool.Put(oldest) 99 | atomic.AddInt64(&s.size, -1) 100 | if statsEnabled { 101 | atomic.AddInt64(&s.evictions, 1) 102 | } 103 | return true 104 | } 105 | return false 106 | } 107 | 108 | // admissionLFUEvictor does approximate-LFU by sampling and optionally gating via the admission filter. 109 | type admissionLFUEvictor[K comparable, V any] struct { 110 | sampleSize int // desired sample size; clamped to [1, maxAdmissionLFUSampleSize] 111 | } 112 | 113 | // pickVictim scans up to sampleSize items (randomized map order) and returns the worst (freq↑, age↑); nil if empty. 114 | func (e admissionLFUEvictor[K, V]) pickVictim(s *shard[K, V]) *cacheItem[V] { 115 | if len(s.data) == 0 { 116 | return nil 117 | } 118 | 119 | n := e.sampleSize 120 | if n <= 0 { 121 | n = defaultAdmissionLFUSampleSize 122 | } else if n > maxAdmissionLFUSampleSize { 123 | n = maxAdmissionLFUSampleSize 124 | } 125 | if n > len(s.data) { 126 | n = len(s.data) 127 | } 128 | 129 | var victim *cacheItem[V] 130 | cnt := 0 131 | for _, it := range s.data { 132 | // Lower frequency is worse; if equal, older lastAccess is worse. 133 | if victim == nil || 134 | it.frequency < victim.frequency || 135 | (it.frequency == victim.frequency && it.lastAccess < victim.lastAccess) { 136 | victim = it 137 | } 138 | cnt++ 139 | if cnt >= n { 140 | break // bounded sample scan 141 | } 142 | } 143 | return victim 144 | } 145 | 146 | // removeVictim unlinks victim from shard structures, recycles it, updates size/stats (caller holds write lock). 147 | func (e admissionLFUEvictor[K, V]) removeVictim(s *shard[K, V], victim *cacheItem[V], itemPool *sync.Pool, statsEnabled bool) { 148 | if key, ok := victim.key.(K); ok { 149 | delete(s.data, key) 150 | } 151 | s.removeFromLRU(victim) 152 | // No LFU heap: AdmissionLFU doesn't maintain the O(1) LFU list. 153 | itemPool.Put(victim) 154 | atomic.AddInt64(&s.size, -1) 155 | if statsEnabled { 156 | atomic.AddInt64(&s.evictions, 1) 157 | } 158 | } 159 | 160 | // evict samples, picks a victim, evicts it, and records lastVictimFrequency for observability. 161 | func (e admissionLFUEvictor[K, V]) evict( 162 | s *shard[K, V], 163 | itemPool *sync.Pool, 164 | statsEnabled bool, 165 | ) bool { 166 | victim := e.pickVictim(s) 167 | if victim == nil { 168 | return false 169 | } 170 | if s.admission != nil { 171 | s.lastVictimFrequency = uint64(victim.frequency) 172 | } 173 | e.removeVictim(s, victim, itemPool, statsEnabled) 174 | 175 | return true 176 | } 177 | 178 | // evictWithAdmission runs sample → shouldAdmit → (optional) evict; denies return false to let Set() skip pollution. 
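// A denial returns false without evicting, so Set() can skip inserting a key the
// filter considers low-value; an admission evicts the sampled victim and records
// eviction pressure so the filter can adapt its admission probability.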
179 | func (e admissionLFUEvictor[K, V]) evictWithAdmission( 180 | s *shard[K, V], 181 | itemPool *sync.Pool, 182 | statsEnabled bool, 183 | admission *adaptiveAdmissionFilter, 184 | keyHash uint64, 185 | ) bool { 186 | victim := e.pickVictim(s) 187 | if victim == nil { 188 | return false 189 | } 190 | 191 | freq := uint64(victim.frequency) 192 | victimAge := victim.lastAccess 193 | s.lastVictimFrequency = freq // stored for deb/metrics 194 | 195 | // Admission gate: compare candidate(keyHash) vs victim(freq/age). 196 | if !admission.shouldAdmit(keyHash, freq, victimAge) { 197 | return false 198 | } 199 | e.removeVictim(s, victim, itemPool, statsEnabled) 200 | 201 | // Feed back pressure to the admission filter for adaptive probability. 202 | admission.RecordEviction() 203 | 204 | return true 205 | } 206 | 207 | // createEvictor returns the policy implementation for the selected EvictionPolicy. 208 | func createEvictor[K comparable, V any](policy EvictionPolicy) evictor[K, V] { 209 | switch policy { 210 | case LRU: 211 | return lruEvictor[K, V]{} 212 | case LFU: 213 | return lfuEvictor[K, V]{} 214 | case FIFO: 215 | return fifoEvictor[K, V]{} 216 | case AdmissionLFU: 217 | return admissionLFUEvictor[K, V]{sampleSize: defaultAdmissionLFUSampleSize} 218 | default: 219 | return admissionLFUEvictor[K, V]{sampleSize: defaultAdmissionLFUSampleSize} 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /benchmarks/cluster/node/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "log" 7 | "net/http" 8 | "os" 9 | "strconv" 10 | "strings" 11 | "time" 12 | 13 | cache "github.com/unkn0wn-root/kioshun" 14 | "github.com/unkn0wn-root/kioshun/cluster" 15 | ) 16 | 17 | type setReq struct { 18 | K string `json:"k"` 19 | V string `json:"v"` 20 | TTLms int64 `json:"ttl_ms"` 21 | } 22 | 23 | func getenv(k, d string) string { 24 | if v := os.Getenv(k); v != "" { 25 | return v 26 | } 27 | return d 28 | } 29 | 30 | func splitCSV(s string) []string { 31 | parts := strings.Split(s, ",") 32 | out := make([]string, 0, len(parts)) 33 | for _, p := range parts { 34 | p = strings.TrimSpace(p) 35 | if p != "" { 36 | out = append(out, p) 37 | } 38 | } 39 | return out 40 | } 41 | 42 | func main() { 43 | port := getenv("PORT", "8081") 44 | bind := getenv("CACHE_BIND", ":5011") 45 | pub := getenv("CACHE_PUBLIC", "node1:5011") 46 | seeds := splitCSV(getenv("CACHE_SEEDS", pub)) 47 | auth := getenv("CACHE_AUTH", "") 48 | allowKill := strings.ToLower(getenv("ALLOW_KILL", "false")) == "true" 49 | killToken := getenv("KILL_TOKEN", "") 50 | 51 | // in-process local cache 52 | local := cache.NewWithDefaults[string, []byte]() 53 | 54 | // cluster node 55 | cfg := cluster.Default() 56 | cfg.BindAddr = bind 57 | cfg.PublicURL = pub 58 | cfg.Seeds = seeds 59 | cfg.Sec.AuthToken = auth 60 | cfg.ID = cluster.NodeID(cfg.PublicURL) 61 | cfg.PerConnWorkers = 128 62 | cfg.PerConnQueue = 256 63 | cfg.Sec.MaxInflightPerPeer = 512 64 | 65 | // optional via env 66 | if v := getenv("REPLICATION_FACTOR", ""); v != "" { 67 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 68 | cfg.ReplicationFactor = n 69 | } 70 | } else { 71 | cfg.ReplicationFactor = 3 72 | } 73 | 74 | if v := getenv("WRITE_CONCERN", ""); v != "" { 75 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 76 | cfg.WriteConcern = n 77 | } 78 | } else { 79 | cfg.WriteConcern = 2 80 | } 81 | 82 | if v := 
getenv("READ_MAX_FANOUT", ""); v != "" { 83 | if n, err := strconv.Atoi(v); err == nil && n >= 1 { 84 | cfg.ReadMaxFanout = n 85 | } 86 | } 87 | 88 | if v := getenv("READ_PER_TRY_MS", ""); v != "" { 89 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 90 | cfg.ReadPerTryTimeout = time.Duration(n) * time.Millisecond 91 | } 92 | } 93 | 94 | if v := getenv("READ_HEDGE_DELAY_MS", ""); v != "" { 95 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 96 | cfg.ReadHedgeDelay = time.Duration(n) * time.Millisecond 97 | } 98 | } 99 | 100 | if v := getenv("READ_HEDGE_INTERVAL_MS", ""); v != "" { 101 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 102 | cfg.ReadHedgeInterval = time.Duration(n) * time.Millisecond 103 | } 104 | } 105 | 106 | if v := getenv("WRITE_TIMEOUT_MS", ""); v != "" { 107 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 108 | cfg.Sec.WriteTimeout = time.Duration(n) * time.Millisecond 109 | } 110 | } 111 | 112 | if v := getenv("READ_TIMEOUT_MS", ""); v != "" { 113 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 114 | cfg.Sec.ReadTimeout = time.Duration(n) * time.Millisecond 115 | } 116 | } 117 | 118 | if v := getenv("SUSPICION_AFTER_MS", ""); v != "" { 119 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 120 | cfg.SuspicionAfter = time.Duration(n) * time.Millisecond 121 | } 122 | } 123 | 124 | if v := getenv("WEIGHT_UPDATE_MS", ""); v != "" { 125 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 126 | cfg.WeightUpdate = time.Duration(n) * time.Millisecond 127 | } 128 | } 129 | 130 | if v := getenv("GOSSIP_INTERVAL_MS", ""); v != "" { 131 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 132 | cfg.GossipInterval = time.Duration(n) * time.Millisecond 133 | } 134 | } 135 | 136 | kc := cluster.StringKeyCodec[string]{} 137 | vc := cluster.BytesCodec{} 138 | node := cluster.NewNode[string, []byte](cfg, kc, local, vc) 139 | if err := node.Start(); err != nil { 140 | log.Fatalf("start node: %v", err) 141 | } 142 | defer node.Stop() 143 | 144 | mux := http.NewServeMux() 145 | 146 | // GET /get?k=... 
147 | mux.HandleFunc("/get", func(w http.ResponseWriter, r *http.Request) { 148 | k := r.URL.Query().Get("k") 149 | if k == "" { 150 | http.Error(w, "missing k", http.StatusBadRequest) 151 | return 152 | } 153 | 154 | ctx, cancel := context.WithTimeout(r.Context(), 3*time.Second) 155 | defer cancel() 156 | v, found, err := node.Get(ctx, k) 157 | if err != nil { 158 | http.Error(w, err.Error(), http.StatusBadGateway) 159 | return 160 | } 161 | 162 | if !found { 163 | w.Header().Set("X-Cache", "MISS") 164 | http.Error(w, "not found", http.StatusNotFound) 165 | log.Printf("[MISS] k=%s", k) 166 | return 167 | } 168 | 169 | src := "HIT_REMOTE" 170 | if local.Exists(k) { 171 | src = "HIT_LOCAL" 172 | } 173 | w.Header().Set("X-Cache", src) 174 | w.WriteHeader(http.StatusOK) 175 | _, _ = w.Write(v) 176 | log.Printf("[%s] k=%s sz=%d", src, k, len(v)) 177 | }) 178 | 179 | // POST /set {k,v,ttl_ms} 180 | mux.HandleFunc("/set", func(w http.ResponseWriter, r *http.Request) { 181 | if r.Method != http.MethodPost { 182 | http.NotFound(w, r) 183 | return 184 | } 185 | 186 | var in setReq 187 | if err := json.NewDecoder(r.Body).Decode(&in); err != nil { 188 | http.Error(w, err.Error(), http.StatusBadRequest) 189 | return 190 | } 191 | 192 | ttl := time.Duration(in.TTLms) * time.Millisecond 193 | ctx, cancel := context.WithTimeout(r.Context(), 3*time.Second) 194 | defer cancel() 195 | if err := node.Set(ctx, in.K, []byte(in.V), ttl); err != nil { 196 | http.Error(w, err.Error(), http.StatusBadGateway) 197 | return 198 | } 199 | w.WriteHeader(http.StatusOK) 200 | _, _ = w.Write([]byte("OK")) 201 | log.Printf("[SET] k=%s ttl_ms=%d vlen=%d", in.K, in.TTLms, len(in.V)) 202 | }) 203 | 204 | // GET /stats → local shard stats 205 | mux.HandleFunc("/stats", func(w http.ResponseWriter, r *http.Request) { 206 | st := local.Stats() 207 | w.Header().Set("Content-Type", "application/json") 208 | _ = json.NewEncoder(w).Encode(st) 209 | }) 210 | 211 | // GET /ready simple readiness check 212 | mux.HandleFunc("/ready", func(w http.ResponseWriter, r *http.Request) { 213 | w.WriteHeader(http.StatusOK) 214 | }) 215 | 216 | // POST /kill?token=...&after_ms=0 — for failure injection during tests 217 | mux.HandleFunc("/kill", func(w http.ResponseWriter, r *http.Request) { 218 | if !allowKill { 219 | http.Error(w, "kill disabled", http.StatusForbidden) 220 | return 221 | } 222 | 223 | if killToken != "" && r.URL.Query().Get("token") != killToken { 224 | http.Error(w, "unauthorized", http.StatusUnauthorized) 225 | return 226 | } 227 | 228 | afterMs, _ := strconv.Atoi(r.URL.Query().Get("after_ms")) 229 | w.WriteHeader(http.StatusOK) 230 | _, _ = w.Write([]byte("bye")) 231 | go func() { 232 | if afterMs > 0 { 233 | time.Sleep(time.Duration(afterMs) * time.Millisecond) 234 | } 235 | log.Printf("[KILL] exiting process on request") 236 | node.Stop() 237 | time.Sleep(50 * time.Millisecond) 238 | os.Exit(0) 239 | }() 240 | }) 241 | 242 | log.Printf("mesh node up on :%s | node %s bind %s seeds=%v", port, cfg.PublicURL, cfg.BindAddr, cfg.Seeds) 243 | if err := http.ListenAndServe(":"+port, mux); err != nil { 244 | log.Fatal(err) 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark Results - Kioshun vs. 
Ristretto, go-cache and freecache 2 | 3 | ## Benchmark Configuration 4 | 5 | The benchmarks compare **Kioshun** with **AdmissionLFU** eviction policy against other popular Go cache libraries: 6 | 7 | ### Cache Configurations Used 8 | 9 | | Cache Library | Configuration | Notes | 10 | |---------------|---------------|-------| 11 | | **Kioshun** | MaxSize: 100,000
ShardCount: CPU cores × 4
EvictionPolicy: **AdmissionLFU**
DefaultTTL: 1 hour
CleanupInterval: 5 min | AdmissionLFU eviction policy with admission control | 12 | | **Ristretto** | NumCounters: 1,000,000
MaxCost: 100,000
BufferItems: 64 | TinyLFU-based admission policy | 13 | | **BigCache** | MaxEntriesInWindow: 100,000
Shards: CPU cores (power of 2)
MaxEntrySize: 64KB
HardMaxCacheSize: 256MB | No eviction policy, size-based | 14 | | **FreeCache** | Size: 128MB | Segmented LRU | 15 | | **go-cache** | DefaultExpiration: 1 hour
CleanupInterval: 5 min | Simple map-based with cleanup | 16 | 17 | **Test Environment (latest run):** 18 | - **CPU:** Apple M4 Max (arm64) 19 | - **OS:** macOS (Darwin arm64) 20 | - **Go Version:** 1.24.7 21 | - **Benchmark knobs:** `go test -bench … -benchmem`, 16-way parallelism, `-benchtime` 5s (core workloads) / 3s (stress suites) 22 | - **Kioshun config:** AdmissionLFU, `ShardCount = runtime.NumCPU() * 4` (64 shards), `MaxSize = 100 000` 23 | 24 | ## Running Benchmarks 25 | 26 | ```bash 27 | # Run comparison benchmarks 28 | make bench-compare 29 | 30 | # Run stress tests 31 | make stress-test 32 | 33 | # Run all benchmarks with the benchmark runner 34 | make bench-runner 35 | 36 | # Run all benchmark tests 37 | make bench 38 | ``` 39 | 40 | ## Core Operations 41 | 42 | ### SET Operations 43 | | Cache Library | Ops/sec | ns/op | B/op | allocs/op | 44 | |---------------|---------|-------|------|-----------| 45 | | **Kioshun** | 100,000,000 | 75.55 | 41 | 3 | 46 | | **FreeCache** | 81,768,051 | 74.19 | 24 | 1 | 47 | | **Ristretto** | 58,714,996 | 90.86 | 154 | 5 | 48 | | **BigCache** | 37,852,590 | 151.5 | 40 | 2 | 49 | | **go-cache** | 19,841,619 | 341.0 | 57 | 3 | 50 | 51 | ### GET Operations 52 | | Cache Library | Ops/sec | ns/op | B/op | allocs/op | 53 | |---------------|---------|-------|------|-----------| 54 | | **Ristretto** | 244,472,186 | 23.09 | 31 | 2 | 55 | | **Kioshun** | 239,967,180 | 25.87 | 31 | 2 | 56 | | **FreeCache** | 77,851,767 | 79.62 | 1,039 | 2 | 57 | | **BigCache** | 76,458,728 | 76.81 | 1,047 | 3 | 58 | | **go-cache** | 44,541,900 | 136.6 | 15 | 1 | 59 | 60 | ## Workload-Specific 61 | 62 | ### Mixed Operations (70% reads, 30% writes) 63 | | Cache Library | Ops/sec | ns/op | B/op | allocs/op | 64 | |---------------|---------|-------|------|-----------| 65 | | **Kioshun** | 114,716,242 | 51.47 | 31 | 2 | 66 | | **Ristretto** | 96,006,397 | 62.33 | 69 | 3 | 67 | | **FreeCache** | 80,013,957 | 73.54 | 732 | 2 | 68 | | **BigCache** | 38,290,142 | 150.0 | 742 | 3 | 69 | | **go-cache** | 30,545,562 | 200.3 | 22 | 2 | 70 | 71 | ### High Contention Scenarios 72 | | Cache Library | Ops/sec | ns/op | B/op | allocs/op | 73 | |---------------|---------|-------|------|-----------| 74 | | **Kioshun** | 85,443,963 | 77.03 | 34 | 2 | 75 | | **FreeCache** | 68,861,860 | 87.68 | 554 | 1 | 76 | | **BigCache** | 36,476,380 | 154.0 | 568 | 2 | 77 | | **go-cache** | 29,068,076 | 228.2 | 33 | 1 | 78 | | **Ristretto** | 27,175,748 | 223.5 | 83 | 3 | 79 | 80 | ### Read-Heavy Workloads (90% reads, 10% writes) 81 | | Cache Library | Ops/sec | ns/op | B/op | allocs/op | 82 | |---------------|---------|-------|------|-----------| 83 | | **Ristretto** | 101,089,580 | 33.34 | 45 | 3 | 84 | | **Kioshun** | 97,650,378 | 39.92 | 31 | 2 | 85 | | **FreeCache** | 46,611,218 | 76.60 | 937 | 2 | 86 | | **BigCache** | 26,093,739 | 132.6 | 946 | 3 | 87 | | **go-cache** | 19,943,032 | 180.8 | 18 | 2 | 88 | 89 | ### Write-Heavy Workloads (90% writes, 10% reads) 90 | | Cache Library | Ops/sec | ns/op | B/op | allocs/op | 91 | |---------------|---------|-------|------|-----------| 92 | | **Kioshun** | 96,439,025 | 36.25 | 31 | 2 | 93 | | **FreeCache** | 52,917,732 | 66.29 | 118 | 2 | 94 | | **Ristretto** | 22,717,962 | 147.7 | 133 | 5 | 95 | | **BigCache** | 21,079,129 | 167.2 | 133 | 3 | 96 | | **go-cache** | 14,755,354 | 231.2 | 37 | 2 | 97 | 98 | ## Simulate 'Real-World' Workflow 99 | 100 | ### Real-World Workload Simulation 101 | | Cache Library | Ops/sec | ns/op | B/op | allocs/op | 102 | 
|---------------|---------|-------|------|-----------|
103 | | **Kioshun** | 53,742,550 | 65.25 | 48 | 3 |
104 | | **FreeCache** | 44,717,696 | 85.09 | 738 | 2 |
105 | | **Ristretto** | 29,713,388 | 112.0 | 96 | 3 |
106 | | **BigCache** | 21,115,576 | 185.9 | 818 | 3 |
107 | | **go-cache** | 16,147,178 | 230.7 | 40 | 2 |
108 |
109 | ### Memory Efficiency
110 | | Cache Library | Ops/sec | bytes/op |
111 | |---------------|---------|----------|
112 | | **Kioshun** | 45,916,828 | **40.0** |
113 | Value size sweep (1–64 KB) held steady at ~67–70 ns/op with 40 B/op and 2 allocs/op.
114 |
115 | ## Performance Characteristics (Kioshun AdmissionLFU)
116 |
117 | - ~36–77 ns/op on write-heavy or high-contention microbenchmarks, ~26 ns/op on pure GETs
118 | - ~53 M ops/sec in the mixed “real-world” pattern (65 ns/op average)
119 | - Peak GET throughput observed: ~240 M ops/sec (25.87 ns/op, per the GET table above)
120 |
121 | ## Stress Test Results
122 |
123 | ### High Load Scenarios
124 | | Load Profile | Ops/sec | ns/op | B/op | allocs/op | Description |
125 | |-------------|---------|-------|------|-----------|-------------|
126 | | **Small + High Concurrency** | 55,777,849 | 61.52 | 27 | 2 | Many goroutines, small cache |
127 | | **Medium + Mixed Load** | 53,624,493 | 66.63 | 31 | 2 | Balanced read/write operations |
128 | | **Large + Read Heavy** | 64,021,102 | 55.19 | 38 | 2 | Large cache, mostly reads |
129 | | **XLarge + Write Heavy** | 40,838,030 | 80.50 | 40 | 3 | Very large cache, mostly writes |
130 | | **Extreme + Balanced** | 42,276,840 | 85.33 | 40 | 3 | Maximum scale, balanced ops |
131 |
132 | ### Advanced Stress Test Results
133 |
134 | #### Contention Stress Test
135 | | Test | Ops/sec | ns/op | B/op | allocs/op |
136 | |------|---------|-------|------|-----------|
137 | | **High Contention** | 40,442,905 | 83.94 | 34 | 2 |
138 |
139 | #### Eviction Policy Performance
140 | | Eviction Policy | Ops/sec | ns/op | B/op | allocs/op |
141 | |-----------------|---------|-------|------|-----------|
142 | | **FIFO** | **42,899,701** | 82.10 | 46 | 3 |
143 | | **AdmissionLFU** | 41,337,319 | 177.0 | 59 | 3 |
144 | | **LRU** | 31,638,396 | 153.1 | 57 | 3 |
145 | | **LFU** | 24,112,208 | 194.8 | 57 | 3 |
146 |
147 | #### Memory Pressure Tests
148 | | Value Size | Ops/sec | ns/op | B/op | allocs/op |
149 | |------------|---------|-------|------|-----------|
150 | | **1KB** | 45,916,828 | 69.71 | 40 | 2 |
151 | | **4KB** | 58,272,031 | 68.42 | 40 | 2 |
152 | | **16KB** | 55,135,164 | 68.89 | 40 | 2 |
153 | | **64KB** | 57,774,400 | 67.40 | 40 | 2 |
154 |
155 | #### Sharding Efficiency Analysis
156 | | Shards | Ops/sec | ns/op | B/op | allocs/op |
157 | |--------|---------|-------|------|-----------|
158 | | **1** | 15,451,604 | 341.2 | 45 | 3 |
159 | | **2** | 15,700,284 | 299.6 | 44 | 3 |
160 | | **4** | 20,301,433 | 205.3 | 45 | 3 |
161 | | **8** | 27,256,491 | 145.4 | 45 | 3 |
162 | | **16** | 35,702,301 | 115.4 | 46 | 3 |
163 | | **32** | 41,248,432 | 91.13 | 46 | 3 |
164 | | **64** | 53,728,068 | 76.04 | 47 | 3 |
165 | | **128** | 66,081,164 | **62.31** | 47 | 3 |
166 | --------------------------------------------------------------------------------
/fnv_test.go:
--------------------------------------------------------------------------------
1 | package cache
2 |
3 | import (
4 | "strconv"
5 | "testing"
6 | )
7 |
8 | func TestFnvHash64(t *testing.T) {
9 | tests := []struct {
10 | name string
11 | input string
12 | }{
13 | {
14 | name: "empty string",
15 | input: "",
16 | },
17 | {
18 | name: "single character",
19 | input: "a", 20 | }, 21 | { 22 | name: "short string", 23 | input: "test", 24 | }, 25 | { 26 | name: "longer string", 27 | input: "Hello, World!", 28 | }, 29 | { 30 | name: "numeric string", 31 | input: "12345", 32 | }, 33 | { 34 | name: "special characters", 35 | input: "!@#$%^&*()", 36 | }, 37 | { 38 | name: "unicode characters", 39 | input: "🚀🌟💫", 40 | }, 41 | } 42 | 43 | for _, tt := range tests { 44 | t.Run(tt.name, func(t *testing.T) { 45 | result := fnvHash64(tt.input) 46 | if result == 0 && tt.input != "" { 47 | t.Errorf("fnvHash64(%q) returned 0 unexpectedly", tt.input) 48 | } 49 | }) 50 | } 51 | } 52 | 53 | func TestFnvHash64_Consistency(t *testing.T) { 54 | // same input always produces the same hash 55 | testStrings := []string{ 56 | "", 57 | "a", 58 | "test", 59 | "Hello, World!", 60 | "This is a longer test string with various characters!@#$%^&*()", 61 | "unicode: 🚀🌟💫✨🎯", 62 | } 63 | 64 | for _, str := range testStrings { 65 | t.Run("consistency_"+str, func(t *testing.T) { 66 | hash1 := fnvHash64(str) 67 | hash2 := fnvHash64(str) 68 | hash3 := fnvHash64(str) 69 | 70 | if hash1 != hash2 || hash2 != hash3 { 71 | t.Errorf("fnvHash64(%q) not consistent: %d, %d, %d", str, hash1, hash2, hash3) 72 | } 73 | }) 74 | } 75 | } 76 | 77 | func TestFnvHash64_Distribution(t *testing.T) { 78 | // similar strings produce different hashes 79 | testPairs := []struct { 80 | str1, str2 string 81 | }{ 82 | {"abc", "abd"}, 83 | {"test", "Test"}, 84 | {"hello", "Hello"}, 85 | {"123", "124"}, 86 | {"", " "}, 87 | {"a", "aa"}, 88 | {"hello world", "hello world "}, 89 | } 90 | 91 | for _, pair := range testPairs { 92 | t.Run("distribution_"+pair.str1+"_vs_"+pair.str2, func(t *testing.T) { 93 | hash1 := fnvHash64(pair.str1) 94 | hash2 := fnvHash64(pair.str2) 95 | 96 | if hash1 == hash2 { 97 | t.Errorf("fnvHash64(%q) == fnvHash64(%q) = %d (collision)", pair.str1, pair.str2, hash1) 98 | } 99 | }) 100 | } 101 | } 102 | 103 | func TestFnvHash64_CaseSensitive(t *testing.T) { 104 | // hash is case-sensitive 105 | testCases := []struct { 106 | lower, upper string 107 | }{ 108 | {"hello", "HELLO"}, 109 | {"world", "WORLD"}, 110 | {"test", "TEST"}, 111 | {"abc", "ABC"}, 112 | } 113 | 114 | for _, tc := range testCases { 115 | t.Run("case_sensitive_"+tc.lower, func(t *testing.T) { 116 | lowerHash := fnvHash64(tc.lower) 117 | upperHash := fnvHash64(tc.upper) 118 | 119 | if lowerHash == upperHash { 120 | t.Errorf("fnvHash64 is not case-sensitive: %q and %q have same hash %d", tc.lower, tc.upper, lowerHash) 121 | } 122 | }) 123 | } 124 | } 125 | 126 | func TestFnvHash64_LongStrings(t *testing.T) { 127 | longStr := "" 128 | for i := 0; i < 10000; i++ { 129 | longStr += "a" 130 | } 131 | 132 | hash := fnvHash64(longStr) 133 | 134 | // Should not panic and should return a valid hash 135 | if hash == 0 { 136 | t.Error("fnvHash64 returned 0 for long string") 137 | } 138 | 139 | // Test consistency with long strings 140 | hash2 := fnvHash64(longStr) 141 | if hash != hash2 { 142 | t.Error("fnvHash64 not consistent with long strings") 143 | } 144 | } 145 | 146 | func TestFnvHash64_BinaryData(t *testing.T) { 147 | // Test with binary data (null bytes, control characters) 148 | binaryData := []string{ 149 | "\x00\x01\x02\x03", 150 | "\xff\xfe\xfd\xfc", 151 | "\x00hello\x00world\x00", 152 | string([]byte{0, 255, 128, 64, 32, 16, 8, 4, 2, 1}), 153 | } 154 | 155 | for i, data := range binaryData { 156 | t.Run("binary_data_"+strconv.Itoa(i), func(t *testing.T) { 157 | hash := fnvHash64(data) 158 | 159 | // Should not panic 
and should return a valid hash 160 | if hash == 0 { 161 | t.Error("fnvHash64 returned 0 for binary data") 162 | } 163 | 164 | // Test consistency 165 | hash2 := fnvHash64(data) 166 | if hash != hash2 { 167 | t.Error("fnvHash64 not consistent with binary data") 168 | } 169 | }) 170 | } 171 | } 172 | 173 | func TestFnvHash64_EdgeCases(t *testing.T) { 174 | // Test edge cases 175 | edgeCases := []string{ 176 | "", // empty string 177 | " ", // single space 178 | "\n", // newline 179 | "\t", // tab 180 | "\r\n", // windows line ending 181 | "a", // single character 182 | "aa", // repeated character 183 | "aaa", // multiple repeated characters 184 | "aaaa", // even more repeated characters 185 | "abcd", // simple sequence 186 | "dcba", // reverse sequence 187 | "1234567890", // numbers 188 | "!@#$%^&*()", // special characters 189 | } 190 | 191 | hashes := make(map[uint64]string) 192 | 193 | for _, str := range edgeCases { 194 | t.Run("edge_case_"+str, func(t *testing.T) { 195 | hash := fnvHash64(str) 196 | 197 | // Check for collisions among edge cases 198 | if existing, exists := hashes[hash]; exists { 199 | t.Errorf("Hash collision: %q and %q both hash to %d", str, existing, hash) 200 | } 201 | hashes[hash] = str 202 | 203 | // Verify consistency 204 | hash2 := fnvHash64(str) 205 | if hash != hash2 { 206 | t.Errorf("fnvHash64(%q) not consistent: %d != %d", str, hash, hash2) 207 | } 208 | }) 209 | } 210 | } 211 | 212 | func TestFnvHash64_Performance(t *testing.T) { 213 | // Test that the function doesn't have obvious performance issues 214 | testStr := "This is a test string for performance testing" 215 | 216 | // Run multiple iterations to check for performance consistency 217 | for i := 0; i < 1000; i++ { 218 | hash := fnvHash64(testStr) 219 | if hash == 0 { 220 | t.Error("Unexpected zero hash") 221 | } 222 | } 223 | } 224 | 225 | // Benchmark tests 226 | func BenchmarkFnvHash64_Short(b *testing.B) { 227 | s := "test" 228 | b.ResetTimer() 229 | for i := 0; i < b.N; i++ { 230 | fnvHash64(s) 231 | } 232 | } 233 | 234 | func BenchmarkFnvHash64_Medium(b *testing.B) { 235 | s := "This is a medium length string for benchmarking" 236 | b.ResetTimer() 237 | for i := 0; i < b.N; i++ { 238 | fnvHash64(s) 239 | } 240 | } 241 | 242 | func BenchmarkFnvHash64_Long(b *testing.B) { 243 | s := "" 244 | for i := 0; i < 1000; i++ { 245 | s += "a" 246 | } 247 | b.ResetTimer() 248 | for i := 0; i < b.N; i++ { 249 | fnvHash64(s) 250 | } 251 | } 252 | 253 | func BenchmarkFnvHash64_VeryLong(b *testing.B) { 254 | s := "" 255 | for i := 0; i < 10000; i++ { 256 | s += "test string " 257 | } 258 | b.ResetTimer() 259 | for i := 0; i < b.N; i++ { 260 | fnvHash64(s) 261 | } 262 | } 263 | 264 | func BenchmarkFnvHash64_Empty(b *testing.B) { 265 | s := "" 266 | b.ResetTimer() 267 | for i := 0; i < b.N; i++ { 268 | fnvHash64(s) 269 | } 270 | } 271 | 272 | func BenchmarkFnvHash64_SingleChar(b *testing.B) { 273 | s := "a" 274 | b.ResetTimer() 275 | for i := 0; i < b.N; i++ { 276 | fnvHash64(s) 277 | } 278 | } 279 | 280 | func BenchmarkFnvHash64_URL(b *testing.B) { 281 | s := "https://example.com/api/v1/users/123?param=value&another=test" 282 | b.ResetTimer() 283 | for i := 0; i < b.N; i++ { 284 | fnvHash64(s) 285 | } 286 | } 287 | 288 | func BenchmarkFnvHash64_UUID(b *testing.B) { 289 | s := "550e8400-e29b-41d4-a716-446655440000" 290 | b.ResetTimer() 291 | for i := 0; i < b.N; i++ { 292 | fnvHash64(s) 293 | } 294 | } 295 | -------------------------------------------------------------------------------- 
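The tests above pin down behavioral properties (nonzero results, determinism, case sensitivity, collision spread) without fixing the exact algorithm. For orientation, a minimal FNV-1a 64-bit hash in Go looks like the sketch below, written against the standard offset basis and prime and only assumed, not verified, to match what `fnv.go` implements:

```go
package cache

// fnvHash64Sketch is an illustrative FNV-1a (64-bit) string hash.
// Assumption: fnv.go follows the canonical FNV-1a algorithm; the
// constants are the standard 64-bit offset basis and prime.
func fnvHash64Sketch(s string) uint64 {
	const (
		offset64 uint64 = 14695981039346656037 // FNV-1a 64-bit offset basis
		prime64  uint64 = 1099511628211        // FNV-1a 64-bit prime
	)
	h := offset64
	for i := 0; i < len(s); i++ {
		h ^= uint64(s[i]) // FNV-1a XORs the byte first...
		h *= prime64      // ...then multiplies by the prime
	}
	return h
}
```

Under this scheme even the empty string hashes to the (nonzero) offset basis, which is why the zero-result checks in the tests above are safe.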
/internal/httpcache/index_test.go: -------------------------------------------------------------------------------- 1 | package httpcache 2 | 3 | import ( 4 | "reflect" 5 | "sort" 6 | "testing" 7 | ) 8 | 9 | func TestNewPatternIndex(t *testing.T) { 10 | pi := NewPatternIndex() 11 | 12 | if pi == nil { 13 | t.Fatal("NewPatternIndex() returned nil") 14 | } 15 | 16 | if pi.root == nil { 17 | t.Fatal("root node is nil") 18 | } 19 | 20 | if pi.root.children == nil { 21 | t.Fatal("root children map is nil") 22 | } 23 | 24 | if pi.root.keys == nil { 25 | t.Fatal("root keys map is nil") 26 | } 27 | } 28 | 29 | func TestNormalizePath(t *testing.T) { 30 | tests := []struct { 31 | name string 32 | path string 33 | expected []string 34 | }{ 35 | {name: "empty path", path: "", expected: []string{}}, 36 | {name: "root path", path: "/", expected: []string{}}, 37 | {name: "single segment", path: "/api", expected: []string{"api"}}, 38 | {name: "multiple segments", path: "/api/v1/users", expected: []string{"api", "v1", "users"}}, 39 | {name: "trailing slash", path: "/api/v1/users/", expected: []string{"api", "v1", "users"}}, 40 | {name: "no leading slash", path: "api/v1/users", expected: []string{"api", "v1", "users"}}, 41 | {name: "multiple slashes", path: "//api//v1//users//", expected: []string{"api", "v1", "users"}}, 42 | } 43 | 44 | for _, tt := range tests { 45 | t.Run(tt.name, func(t *testing.T) { 46 | result := normalizePath(tt.path) 47 | if !reflect.DeepEqual(result, tt.expected) { 48 | t.Errorf("normalizePath(%q) = %v, expected %v", tt.path, result, tt.expected) 49 | } 50 | }) 51 | } 52 | } 53 | 54 | func TestPatternIndex_AddKey(t *testing.T) { 55 | pi := NewPatternIndex() 56 | 57 | // Test adding keys to different paths 58 | pi.AddKey("/api/v1", "key1") 59 | pi.AddKey("/api/v1", "key2") 60 | pi.AddKey("/api/v2", "key3") 61 | pi.AddKey("/", "rootkey") 62 | 63 | // Verify keys were added correctly 64 | keys := pi.GetMatchingKeys("/api/v1") 65 | sort.Strings(keys) 66 | expected := []string{"key1", "key2"} 67 | sort.Strings(expected) 68 | 69 | if !reflect.DeepEqual(keys, expected) { 70 | t.Errorf("Expected keys %v, got %v", expected, keys) 71 | } 72 | 73 | // Test root path 74 | rootKeys := pi.GetMatchingKeys("/") 75 | if len(rootKeys) != 1 || rootKeys[0] != "rootkey" { 76 | t.Errorf("Expected root key [rootkey], got %v", rootKeys) 77 | } 78 | } 79 | 80 | func TestPatternIndex_RemoveKey(t *testing.T) { 81 | pi := NewPatternIndex() 82 | 83 | // Add some keys 84 | pi.AddKey("/api/v1", "key1") 85 | pi.AddKey("/api/v1", "key2") 86 | pi.AddKey("/api/v2", "key3") 87 | 88 | // Remove one key 89 | pi.RemoveKey("/api/v1", "key1") 90 | 91 | // Verify key was removed 92 | keys := pi.GetMatchingKeys("/api/v1") 93 | if len(keys) != 1 || keys[0] != "key2" { 94 | t.Errorf("Expected [key2], got %v", keys) 95 | } 96 | 97 | // Remove non-existent key (should not panic) 98 | pi.RemoveKey("/api/v1", "nonexistent") 99 | 100 | // Remove from non-existent path (should not panic) 101 | pi.RemoveKey("/nonexistent", "key1") 102 | } 103 | 104 | func TestPatternIndex_GetMatchingKeys(t *testing.T) { 105 | pi := NewPatternIndex() 106 | 107 | // Add test data 108 | pi.AddKey("/api/v1/users", "users-key1") 109 | pi.AddKey("/api/v1/users", "users-key2") 110 | pi.AddKey("/api/v1/posts", "posts-key1") 111 | pi.AddKey("/api/v2/users", "v2-users-key1") 112 | pi.AddKey("/static/css", "css-key1") 113 | pi.AddKey("/", "root-key") 114 | 115 | tests := []struct { 116 | name string 117 | pattern string 118 | expected []string 119 | }{ 120 | 
{name: "exact match", pattern: "/api/v1/users", expected: []string{"users-key1", "users-key2"}}, 121 | {name: "wildcard match", pattern: "/api/v1/*", expected: []string{"users-key1", "users-key2", "posts-key1"}}, 122 | {name: "broader wildcard", pattern: "/api/*", expected: []string{"users-key1", "users-key2", "posts-key1", "v2-users-key1"}}, 123 | {name: "root wildcard", pattern: "/*", expected: []string{"users-key1", "users-key2", "posts-key1", "v2-users-key1", "css-key1", "root-key"}}, 124 | {name: "no match", pattern: "/nonexistent", expected: nil}, 125 | {name: "root exact", pattern: "/", expected: []string{"root-key"}}, 126 | {name: "empty pattern", pattern: "", expected: []string{"root-key"}}, 127 | } 128 | 129 | for _, tt := range tests { 130 | t.Run(tt.name, func(t *testing.T) { 131 | result := pi.GetMatchingKeys(tt.pattern) 132 | sort.Strings(result) 133 | sort.Strings(tt.expected) 134 | if !reflect.DeepEqual(result, tt.expected) { 135 | t.Errorf("GetMatchingKeys(%q) = %v, expected %v", tt.pattern, result, tt.expected) 136 | } 137 | }) 138 | } 139 | } 140 | 141 | func TestPatternIndex_Clear(t *testing.T) { 142 | pi := NewPatternIndex() 143 | 144 | // Add some data 145 | pi.AddKey("/api/v1", "key1") 146 | pi.AddKey("/api/v2", "key2") 147 | 148 | // Verify data exists 149 | keys := pi.GetMatchingKeys("/*") 150 | if len(keys) == 0 { 151 | t.Fatal("Expected keys before clear") 152 | } 153 | 154 | // Clear the index 155 | pi.Clear() 156 | 157 | // Verify everything is cleared 158 | keys = pi.GetMatchingKeys("/*") 159 | if len(keys) != 0 { 160 | t.Errorf("Expected no keys after clear, got %v", keys) 161 | } 162 | 163 | // Verify we can still add keys after clear 164 | pi.AddKey("/test", "test-key") 165 | keys = pi.GetMatchingKeys("/test") 166 | if len(keys) != 1 || keys[0] != "test-key" { 167 | t.Errorf("Expected [test-key] after clear and add, got %v", keys) 168 | } 169 | } 170 | 171 | func TestPatternIndex_ConcurrentAccess(t *testing.T) { 172 | pi := NewPatternIndex() 173 | 174 | // Test concurrent reads and writes 175 | done := make(chan bool) 176 | 177 | // Writer goroutine 178 | go func() { 179 | for i := 0; i < 100; i++ { 180 | pi.AddKey("/api/test", "key"+string(rune(i))) 181 | } 182 | done <- true 183 | }() 184 | 185 | // Reader goroutine 186 | go func() { 187 | for i := 0; i < 100; i++ { 188 | pi.GetMatchingKeys("/api/*") 189 | } 190 | done <- true 191 | }() 192 | 193 | <-done 194 | <-done 195 | 196 | // Verify final state 197 | keys := pi.GetMatchingKeys("/api/test") 198 | if len(keys) != 100 { 199 | t.Errorf("Expected 100 keys, got %d", len(keys)) 200 | } 201 | } 202 | 203 | func TestPatternNode_Creation(t *testing.T) { 204 | node := newPatternNode() 205 | if node == nil { 206 | t.Fatal("newPatternNode() returned nil") 207 | } 208 | if node.children == nil { 209 | t.Fatal("children map is nil") 210 | } 211 | if node.keys == nil { 212 | t.Fatal("keys map is nil") 213 | } 214 | if len(node.children) != 0 { 215 | t.Errorf("Expected empty children map, got %d items", len(node.children)) 216 | } 217 | if len(node.keys) != 0 { 218 | t.Errorf("Expected empty keys map, got %d items", len(node.keys)) 219 | } 220 | } 221 | 222 | func BenchmarkPatternIndex_AddKey(b *testing.B) { 223 | pi := NewPatternIndex() 224 | b.ResetTimer() 225 | for i := 0; i < b.N; i++ { 226 | pi.AddKey("/api/v1/users", "key"+string(rune(i))) 227 | } 228 | } 229 | 230 | func BenchmarkPatternIndex_GetMatchingKeys(b *testing.B) { 231 | pi := NewPatternIndex() 232 | for i := 0; i < 1000; i++ { 233 | 
pi.AddKey("/api/v1/users", "key"+string(rune(i))) 234 | } 235 | b.ResetTimer() 236 | for i := 0; i < b.N; i++ { 237 | pi.GetMatchingKeys("/api/v1/users") 238 | } 239 | } 240 | 241 | func BenchmarkPatternIndex_WildcardMatch(b *testing.B) { 242 | pi := NewPatternIndex() 243 | for i := 0; i < 100; i++ { 244 | pi.AddKey("/api/v1/users", "users-key"+string(rune(i))) 245 | pi.AddKey("/api/v1/posts", "posts-key"+string(rune(i))) 246 | pi.AddKey("/api/v2/users", "v2-users-key"+string(rune(i))) 247 | } 248 | b.ResetTimer() 249 | for i := 0; i < b.N; i++ { 250 | pi.GetMatchingKeys("/api/*") 251 | } 252 | } 253 | -------------------------------------------------------------------------------- /CLUSTER.md: -------------------------------------------------------------------------------- 1 | # Kioshun Cluster 2 | 3 | This document describes the distributed cache cluster components and protocols used by `kioshun/cluster`. It focuses on the replication model, failure handling, and the wire protocol. 4 | 5 | ## Usage Model (peer-to-peer) 6 | 7 | - Peer-to-peer mesh: each service instance runs a full cluster peer. Nodes discover each other via `Seeds`, gossip membership/weights, form a weighted rendezvous ring, and replicate directly. There is no coordinator or proxy. 8 | - Identity: every node has a stable `ID` (type `NodeID`) used in membership and the ring. If not provided, `ID` defaults to a 16‑hex digest derived from `PublicURL` (`Config.EnsureID()`). `PublicURL` is the dialable address. 9 | - In-process node: embed a `cluster.Node` in your service. Start it with a unique `PublicURL` and `BindAddr`, configure `Seeds` with known peers, then wrap it with `NewDistributedCache` to call `Set/Get`. 10 | 11 | ### Kioshun Mesh-style vs. Redis-style 12 | 13 | - Redis Cluster: clients stay clients; they compute slot→node and connect over a client protocol. 14 | - Kioshun Cluster: your app becomes a node in the mesh. It gossips, can be chosen as an owner, stores a shard locally, and routes requests to primary owners when needed. 15 | 16 | ### Quickstart 17 | 18 | Run the same service on three servers. Each instance is a peer in the mesh. 19 | 20 | On each server, env: 21 | 22 | ``` 23 | CACHE_BIND=:4443 24 | CACHE_PUBLIC=srv-a:4443 # use srv-b:4443 and srv-c:4443 on other servers 25 | CACHE_SEEDS=srv-a:4443,srv-b:4443,srv-c:4443 26 | CACHE_AUTH=supersecret 27 | ``` 28 | 29 | In code: 30 | 31 | ``` 32 | local := cache.NewWithDefaults[string, []byte]() 33 | 34 | cfg := cluster.Default() 35 | cfg.BindAddr = os.Getenv("CACHE_BIND") 36 | cfg.PublicURL = os.Getenv("CACHE_PUBLIC") 37 | cfg.Seeds = strings.Split(os.Getenv("CACHE_SEEDS"), ",") 38 | cfg.ReplicationFactor = 3 39 | cfg.WriteConcern = 2 40 | cfg.Sec.AuthToken = os.Getenv("CACHE_AUTH") 41 | 42 | node := cluster.NewNode[string, []byte](cfg, cluster.StringKeyCodec[string]{}, local, cluster.BytesCodec{}) 43 | if err := node.Start(); err != nil { panic(err) } 44 | dc := cluster.NewDistributedCache[string, []byte](node) 45 | _ = dc.Set("k", []byte("v"), time.Minute) 46 | v, ok := dc.Get("k") 47 | _ = v; _ = ok 48 | ``` 49 | 50 | Key points: 51 | 52 | - Mesh like: every instance is a peer - it gossips, can be selected as an owner, and stores a shard locally. 53 | - Reachability: `CACHE_PUBLIC` must be routable between peers. 54 | - Durability: tune `ReplicationFactor` and `WriteConcern` (e.g., RF=3, WC=2). 55 | - Security: set the same `CACHE_AUTH` on all peers. Enable TLS in config if required. 
When `AuthToken` is empty and `AllowUnauthenticatedClients` is true, public client RPCs are allowed without Hello; peer‑only RPCs still require Hello.
56 | - Adapter scope: `Clear/Size/Stats` act on the local shard only.
57 |
58 | ## Architecture
59 |
60 | ```
61 | Clients ──▶ Node (API) ──▶ Owners (RF replicas) over TCP/TLS (CBOR frames)
62 | │ │
63 | ├── Gossip/Weights (peer discovery + load-based ring)
64 | └── Hinted Handoff (per‑peer queues)
65 | ```
66 |
67 | Core components:
68 | - Rendezvous ring with dynamic weights chooses RF owners per key.
69 | - LWW replication with HLC versions resolves conflicts deterministically: the highest version wins, monotonically across nodes.
70 | - Hinted handoff replays missed writes to down/unreachable owners.
71 | - Digest‑based backfill repairs divergence at join time and periodically; it is also kicked shortly after membership epoch changes.
72 |
73 | ## Data Flow
74 |
75 | Write path (Set/Delete):
76 | 1. Compute owners via rendezvous (RF replicas, ordered).
77 | 2. If local is primary, apply locally and record HLC version.
78 | 3. Replicate to remaining owners in parallel and wait for WC acknowledgements.
79 | 4. On peer error, enqueue a hinted‑handoff record (per‑peer queue).
80 |
81 | Read path (Get):
82 | 1. If local is primary and key exists, serve locally.
83 | 2. Otherwise route to ring owners. Healthy (non‑penalized) peers are tried first; additional legs are hedged after a small delay.
84 | 3. Responses are decompressed/decoded; "notfound" from an owner is treated as a clean miss.
85 |
86 | ## Failure Handling
87 |
88 | Hinted handoff:
89 | - Per‑peer queues store the newest version per key with an absolute expiry.
90 | - Replay loop drains queues with exponential backoff and a global RPS limit.
91 | - Auto‑pause stops enqueues under high backlog; replay continues to drain.
92 | - Transport timeouts increment a short penalty window per peer. Penalized peers are temporarily deprioritized for reads.
93 |
94 | Backfill/repair:
95 | - Donor builds bucket digests (count, XOR(hash^version)) for a prefix depth.
96 | - Joiner compares to its local digests - on mismatch, pages keys by bucket.
97 | - Keys are imported with versions and absolute expiries; LWW prunes stale entries. Backfill also runs shortly after membership epoch increases to start repair promptly.
98 |
99 | Ring membership:
100 | - Gossip exchanges peer identity (`ID`), current address, seen timestamps, and load.
101 | - Weight updates rebuild a weighted rendezvous ring for owner selection.
102 |
103 | ## Consistency Model
104 |
105 | - Eventual consistency for reads; WC controls write durability/freshness.
106 | - LWW (HLC) ensures monotonic versions across nodes.
107 | - Rebalancer migrates keys to new primaries (preserving TTLs and HLC versions).
108 |
109 | ## Wire Protocol
110 |
111 | Transport:
112 | - Length‑prefixed frames over TCP (optional TLS); each frame carries a CBOR message.
113 | - Initial Hello authenticates/identifies peers when auth is enabled. `MsgHello{FromID, FromAddr, Token}` → `MsgHelloResp{OK, PeerID, Err}`. When `AllowUnauthenticatedClients` is true and no token is set, public client RPCs may be sent as the first frame without Hello. Peer‑only RPCs still require Hello.
114 |
115 | Messages:
116 | - Set/Delete (+Bulk) carry keys, values (for Set), compression flag, absolute expiry, and version (HLC).
117 | - Get/GetBulk return found flag(s), value bytes, compression flags, and optional expiry; they do not carry a version.
118 | - LeaseLoad supports a coordinated loader on the primary with single‑flight leases.
119 | - Gossip exchanges peer list (ID + Addr), load metrics, hot key samples, and epoch. 120 | - BackfillDigest/BackfillKeys implement incremental repair. 121 | 122 | ``` 123 | ┌────────┬───────────────┬───────────────────────────┐ 124 | │ Frame │ 4B length N │ N bytes: CBOR(Message) │ 125 | └────────┴───────────────┴───────────────────────────┘ 126 | 127 | Message Base: { t: MsgType, id: uint64 } 128 | Key/Value: []byte (value may be gzip-compressed; Cp=true) 129 | Set/Delete versions: uint64 (HLC) 130 | ``` 131 | 132 | ## Hinted Handoff 133 | 134 | ``` 135 | enqueue(write→peer) ──▶ per‑peer queue (max items/bytes, TTL, DropPolicy) 136 | │ 137 | replay loop (RPS) ───────────────┘─▶ send → ok: drop; fail: backoff + requeue 138 | ``` 139 | 140 | - Coalesces by key: keeps newest version; older hints replaced in place. 141 | - Drops expired values (SET with already expired E) and aged hints (TTL). 142 | - Auto‑resume when backlog drains below hysteresis threshold. 143 | 144 | ## Backfill 145 | 146 | ``` 147 | Joiner → Donor: BackfillDigest(depth) 148 | Donor → Joiner: Buckets [{prefix, count, hash}] 149 | Joiner compares; for mismatched buckets: 150 | Joiner → Donor: BackfillKeys(prefix, cursor, limit) 151 | Donor → Joiner: Items [{K, V, E, Ver, Cp}] + next cursor 152 | ``` 153 | 154 | - Depth controls bucket granularity (default 2 bytes = 65,536 buckets). 155 | - Cursor is last key‑hash in bucket for stable pagination. 156 | - Typical page sizes: initial pass ~1024, periodic passes ~512. 157 | 158 | ## Rebalance 159 | 160 | - Periodically samples local keys - if primary changed, pushes key to new primary using current HLC and remaining TTL → absolute expiry; deletes local key on success. 161 | 162 | ## Tuning 163 | 164 | - RF ≥ 3, WC ≥ 2 for balanced durability and freshness. 165 | - Handoff: set per‑peer caps and RPS to sustainable values. TTL high enough to cover expected downtimes. 166 | - Backfill: adjust depth for dataset size. Tune page size to donor capacity. 167 | - Timeouts: read/write/idle tuned to network characteristics; inflight caps per peer. 168 | -------------------------------------------------------------------------------- /cluster/bf_join.go: -------------------------------------------------------------------------------- 1 | package cluster 2 | 3 | import ( 4 | "encoding/binary" 5 | "sort" 6 | "time" 7 | 8 | "github.com/cespare/xxhash/v2" 9 | cbor "github.com/fxamacker/cbor/v2" 10 | cache "github.com/unkn0wn-root/kioshun" 11 | ) 12 | 13 | type bucketSig struct { 14 | count uint32 15 | hash uint64 16 | } 17 | 18 | const defaultBackfillDepth = 2 // 65,536 buckets 19 | 20 | // readyPollInterval picks a small poll period relative to configured cadences. 21 | func readyPollInterval(cfg Config) time.Duration { 22 | p := 150 * time.Millisecond 23 | if cfg.GossipInterval > 0 && cfg.GossipInterval/4 < p { 24 | p = cfg.GossipInterval / 4 25 | } 26 | if cfg.WeightUpdate > 0 && cfg.WeightUpdate/4 < p { 27 | p = cfg.WeightUpdate / 4 28 | } 29 | if p < 100*time.Millisecond { 30 | p = 100 * time.Millisecond 31 | } 32 | if p > 500*time.Millisecond { 33 | p = 500 * time.Millisecond 34 | } 35 | return p 36 | } 37 | 38 | // backfillLoop waits until the node has a minimally ready view of the 39 | // cluster (some peers connected or a ring with >1 node), then performs an 40 | // initial state backfill from peers. After startup it periodically runs a 41 | // light repair pass to reconcile keys that may have diverged due to 42 | // membership changes or temporary failures. 
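// A membership epoch increase additionally kicks an immediate light repair
// via n.backfillKick; see the select loop below.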
43 | // Wait for initial membership/ring readiness with a bounded timeout 44 | // to avoid running a no-op backfill before peers and ring are populated. 45 | // Conditions to proceed: at least one peer connected OR ring has >1 node. 46 | func (n *Node[K, V]) backfillLoop() { 47 | timeout := 3 * time.Second 48 | if n.cfg.GossipInterval > 0 { 49 | if d := 3 * n.cfg.GossipInterval; d > timeout { 50 | timeout = d 51 | } 52 | } 53 | 54 | if n.cfg.WeightUpdate > 0 { 55 | if d := 3 * n.cfg.WeightUpdate; d > timeout { 56 | timeout = d 57 | } 58 | } 59 | 60 | deadline := time.Now().Add(timeout) 61 | poll := readyPollInterval(n.cfg) // typically ~150ms 62 | tk := time.NewTicker(poll) 63 | defer tk.Stop() 64 | for { 65 | r := n.ring.Load().(*ring) 66 | if len(n.peerIDs()) > 0 || len(r.nodes) > 1 { 67 | break 68 | } 69 | if time.Now().After(deadline) { 70 | break 71 | } 72 | select { 73 | case <-tk.C: 74 | case <-n.stop: 75 | return 76 | } 77 | } 78 | 79 | n.backfillOnce(defaultBackfillDepth, 1024) 80 | 81 | iv := n.cfg.BackfillInterval 82 | if iv <= 0 { 83 | iv = n.cfg.RebalanceInterval 84 | if iv <= 0 { 85 | iv = 30 * time.Second 86 | } 87 | } 88 | t := time.NewTicker(iv) 89 | defer t.Stop() 90 | for { 91 | select { 92 | case <-t.C: 93 | n.backfillOnce(defaultBackfillDepth, 512) // light repair 94 | case <-n.backfillKick: 95 | // triggered by membership epoch increase -> run a light repair promptly. 96 | n.backfillOnce(defaultBackfillDepth, 512) 97 | case <-n.stop: 98 | return 99 | } 100 | } 101 | } 102 | 103 | // backfillOnce reconciles this node's owned keyspace with peers by: 104 | // 1. Computing local per-bucket digests for owned keys. 105 | // 2. Asking each donor for its digests targeted at this node. 106 | // 3. For buckets that differ, paging through donor keys in hash order 107 | // using a cursor, decoding values, and importing successfully decoded 108 | // items into the local shard (and LWW version table when enabled). 109 | // 110 | // The donor list excludes self and peers we are not connected to. 111 | func (n *Node[K, V]) backfillOnce(depth int, page int) { 112 | donors := n.peerIDs() 113 | selfID := n.cfg.ID 114 | tmp := donors[:0] 115 | for _, d := range donors { 116 | if d != selfID && n.getPeer(d) != nil { 117 | tmp = append(tmp, d) 118 | } 119 | } 120 | 121 | donors = tmp 122 | if len(donors) == 0 { 123 | return 124 | } 125 | 126 | // Compute a local view of bucket digests we own to detect divergence. 
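// Each bucket digest is {count, XOR over keys of (hash64 ^ version)}, keyed by
// the first depth bytes of the big-endian key hash; buckets whose digests match
// the donor's can be skipped without transferring any keys.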
127 | local := n.computeLocalDigests(depth) 128 | sort.Slice(donors, func(i, j int) bool { return donors[i] < donors[j] }) 129 | 130 | for _, d := range donors { 131 | pc := n.getPeer(d) 132 | if pc == nil { 133 | continue 134 | } 135 | 136 | req := &MsgBackfillDigestReq{ 137 | Base: Base{T: MTBackfillDigestReq, ID: n.nextReqID()}, 138 | TargetID: string(selfID), 139 | Depth: uint8(depth), 140 | } 141 | 142 | raw, err := pc.request(req, req.ID, n.cfg.Sec.ReadTimeout) 143 | if err != nil { 144 | continue 145 | } 146 | 147 | var dr MsgBackfillDigestResp 148 | if e := cbor.Unmarshal(raw, &dr); e != nil { 149 | continue 150 | } 151 | if dr.NotInRing { 152 | // Donor hasn't integrated us into its ring yet; skip this donor for now 153 | continue 154 | } 155 | if len(dr.Buckets) == 0 { 156 | continue 157 | } 158 | 159 | for _, b := range dr.Buckets { 160 | lp := local[string(b.Prefix)] 161 | if lp.count == b.Count && lp.hash == b.Hash64 { 162 | continue // already in sync for this bucket 163 | } 164 | 165 | // Page through differing buckets using last key-hash cursor to keep 166 | // pagination deterministic and avoid duplicates/skips across pages. 167 | var cursor []byte 168 | for { 169 | kReq := &MsgBackfillKeysReq{ 170 | Base: Base{T: MTBackfillKeysReq, ID: n.nextReqID()}, 171 | TargetID: string(selfID), 172 | Prefix: append([]byte(nil), b.Prefix...), 173 | Limit: page, 174 | Cursor: cursor, 175 | } 176 | 177 | raw2, err := pc.request(kReq, kReq.ID, n.cfg.Sec.ReadTimeout) 178 | if err != nil { 179 | break 180 | } 181 | 182 | var kr MsgBackfillKeysResp 183 | if e := cbor.Unmarshal(raw2, &kr); e != nil { 184 | break 185 | } 186 | if kr.NotInRing { 187 | // Donor not ready; stop paging this bucket from this donor for now 188 | break 189 | } 190 | if len(kr.Items) == 0 { 191 | break 192 | } 193 | 194 | // Decode and import only keys that successfully decode and pass 195 | // size/time limits; errors are skipped to keep repair moving. 196 | toImport := make([]cache.Item[K, V], 0, len(kr.Items)) 197 | for _, kv := range kr.Items { 198 | k, err := n.kc.DecodeKey(kv.K) 199 | if err != nil { 200 | continue 201 | } 202 | 203 | vb, err := n.maybeDecompress(kv.V, kv.Cp) 204 | if err != nil { 205 | continue 206 | } 207 | 208 | v, err := n.codec.Decode(vb) 209 | if err != nil { 210 | continue 211 | } 212 | 213 | toImport = append(toImport, cache.Item[K, V]{ 214 | Key: k, 215 | Val: v, 216 | ExpireAbs: kv.E, 217 | Version: kv.Ver, 218 | }) 219 | } 220 | 221 | if len(toImport) > 0 { 222 | n.local.Import(toImport) 223 | if n.cfg.LWWEnabled { 224 | n.verMu.Lock() 225 | for _, it := range toImport { 226 | n.version[string(n.kc.EncodeKey(it.Key))] = it.Version 227 | } 228 | n.verMu.Unlock() 229 | 230 | last := toImport[len(toImport)-1].Version 231 | if last > 0 { 232 | n.clock.Observe(last) 233 | } 234 | } 235 | // Update our running local digest with imported batch to avoid 236 | // asking for keys we've already reconciled in this pass. 237 | local = n.updateLocalDigestWithBatch(local, depth, toImport) 238 | } 239 | 240 | if len(kr.NextCursor) == 8 { 241 | cursor = append([]byte(nil), kr.NextCursor...) 242 | } else { 243 | break 244 | } 245 | } 246 | } 247 | } 248 | } 249 | 250 | // computeLocalDigests returns an orderless digest per key-hash prefix 251 | // bucket for keys this node currently owns. The digest includes count and 252 | // XOR(hash^version) so that donors and joiners can cheaply detect drift 253 | // without moving all keys. 
Out-of-range depths (<= 0 or > 8) fall back to the default of 2; buckets are
254 | // keyed by the first depth bytes of the 64-bit key hash in
255 | // big-endian order.
256 | func (n *Node[K, V]) computeLocalDigests(depth int) map[string]bucketSig {
257 | if depth <= 0 || depth > 8 {
258 | depth = 2
259 | }
260 |
261 | m := make(map[string]bucketSig, 1<<12)
262 | keys := n.local.Keys()
263 | r := n.ring.Load().(*ring)
264 | selfID := n.cfg.ID
265 |
266 | for _, k := range keys {
267 | h64 := n.hash64Of(k)
268 | if !r.ownsHash(selfID, h64) {
269 | continue
270 | }
271 |
272 | var hb [8]byte
273 | binary.BigEndian.PutUint64(hb[:], h64)
274 | prefix := string(hb[:depth])
275 |
276 | ver := uint64(0)
277 | if n.cfg.LWWEnabled {
278 | kb := n.kc.EncodeKey(k)
279 | n.verMu.RLock()
280 | ver = n.version[string(kb)]
281 | n.verMu.RUnlock()
282 | }
283 |
284 | s := m[prefix]
285 | s.count++
286 | s.hash ^= (h64 ^ ver)
287 | m[prefix] = s
288 | }
289 | return m
290 | }
291 |
292 | // updateLocalDigestWithBatch updates an existing local digest with a set
293 | // of imported items so subsequent comparisons consider already-synced
294 | // keys and avoid re-requesting them in the same backfill run.
295 | func (n *Node[K, V]) updateLocalDigestWithBatch(m map[string]bucketSig, depth int, batch []cache.Item[K, V]) map[string]bucketSig {
296 | for _, it := range batch {
297 | h64 := n.hash64Of(it.Key)
298 | var hb [8]byte
299 | binary.BigEndian.PutUint64(hb[:], h64)
300 | prefix := string(hb[:depth])
301 |
302 | ver := uint64(0)
303 | if n.cfg.LWWEnabled {
304 | ver = it.Version
305 | }
306 |
307 | s := m[prefix]
308 | s.count++
309 | s.hash ^= (h64 ^ ver)
310 | m[prefix] = s
311 | }
312 | return m
313 | }
314 |
315 | func (n *Node[K, V]) hash64Of(k K) uint64 {
316 | if kh, ok := any(n.kc).(KeyHasher[K]); ok {
317 | return kh.Hash64(k)
318 | }
319 | return xxhash.Sum64(n.kc.EncodeKey(k))
320 | }
321 | --------------------------------------------------------------------------------
/cluster/transport.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | "bufio"
5 | "crypto/tls"
6 | "encoding/binary"
7 | "errors"
8 | "io"
9 | "net"
10 | "sync"
11 | "sync/atomic"
12 | "syscall"
13 | "time"
14 |
15 | cbor "github.com/fxamacker/cbor/v2"
16 | )
17 |
18 | const (
19 | penaltyBase = 2 * time.Second // first timeout → 2s
20 | penaltyMax = 8 * time.Second // cap the penalty
21 | backoffWindow = 5 * time.Second // time window to keep growing the streak
22 | )
23 |
24 | type peerConn struct {
25 | addr string // current dial address for this peer
26 | selfID NodeID
27 | selfAddr string
28 | peerID NodeID
29 | conn net.Conn
30 | r *bufio.Reader
31 | w *bufio.Writer
32 | mu sync.Mutex
33 | pend sync.Map // reqID -> chan []byte
34 | closed chan struct{}
35 | maxFrame int
36 | readTO time.Duration
37 | writeTO time.Duration
38 | idleTO time.Duration
39 | inflightCh chan struct{}
40 | token string
41 | penaltyUntil int64
42 | lastTimeout int64
43 | toStreak uint32
44 | }
45 |
46 | // dialPeer establishes a connection, performs Hello (auth+identity),
47 | // and starts the read loop. Returns the learned peerID.
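// The handshake is one request/response pair,
// MsgHello{FromID, FromAddr, Token} → MsgHelloResp{OK, PeerID, Err},
// as described in the Wire Protocol section of CLUSTER.md.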
48 | func dialPeer(selfID NodeID, selfAddr, addr string, tlsConf *tls.Config, maxFrame int, 49 | readTO, writeTO, idleTO time.Duration, inflight int, token string, 50 | ) (*peerConn, NodeID, error) { 51 | d := &net.Dialer{ 52 | Timeout: readTO, 53 | KeepAlive: 45 * time.Second, 54 | Control: func(network, address string, c syscall.RawConn) error { 55 | var ctrlErr error 56 | _ = c.Control(func(fd uintptr) { 57 | _ = syscall.SetsockoptInt(int(fd), syscall.IPPROTO_TCP, syscall.TCP_NODELAY, 1) 58 | }) 59 | return ctrlErr 60 | }, 61 | } 62 | 63 | var c net.Conn 64 | var err error 65 | if tlsConf != nil { 66 | c, err = tls.DialWithDialer(d, "tcp", addr, tlsConf) 67 | } else { 68 | c, err = d.Dial("tcp", addr) 69 | } 70 | if err != nil { 71 | return nil, "", err 72 | } 73 | 74 | if tlsConf == nil { 75 | if tc, ok := c.(*net.TCPConn); ok { 76 | _ = tc.SetNoDelay(true) 77 | _ = tc.SetKeepAlive(true) 78 | _ = tc.SetKeepAlivePeriod(45 * time.Second) 79 | } 80 | } 81 | 82 | pc := &peerConn{ 83 | addr: addr, 84 | selfID: selfID, 85 | selfAddr: selfAddr, 86 | conn: c, 87 | r: bufio.NewReaderSize(c, 64<<10), 88 | w: bufio.NewWriterSize(c, 64<<10), 89 | closed: make(chan struct{}), 90 | maxFrame: maxFrame, 91 | readTO: readTO, 92 | writeTO: writeTO, 93 | idleTO: idleTO, 94 | inflightCh: make(chan struct{}, inflight), 95 | token: token, 96 | } 97 | 98 | if err := pc.hello(); err != nil { 99 | _ = c.Close() 100 | return nil, "", err 101 | } 102 | // start the demultiplexing reader: one goroutine reads frames and routes 103 | // them to the waiting requester channel keyed by Base.ID. 104 | go pc.readLoop() 105 | return pc, pc.peerID, nil 106 | } 107 | 108 | // hello performs an authentication handshake by sending MsgHello and expecting 109 | // a positive MsgHelloResp. 110 | func (p *peerConn) hello() error { 111 | id := uint64(time.Now().UnixNano()) 112 | msg := &MsgHello{ 113 | Base: Base{T: MTHello, ID: id}, 114 | FromID: string(p.selfID), 115 | FromAddr: p.selfAddr, 116 | Token: p.token, 117 | } 118 | raw, err := cborEnc.Marshal(msg) 119 | if err != nil { 120 | return err 121 | } 122 | 123 | if err := p.writeFrame(raw); err != nil { 124 | return err 125 | } 126 | 127 | respRaw, err := p.readFrame() 128 | if err != nil { 129 | return err 130 | } 131 | 132 | var base Base 133 | if err := cbor.Unmarshal(respRaw, &base); err != nil { 134 | return err 135 | } 136 | 137 | if base.T != MTHelloResp { 138 | return errors.New("bad hello resp") 139 | } 140 | 141 | var hr MsgHelloResp 142 | if err := cbor.Unmarshal(respRaw, &hr); err != nil { 143 | return err 144 | } 145 | if !hr.OK { 146 | if hr.Err == "" { 147 | hr.Err = "unauthorized" 148 | } 149 | return errors.New(hr.Err) 150 | } 151 | 152 | p.peerID = NodeID(hr.PeerID) 153 | return nil 154 | } 155 | 156 | // close closes the underlying connection and marks the peer as closed. 157 | func (p *peerConn) close() { 158 | _ = p.conn.Close() 159 | select { 160 | case <-p.closed: 161 | default: 162 | close(p.closed) 163 | } 164 | } 165 | 166 | // failAll closes all pending request channels and the connection to unblock 167 | // waiters when the connection is no longer usable. 168 | func (p *peerConn) failAll(err error) { 169 | // notify all pending requests that the connection failed. 170 | p.pend.Range(func(_, chAny any) bool { 171 | if ch, ok := chAny.(chan []byte); ok { 172 | // close channel so request() unblocks and returns "peer closed". 
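// (request() observes the close as ok == false on its receive and
// maps it to ErrPeerClosed.)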
173 | close(ch) 174 | } 175 | return true 176 | }) 177 | p.close() 178 | } 179 | 180 | // readLoop continuously reads frames, demultiplexes them by request ID, and 181 | // delivers payloads to the waiting channels created by request(). 182 | func (p *peerConn) readLoop() { 183 | for { 184 | buf, err := p.readFrame() 185 | if err != nil { 186 | p.failAll(err) 187 | return 188 | } 189 | var base Base 190 | if err := cbor.Unmarshal(buf, &base); err != nil { 191 | continue 192 | } 193 | if chAny, ok := p.pend.Load(base.ID); ok { 194 | p.pend.Delete(base.ID) 195 | if ch, ok := chAny.(chan []byte); ok { 196 | ch <- buf 197 | close(ch) 198 | } 199 | } 200 | } 201 | } 202 | 203 | // readFrame reads one length-prefixed frame with deadlines and size checks. 204 | func (p *peerConn) readFrame() ([]byte, error) { 205 | _ = p.conn.SetReadDeadline(time.Now().Add(p.readTO)) 206 | var hdr [4]byte 207 | if _, err := io.ReadFull(p.r, hdr[:]); err != nil { 208 | return nil, err 209 | } 210 | 211 | n := int(binary.BigEndian.Uint32(hdr[:])) 212 | if p.maxFrame > 0 && n > p.maxFrame { 213 | return nil, errors.New("frame too large") 214 | } 215 | 216 | buf := make([]byte, n) 217 | if _, err := io.ReadFull(p.r, buf); err != nil { 218 | return nil, err 219 | } 220 | _ = p.conn.SetReadDeadline(time.Now().Add(p.idleTO)) 221 | return buf, nil 222 | } 223 | 224 | // writeFrame writes one length-prefixed frame with a write deadline. 225 | func (p *peerConn) writeFrame(payload []byte) error { 226 | p.mu.Lock() 227 | defer p.mu.Unlock() 228 | _ = p.conn.SetWriteDeadline(time.Now().Add(p.writeTO)) 229 | return writeFrameBuf(p.w, payload) 230 | } 231 | 232 | // writeFrameBuf writes a frame and flushes the buffered writer. 233 | func writeFrameBuf(w *bufio.Writer, payload []byte) error { 234 | if err := writeFrame(w, payload); err != nil { 235 | return err 236 | } 237 | return w.Flush() 238 | } 239 | 240 | // writeFrame writes a 4-byte big-endian length header followed by payload. 241 | func writeFrame(w io.Writer, payload []byte) error { 242 | var hdr [4]byte 243 | binary.BigEndian.PutUint32(hdr[:], uint32(len(payload))) 244 | if _, err := w.Write(hdr[:]); err != nil { 245 | return err 246 | } 247 | _, err := w.Write(payload) 248 | return err 249 | } 250 | 251 | // request sends a message and waits for a response with matching ID or until 252 | // timeout. It bounds per-peer concurrency via inflightCh and applies timeout 253 | // penalties on repeated expirations to avoid hot-looping on bad peers. 254 | func (p *peerConn) request(msg any, id uint64, timeout time.Duration) ([]byte, error) { 255 | select { 256 | case p.inflightCh <- struct{}{}: 257 | default: 258 | return nil, errors.New("peer inflight limit") 259 | } 260 | defer func() { <-p.inflightCh }() 261 | 262 | sel, err := cborEnc.Marshal(msg) 263 | if err != nil { 264 | return nil, err 265 | } 266 | // each request registers a one-shot channel under its ID; readLoop 267 | // delivers the response or request times out and cleans up the slot. 
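// The 1-slot buffer lets readLoop complete its send even if this caller
// times out between readLoop's pend lookup and the delivery.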
268 | ch := make(chan []byte, 1) 269 | p.pend.Store(id, ch) 270 | 271 | if err := p.writeFrame(sel); err != nil { 272 | p.pend.Delete(id) 273 | return nil, err 274 | } 275 | 276 | timer := time.NewTimer(timeout) 277 | defer timer.Stop() 278 | select { 279 | case resp, ok := <-ch: 280 | if !ok { 281 | return nil, ErrPeerClosed 282 | } 283 | return resp, nil 284 | case <-timer.C: 285 | p.pend.Delete(id) 286 | p.penalizeTimeout() // backoff on repeated timeouts 287 | return nil, ErrTimeout 288 | } 289 | } 290 | 291 | // penalizeTimeout bumps a short penalty - repeated timeouts within backoffWindow 292 | // grow the penalty (2s → 4s → 8s), capped by penaltyMax. O(1), timeout-path only. 293 | func (p *peerConn) penalizeTimeout() { 294 | now := time.Now() 295 | last := time.Unix(0, atomic.LoadInt64(&p.lastTimeout)) 296 | var streak uint32 297 | if now.Sub(last) > backoffWindow { 298 | // stale last-timeout: reset streak to 1 299 | atomic.StoreUint32(&p.toStreak, 1) 300 | streak = 1 301 | } else { 302 | // same window: increment 303 | streak = atomic.AddUint32(&p.toStreak, 1) 304 | } 305 | atomic.StoreInt64(&p.lastTimeout, now.UnixNano()) 306 | 307 | // penalty = base << (streak-1), capped 308 | shift := streak - 1 309 | if shift > 3 { // 2s<<3 = 16s 310 | shift = 3 311 | } 312 | 313 | d := penaltyBase << shift 314 | if d > penaltyMax { 315 | d = penaltyMax 316 | } 317 | atomic.StoreInt64(&p.penaltyUntil, now.Add(d).UnixNano()) 318 | } 319 | 320 | // penalized reports whether the peer is currently under penalty. 321 | func (p *peerConn) penalized() bool { 322 | return time.Now().UnixNano() < atomic.LoadInt64(&p.penaltyUntil) 323 | } 324 | -------------------------------------------------------------------------------- /benchmarks/cluster/direct/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "math/rand" 7 | "os" 8 | "os/signal" 9 | "sort" 10 | "strconv" 11 | "sync" 12 | "sync/atomic" 13 | "syscall" 14 | "time" 15 | 16 | cache "github.com/unkn0wn-root/kioshun" 17 | "github.com/unkn0wn-root/kioshun/cluster" 18 | ) 19 | 20 | type stats struct { 21 | Total int64 22 | Gets int64 23 | Sets int64 24 | Errs int64 25 | Hits int64 26 | Miss int64 27 | WrongValue int64 28 | GetLatUs []int64 29 | SetLatUs []int64 30 | } 31 | 32 | func getenv(k, d string) string { 33 | if v := os.Getenv(k); v != "" { 34 | return v 35 | } 36 | return d 37 | } 38 | 39 | func percentile(vals []int64, p float64) float64 { 40 | if len(vals) == 0 { 41 | return 0 42 | } 43 | cp := append([]int64(nil), vals...) 
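// Sorting a private copy keeps the caller's append-ordered latency slice intact.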
44 | 45 | sort.Slice(cp, func(i, j int) bool { return cp[i] < cp[j] }) 46 | 47 | rank := p * float64(len(cp)-1) 48 | lo := int(rank) 49 | hi := lo + 1 50 | if hi >= len(cp) { 51 | return float64(cp[lo]) 52 | } 53 | 54 | frac := rank - float64(lo) 55 | return float64(cp[lo])*(1-frac) + float64(cp[hi])*frac 56 | } 57 | 58 | func applyNodeEnv(cfg *cluster.Config) { 59 | if v := getenv("REPLICATION_FACTOR", ""); v != "" { 60 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 61 | cfg.ReplicationFactor = n 62 | } 63 | } 64 | 65 | if v := getenv("WRITE_CONCERN", ""); v != "" { 66 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 67 | cfg.WriteConcern = n 68 | } 69 | } 70 | 71 | if v := getenv("READ_MAX_FANOUT", ""); v != "" { 72 | if n, err := strconv.Atoi(v); err == nil && n >= 1 { 73 | cfg.ReadMaxFanout = n 74 | } 75 | } 76 | 77 | if v := getenv("READ_PER_TRY_MS", ""); v != "" { 78 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 79 | cfg.ReadPerTryTimeout = time.Duration(n) * time.Millisecond 80 | } 81 | } 82 | 83 | if v := getenv("READ_HEDGE_DELAY_MS", ""); v != "" { 84 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 85 | cfg.ReadHedgeDelay = time.Duration(n) * time.Millisecond 86 | } 87 | } 88 | 89 | if v := getenv("READ_HEDGE_INTERVAL_MS", ""); v != "" { 90 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 91 | cfg.ReadHedgeInterval = time.Duration(n) * time.Millisecond 92 | } 93 | } 94 | 95 | if v := getenv("WRITE_TIMEOUT_MS", ""); v != "" { 96 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 97 | cfg.Sec.WriteTimeout = time.Duration(n) * time.Millisecond 98 | } 99 | } 100 | 101 | if v := getenv("READ_TIMEOUT_MS", ""); v != "" { 102 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 103 | cfg.Sec.ReadTimeout = time.Duration(n) * time.Millisecond 104 | } 105 | } 106 | 107 | if v := getenv("SUSPICION_AFTER_MS", ""); v != "" { 108 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 109 | cfg.SuspicionAfter = time.Duration(n) * time.Millisecond 110 | } 111 | } 112 | 113 | if v := getenv("WEIGHT_UPDATE_MS", ""); v != "" { 114 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 115 | cfg.WeightUpdate = time.Duration(n) * time.Millisecond 116 | } 117 | } 118 | 119 | if v := getenv("GOSSIP_INTERVAL_MS", ""); v != "" { 120 | if n, err := strconv.Atoi(v); err == nil && n > 0 { 121 | cfg.GossipInterval = time.Duration(n) * time.Millisecond 122 | } 123 | } 124 | } 125 | 126 | func main() { 127 | rand.Seed(time.Now().UnixNano()) 128 | d, _ := time.ParseDuration(getenv("DURATION", "60s")) 129 | conc, _ := strconv.Atoi(getenv("CONCURRENCY", "512")) 130 | keys, _ := strconv.Atoi(getenv("KEYS", "50000")) 131 | setRatio, _ := strconv.Atoi(getenv("SET_RATIO", "10")) 132 | setTTLms, _ := strconv.ParseInt(getenv("SET_TTL_MS", "-1"), 10, 64) 133 | auth := getenv("CACHE_AUTH", "") 134 | 135 | // ports and addresses for the three nodes (within this process) 136 | bind1 := getenv("BIND1", ":7011") 137 | pub1 := getenv("PUB1", "127.0.0.1:7011") 138 | bind2 := getenv("BIND2", ":7012") 139 | pub2 := getenv("PUB2", "127.0.0.1:7012") 140 | bind3 := getenv("BIND3", ":7013") 141 | pub3 := getenv("PUB3", "127.0.0.1:7013") 142 | seeds := []string{pub1, pub2, pub3} 143 | 144 | // create three nodes 145 | mk := func(bind, pub string) *cluster.Node[string, []byte] { 146 | local := cache.NewWithDefaults[string, []byte]() 147 | cfg := cluster.Default() 148 | cfg.BindAddr = bind 149 | cfg.PublicURL = pub 150 | cfg.Seeds = seeds 151 | cfg.Sec.AuthToken = auth 152 | cfg.ID = 
cluster.NodeID(cfg.PublicURL) 153 | cfg.ReplicationFactor = 3 154 | cfg.WriteConcern = 2 155 | cfg.PerConnWorkers = 128 156 | cfg.PerConnQueue = 256 157 | cfg.Sec.MaxInflightPerPeer = 512 158 | applyNodeEnv(&cfg) 159 | 160 | n := cluster.NewNode[string, []byte](cfg, cluster.StringKeyCodec[string]{}, local, cluster.BytesCodec{}) 161 | if err := n.Start(); err != nil { 162 | panic(err) 163 | } 164 | return n 165 | } 166 | 167 | n1 := mk(bind1, pub1) 168 | n2 := mk(bind2, pub2) 169 | n3 := mk(bind3, pub3) 170 | defer n1.Stop() 171 | defer n2.Stop() 172 | defer n3.Stop() 173 | 174 | // wait until ring is usable: best-effort small Set retry 175 | readyCtx, cancelReady := context.WithTimeout(context.Background(), 5*time.Second) 176 | defer cancelReady() 177 | 178 | for { 179 | if err := n1.Set(readyCtx, "__warmup__", []byte("ok"), time.Second); err == nil { 180 | break 181 | } 182 | 183 | select { 184 | case <-time.After(50 * time.Millisecond): 185 | case <-readyCtx.Done(): 186 | } 187 | if readyCtx.Err() != nil { 188 | break 189 | } 190 | } 191 | 192 | // drive load via node1 client API 193 | driver := n1 194 | 195 | deadline := time.Now().Add(d) 196 | ctx, cancel := context.WithCancel(context.Background()) 197 | defer cancel() 198 | 199 | sigCh := make(chan os.Signal, 2) 200 | signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM) 201 | go func() { 202 | <-sigCh 203 | fmt.Println("[RUNNER] signal received, stopping...") 204 | cancel() 205 | }() 206 | 207 | // optional failure injection: stop node3 208 | if ka := getenv("KILL_AFTER", ""); ka != "" { 209 | if after, err := time.ParseDuration(ka); err == nil && after > 0 { 210 | go func() { 211 | t := time.NewTimer(after) 212 | defer t.Stop() 213 | 214 | select { 215 | case <-t.C: 216 | case <-ctx.Done(): 217 | return 218 | } 219 | 220 | fmt.Println("[KILL] stopping node3") 221 | n3.Stop() 222 | }() 223 | } 224 | } 225 | 226 | var st stats 227 | st.GetLatUs = make([]int64, 0, 2_000_000) 228 | st.SetLatUs = make([]int64, 0, 500_000) 229 | latMu := sync.Mutex{} 230 | 231 | // Progress 232 | go func() { 233 | ticker := time.NewTicker(5 * time.Second) 234 | defer ticker.Stop() 235 | 236 | lastTotal := int64(0) 237 | lastTime := time.Now() 238 | for { 239 | select { 240 | case now := <-ticker.C: 241 | t := atomic.LoadInt64(&st.Total) 242 | dt := now.Sub(lastTime).Seconds() 243 | if dt <= 0 { 244 | dt = 1 245 | } 246 | 247 | qps := float64(t-lastTotal) / dt 248 | 249 | fmt.Printf("[PROGRESS] total=%d qps=%.0f hits=%d miss=%d\n", t, qps, atomic.LoadInt64(&st.Hits), atomic.LoadInt64(&st.Miss)) 250 | 251 | lastTotal = t 252 | lastTime = now 253 | if time.Now().After(deadline) { 254 | return 255 | } 256 | case <-ctx.Done(): 257 | return 258 | } 259 | } 260 | }() 261 | 262 | // Workload 263 | seqs := make([]uint64, keys) 264 | wg := sync.WaitGroup{} 265 | for i := 0; i < conc; i++ { 266 | wg.Add(1) 267 | go func(worker int) { 268 | defer wg.Done() 269 | for { 270 | if time.Now().After(deadline) { 271 | return 272 | } 273 | 274 | select { 275 | case <-ctx.Done(): 276 | return 277 | default: 278 | } 279 | 280 | isSet := rand.Intn(100) < setRatio 281 | kidx := rand.Intn(keys) 282 | key := fmt.Sprintf("k%08d", kidx) 283 | if isSet { 284 | seq := atomic.AddUint64(&seqs[kidx], 1) 285 | val := []byte(fmt.Sprintf("v:%s:%d", key, seq)) 286 | ttl := time.Duration(0) 287 | if setTTLms < 0 { 288 | ttl = 0 289 | } else if setTTLms > 0 { 290 | ttl = time.Duration(setTTLms) * time.Millisecond 291 | } 292 | 293 | begin := time.Now() 294 | if err := driver.Set(ctx, key, val, 
ttl); err != nil { 295 | atomic.AddInt64(&st.Errs, 1) 296 | } else { 297 | latMu.Lock() 298 | st.SetLatUs = append(st.SetLatUs, time.Since(begin).Microseconds()) 299 | latMu.Unlock() 300 | atomic.AddInt64(&st.Sets, 1) 301 | atomic.AddInt64(&st.Total, 1) 302 | } 303 | } else { 304 | begin := time.Now() 305 | _, ok, err := driver.Get(ctx, key) 306 | lat := time.Since(begin) 307 | 308 | atomic.AddInt64(&st.Total, 1) 309 | atomic.AddInt64(&st.Gets, 1) 310 | 311 | latMu.Lock() 312 | st.GetLatUs = append(st.GetLatUs, lat.Microseconds()) 313 | latMu.Unlock() 314 | if err != nil { 315 | atomic.AddInt64(&st.Errs, 1) 316 | continue 317 | } 318 | 319 | if ok { 320 | atomic.AddInt64(&st.Hits, 1) 321 | } else { 322 | atomic.AddInt64(&st.Miss, 1) 323 | } 324 | } 325 | } 326 | }(i) 327 | } 328 | wg.Wait() 329 | 330 | // Summarize 331 | latMu.Lock() 332 | getP50 := percentile(st.GetLatUs, 0.50) / 1000 333 | getP95 := percentile(st.GetLatUs, 0.95) / 1000 334 | getP99 := percentile(st.GetLatUs, 0.99) / 1000 335 | getP999 := percentile(st.GetLatUs, 0.999) / 1000 336 | setP50 := percentile(st.SetLatUs, 0.50) / 1000 337 | setP95 := percentile(st.SetLatUs, 0.95) / 1000 338 | setP99 := percentile(st.SetLatUs, 0.99) / 1000 339 | setP999 := percentile(st.SetLatUs, 0.999) / 1000 340 | latMu.Unlock() 341 | 342 | hitRatio := 0.0 343 | if st.Gets > 0 { 344 | hitRatio = float64(st.Hits) / float64(st.Gets) * 100 345 | } 346 | fmt.Println("=== Kioshun Direct Bench Summary ===") 347 | fmt.Printf("Nodes: %s,%s,%s\n", pub1, pub2, pub3) 348 | fmt.Printf("Total: %d | GETs: %d | SETs: %d | Errors: %d\n", st.Total, st.Gets, st.Sets, st.Errs) 349 | fmt.Printf("Hits: %d | Miss: %d | HitRatio=%.2f%%\n", st.Hits, st.Miss, hitRatio) 350 | fmt.Printf("GET p50=%.2fms p95=%.2fms p99=%.2fms p99.9=%.2fms | SET p50=%.2fms p95=%.2fms p99=%.2fms p99.9=%.2fms\n", getP50, getP95, getP99, getP999, setP50, setP95, setP99, setP999) 351 | } 352 | --------------------------------------------------------------------------------