├── go.sum ├── go.mod ├── .gitignore ├── LICENSE ├── example_test.go ├── tree_test.go ├── tree.go ├── merge_test.go ├── merge.go └── README.md /go.sum: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/achille-roussel/kway-go 2 | 3 | go 1.23 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | 23 | *~ 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Achille Roussel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall 
be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package kway_test 2 | 3 | import ( 4 | "fmt" 5 | "iter" 6 | "testing" 7 | 8 | "github.com/achille-roussel/kway-go" 9 | ) 10 | 11 | func ExampleMerge() { 12 | sequence := func(min, max, step int) iter.Seq2[int, error] { 13 | return func(yield func(int, error) bool) { 14 | for i := min; i < max; i += step { 15 | if !yield(i, nil) { 16 | return 17 | } 18 | } 19 | } 20 | } 21 | 22 | for value, err := range kway.Merge( 23 | sequence(0, 5, 1), // 0,1,2,3,4 24 | sequence(1, 5, 2), // 1,3 25 | sequence(2, 5, 3), // 2 26 | ) { 27 | if err != nil { 28 | panic(err) 29 | } 30 | fmt.Printf("%v,", value) 31 | } 32 | 33 | // Output: 34 | // 0,1,1,2,2,3,3,4, 35 | } 36 | 37 | func ExampleMergeSlice() { 38 | sequence := func(min, max, step, size int) iter.Seq2[[]int, error] { 39 | return func(yield func([]int, error) bool) { 40 | values := make([]int, size) 41 | for i := min; i < max; i += step { 42 | for j := range values { 43 | values[j] = i + j 44 | } 45 | if !yield(values, nil) { 46 | return 47 | } 48 | } 49 | } 50 | } 51 | 52 | for values, err := range kway.MergeSlice( 53 | sequence(0, 5, 1, 2), // [0,1],[1,2],[2,3],[3,4],[4,5] 54 | sequence(1, 5, 2, 2), // [1,2],[3,4] 55 | sequence(2, 5, 3, 2), // [2,3] 56 | ) { 57 | if err != 
nil { 58 | panic(err) 59 | } 60 | for _, value := range values { 61 | fmt.Printf("%v,", value) 62 | } 63 | } 64 | 65 | // Output: 66 | // 0,1,1,1,2,2,2,2,3,3,3,3,4,4,4,5, 67 | } 68 | 69 | func ExampleMerge_Channels(t *testing.T) { 70 | sequence := func(min, max, step int) iter.Seq2[int, error] { 71 | values, done := make(chan int), make(chan struct{}) 72 | go func() { 73 | defer close(values) 74 | 75 | for i := min; i < max; i += step { 76 | select { 77 | case values <- i: 78 | case <-done: 79 | } 80 | } 81 | }() 82 | 83 | return func(yield func(int, error) bool) { 84 | for value := range values { 85 | if !yield(value, nil) { 86 | close(done) 87 | for range values { 88 | } // wait for the goroutine to finish 89 | break 90 | } 91 | } 92 | } 93 | } 94 | 95 | for value, err := range kway.Merge( 96 | sequence(0, 5, 1), // 0,1,2,3,4 97 | sequence(1, 5, 2), // 1,3 98 | sequence(2, 5, 3), // 2 99 | ) { 100 | if err != nil { 101 | panic(err) 102 | } 103 | fmt.Printf("%v,", value) 104 | } 105 | 106 | // Output: 107 | // 0,1,1,2,2,3,3,4, 108 | } 109 | -------------------------------------------------------------------------------- /tree_test.go: -------------------------------------------------------------------------------- 1 | package kway 2 | 3 | import ( 4 | "iter" 5 | "slices" 6 | "strings" 7 | "testing" 8 | ) 9 | 10 | func words[T any](values ...T) iter.Seq2[[]T, error] { 11 | return func(yield func([]T, error) bool) { 12 | var v [1]T 13 | for _, v[0] = range values { 14 | if !yield(v[:], nil) { 15 | break 16 | } 17 | } 18 | } 19 | } 20 | 21 | func TestTree(t *testing.T) { 22 | tests := []struct { 23 | scenario string 24 | sequences [][]string 25 | }{ 26 | { 27 | scenario: "empty tree", 28 | sequences: [][]string{}, 29 | }, 30 | 31 | { 32 | scenario: "three sequences with no elements", 33 | sequences: [][]string{{}, {}, {}}, 34 | }, 35 | 36 | { 37 | scenario: "one sequence with one element", 38 | sequences: [][]string{{"a"}}, 39 | }, 40 | 41 | { 42 | scenario: "one 
sequence with three elements", 43 | sequences: [][]string{{"a", "b", "c"}}, 44 | }, 45 | 46 | { 47 | scenario: "three sequences with one element", 48 | sequences: [][]string{{"a"}, {"b"}, {"c"}}, 49 | }, 50 | 51 | { 52 | scenario: "three sequences of three elements", 53 | sequences: [][]string{ 54 | {"a", "d", "g"}, 55 | {"b", "e", "h"}, 56 | {"c", "f", "i"}, 57 | }, 58 | }, 59 | 60 | { 61 | scenario: "one sequence with the first element and a second sequence with the other elements", 62 | sequences: [][]string{ 63 | {"a"}, 64 | {"b", "c", "d", "e", "f", "g", "h", "i"}, 65 | }, 66 | }, 67 | 68 | { 69 | scenario: "one sequence with the last element and a second sequence with the other elements", 70 | sequences: [][]string{ 71 | {"z"}, 72 | {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, 73 | }, 74 | }, 75 | } 76 | 77 | for _, test := range tests { 78 | t.Run(test.scenario, func(t *testing.T) { 79 | var seqs = make([]iter.Seq2[[]string, error], len(test.sequences)) 80 | for i, seq := range test.sequences { 81 | seqs[i] = words(seq...) 82 | } 83 | 84 | var tree = makeTree(seqs...) 85 | var values []string 86 | var buffer [1]string 87 | for { 88 | n, err := tree.next(buffer[:], strings.Compare) 89 | if err != nil { 90 | t.Fatal(err) 91 | } 92 | if n == 0 { 93 | break 94 | } 95 | values = append(values, buffer[0]) 96 | } 97 | 98 | var want []string 99 | for _, seq := range test.sequences { 100 | want = append(want, seq...) 
101 | } 102 | slices.Sort(want) 103 | 104 | if !slices.Equal(values, want) { 105 | t.Errorf("expected replayed values to be in order, got %v, want %v", values, want) 106 | } 107 | }) 108 | } 109 | } 110 | 111 | func TestParent(t *testing.T) { 112 | if p := parent((2 * 10) + 1); p != 10 { 113 | t.Errorf("expected parent of 21 to be 10, got %d", p) 114 | } 115 | if p := parent((2 * 10) + 2); p != 10 { 116 | t.Errorf("expected parent of 22 to be 10, got %d", p) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /tree.go: -------------------------------------------------------------------------------- 1 | package kway 2 | 3 | import ( 4 | "iter" 5 | ) 6 | 7 | type tree[T any] struct { 8 | cursors []cursor[T] 9 | nodes []node 10 | count int 11 | winner node 12 | } 13 | 14 | type node struct { 15 | index int 16 | value int 17 | } 18 | 19 | type cursor[T any] struct { 20 | values []T 21 | err error 22 | next func() ([]T, error, bool) 23 | stop func() 24 | } 25 | 26 | func makeTree[T any](seqs ...iter.Seq2[[]T, error]) tree[T] { 27 | t := tree[T]{ 28 | cursors: make([]cursor[T], len(seqs)), 29 | winner: node{index: -1, value: -1}, 30 | } 31 | 32 | for i, seq := range seqs { 33 | next, stop := iter.Pull2(seq) 34 | t.cursors[i] = cursor[T]{next: next, stop: stop} 35 | } 36 | 37 | t.count = len(t.cursors) 38 | t.nodes = make([]node, 2*len(t.cursors)) 39 | 40 | head := t.nodes[:len(t.nodes)/2] 41 | tail := t.nodes[len(t.nodes)/2:] 42 | 43 | for i := range head { 44 | head[i] = node{index: -1, value: -1} 45 | } 46 | for i := range tail { 47 | tail[i] = node{index: i + len(tail), value: i} 48 | } 49 | return t 50 | } 51 | 52 | func (t *tree[T]) initialize(i int, cmp func(T, T) int) node { 53 | if i >= len(t.nodes) { 54 | return node{index: -1, value: -1} 55 | } 56 | n1 := t.initialize(left(i), cmp) 57 | n2 := t.initialize(right(i), cmp) 58 | if n1.index < 0 && n2.index < 0 { 59 | return t.nodes[i] 60 | } 61 | loser, winner := 
t.playGame(n1, n2, cmp) 62 | t.nodes[i] = loser 63 | return winner 64 | } 65 | 66 | func (t *tree[T]) playGame(n1, n2 node, cmp func(T, T) int) (loser, winner node) { 67 | if n1.value < 0 { 68 | return n1, n2 69 | } 70 | if n2.value < 0 { 71 | return n2, n1 72 | } 73 | c1 := &t.cursors[n1.value] 74 | c2 := &t.cursors[n2.value] 75 | if c1.err != nil { 76 | return n2, n1 77 | } 78 | if c2.err != nil { 79 | return n1, n2 80 | } 81 | if cmp(c1.values[0], c2.values[0]) < 0 { 82 | return n2, n1 83 | } else { 84 | return n1, n2 85 | } 86 | } 87 | 88 | func (t *tree[T]) next(buf []T, cmp func(T, T) int) (n int, err error) { 89 | if len(buf) == 0 || t.count == 0 { 90 | return 0, nil 91 | } 92 | 93 | winner := t.winner 94 | if winner.index < 0 { 95 | for i := range t.cursors { 96 | c := &t.cursors[i] 97 | values, err, ok := nextNonEmptyValues(c.next) 98 | if ok { 99 | c.values, c.err = values, err 100 | } else { 101 | c.stop() 102 | t.nodes[i+len(t.cursors)] = node{index: -1, value: -1} 103 | t.count-- 104 | continue 105 | } 106 | } 107 | if t.count == 0 { 108 | return 0, nil 109 | } 110 | winner = t.initialize(0, cmp) 111 | } 112 | 113 | for n < len(buf) { 114 | c := &t.cursors[winner.value] 115 | 116 | if len(c.values) > 0 { 117 | buf[n] = c.values[0] 118 | n++ 119 | c.values = c.values[1:] 120 | } 121 | 122 | if len(c.values) == 0 { 123 | if err = c.err; err != nil { 124 | c.err = nil 125 | break 126 | } 127 | values, err, ok := nextNonEmptyValues(c.next) 128 | if ok { 129 | c.values, c.err = values, err 130 | } else { 131 | c.stop() 132 | winner.value = -1 133 | t.nodes[winner.index] = node{index: -1, value: -1} 134 | t.count-- 135 | if t.count == 0 { 136 | break 137 | } 138 | } 139 | } 140 | 141 | for offset := parent(winner.index); true; offset = parent(offset) { 142 | player := t.nodes[offset] 143 | 144 | if player.value >= 0 { 145 | if winner.value < 0 { 146 | t.nodes[offset], winner = winner, player 147 | } else { 148 | c1 := &t.cursors[player.value] 149 | c2 := 
&t.cursors[winner.value] 150 | if len(c1.values) == 0 || (len(c2.values) != 0 && cmp(c1.values[0], c2.values[0]) < 0) { 151 | t.nodes[offset], winner = winner, player 152 | } 153 | } 154 | } 155 | 156 | if offset == 0 { 157 | break 158 | } 159 | } 160 | } 161 | 162 | t.winner = winner 163 | return n, err 164 | } 165 | 166 | func (t *tree[T]) stop() { 167 | for _, c := range t.cursors { 168 | c.stop() 169 | } 170 | } 171 | 172 | func parent(i int) int { 173 | return (i - 1) / 2 174 | } 175 | 176 | func left(i int) int { 177 | return (2 * i) + 1 178 | } 179 | 180 | func right(i int) int { 181 | return (2 * i) + 2 182 | } 183 | 184 | func nextNonEmptyValues[T any](next func() ([]T, error, bool)) (values []T, err error, ok bool) { 185 | for { 186 | values, err, ok = next() 187 | if len(values) > 0 || err != nil || !ok { 188 | return values, err, ok 189 | } 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /merge_test.go: -------------------------------------------------------------------------------- 1 | package kway 2 | 3 | import ( 4 | "cmp" 5 | "errors" 6 | "fmt" 7 | "iter" 8 | "slices" 9 | "testing" 10 | "time" 11 | ) 12 | 13 | //go:noinline 14 | func countSlice(n, r int) iter.Seq2[[]int, error] { 15 | return func(yield func([]int, error) bool) { 16 | values := make([]int, r) 17 | for i := range n { 18 | n := i * r 19 | for j := range values { 20 | values[j] = n + j 21 | } 22 | if !yield(values, nil) { 23 | return 24 | } 25 | } 26 | } 27 | } 28 | 29 | //go:noinline 30 | func count(n int) iter.Seq2[int, error] { 31 | return func(yield func(int, error) bool) { 32 | for i := range n { 33 | if !yield(i, nil) { 34 | return 35 | } 36 | } 37 | } 38 | } 39 | 40 | //go:noinline 41 | func sequence(min, max, step int) iter.Seq2[int, error] { 42 | return func(yield func(int, error) bool) { 43 | for i := min; i < max; i += step { 44 | if !yield(i, nil) { 45 | return 46 | } 47 | } 48 | } 49 | } 50 | 51 | func TestMerge(t 
*testing.T) { 52 | for n := range 10 { 53 | t.Run(fmt.Sprint(n), func(t *testing.T) { 54 | seqs := make([]iter.Seq2[int, error], n) 55 | for i := range seqs { 56 | seqs[i] = count(i) 57 | } 58 | 59 | assertCorrectMerge(t, seqs) 60 | }) 61 | } 62 | } 63 | 64 | func TestMerge2(t *testing.T) { 65 | it := func(s []int) iter.Seq2[int, error] { 66 | return func(yield func(int, error) bool) { 67 | for i := range s { 68 | if !yield(s[i], nil) { 69 | return 70 | } 71 | } 72 | } 73 | } 74 | cases := []struct { 75 | name string 76 | s1 []int 77 | s2 []int 78 | }{ 79 | { 80 | name: "interleaved slices", 81 | s1: []int{0, 3}, 82 | s2: []int{2, 5}, 83 | }, 84 | { 85 | name: "interleaved slices", 86 | s1: []int{2, 5}, 87 | s2: []int{0, 3}, 88 | }, 89 | } 90 | for _, c := range cases { 91 | t.Run(c.name, func(t *testing.T) { 92 | seqs := []iter.Seq2[int, error]{it(c.s1), it(c.s2)} 93 | assertCorrectMerge(t, seqs) 94 | }) 95 | } 96 | } 97 | 98 | func assertCorrectMerge(t *testing.T, seqs []iter.Seq2[int, error]) { 99 | want := make([]int, 0) 100 | for _, seq := range seqs { 101 | v, err := values(seq) 102 | if err != nil { 103 | t.Fatal(err) 104 | } 105 | want = append(want, v...) 106 | } 107 | slices.Sort(want) 108 | 109 | seq := Merge(seqs...) 
110 | got, err := values(seq) 111 | if err != nil { 112 | t.Fatal(err) 113 | } 114 | if !slices.Equal(got, want) { 115 | t.Errorf("expected %v, got %v", want, got) 116 | } 117 | } 118 | 119 | func TestMergeContinueAfterError2(t *testing.T) { 120 | errval := errors.New("") 121 | 122 | seq0 := func(yield func(int, error) bool) { 123 | for i := 0; i < 5; i++ { 124 | if !yield(i, nil) { 125 | return 126 | } 127 | } 128 | if !yield(0, errval) { 129 | return 130 | } 131 | for i := 5; i < 10; i++ { 132 | if !yield(i, nil) { 133 | return 134 | } 135 | } 136 | } 137 | 138 | seq1 := func(yield func(int, error) bool) { 139 | for i := 0; i < 10; i++ { 140 | if !yield(i, nil) { 141 | return 142 | } 143 | } 144 | } 145 | 146 | var values []int 147 | var hasError bool 148 | for v, err := range Merge(seq0, seq1) { 149 | if err != nil { 150 | if v != 0 { 151 | t.Errorf("expected 0, got %v", v) 152 | } 153 | if err != errval { 154 | t.Fatal(err) 155 | } 156 | hasError = true 157 | } else { 158 | values = append(values, v) 159 | } 160 | } 161 | 162 | expect := []int{ 163 | 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 164 | 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 165 | } 166 | if !slices.Equal(values, expect) { 167 | t.Errorf("expected %v, got %v", expect, values) 168 | } 169 | if !hasError { 170 | t.Error("expected error") 171 | } 172 | } 173 | 174 | func TestMergeContinueAfterError3(t *testing.T) { 175 | errval := errors.New("") 176 | 177 | seq0 := func(yield func(int, error) bool) { 178 | for i := 0; i < 5; i++ { 179 | if !yield(i, nil) { 180 | return 181 | } 182 | } 183 | if !yield(0, errval) { 184 | return 185 | } 186 | for i := 5; i < 10; i++ { 187 | if !yield(i, nil) { 188 | return 189 | } 190 | } 191 | } 192 | 193 | seq1 := func(yield func(int, error) bool) { 194 | for i := 0; i < 10; i++ { 195 | if !yield(i, nil) { 196 | return 197 | } 198 | } 199 | } 200 | 201 | var values []int 202 | var errCount int 203 | for v, err := range Merge(seq0, seq1, seq0) { 204 | if err != nil { 205 | if v != 0 { 206 | 
t.Errorf("expected 0, got %v", v) 207 | } 208 | if err != errval { 209 | t.Fatal(err) 210 | } 211 | errCount++ 212 | } else { 213 | values = append(values, v) 214 | } 215 | } 216 | 217 | expect := []int{ 218 | 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 219 | 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 220 | } 221 | if !slices.Equal(values, expect) { 222 | t.Errorf("expected %v, got %v", expect, values) 223 | } 224 | if errCount != 2 { 225 | t.Error("expected error") 226 | } 227 | } 228 | 229 | func values[T any](seq iter.Seq2[T, error]) (values []T, err error) { 230 | for v, err := range seq { 231 | if err != nil { 232 | return nil, err 233 | } 234 | values = append(values, v) 235 | } 236 | return values, nil 237 | } 238 | 239 | func BenchmarkMerge1(b *testing.B) { 240 | benchmark(b, func(n int, cmp func(int, int) int) iter.Seq2[int, error] { 241 | return MergeFunc(cmp, count(n)) 242 | }) 243 | } 244 | 245 | func BenchmarkMerge2(b *testing.B) { 246 | benchmark(b, func(n int, cmp func(int, int) int) iter.Seq2[int, error] { 247 | return MergeFunc(cmp, 248 | sequence(0, n-(n/4), 1), 249 | sequence(n/4, n, 2), 250 | ) 251 | }) 252 | } 253 | 254 | func BenchmarkMerge3(b *testing.B) { 255 | benchmark(b, func(n int, cmp func(int, int) int) iter.Seq2[int, error] { 256 | return MergeFunc(cmp, 257 | sequence(0, n, 2), 258 | sequence(n/4, n, 1), 259 | sequence(n/3, n, 3), 260 | ) 261 | }) 262 | } 263 | 264 | func benchmark[V cmp.Ordered](b *testing.B, merge func(int, func(V, V) int) iter.Seq2[V, error]) { 265 | comparisons := 0 266 | compare := func(a, b V) int { 267 | comparisons++ 268 | return cmp.Compare(a, b) 269 | } 270 | start := time.Now() 271 | count := b.N 272 | for _, err := range merge(count, compare) { 273 | if err != nil { 274 | b.Fatal(err) 275 | } 276 | if count--; count == 0 { 277 | break 278 | } 279 | } 280 | if count != 0 { 281 | b.Fatalf("expected %d values, got %d", b.N, b.N-count) 282 | } 283 | duration := time.Since(start) 284 | 
b.ReportMetric(float64(b.N)/duration.Seconds(), "merge/s") 285 | b.ReportMetric(float64(comparisons)/float64(b.N), "comp/op") 286 | } 287 | 288 | func TestMergeSlice(t *testing.T) { 289 | for n := range 10 { 290 | t.Run(fmt.Sprint(n), func(t *testing.T) { 291 | seqs := make([]iter.Seq2[[]int, error], n) 292 | want := make([]int, 0, 2*n) 293 | 294 | for i := range seqs { 295 | seqs[i] = countSlice(i, 10) 296 | v, err := values(count(i * 10)) 297 | if err != nil { 298 | t.Fatal(err) 299 | } 300 | want = append(want, v...) 301 | } 302 | 303 | slices.Sort(want) 304 | seq := MergeSlice(seqs...) 305 | 306 | got, err := concatValues(seq) 307 | if err != nil { 308 | t.Fatal(err) 309 | } 310 | if !slices.Equal(got, want) { 311 | t.Errorf("expected %v, got %v", want, got) 312 | } 313 | }) 314 | } 315 | } 316 | 317 | func concatValues[T any](seq iter.Seq2[[]T, error]) (values []T, err error) { 318 | for v, err := range seq { 319 | if err != nil { 320 | return nil, err 321 | } 322 | values = append(values, v...) 
323 | } 324 | return values, nil 325 | } 326 | 327 | func BenchmarkMergeSlice1(b *testing.B) { 328 | benchmarkSlice(b, func(n int, cmp func(int, int) int) iter.Seq2[[]int, error] { 329 | return MergeSliceFunc(cmp, countSlice(n, 100)) 330 | }) 331 | } 332 | 333 | func BenchmarkMergeSlice2(b *testing.B) { 334 | benchmarkSlice(b, func(n int, cmp func(int, int) int) iter.Seq2[[]int, error] { 335 | return MergeSliceFunc(cmp, 336 | countSlice(n, 100), 337 | countSlice(n, 127), 338 | ) 339 | }) 340 | } 341 | 342 | func BenchmarkMergeSlice3(b *testing.B) { 343 | benchmarkSlice(b, func(n int, cmp func(int, int) int) iter.Seq2[[]int, error] { 344 | return MergeSliceFunc(cmp, 345 | countSlice(n, 100), 346 | countSlice(n, 101), 347 | countSlice(n, 127), 348 | ) 349 | }) 350 | } 351 | 352 | func benchmarkSlice[V cmp.Ordered](b *testing.B, merge func(int, func(V, V) int) iter.Seq2[[]V, error]) { 353 | comparisons := 0 354 | compare := func(a, b V) int { 355 | comparisons++ 356 | return cmp.Compare(a, b) 357 | } 358 | start := time.Now() 359 | count := b.N 360 | for values, err := range merge(count, compare) { 361 | if err != nil { 362 | b.Fatal(err) 363 | } 364 | if count -= len(values); count <= 0 { 365 | break 366 | } 367 | } 368 | if count > 0 { 369 | b.Fatalf("expected %d values, got %d", b.N, b.N-count) 370 | } 371 | duration := time.Since(start) 372 | b.ReportMetric(float64(b.N)/duration.Seconds(), "merge/s") 373 | b.ReportMetric(float64(comparisons)/float64(b.N), "comp/op") 374 | } 375 | -------------------------------------------------------------------------------- /merge.go: -------------------------------------------------------------------------------- 1 | // Package kway implements k-way merge algorithms for range functions. 2 | package kway 3 | 4 | import ( 5 | "cmp" 6 | "iter" 7 | ) 8 | 9 | const ( 10 | // bufferSize is the size of the buffer used to read values from the 11 | // sequences. 
12 | // 13 | // Note: I would like to avoid making this configurable, but I am also 14 | // aware that in latency-sensitive applications, it might be preferable 15 | // to have a smaller buffer size (or none at all), so values are produced 16 | // as soon as they are available. I would like to delay this change until 17 | // there is production data available to prove that it is needed; in my 18 | // experience, k-way merges tend to be used in batch processing systems 19 | // where throughput matters more than latency. One approach I would like 20 | // to experiment with is exponentially growing the buffer size (up to a 21 | // limit), so the merge algorithm can start with a small buffer size which 22 | // allows the first few values to be produced immediately, and then grow 23 | // to optimize for high throughput use cases. 24 | bufferSize = 128 25 | ) 26 | 27 | // Merge merges multiple sequences into one. The sequences must produce ordered 28 | // values. The algorithm complexity is O(n log k), where n is the total number 29 | // of values to merge, and k is the number of sequences. 30 | // 31 | // The implementation is based on a loser-tree data structure, which minimizes 32 | // the number of calls to the comparison function compared to the typical use 33 | // of a min-heap. 34 | // 35 | // The function returns a sequence that yields merged values and is intended to 36 | // be used in a for-range loop: 37 | // 38 | // for v, err := range kway.Merge(seq0, seq1, seq2) { 39 | // if err != nil { 40 | // ... 41 | // } else { 42 | // ... 43 | // } 44 | // } 45 | // 46 | // The algorithm is implemented for sequences of pairs that produce either a 47 | // value or a non-nil error. This design decision was made because k-way merges 48 | // are most often used in distributed streaming systems where each sequence may 49 | // be read from a remote source, and errors could occur when reading the values. 
50 | // For use cases where the sequences cannot produce errors, the conversion is 51 | // straightforward: 52 | // 53 | // func noerr[T any](seq iter.Seq[T]) iter.Seq2[T, error] { 54 | // return func(yield func(T, error) bool) { 55 | // for value := range seq { 56 | // if !yield(value, nil) { 57 | // return 58 | // } 59 | // } 60 | // } 61 | // } 62 | // 63 | // The inner implementation of the merge algorithm does not spawn goroutines to 64 | // concurrently read values from the sequences. In some cases where values are 65 | // retrieved from remote sources, it can become a performance bottleneck because 66 | // the total time for the merge becomes bound by the sum of read latencies. 67 | // In those cases, it is recommended to wrap the sequences so values can be 68 | // retrieved concurrently from the remote sources and pushed into the merge 69 | // algorithm via a channel. 70 | // 71 | // Applications that aim to achieve the highest throughput should use 72 | // MergeSlice instead, as it allows end-to-end batching which greatly amortizes 73 | // the baseline cost of coroutine context switches in the Go runtime. 74 | // 75 | // See MergeFunc for a version of this function that allows the caller to pass 76 | // a custom comparison function. 77 | func Merge[T cmp.Ordered](seqs ...iter.Seq2[T, error]) iter.Seq2[T, error] { 78 | return MergeFunc(cmp.Compare[T], seqs...) 79 | } 80 | 81 | // MergeFunc merges multiple sequences into one using the given comparison 82 | // function to determine the order of values. The sequences must be ordered 83 | // by the same comparison function. 84 | // 85 | // See Merge for more details. 
86 | func MergeFunc[T any](cmp func(T, T) int, seqs ...iter.Seq2[T, error]) iter.Seq2[T, error] { 87 | if len(seqs) == 1 { 88 | return seqs[0] 89 | } 90 | var merged iter.Seq2[[]T, error] 91 | if len(seqs) == 2 { 92 | seq0 := buffer(bufferSize, seqs[0]) 93 | seq1 := buffer(bufferSize, seqs[1]) 94 | merged = merge2(cmp, seq0, seq1) 95 | } else { 96 | bufferedSeqs := make([]iter.Seq2[[]T, error], len(seqs)) 97 | for i, seq := range seqs { 98 | bufferedSeqs[i] = buffer(bufferSize, seq) 99 | } 100 | merged = merge(cmp, bufferedSeqs) 101 | } 102 | return unbuffer(merged) 103 | } 104 | 105 | // MergeSlice merges multiple sequences producing slices of ordered values. 106 | // 107 | // The function is intended to be used in applications that have high-throughput 108 | // requirements. By merging slices instead of individual values, the function 109 | // amortizes the baseline costs such as time spent on coroutine context switch 110 | // in the Go runtime, error checks, etc... 111 | // 112 | // The slices yielded when ranging over the returned function may or may not be 113 | // slices that were produced by the input sequences. The function may choose to 114 | // apply buffering when needed, or pass the slices as-is from the sequences. 115 | // They might also be reused across iterations, which means that the caller 116 | // should not retain the slices beyond the block of a for loop. 117 | // 118 | // For example, this code is incorrect: 119 | // 120 | // var values [][]int 121 | // for vs, err := range kway.MergeSlice(seq0, seq1, seq2) { 122 | // if err != nil { 123 | // ... 124 | // } 125 | // values = append(values, vs) 126 | // } 127 | // // Using values here may not contain the expected data, each slice might 128 | // // point to the same backing array and only contain values from the last 129 | // // iteration. 
130 | // 131 | // Instead, the caller should copy the values into a new slice: 132 | // 133 | // var values []int 134 | // for vs, err := range kway.MergeSlice(seq0, seq1, seq2) { 135 | // if err != nil { 136 | // ... 137 | // } 138 | // values = append(values, vs...) 139 | // } 140 | // 141 | // Due to the increased complexity that derives from using MergeSlice, 142 | // applications should prefer using Merge, which uses the same algorithm as 143 | // MergeSlice internally, and can already achieve very decent throughput. 144 | // 145 | // See Merge for more details. 146 | func MergeSlice[T cmp.Ordered](seqs ...iter.Seq2[[]T, error]) iter.Seq2[[]T, error] { 147 | return MergeSliceFunc(cmp.Compare[T], seqs...) 148 | } 149 | 150 | // MergeSliceFunc merges multiple sequences producing slices of ordered values 151 | // using the given comparison function to determine the order. The sequences 152 | // must be ordered by the same comparison function. 153 | // 154 | // See MergeSlice for more details. 
155 | func MergeSliceFunc[T any](cmp func(T, T) int, seqs ...iter.Seq2[[]T, error]) iter.Seq2[[]T, error] { 156 | switch len(seqs) { 157 | case 1: 158 | return seqs[0] 159 | case 2: 160 | return merge2(cmp, seqs[0], seqs[1]) 161 | default: 162 | return merge(cmp, seqs) 163 | } 164 | } 165 | 166 | func buffer[T any](bufferSize int, seq iter.Seq2[T, error]) iter.Seq2[[]T, error] { 167 | return func(yield func([]T, error) bool) { 168 | buf := make([]T, bufferSize) // allocated per iteration so nested or repeated ranges over the returned sequence never share storage 169 | n := 0 170 | 171 | var err error 172 | for buf[n], err = range seq { 173 | if err != nil { 174 | if !yield(nil, err) { 175 | return 176 | } 177 | } else if n++; n == len(buf) { 178 | if !yield(buf, nil) { 179 | return 180 | } 181 | n = 0 182 | } 183 | } 184 | 185 | if n > 0 { 186 | yield(buf[:n], nil) 187 | } 188 | } 189 | } 190 | 191 | func unbuffer[T any](seq iter.Seq2[[]T, error]) iter.Seq2[T, error] { 192 | return func(yield func(T, error) bool) { 193 | seq(func(values []T, err error) bool { 194 | var value T 195 | if err != nil && !yield(value, err) { 196 | return false 197 | } 198 | for _, value = range values { 199 | if !yield(value, nil) { 200 | return false 201 | } 202 | } 203 | return true 204 | }) 205 | } 206 | } 207 | 208 | func merge2[T any](cmp func(T, T) int, seq0, seq1 iter.Seq2[[]T, error]) iter.Seq2[[]T, error] { 209 | return func(yield func([]T, error) bool) { 210 | next0, stop0 := iter.Pull2(seq0) 211 | defer stop0() 212 | 213 | next1, stop1 := iter.Pull2(seq1) 214 | defer stop1() 215 | 216 | values0, err, ok0 := next0() 217 | if err != nil && !yield(nil, err) { 218 | return 219 | } 220 | 221 | values1, err, ok1 := next1() 222 | if err != nil && !yield(nil, err) { 223 | return 224 | } 225 | 226 | buffer := make([]T, bufferSize) 227 | offset := 0 228 | i0 := 0 229 | i1 := 0 230 | for ok0 && ok1 { 231 | for i0 < len(values0) && i1 < len(values1) { 232 | v0 := values0[i0] 233 | v1 := values1[i1] 234 | 235 | if (offset + 1) >= len(buffer) { 236 | if !yield(buffer[:offset], nil) { 237 | 
return 238 | } 239 | offset = 0 240 | } 241 | 242 | diff := cmp(v0, v1) 243 | switch { 244 | case diff < 0: 245 | buffer[offset] = v0 246 | offset++ 247 | i0++ 248 | case diff > 0: 249 | buffer[offset] = v1 250 | offset++ 251 | i1++ 252 | default: 253 | buffer[offset+0] = v0 254 | buffer[offset+1] = v1 255 | offset += 2 256 | i0++ 257 | i1++ 258 | } 259 | } 260 | 261 | if i0 == len(values0) { 262 | i0 = 0 263 | if values0, err, ok0 = next0(); err != nil && !yield(nil, err) { 264 | return 265 | } 266 | } 267 | 268 | if i1 == len(values1) { 269 | i1 = 0 270 | if values1, err, ok1 = next1(); err != nil && !yield(nil, err) { 271 | return 272 | } 273 | } 274 | } 275 | 276 | if offset > 0 && !yield(buffer[:offset], nil) { 277 | return 278 | } 279 | 280 | values0 = values0[i0:] 281 | values1 = values1[i1:] 282 | 283 | for ok0 && yield(values0, nil) { 284 | if values0, err, ok0 = next0(); err != nil && !yield(nil, err) { 285 | return 286 | } 287 | } 288 | 289 | for ok1 && yield(values1, nil) { 290 | if values1, err, ok1 = next1(); err != nil && !yield(nil, err) { 291 | return 292 | } 293 | } 294 | } 295 | } 296 | 297 | func merge[T any](cmp func(T, T) int, seqs []iter.Seq2[[]T, error]) iter.Seq2[[]T, error] { 298 | return func(yield func([]T, error) bool) { 299 | tree := makeTree(seqs...) 
300 | defer tree.stop() 301 | 302 | buffer := make([]T, bufferSize) 303 | for { 304 | n, err := tree.next(buffer, cmp) 305 | if err == nil && n == 0 { 306 | return 307 | } 308 | if !yield(buffer[:n], err) { 309 | return 310 | } 311 | } 312 | } 313 | } 314 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # kway-go [![Go Reference](https://pkg.go.dev/badge/github.com/achille-roussel/kway-go.svg)](https://pkg.go.dev/github.com/achille-roussel/kway-go) 2 | K-way merge with Go 1.23 range functions 3 | 4 | [bboreham]: https://github.com/bboreham 5 | [godoc]: https://pkg.go.dev/github.com/achille-roussel/kway-go@v0.2.0#pkg-examples 6 | [gophercon]: https://www.gophercon.com/agenda/session/1160355 7 | 8 | ## Installation 9 | 10 | This package is intended to be used as a library and installed with: 11 | ```sh 12 | go get github.com/achille-roussel/kway-go 13 | ``` 14 | 15 | ## Usage 16 | 17 | The package contains variations of the K-way merge algorithm for different 18 | forms of iterator sequences: 19 | 20 | * **Merge** and **MergeFunc** operate on sequences that yield single 21 | values. **Merge** must be used on ordered values, while **MergeFunc** 22 | accepts a comparison function as first argument to customize the 23 | ordering logic. 24 | 25 | * **MergeSlice** and **MergeSliceFunc** are similar functions but operate on 26 | sequences that yield slices of values. These are intended for applications 27 | with higher throughput requirements that use batching or read values from 28 | paging APIs. 29 | 30 | The sequences being merged must each be ordered using the same comparison logic 31 | as the one used for the merge, or the algorithm will not be able to produce an 32 | ordered sequence of values. 
33 | 34 | The following code snippet illustrates how to merge three ordered sequences 35 | into one: 36 | ```go 37 | for v, err := range kway.Merge(seq0, seq1, seq2) { 38 | ... 39 | } 40 | ``` 41 | 42 | More examples are available in the [Go doc][godoc]. 43 | 44 | ### Error Handling 45 | 46 | The merge functions report errors seen from the input sequences, but the 47 | presence of errors does not interrupt the merge operations. When an error 48 | occurs, it is immediately bubbled up to the program, but if more values are 49 | available in the input sequences, the program can continue consuming them after 50 | handling the error. This model delegates the decision of how to handle errors to 51 | the application, allowing it to carry on or abort depending on the error value or 52 | type, for example: 53 | 54 | ```go 55 | for v, err := range kway.Merge(sequences...) { 56 | if err != nil { 57 | // handle the error, the program may choose to break out of the loop 58 | // or carry on to read the next value. 59 | ... 60 | } else { 61 | // a value is available, process it 62 | ... 63 | } 64 | } 65 | ``` 66 | 67 | ## Implementation 68 | 69 | The K-way merge algorithm was inspired by the talk from 70 | [Bryan Boreham][bboreham] at [Gophercon 2023][gophercon], which described 71 | how using a loser-tree instead of a min-heap improved performance of Loki's 72 | merge of log records. 73 | 74 | The `kway-go` package also adds a specialization for cases where the program 75 | is merging exactly two sequences, since this can be implemented as a simple 76 | union of two sets which has a much lower compute and memory footprint. 77 | 78 | ## Performance 79 | 80 | K-way merge is often used in stream processing or database engines to merge 81 | distributed query results into a single ordered result set. 
In those 82 | applications, performance of the underlying algorithms tends to matter: for 83 | example, when performing compaction of sorted records, the merge algorithm is 84 | on the critical path and often where most of the compute is being spent. In that 85 | regard, there are efficiency requirements that the implementation must fulfil to 86 | be a useful solution to those problems. 87 | 88 | > :bulb: While exploring the performance characteristics of the algorithm, it is 89 | > important to keep in mind that absolute numbers are only useful in the context 90 | > where they were collected, since measurements depend on the hardware executing 91 | > the code, and the data being processed. We should use relative performance of 92 | > different benchmarks within a given context as a hint to find opportunities 93 | > for optimizations in production applications, not as universal truths. 94 | 95 | The current implementation has already been optimized to maximize throughput, by 96 | amortizing as much of the baseline costs as possible, and ensuring that CPU time is 97 | spent on the important parts of the algorithm. 98 | 99 | As part of this optimization work, it became apparent that while the Go runtime 100 | implementation of coroutines underneath `iter.Pull2` has a much lower compute 101 | footprint than using channels, it still has a significant overhead when reading 102 | values in tight loops of the merge algorithm. 103 | 104 | This graph shows a preview of the results, the full analysis is described in the 105 | following sections: 106 | 107 | ![image](https://github.com/achille-roussel/kway-go/assets/865510/730da27c-e639-4cfe-878a-9cc5c9287e37) 108 | 109 | 110 | ### Establishing a performance baseline 111 | 112 | To explore performance, let's first establish a baseline. 
We use the throughput 113 | of merging a single sequence, which is simply reading all the values it yields 114 | as a comparison point: 115 | ``` 116 | Merge1 592898557 1.843 ns/op 0 comp/op 542741115 merge/s 117 | ``` 118 | This benchmark shows that on this test machine, the highest theoretical 119 | throughput we can achieve is **~540M merge/s** for one sequence, 120 | **~270M merge/s** when merging two sequences, etc... 121 | 122 | ### Performance analysis of the K-way merge algorithm 123 | 124 | Now comparing the performance of merging two and three sequences: 125 | ``` 126 | Merge2 47742177 24.78 ns/op 0.8125 comp/op 40359389 merge/s 127 | Merge3 27540648 42.23 ns/op 1.864 comp/op 23682342 merge/s 128 | ``` 129 | We observe a significant drop in throughput in comparison with iterating over 130 | a single sequence, with the benchmark now performing **~7x slower** than the 131 | theoretical throughput limit. 132 | 133 | The K-way merge algorithm has a complexity of *O(n∙log(k))*, there would also be 134 | a baseline cost for the added code implementing the merge operations, but almost 135 | an order of magnitude difference seems unexpected. 
136 | 137 | To understand what is happening, we can look into a CPU profile: 138 | ``` 139 | Duration: 3.46s, Total samples = 2.44s (70.45%) 140 | Showing nodes accounting for 2.40s, 98.36% of 2.44s total 141 | Dropped 9 nodes (cum <= 0.01s) 142 | flat flat% sum% cum cum% 143 | 0.30s 12.30% 12.30% 0.72s 29.51% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].merge2[go.shape.int].func3 144 | 0.25s 10.25% 22.54% 0.34s 13.93% github.com/achille-roussel/kway-go.(*tree[go.shape.int]).next 145 | 0.21s 8.61% 31.15% 0.76s 31.15% github.com/achille-roussel/kway-go.sequence.func1 146 | 0.17s 6.97% 38.11% 0.26s 10.66% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].unbuffer[go.shape.int].func6.1 147 | 0.15s 6.15% 44.26% 0.25s 10.25% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func1.1 148 | 0.15s 6.15% 50.41% 0.21s 8.61% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func4.1 149 | 0.14s 5.74% 56.15% 0.23s 9.43% iter.Pull2[go.shape.[]go.shape.int,go.shape.interface { Error string }].func2 150 | 0.13s 5.33% 61.48% 0.13s 5.33% runtime/internal/atomic.(*Uint32).CompareAndSwap (inline) 151 | 0.11s 4.51% 65.98% 0.18s 7.38% iter.Pull2[go.shape.[]go.shape.int,go.shape.interface { Error string }].func1.1 152 | 0.10s 4.10% 70.08% 0.27s 11.07% runtime.coroswitch_m 153 | 0.09s 3.69% 73.77% 0.09s 3.69% github.com/achille-roussel/kway-go.benchmark[go.shape.int].func2 154 | 0.09s 3.69% 77.46% 0.09s 3.69% runtime.coroswitch 155 | 0.08s 3.28% 80.74% 0.11s 4.51% gogo 156 | 0.07s 2.87% 83.61% 0.09s 3.69% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func2.1 157 | 0.06s 2.46% 86.07% 0.06s 2.46% runtime.mapaccess1_fast64 158 | 0.05s 2.05% 88.11% 0.09s 3.69% github.com/achille-roussel/kway-go.benchmark[go.shape.int].func1 159 | 0.04s 1.64% 89.75% 0.04s 1.64% cmp.Compare[go.shape.int] (inline) 160 | 0.04s 1.64% 91.39% 0.04s 1.64% internal/race.Acquire 161 | 0.04s 1.64% 93.03% 
0.04s 1.64% runtime.(*guintptr).cas (inline) 162 | 0.04s 1.64% 94.67% 0.32s 13.11% runtime.mcall 163 | 0.04s 1.64% 96.31% 0.04s 1.64% runtime.save_g 164 | 0.02s 0.82% 97.13% 0.02s 0.82% internal/race.Release 165 | 0.01s 0.41% 97.54% 0.43s 17.62% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].merge[go.shape.int].func5 166 | 0.01s 0.41% 97.95% 0.04s 1.64% github.com/achille-roussel/kway-go.nextNonEmptyValues[go.shape.int] 167 | 0.01s 0.41% 98.36% 0.08s 3.28% runtime/pprof.(*profMap).lookup 168 | 0 0% 98.36% 0.72s 29.51% github.com/achille-roussel/kway-go.BenchmarkMerge2 169 | 0 0% 98.36% 0.43s 17.62% github.com/achille-roussel/kway-go.BenchmarkMerge3 170 | 0 0% 98.36% 0.35s 14.34% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func1 171 | 0 0% 98.36% 0.14s 5.74% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func2 172 | 0 0% 98.36% 0.27s 11.07% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func4 173 | 0 0% 98.36% 1.15s 47.13% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].unbuffer[go.shape.int].func6 174 | 0 0% 98.36% 1.15s 47.13% github.com/achille-roussel/kway-go.benchmark[go.shape.int] 175 | 0 0% 98.36% 0.76s 31.15% iter.Pull2[go.shape.[]go.shape.int,go.shape.interface { Error string }].func1 176 | 0 0% 98.36% 0.76s 31.15% runtime.corostart 177 | ``` 178 | As we can see here, a significant amount of time seems to be spent in the Go 179 | runtime code managing coroutines. While it might be possible to optimize the 180 | runtime, there is a lower bound on how much it can be reduced. 181 | 182 | It is also unlikely that the Go compiler could help here, there are no real 183 | opportunities for inlining or other optimizations. 
184 | 185 | ### Performance optimization of the K-way merge algorithm 186 | 187 | We basically have a very high baseline cost for each operation, with the 188 | hypothesis that it is driven by coroutine context switches implemented in the 189 | runtime, the only thing we can do to improve performance is doing fewer of these. 190 | 191 | This is a typical baseline cost amortization problem: we want to call the 192 | `next` function returned by `iter.Pull2` less often, which can be done by 193 | introducing buffering. Instead of pulling values one at a time, we can 194 | efficiently buffer N values from each sequence in memory, by transposing 195 | the `iter.Seq2[T, error]` sequences into `iter.Seq2[[]T, error]`. The call 196 | to `next` then only needs to happen when we exhaust the buffer, which ends up 197 | amortizing its cost. 198 | 199 | With an internal buffer size of **128** values per sequence: 200 | ``` 201 | Merge2 190103247 6.133 ns/op 0.8333 comp/op 163045156 merge/s 202 | Merge3 95485022 12.74 ns/op 1.864 comp/op 78492807 merge/s 203 | ``` 204 | Now we made the algorithm **3-4x faster**, and have performance within 205 | **1.5 to 2.5x** of the theoretical throughput limit. 206 | 207 | It is interesting to note that the CPU profile didn't seem to indicate that 75% 208 | of the time was spent in the runtime, but reducing the time spent in that code 209 | path has had a non-linear impact on performance. Likely some other CPU 210 | instruction pipeline and caching shenanigans are at play here, possibly impacted 211 | by the atomic compare-and-swap operations in coroutine switches. 
212 | 213 | As expected, the CPU profile now shows that almost no time is spent in the 214 | runtime: 215 | ``` 216 | Duration: 3.17s, Total samples = 2.35s (74.08%) 217 | Showing nodes accounting for 2.28s, 97.02% of 2.35s total 218 | Dropped 22 nodes (cum <= 0.01s) 219 | flat flat% sum% cum cum% 220 | 0.45s 19.15% 19.15% 0.56s 23.83% github.com/achille-roussel/kway-go.(*tree[go.shape.int]).next 221 | 0.43s 18.30% 37.45% 0.43s 18.30% github.com/achille-roussel/kway-go.benchmark[go.shape.int].func2 222 | 0.37s 15.74% 53.19% 0.97s 41.28% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].merge2[go.shape.int].func3 223 | 0.23s 9.79% 62.98% 0.24s 10.21% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func1.1 224 | 0.22s 9.36% 72.34% 0.65s 27.66% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].unbuffer[go.shape.int].func6.1 225 | 0.13s 5.53% 77.87% 0.13s 5.53% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func4.1 226 | 0.12s 5.11% 82.98% 0.21s 8.94% github.com/achille-roussel/kway-go.benchmark[go.shape.int].func1 227 | 0.10s 4.26% 87.23% 0.52s 22.13% github.com/achille-roussel/kway-go.sequence.func1 228 | 0.09s 3.83% 91.06% 0.09s 3.83% cmp.Compare[go.shape.int] (inline) 229 | 0.05s 2.13% 93.19% 0.05s 2.13% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func2.1 230 | 0.03s 1.28% 94.47% 0.06s 2.55% runtime/pprof.(*profMap).lookup 231 | 0.02s 0.85% 95.32% 0.02s 0.85% github.com/achille-roussel/kway-go.parent (inline) 232 | 0.02s 0.85% 96.17% 0.02s 0.85% runtime.asyncPreempt 233 | 0.02s 0.85% 97.02% 0.02s 0.85% runtime.mapaccess1_fast64 234 | 0 0% 97.02% 0.97s 41.28% github.com/achille-roussel/kway-go.BenchmarkMerge2 235 | 0 0% 97.02% 0.76s 32.34% github.com/achille-roussel/kway-go.BenchmarkMerge3 236 | 0 0% 97.02% 0.31s 13.19% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func1 237 | 0 0% 97.02% 0.08s 3.40% 
github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func2 238 | 0 0% 97.02% 0.13s 5.53% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func4 239 | 0 0% 97.02% 0.76s 32.34% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].merge[go.shape.int].func5 240 | 0 0% 97.02% 1.73s 73.62% github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].unbuffer[go.shape.int].func6 241 | 0 0% 97.02% 1.73s 73.62% github.com/achille-roussel/kway-go.benchmark[go.shape.int] 242 | 0 0% 97.02% 0.52s 22.13% iter.Pull2[go.shape.[]go.shape.int,go.shape.interface { Error string }].func1 243 | 0 0% 97.02% 0.52s 22.13% runtime.corostart 244 | ``` 245 | 246 | ### Further optimizations using batch processing 247 | 248 | There is a final performance frontier we can cross. While we are buffering 249 | values internally, the input and output sequences remain `iter.Seq2[T, error]`, 250 | which yield values one by one. Often times in data systems, APIs have pagination 251 | capabilities, or stream processors work on batch of values for the same reason 252 | we added buffering: it reduces the baseline cost of crossing system boundaries. 253 | 254 | If the input sequences are already slices of values, and the output sequence 255 | produces slices of values, we can reduce the internal memory footprint (no need 256 | to allocate memory to buffer the inputs), while also further amortizing the cost 257 | of function calls to yield values in and out of the merge algorithm. 258 | 259 | Applications that fall into those categories can unlock further performance by 260 | using `MergeSlice` instead of `Merge`, which works on `iter.Seq2[[]T, error]` 261 | end-to-end. 262 | 263 | What is interesting with this approach is that in cases where the processing of 264 | inputs and outputs can be batched, this model **can even beat the theoretical 265 | throughput limit**. 
For example, in the benchmarks we've used, the body of the 266 | loop consuming merged values simply counts the results. When consuming slices 267 | there is no need to iterate over the slices and increment the counter by one 268 | each time, we can batch the operation by incrementing the counter by the length 269 | of the slice, achieving much higher throughput than predicted by the baseline: 270 | ``` 271 | MergeSlice2 477720793 2.273 ns/op 0.6688 comp/op 439971259 merge/s 272 | MergeSlice3 150406080 7.945 ns/op 1.667 comp/op 125861613 merge/s 273 | ``` 274 | 275 | > :warning: Keep in mind that to minimize the footprint, `MergeSlice` reuses 276 | > its output buffer, which means that the application cannot retain it beyond 277 | > the body of the loop ranging over the merge function. This can lead to subtle 278 | > bugs that can be difficult to track; `Merge` should always be preferred unless 279 | > there is clear evidence that the increased maintenance cost is worth the 280 | > performance benefits. 281 | --------------------------------------------------------------------------------