├── go.sum
├── go.mod
├── .gitignore
├── LICENSE
├── example_test.go
├── tree_test.go
├── tree.go
├── merge_test.go
├── merge.go
└── README.md


/go.sum:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/achille-roussel/kway-go
2 | 
3 | go 1.23
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # If you prefer the allow list template instead of the deny list, see community template:
 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
 3 | #
 4 | # Binaries for programs and plugins
 5 | *.exe
 6 | *.exe~
 7 | *.dll
 8 | *.so
 9 | *.dylib
10 | 
11 | # Test binary, built with `go test -c`
12 | *.test
13 | 
14 | # Output of the go coverage tool, specifically when used with LiteIDE
15 | *.out
16 | 
17 | # Dependency directories (remove the comment below to include it)
18 | # vendor/
19 | 
20 | # Go workspace file
21 | go.work
22 | 
23 | *~
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Achille Roussel
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
  1 | package kway_test
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"iter"
  6 | 	"testing"
  7 | 
  8 | 	"github.com/achille-roussel/kway-go"
  9 | )
 10 | 
 11 | func ExampleMerge() {
 12 | 	sequence := func(min, max, step int) iter.Seq2[int, error] {
 13 | 		return func(yield func(int, error) bool) {
 14 | 			for i := min; i < max; i += step {
 15 | 				if !yield(i, nil) {
 16 | 					return
 17 | 				}
 18 | 			}
 19 | 		}
 20 | 	}
 21 | 
 22 | 	for value, err := range kway.Merge(
 23 | 		sequence(0, 5, 1), // 0,1,2,3,4
 24 | 		sequence(1, 5, 2), // 1,3
 25 | 		sequence(2, 5, 3), // 2
 26 | 	) {
 27 | 		if err != nil {
 28 | 			panic(err)
 29 | 		}
 30 | 		fmt.Printf("%v,", value)
 31 | 	}
 32 | 
 33 | 	// Output:
 34 | 	// 0,1,1,2,2,3,3,4,
 35 | }
 36 | 
 37 | func ExampleMergeSlice() {
 38 | 	sequence := func(min, max, step, size int) iter.Seq2[[]int, error] {
 39 | 		return func(yield func([]int, error) bool) {
 40 | 			values := make([]int, size)
 41 | 			for i := min; i < max; i += step {
 42 | 				for j := range values {
 43 | 					values[j] = i + j
 44 | 				}
 45 | 				if !yield(values, nil) {
 46 | 					return
 47 | 				}
 48 | 			}
 49 | 		}
 50 | 	}
 51 | 
 52 | 	for values, err := range kway.MergeSlice(
 53 | 		sequence(0, 5, 1, 2), // [0,1],[1,2],[2,3],[3,4],[4,5]
 54 | 		sequence(1, 5, 2, 2), // [1,2],[3,4]
 55 | 		sequence(2, 5, 3, 2), // [2,3]
 56 | 	) {
 57 | 		if err != nil {
 58 | 			panic(err)
 59 | 		}
 60 | 		for _, value := range values {
 61 | 			fmt.Printf("%v,", value)
 62 | 		}
 63 | 	}
 64 | 
 65 | 	// Output:
 66 | 	// 0,1,1,1,2,2,2,2,3,3,3,3,4,4,4,5,
 67 | }
 68 | 
 69 | func ExampleMerge_Channels(t *testing.T) {
 70 | 	sequence := func(min, max, step int) iter.Seq2[int, error] {
 71 | 		values, done := make(chan int), make(chan struct{})
 72 | 		go func() {
 73 | 			defer close(values)
 74 | 
 75 | 			for i := min; i < max; i += step {
 76 | 				select {
 77 | 				case values <- i:
 78 | 				case <-done:
 79 | 				}
 80 | 			}
 81 | 		}()
 82 | 
 83 | 		return func(yield func(int, error) bool) {
 84 | 			for value := range values {
 85 | 				if !yield(value, nil) {
 86 | 					close(done)
 87 | 					for range values {
 88 | 					} // wait for the goroutine to finish
 89 | 					break
 90 | 				}
 91 | 			}
 92 | 		}
 93 | 	}
 94 | 
 95 | 	for value, err := range kway.Merge(
 96 | 		sequence(0, 5, 1), // 0,1,2,3,4
 97 | 		sequence(1, 5, 2), // 1,3
 98 | 		sequence(2, 5, 3), // 2
 99 | 	) {
100 | 		if err != nil {
101 | 			panic(err)
102 | 		}
103 | 		fmt.Printf("%v,", value)
104 | 	}
105 | 
106 | 	// Output:
107 | 	// 0,1,1,2,2,3,3,4,
108 | }
109 | 


--------------------------------------------------------------------------------
/tree_test.go:
--------------------------------------------------------------------------------
  1 | package kway
  2 | 
  3 | import (
  4 | 	"iter"
  5 | 	"slices"
  6 | 	"strings"
  7 | 	"testing"
  8 | )
  9 | 
 10 | func words[T any](values ...T) iter.Seq2[[]T, error] {
 11 | 	return func(yield func([]T, error) bool) {
 12 | 		var v [1]T
 13 | 		for _, v[0] = range values {
 14 | 			if !yield(v[:], nil) {
 15 | 				break
 16 | 			}
 17 | 		}
 18 | 	}
 19 | }
 20 | 
 21 | func TestTree(t *testing.T) {
 22 | 	tests := []struct {
 23 | 		scenario  string
 24 | 		sequences [][]string
 25 | 	}{
 26 | 		{
 27 | 			scenario:  "empty tree",
 28 | 			sequences: [][]string{},
 29 | 		},
 30 | 
 31 | 		{
 32 | 			scenario:  "three sequences with no elements",
 33 | 			sequences: [][]string{{}, {}, {}},
 34 | 		},
 35 | 
 36 | 		{
 37 | 			scenario:  "one sequence with one element",
 38 | 			sequences: [][]string{{"a"}},
 39 | 		},
 40 | 
 41 | 		{
 42 | 			scenario:  "one sequence with three elements",
 43 | 			sequences: [][]string{{"a", "b", "c"}},
 44 | 		},
 45 | 
 46 | 		{
 47 | 			scenario:  "three sequences with one element",
 48 | 			sequences: [][]string{{"a"}, {"b"}, {"c"}},
 49 | 		},
 50 | 
 51 | 		{
 52 | 			scenario: "three sequences of three elements",
 53 | 			sequences: [][]string{
 54 | 				{"a", "d", "g"},
 55 | 				{"b", "e", "h"},
 56 | 				{"c", "f", "i"},
 57 | 			},
 58 | 		},
 59 | 
 60 | 		{
 61 | 			scenario: "one sequence with the first element and a second sequence with the other elements",
 62 | 			sequences: [][]string{
 63 | 				{"a"},
 64 | 				{"b", "c", "d", "e", "f", "g", "h", "i"},
 65 | 			},
 66 | 		},
 67 | 
 68 | 		{
 69 | 			scenario: "one sequence with the last element and a second sequence with the other elements",
 70 | 			sequences: [][]string{
 71 | 				{"z"},
 72 | 				{"a", "b", "c", "d", "e", "f", "g", "h", "i"},
 73 | 			},
 74 | 		},
 75 | 	}
 76 | 
 77 | 	for _, test := range tests {
 78 | 		t.Run(test.scenario, func(t *testing.T) {
 79 | 			var seqs = make([]iter.Seq2[[]string, error], len(test.sequences))
 80 | 			for i, seq := range test.sequences {
 81 | 				seqs[i] = words(seq...)
 82 | 			}
 83 | 
 84 | 			var tree = makeTree(seqs...)
 85 | 			var values []string
 86 | 			var buffer [1]string
 87 | 			for {
 88 | 				n, err := tree.next(buffer[:], strings.Compare)
 89 | 				if err != nil {
 90 | 					t.Fatal(err)
 91 | 				}
 92 | 				if n == 0 {
 93 | 					break
 94 | 				}
 95 | 				values = append(values, buffer[0])
 96 | 			}
 97 | 
 98 | 			var want []string
 99 | 			for _, seq := range test.sequences {
100 | 				want = append(want, seq...)
101 | 			}
102 | 			slices.Sort(want)
103 | 
104 | 			if !slices.Equal(values, want) {
105 | 				t.Errorf("expected replayed values to be in order, got %v, want %v", values, want)
106 | 			}
107 | 		})
108 | 	}
109 | }
110 | 
111 | func TestParent(t *testing.T) {
112 | 	if p := parent((2 * 10) + 1); p != 10 {
113 | 		t.Errorf("expected parent of 21 to be 10, got %d", p)
114 | 	}
115 | 	if p := parent((2 * 10) + 2); p != 10 {
116 | 		t.Errorf("expected parent of 22 to be 10, got %d", p)
117 | 	}
118 | }
119 | 


--------------------------------------------------------------------------------
/tree.go:
--------------------------------------------------------------------------------
  1 | package kway
  2 | 
  3 | import (
  4 | 	"iter"
  5 | )
  6 | 
// tree holds the state of a k-way merge over a set of pulled sequences.
type tree[T any] struct {
	// cursors holds one cursor per input sequence, in the order the
	// sequences were given to makeTree.
	cursors []cursor[T]
	// nodes has twice as many entries as cursors: makeTree fills the first
	// half with empty nodes and the second half with one leaf per cursor.
	nodes []node
	// count starts at len(cursors) and is decremented each time a cursor
	// reaches the end of its sequence.
	count int
	// winner is the node that won the last game; its index is negative
	// until next has initialized the tree.
	winner node
}
 13 | 
// node is an entry of tree.nodes. The pair {index: -1, value: -1} is used
// throughout the tree code to represent an empty node.
type node struct {
	// index is the position of the node's leaf in tree.nodes, or -1.
	index int
	// value is the index of the node's cursor in tree.cursors, or -1.
	value int
}
 18 | 
// cursor tracks the read position in one input sequence of the tree.
type cursor[T any] struct {
	// values holds the values of the current batch that were not consumed yet.
	values []T
	// err is the error that was returned along with the current batch.
	err error
	// next and stop are the functions returned by iter.Pull2 for the sequence.
	next func() ([]T, error, bool)
	stop func()
}
 25 | 
 26 | func makeTree[T any](seqs ...iter.Seq2[[]T, error]) tree[T] {
 27 | 	t := tree[T]{
 28 | 		cursors: make([]cursor[T], len(seqs)),
 29 | 		winner:  node{index: -1, value: -1},
 30 | 	}
 31 | 
 32 | 	for i, seq := range seqs {
 33 | 		next, stop := iter.Pull2(seq)
 34 | 		t.cursors[i] = cursor[T]{next: next, stop: stop}
 35 | 	}
 36 | 
 37 | 	t.count = len(t.cursors)
 38 | 	t.nodes = make([]node, 2*len(t.cursors))
 39 | 
 40 | 	head := t.nodes[:len(t.nodes)/2]
 41 | 	tail := t.nodes[len(t.nodes)/2:]
 42 | 
 43 | 	for i := range head {
 44 | 		head[i] = node{index: -1, value: -1}
 45 | 	}
 46 | 	for i := range tail {
 47 | 		tail[i] = node{index: i + len(tail), value: i}
 48 | 	}
 49 | 	return t
 50 | }
 51 | 
 52 | func (t *tree[T]) initialize(i int, cmp func(T, T) int) node {
 53 | 	if i >= len(t.nodes) {
 54 | 		return node{index: -1, value: -1}
 55 | 	}
 56 | 	n1 := t.initialize(left(i), cmp)
 57 | 	n2 := t.initialize(right(i), cmp)
 58 | 	if n1.index < 0 && n2.index < 0 {
 59 | 		return t.nodes[i]
 60 | 	}
 61 | 	loser, winner := t.playGame(n1, n2, cmp)
 62 | 	t.nodes[i] = loser
 63 | 	return winner
 64 | }
 65 | 
 66 | func (t *tree[T]) playGame(n1, n2 node, cmp func(T, T) int) (loser, winner node) {
 67 | 	if n1.value < 0 {
 68 | 		return n1, n2
 69 | 	}
 70 | 	if n2.value < 0 {
 71 | 		return n2, n1
 72 | 	}
 73 | 	c1 := &t.cursors[n1.value]
 74 | 	c2 := &t.cursors[n2.value]
 75 | 	if c1.err != nil {
 76 | 		return n2, n1
 77 | 	}
 78 | 	if c2.err != nil {
 79 | 		return n1, n2
 80 | 	}
 81 | 	if cmp(c1.values[0], c2.values[0]) < 0 {
 82 | 		return n2, n1
 83 | 	} else {
 84 | 		return n1, n2
 85 | 	}
 86 | }
 87 | 
// next writes the next merged values, ordered by cmp, into buf and returns
// the number of values written.
//
// It returns (0, nil) when buf is empty or when every cursor is exhausted.
// When the winning cursor has no values left and carries an error, the error
// is cleared from the cursor and returned together with the number of values
// written so far.
func (t *tree[T]) next(buf []T, cmp func(T, T) int) (n int, err error) {
	if len(buf) == 0 || t.count == 0 {
		return 0, nil
	}

	winner := t.winner
	// makeTree sets the winner index to -1, so a negative index means the
	// tree has not been initialized yet.
	if winner.index < 0 {
		// Load the first non-empty batch of every cursor. Cursors that end
		// right away are stopped and their leaf is replaced by an empty node.
		for i := range t.cursors {
			c := &t.cursors[i]
			values, err, ok := nextNonEmptyValues(c.next)
			if ok {
				c.values, c.err = values, err
			} else {
				c.stop()
				t.nodes[i+len(t.cursors)] = node{index: -1, value: -1}
				t.count--
				continue
			}
		}
		if t.count == 0 {
			return 0, nil
		}
		winner = t.initialize(0, cmp)
	}

	for n < len(buf) {
		c := &t.cursors[winner.value]

		// Move the front value of the winning cursor to the output buffer.
		if len(c.values) > 0 {
			buf[n] = c.values[0]
			n++
			c.values = c.values[1:]
		}

		if len(c.values) == 0 {
			// The batch is drained: report its error if it has one,
			// otherwise load the next batch of the cursor.
			if err = c.err; err != nil {
				c.err = nil
				break
			}
			values, err, ok := nextNonEmptyValues(c.next)
			if ok {
				c.values, c.err = values, err
			} else {
				// The sequence ended: mark the winner and its leaf as empty
				// so that any remaining player beats it in the replay below.
				c.stop()
				winner.value = -1
				t.nodes[winner.index] = node{index: -1, value: -1}
				t.count--
				if t.count == 0 {
					break
				}
			}
		}

		// Replay the games from the parent of the winner's leaf up to the
		// root, swapping the winner with any stored node that beats it.
		for offset := parent(winner.index); true; offset = parent(offset) {
			player := t.nodes[offset]

			if player.value >= 0 {
				if winner.value < 0 {
					t.nodes[offset], winner = winner, player
				} else {
					c1 := &t.cursors[player.value]
					c2 := &t.cursors[winner.value]
					// A player without values wins (presumably it holds a
					// pending error; verify against playGame). Otherwise the
					// player wins only with a strictly lower front value.
					if len(c1.values) == 0 || (len(c2.values) != 0 && cmp(c1.values[0], c2.values[0]) < 0) {
						t.nodes[offset], winner = winner, player
					}
				}
			}

			if offset == 0 {
				break
			}
		}
	}

	// Remember the winner so the next call resumes from the same state.
	t.winner = winner
	return n, err
}
165 | 
166 | func (t *tree[T]) stop() {
167 | 	for _, c := range t.cursors {
168 | 		c.stop()
169 | 	}
170 | }
171 | 
// parent returns the position of the parent of node i, computed as (i-1)/2
// with Go's truncating integer division (so parent(0) is 0).
func parent(i int) int {
	i--
	return i / 2
}
175 | 
// left returns the position of the left child of node i, which is 2i+1.
func left(i int) int {
	return i + i + 1
}
179 | 
// right returns the position of the right child of node i, which is 2i+2.
func right(i int) int {
	return 2 * (i + 1)
}
183 | 
// nextNonEmptyValues calls next until it returns a non-empty slice, a non-nil
// error, or ok == false, and returns the result of that last call.
func nextNonEmptyValues[T any](next func() ([]T, error, bool)) (values []T, err error, ok bool) {
	values, err, ok = next()
	// Skip batches that carry neither values nor an error.
	for ok && err == nil && len(values) == 0 {
		values, err, ok = next()
	}
	return values, err, ok
}
192 | 


--------------------------------------------------------------------------------
/merge_test.go:
--------------------------------------------------------------------------------
  1 | package kway
  2 | 
  3 | import (
  4 | 	"cmp"
  5 | 	"errors"
  6 | 	"fmt"
  7 | 	"iter"
  8 | 	"slices"
  9 | 	"testing"
 10 | 	"time"
 11 | )
 12 | 
 13 | //go:noinline
 14 | func countSlice(n, r int) iter.Seq2[[]int, error] {
 15 | 	return func(yield func([]int, error) bool) {
 16 | 		values := make([]int, r)
 17 | 		for i := range n {
 18 | 			n := i * r
 19 | 			for j := range values {
 20 | 				values[j] = n + j
 21 | 			}
 22 | 			if !yield(values, nil) {
 23 | 				return
 24 | 			}
 25 | 		}
 26 | 	}
 27 | }
 28 | 
 29 | //go:noinline
 30 | func count(n int) iter.Seq2[int, error] {
 31 | 	return func(yield func(int, error) bool) {
 32 | 		for i := range n {
 33 | 			if !yield(i, nil) {
 34 | 				return
 35 | 			}
 36 | 		}
 37 | 	}
 38 | }
 39 | 
 40 | //go:noinline
 41 | func sequence(min, max, step int) iter.Seq2[int, error] {
 42 | 	return func(yield func(int, error) bool) {
 43 | 		for i := min; i < max; i += step {
 44 | 			if !yield(i, nil) {
 45 | 				return
 46 | 			}
 47 | 		}
 48 | 	}
 49 | }
 50 | 
 51 | func TestMerge(t *testing.T) {
 52 | 	for n := range 10 {
 53 | 		t.Run(fmt.Sprint(n), func(t *testing.T) {
 54 | 			seqs := make([]iter.Seq2[int, error], n)
 55 | 			for i := range seqs {
 56 | 				seqs[i] = count(i)
 57 | 			}
 58 | 
 59 | 			assertCorrectMerge(t, seqs)
 60 | 		})
 61 | 	}
 62 | }
 63 | 
 64 | func TestMerge2(t *testing.T) {
 65 | 	it := func(s []int) iter.Seq2[int, error] {
 66 | 		return func(yield func(int, error) bool) {
 67 | 			for i := range s {
 68 | 				if !yield(s[i], nil) {
 69 | 					return
 70 | 				}
 71 | 			}
 72 | 		}
 73 | 	}
 74 | 	cases := []struct {
 75 | 		name string
 76 | 		s1   []int
 77 | 		s2   []int
 78 | 	}{
 79 | 		{
 80 | 			name: "interleaved slices",
 81 | 			s1:   []int{0, 3},
 82 | 			s2:   []int{2, 5},
 83 | 		},
 84 | 		{
 85 | 			name: "interleaved slices",
 86 | 			s1:   []int{2, 5},
 87 | 			s2:   []int{0, 3},
 88 | 		},
 89 | 	}
 90 | 	for _, c := range cases {
 91 | 		t.Run(c.name, func(t *testing.T) {
 92 | 			seqs := []iter.Seq2[int, error]{it(c.s1), it(c.s2)}
 93 | 			assertCorrectMerge(t, seqs)
 94 | 		})
 95 | 	}
 96 | }
 97 | 
 98 | func assertCorrectMerge(t *testing.T, seqs []iter.Seq2[int, error]) {
 99 | 	want := make([]int, 0)
100 | 	for _, seq := range seqs {
101 | 		v, err := values(seq)
102 | 		if err != nil {
103 | 			t.Fatal(err)
104 | 		}
105 | 		want = append(want, v...)
106 | 	}
107 | 	slices.Sort(want)
108 | 
109 | 	seq := Merge(seqs...)
110 | 	got, err := values(seq)
111 | 	if err != nil {
112 | 		t.Fatal(err)
113 | 	}
114 | 	if !slices.Equal(got, want) {
115 | 		t.Errorf("expected %v, got %v", want, got)
116 | 	}
117 | }
118 | 
119 | func TestMergeContinueAfterError2(t *testing.T) {
120 | 	errval := errors.New("")
121 | 
122 | 	seq0 := func(yield func(int, error) bool) {
123 | 		for i := 0; i < 5; i++ {
124 | 			if !yield(i, nil) {
125 | 				return
126 | 			}
127 | 		}
128 | 		if !yield(0, errval) {
129 | 			return
130 | 		}
131 | 		for i := 5; i < 10; i++ {
132 | 			if !yield(i, nil) {
133 | 				return
134 | 			}
135 | 		}
136 | 	}
137 | 
138 | 	seq1 := func(yield func(int, error) bool) {
139 | 		for i := 0; i < 10; i++ {
140 | 			if !yield(i, nil) {
141 | 				return
142 | 			}
143 | 		}
144 | 	}
145 | 
146 | 	var values []int
147 | 	var hasError bool
148 | 	for v, err := range Merge(seq0, seq1) {
149 | 		if err != nil {
150 | 			if v != 0 {
151 | 				t.Errorf("expected 0, got %v", v)
152 | 			}
153 | 			if err != errval {
154 | 				t.Fatal(err)
155 | 			}
156 | 			hasError = true
157 | 		} else {
158 | 			values = append(values, v)
159 | 		}
160 | 	}
161 | 
162 | 	expect := []int{
163 | 		0, 0, 1, 1, 2, 2, 3, 3, 4, 4,
164 | 		5, 5, 6, 6, 7, 7, 8, 8, 9, 9,
165 | 	}
166 | 	if !slices.Equal(values, expect) {
167 | 		t.Errorf("expected %v, got %v", expect, values)
168 | 	}
169 | 	if !hasError {
170 | 		t.Error("expected error")
171 | 	}
172 | }
173 | 
174 | func TestMergeContinueAfterError3(t *testing.T) {
175 | 	errval := errors.New("")
176 | 
177 | 	seq0 := func(yield func(int, error) bool) {
178 | 		for i := 0; i < 5; i++ {
179 | 			if !yield(i, nil) {
180 | 				return
181 | 			}
182 | 		}
183 | 		if !yield(0, errval) {
184 | 			return
185 | 		}
186 | 		for i := 5; i < 10; i++ {
187 | 			if !yield(i, nil) {
188 | 				return
189 | 			}
190 | 		}
191 | 	}
192 | 
193 | 	seq1 := func(yield func(int, error) bool) {
194 | 		for i := 0; i < 10; i++ {
195 | 			if !yield(i, nil) {
196 | 				return
197 | 			}
198 | 		}
199 | 	}
200 | 
201 | 	var values []int
202 | 	var errCount int
203 | 	for v, err := range Merge(seq0, seq1, seq0) {
204 | 		if err != nil {
205 | 			if v != 0 {
206 | 				t.Errorf("expected 0, got %v", v)
207 | 			}
208 | 			if err != errval {
209 | 				t.Fatal(err)
210 | 			}
211 | 			errCount++
212 | 		} else {
213 | 			values = append(values, v)
214 | 		}
215 | 	}
216 | 
217 | 	expect := []int{
218 | 		0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4,
219 | 		5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9,
220 | 	}
221 | 	if !slices.Equal(values, expect) {
222 | 		t.Errorf("expected %v, got %v", expect, values)
223 | 	}
224 | 	if errCount != 2 {
225 | 		t.Error("expected error")
226 | 	}
227 | }
228 | 
229 | func values[T any](seq iter.Seq2[T, error]) (values []T, err error) {
230 | 	for v, err := range seq {
231 | 		if err != nil {
232 | 			return nil, err
233 | 		}
234 | 		values = append(values, v)
235 | 	}
236 | 	return values, nil
237 | }
238 | 
// BenchmarkMerge1 runs the benchmark helper on MergeFunc applied to the
// single sequence count(n).
func BenchmarkMerge1(b *testing.B) {
	benchmark(b, func(n int, cmp func(int, int) int) iter.Seq2[int, error] {
		return MergeFunc(cmp, count(n))
	})
}
244 | 
// BenchmarkMerge2 runs the benchmark helper on MergeFunc applied to two
// overlapping sequences with different steps.
func BenchmarkMerge2(b *testing.B) {
	benchmark(b, func(n int, cmp func(int, int) int) iter.Seq2[int, error] {
		return MergeFunc(cmp,
			sequence(0, n-(n/4), 1),
			sequence(n/4, n, 2),
		)
	})
}
253 | 
// BenchmarkMerge3 runs the benchmark helper on MergeFunc applied to three
// overlapping sequences with different starting points and steps.
func BenchmarkMerge3(b *testing.B) {
	benchmark(b, func(n int, cmp func(int, int) int) iter.Seq2[int, error] {
		return MergeFunc(cmp,
			sequence(0, n, 2),
			sequence(n/4, n, 1),
			sequence(n/3, n, 3),
		)
	})
}
263 | 
264 | func benchmark[V cmp.Ordered](b *testing.B, merge func(int, func(V, V) int) iter.Seq2[V, error]) {
265 | 	comparisons := 0
266 | 	compare := func(a, b V) int {
267 | 		comparisons++
268 | 		return cmp.Compare(a, b)
269 | 	}
270 | 	start := time.Now()
271 | 	count := b.N
272 | 	for _, err := range merge(count, compare) {
273 | 		if err != nil {
274 | 			b.Fatal(err)
275 | 		}
276 | 		if count--; count == 0 {
277 | 			break
278 | 		}
279 | 	}
280 | 	if count != 0 {
281 | 		b.Fatalf("expected %d values, got %d", b.N, b.N-count)
282 | 	}
283 | 	duration := time.Since(start)
284 | 	b.ReportMetric(float64(b.N)/duration.Seconds(), "merge/s")
285 | 	b.ReportMetric(float64(comparisons)/float64(b.N), "comp/op")
286 | }
287 | 
288 | func TestMergeSlice(t *testing.T) {
289 | 	for n := range 10 {
290 | 		t.Run(fmt.Sprint(n), func(t *testing.T) {
291 | 			seqs := make([]iter.Seq2[[]int, error], n)
292 | 			want := make([]int, 0, 2*n)
293 | 
294 | 			for i := range seqs {
295 | 				seqs[i] = countSlice(i, 10)
296 | 				v, err := values(count(i * 10))
297 | 				if err != nil {
298 | 					t.Fatal(err)
299 | 				}
300 | 				want = append(want, v...)
301 | 			}
302 | 
303 | 			slices.Sort(want)
304 | 			seq := MergeSlice(seqs...)
305 | 
306 | 			got, err := concatValues(seq)
307 | 			if err != nil {
308 | 				t.Fatal(err)
309 | 			}
310 | 			if !slices.Equal(got, want) {
311 | 				t.Errorf("expected %v, got %v", want, got)
312 | 			}
313 | 		})
314 | 	}
315 | }
316 | 
317 | func concatValues[T any](seq iter.Seq2[[]T, error]) (values []T, err error) {
318 | 	for v, err := range seq {
319 | 		if err != nil {
320 | 			return nil, err
321 | 		}
322 | 		values = append(values, v...)
323 | 	}
324 | 	return values, nil
325 | }
326 | 
// BenchmarkMergeSlice1 runs the benchmarkSlice helper on MergeSliceFunc
// applied to a single sequence of 100-value slices.
func BenchmarkMergeSlice1(b *testing.B) {
	benchmarkSlice(b, func(n int, cmp func(int, int) int) iter.Seq2[[]int, error] {
		return MergeSliceFunc(cmp, countSlice(n, 100))
	})
}
332 | 
// BenchmarkMergeSlice2 runs the benchmarkSlice helper on MergeSliceFunc
// applied to two sequences producing slices of 100 and 127 values.
func BenchmarkMergeSlice2(b *testing.B) {
	benchmarkSlice(b, func(n int, cmp func(int, int) int) iter.Seq2[[]int, error] {
		return MergeSliceFunc(cmp,
			countSlice(n, 100),
			countSlice(n, 127),
		)
	})
}
341 | 
// BenchmarkMergeSlice3 runs the benchmarkSlice helper on MergeSliceFunc
// applied to three sequences producing slices of 100, 101 and 127 values.
func BenchmarkMergeSlice3(b *testing.B) {
	benchmarkSlice(b, func(n int, cmp func(int, int) int) iter.Seq2[[]int, error] {
		return MergeSliceFunc(cmp,
			countSlice(n, 100),
			countSlice(n, 101),
			countSlice(n, 127),
		)
	})
}
351 | 
352 | func benchmarkSlice[V cmp.Ordered](b *testing.B, merge func(int, func(V, V) int) iter.Seq2[[]V, error]) {
353 | 	comparisons := 0
354 | 	compare := func(a, b V) int {
355 | 		comparisons++
356 | 		return cmp.Compare(a, b)
357 | 	}
358 | 	start := time.Now()
359 | 	count := b.N
360 | 	for values, err := range merge(count, compare) {
361 | 		if err != nil {
362 | 			b.Fatal(err)
363 | 		}
364 | 		if count -= len(values); count <= 0 {
365 | 			break
366 | 		}
367 | 	}
368 | 	if count > 0 {
369 | 		b.Fatalf("expected %d values, got %d", b.N, b.N-count)
370 | 	}
371 | 	duration := time.Since(start)
372 | 	b.ReportMetric(float64(b.N)/duration.Seconds(), "merge/s")
373 | 	b.ReportMetric(float64(comparisons)/float64(b.N), "comp/op")
374 | }
375 | 


--------------------------------------------------------------------------------
/merge.go:
--------------------------------------------------------------------------------
  1 | // Package kway implements k-way merge algorithms for range functions.
  2 | package kway
  3 | 
  4 | import (
  5 | 	"cmp"
  6 | 	"iter"
  7 | )
  8 | 
const (
	// bufferSize is the size of the buffer used to read values from the
	// sequences.
	//
	// Note: I would like to avoid making this configurable, but I am also
	// aware that in latency-sensitive applications, it might be preferable
	// to have a smaller buffer size (or none at all), so values are produced
	// as soon as they are available. I would like to delay this change until
	// there is production data available to prove that it is needed; in my
	// experience, k-way merges tend to be used in batch processing systems
	// where throughput matters more than latency. One approach I would like
	// to experiment with is exponentially growing the buffer size (up to a
	// limit), so the merge algorithm can start with a small buffer size which
	// allows the first few values to be produced immediately, and then grow
	// to optimize for high throughput use cases.
	bufferSize = 128
)
 26 | 
// Merge merges multiple sequences into one. The sequences must produce ordered
// values. The algorithm complexity is O(n log k), where n is the total number
// of values to merge, and k is the number of sequences.
//
// The implementation is based on a loser-tree data structure, which minimizes
// the number of calls to the comparison function compared to the typical use
// of a min-heap.
//
// The function returns a sequence that yields merged values and is intended to
// be used in a for-range loop:
//
//	for v, err := range kway.Merge(seq0, seq1, seq2) {
//		if err != nil {
//			...
//		} else {
//			...
//		}
//	}
//
// The algorithm is implemented for sequences of pairs that produce either a
// value or a non-nil error. This design decision was made because k-way merges
// are most often used in distributed streaming systems where each sequence may
// be read from a remote source, and errors could occur when reading the values.
// For use cases where the sequences cannot produce errors, the conversion is
// straightforward:
//
//	func noerr[T any](seq iter.Seq[T]) iter.Seq2[T, error] {
//		return func(yield func(T, error) bool) {
//			for value := range seq {
//				if !yield(value, nil) {
//					return
//				}
//			}
//		}
//	}
//
// The inner implementation of the merge algorithm does not spawn goroutines to
// concurrently read values from the sequences. In some cases where values are
// retrieved from remote sources, it can become a performance bottleneck because
// the total time for the merge becomes bound on the sum of read latency.
// In those cases, it is recommended to wrap the sequences so values can be
// retrieved concurrently from the remote sources and pushed into the merge
// algorithm via a channel.
//
// Applications that aim to achieve the highest throughput should use
// MergeSlice instead, as it allows end-to-end batching which greatly amortizes
// the baseline cost of coroutine context switch in the Go runtime.
//
// See MergeFunc for a version of this function that allows the caller to pass
// a custom comparison function.
func Merge[T cmp.Ordered](seqs ...iter.Seq2[T, error]) iter.Seq2[T, error] {
	return MergeFunc(cmp.Compare[T], seqs...)
}
 80 | 
 81 | // MergeFunc merges multiple sequences into one using the given comparison
 82 | // function to determine the order of values. The sequences must be ordered
 83 | // by the same comparison function.
 84 | //
 85 | // See Merge for more details.
 86 | func MergeFunc[T any](cmp func(T, T) int, seqs ...iter.Seq2[T, error]) iter.Seq2[T, error] {
 87 | 	if len(seqs) == 1 {
 88 | 		return seqs[0]
 89 | 	}
 90 | 	var merged iter.Seq2[[]T, error]
 91 | 	if len(seqs) == 2 {
 92 | 		seq0 := buffer(bufferSize, seqs[0])
 93 | 		seq1 := buffer(bufferSize, seqs[1])
 94 | 		merged = merge2(cmp, seq0, seq1)
 95 | 	} else {
 96 | 		bufferedSeqs := make([]iter.Seq2[[]T, error], len(seqs))
 97 | 		for i, seq := range seqs {
 98 | 			bufferedSeqs[i] = buffer(bufferSize, seq)
 99 | 		}
100 | 		merged = merge(cmp, bufferedSeqs)
101 | 	}
102 | 	return unbuffer(merged)
103 | }
104 | 
// MergeSlice merges multiple sequences producing slices of ordered values.
// Values are compared with cmp.Compare; see MergeSliceFunc for a version of
// this function that accepts a custom comparison function.
//
// The function is intended to be used in applications that have high-throughput
// requirements. By merging slices instead of individual values, the function
// amortizes the baseline costs such as time spent on coroutine context switch
// in the Go runtime, error checks, etc.
//
// The slices yielded when ranging over the returned function may or may not be
// slices that were produced by the input sequences. The function may choose to
// apply buffering when needed, or pass the slices as-is from the sequences.
// They might also be reused across iterations, which means that the caller
// should not retain the slices beyond the block of a for loop.
//
// For example, this code is incorrect:
//
//	var values [][]int
//	for vs, err := range kway.MergeSlice(seq0, seq1, seq2) {
//		if err != nil {
//			...
//		}
//		values = append(values, vs)
//	}
//	// Using values here may not contain the expected data, each slice might
//	// point to the same backing array and only contain values from the last
//	// iteration.
//
// Instead, the caller should copy the values into a new slice:
//
//	var values []int
//	for vs, err := range kway.MergeSlice(seq0, seq1, seq2) {
//		if err != nil {
//			...
//		}
//		values = append(values, vs...)
//	}
//
// Due to the increased complexity that derives from using MergeSlice,
// applications should prefer using Merge, which uses the same algorithm as
// MergeSlice internally, and can already achieve very decent throughput.
//
// See Merge for more details.
func MergeSlice[T cmp.Ordered](seqs ...iter.Seq2[[]T, error]) iter.Seq2[[]T, error] {
	return MergeSliceFunc(cmp.Compare[T], seqs...)
}
149 | 
150 | // MergeSliceFunc merges multiple sequences producing slices of ordered values
151 | // using the given comparison function to determine the order. The sequences
152 | // must be ordered by the same comparison function.
153 | //
154 | // See MergeSlice for more details.
155 | func MergeSliceFunc[T any](cmp func(T, T) int, seqs ...iter.Seq2[[]T, error]) iter.Seq2[[]T, error] {
156 | 	switch len(seqs) {
157 | 	case 1:
158 | 		return seqs[0]
159 | 	case 2:
160 | 		return merge2(cmp, seqs[0], seqs[1])
161 | 	default:
162 | 		return merge(cmp, seqs)
163 | 	}
164 | }
165 | 
166 | func buffer[T any](bufferSize int, seq iter.Seq2[T, error]) iter.Seq2[[]T, error] {
167 | 	buf := make([]T, bufferSize)
168 | 	return func(yield func([]T, error) bool) {
169 | 		n := 0
170 | 
171 | 		var err error
172 | 		for buf[n], err = range seq {
173 | 			if err != nil {
174 | 				if !yield(nil, err) {
175 | 					return
176 | 				}
177 | 			} else if n++; n == len(buf) {
178 | 				if !yield(buf, nil) {
179 | 					return
180 | 				}
181 | 				n = 0
182 | 			}
183 | 		}
184 | 
185 | 		if n > 0 {
186 | 			yield(buf[:n], nil)
187 | 		}
188 | 	}
189 | }
190 | 
191 | func unbuffer[T any](seq iter.Seq2[[]T, error]) iter.Seq2[T, error] {
192 | 	return func(yield func(T, error) bool) {
193 | 		seq(func(values []T, err error) bool {
194 | 			var value T
195 | 			if err != nil && !yield(value, err) {
196 | 				return false
197 | 			}
198 | 			for _, value = range values {
199 | 				if !yield(value, nil) {
200 | 					return false
201 | 				}
202 | 			}
203 | 			return true
204 | 		})
205 | 	}
206 | }
207 | 
// merge2 merges two sequences of ordered slices into a single sequence of
// ordered slices, comparing values with cmp.
//
// While both sequences have values, merged values are written to an internal
// buffer of bufferSize values which is yielded when it fills up. Once one of
// the sequences ends, the buffer is flushed and the remaining slices of the
// other sequence are yielded as-is. An error read from either sequence is
// yielded right away with a nil slice, and the merge continues afterwards
// unless the consumer stops the iteration.
func merge2[T any](cmp func(T, T) int, seq0, seq1 iter.Seq2[[]T, error]) iter.Seq2[[]T, error] {
	return func(yield func([]T, error) bool) {
		next0, stop0 := iter.Pull2(seq0)
		defer stop0()

		next1, stop1 := iter.Pull2(seq1)
		defer stop1()

		// Read the first batch of each sequence, forwarding any error.
		values0, err, ok0 := next0()
		if err != nil && !yield(nil, err) {
			return
		}

		values1, err, ok1 := next1()
		if err != nil && !yield(nil, err) {
			return
		}

		buffer := make([]T, bufferSize)
		offset := 0 // number of merged values currently held in buffer
		i0 := 0     // read position in values0
		i1 := 0     // read position in values1
		for ok0 && ok1 {
			for i0 < len(values0) && i1 < len(values1) {
				v0 := values0[i0]
				v1 := values1[i1]

				// Flush when fewer than two slots remain, because the
				// equal case below writes two values at once.
				if (offset + 1) >= len(buffer) {
					if !yield(buffer[:offset], nil) {
						return
					}
					offset = 0
				}

				diff := cmp(v0, v1)
				switch {
				case diff < 0:
					buffer[offset] = v0
					offset++
					i0++
				case diff > 0:
					buffer[offset] = v1
					offset++
					i1++
				default:
					buffer[offset+0] = v0
					buffer[offset+1] = v1
					offset += 2
					i0++
					i1++
				}
			}

			// Load the next batch of whichever sequence was fully consumed.
			if i0 == len(values0) {
				i0 = 0
				if values0, err, ok0 = next0(); err != nil && !yield(nil, err) {
					return
				}
			}

			if i1 == len(values1) {
				i1 = 0
				if values1, err, ok1 = next1(); err != nil && !yield(nil, err) {
					return
				}
			}
		}

		// At least one sequence has ended: flush the merged values.
		if offset > 0 && !yield(buffer[:offset], nil) {
			return
		}

		// Drop the values that were already merged from the current batches.
		values0 = values0[i0:]
		values1 = values1[i1:]

		// Pass the remaining batches of the surviving sequence through.
		for ok0 && yield(values0, nil) {
			if values0, err, ok0 = next0(); err != nil && !yield(nil, err) {
				return
			}
		}

		for ok1 && yield(values1, nil) {
			if values1, err, ok1 = next1(); err != nil && !yield(nil, err) {
				return
			}
		}
	}
}
296 | 
297 | func merge[T any](cmp func(T, T) int, seqs []iter.Seq2[[]T, error]) iter.Seq2[[]T, error] {
298 | 	return func(yield func([]T, error) bool) {
299 | 		tree := makeTree(seqs...)
300 | 		defer tree.stop()
301 | 
302 | 		buffer := make([]T, bufferSize)
303 | 		for {
304 | 			n, err := tree.next(buffer, cmp)
305 | 			if err == nil && n == 0 {
306 | 				return
307 | 			}
308 | 			if !yield(buffer[:n], err) {
309 | 				return
310 | 			}
311 | 		}
312 | 	}
313 | }
314 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # kway-go [![Go Reference](https://pkg.go.dev/badge/github.com/achille-roussel/kway-go.svg)](https://pkg.go.dev/github.com/achille-roussel/kway-go)
  2 | K-way merge with Go 1.23 range functions
  3 | 
  4 | [bboreham]: https://github.com/bboreham
  5 | [godoc]: https://pkg.go.dev/github.com/achille-roussel/kway-go@v0.2.0#pkg-examples
  6 | [gophercon]: https://www.gophercon.com/agenda/session/1160355
  7 | 
  8 | ## Installation
  9 | 
 10 | This package is intended to be used as a library and installed with:
 11 | ```sh
 12 | go get github.com/achille-roussel/kway-go
 13 | ```
 14 | 
 15 | ## Usage
 16 | 
 17 | The package contains variations of the K-way merge algorithm for different
 18 | forms of iterator sequences:
 19 | 
 20 | * **Merge** and **MergeFunc** operate on sequences that yield single
 21 |   values. **Merge** must be used on ordered values, while **MergeFunc**
 22 |   accepts a comparison function as first argument to customize the
 23 |   ordering logic.
 24 | 
 25 | * **MergeSlice** and **MergeSliceFunc** are similar functions but operate on
 26 |   sequences that yield slices of values. These are intended for applications
 27 |   with higher throughput requirements that use batching or read values from
 28 |   paging APIs.
 29 | 
 30 | The sequences being merged must each be ordered using the same comparison logic
 31 | as the one used for the merge, or the algorithm will not be able to produce an
 32 | ordered sequence of values.
 33 | 
 34 | The following code snippet illustrates how to merge three ordered sequences
 35 | into one:
 36 | ```go
 37 | for v, err := range kway.Merge(seq0, seq1, seq2) {
 38 |     ...
 39 | }
 40 | ```
 41 | 
 42 | More examples are available in the [Go doc][godoc].
 43 | 
 44 | ### Error Handling
 45 | 
 46 | The merge functions report errors seen from the input sequences, but the
 47 | presence of errors does not interrupt the merge operations. When an error
 48 | occurs, it is immediately bubbled up to the program, but if more values are
 49 | available in the input sequences, the program can continue consuming them after
 50 | handling the error. This model delegates the decision of how to handle errors to
 51 | the application, allowing it to carry on or abort depending on the error value or
 52 | type, for example:
 53 | 
 54 | ```go
 55 | for v, err := range kway.Merge(sequences...) {
 56 |     if err != nil {
 57 |         // handle the error, the program may choose to break out of the loop
 58 |         // or carry on to read the next value.
 59 |         ...
 60 |     } else {
 61 |         // a value is available, process it
 62 |         ...
 63 |     }
 64 | }
 65 | ```
 66 | 
 67 | ## Implementation
 68 | 
 69 | The K-way merge algorithm was inspired by the talk from
 70 | [Bryan Boreham][bboreham] at [Gophercon 2023][gophercon], which described
 71 | how using a loser-tree instead of a min-heap improved performance of Loki's
 72 | merge of log records.
 73 | 
 74 | The `kway-go` package also adds a specialization for cases where the program
 75 | is merging exactly two sequences, since this can be implemented as a simple
 76 | union of two sets which has a much lower compute and memory footprint.
 77 | 
 78 | ## Performance
 79 | 
 80 | K-way merge is often used in stream processing or database engines to merge
 81 | distributed query results into a single ordered result set. In those
 82 | applications, performance of the underlying algorithms tends to matter: for
 83 | example, when performing compaction of sorted records, the merge algorithm is
 84 | on the critical path and often where most of the compute is being spent. In that
 85 | regard, there are efficiency requirements that the implementation must fulfil to
 86 | be a useful solution to those problems.
 87 | 
 88 | > :bulb: While exploring the performance characteristics of the algorithm, it is
 89 | > important to keep in mind that absolute numbers are only useful in the context
 90 | > where they were collected, since measurements depend on the hardware executing
 91 | > the code, and the data being processed. We should use relative performance of
 92 | > different benchmarks within a given context as a hint to find opportunities
 93 | > for optimizations in production applications, not as universal truths.
 94 | 
 95 | The current implementation has already been optimized to maximize throughput, by
 96 | amortizing as much of the baseline costs as possible, and ensure that CPU time is
 97 | spent on the important parts of the algorithm.
 98 | 
 99 | As part of this optimization work, it became apparent that while the Go runtime
100 | implementation of coroutines underneath `iter.Pull2` has a much lower compute
101 | footprint than using channels, it still has a significant overhead when reading
102 | values in tight loops of the merge algorithm.
103 | 
104 | This graph shows a preview of the results, the full analysis is described in the
105 | following sections:
106 | 
107 | ![image](https://github.com/achille-roussel/kway-go/assets/865510/730da27c-e639-4cfe-878a-9cc5c9287e37)
108 | 
109 | 
110 | ### Establishing a performance baseline
111 | 
112 | To explore performance, let's first establish a baseline. We use the throughput
113 | of merging a single sequence, which is simply reading all the values it yields,
114 | as a comparison point:
115 | ```
116 | Merge1  592898557  1.843 ns/op  0 comp/op   542741115 merge/s
117 | ```
118 | This benchmark shows that on this test machine, the highest theoretical
119 | throughput we can achieve is **~540M merge/s** for one sequence,
120 | **~270M merge/s** when merging two sequences, etc...
121 | 
122 | ### Performance analysis of the K-way merge algorithm
123 | 
124 | Now comparing the performance of merging two and three sequences:
125 | ```
126 | Merge2   47742177  24.78 ns/op  0.8125 comp/op  40359389 merge/s
127 | Merge3   27540648  42.23 ns/op  1.864 comp/op   23682342 merge/s
128 | ```
129 | We observe a significant drop in throughput in comparison with iterating over
130 | a single sequence, with the benchmark now performing **~7x slower** than the
131 | theoretical throughput limit.
132 | 
133 | The K-way merge algorithm has a complexity of *O(n∙log(k))*, there would also be
134 | a baseline cost for the added code implementing the merge operations, but almost
135 | an order of magnitude difference seems unexpected.
136 | 
137 | To understand what is happening, we can look into a CPU profile:
138 | ```
139 | Duration: 3.46s, Total samples = 2.44s (70.45%)
140 | Showing nodes accounting for 2.40s, 98.36% of 2.44s total
141 | Dropped 9 nodes (cum <= 0.01s)
142 |  flat  flat%   sum%    cum   cum%
143 | 0.30s 12.30% 12.30%  0.72s 29.51%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].merge2[go.shape.int].func3
144 | 0.25s 10.25% 22.54%  0.34s 13.93%  github.com/achille-roussel/kway-go.(*tree[go.shape.int]).next
145 | 0.21s  8.61% 31.15%  0.76s 31.15%  github.com/achille-roussel/kway-go.sequence.func1
146 | 0.17s  6.97% 38.11%  0.26s 10.66%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].unbuffer[go.shape.int].func6.1
147 | 0.15s  6.15% 44.26%  0.25s 10.25%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func1.1
148 | 0.15s  6.15% 50.41%  0.21s  8.61%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func4.1
149 | 0.14s  5.74% 56.15%  0.23s  9.43%  iter.Pull2[go.shape.[]go.shape.int,go.shape.interface { Error string }].func2
150 | 0.13s  5.33% 61.48%  0.13s  5.33%  runtime/internal/atomic.(*Uint32).CompareAndSwap (inline)
151 | 0.11s  4.51% 65.98%  0.18s  7.38%  iter.Pull2[go.shape.[]go.shape.int,go.shape.interface { Error string }].func1.1
152 | 0.10s  4.10% 70.08%  0.27s 11.07%  runtime.coroswitch_m
153 | 0.09s  3.69% 73.77%  0.09s  3.69%  github.com/achille-roussel/kway-go.benchmark[go.shape.int].func2
154 | 0.09s  3.69% 77.46%  0.09s  3.69%  runtime.coroswitch
155 | 0.08s  3.28% 80.74%  0.11s  4.51%  gogo
156 | 0.07s  2.87% 83.61%  0.09s  3.69%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func2.1
157 | 0.06s  2.46% 86.07%  0.06s  2.46%  runtime.mapaccess1_fast64
158 | 0.05s  2.05% 88.11%  0.09s  3.69%  github.com/achille-roussel/kway-go.benchmark[go.shape.int].func1
159 | 0.04s  1.64% 89.75%  0.04s  1.64%  cmp.Compare[go.shape.int] (inline)
160 | 0.04s  1.64% 91.39%  0.04s  1.64%  internal/race.Acquire
161 | 0.04s  1.64% 93.03%  0.04s  1.64%  runtime.(*guintptr).cas (inline)
162 | 0.04s  1.64% 94.67%  0.32s 13.11%  runtime.mcall
163 | 0.04s  1.64% 96.31%  0.04s  1.64%  runtime.save_g
164 | 0.02s  0.82% 97.13%  0.02s  0.82%  internal/race.Release
165 | 0.01s  0.41% 97.54%  0.43s 17.62%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].merge[go.shape.int].func5
166 | 0.01s  0.41% 97.95%  0.04s  1.64%  github.com/achille-roussel/kway-go.nextNonEmptyValues[go.shape.int]
167 | 0.01s  0.41% 98.36%  0.08s  3.28%  runtime/pprof.(*profMap).lookup
168 |     0     0% 98.36%  0.72s 29.51%  github.com/achille-roussel/kway-go.BenchmarkMerge2
169 |     0     0% 98.36%  0.43s 17.62%  github.com/achille-roussel/kway-go.BenchmarkMerge3
170 |     0     0% 98.36%  0.35s 14.34%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func1
171 |     0     0% 98.36%  0.14s  5.74%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func2
172 |     0     0% 98.36%  0.27s 11.07%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func4
173 |     0     0% 98.36%  1.15s 47.13%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].unbuffer[go.shape.int].func6
174 |     0     0% 98.36%  1.15s 47.13%  github.com/achille-roussel/kway-go.benchmark[go.shape.int]
175 |     0     0% 98.36%  0.76s 31.15%  iter.Pull2[go.shape.[]go.shape.int,go.shape.interface { Error string }].func1
176 |     0     0% 98.36%  0.76s 31.15%  runtime.corostart
177 | ```
178 | As we can see here, a significant amount of time seems to be spent in the Go
179 | runtime code managing coroutines. While it might be possible to optimize the
180 | runtime, there is a lower bound on how much it can be reduced.
181 | 
182 | It is also unlikely that the Go compiler could help here, there are no real
183 | opportunities for inlining or other optimizations.
184 | 
185 | ### Performance optimization of the K-way merge algorithm
186 | 
187 | We basically have a very high baseline cost for each operation. With the
188 | hypothesis that it is driven by coroutine context switches implemented in the
189 | runtime, the only thing we can do to improve performance is to do fewer of these.
190 | 
191 | This is a typical baseline cost amortization problem: we want to call the
192 | `next` function returned by `iter.Pull2` less often, which can be done by
193 | introducing buffering. Instead of pulling values one at a time, we can
194 | efficiently buffer N values from each sequence in memory, by transposing
195 | the `iter.Seq2[T, error]` sequences into `iter.Seq2[[]T, error]`. The call
196 | to `next` then only needs to happen when we exhaust the buffer, which ends up
197 | amortizing its cost.
198 | 
199 | With an internal buffer size of **128** values per sequence:
200 | ```
201 | Merge2  190103247  6.133 ns/op  0.8333 comp/op  163045156 merge/s
202 | Merge3  95485022  12.74 ns/op   1.864 comp/op    78492807 merge/s
203 | ```
204 | Now we made the algorithm **3-4x faster**, and are within
205 | **1.5 to 2.5x** of the theoretical throughput limit.
206 | 
207 | It is interesting to note that the CPU profile didn't seem to indicate that 75%
208 | of the time was spent in the runtime, but reducing the time spent in that code
209 | path has had a non-linear impact on performance. Likely some other CPU
210 | instruction pipeline and caching shenanigans are at play here, possibly impacted
211 | by the atomic compare-and-swap operations in coroutine switches.
212 | 
213 | As expected, the CPU profile now shows that almost no time is spent in the
214 | runtime:
215 | ```
216 | Duration: 3.17s, Total samples = 2.35s (74.08%)
217 | Showing nodes accounting for 2.28s, 97.02% of 2.35s total
218 | Dropped 22 nodes (cum <= 0.01s)
219 |  flat  flat%   sum%    cum   cum%
220 | 0.45s 19.15% 19.15%  0.56s 23.83%  github.com/achille-roussel/kway-go.(*tree[go.shape.int]).next
221 | 0.43s 18.30% 37.45%  0.43s 18.30%  github.com/achille-roussel/kway-go.benchmark[go.shape.int].func2
222 | 0.37s 15.74% 53.19%  0.97s 41.28%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].merge2[go.shape.int].func3
223 | 0.23s  9.79% 62.98%  0.24s 10.21%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func1.1
224 | 0.22s  9.36% 72.34%  0.65s 27.66%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].unbuffer[go.shape.int].func6.1
225 | 0.13s  5.53% 77.87%  0.13s  5.53%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func4.1
226 | 0.12s  5.11% 82.98%  0.21s  8.94%  github.com/achille-roussel/kway-go.benchmark[go.shape.int].func1
227 | 0.10s  4.26% 87.23%  0.52s 22.13%  github.com/achille-roussel/kway-go.sequence.func1
228 | 0.09s  3.83% 91.06%  0.09s  3.83%  cmp.Compare[go.shape.int] (inline)
229 | 0.05s  2.13% 93.19%  0.05s  2.13%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func2.1
230 | 0.03s  1.28% 94.47%  0.06s  2.55%  runtime/pprof.(*profMap).lookup
231 | 0.02s  0.85% 95.32%  0.02s  0.85%  github.com/achille-roussel/kway-go.parent (inline)
232 | 0.02s  0.85% 96.17%  0.02s  0.85%  runtime.asyncPreempt
233 | 0.02s  0.85% 97.02%  0.02s  0.85%  runtime.mapaccess1_fast64
234 |     0     0% 97.02%  0.97s 41.28%  github.com/achille-roussel/kway-go.BenchmarkMerge2
235 |     0     0% 97.02%  0.76s 32.34%  github.com/achille-roussel/kway-go.BenchmarkMerge3
236 |     0     0% 97.02%  0.31s 13.19%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func1
237 |     0     0% 97.02%  0.08s  3.40%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func2
238 |     0     0% 97.02%  0.13s  5.53%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].buffer[go.shape.int].func4
239 |     0     0% 97.02%  0.76s 32.34%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].merge[go.shape.int].func5
240 |     0     0% 97.02%  1.73s 73.62%  github.com/achille-roussel/kway-go.MergeFunc[go.shape.int].unbuffer[go.shape.int].func6
241 |     0     0% 97.02%  1.73s 73.62%  github.com/achille-roussel/kway-go.benchmark[go.shape.int]
242 |     0     0% 97.02%  0.52s 22.13%  iter.Pull2[go.shape.[]go.shape.int,go.shape.interface { Error string }].func1
243 |     0     0% 97.02%  0.52s 22.13%  runtime.corostart
244 | ```
245 | 
246 | ### Further optimizations using batch processing
247 | 
248 | There is a final performance frontier we can cross. While we are buffering
249 | values internally, the input and output sequences remain `iter.Seq2[T, error]`,
250 | which yield values one by one. Oftentimes in data systems, APIs have pagination
251 | capabilities, or stream processors work on batches of values for the same reason
252 | we added buffering: it reduces the baseline cost of crossing system boundaries.
253 | 
254 | If the input sequences are already slices of values, and the output sequence
255 | produces slices of values, we can reduce the internal memory footprint (no need
256 | to allocate memory to buffer the inputs), while also further amortizing the cost
257 | of function calls to yield values in and out of the merge algorithm.
258 | 
259 | Applications that fall into those categories can unlock further performance by
260 | using `MergeSlice` instead of `Merge`, which works on `iter.Seq2[[]T, error]`
261 | end-to-end.
262 | 
263 | What is interesting with this approach is that in cases where the processing of
264 | inputs and outputs can be batched, this model **can even beat the theoretical
265 | throughput limit**. For example, in the benchmarks we've used, the body of the
266 | loop consuming merged values simply counts the results. When consuming slices
267 | there is no need to iterate over the slices and increment the counter by one
268 | each time, we can batch the operation by incrementing the counter by the length
269 | of the slice, achieving much higher throughput than predicted by the baseline:
270 | ```
271 | MergeSlice2  477720793  2.273 ns/op  0.6688 comp/op  439971259 merge/s
272 | MergeSlice3  150406080  7.945 ns/op  1.667 comp/op   125861613 merge/s
273 | ```
274 | 
275 | > :warning: Keep in mind that to minimize the footprint, `MergeSlice` reuses
276 | > its output buffer, which means that the application cannot retain it beyond
277 | > the body of the loop ranging over the merge function. This can lead to subtle
278 | > bugs that can be difficult to track, `Merge` should always be preferred unless
279 | > there is clear evidence that the increased maintenance cost is worth the
280 | > performance benefits.
281 | 


--------------------------------------------------------------------------------