├── go.mod
├── inspecting_assembly
    ├── example.o
    └── example.go
├── call
    ├── c.go
    └── example_test.go
├── go.sum
├── writing_benchmarks
    ├── incorrect_test.go
    └── correct_test.go
├── heap_alloc
    └── example_test.go
├── measure_time
    └── main.go
├── memory
    └── example_test.go
├── pointers
    └── example_test.go
├── LICENSE
├── unintentional_copy
    └── example_test.go
├── README.md
├── branch
    └── example_test.go
├── profiling
    └── example_test.go
├── bounds_checks
    └── example_test.go
├── unrolling
    └── example_test.go
├── dispatch
    └── example_test.go
└── loop_alignment
    └── example_test.go


/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/egonelbre/perf-example
2 | 
3 | go 1.22rc2
4 | 
5 | require github.com/loov/hrtime v1.0.3
6 | 


--------------------------------------------------------------------------------
/inspecting_assembly/example.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/egonelbre/perf-example/HEAD/inspecting_assembly/example.o


--------------------------------------------------------------------------------
/call/c.go:
--------------------------------------------------------------------------------
1 | package example
2 | 
3 | // int nop() { return 0; }
4 | import "C"
5 | // CNop calls the C function nop declared in the cgo preamble above and converts its result to a Go int.
6 | func CNop() int { return int(C.nop()) }
7 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/loov/hrtime v1.0.3 h1:LiWKU3B9skJwRPUf0Urs9+0+OE3TxdMuiRPOTwR0gcU=
2 | github.com/loov/hrtime v1.0.3/go.mod h1:yDY3Pwv2izeY4sq7YcPX/dtLwzg5NU1AxWuWxKwd0p0=
3 | 


--------------------------------------------------------------------------------
/writing_benchmarks/incorrect_test.go:
--------------------------------------------------------------------------------
 1 | package writing_benchmarks
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 	"time"
 6 | )
 7 | // BenchmarkNoRepeat ignores b.N and always performs a fixed one million time.Now calls.
 8 | func BenchmarkNoRepeat(b *testing.B) {
 9 | 	// incorrect way to write a benchmark: the iteration count is hard-coded instead of using b.N
10 | 	for i := 0; i < 1000000; i++ {
11 | 		time.Now()
12 | 	}
13 | }
14 | // add returns a*a + b*b, i.e. the sum of the squares of its arguments (not a plain sum, despite the name).
15 | func add(a, b int) int { return a*a + b*b }
16 | // BenchmarkOptimizedAway calls add(5, 7) b.N times and discards every result.
17 | func BenchmarkOptimizedAway(b *testing.B) {
18 | 	// incorrect way to write a benchmark: the result is never used, so the compiler may remove the call
19 | 	for i := 0; i < b.N; i++ {
20 | 		add(5, 7)
21 | 	}
22 | }
23 | 


--------------------------------------------------------------------------------
/heap_alloc/example_test.go:
--------------------------------------------------------------------------------
 1 | package example
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 	"encoding/binary"
 6 | )
 7 | // escape is the package-level slice that Heap assigns its freshly made buffer to on every call.
 8 | var escape []byte
 9 | // Heap makes a new 8-byte slice, stores it in the package-level escape variable, encodes v into it little-endian and returns byte 0.
10 | func Heap(v uint64) byte {
11 | 	escape = make([]byte, 8)
12 | 	binary.LittleEndian.PutUint64(escape[:], v)
13 | 	return escape[0]
14 | }
15 | // Stack encodes v little-endian into a local [8]byte and returns byte 0; the array is never stored outside the function (presumably so it can stay on the stack; confirm with -gcflags=-m).
16 | func Stack(v uint64) byte {
17 | 	var data [8]byte
18 | 	binary.LittleEndian.PutUint64(data[:], v)
19 | 	return data[0]
20 | }
21 | // sink accumulates the bytes returned inside the benchmark loops so the results are used.
22 | var sink byte
23 | // BenchmarkHeap times Heap, adding each returned byte to sink.
24 | func BenchmarkHeap(b *testing.B) {
25 | 	for i := 0; i < b.N; i++ {
26 | 		sink += Heap(uint64(i))
27 | 	}
28 | }
29 | // BenchmarkStack times Stack, adding each returned byte to sink.
30 | func BenchmarkStack(b *testing.B) {
31 | 	for i := 0; i < b.N; i++ {
32 | 		sink += Stack(uint64(i))
33 | 	}
34 | }
35 | 


--------------------------------------------------------------------------------
/inspecting_assembly/example.go:
--------------------------------------------------------------------------------
 1 | package example
 2 | 
 3 | // Documentation:
 4 | //
 5 | //    https://go.dev/doc/asm
 6 | //
 7 | 
 8 | // To build a single package:
 9 | //
10 | //    go build -o example.o .
11 | 
12 | // To view assembly from compilation:
13 | //
14 | //    go tool compile -S example.go
15 | 
16 | // Add returns a + b. To disassemble it: go tool objdump -S -s Add example.o
17 | func Add(a, b int) int {
18 | 	return a + b
19 | }
20 | 
21 | // Loop adds a+b to r once per iteration for n iterations and returns r (0 when n <= 0). To disassemble it: go tool objdump -S -s Loop example.o
22 | func Loop(a, b, n int) (r int) {
23 | 	for i := 0; i < n; i++ {
24 | 		r += a + b
25 | 	}
26 | 	return r
27 | }
28 | 
29 | // Also:
30 | //
31 | //    go install loov.dev/lensm@main
32 | //    lensm example.o
33 | //
34 | // If you want to properly understand assembly,
35 | // then go write a virtual machine first.
36 | //
37 | // https://adventofcode.com/2019 is a pretty good thing to work through.
38 | 


--------------------------------------------------------------------------------
/measure_time/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"time"
 6 | 
 7 | 	"github.com/loov/hrtime"
 8 | )
 9 | // main records ten million consecutive time.Now values, prints a histogram of the gaps between neighbouring samples, then prints the average time per time.Now call.
10 | func main() {
11 | 	for i := 0; i < 10; i++ { // a few untimed calls to both clocks before measuring
12 | 		_ = hrtime.Now()
13 | 		_ = time.Now()
14 | 	}
15 | 
16 | 	var now [10000000]time.Time
17 | 	start := hrtime.Now()
18 | 	for i := range now {
19 | 		now[i] = time.Now()
20 | 	}
21 | 	finish := hrtime.Now()
22 | 
23 | 	// let's look at system granularity
24 | 	durations := make([]time.Duration, 0, len(now)-1) // final length is known, so allocate once instead of growing
25 | 	for i := range now[:len(now)-1] {
26 | 		durations = append(durations, now[i+1].Sub(now[i]))
27 | 	}
28 | 
29 | 	histogram := hrtime.NewDurationHistogram(durations,
30 | 		&hrtime.HistogramOptions{
31 | 			BinCount:        10,
32 | 			NiceRange:       true,
33 | 			ClampMaximum:    float64(10 * time.Microsecond),
34 | 			ClampPercentile: 0,
35 | 		})
36 | 
37 | 	fmt.Println(histogram)
38 | 
39 | 	// let's look at measurement overhead: total elapsed time divided by the number of samples
40 | 	fmt.Println(float64(finish-start)/float64(len(now)), "ns")
41 | }
42 | 


--------------------------------------------------------------------------------
/writing_benchmarks/correct_test.go:
--------------------------------------------------------------------------------
 1 | package writing_benchmarks
 2 | 
 3 | import (
 4 | 	"runtime"
 5 | 	"testing"
 6 | 	"time"
 7 | 
 8 | 	"github.com/loov/hrtime"
 9 | )
10 | // Package-level sinks: the benchmarks below store their results here so the measured calls have a visible effect.
11 | var sinkTime time.Time
12 | var sinkDuration time.Duration
13 | // BenchmarkTime times time.Now, storing each result in sinkTime.
14 | func BenchmarkTime(b *testing.B) {
15 | 	for i := 0; i < b.N; i++ {
16 | 		sinkTime = time.Now()
17 | 	}
18 | }
19 | // BenchmarkTimeAlternative times time.Now, passing each result to runtime.KeepAlive instead of storing it in a package-level sink.
20 | func BenchmarkTimeAlternative(b *testing.B) {
21 | 	for i := 0; i < b.N; i++ {
22 | 		runtime.KeepAlive(time.Now())
23 | 	}
24 | }
25 | 
26 | // BenchmarkSub runs hrtime.Now and time.Now as sub-benchmarks named "impl=hrtime" and "impl=time". Compare them using https://pkg.go.dev/golang.org/x/perf/cmd/benchstat
27 | //
28 | //	go test -bench Sub -count 10 | tee bench.log
29 | //	go install golang.org/x/perf/cmd/benchstat@latest
30 | //	benchstat bench.log
31 | //	benchstat old.log new.log
32 | //	benchstat -col /impl -row .name bench.log
33 | func BenchmarkSub(b *testing.B) {
34 | 	b.Run("impl=hrtime", func(b *testing.B) {
35 | 		for i := 0; i < b.N; i++ {
36 | 			sinkDuration = hrtime.Now()
37 | 		}
38 | 	})
39 | 
40 | 	b.Run("impl=time", func(b *testing.B) {
41 | 		for i := 0; i < b.N; i++ {
42 | 			sinkTime = time.Now()
43 | 		}
44 | 	})
45 | }
46 | 


--------------------------------------------------------------------------------
/memory/example_test.go:
--------------------------------------------------------------------------------
 1 | package example
 2 | 
 3 | import (
 4 | 	"math/rand"
 5 | 	"testing"
 6 | )
 7 | 
 8 | // More details at https://igoro.com/archive/gallery-of-processor-cache-effects/
 9 | 
10 | var (
11 | 	data128B = [128 / 4]int32{}              // 128 bytes
12 | 	data1MB  = [1024 * 1024 / 4]int32{}      // 1 MB
13 | 	data64MB = [64 * 1024 * 1024 / 4]int32{} // 64 MB
14 | 
15 | 	order []int // random values filled in by init; each benchmark reduces them modulo its array length to pick elements
16 | )
17 | // init allocates order with one million entries and fills each with a value from rand.Int.
18 | func init() {
19 | 	order = make([]int, 1_000_000)
20 | 	for idx := 0; idx < len(order); idx++ {
21 | 		order[idx] = rand.Int()
22 | 	}
23 | }
24 | // sink receives the total computed by each benchmark iteration so the summation is used.
25 | var sink int32
26 | // Benchmark128B sums, once per iteration, the data128B elements selected by every entry of order (taken modulo the array length) and stores the total in sink.
27 | func Benchmark128B(b *testing.B) {
28 | 	for i := 0; i < b.N; i++ {
29 | 		var total int32
30 | 		for _, k := range order {
31 | 			total += data128B[k%len(data128B)]
32 | 		}
33 | 		sink = total
34 | 	}
35 | }
36 | // Benchmark1MB does the same traversal as Benchmark128B over the 1 MB array data1MB.
37 | func Benchmark1MB(b *testing.B) {
38 | 	for i := 0; i < b.N; i++ {
39 | 		var total int32
40 | 		for _, k := range order {
41 | 			total += data1MB[k%len(data1MB)]
42 | 		}
43 | 		sink = total
44 | 	}
45 | }
46 | // Benchmark64MB does the same traversal as Benchmark128B over the 64 MB array data64MB.
47 | func Benchmark64MB(b *testing.B) {
48 | 	for i := 0; i < b.N; i++ {
49 | 		var total int32
50 | 		for _, k := range order {
51 | 			total += data64MB[k%len(data64MB)]
52 | 		}
53 | 		sink = total
54 | 	}
55 | }
56 | 


--------------------------------------------------------------------------------
/pointers/example_test.go:
--------------------------------------------------------------------------------
 1 | package example
 2 | 
 3 | import (
 4 | 	"math/rand"
 5 | 	"slices"
 6 | 	"testing"
 7 | )
 8 | // data holds the float32 values; sorted and unsorted hold pointers to its elements, in index order and in shuffled order respectively (see init).
 9 | var data []float32
10 | var sorted []*float32
11 | var unsorted []*float32
12 | // init fills data with random float32 values, points sorted at those elements in index order, and makes unsorted a shuffled clone of sorted.
13 | func init() {
14 | 	// data = make([]float32, 100000)
15 | 	data = make([]float32, 10000000)
16 | 	for idx := 0; idx < len(data); idx++ {
17 | 		data[idx] = rand.Float32()
18 | 	}
19 | 
20 | 	sorted = make([]*float32, len(data))
21 | 	for idx := range data {
22 | 		sorted[idx] = &data[idx]
23 | 	}
24 | 	unsorted = slices.Clone(sorted)
25 | 	swap := func(a, b int) { unsorted[a], unsorted[b] = unsorted[b], unsorted[a] }
26 | 	rand.Shuffle(len(unsorted), swap)
27 | 	// unsorted now holds the same pointers as sorted, in random order
28 | }
29 | // sink accumulates the per-iteration totals so the summation loops are used.
30 | var sink float32
31 | // BenchmarkUnsorted sums, once per iteration, every value reached through the shuffled pointer slice unsorted and adds the total to sink.
32 | func BenchmarkUnsorted(b *testing.B) {
33 | 	for k := 0; k < b.N; k++ {
34 | 		total := float32(0)
35 | 		for _, v := range unsorted {
36 | 			total += *v
37 | 		}
38 | 		sink += total
39 | 	}
40 | }
41 | // BenchmarkSorted sums, once per iteration, every value reached through sorted, whose pointers follow the element order of data.
42 | func BenchmarkSorted(b *testing.B) {
43 | 	for k := 0; k < b.N; k++ {
44 | 		total := float32(0)
45 | 		for _, v := range sorted {
46 | 			total += *v
47 | 		}
48 | 		sink += total
49 | 	}
50 | }
51 | // BenchmarkData sums the values of data directly, with no pointer indirection, once per iteration.
52 | func BenchmarkData(b *testing.B) {
53 | 	for k := 0; k < b.N; k++ {
54 | 		total := float32(0)
55 | 		for _, v := range data {
56 | 			total += v
57 | 		}
58 | 		sink += total
59 | 	}
60 | }
61 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author or authors
 9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <https://unlicense.org>
25 | 


--------------------------------------------------------------------------------
/unintentional_copy/example_test.go:
--------------------------------------------------------------------------------
 1 | package example
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | // Shape is a large struct: besides Kind and Dimension it carries a 10 KiB ExtraData array, so copying one Shape by value moves over 10 KiB.
 7 | type Shape struct {
 8 | 	Kind      int
 9 | 	Dimension float32
10 | 
11 | 	ExtraData [10 * 1024]byte // 10 KiB; not read by any benchmark in this file
12 | }
13 | // data is the slice of 1024 zero-valued Shapes that every benchmark in this file iterates over.
14 | var data = make([]Shape, 1024)
15 | // BenchmarkCopy ranges over data by value: each element is copied into shape, its Dimension is incremented, and the whole struct is assigned back to data[i].
16 | func BenchmarkCopy(b *testing.B) {
17 | 	total := float32(0) // NOTE(review): total is never read after the loops; confirm the additions are not eliminated
18 | 	for k := 0; k < b.N; k++ {
19 | 		for i, shape := range data {
20 | 			total += shape.Dimension
21 | 			shape.Dimension++
22 | 			data[i] = shape
23 | 		}
24 | 	}
25 | }
26 | // BenchmarkReference performs the same read and increment as BenchmarkCopy through a pointer to data[i], so no Shape value is copied.
27 | func BenchmarkReference(b *testing.B) {
28 | 	total := float32(0)
29 | 	for k := 0; k < b.N; k++ {
30 | 		for i := range data {
31 | 			shape := &data[i]
32 | 			total += shape.Dimension
33 | 			shape.Dimension++
34 | 		}
35 | 	}
36 | }
37 | // BenchmarkCall ranges over data by value and passes each Shape by value to the non-inlined DimensionValue.
38 | func BenchmarkCall(b *testing.B) {
39 | 	total := float32(0)
40 | 	for k := 0; k < b.N; k++ {
41 | 		for _, shape := range data {
42 | 			total += DimensionValue(shape)
43 | 		}
44 | 	}
45 | }
46 | // BenchmarkCallPointer passes a pointer to each element of data to the non-inlined DimensionPointer.
47 | func BenchmarkCallPointer(b *testing.B) {
48 | 	total := float32(0)
49 | 	for k := 0; k < b.N; k++ {
50 | 		for i := range data {
51 | 			shape := &data[i]
52 | 			total += DimensionPointer(shape)
53 | 		}
54 | 	}
55 | }
56 | // DimensionValue returns v.Dimension; it takes the Shape by value and is marked noinline so the call is always made.
57 | //go:noinline
58 | func DimensionValue(v Shape) float32 { return v.Dimension }
59 | // DimensionPointer returns v.Dimension through a pointer; it is marked noinline so the call is always made. v must not be nil.
60 | //go:noinline
61 | func DimensionPointer(v *Shape) float32 { return v.Dimension }
62 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # A session on low-level optimizations
 2 | 
 3 | * [measuring time](./measure_time)
 4 | * [writing benchmarks](./writing_benchmarks)
 5 | * [inspecting assembly](./inspecting_assembly)
 6 | * [basic profiling](./profiling)
 7 | * [branch cost](./branch)
 8 | * [call cost](./call)
 9 | * [dispatch call cost](./dispatch)
10 | * [unintentional copy](./unintentional_copy)
11 | * [pointer cost](./pointers)
12 | * [memory cost](./memory)
13 | * [bounds check cost](./bounds_checks)
14 | * [unrolling](./unrolling)
15 | * [loop alignment](./loop_alignment)
16 | * [heap vs stack allocation](./heap_alloc)
17 | 
18 | ## Additional Videos / Articles
19 | 
20 | * Intuitive Performance https://www.youtube.com/watch?v=51ZIFNqgCkA
21 | * https://egonelbre.com/a-tale-of-bfs/
22 | * https://egonelbre.com/a-tale-of-bfs-going-parallel/
23 | 
24 | ## Recommended
25 | 
26 | * https://github.com/dgryski/go-perfbook
27 | * https://en.algorithmica.org/hpc/
28 | * https://www.dataorienteddesign.com/dodbook/
29 | * https://www.computerenhance.com/
30 | 
31 | ## Tools
32 | 
33 | * Benchmark statistical analysis (https://golang.org/x/perf/cmd/benchstat)
34 | * Assembly and Code viewer (https://github.com/loov/lensm)
35 | * Visualizing bounds checks (https://github.com/loov/view-annotated-file)
36 | * AMD μProf (https://www.amd.com/en/developer/uprof.html)
37 | * Intel VTune Profiler (https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html)
38 | * Apple Instruments (https://help.apple.com/instruments/mac/current/#)


--------------------------------------------------------------------------------
/call/example_test.go:
--------------------------------------------------------------------------------
 1 | package example
 2 | 
 3 | import (
 4 | 	"math/rand"
 5 | 	"testing"
 6 | )
 7 | // sink accumulates the values returned inside the benchmark loops so the calls' results are used.
 8 | var sink int
 9 | // Nop returns 0. It carries no noinline directive, so the compiler is free to inline it.
10 | func Nop() int {
11 | 	return 0
12 | }
13 | // BenchmarkInlined times direct calls to Nop, adding each result to sink.
14 | func BenchmarkInlined(b *testing.B) {
15 | 	for i := 0; i < b.N; i++ {
16 | 		sink += Nop()
17 | 	}
18 | }
19 | // Nop2 returns 0, like Nop, but is marked noinline so every call is a real function call.
20 | //go:noinline
21 | func Nop2() int {
22 | 	return 0
23 | }
24 | // BenchmarkNotInlined times direct calls to the non-inlined Nop2, adding each result to sink.
25 | func BenchmarkNotInlined(b *testing.B) {
26 | 	for i := 0; i < b.N; i++ {
27 | 		sink += Nop2()
28 | 	}
29 | }
30 | // createANop returns Nop2 as a func value; it is noinline, so the caller only sees an opaque function value.
31 | //go:noinline
32 | func createANop() func() int { return Nop2 }
33 | // BenchmarkFuncCall times calls made through the func value returned by createANop.
34 | func BenchmarkFuncCall(b *testing.B) {
35 | 	nop := createANop()
36 | 	for i := 0; i < b.N; i++ {
37 | 		sink += nop()
38 | 	}
39 | }
40 | // Noper is the one-method interface used by the interface-call benchmarks; nop and nop2 implement it.
41 | type Noper interface {
42 | 	Nop() int
43 | }
44 | // nop is an empty struct implementing Noper; its Nop method returns 0.
45 | type nop struct{}
46 | 
47 | func (nop) Nop() int { return 0 }
48 | // nop2 is a second empty struct implementing Noper; its Nop method returns 1.
49 | type nop2 struct{}
50 | 
51 | func (nop2) Nop() int { return 1 }
52 | // BenchmarkDevirtualizedInterfaceCall times Nop calls on a Noper assigned the concrete nop{} in this function (per the name, the compiler is expected to devirtualize the call; verify with -gcflags=-m).
53 | func BenchmarkDevirtualizedInterfaceCall(b *testing.B) {
54 | 	var nop Noper = nop{}
55 | 	for i := 0; i < b.N; i++ {
56 | 		sink += nop.Nop()
57 | 	}
58 | }
59 | // oneof returns a nop or a nop2 as a Noper, chosen by rand.Intn(2), so the dynamic type is only known at run time.
60 | func oneof() Noper {
61 | 	pickFirst := rand.Intn(2) == 0
62 | 	if pickFirst {
63 | 		return nop{}
64 | 	}
65 | 	return nop2{}
66 | }
67 | // BenchmarkInterfaceCall obtains a Noper from oneof, resets the timer to exclude that setup, then times Nop calls made through the interface.
68 | func BenchmarkInterfaceCall(b *testing.B) {
69 | 	var nop Noper = oneof()
70 | 	b.ResetTimer()
71 | 	for i := 0; i < b.N; i++ {
72 | 		sink += nop.Nop()
73 | 	}
74 | }
75 | // BenchmarkInterfaceCall2 binds the interface method to a method value once and times calls through that func value.
76 | func BenchmarkInterfaceCall2(b *testing.B) {
77 | 	var nop Noper = oneof()
78 | 	b.ResetTimer()
79 | 	fn := nop.Nop // created after ResetTimer, so this one-time binding is inside the timed region
80 | 	for i := 0; i < b.N; i++ {
81 | 		sink += fn()
82 | 	}
83 | }
84 | // BenchmarkC times calls to CNop, adding each result to sink (CNop is defined outside this file).
85 | func BenchmarkC(b *testing.B) {
86 | 	for i := 0; i < b.N; i++ {
87 | 		sink += CNop()
88 | 	}
89 | }
90 | 


--------------------------------------------------------------------------------
/branch/example_test.go:
--------------------------------------------------------------------------------
 1 | package example
 2 | 
 3 | import (
 4 | 	"math/rand"
 5 | 	"slices"
 6 | 	"sort"
 7 | 	"testing"
 8 | )
 9 | 
10 | // More details at https://igoro.com/archive/fast-and-slow-if-statements-branch-prediction-in-modern-processors/
11 | 
12 | var unsorted []int // random values in [0, 100), filled by init
13 | var sorted []int // sorted copy of unsorted
14 | var half []int // alternating 0 and 100, same length as unsorted
15 | // init fills unsorted with 10000 random values in [0, 100), makes sorted an ascending copy of it, and fills half with 0 at even and 100 at odd indices.
16 | func init() {
17 | 	unsorted = make([]int, 10000)
18 | 	for idx := 0; idx < len(unsorted); idx++ {
19 | 		unsorted[idx] = rand.Intn(100)
20 | 	}
21 | 
22 | 	sorted = slices.Clone(unsorted)
23 | 	sort.Ints(sorted)
24 | 
25 | 	half = make([]int, len(unsorted))
26 | 	for idx := range half {
27 | 		value := 100
28 | 		if idx%2 == 0 {
29 | 			value = 0
30 | 		}
31 | 		half[idx] = value
32 | 	}
33 | }
34 | // DiffLimit returns the sum of the elements of vs greater than limit minus the sum of all other elements, using a single if/else per element.
35 | //go:noinline
36 | func DiffLimit(vs []int, limit int) int {
37 | 	above := 0
38 | 	below := 0
39 | 	for _, v := range vs {
40 | 		if v > limit {
41 | 			above += v
42 | 		} else {
43 | 			below += v
44 | 		}
45 | 	}
46 | 	return above - below
47 | }
48 | // BenchmarkUnsorted times DiffLimit over the randomly ordered slice with limit 50; the result is discarded (DiffLimit is noinline).
49 | func BenchmarkUnsorted(b *testing.B) {
50 | 	for i := 0; i < b.N; i++ {
51 | 		DiffLimit(unsorted, 50)
52 | 	}
53 | }
54 | // BenchmarkSorted times DiffLimit over the sorted slice with limit 50.
55 | func BenchmarkSorted(b *testing.B) {
56 | 	for i := 0; i < b.N; i++ {
57 | 		DiffLimit(sorted, 50)
58 | 	}
59 | }
60 | // BenchmarkHalf times DiffLimit over half, where the comparison outcome strictly alternates from one element to the next.
61 | func BenchmarkHalf(b *testing.B) {
62 | 	for i := 0; i < b.N; i++ {
63 | 		DiffLimit(half, 50)
64 | 	}
65 | }
66 | // DiffLimitCMOV returns the same value as DiffLimit (sum of elements above limit minus sum of the rest), written as two independent ifs with no else branch.
67 | //go:noinline
68 | func DiffLimitCMOV(vs []int, limit int) int {
69 | 	above := 0
70 | 	below := 0
71 | 	for _, v := range vs {
72 | 		if v > limit {
73 | 			above += v
74 | 		}
75 | 		if v <= limit { // <= so that elements equal to limit count as "below", exactly as in DiffLimit
76 | 			below += v
77 | 		}
78 | 	}
79 | 	return above - below
80 | }
81 | // BenchmarkUnsortedCMOV times DiffLimitCMOV over the randomly ordered slice with limit 50.
82 | func BenchmarkUnsortedCMOV(b *testing.B) {
83 | 	for i := 0; i < b.N; i++ {
84 | 		DiffLimitCMOV(unsorted, 50)
85 | 	}
86 | }
87 | // BenchmarkSortedCMOV times DiffLimitCMOV over the sorted slice with limit 50.
88 | func BenchmarkSortedCMOV(b *testing.B) {
89 | 	for i := 0; i < b.N; i++ {
90 | 		DiffLimitCMOV(sorted, 50)
91 | 	}
92 | }
93 | 


--------------------------------------------------------------------------------
/profiling/example_test.go:
--------------------------------------------------------------------------------
 1 | package profiling
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"strconv"
 6 | 	"strings"
 7 | 	"testing"
 8 | )
 9 | 
10 | // go test -bench Label -benchmem
11 | //
12 | // go test -bench Label -cpuprofile cpu.prof -benchtime 5s
13 | // go test -bench Label -memprofile mem.prof -benchtime 5s
14 | // go test -bench Label -memprofile mem.prof -benchtime 5s
15 | //
16 | // On Non-Windows:
17 | //    go tool pprof cpu.prof
18 | //    go tool pprof mem.prof
19 | //
20 | //    go tool pprof -lines cpu.prof
21 | //
22 | // On Windows:
23 | //    go tool pprof profiling.test.exe cpu.prof
24 | //    go tool pprof profiling.test.exe mem.prof
25 | //
26 | //    go tool pprof -lines profiling.test.exe cpu.prof
27 | //
28 | // Commands in pprof:
29 | //   top 30
30 | //   top 30 -cum
31 | //   list Format
32 | //   disasm Format
33 | //
34 | 
35 | var sink string // receives every benchmark result so the formatting calls are used
36 | // Format returns prefix, count and suffix joined together, formatted with fmt.Sprintf and three %v verbs.
37 | //go:noinline
38 | func Format(prefix string, count int, suffix string) string {
39 | 	return fmt.Sprintf("%v%v%v", prefix, count, suffix)
40 | }
41 | // Add returns prefix, the decimal form of count (strconv.Itoa) and suffix joined with string concatenation.
42 | //go:noinline
43 | func Add(prefix string, count int, suffix string) string {
44 | 	return prefix + strconv.Itoa(count) + suffix
45 | }
46 | // Builder returns prefix + decimal count + suffix, assembled in a strings.Builder grown up front to hold prefix, suffix and up to 20 bytes of digits and sign.
47 | //go:noinline
48 | func Builder(prefix string, count int, suffix string) string {
49 | 	var b strings.Builder
50 | 	b.Grow(len(prefix) + 20 + len(suffix))
51 | 	b.WriteString(prefix)
52 | 
53 | 	var buffer [20]byte // scratch space; 20 bytes fit any int64 in base 10 including the sign
54 | 	result := strconv.AppendInt(buffer[:0], int64(count), 10) // start from length 0 so the digits land at the front of buffer
55 | 	b.Write(result)
56 | 
57 | 	b.WriteString(suffix)
58 | 
59 | 	return b.String()
60 | }
61 | // BenchmarkLabel runs Format, Add and Builder as sub-benchmarks with the same arguments ("Alpha", i, "Variant"), storing each result in sink.
62 | func BenchmarkLabel(b *testing.B) {
63 | 	b.Run("Format", func(b *testing.B) {
64 | 		for i := 0; i < b.N; i++ {
65 | 			sink = Format("Alpha", i, "Variant")
66 | 		}
67 | 	})
68 | 
69 | 	b.Run("Add", func(b *testing.B) {
70 | 		for i := 0; i < b.N; i++ {
71 | 			sink = Add("Alpha", i, "Variant")
72 | 		}
73 | 	})
74 | 
75 | 	b.Run("Builder", func(b *testing.B) {
76 | 		for i := 0; i < b.N; i++ {
77 | 			sink = Builder("Alpha", i, "Variant")
78 | 		}
79 | 	})
80 | }
81 | 


--------------------------------------------------------------------------------
/bounds_checks/example_test.go:
--------------------------------------------------------------------------------
 1 | package example
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 	"unsafe"
 6 | )
 7 | 
 8 | // To disable bounds checks entirely:
 9 | //
10 | //    go test -gcflags=-B -bench .
11 | //
12 | // Viewing bounds checks
13 | //
14 | //    go test -gcflags "all=-m -m -d=ssa/check_bce/debug" -bench . 2>analysis.log
15 | //    go install github.com/loov/view-annotated-file@latest
16 | //    view-annotated-file analysis.log
17 | 
18 | var (
19 | 	xs   = make([]float32, 10000) // first input vector (left at its zero values)
20 | 	ys   = make([]float32, 10000) // second input vector (left at its zero values)
21 | 	incx = 1                      // stride through xs; a variable, not a constant
22 | 	incy = 1                      // stride through ys; a variable, not a constant
23 | )
24 | // Dot returns the sum of xs[k*incx]*ys[k*incy] for k in [0, n), using ordinary indexed slice accesses; it panics if an index leaves either slice.
25 | //go:noinline
26 | func Dot(xs []float32, incx int, ys []float32, incy int, n int) float32 {
27 | 	var r float32
28 | 	xi, yi := 0, 0
29 | 	for ; n > 0; n-- {
30 | 		r += xs[xi] * ys[yi]
31 | 
32 | 		xi += incx
33 | 		yi += incy
34 | 	}
35 | 	return r
36 | }
37 | // DotUnsafe computes the same sum as Dot but reads elements through unsafeAt, which does no bounds checking; the caller must keep every index inside xs and ys.
38 | //go:noinline
39 | func DotUnsafe(xs []float32, incx int, ys []float32, incy int, n int) float32 {
40 | 	var r float32
41 | 	xi, yi := 0, 0
42 | 	for ; n > 0; n-- {
43 | 		r += *unsafeAt(xs, xi) * *unsafeAt(ys, yi)
44 | 
45 | 		xi += incx
46 | 		yi += incy
47 | 	}
48 | 	return r
49 | }
50 | // DotPointers computes the same sum as Dot by advancing two raw element pointers instead of indices; no bounds are checked, so the caller must keep all n steps inside both slices.
51 | //go:noinline
52 | func DotPointers(xs []float32, incx int, ys []float32, incy int, n int) float32 {
53 | 	var r float32
54 | 	xp := unsafe.Pointer(unsafe.SliceData(xs))
55 | 	yp := unsafe.Pointer(unsafe.SliceData(ys))
56 | 	incxp, incyp := uintptr(incx*4), uintptr(incy*4) // strides in bytes; 4 is the size of a float32
57 | 	for ; n > 0; n-- {
58 | 		r += *(*float32)(xp) * *(*float32)(yp)
59 | 		xp = unsafe.Add(xp, incxp)
60 | 		yp = unsafe.Add(yp, incyp)
61 | 	}
62 | 	return r
63 | }
64 | // sink accumulates the benchmark results so the dot products are used.
65 | var sink float32
66 | // BenchmarkDot times Dot over the full length of xs and ys with the package-level strides.
67 | func BenchmarkDot(b *testing.B) {
68 | 	for i := 0; i < b.N; i++ {
69 | 		sink += Dot(xs, incx, ys, incy, len(xs))
70 | 	}
71 | }
72 | // BenchmarkDotUnsafe times DotUnsafe with the same arguments as BenchmarkDot.
73 | func BenchmarkDotUnsafe(b *testing.B) {
74 | 	for i := 0; i < b.N; i++ {
75 | 		sink += DotUnsafe(xs, incx, ys, incy, len(xs))
76 | 	}
77 | }
78 | // BenchmarkDotPointers times DotPointers with the same arguments as BenchmarkDot.
79 | func BenchmarkDotPointers(b *testing.B) {
80 | 	for i := 0; i < b.N; i++ {
81 | 		sink += DotPointers(xs, incx, ys, incy, len(xs))
82 | 	}
83 | }
84 | // unsafeAt returns a pointer to element index of xs, computed as the slice's data pointer plus index*sizeof(T); index is not checked against len(xs).
85 | func unsafeAt[T any](xs []T, index int) *T {
86 | 	return (*T)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(xs)), uintptr(index)*unsafe.Sizeof(xs[0])))
87 | }
88 | 


--------------------------------------------------------------------------------
/unrolling/example_test.go:
--------------------------------------------------------------------------------
  1 | package example
  2 | 
  3 | import (
  4 | 	"testing"
  5 | )
  6 | // Benchmark inputs: two zero-valued 10000-element vectors and their strides, kept in variables rather than constants.
  7 | var (
  8 | 	xs   = make([]float32, 10000)
  9 | 	ys   = make([]float32, 10000)
 10 | 	incx = 1
 11 | 	incy = 1
 12 | )
 13 | // Dot is the baseline: it returns the sum of xs[k*incx]*ys[k*incy] for k in [0, n), one element per loop iteration.
 14 | //go:noinline
 15 | func Dot(xs []float32, incx int, ys []float32, incy int, n int) float32 {
 16 | 	var r float32
 17 | 	xi, yi := 0, 0
 18 | 	for ; n > 0; n-- {
 19 | 		r += xs[xi] * ys[yi]
 20 | 
 21 | 		xi += incx
 22 | 		yi += incy
 23 | 	}
 24 | 	return r
 25 | }
 26 | // DotUnroll computes the same sum as Dot with the loop unrolled four times; products are still added to the single accumulator r in the same order.
 27 | //go:noinline
 28 | func DotUnroll(xs []float32, incx int, ys []float32, incy int, n int) float32 {
 29 | 	var r float32
 30 | 	xi, yi := 0, 0
 31 | 	for ; n >= 4; n -= 4 { // main loop: four elements per iteration
 32 | 		r += xs[xi] * ys[yi]
 33 | 		xi += incx
 34 | 		yi += incy
 35 | 
 36 | 		r += xs[xi] * ys[yi]
 37 | 		xi += incx
 38 | 		yi += incy
 39 | 
 40 | 		r += xs[xi] * ys[yi]
 41 | 		xi += incx
 42 | 		yi += incy
 43 | 
 44 | 		r += xs[xi] * ys[yi]
 45 | 		xi += incx
 46 | 		yi += incy
 47 | 	}
 48 | 	for ; n > 0; n-- { // tail: the remaining 0 to 3 elements
 49 | 		r += xs[xi] * ys[yi]
 50 | 
 51 | 		xi += incx
 52 | 		yi += incy
 53 | 	}
 54 | 	return r
 55 | }
 56 | // DotPipeline is unrolled like DotUnroll but spreads the products over four independent accumulators summed at the end; the different addition order may change the float32 result in its last bits.
 57 | //go:noinline
 58 | func DotPipeline(xs []float32, incx int, ys []float32, incy int, n int) float32 {
 59 | 	var r1, r2, r3, r4 float32
 60 | 	xi, yi := 0, 0
 61 | 	for ; n >= 4; n -= 4 { // main loop: one element into each accumulator
 62 | 		r1 += xs[xi] * ys[yi]
 63 | 		xi += incx
 64 | 		yi += incy
 65 | 
 66 | 		r2 += xs[xi] * ys[yi]
 67 | 		xi += incx
 68 | 		yi += incy
 69 | 
 70 | 		r3 += xs[xi] * ys[yi]
 71 | 		xi += incx
 72 | 		yi += incy
 73 | 
 74 | 		r4 += xs[xi] * ys[yi]
 75 | 		xi += incx
 76 | 		yi += incy
 77 | 	}
 78 | 	for ; n > 0; n-- { // tail: the remaining 0 to 3 elements all go into r1
 79 | 		r1 += xs[xi] * ys[yi]
 80 | 
 81 | 		xi += incx
 82 | 		yi += incy
 83 | 	}
 84 | 	return r1 + r2 + r3 + r4
 85 | }
 86 | // sink accumulates the benchmark results so the dot products are used.
 87 | var sink float32
 88 | // BenchmarkDot times the baseline Dot over the full length of xs and ys.
 89 | func BenchmarkDot(b *testing.B) {
 90 | 	for i := 0; i < b.N; i++ {
 91 | 		sink += Dot(xs, incx, ys, incy, len(xs))
 92 | 	}
 93 | }
 94 | // BenchmarkDotUnroll times DotUnroll with the same arguments as BenchmarkDot.
 95 | func BenchmarkDotUnroll(b *testing.B) {
 96 | 	for i := 0; i < b.N; i++ {
 97 | 		sink += DotUnroll(xs, incx, ys, incy, len(xs))
 98 | 	}
 99 | }
100 | // BenchmarkDotPipeline times DotPipeline with the same arguments as BenchmarkDot.
101 | func BenchmarkDotPipeline(b *testing.B) {
102 | 	for i := 0; i < b.N; i++ {
103 | 		sink += DotPipeline(xs, incx, ys, incy, len(xs))
104 | 	}
105 | }
106 | 


--------------------------------------------------------------------------------
/dispatch/example_test.go:
--------------------------------------------------------------------------------
  1 | package example
  2 | 
  3 | import (
  4 | 	"math"
  5 | 	"math/rand"
  6 | 	"slices"
  7 | 	"sort"
  8 | 	"testing"
  9 | 	"unsafe"
 10 | )
 11 | // unsorted holds Circle and Square values in random order; sorted is a copy grouped by dynamic type; nointerface holds ShapeStruct values (see init).
 12 | var unsorted []Shape
 13 | var sorted []Shape
 14 | var nointerface []ShapeStruct
 15 | // init fills unsorted with 10000 randomly chosen Circles and Squares, fills nointerface with ShapeStructs of the matching kind, and sets sorted to a copy of unsorted ordered by interface type word.
 16 | func init() {
 17 | 	unsorted = make([]Shape, 1e4)
 18 | 	nointerface = make([]ShapeStruct, len(unsorted))
 19 | 
 20 | 	for i := range unsorted {
 21 | 		if rand.Intn(2) == 0 {
 22 | 			unsorted[i] = Circle{rand.Float32()}
 23 | 			nointerface[i] = ShapeStruct{CircleKind, rand.Float32()} // same kind, but an independently drawn dimension
 24 | 		} else {
 25 | 			unsorted[i] = Square{rand.Float32()}
 26 | 			nointerface[i] = ShapeStruct{SquareKind, rand.Float32()} // same kind, but an independently drawn dimension
 27 | 		}
 28 | 	}
 29 | 
 30 | 	sorted = slices.Clone(unsorted)
 31 | 	type iface struct { // NOTE(review): assumes an interface value is laid out as (type word, data pointer); confirm for the Go version in use
 32 | 		itab uintptr
 33 | 		data unsafe.Pointer
 34 | 	}
 35 | 	sort.Slice(sorted, func(i, k int) bool { // order by type word so that equal dynamic types end up adjacent
 36 | 		a := (*iface)(unsafe.Pointer(&sorted[i])).itab
 37 | 		b := (*iface)(unsafe.Pointer(&sorted[k])).itab
 38 | 		return a < b
 39 | 	})
 40 | }
 41 | // Shape is the interface the dispatch benchmarks call through; Circle and Square implement it.
 42 | type Shape interface {
 43 | 	Area() float32
 44 | }
 45 | // Circle and Square are the two concrete Shape implementations, each holding a single float32 dimension.
 46 | type Circle struct{ Radius float32 }
 47 | type Square struct{ Side float32 }
 48 | // Area returns pi * Radius * Radius.
 49 | func (s Circle) Area() float32 {
 50 | 	return math.Pi * s.Radius * s.Radius
 51 | }
 52 | func (s Square) Area() float32 { // Area returns Side * Side.
 53 | 	return s.Side * s.Side
 54 | }
 55 | // TotalArea returns the sum of Area() over shapes, calling the method through the Shape interface; it returns 0 for an empty slice.
 56 | func TotalArea(shapes []Shape) (total float32) {
 57 | 	for _, shape := range shapes {
 58 | 		total += shape.Area()
 59 | 	}
 60 | 	return total
 61 | }
 62 | // BenchmarkUnsorted times TotalArea over unsorted, where the dynamic type changes randomly from element to element.
 63 | func BenchmarkUnsorted(b *testing.B) {
 64 | 	total := float32(0)
 65 | 	for k := 0; k < b.N; k++ {
 66 | 		total += TotalArea(unsorted)
 67 | 	}
 68 | }
 69 | // BenchmarkSorted times TotalArea over sorted, where elements of the same dynamic type are adjacent.
 70 | func BenchmarkSorted(b *testing.B) {
 71 | 	total := float32(0)
 72 | 	for k := 0; k < b.N; k++ {
 73 | 		total += TotalArea(sorted)
 74 | 	}
 75 | }
 76 | // ShapeKind is the one-byte tag that tells ShapeStruct.Area which formula to use.
 77 | type ShapeKind byte
 78 | // The two ShapeKind values handled by ShapeStruct.Area.
 79 | const (
 80 | 	CircleKind = ShapeKind(0)
 81 | 	SquareKind = ShapeKind(1)
 82 | )
 83 | // ShapeStruct represents either shape without an interface: Kind selects the formula and Dim is the radius or the side.
 84 | type ShapeStruct struct {
 85 | 	Kind ShapeKind
 86 | 	Dim  float32
 87 | }
 88 | // Area switches on s.Kind: pi*Dim*Dim for CircleKind, Dim*Dim for SquareKind, and 0 for any other kind.
 89 | func (s ShapeStruct) Area() float32 {
 90 | 	switch s.Kind {
 91 | 	case CircleKind:
 92 | 		return math.Pi * s.Dim * s.Dim
 93 | 	case SquareKind:
 94 | 		return s.Dim * s.Dim
 95 | 	}
 96 | 	return 0
 97 | }
 98 | // BenchmarkNoInterface sums ShapeStruct.Area over nointerface, calling the method on the concrete type instead of through an interface.
 99 | func BenchmarkNoInterface(b *testing.B) {
100 | 	total := float32(0) // NOTE(review): total is never read after the loops; confirm the Area calls are not eliminated
101 | 	for k := 0; k < b.N; k++ {
102 | 		for _, shape := range nointerface {
103 | 			total += shape.Area()
104 | 		}
105 | 	}
106 | }
107 | 


--------------------------------------------------------------------------------
/loop_alignment/example_test.go:
--------------------------------------------------------------------------------
  1 | package example
  2 | 
  3 | import (
  4 | 	"testing"
  5 | )
  6 | // Benchmark inputs: two zero-valued 10000-element vectors and their strides, kept in variables rather than constants.
  7 | var (
  8 | 	xs   = make([]float32, 10000)
  9 | 	ys   = make([]float32, 10000)
 10 | 	incx = 1
 11 | 	incy = 1
 12 | )
 13 | // Dot returns the sum of xs[k*incx]*ys[k*incy] for k in [0, n). Dot1 to Dot9 repeat this body verbatim; no benchmark visible in this file calls Dot itself.
 14 | //go:noinline
 15 | func Dot(xs []float32, incx int, ys []float32, incy int, n int) float32 {
 16 | 	var r float32
 17 | 	xi, yi := 0, 0
 18 | 	for ; n > 0; n-- {
 19 | 		r += xs[xi] * ys[yi]
 20 | 
 21 | 		xi += incx
 22 | 		yi += incy
 23 | 	}
 24 | 	return r
 25 | }
 26 | // Dot1 is a verbatim copy of Dot under its own name (presumably so each copy is placed at a different code address; compare the benchmark results).
 27 | //go:noinline
 28 | func Dot1(xs []float32, incx int, ys []float32, incy int, n int) float32 {
 29 | 	var r float32
 30 | 	xi, yi := 0, 0
 31 | 	for ; n > 0; n-- {
 32 | 		r += xs[xi] * ys[yi]
 33 | 
 34 | 		xi += incx
 35 | 		yi += incy
 36 | 	}
 37 | 	return r
 38 | }
 39 | // Dot2 is a verbatim copy of Dot under its own name, timed by BenchmarkDot2.
 40 | //go:noinline
 41 | func Dot2(xs []float32, incx int, ys []float32, incy int, n int) float32 {
 42 | 	var r float32
 43 | 	xi, yi := 0, 0
 44 | 	for ; n > 0; n-- {
 45 | 		r += xs[xi] * ys[yi]
 46 | 
 47 | 		xi += incx
 48 | 		yi += incy
 49 | 	}
 50 | 	return r
 51 | }
 52 | // Dot3 is a verbatim copy of Dot under its own name, timed by BenchmarkDot3.
 53 | //go:noinline
 54 | func Dot3(xs []float32, incx int, ys []float32, incy int, n int) float32 {
 55 | 	var r float32
 56 | 	xi, yi := 0, 0
 57 | 	for ; n > 0; n-- {
 58 | 		r += xs[xi] * ys[yi]
 59 | 
 60 | 		xi += incx
 61 | 		yi += incy
 62 | 	}
 63 | 	return r
 64 | }
 65 | // Dot4 is a verbatim copy of Dot under its own name, timed by BenchmarkDot4.
 66 | //go:noinline
 67 | func Dot4(xs []float32, incx int, ys []float32, incy int, n int) float32 {
 68 | 	var r float32
 69 | 	xi, yi := 0, 0
 70 | 	for ; n > 0; n-- {
 71 | 		r += xs[xi] * ys[yi]
 72 | 
 73 | 		xi += incx
 74 | 		yi += incy
 75 | 	}
 76 | 	return r
 77 | }
 78 | // Dot5 is a verbatim copy of Dot under its own name, timed by BenchmarkDot5.
 79 | //go:noinline
 80 | func Dot5(xs []float32, incx int, ys []float32, incy int, n int) float32 {
 81 | 	var r float32
 82 | 	xi, yi := 0, 0
 83 | 	for ; n > 0; n-- {
 84 | 		r += xs[xi] * ys[yi]
 85 | 
 86 | 		xi += incx
 87 | 		yi += incy
 88 | 	}
 89 | 	return r
 90 | }
 91 | // Dot6 is a verbatim copy of Dot under its own name, timed by BenchmarkDot6.
 92 | //go:noinline
 93 | func Dot6(xs []float32, incx int, ys []float32, incy int, n int) float32 {
 94 | 	var r float32
 95 | 	xi, yi := 0, 0
 96 | 	for ; n > 0; n-- {
 97 | 		r += xs[xi] * ys[yi]
 98 | 
 99 | 		xi += incx
100 | 		yi += incy
101 | 	}
102 | 	return r
103 | }
104 | // Dot7 is a verbatim copy of Dot under its own name, timed by BenchmarkDot7.
105 | //go:noinline
106 | func Dot7(xs []float32, incx int, ys []float32, incy int, n int) float32 {
107 | 	var r float32
108 | 	xi, yi := 0, 0
109 | 	for ; n > 0; n-- {
110 | 		r += xs[xi] * ys[yi]
111 | 
112 | 		xi += incx
113 | 		yi += incy
114 | 	}
115 | 	return r
116 | }
117 | // Dot8 is a verbatim copy of Dot under its own name, timed by BenchmarkDot8.
118 | //go:noinline
119 | func Dot8(xs []float32, incx int, ys []float32, incy int, n int) float32 {
120 | 	var r float32
121 | 	xi, yi := 0, 0
122 | 	for ; n > 0; n-- {
123 | 		r += xs[xi] * ys[yi]
124 | 
125 | 		xi += incx
126 | 		yi += incy
127 | 	}
128 | 	return r
129 | }
130 | // Dot9 is a verbatim copy of Dot under its own name, timed by BenchmarkDot9.
131 | //go:noinline
132 | func Dot9(xs []float32, incx int, ys []float32, incy int, n int) float32 {
133 | 	var r float32
134 | 	xi, yi := 0, 0
135 | 	for ; n > 0; n-- {
136 | 		r += xs[xi] * ys[yi]
137 | 
138 | 		xi += incx
139 | 		yi += incy
140 | 	}
141 | 	return r
142 | }
143 | // sink accumulates the benchmark results so the dot products are used.
144 | var sink float32
145 | // BenchmarkDot1 times Dot1 over the full length of xs and ys; BenchmarkDot2 to BenchmarkDot9 differ only in which copy they call.
146 | func BenchmarkDot1(b *testing.B) {
147 | 	for i := 0; i < b.N; i++ {
148 | 		sink += Dot1(xs, incx, ys, incy, len(xs))
149 | 	}
150 | }
151 | // BenchmarkDot2 times Dot2 with the same arguments as BenchmarkDot1.
152 | func BenchmarkDot2(b *testing.B) {
153 | 	for i := 0; i < b.N; i++ {
154 | 		sink += Dot2(xs, incx, ys, incy, len(xs))
155 | 	}
156 | }
157 | // BenchmarkDot3 times Dot3 with the same arguments as BenchmarkDot1.
158 | func BenchmarkDot3(b *testing.B) {
159 | 	for i := 0; i < b.N; i++ {
160 | 		sink += Dot3(xs, incx, ys, incy, len(xs))
161 | 	}
162 | }
163 | // BenchmarkDot4 times Dot4 with the same arguments as BenchmarkDot1.
164 | func BenchmarkDot4(b *testing.B) {
165 | 	for i := 0; i < b.N; i++ {
166 | 		sink += Dot4(xs, incx, ys, incy, len(xs))
167 | 	}
168 | }
169 | // BenchmarkDot5 times Dot5 with the same arguments as BenchmarkDot1.
170 | func BenchmarkDot5(b *testing.B) {
171 | 	for i := 0; i < b.N; i++ {
172 | 		sink += Dot5(xs, incx, ys, incy, len(xs))
173 | 	}
174 | }
175 | // BenchmarkDot6 times Dot6 with the same arguments as BenchmarkDot1.
176 | func BenchmarkDot6(b *testing.B) {
177 | 	for i := 0; i < b.N; i++ {
178 | 		sink += Dot6(xs, incx, ys, incy, len(xs))
179 | 	}
180 | }
181 | // BenchmarkDot7 times Dot7 with the same arguments as BenchmarkDot1.
182 | func BenchmarkDot7(b *testing.B) {
183 | 	for i := 0; i < b.N; i++ {
184 | 		sink += Dot7(xs, incx, ys, incy, len(xs))
185 | 	}
186 | }
187 | // BenchmarkDot8 times Dot8 with the same arguments as BenchmarkDot1.
188 | func BenchmarkDot8(b *testing.B) {
189 | 	for i := 0; i < b.N; i++ {
190 | 		sink += Dot8(xs, incx, ys, incy, len(xs))
191 | 	}
192 | }
193 | // BenchmarkDot9 times Dot9 with the same arguments as BenchmarkDot1.
194 | func BenchmarkDot9(b *testing.B) {
195 | 	for i := 0; i < b.N; i++ {
196 | 		sink += Dot9(xs, incx, ys, incy, len(xs))
197 | 	}
198 | }
199 | 


--------------------------------------------------------------------------------