├── go.mod ├── inspecting_assembly ├── example.o └── example.go ├── call ├── c.go └── example_test.go ├── go.sum ├── writing_benchmarks ├── incorrect_test.go └── correct_test.go ├── heap_alloc └── example_test.go ├── measure_time └── main.go ├── memory └── example_test.go ├── pointers └── example_test.go ├── LICENSE ├── unintentional_copy └── example_test.go ├── README.md ├── branch └── example_test.go ├── profiling └── example_test.go ├── bounds_checks └── example_test.go ├── unrolling └── example_test.go ├── dispatch └── example_test.go └── loop_alignment └── example_test.go /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/egonelbre/perf-example 2 | 3 | go 1.22rc2 4 | 5 | require github.com/loov/hrtime v1.0.3 6 | -------------------------------------------------------------------------------- /inspecting_assembly/example.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/egonelbre/perf-example/HEAD/inspecting_assembly/example.o -------------------------------------------------------------------------------- /call/c.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | // int nop() { return 0; } 4 | import "C" 5 | 6 | func CNop() int { return int(C.nop()) } 7 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/loov/hrtime v1.0.3 h1:LiWKU3B9skJwRPUf0Urs9+0+OE3TxdMuiRPOTwR0gcU= 2 | github.com/loov/hrtime v1.0.3/go.mod h1:yDY3Pwv2izeY4sq7YcPX/dtLwzg5NU1AxWuWxKwd0p0= 3 | -------------------------------------------------------------------------------- /writing_benchmarks/incorrect_test.go: -------------------------------------------------------------------------------- 1 | package writing_benchmarks 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func BenchmarkNoRepeat(b *testing.B) { 9 | // incorrect way to write a benchmark 10 | for i := 0; i < 1000000; i++ { 11 | time.Now() 12 | } 13 | } 14 | 15 | func add(a, b int) int { return a*a + b*b } 16 | 17 | func BenchmarkOptimizedAway(b *testing.B) { 18 | // incorrect way to write a benchmark 19 | for i := 0; i < b.N; i++ { 20 | add(5, 7) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /heap_alloc/example_test.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "testing" 5 | "encoding/binary" 6 | ) 7 | 8 | var escape []byte 9 | 10 | func Heap(v uint64) byte { 11 | escape = make([]byte, 8) 12 | binary.LittleEndian.PutUint64(escape[:], v) 13 | return escape[0] 14 | } 15 | 16 | func Stack(v uint64) byte { 17 | var data [8]byte 18 | binary.LittleEndian.PutUint64(data[:], v) 19 | return data[0] 20 | } 21 | 22 | var sink byte 23 | 24 | func BenchmarkHeap(b *testing.B) { 25 | for i := 0; i < b.N; i++ { 26 | sink += Heap(uint64(i)) 27 | } 28 | } 29 | 30 | func BenchmarkStack(b *testing.B) { 31 | for i := 0; i < b.N; i++ { 32 | sink += Stack(uint64(i)) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /inspecting_assembly/example.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | // Documentation: 4 | // 5 | // https://go.dev/doc/asm 6 | // 7 | 8 | // To build a single package: 9 | // 10 | // go build -o example.o . 11 | 12 | // To view assembly from compilation: 13 | // 14 | // go tool compile -S example.go 15 | 16 | // go tool objdump -S -s Add example.o 17 | func Add(a, b int) int { 18 | return a + b 19 | } 20 | 21 | // go tool objdump -S -s Loop example.o 22 | func Loop(a, b, n int) (r int) { 23 | for i := 0; i < n; i++ { 24 | r += a + b 25 | } 26 | return r 27 | } 28 | 29 | // Also: 30 | // 31 | // go install loov.dev/lensm@main 32 | // lensm example.o 33 | // 34 | // If you want to properly understand assembly, 35 | // then go write a virtual machine first. 36 | // 37 | // https://adventofcode.com/2019 is a pretty good thing to work through. 38 | -------------------------------------------------------------------------------- /measure_time/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/loov/hrtime" 8 | ) 9 | 10 | func main() { 11 | for i := 0; i < 10; i++ { 12 | _ = hrtime.Now() 13 | _ = time.Now() 14 | } 15 | 16 | var now [10000000]time.Time 17 | start := hrtime.Now() 18 | for i := range now { 19 | now[i] = time.Now() 20 | } 21 | finish := hrtime.Now() 22 | 23 | // let's look at system granularity 24 | var durations []time.Duration 25 | for i := range now[:len(now)-1] { 26 | durations = append(durations, now[i+1].Sub(now[i])) 27 | } 28 | 29 | histogram := hrtime.NewDurationHistogram(durations, 30 | &hrtime.HistogramOptions{ 31 | BinCount: 10, 32 | NiceRange: true, 33 | ClampMaximum: float64(10 * time.Microsecond), 34 | ClampPercentile: 0, 35 | }) 36 | 37 | fmt.Println(histogram) 38 | 39 | // let's look at measurement overhead 40 | fmt.Println(float64(finish-start)/float64(len(now)), "ns") 41 | } 42 | -------------------------------------------------------------------------------- /writing_benchmarks/correct_test.go: -------------------------------------------------------------------------------- 1 | package writing_benchmarks 2 | 3 | import ( 4 | "runtime" 5 | "testing" 6 | "time" 7 | 8 | "github.com/loov/hrtime" 9 | ) 10 | 11 | var sinkTime time.Time 12 | var sinkDuration time.Duration 13 | 14 | func BenchmarkTime(b *testing.B) { 15 | for i := 0; i < b.N; i++ { 16 | sinkTime = time.Now() 17 | } 18 | } 19 | 20 | func BenchmarkTimeAlternative(b *testing.B) { 21 | for i := 0; i < b.N; i++ { 22 | runtime.KeepAlive(time.Now()) 23 | } 24 | } 25 | 26 | // Using https://pkg.go.dev/golang.org/x/perf/cmd/benchstat 27 | // 28 | // go test -bench Sub -count 10 | tee bench.log 29 | // go install golang.org/x/perf/cmd/benchstat@latest 30 | // benchstat bench.log 31 | // benchstat old.log new.log 32 | // benchstat -col /impl -row .name bench.log 33 | func BenchmarkSub(b *testing.B) { 34 | b.Run("impl=hrtime", func(b *testing.B) { 35 | for i := 0; i < b.N; i++ { 36 | sinkDuration = hrtime.Now() 37 | } 38 | }) 39 | 40 | b.Run("impl=time", func(b *testing.B) { 41 | for i := 0; i < b.N; i++ { 42 | sinkTime = time.Now() 43 | } 44 | }) 45 | } 46 | -------------------------------------------------------------------------------- /memory/example_test.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "math/rand" 5 | "testing" 6 | ) 7 | 8 | // More details at https://igoro.com/archive/gallery-of-processor-cache-effects/ 9 | 10 | var ( 11 | data128B = [128 / 4]int32{} // 128 bytes 12 | data1MB = [1024 * 1024 / 4]int32{} // 1 MB 13 | data64MB = [64 * 1024 * 1024 / 4]int32{} // 64MB 14 | 15 | order []int 16 | ) 17 | 18 | func init() { 19 | order = make([]int, 1e6) 20 | for i := range order { 21 | order[i] = rand.Int() 22 | } 23 | } 24 | 25 | var sink int32 26 | 27 | func Benchmark128B(b *testing.B) { 28 | for i := 0; i < b.N; i++ { 29 | var total int32 30 | for _, k := range order { 31 | total += data128B[k%len(data128B)] 32 | } 33 | sink = total 34 | } 35 | } 36 | 37 | func Benchmark1MB(b *testing.B) { 38 | for i := 0; i < b.N; i++ { 39 | var total int32 40 | for _, k := range order { 41 | total += data1MB[k%len(data1MB)] 42 | } 43 | sink = total 44 | } 45 | } 46 | 47 | func Benchmark64MB(b *testing.B) { 48 | for i := 0; i < b.N; i++ { 49 | var total int32 50 | for _, k := range order { 51 | total += data64MB[k%len(data64MB)] 52 | } 53 | sink = total 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /pointers/example_test.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "math/rand" 5 | "slices" 6 | "testing" 7 | ) 8 | 9 | var data []float32 10 | var sorted []*float32 11 | var unsorted []*float32 12 | 13 | func init() { 14 | // data = make([]float32, 100000) 15 | data = make([]float32, 10000000) 16 | for i := range data { 17 | data[i] = rand.Float32() 18 | } 19 | 20 | sorted = make([]*float32, len(data)) 21 | for i := range sorted { 22 | sorted[i] = &data[i] 23 | } 24 | unsorted = slices.Clone(sorted) 25 | rand.Shuffle(len(unsorted), func(i, k int) { 26 | unsorted[i], unsorted[k] = unsorted[k], unsorted[i] 27 | }) 28 | } 29 | 30 | var sink float32 31 | 32 | func BenchmarkUnsorted(b *testing.B) { 33 | for k := 0; k < b.N; k++ { 34 | total := float32(0) 35 | for _, v := range unsorted { 36 | total += *v 37 | } 38 | sink += total 39 | } 40 | } 41 | 42 | func BenchmarkSorted(b *testing.B) { 43 | for k := 0; k < b.N; k++ { 44 | total := float32(0) 45 | for _, v := range sorted { 46 | total += *v 47 | } 48 | sink += total 49 | } 50 | } 51 | 52 | func BenchmarkData(b *testing.B) { 53 | for k := 0; k < b.N; k++ { 54 | total := float32(0) 55 | for _, v := range data { 56 | total += v 57 | } 58 | sink += total 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /unintentional_copy/example_test.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | type Shape struct { 8 | Kind int 9 | Dimension float32 10 | 11 | ExtraData [10 * 1024]byte 12 | } 13 | 14 | var data = make([]Shape, 1024) 15 | 16 | func BenchmarkCopy(b *testing.B) { 17 | total := float32(0) 18 | for k := 0; k < b.N; k++ { 19 | for i, shape := range data { 20 | total += shape.Dimension 21 | shape.Dimension++ 22 | data[i] = shape 23 | } 24 | } 25 | } 26 | 27 | func BenchmarkReference(b *testing.B) { 28 | total := float32(0) 29 | for k := 0; k < b.N; k++ { 30 | for i := range data { 31 | shape := &data[i] 32 | total += shape.Dimension 33 | shape.Dimension++ 34 | } 35 | } 36 | } 37 | 38 | func BenchmarkCall(b *testing.B) { 39 | total := float32(0) 40 | for k := 0; k < b.N; k++ { 41 | for _, shape := range data { 42 | total += DimensionValue(shape) 43 | } 44 | } 45 | } 46 | 47 | func BenchmarkCallPointer(b *testing.B) { 48 | total := float32(0) 49 | for k := 0; k < b.N; k++ { 50 | for i := range data { 51 | shape := &data[i] 52 | total += DimensionPointer(shape) 53 | } 54 | } 55 | } 56 | 57 | //go:noinline 58 | func DimensionValue(v Shape) float32 { return v.Dimension } 59 | 60 | //go:noinline 61 | func DimensionPointer(v *Shape) float32 { return v.Dimension } 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A session on low-level optimizations 2 | 3 | * [measuring time](./measure_time) 4 | * [writing benchmarks](./writing_benchmarks) 5 | * [inspecting assembly](./inspecting_assembly) 6 | * [basic profiling](./profiling) 7 | * [branch cost](./branch) 8 | * [call cost](./call) 9 | * [dispatch call cost](./dispatch) 10 | * [unintentional copy](./unintentional_copy) 11 | * [pointer cost](./pointers) 12 | * [memory cost](./memory) 13 | * [bounds check cost](./bounds_checks) 14 | * [unrolling](./unrolling) 15 | * [loop alignment](./loop_alignment) 16 | * [heap vs stack allocation)(./heap_alloc) 17 | 18 | ## Additional Videos / Articles 19 | 20 | * Intuitive Performance https://www.youtube.com/watch?v=51ZIFNqgCkA 21 | * https://egonelbre.com/a-tale-of-bfs/ 22 | * https://egonelbre.com/a-tale-of-bfs-going-parallel/ 23 | 24 | ## Recommended 25 | 26 | * https://github.com/dgryski/go-perfbook 27 | * https://en.algorithmica.org/hpc/ 28 | * https://www.dataorienteddesign.com/dodbook/ 29 | * https://www.computerenhance.com/ 30 | 31 | ## Tools 32 | 33 | * Benchmark statistical analysis (https://golang.org/x/perf/cmd/benchstat) 34 | * Assembly and Code viewer (https://github.com/loov/lensm) 35 | * Visualizing bounds checks (https://github.com/loov/view-annotated-file) 36 | * AMD μProf (https://www.amd.com/en/developer/uprof.html) 37 | * Intel VTune Profiler (https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html) 38 | * Apple Instruments (https://help.apple.com/instruments/mac/current/#) -------------------------------------------------------------------------------- /call/example_test.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "math/rand" 5 | "testing" 6 | ) 7 | 8 | var sink int 9 | 10 | func Nop() int { 11 | return 0 12 | } 13 | 14 | func BenchmarkInlined(b *testing.B) { 15 | for i := 0; i < b.N; i++ { 16 | sink += Nop() 17 | } 18 | } 19 | 20 | //go:noinline 21 | func Nop2() int { 22 | return 0 23 | } 24 | 25 | func BenchmarkNotInlined(b *testing.B) { 26 | for i := 0; i < b.N; i++ { 27 | sink += Nop2() 28 | } 29 | } 30 | 31 | //go:noinline 32 | func createANop() func() int { return Nop2 } 33 | 34 | func BenchmarkFuncCall(b *testing.B) { 35 | nop := createANop() 36 | for i := 0; i < b.N; i++ { 37 | sink += nop() 38 | } 39 | } 40 | 41 | type Noper interface { 42 | Nop() int 43 | } 44 | 45 | type nop struct{} 46 | 47 | func (nop) Nop() int { return 0 } 48 | 49 | type nop2 struct{} 50 | 51 | func (nop2) Nop() int { return 1 } 52 | 53 | func BenchmarkDevirtualizedInterfaceCall(b *testing.B) { 54 | var nop Noper = nop{} 55 | for i := 0; i < b.N; i++ { 56 | sink += nop.Nop() 57 | } 58 | } 59 | 60 | func oneof() Noper { 61 | if rand.Intn(2) == 0 { 62 | return nop{} 63 | } else { 64 | return nop2{} 65 | } 66 | } 67 | 68 | func BenchmarkInterfaceCall(b *testing.B) { 69 | var nop Noper = oneof() 70 | b.ResetTimer() 71 | for i := 0; i < b.N; i++ { 72 | sink += nop.Nop() 73 | } 74 | } 75 | 76 | func BenchmarkInterfaceCall2(b *testing.B) { 77 | var nop Noper = oneof() 78 | b.ResetTimer() 79 | fn := nop.Nop 80 | for i := 0; i < b.N; i++ { 81 | sink += fn() 82 | } 83 | } 84 | 85 | func BenchmarkC(b *testing.B) { 86 | for i := 0; i < b.N; i++ { 87 | sink += CNop() 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /branch/example_test.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "math/rand" 5 | "slices" 6 | "sort" 7 | "testing" 8 | ) 9 | 10 | // More details at https://igoro.com/archive/fast-and-slow-if-statements-branch-prediction-in-modern-processors/ 11 | 12 | var unsorted []int 13 | var sorted []int 14 | var half []int 15 | 16 | func init() { 17 | unsorted = make([]int, 10000) 18 | for i := range unsorted { 19 | unsorted[i] = rand.Intn(100) 20 | } 21 | 22 | sorted = slices.Clone(unsorted) 23 | sort.Ints(sorted) 24 | 25 | half = make([]int, len(unsorted)) 26 | for i := range half { 27 | if i%2 == 0 { 28 | half[i] = 0 29 | } else { 30 | half[i] = 100 31 | } 32 | } 33 | } 34 | 35 | //go:noinline 36 | func DiffLimit(vs []int, limit int) int { 37 | above := 0 38 | below := 0 39 | for _, v := range vs { 40 | if v > limit { 41 | above += v 42 | } else { 43 | below += v 44 | } 45 | } 46 | return above - below 47 | } 48 | 49 | func BenchmarkUnsorted(b *testing.B) { 50 | for i := 0; i < b.N; i++ { 51 | DiffLimit(unsorted, 50) 52 | } 53 | } 54 | 55 | func BenchmarkSorted(b *testing.B) { 56 | for i := 0; i < b.N; i++ { 57 | DiffLimit(sorted, 50) 58 | } 59 | } 60 | 61 | func BenchmarkHalf(b *testing.B) { 62 | for i := 0; i < b.N; i++ { 63 | DiffLimit(half, 50) 64 | } 65 | } 66 | 67 | //go:noinline 68 | func DiffLimitCMOV(vs []int, limit int) int { 69 | above := 0 70 | below := 0 71 | for _, v := range vs { 72 | if v > limit { 73 | above += v 74 | } 75 | if v < limit { 76 | below += v 77 | } 78 | } 79 | return above - below 80 | } 81 | 82 | func BenchmarkUnsortedCMOV(b *testing.B) { 83 | for i := 0; i < b.N; i++ { 84 | DiffLimitCMOV(unsorted, 50) 85 | } 86 | } 87 | 88 | func BenchmarkSortedCMOV(b *testing.B) { 89 | for i := 0; i < b.N; i++ { 90 | DiffLimitCMOV(sorted, 50) 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /profiling/example_test.go: -------------------------------------------------------------------------------- 1 | package profiling 2 | 3 | import ( 4 | "fmt" 5 | "strconv" 6 | "strings" 7 | "testing" 8 | ) 9 | 10 | // go test -bench Label -benchmem 11 | // 12 | // go test -bench Label -cpuprofile cpu.prof -benchtime 5s 13 | // go test -bench Label -memprofile mem.prof -benchtime 5s 14 | // go test -bench Label -memprofile mem.prof -benchtime 5s 15 | // 16 | // On Non-Windows: 17 | // go tool pprof cpu.prof 18 | // go tool pprof mem.prof 19 | // 20 | // go tool pprof -lines cpu.prof 21 | // 22 | // On Windows: 23 | // go tool pprof profiling.test.exe cpu.prof 24 | // go tool pprof profiling.test.exe mem.prof 25 | // 26 | // go tool pprof -lines profiling.test.exe cpu.prof 27 | // 28 | // Commands in pprof: 29 | // top 30 30 | // top 30 -cum 31 | // list Format 32 | // disasm Format 33 | // 34 | 35 | var sink string 36 | 37 | //go:noinline 38 | func Format(prefix string, count int, suffix string) string { 39 | return fmt.Sprintf("%v%v%v", prefix, count, suffix) 40 | } 41 | 42 | //go:noinline 43 | func Add(prefix string, count int, suffix string) string { 44 | return prefix + strconv.Itoa(count) + suffix 45 | } 46 | 47 | //go:noinline 48 | func Builder(prefix string, count int, suffix string) string { 49 | var b strings.Builder 50 | b.Grow(len(prefix) + 13 + len(suffix)) 51 | b.WriteString(prefix) 52 | 53 | var buffer [13]byte 54 | result := strconv.AppendInt(buffer[:], int64(count), 10) 55 | b.Write(result) 56 | 57 | b.WriteString(suffix) 58 | 59 | return b.String() 60 | } 61 | 62 | func BenchmarkLabel(b *testing.B) { 63 | b.Run("Format", func(b *testing.B) { 64 | for i := 0; i < b.N; i++ { 65 | sink = Format("Alpha", i, "Variant") 66 | } 67 | }) 68 | 69 | b.Run("Add", func(b *testing.B) { 70 | for i := 0; i < b.N; i++ { 71 | sink = Add("Alpha", i, "Variant") 72 | } 73 | }) 74 | 75 | b.Run("Builder", func(b *testing.B) { 76 | for i := 0; i < b.N; i++ { 77 | sink = Builder("Alpha", i, "Variant") 78 | } 79 | }) 80 | } 81 | -------------------------------------------------------------------------------- /bounds_checks/example_test.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "testing" 5 | "unsafe" 6 | ) 7 | 8 | // To disable bounds checks entirely: 9 | // 10 | // go test -gcflags=-B -bench . 11 | // 12 | // Viewing bounds checks 13 | // 14 | // go test -gcflags "all=-m -m -d=ssa/check_bce/debug" -bench . 2>analysis.log 15 | // go install github.com/loov/view-annotated-file@latest 16 | // view-annotated-file analysis.log 17 | 18 | var ( 19 | xs = make([]float32, 10000) 20 | ys = make([]float32, 10000) 21 | incx = 1 22 | incy = 1 23 | ) 24 | 25 | //go:noinline 26 | func Dot(xs []float32, incx int, ys []float32, incy int, n int) float32 { 27 | var r float32 28 | xi, yi := 0, 0 29 | for ; n > 0; n-- { 30 | r += xs[xi] * ys[yi] 31 | 32 | xi += incx 33 | yi += incy 34 | } 35 | return r 36 | } 37 | 38 | //go:noinline 39 | func DotUnsafe(xs []float32, incx int, ys []float32, incy int, n int) float32 { 40 | var r float32 41 | xi, yi := 0, 0 42 | for ; n > 0; n-- { 43 | r += *unsafeAt(xs, xi) * *unsafeAt(ys, yi) 44 | 45 | xi += incx 46 | yi += incy 47 | } 48 | return r 49 | } 50 | 51 | //go:noinline 52 | func DotPointers(xs []float32, incx int, ys []float32, incy int, n int) float32 { 53 | var r float32 54 | xp := unsafe.Pointer(unsafe.SliceData(xs)) 55 | yp := unsafe.Pointer(unsafe.SliceData(ys)) 56 | incxp, incyp := uintptr(incx*4), uintptr(incy*4) 57 | for ; n > 0; n-- { 58 | r += *(*float32)(xp) * *(*float32)(yp) 59 | xp = unsafe.Add(xp, incxp) 60 | yp = unsafe.Add(yp, incyp) 61 | } 62 | return r 63 | } 64 | 65 | var sink float32 66 | 67 | func BenchmarkDot(b *testing.B) { 68 | for i := 0; i < b.N; i++ { 69 | sink += Dot(xs, incx, ys, incy, len(xs)) 70 | } 71 | } 72 | 73 | func BenchmarkDotUnsafe(b *testing.B) { 74 | for i := 0; i < b.N; i++ { 75 | sink += DotUnsafe(xs, incx, ys, incy, len(xs)) 76 | } 77 | } 78 | 79 | func BenchmarkDotPointers(b *testing.B) { 80 | for i := 0; i < b.N; i++ { 81 | sink += DotPointers(xs, incx, ys, incy, len(xs)) 82 | } 83 | } 84 | 85 | func unsafeAt[T any](xs []T, index int) *T { 86 | return (*T)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(xs)), uintptr(index)*unsafe.Sizeof(xs[0]))) 87 | } 88 | -------------------------------------------------------------------------------- /unrolling/example_test.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | var ( 8 | xs = make([]float32, 10000) 9 | ys = make([]float32, 10000) 10 | incx = 1 11 | incy = 1 12 | ) 13 | 14 | //go:noinline 15 | func Dot(xs []float32, incx int, ys []float32, incy int, n int) float32 { 16 | var r float32 17 | xi, yi := 0, 0 18 | for ; n > 0; n-- { 19 | r += xs[xi] * ys[yi] 20 | 21 | xi += incx 22 | yi += incy 23 | } 24 | return r 25 | } 26 | 27 | //go:noinline 28 | func DotUnroll(xs []float32, incx int, ys []float32, incy int, n int) float32 { 29 | var r float32 30 | xi, yi := 0, 0 31 | for ; n >= 4; n -= 4 { 32 | r += xs[xi] * ys[yi] 33 | xi += incx 34 | yi += incy 35 | 36 | r += xs[xi] * ys[yi] 37 | xi += incx 38 | yi += incy 39 | 40 | r += xs[xi] * ys[yi] 41 | xi += incx 42 | yi += incy 43 | 44 | r += xs[xi] * ys[yi] 45 | xi += incx 46 | yi += incy 47 | } 48 | for ; n > 0; n-- { 49 | r += xs[xi] * ys[yi] 50 | 51 | xi += incx 52 | yi += incy 53 | } 54 | return r 55 | } 56 | 57 | //go:noinline 58 | func DotPipeline(xs []float32, incx int, ys []float32, incy int, n int) float32 { 59 | var r1, r2, r3, r4 float32 60 | xi, yi := 0, 0 61 | for ; n >= 4; n -= 4 { 62 | r1 += xs[xi] * ys[yi] 63 | xi += incx 64 | yi += incy 65 | 66 | r2 += xs[xi] * ys[yi] 67 | xi += incx 68 | yi += incy 69 | 70 | r3 += xs[xi] * ys[yi] 71 | xi += incx 72 | yi += incy 73 | 74 | r4 += xs[xi] * ys[yi] 75 | xi += incx 76 | yi += incy 77 | } 78 | for ; n > 0; n-- { 79 | r1 += xs[xi] * ys[yi] 80 | 81 | xi += incx 82 | yi += incy 83 | } 84 | return r1 + r2 + r3 + r4 85 | } 86 | 87 | var sink float32 88 | 89 | func BenchmarkDot(b *testing.B) { 90 | for i := 0; i < b.N; i++ { 91 | sink += Dot(xs, incx, ys, incy, len(xs)) 92 | } 93 | } 94 | 95 | func BenchmarkDotUnroll(b *testing.B) { 96 | for i := 0; i < b.N; i++ { 97 | sink += DotUnroll(xs, incx, ys, incy, len(xs)) 98 | } 99 | } 100 | 101 | func BenchmarkDotPipeline(b *testing.B) { 102 | for i := 0; i < b.N; i++ { 103 | sink += DotPipeline(xs, incx, ys, incy, len(xs)) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /dispatch/example_test.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | "slices" 7 | "sort" 8 | "testing" 9 | "unsafe" 10 | ) 11 | 12 | var unsorted []Shape 13 | var sorted []Shape 14 | var nointerface []ShapeStruct 15 | 16 | func init() { 17 | unsorted = make([]Shape, 1e4) 18 | nointerface = make([]ShapeStruct, len(unsorted)) 19 | 20 | for i := range unsorted { 21 | if rand.Intn(2) == 0 { 22 | unsorted[i] = Circle{rand.Float32()} 23 | nointerface[i] = ShapeStruct{CircleKind, rand.Float32()} 24 | } else { 25 | unsorted[i] = Square{rand.Float32()} 26 | nointerface[i] = ShapeStruct{SquareKind, rand.Float32()} 27 | } 28 | } 29 | 30 | sorted = slices.Clone(unsorted) 31 | type iface struct { 32 | itab uintptr 33 | data unsafe.Pointer 34 | } 35 | sort.Slice(sorted, func(i, k int) bool { 36 | a := (*iface)(unsafe.Pointer(&sorted[i])).itab 37 | b := (*iface)(unsafe.Pointer(&sorted[k])).itab 38 | return a < b 39 | }) 40 | } 41 | 42 | type Shape interface { 43 | Area() float32 44 | } 45 | 46 | type Circle struct{ Radius float32 } 47 | type Square struct{ Side float32 } 48 | 49 | func (s Circle) Area() float32 { 50 | return math.Pi * s.Radius * s.Radius 51 | } 52 | func (s Square) Area() float32 { 53 | return s.Side * s.Side 54 | } 55 | 56 | func TotalArea(shapes []Shape) (total float32) { 57 | for _, shape := range shapes { 58 | total += shape.Area() 59 | } 60 | return total 61 | } 62 | 63 | func BenchmarkUnsorted(b *testing.B) { 64 | total := float32(0) 65 | for k := 0; k < b.N; k++ { 66 | total += TotalArea(unsorted) 67 | } 68 | } 69 | 70 | func BenchmarkSorted(b *testing.B) { 71 | total := float32(0) 72 | for k := 0; k < b.N; k++ { 73 | total += TotalArea(sorted) 74 | } 75 | } 76 | 77 | type ShapeKind byte 78 | 79 | const ( 80 | CircleKind = ShapeKind(0) 81 | SquareKind = ShapeKind(1) 82 | ) 83 | 84 | type ShapeStruct struct { 85 | Kind ShapeKind 86 | Dim float32 87 | } 88 | 89 | func (s ShapeStruct) Area() float32 { 90 | switch s.Kind { 91 | case CircleKind: 92 | return math.Pi * s.Dim * s.Dim 93 | case SquareKind: 94 | return s.Dim * s.Dim 95 | } 96 | return 0 97 | } 98 | 99 | func BenchmarkNoInterface(b *testing.B) { 100 | total := float32(0) 101 | for k := 0; k < b.N; k++ { 102 | for _, shape := range nointerface { 103 | total += shape.Area() 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /loop_alignment/example_test.go: -------------------------------------------------------------------------------- 1 | package example 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | var ( 8 | xs = make([]float32, 10000) 9 | ys = make([]float32, 10000) 10 | incx = 1 11 | incy = 1 12 | ) 13 | 14 | //go:noinline 15 | func Dot(xs []float32, incx int, ys []float32, incy int, n int) float32 { 16 | var r float32 17 | xi, yi := 0, 0 18 | for ; n > 0; n-- { 19 | r += xs[xi] * ys[yi] 20 | 21 | xi += incx 22 | yi += incy 23 | } 24 | return r 25 | } 26 | 27 | //go:noinline 28 | func Dot1(xs []float32, incx int, ys []float32, incy int, n int) float32 { 29 | var r float32 30 | xi, yi := 0, 0 31 | for ; n > 0; n-- { 32 | r += xs[xi] * ys[yi] 33 | 34 | xi += incx 35 | yi += incy 36 | } 37 | return r 38 | } 39 | 40 | //go:noinline 41 | func Dot2(xs []float32, incx int, ys []float32, incy int, n int) float32 { 42 | var r float32 43 | xi, yi := 0, 0 44 | for ; n > 0; n-- { 45 | r += xs[xi] * ys[yi] 46 | 47 | xi += incx 48 | yi += incy 49 | } 50 | return r 51 | } 52 | 53 | //go:noinline 54 | func Dot3(xs []float32, incx int, ys []float32, incy int, n int) float32 { 55 | var r float32 56 | xi, yi := 0, 0 57 | for ; n > 0; n-- { 58 | r += xs[xi] * ys[yi] 59 | 60 | xi += incx 61 | yi += incy 62 | } 63 | return r 64 | } 65 | 66 | //go:noinline 67 | func Dot4(xs []float32, incx int, ys []float32, incy int, n int) float32 { 68 | var r float32 69 | xi, yi := 0, 0 70 | for ; n > 0; n-- { 71 | r += xs[xi] * ys[yi] 72 | 73 | xi += incx 74 | yi += incy 75 | } 76 | return r 77 | } 78 | 79 | //go:noinline 80 | func Dot5(xs []float32, incx int, ys []float32, incy int, n int) float32 { 81 | var r float32 82 | xi, yi := 0, 0 83 | for ; n > 0; n-- { 84 | r += xs[xi] * ys[yi] 85 | 86 | xi += incx 87 | yi += incy 88 | } 89 | return r 90 | } 91 | 92 | //go:noinline 93 | func Dot6(xs []float32, incx int, ys []float32, incy int, n int) float32 { 94 | var r float32 95 | xi, yi := 0, 0 96 | for ; n > 0; n-- { 97 | r += xs[xi] * ys[yi] 98 | 99 | xi += incx 100 | yi += incy 101 | } 102 | return r 103 | } 104 | 105 | //go:noinline 106 | func Dot7(xs []float32, incx int, ys []float32, incy int, n int) float32 { 107 | var r float32 108 | xi, yi := 0, 0 109 | for ; n > 0; n-- { 110 | r += xs[xi] * ys[yi] 111 | 112 | xi += incx 113 | yi += incy 114 | } 115 | return r 116 | } 117 | 118 | //go:noinline 119 | func Dot8(xs []float32, incx int, ys []float32, incy int, n int) float32 { 120 | var r float32 121 | xi, yi := 0, 0 122 | for ; n > 0; n-- { 123 | r += xs[xi] * ys[yi] 124 | 125 | xi += incx 126 | yi += incy 127 | } 128 | return r 129 | } 130 | 131 | //go:noinline 132 | func Dot9(xs []float32, incx int, ys []float32, incy int, n int) float32 { 133 | var r float32 134 | xi, yi := 0, 0 135 | for ; n > 0; n-- { 136 | r += xs[xi] * ys[yi] 137 | 138 | xi += incx 139 | yi += incy 140 | } 141 | return r 142 | } 143 | 144 | var sink float32 145 | 146 | func BenchmarkDot1(b *testing.B) { 147 | for i := 0; i < b.N; i++ { 148 | sink += Dot1(xs, incx, ys, incy, len(xs)) 149 | } 150 | } 151 | 152 | func BenchmarkDot2(b *testing.B) { 153 | for i := 0; i < b.N; i++ { 154 | sink += Dot2(xs, incx, ys, incy, len(xs)) 155 | } 156 | } 157 | 158 | func BenchmarkDot3(b *testing.B) { 159 | for i := 0; i < b.N; i++ { 160 | sink += Dot3(xs, incx, ys, incy, len(xs)) 161 | } 162 | } 163 | 164 | func BenchmarkDot4(b *testing.B) { 165 | for i := 0; i < b.N; i++ { 166 | sink += Dot4(xs, incx, ys, incy, len(xs)) 167 | } 168 | } 169 | 170 | func BenchmarkDot5(b *testing.B) { 171 | for i := 0; i < b.N; i++ { 172 | sink += Dot5(xs, incx, ys, incy, len(xs)) 173 | } 174 | } 175 | 176 | func BenchmarkDot6(b *testing.B) { 177 | for i := 0; i < b.N; i++ { 178 | sink += Dot6(xs, incx, ys, incy, len(xs)) 179 | } 180 | } 181 | 182 | func BenchmarkDot7(b *testing.B) { 183 | for i := 0; i < b.N; i++ { 184 | sink += Dot7(xs, incx, ys, incy, len(xs)) 185 | } 186 | } 187 | 188 | func BenchmarkDot8(b *testing.B) { 189 | for i := 0; i < b.N; i++ { 190 | sink += Dot8(xs, incx, ys, incy, len(xs)) 191 | } 192 | } 193 | 194 | func BenchmarkDot9(b *testing.B) { 195 | for i := 0; i < b.N; i++ { 196 | sink += Dot9(xs, incx, ys, incy, len(xs)) 197 | } 198 | } 199 | --------------------------------------------------------------------------------