├── go.sum ├── go.mod ├── constants ├── constants_386.go ├── constants_amd64.go ├── constants_arm.go ├── constants_arm64.go ├── constants_mips.go ├── constants_mips64.go ├── constants_mipsle.go ├── constants_s390x.go ├── constants_wasm.go ├── constants_mips64le.go ├── constants_riscv64.go └── constants_ppc64x.go ├── asm_arm.s ├── asm_arm64.s ├── asm_mips.s ├── asm_mips64.s ├── benchmarks ├── constants.go ├── throughput_test.go └── general_test.go ├── asm_386.s ├── asm_amd64.s ├── LICENSE.md ├── examples └── main.go ├── pool_func.go ├── pool.go ├── README.md └── lib_runtime_linkage.go /go.sum: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/alphadose/itogami 2 | 3 | go 1.19 4 | -------------------------------------------------------------------------------- /constants/constants_386.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const CacheLinePadSize = 64 4 | -------------------------------------------------------------------------------- /constants/constants_amd64.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const CacheLinePadSize = 64 4 | -------------------------------------------------------------------------------- /constants/constants_arm.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const CacheLinePadSize = 32 4 | -------------------------------------------------------------------------------- /constants/constants_arm64.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const CacheLinePadSize = 64 4 | 
-------------------------------------------------------------------------------- /constants/constants_mips.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const CacheLinePadSize = 32 4 | -------------------------------------------------------------------------------- /constants/constants_mips64.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const CacheLinePadSize = 32 4 | -------------------------------------------------------------------------------- /constants/constants_mipsle.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const CacheLinePadSize = 32 4 | -------------------------------------------------------------------------------- /constants/constants_s390x.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const CacheLinePadSize = 256 4 | -------------------------------------------------------------------------------- /constants/constants_wasm.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const CacheLinePadSize = 64 4 | -------------------------------------------------------------------------------- /constants/constants_mips64le.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const CacheLinePadSize = 32 4 | -------------------------------------------------------------------------------- /constants/constants_riscv64.go: -------------------------------------------------------------------------------- 1 | package constants 2 | 3 | const CacheLinePadSize = 32 4 | -------------------------------------------------------------------------------- /constants/constants_ppc64x.go: -------------------------------------------------------------------------------- 1 | //go:build 
ppc64 || ppc64le 2 | 3 | package constants 4 | 5 | const CacheLinePadSize = 128 6 | -------------------------------------------------------------------------------- /asm_arm.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | #include "go_asm.h" 3 | 4 | #define get_tls(r) MOVD g, r 5 | 6 | TEXT ·GetG(SB),NOSPLIT,$0-4 7 | get_tls(R1) 8 | MOVD R1, gp+0(FP) 9 | RET 10 | -------------------------------------------------------------------------------- /asm_arm64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | #include "go_asm.h" 3 | 4 | #define get_tls(r) MOVD g, r 5 | 6 | TEXT ·GetG(SB),NOSPLIT,$0-8 7 | get_tls(R1) 8 | MOVD R1, gp+0(FP) 9 | RET 10 | -------------------------------------------------------------------------------- /asm_mips.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | #include "go_asm.h" 3 | 4 | #define get_tls(r) MOVD g, r 5 | 6 | TEXT ·GetG(SB),NOSPLIT,$0-4 7 | get_tls(R1) 8 | MOVD R1, gp+0(FP) 9 | RET 10 | -------------------------------------------------------------------------------- /asm_mips64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | #include "go_asm.h" 3 | 4 | #define get_tls(r) MOVD g, r 5 | 6 | TEXT ·GetG(SB),NOSPLIT,$0-8 7 | get_tls(R1) 8 | MOVD R1, gp+0(FP) 9 | RET 10 | -------------------------------------------------------------------------------- /benchmarks/constants.go: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | import "time" 4 | 5 | const ( 6 | RunTimes = 1e6 7 | BenchParam = 10 8 | PoolSize = 5e4 9 | DefaultExpiredTime = 10 * time.Second 10 | ) 11 | -------------------------------------------------------------------------------- /asm_386.s: 
-------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | #include "go_asm.h" 3 | 4 | #define get_tls(r) MOVL TLS, r 5 | #define g(r) 0(r)(TLS*1) 6 | 7 | TEXT ·GetG(SB),NOSPLIT,$0-4 8 | get_tls(CX) 9 | MOVL g(CX), AX 10 | MOVL AX, gp+0(FP) 11 | RET 12 | -------------------------------------------------------------------------------- /asm_amd64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | #include "go_asm.h" 3 | 4 | #define get_tls(r) MOVQ TLS, r 5 | #define g(r) 0(r)(TLS*1) 6 | 7 | TEXT ·GetG(SB),NOSPLIT,$0-8 8 | get_tls(CX) 9 | MOVQ g(CX), AX 10 | MOVQ AX, gp+0(FP) 11 | RET 12 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Anish Mukherjee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /benchmarks/throughput_test.go: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | import ( 4 | "sync" 5 | "testing" 6 | "time" 7 | 8 | "github.com/alphadose/itogami" 9 | ) 10 | 11 | var wg1, wg2, wg3 sync.WaitGroup 12 | 13 | const sleepDuration uint8 = 10 14 | 15 | func antsFunc(args any) { 16 | time.Sleep(time.Duration(args.(uint8)) * time.Millisecond) 17 | wg2.Done() 18 | } 19 | 20 | func itoFunc(args uint8) { 21 | time.Sleep(time.Duration(args) * time.Millisecond) 22 | wg3.Done() 23 | } 24 | 25 | // func BenchmarkAntsPooWithFunc(b *testing.B) { 26 | // p, _ := ants.NewPoolWithFunc(PoolSize, antsFunc, ants.WithExpiryDuration(DefaultExpiredTime)) 27 | // defer p.Release() 28 | 29 | // b.ResetTimer() 30 | // b.StartTimer() 31 | // for i := 0; i < b.N; i++ { 32 | // wg2.Add(RunTimes) 33 | // for j := 0; j < RunTimes; j++ { 34 | // p.Invoke(sleepDuration) 35 | // } 36 | // wg2.Wait() 37 | // } 38 | // b.StopTimer() 39 | // } 40 | 41 | func BenchmarkItogamiPoolWithFunc(b *testing.B) { 42 | p := itogami.NewPoolWithFunc(PoolSize, itoFunc) 43 | 44 | b.ResetTimer() 45 | b.StartTimer() 46 | for i := 0; i < b.N; i++ { 47 | wg3.Add(RunTimes) 48 | for j := 0; j < RunTimes; j++ { 49 | p.Invoke(sleepDuration) 50 | } 51 | wg3.Wait() 52 | } 53 | b.StopTimer() 54 | } 55 | -------------------------------------------------------------------------------- /examples/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | "sync/atomic" 7 | "time" 8 | 9 | 
"github.com/alphadose/itogami" 10 | ) 11 | 12 | const runTimes uint32 = 1000 13 | 14 | var sum uint32 15 | 16 | func myFunc(i uint32) { 17 | atomic.AddUint32(&sum, i) 18 | fmt.Printf("run with %d\n", i) 19 | } 20 | 21 | func demoFunc() { 22 | time.Sleep(10 * time.Millisecond) 23 | println("Hello World") 24 | } 25 | 26 | func examplePool() { 27 | var wg sync.WaitGroup 28 | // Use the common pool 29 | pool := itogami.NewPool(10) 30 | 31 | syncCalculateSum := func() { 32 | demoFunc() 33 | wg.Done() 34 | } 35 | for i := uint32(0); i < runTimes; i++ { 36 | wg.Add(1) 37 | // Submit task to the pool 38 | pool.Submit(syncCalculateSum) 39 | } 40 | wg.Wait() 41 | println("finished all tasks") 42 | } 43 | 44 | func examplePoolWithFunc() { 45 | var wg sync.WaitGroup 46 | // Use the pool with a pre-defined function 47 | pool := itogami.NewPoolWithFunc(10, func(i uint32) { 48 | myFunc(i) 49 | wg.Done() 50 | }) 51 | for i := uint32(0); i < runTimes; i++ { 52 | wg.Add(1) 53 | // Invoke the function with a value 54 | pool.Invoke(i) 55 | } 56 | wg.Wait() 57 | fmt.Printf("finish all tasks, result is %d\n", sum) 58 | } 59 | 60 | func main() { 61 | examplePool() 62 | examplePoolWithFunc() 63 | } 64 | -------------------------------------------------------------------------------- /benchmarks/general_test.go: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | import ( 4 | "sync" 5 | "testing" 6 | "time" 7 | 8 | "github.com/alphadose/itogami" 9 | ) 10 | 11 | func demoFunc() { 12 | time.Sleep(time.Duration(BenchParam) * time.Millisecond) 13 | } 14 | 15 | func BenchmarkUnlimitedGoroutines(b *testing.B) { 16 | var wg sync.WaitGroup 17 | 18 | b.ResetTimer() 19 | b.StartTimer() 20 | for i := 0; i < b.N; i++ { 21 | wg.Add(RunTimes) 22 | for j := 0; j < RunTimes; j++ { 23 | go func() { 24 | demoFunc() 25 | wg.Done() 26 | }() 27 | } 28 | wg.Wait() 29 | } 30 | b.StopTimer() 31 | } 32 | 33 | func BenchmarkItogamiPool(b *testing.B) { 34 | var 
wg sync.WaitGroup 35 | p := itogami.NewPool(PoolSize) 36 | 37 | b.ResetTimer() 38 | b.StartTimer() 39 | for i := 0; i < b.N; i++ { 40 | wg.Add(RunTimes) 41 | for j := 0; j < RunTimes; j++ { 42 | p.Submit(func() { 43 | demoFunc() 44 | wg.Done() 45 | }) 46 | } 47 | wg.Wait() 48 | } 49 | b.StopTimer() 50 | } 51 | 52 | // func BenchmarkErrGroup(b *testing.B) { 53 | // var wg sync.WaitGroup 54 | // var pool errgroup.Group 55 | // pool.SetLimit(PoolSize) 56 | 57 | // b.ResetTimer() 58 | // b.StartTimer() 59 | // for i := 0; i < b.N; i++ { 60 | // wg.Add(RunTimes) 61 | // for j := 0; j < RunTimes; j++ { 62 | // pool.Go(func() error { 63 | // demoFunc() 64 | // wg.Done() 65 | // return nil 66 | // }) 67 | // } 68 | // wg.Wait() 69 | // } 70 | // b.StopTimer() 71 | // } 72 | 73 | // func BenchmarkAntsPool(b *testing.B) { 74 | // var wg sync.WaitGroup 75 | // p, _ := ants.NewPool(PoolSize, ants.WithExpiryDuration(DefaultExpiredTime)) 76 | // defer p.Release() 77 | 78 | // b.ResetTimer() 79 | // b.StartTimer() 80 | // for i := 0; i < b.N; i++ { 81 | // wg.Add(RunTimes) 82 | // for j := 0; j < RunTimes; j++ { 83 | // p.Submit(func() { 84 | // demoFunc() 85 | // wg.Done() 86 | // }) 87 | // } 88 | // wg.Wait() 89 | // } 90 | // b.StopTimer() 91 | // } 92 | 93 | // func BenchmarkGammaZeroPool(b *testing.B) { 94 | // var wg sync.WaitGroup 95 | // p := workerpool.New(PoolSize) 96 | 97 | // b.ResetTimer() 98 | // b.StartTimer() 99 | // for i := 0; i < b.N; i++ { 100 | // wg.Add(RunTimes) 101 | // for j := 0; j < RunTimes; j++ { 102 | // p.Submit(func() { 103 | // demoFunc() 104 | // wg.Done() 105 | // }) 106 | // } 107 | // wg.Wait() 108 | // } 109 | // b.StopTimer() 110 | // } 111 | -------------------------------------------------------------------------------- /pool_func.go: -------------------------------------------------------------------------------- 1 | package itogami 2 | 3 | import ( 4 | "sync" 5 | "sync/atomic" 6 | "unsafe" 7 | ) 8 | 9 | type ( 10 | // a single slot for 
a worker in PoolWithFunc 11 | slotFunc[T any] struct { 12 | threadPtr unsafe.Pointer 13 | data T 14 | } 15 | 16 | // PoolWithFunc is used for spawning workers for a single pre-defined function with myriad inputs 17 | // useful for throughput bound cases 18 | // has lower memory usage and allocs per op than the default Pool 19 | // 20 | // ( type -> func(T) {} ) where T is a generic parameter 21 | PoolWithFunc[T any] struct { 22 | currSize uint64 23 | _p1 [cacheLinePadSize - unsafe.Sizeof(uint64(0))]byte 24 | maxSize uint64 25 | alloc func() any 26 | free func(any) 27 | task func(T) 28 | _p2 [cacheLinePadSize - unsafe.Sizeof(uint64(0)) - 3*unsafe.Sizeof(func() {})]byte 29 | top atomic.Pointer[dataItem[T]] 30 | _p3 [cacheLinePadSize - unsafe.Sizeof(atomic.Pointer[dataItem[T]]{})]byte 31 | } 32 | ) 33 | 34 | // NewPoolWithFunc returns a new PoolWithFunc 35 | func NewPoolWithFunc[T any](size uint64, task func(T)) *PoolWithFunc[T] { 36 | dataPool := sync.Pool{New: func() any { return new(dataItem[T]) }} 37 | return &PoolWithFunc[T]{maxSize: size, task: task, alloc: dataPool.Get, free: dataPool.Put} 38 | } 39 | 40 | // Invoke invokes the pre-defined method in PoolWithFunc by assigning the data to an already existing worker 41 | // or spawning a new worker given queue size is in limits 42 | func (self *PoolWithFunc[T]) Invoke(value T) { 43 | var s *slotFunc[T] 44 | for { 45 | if s = self.pop(); s != nil { 46 | s.data = value 47 | safe_ready(s.threadPtr) 48 | return 49 | } else if atomic.AddUint64(&self.currSize, 1) <= self.maxSize { 50 | s = &slotFunc[T]{data: value} 51 | go self.loopQ(s) 52 | return 53 | } else { 54 | atomic.AddUint64(&self.currSize, uint64SubtractionConstant) 55 | mcall(gosched_m) 56 | } 57 | } 58 | } 59 | 60 | // represents the infinite loop for a worker goroutine 61 | func (self *PoolWithFunc[T]) loopQ(d *slotFunc[T]) { 62 | d.threadPtr = GetG() 63 | for { 64 | self.task(d.data) 65 | self.push(d) 66 | mcall(fast_park) 67 | } 68 | } 69 | 70 | // Stack 
implementation below for storing goroutine references 71 | 72 | // a single node in the stack 73 | type dataItem[T any] struct { 74 | next atomic.Pointer[dataItem[T]] 75 | value *slotFunc[T] 76 | } 77 | 78 | // pop pops value from the top of the stack 79 | func (self *PoolWithFunc[T]) pop() (value *slotFunc[T]) { 80 | var top, next *dataItem[T] 81 | for { 82 | top = self.top.Load() 83 | if top == nil { 84 | return 85 | } 86 | next = top.next.Load() 87 | if self.top.CompareAndSwap(top, next) { 88 | value = top.value 89 | top.value = nil 90 | top.next.Store(nil) 91 | self.free(top) 92 | return 93 | } 94 | } 95 | } 96 | 97 | // push pushes a value on top of the stack 98 | func (self *PoolWithFunc[T]) push(v *slotFunc[T]) { 99 | var ( 100 | top *dataItem[T] 101 | item = self.alloc().(*dataItem[T]) 102 | ) 103 | item.value = v 104 | for { 105 | top = self.top.Load() 106 | item.next.Store(top) 107 | if self.top.CompareAndSwap(top, item) { 108 | return 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /pool.go: -------------------------------------------------------------------------------- 1 | package itogami 2 | 3 | import ( 4 | "sync" 5 | "sync/atomic" 6 | "unsafe" 7 | ) 8 | 9 | // a single slot for a worker in Pool 10 | type slot struct { 11 | threadPtr unsafe.Pointer 12 | task func() 13 | } 14 | 15 | // Pool represents the thread-pool for performing any kind of task ( type -> func() {} ) 16 | type Pool struct { 17 | currSize uint64 18 | _p1 [cacheLinePadSize - unsafe.Sizeof(uint64(0))]byte 19 | maxSize uint64 20 | _p2 [cacheLinePadSize - unsafe.Sizeof(uint64(0))]byte 21 | // using a stack keeps cpu caches warm based on FILO property 22 | top atomic.Pointer[node] 23 | _p3 [cacheLinePadSize - unsafe.Sizeof(atomic.Pointer[node]{})]byte 24 | } 25 | 26 | // NewPool returns a new thread pool 27 | func NewPool(size uint64) *Pool { 28 | return &Pool{maxSize: size} 29 | } 30 | 31 | // Submit submits a new task to 
the pool 32 | // it first tries to use already parked goroutines from the stack if any 33 | // if there are no available worker goroutines, it tries to add a 34 | // new goroutine to the pool if the pool capacity is not exceeded 35 | // in case the pool capacity hit its maximum limit, this function yields the processor to other 36 | // goroutines and loops again for finding available workers 37 | func (self *Pool) Submit(task func()) { 38 | var s *slot 39 | for { 40 | if s = self.pop(); s != nil { 41 | s.task = task 42 | safe_ready(s.threadPtr) 43 | return 44 | } else if atomic.AddUint64(&self.currSize, 1) <= self.maxSize { 45 | s = &slot{task: task} 46 | go self.loopQ(s) 47 | return 48 | } else { 49 | atomic.AddUint64(&self.currSize, uint64SubtractionConstant) 50 | mcall(gosched_m) 51 | } 52 | } 53 | } 54 | 55 | // loopQ is the looping function for every worker goroutine 56 | func (self *Pool) loopQ(s *slot) { 57 | // store self goroutine pointer 58 | s.threadPtr = GetG() 59 | for { 60 | // exec task 61 | s.task() 62 | // notify availability by pushing self reference into stack 63 | self.push(s) 64 | // park and wait for call 65 | mcall(fast_park) 66 | } 67 | } 68 | 69 | // global memory pool for all items used in Pool 70 | var ( 71 | itemPool = sync.Pool{New: func() any { return new(node) }} 72 | itemAlloc = itemPool.Get 73 | itemFree = itemPool.Put 74 | ) 75 | 76 | // internal lock-free stack implementation for parking and waking up goroutines 77 | // Credits -> https://github.com/golang-design/lockfree 78 | 79 | // a single node in this stack 80 | type node struct { 81 | next atomic.Pointer[node] 82 | value *slot 83 | } 84 | 85 | // pop pops value from the top of the stack 86 | func (self *Pool) pop() (value *slot) { 87 | var top, next *node 88 | for { 89 | top = self.top.Load() 90 | if top == nil { 91 | return 92 | } 93 | next = top.next.Load() 94 | if self.top.CompareAndSwap(top, next) { 95 | value = top.value 96 | top.value = nil 97 | top.next.Store(nil) 98 
| itemFree(top) 99 | return 100 | } 101 | } 102 | } 103 | 104 | // push pushes a value on top of the stack 105 | func (self *Pool) push(v *slot) { 106 | var ( 107 | top *node 108 | item = itemAlloc().(*node) 109 | ) 110 | item.value = v 111 | for { 112 | top = self.top.Load() 113 | item.next.Store(top) 114 | if self.top.CompareAndSwap(top, item) { 115 | return 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Itogami 2 | 3 | > An experimental goroutine pool implemented using a lock-free stack 4 | 5 | By limiting concurrency with a fixed pool size and recycling goroutines using a stack, itogami saves a lot of memory compared to using unlimited goroutines while remaining just as fast. 6 | 7 | Benchmarks supporting the above claims are available [here](#benchmarks) 8 | 9 | **Note:- This work is experimental and should not be used in production** 10 | 11 | ## Installation 12 | 13 | You need Golang [1.19.x](https://go.dev/dl/) or above 14 | 15 | ```bash 16 | $ go get github.com/alphadose/itogami 17 | ``` 18 | 19 | ## Usage 20 | 21 | ```go 22 | package main 23 | 24 | import ( 25 | "fmt" 26 | "sync" 27 | "sync/atomic" 28 | "time" 29 | 30 | "github.com/alphadose/itogami" 31 | ) 32 | 33 | const runTimes uint32 = 1000 34 | 35 | var sum uint32 36 | 37 | func myFunc(i uint32) { 38 | atomic.AddUint32(&sum, i) 39 | fmt.Printf("run with %d\n", i) 40 | } 41 | 42 | func demoFunc() { 43 | time.Sleep(10 * time.Millisecond) 44 | println("Hello World") 45 | } 46 | 47 | func examplePool() { 48 | var wg sync.WaitGroup 49 | // Use the common pool 50 | pool := itogami.NewPool(10) 51 | 52 | syncCalculateSum := func() { 53 | demoFunc() 54 | wg.Done() 55 | } 56 | for i := uint32(0); i < runTimes; i++ { 57 | wg.Add(1) 58 | // Submit task to the pool 59 | pool.Submit(syncCalculateSum) 60 | } 61 | wg.Wait() 62 | println("finished all tasks") 63 | } 64
| 65 | func examplePoolWithFunc() { 66 | var wg sync.WaitGroup 67 | // Use the pool with a pre-defined function 68 | pool := itogami.NewPoolWithFunc(10, func(i uint32) { 69 | myFunc(i) 70 | wg.Done() 71 | }) 72 | for i := uint32(0); i < runTimes; i++ { 73 | wg.Add(1) 74 | // Invoke the function with a value 75 | pool.Invoke(i) 76 | } 77 | wg.Wait() 78 | fmt.Printf("finish all tasks, result is %d\n", sum) 79 | } 80 | 81 | func main() { 82 | examplePool() 83 | examplePoolWithFunc() 84 | } 85 | ``` 86 | 87 | ## Benchmarks 88 | 89 | Benchmarking was performed against:- 90 | 91 | 1. Unlimited goroutines 92 | 2. [Ants](https://github.com/panjf2000/ants) 93 | 3. [Gamma-Zero-Worker-Pool](https://github.com/gammazero/workerpool) 94 | 4. [golang.org/x/sync/errgroup](https://pkg.go.dev/golang.org/x/sync/errgroup) 95 | 5. [Bytedance GoPool](https://github.com/bytedance/gopkg/tree/develop/util/gopool) 96 | 97 | Pool size -> 50k 98 | 99 | CPU -> M1, arm64, 8 cores, 3.2 GHz 100 | 101 | OS -> darwin 102 | 103 | Results were computed from [benchstat](https://pkg.go.dev/golang.org/x/perf/cmd/benchstat) of 30 cases 104 | ``` 105 | name time/op 106 | UnlimitedGoroutines-8 331ms ± 4% 107 | ErrGroup-8 515ms ± 9% 108 | AntsPool-8 582ms ± 9% 109 | GammaZeroPool-8 740ms ±13% 110 | BytedanceGoPool-8 572ms ±18% 111 | ItogamiPool-8 337ms ± 1% 112 | 113 | name alloc/op 114 | UnlimitedGoroutines-8 96.3MB ± 0% 115 | ErrGroup-8 120MB ± 0% 116 | AntsPool-8 22.4MB ± 6% 117 | GammaZeroPool-8 18.8MB ± 1% 118 | BytedanceGoPool-8 82.2MB ± 2% 119 | ItogamiPool-8 25.6MB ± 2% 120 | 121 | name allocs/op 122 | UnlimitedGoroutines-8 2.00M ± 0% 123 | ErrGroup-8 3.00M ± 0% 124 | AntsPool-8 1.10M ± 2% 125 | GammaZeroPool-8 1.08M ± 0% 126 | BytedanceGoPool-8 2.59M ± 1% 127 | ItogamiPool-8 1.08M ± 0% 128 | ``` 129 | 130 | The following conclusions can be drawn from the above results:- 131 | 132 | 1. 
[Itogami](https://github.com/alphadose/itogami) is the fastest among all goroutine pool implementations and only slightly slower than unlimited goroutines 133 | 2. [Itogami](https://github.com/alphadose/itogami) has the fewest `allocs/op`, so its memory usage scales really well under high load 134 | 3. The memory used per operation is within the range of the other pools and drastically lower than with unlimited goroutines 135 | 4. The tolerance (± %) for [Itogami](https://github.com/alphadose/itogami) is quite low for all 3 metrics, indicating that the algorithm is quite stable overall 136 | 137 | Benchmarking code is available [here](https://github.com/alphadose/go-threadpool-benchmarks) 138 | -------------------------------------------------------------------------------- /lib_runtime_linkage.go: -------------------------------------------------------------------------------- 1 | package itogami 2 | 3 | import ( 4 | "runtime" 5 | "unsafe" 6 | _ "unsafe" 7 | 8 | "github.com/alphadose/itogami/constants" 9 | ) 10 | 11 | const ( 12 | cacheLinePadSize = constants.CacheLinePadSize 13 | uint64SubtractionConstant = ^uint64(0) 14 | ) 15 | 16 | type cacheLinePadding struct{ _ [cacheLinePadSize]byte } 17 | 18 | // Linking itogami with the Go internal runtime library to allow usage of scheduling primitives 19 | // like goready(), mcall() etc. for low-level scheduling of goroutines 20 | 21 | type mutex struct { 22 | // Futex-based impl treats it as uint32 key, 23 | // while sema-based impl as M* waitm. 24 | // Used to be a union, but unions break precise GC.
// mutex is the parameter type of the lock, unlock and goparkunlock declarations below
// (presumably mirroring the runtime's own mutex layout; confirm against the targeted Go version).
type mutex struct {
	// Futex-based impl treats it as uint32 key,
	// while sema-based impl as M* waitm.
	// Used to be a union, but unions break precise GC.
	key uintptr
}

// The functions below are used for scheduling goroutines with exclusive control
// Shifting to the below flow will remove the spinning and mutex lock implementations
//
// Most of them are body-less declarations bound to runtime or sync internals through
// go:linkname; their exact behavior is defined by the Go runtime, not by this package.

// lock is bound to runtime.lock.
//
//go:linkname lock runtime.lock
func lock(l *mutex)

// nanotime is bound to runtime.nanotime.
//
//go:linkname nanotime runtime.nanotime
func nanotime() int64

// unlock is bound to runtime.unlock.
//
//go:linkname unlock runtime.unlock
func unlock(l *mutex)

// goparkunlock is bound to runtime.goparkunlock.
//
//go:linkname goparkunlock runtime.goparkunlock
func goparkunlock(lock *mutex, reason waitReason, traceEv byte, traceskip int)

// GetG returns the pointer to the current goroutine
// defined in the asm files
func GetG() unsafe.Pointer

// Fastrand is bound to runtime.fastrand.
//
//go:linkname Fastrand runtime.fastrand
func Fastrand() uint32

// fastlog2 is bound to runtime.fastlog2.
//
//go:linkname fastlog2 runtime.fastlog2
func fastlog2(x float64) float64

// goready is bound to runtime.goready; safe_ready calls it to wake a parked worker.
//
//go:linkname goready runtime.goready
func goready(goroutinePtr unsafe.Pointer, traceskip int)

// gopark is bound to runtime.gopark.
//
//go:linkname gopark runtime.gopark
func gopark(unlockf func(unsafe.Pointer, unsafe.Pointer) bool, lock unsafe.Pointer, reason waitReason, traceEv byte, traceskip int)

// Active spinning runtime support.
// runtime_canSpin reports whether spinning makes sense at the moment.
//go:linkname runtime_canSpin sync.runtime_canSpin
func runtime_canSpin(i int) bool

// runtime_doSpin does active spinning.
// //go:linkname runtime_doSpin sync.runtime_doSpin
// func runtime_doSpin()

// runtime_doSpin is a local replacement for the commented-out linkname above:
// it calls spin (bound to runtime.procyield) with a fixed count of 30.
func runtime_doSpin() {
	spin(30)
}

// osyield is bound to runtime.osyield.
//
//go:linkname osyield runtime.osyield
func osyield()

// runtime_nanotime is bound to sync.runtime_nanotime.
//
//go:linkname runtime_nanotime sync.runtime_nanotime
func runtime_nanotime() int64

// Semacquire waits until *s > 0 and then atomically decrements it.
// It is intended as a simple sleep primitive for use by the synchronization
// library and should not be used directly.
//go:linkname runtime_Semacquire sync.runtime_Semacquire
func runtime_Semacquire(s *uint32)

// SemacquireMutex is like Semacquire, but for profiling contended Mutexes.
// If lifo is true, queue waiter at the head of wait queue.
// skipframes is the number of frames to omit during tracing, counting from
// runtime_SemacquireMutex's caller.
//go:linkname runtime_SemacquireMutex sync.runtime_SemacquireMutex
func runtime_SemacquireMutex(s *uint32, lifo bool, skipframes int)

// Semrelease atomically increments *s and notifies a waiting goroutine
// if one is blocked in Semacquire.
// It is intended as a simple wakeup primitive for use by the synchronization
// library and should not be used directly.
// If handoff is true, pass count directly to the first waiter.
// skipframes is the number of frames to omit during tracing, counting from
// runtime_Semrelease's caller.
//go:linkname runtime_Semrelease sync.runtime_Semrelease
func runtime_Semrelease(s *uint32, handoff bool, skipframes int)

// goyield is bound to runtime.goyield.
//
//go:linkname goyield runtime.goyield
func goyield()

// mcall is bound to runtime.mcall; the pools call it with gosched_m to yield
// and with fast_park to park a worker.
//
//go:linkname mcall runtime.mcall
func mcall(fn func(unsafe.Pointer))

// park_m is bound to runtime.park_m.
//
//go:linkname park_m runtime.park_m
func park_m(gp unsafe.Pointer)

// fastrandn is bound to runtime.fastrandn.
//
//go:linkname fastrandn runtime.fastrandn
func fastrandn(n uint32) uint32

// throw is bound to runtime.throw.
//
//go:linkname throw runtime.throw
func throw(s string)

// Readgstatus is bound to runtime.readgstatus; safe_ready polls it.
//
//go:linkname Readgstatus runtime.readgstatus
func Readgstatus(gp unsafe.Pointer) uint32

// casgstatus is bound to runtime.casgstatus; fast_park uses it to change the goroutine status.
//
//go:linkname casgstatus runtime.casgstatus
func casgstatus(gp unsafe.Pointer, oldval, newval uint32)

// dropg is bound to runtime.dropg.
//
//go:linkname dropg runtime.dropg
func dropg()

// schedule is bound to runtime.schedule.
//
//go:linkname schedule runtime.schedule
func schedule()

// mallocgc is bound to runtime.mallocgc.
//
//go:linkname mallocgc runtime.mallocgc
func mallocgc(size uintptr, typ unsafe.Pointer, needzero bool) unsafe.Pointer

// sysFree is bound to runtime.sysFree.
//
//go:linkname sysFree runtime.sysFree
func sysFree(v unsafe.Pointer, n uintptr, sysStat unsafe.Pointer)

// sysFreeOS is bound to runtime.sysFreeOS.
//
//go:linkname sysFreeOS runtime.sysFreeOS
func sysFreeOS(v unsafe.Pointer, n uintptr)

// gosched_m is bound to runtime.gosched_m; it is passed to mcall when a caller
// wants to yield the processor.
//
//go:linkname gosched_m runtime.gosched_m
func gosched_m(gp unsafe.Pointer)

// spin is bound to runtime.procyield.
//
//go:linkname spin runtime.procyield
func spin(cycles uint32)

// ProcPin is bound to runtime.procPin.
//
//go:linkname ProcPin runtime.procPin
func ProcPin() int

// ProcUnpin is bound to runtime.procUnpin.
//
//go:linkname ProcUnpin runtime.procUnpin
func ProcUnpin()

// custom parking function
//
// fast_park is passed to mcall by the worker loops. It calls dropg, switches gp's
// status from _Grunning to _Gwaiting and then calls schedule.
// NOTE(review): the statement order is significant to the runtime scheduler; do not
// reorder without checking the runtime sources of the targeted Go version.
func fast_park(gp unsafe.Pointer) {
	dropg()
	casgstatus(gp, _Grunning, _Gwaiting)
	schedule()
}

// whether the system has multiple cores or a single core
var multicore = runtime.NumCPU() > 1

// call ready after ensuring the goroutine is parked
//
// safe_ready yields through mcall(gosched_m) until gp's status, with the _Gscan bit
// masked out, equals _Gwaiting, and only then calls goready(gp, 1). The wait is needed
// because a worker pushes its slot onto the stack before it parks in fast_park.
func safe_ready(gp unsafe.Pointer) {
	for Readgstatus(gp)&^_Gscan != _Gwaiting {
		mcall(gosched_m)
	}
	goready(gp, 1)
}

// waitReason is the type of the reason arguments of gopark and goparkunlock.
type waitReason uint8

// waitReason values; the trailing comment on each line is the associated reason string.
const (
	waitReasonZero                  waitReason = iota // ""
	waitReasonGCAssistMarking                         // "GC assist marking"
	waitReasonIOWait                                  // "IO wait"
	waitReasonChanReceiveNilChan                      // "chan receive (nil chan)"
	waitReasonChanSendNilChan                         // "chan send (nil chan)"
	waitReasonDumpingHeap                             // "dumping heap"
	waitReasonGarbageCollection                       // "garbage collection"
	waitReasonGarbageCollectionScan                   // "garbage collection scan"
	waitReasonPanicWait                               // "panicwait"
	waitReasonSelect                                  // "select"
	waitReasonSelectNoCases                           // "select (no cases)"
	waitReasonGCAssistWait                            // "GC assist wait"
	waitReasonGCSweepWait                             // "GC sweep wait"
	waitReasonGCScavengeWait                          // "GC scavenge wait"
	waitReasonChanReceive                             // "chan receive"
	waitReasonChanSend                                // "chan send"
	waitReasonFinalizerWait                           // "finalizer wait"
	waitReasonForceGCIdle                             // "force gc (idle)"
	waitReasonSemacquire                              // "semacquire"
	waitReasonSleep                                   // "sleep"
	waitReasonSyncCondWait                            // "sync.Cond.Wait"
	waitReasonTimerGoroutineIdle                      // "timer goroutine (idle)"
	waitReasonTraceReaderBlocked                      // "trace reader (blocked)"
	waitReasonWaitForGCCycle                          // "wait for GC cycle"
	waitReasonGCWorkerIdle                            // "GC worker (idle)"
	waitReasonPreempted                               // "preempted"
	waitReasonDebugCall                               // "debug call"
)
(idle)" 188 | waitReasonSemacquire // "semacquire" 189 | waitReasonSleep // "sleep" 190 | waitReasonSyncCondWait // "sync.Cond.Wait" 191 | waitReasonTimerGoroutineIdle // "timer goroutine (idle)" 192 | waitReasonTraceReaderBlocked // "trace reader (blocked)" 193 | waitReasonWaitForGCCycle // "wait for GC cycle" 194 | waitReasonGCWorkerIdle // "GC worker (idle)" 195 | waitReasonPreempted // "preempted" 196 | waitReasonDebugCall // "debug call" 197 | ) 198 | 199 | // Event types in the trace; the arguments recorded with each event are given in square brackets. 200 | const ( 201 | traceEvNone = 0 // unused 202 | traceEvBatch = 1 // start of per-P batch of events [pid, timestamp] 203 | traceEvFrequency = 2 // contains tracer timer frequency [frequency (ticks per second)] 204 | traceEvStack = 3 // stack [stack id, number of PCs, array of {PC, func string ID, file string ID, line}] 205 | traceEvGomaxprocs = 4 // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id] 206 | traceEvProcStart = 5 // start of P [timestamp, thread id] 207 | traceEvProcStop = 6 // stop of P [timestamp] 208 | traceEvGCStart = 7 // GC start [timestamp, seq, stack id] 209 | traceEvGCDone = 8 // GC done [timestamp] 210 | traceEvGCSTWStart = 9 // GC STW start [timestamp, kind] 211 | traceEvGCSTWDone = 10 // GC STW done [timestamp] 212 | traceEvGCSweepStart = 11 // GC sweep start [timestamp, stack id] 213 | traceEvGCSweepDone = 12 // GC sweep done [timestamp, swept, reclaimed] 214 | traceEvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id] 215 | traceEvGoStart = 14 // goroutine starts running [timestamp, goroutine id, seq] 216 | traceEvGoEnd = 15 // goroutine ends [timestamp] 217 | traceEvGoStop = 16 // goroutine stops (like in select{}) [timestamp, stack] 218 | traceEvGoSched = 17 // goroutine calls Gosched [timestamp, stack] 219 | traceEvGoPreempt = 18 // goroutine is preempted [timestamp, stack] 220 | traceEvGoSleep = 19 // goroutine calls Sleep [timestamp, stack] 221 | traceEvGoBlock
= 20 // goroutine blocks [timestamp, stack] 222 | traceEvGoUnblock = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack] 223 | traceEvGoBlockSend = 22 // goroutine blocks on chan send [timestamp, stack] 224 | traceEvGoBlockRecv = 23 // goroutine blocks on chan recv [timestamp, stack] 225 | traceEvGoBlockSelect = 24 // goroutine blocks on select [timestamp, stack] 226 | traceEvGoBlockSync = 25 // goroutine blocks on Mutex/RWMutex [timestamp, stack] 227 | traceEvGoBlockCond = 26 // goroutine blocks on Cond [timestamp, stack] 228 | traceEvGoBlockNet = 27 // goroutine blocks on network [timestamp, stack] 229 | traceEvGoSysCall = 28 // syscall enter [timestamp, stack] 230 | traceEvGoSysExit = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp] 231 | traceEvGoSysBlock = 30 // syscall blocks [timestamp] 232 | traceEvGoWaiting = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id] 233 | traceEvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id] 234 | traceEvHeapAlloc = 33 // gcController.heapLive change [timestamp, heap_alloc] 235 | traceEvHeapGoal = 34 // gcController.heapGoal (formerly next_gc) change [timestamp, heap goal in bytes] 236 | traceEvTimerGoroutine = 35 // not currently used; previously denoted timer goroutine [timer goroutine id] 237 | traceEvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp] 238 | traceEvString = 37 // string dictionary entry [ID, length, string] 239 | traceEvGoStartLocal = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id] 240 | traceEvGoUnblockLocal = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack] 241 | traceEvGoSysExitLocal = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp] 242 | traceEvGoStartLabel = 41 // goroutine starts running with label [timestamp,
goroutine id, seq, label string id] 243 | traceEvGoBlockGC = 42 // goroutine blocks on GC assist [timestamp, stack] 244 | traceEvGCMarkAssistStart = 43 // GC mark assist start [timestamp, stack] 245 | traceEvGCMarkAssistDone = 44 // GC mark assist done [timestamp] 246 | traceEvUserTaskCreate = 45 // trace.NewContext [timestamp, internal task id, internal parent task id, stack, name string] 247 | traceEvUserTaskEnd = 46 // end of a task [timestamp, internal task id, stack] 248 | traceEvUserRegion = 47 // trace.WithRegion [timestamp, internal task id, mode(0:start, 1:end), stack, name string] 249 | traceEvUserLog = 48 // trace.Log [timestamp, internal task id, key string id, stack, value string] 250 | traceEvCount = 49 // number of event types: one past the highest ID above (48) 251 | // A byte is used, but only 6 bits of it are available for the event type. 252 | // The remaining 2 bits are used to specify the number of arguments. 253 | // That means, the max event type value is 63. 254 | ) 255 | 256 | // Goroutine (G) status values, plus the _Gscan flag and its combinations with them. 257 | const ( 258 | // G status 259 | // 260 | // Beyond indicating the general state of a G, the G status 261 | // acts like a lock on the goroutine's stack (and hence its 262 | // ability to execute user code). 263 | // 264 | // If you add to this list, add to the list 265 | // of "okay during garbage collection" status 266 | // in mgcmark.go too. 267 | // 268 | // TODO(austin): The _Gscan bit could be much lighter-weight. 269 | // For example, we could choose not to run _Gscanrunnable 270 | // goroutines found in the run queue, rather than CAS-looping 271 | // until they become _Grunnable. And transitions like 272 | // _Gscanwaiting -> _Gscanrunnable are actually okay because 273 | // they don't affect stack ownership. 274 | 275 | // _Gidle means this goroutine was just allocated and has not 276 | // yet been initialized. 277 | _Gidle = iota // 0 278 | 279 | // _Grunnable means this goroutine is on a run queue. It is 280 | // not currently executing user code. The stack is not owned.
281 | _Grunnable // 1 282 | 283 | // _Grunning means this goroutine may execute user code. The 284 | // stack is owned by this goroutine. It is not on a run queue. 285 | // It is assigned an M and a P (g.m and g.m.p are valid). 286 | _Grunning // 2 287 | 288 | // _Gsyscall means this goroutine is executing a system call. 289 | // It is not executing user code. The stack is owned by this 290 | // goroutine. It is not on a run queue. It is assigned an M. 291 | _Gsyscall // 3 292 | 293 | // _Gwaiting means this goroutine is blocked in the runtime. 294 | // It is not executing user code. It is not on a run queue, 295 | // but should be recorded somewhere (e.g., a channel wait 296 | // queue) so it can be ready()d when necessary. The stack is 297 | // not owned *except* that a channel operation may read or 298 | // write parts of the stack under the appropriate channel 299 | // lock. Otherwise, it is not safe to access the stack after a 300 | // goroutine enters _Gwaiting (e.g., it may get moved). 301 | _Gwaiting // 4 302 | 303 | // _Gmoribund_unused is currently unused, but hardcoded in gdb 304 | // scripts. 305 | _Gmoribund_unused // 5 306 | 307 | // _Gdead means this goroutine is currently unused. It may be 308 | // just exited, on a free list, or just being initialized. It 309 | // is not executing user code. It may or may not have a stack 310 | // allocated. The G and its stack (if any) are owned by the M 311 | // that is exiting the G or that obtained the G from the free 312 | // list. 313 | _Gdead // 6 314 | 315 | // _Genqueue_unused is currently unused. 316 | _Genqueue_unused // 7 317 | 318 | // _Gcopystack means this goroutine's stack is being moved. It 319 | // is not executing user code and is not on a run queue. The 320 | // stack is owned by the goroutine that put it in _Gcopystack. 321 | _Gcopystack // 8 322 | 323 | // _Gpreempted means this goroutine stopped itself for a 324 | // suspendG preemption.
It is like _Gwaiting, but nothing is 325 | // yet responsible for ready()ing it. Some suspendG must CAS 326 | // the status to _Gwaiting to take responsibility for 327 | // ready()ing this G. 328 | _Gpreempted // 9 329 | 330 | // _Gscan combined with one of the above states other than 331 | // _Grunning indicates that GC is scanning the stack. The 332 | // goroutine is not executing user code and the stack is owned 333 | // by the goroutine that set the _Gscan bit. 334 | // 335 | // _Gscanrunning is different: it is used to briefly block 336 | // state transitions while GC signals the G to scan its own 337 | // stack. This is otherwise like _Grunning. 338 | // 339 | // atomicstatus&^_Gscan (the status with the _Gscan bit cleared) gives the state the goroutine will 340 | // return to when the scan completes. 341 | _Gscan = 0x1000 // scan flag bit; added to a base status to form the values below 342 | _Gscanrunnable = _Gscan + _Grunnable // 0x1001 343 | _Gscanrunning = _Gscan + _Grunning // 0x1002 344 | _Gscansyscall = _Gscan + _Gsyscall // 0x1003 345 | _Gscanwaiting = _Gscan + _Gwaiting // 0x1004 346 | _Gscanpreempted = _Gscan + _Gpreempted // 0x1009 347 | ) 348 | --------------------------------------------------------------------------------