├── .gitignore ├── concrete_vs_interface.go ├── concrete_vs_interface_pointers_inplace.go ├── concrete_vs_interface_pointers.go └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | *.gor 7 | 8 | # Folders 9 | _obj 10 | _test 11 | 12 | # Architecture specific extensions/prefixes 13 | *.[568vq] 14 | [568vq].out 15 | 16 | *.cgo1.go 17 | *.cgo2.c 18 | _cgo_defun.c 19 | _cgo_gotypes.go 20 | _cgo_export.* 21 | 22 | _testmain.go 23 | 24 | *.exe 25 | *.test 26 | 27 | 28 | *.sublime-workspace 29 | *.sw* 30 | *.un* 31 | 32 | app.conf.json 33 | docker.conf.json 34 | 35 | target/ 36 | -------------------------------------------------------------------------------- /concrete_vs_interface.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/pkg/profile" 8 | ) 9 | 10 | // ----------------------------------------------------------------------------- 11 | 12 | // Int provides an `int64` that implements the `Summable` interface. 13 | type Int int64 14 | 15 | // Sum simply adds two `Int`s. 
16 | func (i Int) Sum(i2 Int) Int { return i + i2 } 17 | 18 | type Summable interface { 19 | Sum(i Int) Int 20 | } 21 | 22 | // ----------------------------------------------------------------------------- 23 | 24 | const nbOps int = 1e8 // number of `Sum` calls timed in each loop (1e8 = 100 million) 25 | // main times nbOps calls to `Sum`, first directly on an `Int` value and then through a `Summable`, and prints the elapsed time of each loop. 26 | func main() { 27 | defer profile.Start(profile.CPUProfile).Stop() // profiling starts here and is stopped when main returns 28 | 29 | var start time.Time 30 | 31 | var iConcrete Int // accumulator for the direct calls on the concrete type 32 | start = time.Now() 33 | for i := 0; i < nbOps; i++ { 34 | iConcrete = iConcrete.Sum(Int(10)) 35 | } 36 | _ = iConcrete 37 | fmt.Printf("[concrete] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 38 | 39 | var iInterface Summable = Int(0) // same zero start value, but held in a `Summable` 40 | start = time.Now() 41 | for i := 0; i < nbOps; i++ { 42 | iInterface = iInterface.Sum(Int(10)) // the `Int` result is converted back to `Summable` on every iteration 43 | } 44 | _ = iInterface 45 | fmt.Printf("[interface] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 46 | } 47 | -------------------------------------------------------------------------------- /concrete_vs_interface_pointers_inplace.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/pkg/profile" 8 | ) 9 | 10 | // ----------------------------------------------------------------------------- 11 | 12 | // Int is an `int64`; its pointer type `*Int` implements the `Summable` interface. 13 | type Int int64 14 | 15 | // Sum adds `i2` to the receiver in place. 
16 | func (i *Int) Sum(i2 Int) { *i += i2 } 17 | 18 | type Summable interface { 19 | Sum(i Int) 20 | } 21 | 22 | // ----------------------------------------------------------------------------- 23 | 24 | const nbOps int = 1e8 // number of `Sum` calls timed in each loop (1e8 = 100 million) 25 | // main times nbOps in-place calls to `Sum`, first directly on a `*Int` and then through a `Summable`, and prints the elapsed time of each loop. 26 | func main() { 27 | defer profile.Start(profile.CPUProfile).Stop() // profiling starts here and is stopped when main returns 28 | 29 | var start time.Time 30 | var zero Int // single accumulator: both loops below operate on &zero 31 | 32 | var iConcrete *Int = &zero // direct calls on the concrete pointer type 33 | start = time.Now() 34 | for i := 0; i < nbOps; i++ { 35 | iConcrete.Sum(Int(10)) 36 | } 37 | _ = iConcrete 38 | fmt.Printf("[concrete] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 39 | 40 | var iInterface Summable = &zero // same &zero, so this loop keeps adding to the total left by the concrete loop 41 | start = time.Now() 42 | for i := 0; i < nbOps; i++ { 43 | iInterface.Sum(Int(10)) 44 | } 45 | _ = iInterface 46 | fmt.Printf("[interface] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 47 | } 48 | -------------------------------------------------------------------------------- /concrete_vs_interface_pointers.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/pkg/profile" 8 | ) 9 | 10 | // ----------------------------------------------------------------------------- 11 | 12 | // Int is an `int64`; its pointer type `*Int` implements the `Summable` interface. 13 | type Int int64 14 | 15 | // Sum adds `i2` to the receiver in place and returns the receiver. 
16 | func (i *Int) Sum(i2 Int) *Int { *i += i2; return i } 17 | 18 | type Summable interface { 19 | Sum(i Int) *Int 20 | } 21 | 22 | // ----------------------------------------------------------------------------- 23 | 24 | const nbOps int = 1e8 // number of `Sum` calls timed in each loop (1e8 = 100 million) 25 | // main times nbOps calls to `Sum`, first directly on a `*Int` and then through a `Summable`, and prints the elapsed time of each loop. 26 | func main() { 27 | defer profile.Start(profile.CPUProfile).Stop() // profiling starts here and is stopped when main returns 28 | 29 | var start time.Time 30 | var zero Int // single accumulator: both loops below operate on &zero 31 | 32 | var iConcrete *Int = &zero // direct calls on the concrete pointer type 33 | start = time.Now() 34 | for i := 0; i < nbOps; i++ { 35 | iConcrete = iConcrete.Sum(Int(10)) // Sum returns its receiver, so iConcrete keeps pointing at zero 36 | } 37 | _ = iConcrete 38 | fmt.Printf("[concrete] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 39 | 40 | var iInterface Summable = &zero // same &zero, so this loop keeps adding to the total left by the concrete loop 41 | start = time.Now() 42 | for i := 0; i < nbOps; i++ { 43 | iInterface = iInterface.Sum(Int(10)) // the returned `*Int` is stored back into the `Summable` on every iteration 44 | } 45 | _ = iInterface 46 | fmt.Printf("[interface] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 47 | } 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Interface-Bench 2 | 3 | A quick look at some of Go's subtleties regarding the use of interfaces and the performance issues that might ensue. 4 | 5 | 6 | 7 | **Table of Contents** 8 | 9 | - [Interface-Bench](#interface-bench) 10 | - [Round I: Method calls on concrete types vs. interfaces](#round-i-method-calls-on-concrete-types-vs-interfaces) 11 | - [Why are the concrete calls this fast?](#why-are-the-concrete-calls-this-fast) 12 | - [Why are the interface calls this slow?](#why-are-the-interface-calls-this-slow) 13 | - [Round II: Pointers](#round-ii-pointers) 14 | - [4x slower concrete calls](#4x-slower-concrete-calls) 15 | - [5x faster interface calls](#5x-faster-interface-calls) 16 | - [Round III: In-place](#round-iii-in-place) 17 | - [Wait... 
did the concrete calls just get slower?!](#wait-did-the-concrete-calls-just-get-slower) 18 | - [Interface calls are now as fast as concrete calls!](#interface-calls-are-now-as-fast-as-concrete-calls) 19 | - [Conclusion](#conclusion) 20 | 21 | 22 | 23 | ## Round I: Method calls on concrete types vs. interfaces 24 | 25 | Let's compare the performances of doing 100 million method calls on a concrete type (`int64`) vs. an interface. 26 | The code is [as follows](./concrete_vs_interface.go): 27 | ```Go 28 | package main 29 | 30 | import ( 31 | "fmt" 32 | "time" 33 | 34 | "github.com/pkg/profile" 35 | ) 36 | 37 | // ----------------------------------------------------------------------------- 38 | 39 | // Int provides an `int64` that implements the `Summable` interface. 40 | type Int int64 41 | 42 | // Sum simply adds two `Int`s. 43 | func (i Int) Sum(i2 Int) Int { return i + i2 } 44 | 45 | type Summable interface { 46 | Sum(i Int) Int 47 | } 48 | 49 | // ----------------------------------------------------------------------------- 50 | 51 | const nbOps int = 1e8 52 | 53 | func main() { 54 | defer profile.Start(profile.CPUProfile).Stop() 55 | 56 | var start time.Time 57 | 58 | var iConcrete Int 59 | start = time.Now() 60 | for i := 0; i < nbOps; i++ { 61 | iConcrete = iConcrete.Sum(Int(10)) 62 | } 63 | _ = iConcrete 64 | fmt.Printf("[concrete] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 65 | 66 | var iInterface Summable = Int(0) 67 | start = time.Now() 68 | for i := 0; i < nbOps; i++ { 69 | iInterface = iInterface.Sum(Int(10)) 70 | } 71 | _ = iInterface 72 | fmt.Printf("[interface] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 73 | } 74 | ``` 75 | 76 | Pretty straightforward stuff. 
The results look like these: 77 | ``` 78 | $ go version 79 | go version go1.6.2 darwin/amd64 80 | $ sysctl -n machdep.cpu.brand_string 81 | Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz 82 | $ go run concrete_vs_interface.go 83 | [concrete] computed 100000000 sums in 41.966579ms 84 | [interface] computed 100000000 sums in 2.799456753s 85 | ``` 86 | 87 | At first glance, it would seem that going through the interface dispatch machinery comes with a terrifying 6500% slow-down... and, indeed, that's what I thought at first; until @twotwotwo [rightly demonstrated how completely broken my benchmark actually was](https://github.com/teh-cmc/interface-bench/issues/1). 88 | 89 | So, what is really going on here? 90 | For a concise answer, have a look at #1. If you're looking for the long version, you may continue reading below. 91 | 92 | ### Why are the concrete calls this fast? 93 | 94 | 100 million calls to a `Sum` method in 42ms, that's a staggering ~2.4 billion sums per second. Any way you look at it, that's _real_ fast. 95 | 96 | There are two main reasons behind this speed. 97 | 98 | **1. inlining** 99 | 100 | Using the `-m` gcflag will show that the compiler is inlining the call to `iConcrete.Sum(Int(10))`: 101 | ``` 102 | $ go run -gcflags '-m' main.go 103 | # command-line-arguments 104 | ./main.go:16: can inline Int.Sum 105 | ./main.go:34: inlining call to Int.Sum 106 | ...(rest omitted)... 107 | ``` 108 | 109 | This obviously avoids a lot of copying. Running the same code with inlining disabled (via the `-l` gcflag) shows a 10x drop in performance: 110 | ``` 111 | $ go run -gcflags '-l' main.go 112 | [concrete] computed 100000000 sums in 413.927641ms 113 | ``` 114 | 115 | **2. 
no escaping** 116 | 117 | There are no pointers involved in this snippet, no variables escaping from the stack either; and with `Sum` being inlined, everything is literally happening within the same stack-frame: there is simply no work at all to be done by the memory allocator or the garbage collector. 118 | 119 | Go code can't go much faster than this. 120 | 121 | ### Why are the interface calls this slow? 122 | 123 | 100 million calls in 2800ms (~35.7 million per sec), on the other hand, seems particularly slow. 124 | 125 | As @twotwotwo mentioned in #1, this slow-down stems from a change that shipped with the 1.4 release of the Go runtime: 126 | > The implementation of interface values has been modified. In earlier releases, the interface contained a word that was either a pointer or a one-word scalar value, depending on the type of the concrete object stored. This implementation was problematical for the garbage collector, so as of 1.4 interface values always hold a pointer. In running programs, most interface values were pointers anyway, so the effect is minimal, but programs that store integers (for example) in interfaces will see more allocations. 127 | 128 | Because of this, every time the result of `iInterface.Sum(Int(10))` is assigned back to `iInterface`, `sizeof(Int)` bytes have to be allocated on the heap and the returned value has to be copied to that new location. 129 | 130 | This obviously induces a huge amount of work; and, indeed, a pprof trace shows that most of the time is spent allocating bytes and copying values as part of the process of converting types to interfaces (i.e. 
`runtime.convT2I`): 131 | ``` 132 | flat flat% sum% cum cum% 133 | 740ms 28.24% 28.24% 1180ms 45.04% runtime.mallocgc 134 | 340ms 12.98% 41.22% 1520ms 58.02% runtime.newobject 135 | 270ms 10.31% 51.53% 270ms 10.31% runtime.mach_semaphore_signal 136 | 260ms 9.92% 61.45% 2490ms 95.04% main.main 137 | 220ms 8.40% 69.85% 2090ms 79.77% runtime.convT2I 138 | 180ms 6.87% 76.72% 180ms 6.87% runtime.memmove 139 | 150ms 5.73% 82.44% 330ms 12.60% runtime.typedmemmove 140 | 130ms 4.96% 87.40% 130ms 4.96% main.(*Int).Sum 141 | 100ms 3.82% 91.22% 100ms 3.82% runtime.prefetchnta 142 | 80ms 3.05% 94.27% 80ms 3.05% runtime.(*mspan).sweep.func1 143 | ``` 144 | 145 | Note that GC/STW latencies are not even part of the equation here: if you try running this program with the GC disabled (`GOGC=off`), you should get the exact same results (with Go 1.6+ at least). 146 | 147 | So, how can we fix this? 148 | 149 | ## Round II: Pointers 150 | 151 | An idea that naturally comes to mind when trying to reduce copying is to use pointers. 152 | The code is [as follows](./concrete_vs_interface_pointers.go): 153 | ```Go 154 | package main 155 | 156 | import ( 157 | "fmt" 158 | "time" 159 | 160 | "github.com/pkg/profile" 161 | ) 162 | 163 | // ----------------------------------------------------------------------------- 164 | 165 | // Int provides an `int64` that implements the `Summable` interface. 166 | type Int int64 167 | 168 | // Sum simply adds two `Int`s. 
169 | func (i *Int) Sum(i2 Int) *Int { *i += i2; return i } 170 | 171 | type Summable interface { 172 | Sum(i Int) *Int 173 | } 174 | 175 | // ----------------------------------------------------------------------------- 176 | 177 | const nbOps int = 1e8 178 | 179 | func main() { 180 | defer profile.Start(profile.CPUProfile).Stop() 181 | 182 | var start time.Time 183 | var zero Int 184 | 185 | var iConcrete *Int = &zero 186 | start = time.Now() 187 | for i := 0; i < nbOps; i++ { 188 | iConcrete = iConcrete.Sum(Int(10)) 189 | } 190 | _ = iConcrete 191 | fmt.Printf("[concrete] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 192 | 193 | var iInterface Summable = &zero 194 | start = time.Now() 195 | for i := 0; i < nbOps; i++ { 196 | iInterface = iInterface.Sum(Int(10)) 197 | } 198 | _ = iInterface 199 | fmt.Printf("[interface] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 200 | } 201 | ``` 202 | 203 | The code is almost the same, except that `Sum` now applies to a pointer and returns that same pointer as a result. 204 | 205 | The results look like these: 206 | ``` 207 | $ go version 208 | go version go1.6.2 darwin/amd64 209 | $ sysctl -n machdep.cpu.brand_string 210 | Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz 211 | $ go run concrete_vs_interface_pointers.go 212 | [concrete] computed 100000000 sums in 178.189757ms 213 | [interface] computed 100000000 sums in 593.659837ms 214 | ``` 215 | 216 | ### 4x slower concrete calls 217 | 218 | The reason for this slow-down is simply the overhead of dereferencing the `iConcrete` pointer for each summation. 219 | 220 | Not much more we can do here. 221 | 222 | ### 5x faster interface calls 223 | 224 | The reason for this speed-up is that we've completely removed the need to allocate `sizeof(Int)` on the heap and copy values around every time we assign the return value of `Sum` to `iInterface`. 
225 | 226 | A quick look at the pprof trace will confirm our thoughts: 227 | ``` 228 | flat flat% sum% cum cum% 229 | 280ms 56.00% 56.00% 280ms 56.00% main.(*Int).Sum 230 | 150ms 30.00% 86.00% 430ms 86.00% main.main 231 | 70ms 14.00% 100% 70ms 14.00% runtime.usleep 232 | 0 0% 100% 430ms 86.00% runtime.goexit 233 | 0 0% 100% 430ms 86.00% runtime.main 234 | 0 0% 100% 70ms 14.00% runtime.mstart 235 | 0 0% 100% 70ms 14.00% runtime.mstart1 236 | 0 0% 100% 70ms 14.00% runtime.sysmon 237 | ``` 238 | There is effectively no trace of `runtime.mallocgc`, `runtime.convT2I` or anything else that's part of the process of converting types to interfaces (T2I) here. 239 | 240 | This is still 3-4x slower than concrete calls though; can we make it even faster? 241 | 242 | ## Round III: In-place 243 | 244 | Since we're now applying `Sum` to a pointer, we might as well not return anything. 245 | This will make some nice chaining patterns impossible but, on the other hand, should entirely remove the overhead of creating an interface every time we assign the return value of `Sum` to `iInterface`. 246 | The code is [as follows](./concrete_vs_interface_pointers_inplace.go): 247 | ```Go 248 | package main 249 | 250 | import ( 251 | "fmt" 252 | "time" 253 | 254 | "github.com/pkg/profile" 255 | ) 256 | 257 | // ----------------------------------------------------------------------------- 258 | 259 | // Int provides an `int64` that implements the `Summable` interface. 260 | type Int int64 261 | 262 | // Sum simply adds two `Int`s. 
263 | func (i *Int) Sum(i2 Int) { *i += i2 } 264 | 265 | type Summable interface { 266 | Sum(i Int) 267 | } 268 | 269 | // ----------------------------------------------------------------------------- 270 | 271 | const nbOps int = 1e8 272 | 273 | func main() { 274 | defer profile.Start(profile.CPUProfile).Stop() 275 | 276 | var start time.Time 277 | var zero Int 278 | 279 | var iConcrete *Int = &zero 280 | start = time.Now() 281 | for i := 0; i < nbOps; i++ { 282 | iConcrete.Sum(Int(10)) 283 | } 284 | _ = iConcrete 285 | fmt.Printf("[concrete] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 286 | 287 | var iInterface Summable = &zero 288 | start = time.Now() 289 | for i := 0; i < nbOps; i++ { 290 | iInterface.Sum(Int(10)) 291 | } 292 | _ = iInterface 293 | fmt.Printf("[interface] computed %d sums in %v\n", nbOps, time.Now().Sub(start)) 294 | } 295 | ``` 296 | 297 | The results look like these: 298 | ``` 299 | $ go version 300 | go version go1.6.2 darwin/amd64 301 | $ sysctl -n machdep.cpu.brand_string 302 | Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz 303 | $ go run concrete_vs_interface_pointers_inplace.go 304 | [concrete] computed 100000000 sums in 215.192313ms 305 | [interface] computed 100000000 sums in 222.486475ms 306 | ``` 307 | 308 | The overhead of going through an interface is now barely noticeable.. in fact, it's sometimes even faster than the concrete calls! 309 | 310 | ### Wait... did the concrete calls just get slower?! 311 | 312 | _Yes, they did._ 313 | Removing the return value of the `Sum` method noticeably made the concrete calls ~17% slower (going from 180ms on average to 210ms). 314 | 315 | I haven't had the time to look into this, so I'm not sure about the exact cause for this slow-down; but I'm going to assume that the presence of the return value allows the compiler to do some tricky optimizations... 316 | I'll dig into this once I find the time; if you know what's going on, please open an issue! 
317 | 318 | ### Interface calls are now as fast as concrete calls! 319 | 320 | Finally, now that we've completely removed the need to build interfaces when assigning `Sum`'s return values, we've removed all the overhead we could remove. 321 | 322 | In this configuration, using either concrete or interface calls has virtually the same cost (although, in reality, concrete calls can be made faster with the use of a return value). 323 | 324 | ## Conclusion 325 | 326 | Technically, Go interfaces' method-dispatch machinery barely has any overhead compared to a simple method call on a concrete type. 327 | 328 | In practice, due to the way interfaces are implemented, it's easy to stumble upon various more-or-less obvious pitfalls that can result in a lot of overhead, primarily caused by implicit allocations and copies. 329 | Sometimes, compiler optimizations will save you; and sometimes they won't. 330 | --------------------------------------------------------------------------------