├── .buckconfig ├── .gitignore ├── BUCK ├── LICENSE ├── README.md ├── buckaroo.json ├── buckaroo.lock.json └── smallfun ├── apps └── main.cpp └── include └── smallfun.hpp /.buckconfig: -------------------------------------------------------------------------------- 1 | [cxx] 2 | cxxflags = -O3 -Os -Ofast 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .buckd 2 | .buckconfig.local 3 | buck-out 4 | BUCKAROO_DEPS 5 | buckaroo 6 | -------------------------------------------------------------------------------- /BUCK: -------------------------------------------------------------------------------- 1 | # Generated by Buckaroo - https://buckaroo.pm 2 | include_defs('//BUCKAROO_DEPS') 3 | 4 | cxx_library( 5 | name = 'smallfun', 6 | header_namespace = 'smallfun', 7 | srcs = glob([ 8 | 'smallfun/src/**/*.cpp', 9 | ]), 10 | headers = subdir_glob([ # private include files 11 | ('smallfun/detail', '**/*.h'), # they are only accesible inside the library 12 | ('smallfun/detail', '**/*.hpp'), 13 | ]), 14 | exported_headers = subdir_glob([ # public include files 15 | ('smallfun/include', '**/*.h'), # those will be exported 16 | ('smallfun/include', '**/*.hpp'), # and accessible via 17 | ]), 18 | deps = BUCKAROO_DEPS, 19 | visibility = ['PUBLIC'] 20 | ) 21 | 22 | cxx_binary( 23 | name = 'main', 24 | srcs = ['smallfun/apps/main.cpp'], 25 | deps = [':smallfun'], 26 | linker_flags = ['-lpthread'], 27 | visibility = ['PUBLIC'] 28 | ) 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 LoopPerfect 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without 
restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # smallfunction 2 | 3 | `SmallFun` is an alternative to `std::function`, which implements *fixed size capture optimization* (a form of small buffer optimization). In some benchmarks, it is 3-5x faster than `std::function`. 4 | 5 | ## Background 6 | 7 | `std::function` is a convenient way to store lambdas with closures (also known as captures), whilst providing a unified interface. 8 | Before `std::function` and lambdas, we would create a hand-crafted functor object like this: 9 | 10 | ```c++= 11 | struct Functor { 12 | // The context, or capture 13 | // For example, an int and an unsigned 14 | int i; 15 | unsigned N; 16 | 17 | // The lambda 18 | int operator() (int j) const { 19 | // For example, a small math function 20 | return i * j + N; 21 | } 22 | }; 23 | ``` 24 | 25 | This repository compares `std::function`, the hand-crafted `Functor` and `SmallFun`. 
We find that `SmallFun` performs better than `std::function` by being slightly less generic. 26 | 27 | 28 | ## What is std::function's Missed Opportunity? 29 | 30 | `std::function` uses a [PImpl pattern](http://en.cppreference.com/w/cpp/language/pimpl) to provide a unified interface across all functors for a given signature. 31 | 32 | For example, these two instances `f` and `g` have the same size, despite having different captures: 33 | 34 | ```c++= 35 | int x = 2; 36 | int y = 9; 37 | int z = 4; 38 | 39 | // f captures nothing 40 | std::function f = [](int i) { 41 | return i + 1; 42 | }; 43 | 44 | // g captures x, y and z 45 | std::function g = [=](int i) { 46 | return (i * (x + z)) + y; 47 | }; 48 | ``` 49 | 50 | This is because `std::function` stores the capture on the **heap**. This unifies the size of all instances, but it is also an opportunity for optimization! 51 | 52 | ## How? 53 | 54 | Instead of dynamically allocating memory on the **heap**, we can place the function object (including its virtual table) into a preallocated location on the **stack**. 
55 | 56 | This is how we implemented `SmallFun`, which is used much like `std::function`: 57 | 58 | ```c++= 59 | // A SmallFun with capture size of 64 bytes 60 | SmallFun f = [i, N] (int j) { 61 | return i * j + N; 62 | }; 63 | ``` 64 | 65 | 66 | ## Benchmarks 67 | 68 | | test |time(g++6)| time clang++6 & libc++ | note | 69 | |---------------|-----------|---------|------------------------------| 70 | | functor | 191 ns | 120 ns | baseline that's the best we could do: a hand crafted functor | 71 | | sf32 | 312 ns | 300 ns | This is big enough to store 2 ints | 72 | | sf64 | 369 ns | 310 ns | | 73 | | sf128 | 346 ns | 333 ns | | 74 | | sf256 | 376 ns | 320 ns | | 75 | | sf512 | 503 ns | 450 ns | | 76 | | sf1024 | 569 ns | 512 ns | | 77 | | sf2048 | 870 ns | 709 ns | | 78 | | std::function | 1141 ns | 1511 ns | That's how std::function performs | 79 | 80 | 81 | ### The Test 82 | 83 | To test how quickly we can allocate and call functors, we will be saving all the many instances in a vector and executing them in a loop. The results are saved into another vector to ensure that the optimizer does not optimize away what we are testing. 84 | 85 | ```c++= 86 | void stdFunction(benchmark::State& state) { 87 | 88 | unsigned N = 100; 89 | 90 | std::vector> fs(N); 91 | std::vector r(N); 92 | 93 | while (state.KeepRunning()) { 94 | 95 | for (int i = 0; i < N; ++i) { 96 | fs[i] = [i, N] (int j) { // assign to the type erased container 97 | return i * j + N; 98 | }; 99 | }; 100 | 101 | int j = 0; 102 | std::transform(fs.begin(), fs.end(), r.begin(), [&](auto const& f) { 103 | return f(j++); // eval the function objects 104 | }); 105 | } 106 | } 107 | ``` 108 | 109 | 110 | # SmallFun Implementation Details 111 | 112 | We need to combine three C++ patterns: type-erasure, PImpl and placement-new. 113 | 114 | ## Type Erasure 115 | 116 | Type Erasure unifies many implementations into one interface. In our case, every lambda or functor has a custom call operator and destructor. 
We need to automatically generate an implementation for any type the API consumer will be using. 117 | 118 | This shall be our public interface: 119 | 120 | ```c++= 121 | template 122 | struct Concept { 123 | virtual ReturnType operator()(Xs...) const = 0; 124 | virtual ReturnType operator()(Xs...) = 0; 125 | virtual ~Concept() {}; 126 | }; 127 | ``` 128 | 129 | And for any callable type with a given signature: 130 | 131 | ```c++= 132 | template 133 | struct Model final 134 | : Concept { 135 | F f; 136 | 137 | Model(F const& f) 138 | : f(f) 139 | {} 140 | 141 | virtual ReturnType operator()(Xs...xs) const { 142 | return f(xs...); 143 | } 144 | 145 | virtual ReturnType operator()(Xs...xs) { 146 | return f(xs...); 147 | } 148 | 149 | virtual ~Model() {} 150 | }; 151 | ``` 152 | 153 | Now we can use it in the following way: 154 | 155 | ```c++= 156 | auto lambda = [](int x) { return x; }; 157 | using lambdaType = decltype(lambda); 158 | 159 | SFConcept* functor = new Model(lambda); 160 | ``` 161 | 162 | This is quite cumbersome and error-prone. The next step will be a container. 163 | 164 | 165 | ## PImpl 166 | 167 | PImpl separates, hides, and manages the lifetime of an actual implementation, and exposes a limited public API. 168 | 169 | A straightforward implementation could look like this: 170 | 171 | ```c++= 172 | template 173 | class Function { 174 | std::shared_ptr> pimpl; 175 | 176 | public: 177 | Function() {} 178 | 179 | template 180 | Function(F const& f) 181 | : pimpl(new SFModel ) // heap allocation 182 | {} 183 | 184 | ~Function() = default; 185 | }; 186 | ``` 187 | 188 | This is more or less how `std::function` is implemented. 189 | 190 | So how do we get rid of the heap allocation? 191 | 192 | ## placement-new 193 | 194 | Placement-new constructs an object at a given, pre-allocated address instead of allocating new memory. 
For example: 195 | 196 | ```c++= 197 | char memorypool[64]; 198 | int* a = new (memorypool) int[4]; 199 | int* b = new (memorypool + sizeof(int) * 4 ) int[4]; 200 | assert( (void*)a[0] == (void*)memorypool[0] ); 201 | assert( (void*)b[0] == (void*)memorypool[32] ); 202 | ``` 203 | 204 | ## Putting it All Together 205 | 206 | Now we only need to make minor changes to remove the heap allocation: 207 | 208 | ```c++= 209 | template 210 | class SmallFun { 211 | char memory[SIZE]; 212 | public: 213 | template 214 | SmallFun(F const& f) 215 | : new (memory) Model { 216 | assert( sizeof(Model) < SIZE ); 217 | } 218 | 219 | ~SmallFun() { 220 | if (allocated) { 221 | ((concept*)memory)->~concept(); 222 | } 223 | } 224 | }; 225 | ``` 226 | 227 | As you may have noticed, if the `Model<...>`'s size is greater than `SIZE` bad bad things will happen and an assert will only catch this at run-time when it is too late... Luckily, this can be caught at compile-time using `enable_if_t`. 228 | 229 | But first, what about the copy constructor? 230 | 231 | 232 | ## Copy Constructor 233 | 234 | Unlike the implementation of `std::function`, we cannot just copy or move a `std::shared_ptr`. We also cannot just copy the memory bitwise, as the lambda may manage a resource that can only be released once or has a side-effect. Therefore, we need to make the model able to copy-construct itself for a given memory location: 235 | 236 | We just need to add: 237 | 238 | ```c++= 239 | // ... 240 | 241 | virtual void copy(void* memory) const { 242 | new (memory) Model(f); 243 | } 244 | 245 | 246 | template = 0> 248 | SmallFun(SmallFun const& rhs) { 249 | rhs.copy(memory); 250 | } 251 | 252 | // ... 253 | ``` 254 | 255 | 256 | # Further Remarks 257 | 258 | - As we saw, we can verify at compile-time if a Lambda will fit in our memory. 259 | If it does not, we could provide a fallback to heap allocation. 260 | 261 | - A more generic implementation of `SmallFun` would take a generic allocator. 
262 | 263 | - We noticed that we cannot copy the memory just by copying the memory bitwise. However using type-traits, we could check if 264 | the underlying data-type is POD and then copy bitwise. 265 | -------------------------------------------------------------------------------- /buckaroo.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "smallfun", 3 | "dependencies": { 4 | "google/benchmark": "1.1.0" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /buckaroo.lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "google/benchmark": { 3 | "source": { 4 | "url": "https://github.com/google/benchmark/archive/4f8bfeae470950ef005327973f15b0044eceaceb.zip", 5 | "sha256": "9be58359796160a073c9b1e75580239433e72c8fc6b40dd6e2d264b817e0e728", 6 | "subPath": "benchmark-4f8bfeae470950ef005327973f15b0044eceaceb" 7 | }, 8 | "buck": { 9 | "url": "https://raw.githubusercontent.com/nikhedonia/benchmark/cea8820528eee7aa0e8442f8d08ea4cc0fe236d7/BUCK", 10 | "sha256": "8718a5ad5823722a539f8fc9c067e7659c37063107d8f42c381a05001b5a07cc" 11 | } 12 | } 13 | } -------------------------------------------------------------------------------- /smallfun/apps/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace smallfun; 12 | 13 | struct Functor { 14 | int i; 15 | unsigned N; 16 | 17 | constexpr int operator()(int j)const { 18 | return i*j+N; 19 | } 20 | }; 21 | 22 | void functor(benchmark::State& state) { 23 | unsigned N = 100; 24 | std::vector fs(N); 25 | std::vector r(N); 26 | 27 | while(state.KeepRunning()) { 28 | 29 | for(int i=0; i < N; ++i) { 30 | fs[i] = Functor{i, N}; 31 | }; 32 | 33 | int j = 0; 34 | std::transform(fs.begin(), fs.end(), r.begin(), [&](auto 
const& f) { 35 | return f(j++); // execute the functor 36 | }); 37 | } 38 | 39 | if( r[N-1] - r[0] != 9801 ) { 40 | // lets make sure the optimizer does not optimizes away the thing we want to test 41 | std::cout << r[N-1] - r[0] << std::endl; 42 | } 43 | } 44 | 45 | template 46 | void smallFunction(benchmark::State& state) { 47 | unsigned N = 100; 48 | 49 | using sf = SmallFun; 50 | std::vector fs(N); 51 | std::vector r(N); 52 | while(state.KeepRunning()) { 53 | for(int i=0; i < N; ++i) { 54 | fs[i] = [i, N] (int j) { 55 | return i*j+N; 56 | }; 57 | }; 58 | 59 | int j = 0; 60 | std::transform(fs.begin(), fs.end(), r.begin(), [&](auto const& f) { 61 | return f(j++); 62 | }); 63 | } 64 | 65 | if( r[N-1] - r[0] != 9801 ) { 66 | std::cout << r[N-1] - r[0] << std::endl; 67 | } 68 | } 69 | 70 | 71 | void stdFunction(benchmark::State& state) { 72 | unsigned N = 100; 73 | std::vector> fs(N); 74 | std::vector r(N); 75 | while(state.KeepRunning()) { 76 | for(int i=0; i < N; ++i) { 77 | fs[i] = [i, N] (int j) { 78 | return i*j+N; 79 | }; 80 | }; 81 | 82 | int j = 0; 83 | std::transform(fs.begin(), fs.end(), r.begin(), [&](auto const& f) { 84 | return f(j++); 85 | }); 86 | } 87 | 88 | if( r[N-1] - r[0] != 9801 ) { 89 | std::cout << r[N-1] - r[0] << std::endl; 90 | } 91 | } 92 | 93 | 94 | auto sf32 = smallFunction<32>; 95 | auto sf64 = smallFunction<64>; 96 | auto sf128 = smallFunction<128>; 97 | auto sf256 = smallFunction<256>; 98 | auto sf512 = smallFunction<512>; 99 | auto sf1024 = smallFunction<1024>; 100 | auto sf2048 = smallFunction<2048>; 101 | 102 | BENCHMARK(functor); 103 | BENCHMARK(sf32); 104 | BENCHMARK(sf64); 105 | BENCHMARK(sf128); 106 | BENCHMARK(sf256); 107 | BENCHMARK(sf512); 108 | BENCHMARK(sf1024); 109 | BENCHMARK(sf2048); 110 | BENCHMARK(stdFunction); 111 | 112 | BENCHMARK_MAIN(); 113 | -------------------------------------------------------------------------------- /smallfun/include/smallfun.hpp: 
#ifndef SMALLFUNCTION_SMALLFUNCTION_HPP
#define SMALLFUNCTION_SMALLFUNCTION_HPP

#include <cassert>
#include <cstddef>
#include <new>
#include <type_traits>
#include <utility>

namespace smallfun {

/// Type-erased interface for a callable with signature ReturnType(Xs...).
///
/// Besides the two call operators it exposes `copy`, which lets the stored
/// callable copy-construct itself into caller-provided storage without the
/// caller knowing its concrete type.
template<class ReturnType, class...Xs>
struct SFConcept {
  virtual ReturnType operator()(Xs...) const = 0;
  virtual ReturnType operator()(Xs...) = 0;
  /// Copy-constructs the concrete model into `memory` (placement-new).
  virtual void copy(void*) const = 0;
  virtual ~SFConcept() = default;
};

/// Concrete model wrapping a callable `F` behind SFConcept.
template<class F, class ReturnType, class...Xs>
struct SFModel final
  : SFConcept<ReturnType, Xs...> {
  F f;

  SFModel(F const& f)
    : f(f)
  {}

  /// Placement-constructs a copy of this model at `memory`. The caller
  /// guarantees the storage is large enough and suitably aligned.
  void copy(void* memory) const override {
    new (memory) SFModel(f);
  }

  ReturnType operator()(Xs...xs) const override {
    return f(xs...);
  }

  ReturnType operator()(Xs...xs) override {
    return f(xs...);
  }
};


template<class Signature, unsigned size = 128>
struct SmallFun;

/// Fixed-capacity alternative to std::function: the callable is stored
/// in-place in a `size`-byte buffer, so no heap allocation takes place.
///
/// A callable is accepted only if its model fits into the buffer (checked at
/// compile time). A default-constructed or clean()-ed SmallFun is empty;
/// invoking an empty SmallFun is a precondition violation.
template<class ReturnType, class...Xs, unsigned size>
class SmallFun<ReturnType(Xs...), size> {
  // Cross-size copies need to read the other specialization's `allocated`.
  template<class, unsigned> friend struct SmallFun;

  // Named `Concept` rather than `concept`: the latter is a C++20 keyword.
  using Concept = SFConcept<ReturnType, Xs...>;

  // Max-aligned so that any model (vptr + captures) can be placed at offset 0.
  alignas(std::max_align_t) char memory[size];
  bool allocated = false;

  // The model is always constructed at the start of `memory`; like the rest
  // of this type, this relies on the SFConcept base living at offset 0.
  Concept* target() noexcept {
    return static_cast<Concept*>(static_cast<void*>(memory));
  }

  Concept const* target() const noexcept {
    return static_cast<Concept const*>(static_cast<void const*>(memory));
  }

  // Destroys the current target (if any) and deep-copies the one held by `sf`.
  template<unsigned s>
  void assign(SmallFun<ReturnType(Xs...), s> const& sf) {
    clean();
    sf.copy(memory);           // if this throws, *this is left empty
    allocated = sf.allocated;
  }

public:
  /// Creates an empty SmallFun.
  SmallFun() noexcept {}

  /// Stores a copy of `f`. Participates in overload resolution only if the
  /// model for `F` fits into the buffer and its alignment is supported.
  template<class F,
    std::enable_if_t<
      (sizeof(SFModel<F, ReturnType, Xs...>) <= size) &&
      (alignof(SFModel<F, ReturnType, Xs...>) <= alignof(std::max_align_t)),
      bool> = 0>
  SmallFun(F const& f) {
    new (memory) SFModel<F, ReturnType, Xs...>(f);
    allocated = true;          // set only once construction has succeeded
  }

  /// Copy constructor. A constructor template is never a copy constructor,
  /// so without this the compiler-generated one would copy the buffer
  /// bitwise and non-trivial captures would be destroyed twice.
  SmallFun(SmallFun const& sf) {
    sf.copy(memory);
    allocated = sf.allocated;
  }

  /// Copies from a SmallFun with the same signature and a smaller or equal
  /// capacity.
  template<unsigned s,
    std::enable_if_t<(s <= size), bool> = 0>
  SmallFun(SmallFun<ReturnType(Xs...), s> const& sf) {
    sf.copy(memory);
    allocated = sf.allocated;
  }

  /// Copy assignment (see the copy constructor for why it must be explicit).
  SmallFun& operator=(SmallFun const& sf) {
    if (this != &sf) {         // self-assignment would destroy the source
      assign(sf);
    }
    return *this;
  }

  /// Assigns from a SmallFun with the same signature and a smaller or equal
  /// capacity.
  template<unsigned s,
    std::enable_if_t<(s <= size), bool> = 0>
  SmallFun& operator=(SmallFun<ReturnType(Xs...), s> const& sf) {
    if (static_cast<void const*>(this) != static_cast<void const*>(&sf)) {
      assign(sf);
    }
    return *this;
  }

  /// Destroys the stored callable, if any, leaving this SmallFun empty.
  void clean() {
    if (allocated) {
      target()->~Concept();
      allocated = false;
    }
  }

  ~SmallFun() {
    clean();
  }

  /// Invokes the stored callable. Precondition: not empty.
  template<class...Ys>
  ReturnType operator()(Ys&&...ys) {
    assert(allocated && "SmallFun invoked while empty");
    return (*target())(std::forward<Ys>(ys)...);
  }

  /// Invokes the stored callable through its const call operator.
  /// Precondition: not empty.
  template<class...Ys>
  ReturnType operator()(Ys&&...ys) const {
    assert(allocated && "SmallFun invoked while empty");
    return (*target())(std::forward<Ys>(ys)...);
  }

  /// Copy-constructs the stored callable into `data`; does nothing when
  /// empty. `data` must be max-aligned and at least `size` bytes large.
  void copy(void* data) const {
    if (allocated) {
      target()->copy(data);
    }
  }
};

}

#endif