├── Mind the cache.pdf ├── README.md ├── aos_vs_soa.cpp ├── compact_aos_vs_soa.cpp ├── filtered_sum.cpp ├── linear_traversal.cpp ├── matrix_sum.cpp ├── parallel_count.cpp ├── perf.xlsx ├── poly_containers.cpp └── random_access_aos_vs_soa.cpp /Mind the cache.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joaquintides/usingstdcpp2015/dfec560c7733ab035e0e9690cb603afa7ee11e9a/Mind the cache.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | using std::cpp 2015 2 | ------------------- 3 | 4 | Presentation from Joaquín Mª López Muñoz at [using std::cpp 2015](http://usingstdcpp.org/using-stdcpp-2015/) and associated material. 5 | * [Mind the cache](https://github.com/joaquintides/usingstdcpp2015/raw/master/Mind%20the%20cache.pdf) ([video](https://www.youtube.com/watch?v=TipTVUGBFtY) in Spanish): Classical big-O algorithmic complexity analysis proves insufficient to estimate program performance for modern computer architectures: current processors are equipped with several low-level components (hierarchical cache structures, pipelining, branch prediction) that greatly favor certain code and data layout patterns not taken into account by naïve computation models. In this talk we see some examples of the impact these factors have and provide suggestions for performance improvement based on data locality and regularity in code execution. 6 | -------------------------------------------------------------------------------- /aos_vs_soa.cpp: -------------------------------------------------------------------------------- 1 | /* usingstdcpp2015: AOS vs SOA. 2 | * 3 | * Copyright 2015 Joaquin M Lopez Munoz. 4 | * Distributed under the Boost Software License, Version 1.0. 5 | * (See accompanying file LICENSE_1_0.txt or copy at 6 | * http://www.boost.org/LICENSE_1_0.txt) 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | std::chrono::high_resolution_clock::time_point measure_start,measure_pause; 15 | 16 | template 17 | double measure(F f) 18 | { 19 | using namespace std::chrono; 20 | 21 | static const int num_trials=10; 22 | static const milliseconds min_time_per_trial(200); 23 | std::array trials; 24 | volatile decltype(f()) res; /* to avoid optimizing f() away */ 25 | 26 | for(int i=0;i>(t2-measure_start).count()/runs; 37 | } 38 | (void)(res); /* var not used warn */ 39 | 40 | std::sort(trials.begin(),trials.end()); 41 | return std::accumulate( 42 | trials.begin()+2,trials.end()-2,0.0)/(trials.size()-4)*1E6; 43 | } 44 | 45 | template 46 | double measure(Size n,F f) 47 | { 48 | return measure(f)/n; 49 | } 50 | 51 | void pause_timing() 52 | { 53 | measure_pause=std::chrono::high_resolution_clock::now(); 54 | } 55 | 56 | void resume_timing() 57 | { 58 | measure_start+=std::chrono::high_resolution_clock::now()-measure_pause; 59 | } 60 | 61 | #include 62 | #include 63 | #include 64 | 65 | struct particle 66 | { 67 | int x,y,z; 68 | int dx,dy,dz; 69 | }; 70 | 71 | using particle_aos=std::vector; 72 | 73 | particle_aos create_particle_aos(int n) 74 | { 75 | particle_aos res; 76 | res.reserve(n); 77 | for(int i=0;i x,y,z; 84 | std::vector dx,dy,dz; 85 | }; 86 | 87 | particle_soa create_particle_soa(int n) 88 | { 89 | particle_soa res; 90 | res.x.reserve(n); 91 | res.y.reserve(n); 92 | res.z.reserve(n); 93 | res.dx.reserve(n); 94 | res.dy.reserve(n); 95 | res.dz.reserve(n); 96 | for(int i=0;i 10 | #include 11 | #include 12 | #include 13 | 14 | std::chrono::high_resolution_clock::time_point measure_start,measure_pause; 15 | 16 | template 17 | double measure(F f) 18 | { 19 | using namespace std::chrono; 20 | 21 | static const int num_trials=10; 22 | static const milliseconds min_time_per_trial(200); 23 | std::array trials; 24 | volatile decltype(f()) res; /* to avoid optimizing f() away */ 25 | 26 | for(int i=0;i>(t2-measure_start).count()/runs; 37 | } 38 | (void)(res); /* var not used warn */ 39 | 40 | std::sort(trials.begin(),trials.end()); 41 | return std::accumulate( 42 | trials.begin()+2,trials.end()-2,0.0)/(trials.size()-4)*1E6; 43 | } 44 | 45 | template 46 | double measure(Size n,F f) 47 | { 48 | return measure(f)/n; 49 | } 50 | 51 | void pause_timing() 52 | { 53 | measure_pause=std::chrono::high_resolution_clock::now(); 54 | } 55 | 56 | void resume_timing() 57 | { 58 | measure_start+=std::chrono::high_resolution_clock::now()-measure_pause; 59 | } 60 | 61 | #include 62 | #include 63 | #include 64 | 65 | struct particle 66 | { 67 | int x,y,z; 68 | }; 69 | 70 | using particle_aos=std::vector; 71 | 72 | particle_aos create_particle_aos(int n) 73 | { 74 | particle_aos res; 75 | res.reserve(n); 76 | for(int i=0;i x,y,z; 83 | }; 84 | 85 | particle_soa create_particle_soa(int n) 86 | { 87 | particle_soa res; 88 | res.x.reserve(n); 89 | res.y.reserve(n); 90 | res.z.reserve(n); 91 | for(int i=0;i 10 | #include 11 | #include 12 | #include 13 | 14 | std::chrono::high_resolution_clock::time_point measure_start,measure_pause; 15 | 16 | template 17 | double measure(F f) 18 | { 19 | using namespace std::chrono; 20 | 21 | static const int num_trials=10; 22 | static const milliseconds min_time_per_trial(200); 23 | std::array trials; 24 | volatile decltype(f()) res; /* to avoid optimizing f() away */ 25 | 26 | for(int i=0;i>(t2-measure_start).count()/runs; 37 | } 38 | (void)(res); /* var not used warn */ 39 | 40 | std::sort(trials.begin(),trials.end()); 41 | return std::accumulate( 42 | trials.begin()+2,trials.end()-2,0.0)/(trials.size()-4)*1E6; 43 | } 44 | 45 | template 46 | double measure(Size n,F f) 47 | { 48 | return measure(f)/n; 49 | } 50 | 51 | void pause_timing() 52 | { 53 | measure_pause=std::chrono::high_resolution_clock::now(); 54 | } 55 | 56 | void resume_timing() 57 | { 58 | measure_start+=std::chrono::high_resolution_clock::now()-measure_pause; 59 | } 60 | 61 | #include 62 | #include 63 | #include 64 | #include 65 | 66 | int main() 67 | { 68 | std::size_t n0=10000,n1=40000000,dn=2000; 69 | double fdn=1.1; 70 | 71 | std::cout<<"filtered sum:"< v; 76 | std::mt19937 gen; 77 | std::uniform_int_distribution<> rnd(0,255); 78 | v.reserve(n); 79 | for(std::size_t i=0;i128)res+=x; 84 | return res; 85 | }; 86 | 87 | std::cout< 10 | #include 11 | #include 12 | #include 13 | 14 | std::chrono::high_resolution_clock::time_point measure_start,measure_pause; 15 | 16 | template 17 | double measure(F f) 18 | { 19 | using namespace std::chrono; 20 | 21 | static const int num_trials=10; 22 | static const milliseconds min_time_per_trial(200); 23 | std::array trials; 24 | volatile decltype(f()) res; /* to avoid optimizing f() away */ 25 | 26 | for(int i=0;i>(t2-measure_start).count()/runs; 37 | } 38 | (void)(res); /* var not used warn */ 39 | 40 | std::sort(trials.begin(),trials.end()); 41 | return std::accumulate( 42 | trials.begin()+2,trials.end()-2,0.0)/(trials.size()-4)*1E6; 43 | } 44 | 45 | template 46 | double measure(Size n,F f) 47 | { 48 | return measure(f)/n; 49 | } 50 | 51 | void pause_timing() 52 | { 53 | measure_pause=std::chrono::high_resolution_clock::now(); 54 | } 55 | 56 | void resume_timing() 57 | { 58 | measure_start+=std::chrono::high_resolution_clock::now()-measure_pause; 59 | } 60 | 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | #include 67 | 68 | int main() 69 | { 70 | std::size_t n0=10000,n1=40000000,dn=2000; 71 | double fdn=1.1; 72 | 73 | std::cout<<"linear traversal:"< v(n); 80 | std::iota(v.begin(),v.end(),0); 81 | std::cout< l(n); 87 | std::iota(l.begin(),l.end(),0); 88 | std::cout< rnd(0,n-1); 95 | std::list l; 96 | for(std::size_t i=0;i 10 | #include 11 | #include 12 | #include 13 | 14 | std::chrono::high_resolution_clock::time_point measure_start,measure_pause; 15 | 16 | template 17 | double measure(F f) 18 | { 19 | using namespace std::chrono; 20 | 21 | static const int num_trials=10; 22 | static const milliseconds min_time_per_trial(200); 23 | std::array trials; 24 | volatile decltype(f()) res; /* to avoid optimizing f() away */ 25 | 26 | for(int i=0;i>(t2-measure_start).count()/runs; 37 | } 38 | (void)(res); /* var not used warn */ 39 | 40 | std::sort(trials.begin(),trials.end()); 41 | return std::accumulate( 42 | trials.begin()+2,trials.end()-2,0.0)/(trials.size()-4)*1E6; 43 | } 44 | 45 | template 46 | double measure(Size n,F f) 47 | { 48 | return measure(f)/n; 49 | } 50 | 51 | void pause_timing() 52 | { 53 | measure_pause=std::chrono::high_resolution_clock::now(); 54 | } 55 | 56 | void resume_timing() 57 | { 58 | measure_start+=std::chrono::high_resolution_clock::now()-measure_pause; 59 | } 60 | 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | 67 | int main() 68 | { 69 | std::size_t n0=10000,n1=40000000,dn=2000; 70 | double fdn=1.1; 71 | 72 | std::cout<<"matrix sum:"<(std::sqrt(n)); 77 | boost::multi_array a(boost::extents[m][m]); 78 | 79 | /* fill with some values */ 80 | for(std::size_t i=0;i 10 | #include 11 | #include 12 | #include 13 | 14 | std::chrono::high_resolution_clock::time_point measure_start,measure_pause; 15 | 16 | template 17 | double measure(F f) 18 | { 19 | using namespace std::chrono; 20 | 21 | static const int num_trials=10; 22 | static const milliseconds min_time_per_trial(200); 23 | std::array trials; 24 | volatile decltype(f()) res; /* to avoid optimizing f() away */ 25 | 26 | for(int i=0;i>(t2-measure_start).count()/runs; 37 | } 38 | (void)(res); /* var not used warn */ 39 | 40 | std::sort(trials.begin(),trials.end()); 41 | return std::accumulate( 42 | trials.begin()+2,trials.end()-2,0.0)/(trials.size()-4)*1E6; 43 | } 44 | 45 | template 46 | double measure(Size n,F f) 47 | { 48 | return measure(f)/n; 49 | } 50 | 51 | void pause_timing() 52 | { 53 | measure_pause=std::chrono::high_resolution_clock::now(); 54 | } 55 | 56 | void resume_timing() 57 | { 58 | measure_start+=std::chrono::high_resolution_clock::now()-measure_pause; 59 | } 60 | 61 | #include 62 | #include 63 | #include 64 | #include 65 | 66 | int main() 67 | { 68 | std::size_t n0=10000,n1=40000000,dn=2000; 69 | double fdn=1.1; 70 | 71 | std::cout<<"parallel count:"< rnd(0,255); 77 | std::vector v; 78 | v.reserve(n); 79 | 80 | /* fill with some values */ 81 | for(std::size_t i=0;i 10 | #include 11 | #include 12 | #include 13 | 14 | std::chrono::high_resolution_clock::time_point measure_start,measure_pause; 15 | 16 | template 17 | double measure(F f) 18 | { 19 | using namespace std::chrono; 20 | 21 | static const int num_trials=10; 22 | static const milliseconds min_time_per_trial(200); 23 | std::array trials; 24 | volatile decltype(f()) res; /* to avoid optimizing f() away */ 25 | 26 | for(int i=0;i>(t2-measure_start).count()/runs; 37 | } 38 | (void)(res); /* var not used warn */ 39 | 40 | std::sort(trials.begin(),trials.end()); 41 | return std::accumulate( 42 | trials.begin()+2,trials.end()-2,0.0)/(trials.size()-4)*1E6; 43 | } 44 | 45 | template 46 | double measure(Size n,F f) 47 | { 48 | return measure(f)/n; 49 | } 50 | 51 | void pause_timing() 52 | { 53 | measure_pause=std::chrono::high_resolution_clock::now(); 54 | } 55 | 56 | void resume_timing() 57 | { 58 | measure_start+=std::chrono::high_resolution_clock::now()-measure_pause; 59 | } 60 | 61 | #include 62 | #include 63 | #include 64 | #include 65 | #include 66 | #include 67 | #include 68 | #include 69 | #include 70 | 71 | template 72 | class poly_collection_segment_base 73 | { 74 | public: 75 | virtual ~poly_collection_segment_base(){}; 76 | 77 | void insert(const Base& x) 78 | { 79 | this->insert_(x); 80 | } 81 | 82 | template 83 | void for_each(F& f) 84 | { 85 | std::size_t s=this->element_size_(); 86 | for(auto it=this->begin_(),end=it+this->size_()*s;it!=end;it+=s){ 87 | f(*reinterpret_cast(it)); 88 | } 89 | } 90 | 91 | template 92 | void for_each(F& f)const 93 | { 94 | std::size_t s=this->element_size_(); 95 | for(auto it=this->begin_(),end=it+this->size_()*s;it!=end;it+=s){ 96 | f(*reinterpret_cast(it)); 97 | } 98 | } 99 | 100 | private: 101 | virtual void insert_(const Base& x)=0; 102 | virtual char* begin_()=0; 103 | virtual const char* begin_()const=0; 104 | virtual std::size_t size_()const=0; 105 | virtual std::size_t element_size_()const=0; 106 | }; 107 | 108 | template 109 | class poly_collection_segment: 110 | public poly_collection_segment_base 111 | { 112 | private: 113 | virtual void insert_(const Base& x) 114 | { 115 | store.push_back(static_cast(x)); 116 | } 117 | 118 | virtual char* begin_() 119 | { 120 | return reinterpret_cast( 121 | static_cast(const_cast(store.data()))); 122 | } 123 | 124 | virtual const char* begin_()const 125 | { 126 | return reinterpret_cast( 127 | static_cast(store.data())); 128 | } 129 | 130 | virtual std::size_t size_()const{return store.size();} 131 | virtual std::size_t element_size_()const{return sizeof(Derived);} 132 | 133 | std::vector store; 134 | }; 135 | 136 | template 137 | class poly_collection 138 | { 139 | public: 140 | template 141 | void insert( 142 | const Derived& x, 143 | typename std::enable_if::value>::type* =0) 144 | { 145 | auto& pchunk=chunks[typeid(x)]; 146 | if(!pchunk)pchunk.reset(new poly_collection_segment()); 147 | pchunk->insert(x); 148 | } 149 | 150 | template 151 | F for_each(F f) 152 | { 153 | for(const auto& p:chunks)p.second->for_each(f); 154 | return std::move(f); 155 | } 156 | 157 | template 158 | F for_each(F f)const 159 | { 160 | for(const auto& p:chunks) 161 | const_cast(*p.second).for_each(f); 162 | return std::move(f); 163 | } 164 | 165 | private: 166 | typedef poly_collection_segment_base segment; 167 | typedef std::unique_ptr pointer; 168 | 169 | std::map chunks; 170 | }; 171 | 172 | struct base 173 | { 174 | virtual int f()const=0; 175 | virtual ~base(){} 176 | }; 177 | 178 | struct derived1:base 179 | { 180 | virtual int f()const{return 1;}; 181 | }; 182 | 183 | struct derived2:base 184 | { 185 | virtual int f()const{return 2;}; 186 | }; 187 | 188 | struct derived3:base 189 | { 190 | virtual int f()const{return 3;}; 191 | }; 192 | 193 | int main() 194 | { 195 | std::size_t n0=10000,n1=40000000,dn=2000; 196 | double fdn=1.1; 197 | 198 | std::cout<<"polymorphic containers:"<; 205 | std::vector v; 206 | std::mt19937 gen; 207 | std::uniform_int_distribution<> rnd(1,3); 208 | v.reserve(n); 209 | for(std::size_t i=0;i());break; 212 | case 2: v.push_back(std::make_shared());break; 213 | case 3: 214 | default: v.push_back(std::make_shared());break; 215 | } 216 | } 217 | std::shuffle(v.begin(),v.end(),gen); 218 | 219 | auto f=[&](){ 220 | long int res=0; 221 | for(const auto& p:v)res+=p->f(); 222 | return res; 223 | }; 224 | 225 | std::cout< v; 233 | std::mt19937 gen; 234 | std::uniform_int_distribution<> rnd(1,3); 235 | for(std::size_t i=0;i 10 | #include 11 | #include 12 | #include 13 | 14 | std::chrono::high_resolution_clock::time_point measure_start,measure_pause; 15 | 16 | template 17 | double measure(F f) 18 | { 19 | using namespace std::chrono; 20 | 21 | static const int num_trials=10; 22 | static const milliseconds min_time_per_trial(200); 23 | std::array trials; 24 | volatile decltype(f()) res; /* to avoid optimizing f() away */ 25 | 26 | for(int i=0;i>(t2-measure_start).count()/runs; 37 | } 38 | (void)(res); /* var not used warn */ 39 | 40 | std::sort(trials.begin(),trials.end()); 41 | return std::accumulate( 42 | trials.begin()+2,trials.end()-2,0.0)/(trials.size()-4)*1E6; 43 | } 44 | 45 | template 46 | double measure(Size n,F f) 47 | { 48 | return measure(f)/n; 49 | } 50 | 51 | void pause_timing() 52 | { 53 | measure_pause=std::chrono::high_resolution_clock::now(); 54 | } 55 | 56 | void resume_timing() 57 | { 58 | measure_start+=std::chrono::high_resolution_clock::now()-measure_pause; 59 | } 60 | 61 | #include 62 | #include 63 | #include 64 | 65 | struct particle 66 | { 67 | int x,y,z; 68 | }; 69 | 70 | using particle_aos=std::vector; 71 | 72 | particle_aos create_particle_aos(int n) 73 | { 74 | particle_aos res; 75 | res.reserve(n); 76 | for(int i=0;i x,y,z; 83 | }; 84 | 85 | particle_soa create_particle_soa(int n) 86 | { 87 | particle_soa res; 88 | res.x.reserve(n); 89 | res.y.reserve(n); 90 | res.z.reserve(n); 91 | for(int i=0;i rnd(0,n-1); 114 | long int res=0; 115 | for(std::size_t i=0;i rnd(0,n-1); 127 | long int res=0; 128 | for(std::size_t i=0;i