├── .github └── FUNDING.yml ├── .gitignore ├── CMakeLists.txt ├── README.md ├── cpp ├── pcl_conditional_removal.cpp ├── prefer_references.cpp ├── reserve.cpp ├── small_strings.cpp ├── strings_are_vectors.cpp └── strings_concatenation.cpp ├── docs ├── 2d_matrix_iteration.md ├── 2d_transforms.md ├── about.md ├── boost_flatmap.md ├── dont_need_map.md ├── img │ ├── beautiful.jpg │ ├── boom.gif │ ├── const_reference.png │ ├── cpp.png │ ├── davide_yells_at_PCL.jpg │ ├── feel_bad.jpg │ ├── fizzbuzz.jpg │ ├── growing_vector.png │ ├── hotspot_heaptrack.jpg │ ├── inconceivably.jpg │ ├── laser_scan_matcher.png │ ├── linked_list.png │ ├── modifystring.png │ ├── mordor.jpg │ ├── motor_profile1.png │ ├── motor_profile2.png │ ├── multiply_vector.png │ ├── palindrome_benchmark.png │ ├── pcl.jpg │ ├── pcl_fromros.png │ ├── quick-bench.png │ ├── quote.png │ ├── really.jpg │ ├── realsense.png │ ├── relax_sso.jpg │ ├── spider_senses.png │ ├── sso_in_action.png │ ├── string_concatenation.png │ ├── that_is_logn.jpg │ ├── think_about_it.jpg │ ├── tostring.png │ ├── twitter_unordered.png │ ├── two_lookups.jpg │ ├── vector_reserve.png │ ├── velodyne.png │ └── why_copy.jpg ├── index.md ├── no_lists.md ├── palindrome.md ├── pcl_filter.md ├── pcl_fromROS.md ├── prefer_references.md ├── reserve.md ├── small_strings.md ├── small_vectors.md ├── strings_are_vectors.md └── strings_concatenation.md ├── mkdocs.yml └── overrides └── main.html /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: facontidavide 4 | custom: https://www.paypal.me/facontidavide 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | site 2 | 3 | */CMakeLists.txt.user 4 | 5 | build-* 6 | 7 | # Prerequisites 8 | *.d 9 | 10 | # Compiled Object files 11 | *.slo 12 | *.lo 13 | *.o 14 | *.obj 15 | 16 | # Precompiled Headers 17 | *.gch 18 | *.pch 19 | 20 | # Compiled Dynamic libraries 21 | *.so 22 | *.dylib 23 | *.dll 24 | 25 | # Fortran module files 26 | *.mod 27 | *.smod 28 | 29 | # Compiled Static libraries 30 | *.lai 31 | *.la 32 | *.a 33 | *.lib 34 | 35 | # Executables 36 | *.exe 37 | *.out 38 | *.app 39 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | project(cpp_optimizations) 4 | 5 | set(CMAKE_CXX_STANDARD 11) 6 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 7 | 8 | # On ubuntu, install with: sudo apt install libbenchmark-dev 9 | # https://github.com/google/benchmark 10 | find_package(benchmark REQUIRED) 11 | find_package(Threads) 12 | 13 | function(compile_example target file) 14 | add_executable(${target} ${file} ) 15 | target_link_libraries(${target} benchmark Threads::Threads) 16 | endfunction() 17 | 18 | compile_example(prefer_references basics/prefer_references/prefer_references.cpp) 19 | 20 | compile_example(vector_reserve vectors_everywhere/reserve/reserve.cpp) 21 | 22 | compile_example(strings_are_vectors just_a_string/strings_are_vectors/strings_are_vectors.cpp) 23 | compile_example(small_string_optimization just_a_string/small_strings/small_strings.cpp) 24 | 25 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Preface: about me 2 | 3 | My name is [Davide Faconti](https://twitter.com/facontidavide) and my job is one of the best in the world: I work in **robotics**. 4 | 5 | This blog/repository is maintained in **my spare time** and it is not related to my work there. Therefore *opinions (and memes) are all mine and don't represent my employer in any way*. 6 | 7 | I love C++ programming and Open Source and this "diary" is my small contribution to the OSS community. 8 | 9 | # CPP Optimizations Diary 10 | 11 | Optimizing code in C++ is something that no one can resist. You can have fun 12 | and pretend that you are doing something useful for your organization at the same time! 13 | 14 | In this repository, I will record some simple design patterns to improve your code 15 | and remove unnecessary overhead in **C++**. 16 | 17 | If you are a seasoned C++ expert, you probably have your own set of rules already. 18 | 19 | These rules help you look like a bad-ass/rockstar/10X engineer to your colleagues. 20 | 21 | You are the kind of person that casually drops a [std::vector<>::reserve](docs/reserve.md) before a loop and 22 | nods, smiling, looking at the performance improvement and the astonishment of your team member. 23 | 24 |
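For readers who have never seen that move, this is roughly what it looks like: a minimal sketch with an invented container and element count, not code taken from this repository.

```C++
#include <cstddef>
#include <vector>

int main()
{
    const std::size_t count = 100000;   // hypothetical number of elements
    std::vector<double> values;
    values.reserve(count);              // the casually dropped one-liner
    for (std::size_t i = 0; i < count; i++) {
        values.push_back(i * 0.5);      // no reallocation can happen inside the loop
    }
    return values.size() == count ? 0 : 1;
}
```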
![](docs/img/boom.gif)
25 | 26 | 27 | Hopefully, the examples in this repository will help you achieve this status of guru 28 | and, as a side effect, save the planet from global warming, sparing useless CPU 29 | cycles from being wasted. 30 | 31 | Then, unfortunately, someone on the other side of the planet will start mining Bitcoins or write her/his 32 | application in **Python** and all your effort to save electricity was for nothing. 33 | 34 | I am kidding, Python developers, we love you! 35 | 36 | > Narrator: "he was not kidding..." 37 | 38 | ## Rule 1: measure first (using _good_ tools) 39 | 40 | The very first thing any person concerned about performance should do is: 41 | 42 | - **Measure first** and **make hypothesis later**. 43 | 44 | Me and my colleagues are almost always wrong about the reasons a piece of code is slow. 45 | 46 | Sometimes we are right, but it is really hard to know in advance how refactoring will 47 | improve performance. Good profiling tools show in minutes the "low hanging fruits": minimum work, maximum benefit! 48 | 49 | Summarizing: 10 minutes profiling can save you hours of guessing and refactoring. 50 | 51 | My "goto" tools in Linux are [Hotspot](https://github.com/KDAB/hotspot) and 52 | [Heaptrack](https://github.com/KDE/heaptrack). I understand Windows has similar 53 | tools too. 54 | 55 |
![](docs/img/hotspot_heaptrack.jpg)
56 | 57 | In the benchmark war, if you are the soldier, these are your rifle and hand grenades. 58 | 59 | Once you know which part of the code deserves to be optimized, you might want to use 60 | [Google Benchmark](https://github.com/google/benchmark) to measure the time spent in a very specific 61 | class or function. 62 | 63 | You can even run it Google Benchmark online here: [quick-bench.com](http://quick-bench.com/G7B2w0xPUWgOVvuzI7unES6cU4w). 64 | 65 | ![quick-bench](docs/img/quick-bench.png) 66 | 67 | ## Rule 2: learn good design patterns, use them by default 68 | 69 | Writing good code is like brushing your teeth: you should do it without thinking too much about it. 70 | 71 | It is a muscle that you need to train, that will become stronger over time. But don't worry: 72 | once you start, you will start seeing recurring patterns that 73 | are surprisingly simple and works in many different use cases. 74 | 75 | **Spoiler alert**: one of my most beloved tricks is to _minimize the number of heap allocations_. 76 | You have no idea how much that helps. 77 | 78 | But let's make something absolutely clear: 79 | 80 | - Your **first goal** as a developer (software engineer?) is to create code that is **correct** and fulfil the requirements. 81 | - The **second** most important thing is to make your code **maintainable and readable** for other people. 82 | - In many cases, you also want to make code faster, because [faster code is better code](https://craigmod.com/essays/fast_software/). 83 | 84 | In other words, think twice before doing any change in your code that makes it less readable or harder to debug, 85 | just because you believe it may run 2.5% faster. 86 | 87 | # Get started 88 | 89 | For a more comfortable reading experience, visit: https://cpp-optimizations.netlify.app 90 | 91 | ## Optimization examples 92 | 93 | ### "If you pass that by value one more time..." 94 | 95 | - [Use Const reference by default](docs/prefer_references.md). 96 | 97 | - Move semantic (TODO). 98 | 99 | - Return value optimization (TODO). 100 | 101 | 102 | ### std::vector<> is your best friend 103 | 104 | 105 | - [Use std::vector<>::reserve by default](docs/reserve.md) 106 | 107 | - ["I have learnt linked-lists at university, should I use them?" Nooope](docs/no_lists.md). 108 | 109 | - [You don't need a `std::map<>` for that](docs/dont_need_map.md). 110 | 111 | - [Small vector optimization](docs/small_vectors.md) 112 | 113 | 114 | ### "It is just a string, how bad could that be?" 115 | 116 | - [Strings are (almost) vectors](docs/strings_are_vectors.md) 117 | 118 | - [When not to worry: small string optimization](docs/small_strings.md). 119 | 120 | - [String concatenation: the false sense of security of `operator+`](docs/strings_concatenation.md). 121 | 122 | - `std::string_view`: love at first sight (TODO). 123 | 124 | ### Don't compute things twice. 125 | 126 | - [Example: 2D/3D transforms the right way](docs/2d_transforms.md). 127 | 128 | - [Iterating over a 2D matrix: less elegant, more performant](docs/2d_matrix_iteration.md). 129 | 130 | ### Fantastic data structures and where to find them. 131 | 132 | - [I tried `boost::container::flat_map`. You won't imagine what happened next](docs/boost_flatmap.md). 133 | 134 | ### Case studies 135 | 136 | - [Simpler and faster way to filter Point Clouds in PCL.](docs/pcl_filter.md) 137 | 138 | - [Fast Palindrome: the cost of conditional branches](docs/palindrome.md) 139 | 140 | 141 | # License 142 | 143 |

This work is licensed under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).

144 | -------------------------------------------------------------------------------- /cpp/pcl_conditional_removal.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace pcl; 11 | 12 | const char filename[] = "test_pcd.pcd"; 13 | 14 | PointCloud::Ptr LoadFromFile(const char *filename) 15 | { 16 | PointCloud::Ptr cloud_in(new PointCloud); 17 | if (io::loadPCDFile(filename, *cloud_in) == -1) //* load the file 18 | { 19 | std::cerr << "Couldn't read file " << filename << std::endl; 20 | throw std::runtime_error("file not found)"); 21 | } 22 | return cloud_in; 23 | } 24 | 25 | static void PCL_Filter(benchmark::State & state) 26 | { 27 | auto cloud = LoadFromFile(filename); 28 | 29 | PointXYZ minPt, maxPt; 30 | pcl::getMinMax3D(*cloud, minPt, maxPt); 31 | const float mid_point_x = (maxPt.x+minPt.x)*0.5 ; 32 | const float mid_point_y = (maxPt.y+minPt.y)*0.5 ; 33 | 34 | PointCloud::Ptr outCloud(new PointCloud); 35 | 36 | for (auto _ : state) { 37 | outCloud->clear(); 38 | pcl::ConditionalRemoval range_filt; 39 | 40 | auto range_cond = boost::make_shared>(); 41 | auto comparison_x = boost::make_shared>("x", ComparisonOps::GT, mid_point_x); 42 | auto comparison_y = boost::make_shared>("y", ComparisonOps::GT, mid_point_y); 43 | range_cond->addComparison(comparison_x); 44 | range_cond->addComparison(comparison_y); 45 | 46 | range_filt.setInputCloud(cloud); 47 | range_filt.setCondition(range_cond); 48 | range_filt.filter(*outCloud); 49 | } 50 | } 51 | 52 | static void Naive_Filter(benchmark::State & state) 53 | { 54 | auto cloud = LoadFromFile(filename); 55 | 56 | PointXYZ minPt, maxPt; 57 | pcl::getMinMax3D(*cloud, minPt, maxPt); 58 | const float mid_point_x = (maxPt.x+minPt.x)*0.5 ; 59 | const float mid_point_y = (maxPt.y+minPt.y)*0.5 ; 60 | 61 | PointCloud::Ptr outCloud(new PointCloud); 62 | 63 | for (auto _ : state) { 64 | outCloud->clear(); 65 | for (const auto& point: cloud->points) { 66 | if( point.x > mid_point_x && point.y > mid_point_y ){ 67 | outCloud->push_back( point ); 68 | } 69 | } 70 | } 71 | } 72 | 73 | BENCHMARK(PCL_Filter); 74 | BENCHMARK(Naive_Filter); 75 | 76 | //---------------------------------------------------------------- 77 | template 78 | class GenericCondition : public ConditionBase 79 | { 80 | public: 81 | typedef boost::shared_ptr> Ptr; 82 | typedef boost::shared_ptr> ConstPtr; 83 | typedef std::function FunctorT; 84 | 85 | GenericCondition(FunctorT evaluator): ConditionBase(),_evaluator(evaluator) {} 86 | 87 | virtual bool evaluate (const PointT &point) const { 88 | return _evaluator(point); 89 | } 90 | private: 91 | FunctorT _evaluator; 92 | }; 93 | 94 | static void PCL_Filter_Generic(benchmark::State & state) 95 | { 96 | auto cloud = LoadFromFile(filename); 97 | 98 | PointXYZ minPt, maxPt; 99 | pcl::getMinMax3D(*cloud, minPt, maxPt); 100 | const float mid_point_x = (maxPt.x+minPt.x)*0.5 ; 101 | const float mid_point_y = (maxPt.y+minPt.y)*0.5 ; 102 | 103 | PointCloud::Ptr outCloud(new PointCloud); 104 | 105 | for (auto _ : state) { 106 | outCloud->clear(); 107 | pcl::ConditionalRemoval range_filt; 108 | 109 | auto range_cond = boost::make_shared>( 110 | [=](const PointXYZ& point){ 111 | return point.x > mid_point_x && point.y > mid_point_y; 112 | }); 113 | 114 | range_filt.setInputCloud(cloud); 115 | range_filt.setCondition(range_cond); 116 | range_filt.filter(*outCloud); 117 | } 118 | } 119 | 120 | 
BENCHMARK(PCL_Filter_Generic); 121 | 122 | BENCHMARK_MAIN(); 123 | -------------------------------------------------------------------------------- /cpp/prefer_references.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | struct Vector3D{ 5 | double x; 6 | double y; 7 | double z; 8 | }; 9 | 10 | Vector3D MultiplyByTwo_Value(Vector3D p){ 11 | return { p.x*2, p.y*2, p.z*2 }; 12 | } 13 | 14 | Vector3D MultiplyByTwo_Ref(const Vector3D& p){ 15 | return { p.x*2, p.y*2, p.z*2 }; 16 | } 17 | 18 | void MultiplyVector_Value(benchmark::State& state) { 19 | Vector3D in = {1,2,3}; 20 | for (auto _ : state) { 21 | Vector3D out = MultiplyByTwo_Value(in); 22 | benchmark::DoNotOptimize(out); 23 | } 24 | } 25 | BENCHMARK(MultiplyVector_Value); 26 | 27 | void MultiplyVector_Ref(benchmark::State& state) { 28 | Vector3D in = {1,2,3}; 29 | for (auto _ : state) { 30 | Vector3D out = MultiplyByTwo_Ref(in); 31 | benchmark::DoNotOptimize(out); 32 | } 33 | } 34 | BENCHMARK(MultiplyVector_Ref); 35 | //---------------------------------- 36 | size_t GetSpaces_Value(std::string str) 37 | { 38 | size_t spaces = 0; 39 | for(const char c: str){ 40 | if( c == ' ') spaces++; 41 | } 42 | return spaces; 43 | } 44 | 45 | size_t GetSpaces_Ref(const std::string& str) 46 | { 47 | size_t spaces = 0; 48 | for(const char c: str){ 49 | if( c == ' ') spaces++; 50 | } 51 | return spaces; 52 | } 53 | 54 | const std::string LONG_STR("a long string that can't use Small String Optimization"); 55 | 56 | void PassStringByValue(benchmark::State& state) { 57 | for (auto _ : state) { 58 | size_t n = GetSpaces_Value(LONG_STR); 59 | benchmark::DoNotOptimize(n); 60 | } 61 | } 62 | BENCHMARK(PassStringByValue); 63 | 64 | void PassStringByRef(benchmark::State& state) { 65 | for (auto _ : state) { 66 | size_t n = GetSpaces_Ref(LONG_STR); 67 | benchmark::DoNotOptimize(n); 68 | } 69 | } 70 | BENCHMARK(PassStringByRef); 71 | 72 | //---------------------------------- 73 | 74 | size_t Sum_Value(std::vector vect) 75 | { 76 | size_t sum = 0; 77 | for(unsigned val: vect) { sum += val; } 78 | return sum; 79 | } 80 | 81 | size_t Sum_Ref(const std::vector& vect) 82 | { 83 | size_t sum = 0; 84 | for(unsigned val: vect) { sum += val; } 85 | return sum; 86 | } 87 | 88 | const std::vector vect_in = { 1, 2, 3, 4, 5 }; 89 | 90 | void PassVectorByValue(benchmark::State& state) { 91 | for (auto _ : state) { 92 | size_t n = Sum_Value(vect_in); 93 | benchmark::DoNotOptimize(n); 94 | } 95 | } 96 | BENCHMARK(PassVectorByValue); 97 | 98 | void PassVectorByRef(benchmark::State& state) { 99 | for (auto _ : state) { 100 | size_t n = Sum_Ref(vect_in); 101 | benchmark::DoNotOptimize(n); 102 | } 103 | } 104 | BENCHMARK(PassVectorByRef); 105 | 106 | //---------------------------------- 107 | 108 | BENCHMARK_MAIN(); 109 | -------------------------------------------------------------------------------- /cpp/reserve.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | static void NoReserve(benchmark::State& state) { 6 | 7 | for (auto _ : state) { 8 | std::vector v; 9 | for(size_t i=0; i<100; i++){ 10 | v.push_back(i); 11 | } 12 | benchmark::DoNotOptimize( v ); 13 | } 14 | } 15 | BENCHMARK(NoReserve); 16 | 17 | static void WithReserve(benchmark::State& state) { 18 | 19 | for (auto _ : state) { 20 | std::vector v; 21 | v.reserve(100); 22 | for(size_t i=0; i<100; i++){ 23 | v.push_back(i); 24 | } 25 | benchmark::DoNotOptimize( v ); 26 | } 27 | } 28 | 
BENCHMARK(WithReserve); 29 | 30 | 31 | static void ObsessiveRecycling(benchmark::State& state) { 32 | 33 | std::vector v; 34 | for (auto _ : state) { 35 | v.clear(); 36 | for(size_t i=0; i<100; i++){ 37 | v.push_back(i); 38 | } 39 | benchmark::DoNotOptimize( v ); 40 | } 41 | } 42 | BENCHMARK(ObsessiveRecycling); 43 | 44 | BENCHMARK_MAIN(); 45 | -------------------------------------------------------------------------------- /cpp/small_strings.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | static const char* SHORT_STR = "hello world"; 5 | 6 | static void ShortStringCreation(benchmark::State& state) { 7 | // Create a string over and over again. 8 | // It is just because "short strings optimization" is active 9 | // no memory allocations 10 | for (auto _ : state) { 11 | std::string created_string(SHORT_STR); 12 | benchmark::DoNotOptimize(created_string); 13 | } 14 | } 15 | BENCHMARK(ShortStringCreation); 16 | 17 | static void ShortStringCopy(benchmark::State& state) { 18 | 19 | // Here we create the string only once, but copy repeatably. 20 | // Why it is much slower than ShortStringCreation? 21 | // The compiler, apparently, outsmarted me 22 | 23 | std::string x; // create once 24 | for (auto _ : state) { 25 | x = SHORT_STR; // copy 26 | } 27 | } 28 | BENCHMARK(ShortStringCopy); 29 | 30 | static const char* LONG_STR = "this will not fit into small string optimization"; 31 | 32 | static void LongStringCreation(benchmark::State& state) { 33 | // The long string will trigget memory allocation for sure 34 | 35 | for (auto _ : state) { 36 | std::string created_string(LONG_STR); 37 | benchmark::DoNotOptimize(created_string); 38 | } 39 | } 40 | BENCHMARK(LongStringCreation); 41 | 42 | static void LongStringCopy(benchmark::State& state) { 43 | // Now we do see an actual advantage, reciclying the same string 44 | // multiple times 45 | std::string x; 46 | for (auto _ : state) { 47 | x = LONG_STR; 48 | } 49 | } 50 | BENCHMARK(LongStringCopy); 51 | 52 | 53 | BENCHMARK_MAIN(); 54 | -------------------------------------------------------------------------------- /cpp/strings_are_vectors.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | enum Color{ 5 | BLUE, 6 | RED, 7 | YELLOW 8 | }; 9 | 10 | std::string ToStringBad(Color c) 11 | { 12 | switch(c) 13 | { 14 | case BLUE: return "BLUE"; 15 | case RED: return "RED"; 16 | case YELLOW: return "YELLOW"; 17 | } 18 | } 19 | 20 | const std::string& ToStringBetter(Color c) 21 | { 22 | static const std::string color_name[3] ={"BLUE", "RED", "YELLOW"}; 23 | switch(c) 24 | { 25 | case BLUE: return color_name[0]; 26 | case RED: return color_name[1]; 27 | case YELLOW: return color_name[2]; 28 | } 29 | } 30 | 31 | static void ToStringByValue(benchmark::State& state) { 32 | 33 | 34 | for (auto _ : state) { 35 | std::string x = ToStringBad(BLUE); 36 | benchmark::DoNotOptimize( x ); 37 | } 38 | } 39 | BENCHMARK(ToStringByValue); 40 | 41 | static void ToStringByReference(benchmark::State& state) { 42 | 43 | std::string x; // create once 44 | for (auto _ : state) { 45 | const std::string& x = ToStringBetter(BLUE); 46 | benchmark::DoNotOptimize( x ); 47 | } 48 | } 49 | BENCHMARK(ToStringByReference); 50 | 51 | //--------------------------------------------- 52 | 53 | 54 | // Create a new string every time (even if return value optimization may help) 55 | static std::string ModifyString(const std::string& input) 56 | { 57 | std::string 
output = input; 58 | output.append("... indeed"); 59 | return output; 60 | } 61 | // Reuse an existing string that MAYBE, have the space already reserved 62 | // (or maybe not..) 63 | static void ModifyStringBetter(const std::string& input, std::string& output) 64 | { 65 | output = input; 66 | output.append("... indeed"); 67 | } 68 | 69 | static const char* LONG_STR = "this will not fit into small string optimization"; 70 | 71 | 72 | static void ModifyByValuee(benchmark::State& state) { 73 | 74 | std::string in(LONG_STR); 75 | std::string out; 76 | // Memory must be allocated every time 77 | for (auto _ : state) { 78 | out = ModifyString(in); 79 | } 80 | } 81 | BENCHMARK(ModifyByValuee); 82 | 83 | static void ModifyByReference(benchmark::State& state) { 84 | 85 | std::string in(LONG_STR); 86 | std::string out; 87 | // ModifyStringBetter could be as fast as ModifyString, but 88 | // occasionally faster, because out has the mory already allocated 89 | // from previous calls 90 | for (auto _ : state) { 91 | ModifyStringBetter(in, out); 92 | } 93 | } 94 | BENCHMARK(ModifyByReference); 95 | 96 | 97 | BENCHMARK_MAIN(); 98 | -------------------------------------------------------------------------------- /cpp/strings_concatenation.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | std::string first_str("This is my first string."); 5 | std::string second_str("This is the second string I want to append."); 6 | std::string third_str("This is the third and last string to append."); 7 | 8 | inline size_t StrSize(const char* str) { 9 | return strlen(str); 10 | } 11 | 12 | inline size_t StrSize(const std::string& str) { 13 | return str.size(); 14 | } 15 | 16 | template 17 | size_t StrSize(const Head& head, Tail const&... tail){ 18 | return StrSize(head) + StrSize(tail...); 19 | } 20 | 21 | template 22 | void StrAppend(std::string& out, const Head& head){ 23 | out += head; 24 | } 25 | 26 | template 27 | void StrAppend(std::string& out, const Head& head, Args const&... args){ 28 | out += head; 29 | StrAppend(out, args...); 30 | } 31 | 32 | template inline 33 | std::string StrCat(Args const&... 
args){ 34 | size_t tot_size = StrSize(args...); 35 | std::string out; 36 | out.reserve(tot_size); 37 | 38 | StrAppend(out, args...); 39 | return out; 40 | } 41 | 42 | static void DefaultConcatenation(benchmark::State& state) { 43 | 44 | for (auto _ : state) { 45 | std::string big_one = first_str + " " + second_str + " " + third_str; 46 | benchmark::DoNotOptimize(big_one); 47 | } 48 | } 49 | 50 | static void ManualConcatenation(benchmark::State& state) { 51 | 52 | for (auto _ : state) { 53 | std::string big_one; 54 | big_one.reserve(first_str.size() + 55 | second_str.size() + 56 | third_str.size() + 57 | strlen(" ")*2 ); 58 | 59 | big_one += first_str; 60 | big_one += " "; 61 | big_one += second_str; 62 | big_one += " "; 63 | big_one += third_str; 64 | benchmark::DoNotOptimize(big_one); 65 | } 66 | } 67 | 68 | static void VariadicConcatenation(benchmark::State& state) { 69 | 70 | for (auto _ : state) { 71 | std::string big_one = StrCat(first_str, " ", second_str, " ", third_str); 72 | benchmark::DoNotOptimize(big_one); 73 | } 74 | } 75 | 76 | // Register the function as a benchmark 77 | BENCHMARK(DefaultConcatenation); 78 | BENCHMARK(ManualConcatenation); 79 | BENCHMARK(VariadicConcatenation); 80 | -------------------------------------------------------------------------------- /docs/2d_matrix_iteration.md: -------------------------------------------------------------------------------- 1 | # Iterating over a 2D matrix 2 | 3 | 2D matrices are very common in my domain (robotics). 4 | 5 | We use them to represent images, gridmaps/costmaps, etc. 6 | 7 | Recently, I realized that one of our algorithms dealing with costmaps was quite slow, 8 | and I profiled it using **Hotspot**. 9 | 10 | I realized that one of the bottlenecks was a function that combined together two costmaps to create 11 | a third one. The function looked qualitatively like this: 12 | 13 | ```C++ 14 | // this is over simplified code, do not argue about it with me 15 | for( size_t y = y_min; y < y_max; y++ ) 16 | { 17 | for( size_t x = x_min; x < x_max; x++ ) 18 | { 19 | matrix_out( x,y ) = std::max( mat_a( x,y ), mat_b( x,y ) ); 20 | } 21 | } 22 | ``` 23 | 24 | Pretty straightforward, right? Elegantly written, as it should be. 25 | But since my measurements revealed that it was using too much CPU, I decided to 26 | follow the white rabbit inside the optimization hole. 27 | 28 | ## How do you write a good 2D matrix in C++? 29 | 30 | Have you ever written code like this? 31 | 32 | ```C++ 33 | // My computer science professor did it like this 34 | float** matrix = (float**) malloc( rows_count * sizeof(float*) ); 35 | for(int r=0; r class Matrix2D 56 | { 57 | public: 58 | Matrix2D(size_t rows, size_t columns): _num_rows(rows) 59 | { 60 | _data.resize( rows * columns ); 61 | } 62 | 63 | size_t rows() const 64 | { 65 | return _num_rows; 66 | } 67 | 68 | T& operator()(size_t row, size_t col) 69 | { 70 | size_t index = col*_num_rows + row; 71 | return _data[index]; 72 | } 73 | 74 | T& operator[](size_t index) 75 | { 76 | return _data[index]; 77 | } 78 | 79 | // all the other methods omitted 80 | private: 81 | std::vector _data; 82 | size_t _num_rows; 83 | }; 84 | 85 | // access an element of the matrix like this: 86 | matrix(row, col) = 42; 87 | 88 | ``` 89 | 90 | This is the most cache-friendly way to build a matrix, with a single memory allocation and 91 | the data stored in a way known as [column-wise](https://www.geeksforgeeks.org/row-wise-vs-column-wise-traversal-matrix/). 
92 | 93 | To convert a row/column pair into an index in the vector, we need a single multiplication and addition. 94 | 95 | ## Back to my problem 96 | 97 | Do you remember the code we wrote at the beginning? 98 | 99 | We have a number of iterations that is equal to `(x_max-x_min)*(y_max-y_min)`. 100 | Often, that it is a lot of pixels/cells. 101 | 102 | In each iteration, we calculate the index 3 times using the formula: 103 | 104 | size_t index = col*_num_rows + row; 105 | 106 | Holy moly, that is a lot of multiplications! 107 | 108 | It turned out that it was worth rewriting the code like this: 109 | 110 | ```C++ 111 | // calculating the index "by hand" 112 | for(size_t y = y_min; y < y_max; y++) 113 | { 114 | size_t offset_out = y * matrix_out.rows(); 115 | size_t offset_a = y * mat_a.rows(); 116 | size_t offset_b = y * mat_b.rows(); 117 | for(size_t x = x_min; x < x_max; x++) 118 | { 119 | size_t index_out = offset_out + x; 120 | size_t index_a = offset_a + x; 121 | size_t index_b = offset_b + x; 122 | matrix_out( index_out ) = std::max( mat_a( index_a ), mat_b( index_b ) ); 123 | } 124 | } 125 | ``` 126 | 127 | So, I know what you are thinking, **my eyes are hurting too**. It is ugly. But the performance boost was too much to ignore. 128 | 129 | That is not surprising, considering that the number of multiplications is dramatically reduced by a factor `(x_max-x_min)*3`. 130 | 131 | 132 | -------------------------------------------------------------------------------- /docs/2d_transforms.md: -------------------------------------------------------------------------------- 1 | # Don't compute it twice 2 | 3 | The example I am going to present will make some of you react like this: 4 | 5 | ![really](img/really.jpg) 6 | 7 | I say this because it will be absolutely obvious... in retrospective. 8 | 9 | On the other hand, I have seen **this same code** being used in multiple open source projects. 10 | 11 | Projects with hundreds of Github stars missed this (apparently obvious) opportunity for optimization. 12 | 13 | A notable example is: [Speed up improvement for laserOdometry and scanRegister (20%)](https://github.com/laboshinl/loam_velodyne/pull/20) 14 | 15 | ## 2D transforms 16 | 17 | Let's consider this code: 18 | 19 | ```c++ 20 | double x1 = x*cos(ang) - y*sin(ang) + tx; 21 | double y1 = x*sin(ang) + y*cos(ang) + ty; 22 | ``` 23 | 24 | People with a trained eye (and a little of trigonometric background) will immediately recognize the [affine transform of a 2D point], commonly used in computer graphics and robotics. 25 | 26 | Don't you see anything we can do better? Of course: 27 | 28 | ```c++ 29 | const double Cos = cos(angle); 30 | const double Sin = sin(angle); 31 | double x1 = x*Cos - y*Sin + tx; 32 | double y1 = x*Sin + y*Cos + ty; 33 | ``` 34 | 35 | The cost of trigonometric functions is relatively high and there is absolutely no reason to compute twice the same value. 36 | The latter code will be 2x times faster then the former, because the cost of multiplication and sum is really low compared with `sin()` and `cos()`. 37 | 38 | In general, if the number of potential angles you need to test is not extremely high, consider to use look-up-table where you can store pre-computed values. 39 | 40 | This is the case, for instance, of laser scan data, that needs to be converted from polar coordinates to cartesian ones. 
41 | 42 | ![laser_scan_matcher.png](img/laser_scan_matcher.png) 43 | 44 | A naive implementation would invoke trigonometric functions for each point (in the order of thousands per seconds). 45 | 46 | ```c++ 47 | // Conceptual operation (inefficient) 48 | // Data is usually stored in a vector of distances 49 | std::vector scan_distance; // the input 50 | std::vector cartesian_points; // the output 51 | 52 | cartesian_points.reserve( scan_distance.size() ); 53 | 54 | for(int i=0; i LUT_cos; 75 | std::vector LUT_sin; 76 | 77 | for(int i=0; i scan_distance; 86 | std::vector cartesian_points; 87 | 88 | cartesian_points.reserve( scan_distance.size() ); 89 | 90 | for(int i=0; i::operator[]**. 26 | 27 | ![](img/motor_profile2.png) 28 | 29 | There is a big block on the right side, and then many multiple calls here and there in the code on the left one. 30 | 31 | The problematic container looks more or less like this: 32 | 33 | ```C++ 34 | // simplified code 35 | std::unordered_map m_dictionary; 36 | 37 | //where 38 | struct Address{ 39 | int16_t index; 40 | unt8_t subindex; 41 | }; 42 | ``` 43 | 44 | # The solution 45 | 46 | I inspected the code and I found horrible things. Like this: 47 | 48 | ```C++ 49 | // simplified code 50 | bool hasEntry(const Address& address) 51 | { 52 | return m_dictionary.count(address) != 0; 53 | } 54 | 55 | Value getEntry(const Address& address) 56 | { 57 | if( !hasEntry(address) { 58 | throw ... 59 | } 60 | Entry* = m_dictionary[address]; 61 | // rest of the code 62 | } 63 | ``` 64 | 65 | ![](img/two_lookups.jpg) 66 | 67 | The correct way to do it is, instead: 68 | 69 | ```C++ 70 | // simplified code. Only one lookup 71 | Value getEntry(const Address& address) 72 | { 73 | auto it = m_dictionary.find(address); 74 | if( it == m_dictionary.end() ) { 75 | throw ... 76 | } 77 | Entry* = it->second; 78 | // rest of the code 79 | } 80 | ``` 81 | 82 | This change alone cuts by half the overhead of `std::unordered_map` observed in the flamegraph. 83 | 84 | ## Embrace caching 85 | 86 | It is very important to note that once created, the dictionary is never changed at run-time. 87 | 88 | This allows us, as mentioned in the picture, to optimize the large block on the right side using caching. 89 | 90 | In fact, the map lookup is executed inside a callback that has access to the relevant address. But if the **[Address, Entry*]** pair never changes, why don't we directly store the `Entry*`? 91 | 92 | As expected, this completely erase the overhead in that big block using 15% of the CPU. 93 | 94 | If you haven't you may look at similar examples of caching [in one of my previous articles about 2D transformations](2d_transforms.md). 95 | 96 | ## Winning big with `boost::container_flat_map` 97 | 98 | Using caching in the rest of the code would have been a pain. That is the typical example of "death by a 1000 papercuts". 99 | 100 | Furthermore, there was something in the back of my head telling me that something was off. 101 | 102 | Why does `std::hash
` take so long? Is this, maybe, **one of those rare cases** where I should not look at the big O() notation? 103 | 104 | `std::unordered_map` lookup is O(1), that is a Good Thing, isn't it? 105 | 106 | Shall we try some lateral thinking and use another container with O(logn) lookup complexity, but without the cost of the hash function? 107 | 108 | Drum roll, please and welcome [boost::container_flat_map](https://www.boost.org/doc/libs/1_74_0/doc/html/container/non_standard_containers.html#container.non_standard_containers.flat_xxx). 109 | 110 | I am not going to repeat what the documentation explains about the implementation, which is mostly a normal ordered vector, similar to what I discuss [at the end of this article about std::map](dont_need_map.md). 111 | 112 | The results surprised me: I haven't just "reduced" the overhead, it was completely gone. The cost of `flat_map<>::operator[]` was barely measurable. 113 | 114 | Basically, just switching to `flat_map` solved the entire problem changing in one line of code! 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /docs/dont_need_map.md: -------------------------------------------------------------------------------- 1 | # Do you actually need to use std::map? 2 | 3 | `std::map` is one of the most known data structures in C++, the default associative container for most of us, but its popularity has been decreasing over the years. 4 | 5 | Associative containers are used when you have **pairs of key/value** and you want to find a value given its key. 6 | 7 | But, because of the way the nodes of the [red-black tree](https://en.wikipedia.org/wiki/Red%E2%80%93black_tree) are created, `std::map` is 8 | not much different than an `std::list`, i.e. an unwelcome memory allocation during insertion and a very cache-unfriendly memory layout. 9 | 10 | Before selecting this data structure, ask yourself these questions: 11 | 12 | - do I need all the pairs to be **ordered** by their keys? 13 | - do I need to iterate often through all the items of the container? 14 | 15 | If the answer to the first question is "no", you may want to switch by default to `std::unordered_map`. 16 | 17 | In all my benchmarks, this was always a win. Maybe I was lucky and maybe there are situations in which `std::map` would perform better, but I haven't found those cases yet. 18 | 19 | If you answer "yes" to the second question... this will be interesting. 20 | 21 | Sit on my lap, my child, and join my optimization adventures. 22 | 23 | ## Optimizing the Velodyne driver 24 | 25 | This is a Pull Request I am particularly proud of: 26 | 27 | [Avoid unnecessary computation in RawData::unpack](https://github.com/ros-drivers/velodyne/pull/194) 28 | 29 | To understand how a small change can make a huge difference, think about what this driver is doing. 30 | 31 | ![](img/velodyne.png) 32 | 33 | The Velodyne is a sensor that measures hundreds of thousands of points per seconds (distance from obstacles); it is the most important sensor in most autonomous cars. 34 | 35 | The Velodyne driver converts measurements in polar coordinates to 3D cartesian coordinates (the so-called "PointCloud"). 36 | 37 | I profiled the Velodyne driver using **Hotspot** and, surprise surprise, I found that something related to `std::map::operator[]` was using a lot of CPU. 
38 | 39 | So I explored the code and I found this: 40 | 41 | ```C++ 42 | std::map laser_corrections; 43 | ``` 44 | `LaserCorrection`contains some calibration information that was needed to adjust the measurements. 45 | 46 | The `int` key of the map is a number in the range [0, N-1], where N could be 16, 32 or 64. Maybe one day it will reach 128! 47 | 48 | Furthermore `laser_corrections` was created once (no further insertions) 49 | and used over and over again in a loop like this: 50 | 51 | ```C++ 52 | // code simplified 53 | for (int i = 0; i < BLOCKS_PER_PACKET; i++) { 54 | //some code 55 | for (int j = 0; j < NUM_SCANS; j++) 56 | { 57 | int laser_number = // omitted for simplicity 58 | const LaserCorrection &corrections = laser_corrections[laser_number]; 59 | // some code 60 | } 61 | } 62 | ``` 63 | 64 | ![](img/that_is_logn.jpg) 65 | 66 | Indeed, behind this innocent line of code: 67 | 68 | laser_corrections[laser_number]; 69 | 70 | There is a search in a red-black tree! 71 | 72 | Remember: the index is **not** a random number, its value is always between 0 and N-1, where N is very small. 73 | 74 | So, I proposed this change and you can not imagine what happened next: 75 | 76 | ```C++ 77 | std::vector laser_corrections; 78 | ``` 79 | 80 | ![](img/quote.png) 81 | 82 | Summarizing, there wasn't any need for an associative container, because the position in the vector itself (the index in the vector) is working just fine. 83 | 84 | I don't blame in any way the developers of the Velodyne driver, because changes like these make sense only in retrospective: until you profile your application and do some actual measurements, it is hard to find an unnecesary overhead hidden in plain sight. 85 | 86 | When you think that the rest of the function does **a lot** of mathematical operations, you can understand how counter-intuitive it is that the actual bottleneck was a tiny `std::map`. 87 | 88 | ## Going a step further: vector of [key,value] pairs 89 | 90 | This example was quite "extreme", because of its very convenient integer key, a small number between 0 and N. 91 | 92 | Nevertheless, in my code I use often a structure like this, instead of a "real" associative container: 93 | 94 | ```C++ 95 | std::vector< std::pair > my_map; 96 | ``` 97 | 98 | This is the best data structure if what you need to **iterate frequently over all the elements**. 99 | 100 | Most of the times, you can not beat it! 101 | 102 | > "But Davide, I need to have those elements ordered, that is the reason why I used `std::map`"! 103 | 104 | Well, if you need them ordered... order them! 105 | 106 | ```C++ 107 | std::sort( my_map.begin(), my_map.end() ) ; 108 | ``` 109 | > "But Davide, sometimes I need to search an element in my map" 110 | 111 | In that case, you can find your element by its key searching in an **ordered** vector with the function 112 | [std::lower_bound](http://www.cplusplus.com/reference/algorithm/lower_bound/). 113 | 114 | The complexity of `lower_bound` / `upper_bound` is **O(log n)**, the same as `std::map`, but iteration through all the elements is much, much faster. 115 | 116 | ## Summarizing 117 | 118 | - Think about the way you want to access your data. 119 | - Ask yourself if you have frequent or infrequent insertion/deletion. 120 | - Do not underestimate the cost of an associative container. 121 | - Use `std::unordered_map` by default... or `std::vector`, of course ! 
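As a rough sketch of those last two answers (the key and value types below are invented for illustration), the sorted vector of pairs plus `std::lower_bound` combo looks like this:

```C++
#include <algorithm>
#include <string>
#include <utility>
#include <vector>

int main()
{
    // The "poor man's map": a vector of [key, value] pairs...
    std::vector<std::pair<int, std::string>> my_map = {
        {3, "three"}, {1, "one"}, {2, "two"}};

    // ...sorted once by key (std::pair compares by first, then second)
    std::sort(my_map.begin(), my_map.end());

    // O(log n) lookup, the same complexity as std::map, but on contiguous memory
    const int key = 2;
    auto it = std::lower_bound(
        my_map.begin(), my_map.end(), key,
        [](const std::pair<int, std::string>& item, int k) { return item.first < k; });

    const bool found = (it != my_map.end() && it->first == key);
    return found ? 0 : 1;   // here it->second is "two"
}
```

Iterating the whole container is then a plain linear scan over contiguous memory, which is exactly why it beats the node-based alternatives.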
122 | -------------------------------------------------------------------------------- /docs/img/beautiful.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/beautiful.jpg -------------------------------------------------------------------------------- /docs/img/boom.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/boom.gif -------------------------------------------------------------------------------- /docs/img/const_reference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/const_reference.png -------------------------------------------------------------------------------- /docs/img/cpp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/cpp.png -------------------------------------------------------------------------------- /docs/img/davide_yells_at_PCL.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/davide_yells_at_PCL.jpg -------------------------------------------------------------------------------- /docs/img/feel_bad.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/feel_bad.jpg -------------------------------------------------------------------------------- /docs/img/fizzbuzz.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/fizzbuzz.jpg -------------------------------------------------------------------------------- /docs/img/growing_vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/growing_vector.png -------------------------------------------------------------------------------- /docs/img/hotspot_heaptrack.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/hotspot_heaptrack.jpg -------------------------------------------------------------------------------- /docs/img/inconceivably.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/inconceivably.jpg -------------------------------------------------------------------------------- /docs/img/laser_scan_matcher.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/laser_scan_matcher.png -------------------------------------------------------------------------------- /docs/img/linked_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/linked_list.png -------------------------------------------------------------------------------- /docs/img/modifystring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/modifystring.png -------------------------------------------------------------------------------- /docs/img/mordor.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/mordor.jpg -------------------------------------------------------------------------------- /docs/img/motor_profile1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/motor_profile1.png -------------------------------------------------------------------------------- /docs/img/motor_profile2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/motor_profile2.png -------------------------------------------------------------------------------- /docs/img/multiply_vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/multiply_vector.png -------------------------------------------------------------------------------- /docs/img/palindrome_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/palindrome_benchmark.png -------------------------------------------------------------------------------- /docs/img/pcl.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/pcl.jpg -------------------------------------------------------------------------------- /docs/img/pcl_fromros.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/pcl_fromros.png -------------------------------------------------------------------------------- /docs/img/quick-bench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/quick-bench.png -------------------------------------------------------------------------------- /docs/img/quote.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/quote.png -------------------------------------------------------------------------------- /docs/img/really.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/really.jpg -------------------------------------------------------------------------------- /docs/img/realsense.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/realsense.png -------------------------------------------------------------------------------- /docs/img/relax_sso.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/relax_sso.jpg -------------------------------------------------------------------------------- /docs/img/spider_senses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/spider_senses.png -------------------------------------------------------------------------------- /docs/img/sso_in_action.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/sso_in_action.png -------------------------------------------------------------------------------- /docs/img/string_concatenation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/string_concatenation.png -------------------------------------------------------------------------------- /docs/img/that_is_logn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/that_is_logn.jpg -------------------------------------------------------------------------------- /docs/img/think_about_it.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/think_about_it.jpg -------------------------------------------------------------------------------- /docs/img/tostring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/tostring.png -------------------------------------------------------------------------------- /docs/img/twitter_unordered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/twitter_unordered.png 
-------------------------------------------------------------------------------- /docs/img/two_lookups.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/two_lookups.jpg -------------------------------------------------------------------------------- /docs/img/vector_reserve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/vector_reserve.png -------------------------------------------------------------------------------- /docs/img/velodyne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/velodyne.png -------------------------------------------------------------------------------- /docs/img/why_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facontidavide/CPP_Optimizations_Diary/78054adffdc77ae3f2b8a4424a436aeb7195d4c7/docs/img/why_copy.jpg -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # CPP Optimizations Diary 2 | 3 | Optimizing code in C++ is something that no one can resist. You can have fun 4 | and pretend that you are doing something useful for your organization at the same time! 5 | 6 | In this repository, I will record some simple design patterns to improve your code 7 | and remove unnecessary overhead in **C++**. 8 | 9 | If you are a seasoned C++ expert, you probably have already a set of rules in your head 10 | that you always follow. 11 | 12 | These rules help you look like a bad-ass/rockstar/10X engineer to your colleagues. 13 | 14 | You are the kind of person that casually drops a [std::vector<>::reserve](reserve.md) before a loop and 15 | nods, smiling, looking at the performance improvement and the astonishment of your team member. 16 | 17 | 18 | ![](img/boom.gif) 19 | 20 | Hopefully, the examples in this repository will help you achieve this status of guru 21 | and, as a side effect, save the planet from global warming, sparing useless CPU 22 | cycles from being wasted. 23 | 24 | Then, of course, someone on the other side of the planet will start mining Bitcoins or write her/his 25 | application in **Python** and all your effort to save electricity was for nothing. 26 | 27 | I am kidding, Python developers, we love you! 28 | 29 | > Narrator: "he was not kidding..." 30 | 31 | ## Rule 1: measure first (using _good_ tools) 32 | 33 | The very first thing any person concerned about performance should do is: 34 | 35 | - **Measure first** and **make hypothesis later**. 36 | 37 | Me and my colleagues are almost always wrong about the reasons a piece of code is 38 | be slow. 39 | 40 | Sometimes we are right, but it is really hard to know in advance ho refactoring will 41 | improve performance. Good profiling tools show in minutes the "low hanging fruits": minimum work, maximum benefit! 42 | 43 | Summarizing: 10 Minutes profiling can save you hours of work guessing and refactoring. 
44 | 45 | My "goto" tools in Linux are [Hotspot](https://github.com/KDAB/hotspot) and 46 | [Heaptrack](https://github.com/KDE/heaptrack). I understand Windows has similar 47 | tools too. 48 | 49 | ![](img/hotspot_heaptrack.jpg) 50 | 51 | In the benchmark war, if you are the soldier, these are your rifle and hand grenades. 52 | 53 | Once you know which part of the code deserves to be optimized, you might want to use 54 | [Google Benchmark](https://github.com/google/benchmark) to measure the time spent in a very specific 55 | class or function. 56 | 57 | You can even run it Google Benchmark online here: [quick-bench.com](http://quick-bench.com/G7B2w0xPUWgOVvuzI7unES6cU4w). 58 | 59 | ![quick-bench](img/quick-bench.png) 60 | 61 | ## Rule 2: learn good design patterns, use them by default 62 | 63 | Writing good code is like brushing your teeth: you should do it without thinking too much about it. 64 | 65 | It is a muscle that you need to train, that will become stronger over time. But don't worry: 66 | once you start, you will start seeing recurring patterns that 67 | are surprisingly simple and works in many different use cases. 68 | 69 | **Spoiler alert**: one of my most beloved tricks is to _minimize the number of heap allocations_. 70 | You have no idea how much that helps. 71 | 72 | But let's make something absolutely clear: 73 | 74 | - Your **first goal** as a developer (software engineer?) is to create code that is **correct** and fulfil the requirements. 75 | - The **second** most important thing is to make your code **maintainable and readable** for other people. 76 | - In many cases, you also want to make code faster, because [faster code is better code](https://craigmod.com/essays/fast_software/). 77 | 78 | In other words, think twice before doing any change in your code that makes it less readable or harder to debug, 79 | just because you believe it may run 2.5% faster. 80 | 81 | ## Optimization examples 82 | 83 | ### "If you pass that by value one more time..." 84 | 85 | - [Use Const reference by default](prefer_references.md). 86 | 87 | - Move semantic (TODO). 88 | 89 | - Return value optimization (TODO). 90 | 91 | 92 | ### std::vector<> is your best friend 93 | 94 | 95 | - [Use std::vector<>::reserve by default](reserve.md) 96 | 97 | - ["I have learnt linked-lists at university, should I use them?" Nooope](no_lists.md). 98 | 99 | - [You don't need a `std::map<>` for that](dont_need_map.md). 100 | 101 | - [Small vector optimization](small_vectors.md) 102 | 103 | 104 | ### "It is just a string, how bad could that be?" 105 | 106 | - [Strings are (almost) vectors](strings_are_vectors.md) 107 | 108 | - [When not to worry: small string optimization](small_strings.md). 109 | 110 | - [String concatenation: the false sense of security of `operator+`](strings_concatenation.md). 111 | 112 | - `std::string_view`: love at first sight (TODO). 113 | 114 | ### Don't compute things twice. 115 | 116 | - [Example: 2D/3D transforms the smart way.](2d_transforms.md ) 117 | 118 | - [Iterating over a 2D matrix: less elegant, more performant](2d_matrix_iteration.md). 119 | 120 | ### Fantastic data structures and where to find them. 121 | 122 | - [I tried `boost::container::flat_map`. You won't imagine what happened next](boost_flatmap.md). 
123 | 124 | ### Case studies 125 | 126 | - [Simpler and faster way to filter Point Clouds in PCL.](pcl_filter.md) 127 | 128 | - [More PCL optimizations: conversion fro ROS message](pcl_fromROS.md) 129 | 130 | - [Fast Palindrome: the cost of conditional branches](palindrome.md) 131 | -------------------------------------------------------------------------------- /docs/no_lists.md: -------------------------------------------------------------------------------- 1 | # If you are using std::list<>, you are doing it wrong 2 | 3 | 4 | ![](img/linked_list.png) 5 | 6 | I am not wasting time here to repeat benchmarks which a lot of people did already. 7 | 8 | - [std::vector vs std::list benchmark](https://baptiste-wicht.com/posts/2012/11/cpp-benchmark-vector-vs-list.html) 9 | 10 | - [Are lists evil? Bjarne Stroustrup](https://isocpp.org/blog/2014/06/stroustrup-lists) 11 | 12 | - [Video from Bjarne Stroustrup keynote](https://www.youtube.com/watch?v=YQs6IC-vgmo) 13 | 14 | You think your case is special, a unique snowflake. **It is not**. 15 | 16 | You have another STL data structure that is better than `std::list`: 17 | [std::deque<>](https://es.cppreference.com/w/cpp/container/deque) almost 99% of the time. 18 | 19 | In some cases, even the humble `std::vector` is better than a list. 20 | 21 | If you like very exotic alternatives have a look at [plf::colony](https://plflib.org/colony.htm). 22 | 23 | But seriously, just use `vector`or `deque`. 24 | 25 | ## Real world example: improving the Intel RealSense driver 26 | 27 | This is a practical example of a Pull Request I sent to the [RealSense](https://github.com/IntelRealSense) 28 | repository a while ago. 29 | 30 | ![](img/realsense.png) 31 | 32 | They where using that abomination called `std::list<>` for a reason that I can not understand. 33 | 34 | Just kidding, Intel Developers, we love you! 35 | 36 | Here you can find the link to the Pull Request: 37 | - [Considerable CPU saving in BaseRealSenseNode::publishPointCloud()](https://github.com/IntelRealSense/realsense-ros/pull/1097) 38 | 39 | In a nutshell, the whole PR contains only two tiny changes: 40 | 41 | ```C++ 42 | // We changed this list, created at each camera frame 43 | std::list valid_indices; 44 | 45 | // With this vector: a class member, that is cleared before reusing 46 | // (but allocated memory is still there) 47 | std::vector _valid_indices; 48 | ``` 49 | 50 | Additionally, we have a quite large object called `sensor_msgs::PointCloud2 msg_pointcloud` that 51 | is converted into a class member that is reused over and over again at each frame. 52 | 53 | The reported speed improvement is 20%-30%, that is huge, if you think about it. 54 | 55 | 56 | -------------------------------------------------------------------------------- /docs/palindrome.md: -------------------------------------------------------------------------------- 1 | # Having fun with Palindrome words 2 | 3 | This article is not as interesting and reusable as other, but I think it might still be a nice example of how you we can speed up your code reducing the amount of branches. 4 | 5 | The `if` statement is very fast and usually we should not worry about its runtime cost, but there could be few cases in which avoiding it can provide a visible improvement. 6 | 7 | ## Coding interviews... 8 | 9 | At the time of writing this article, I find myself in the wonderful world of job searching. As a consequence, you probably know what that implies: coding interviews! 
10 | 11 | I am fine with them, but the other day a person interviewing me asked the following question: 12 | 13 | > Can you write a function to find if a string is a [palindrome](https://en.wikipedia.org/wiki/Palindrome)? 14 | 15 | And I was thinking... 16 | 17 | ![fizzbuss](img/fizzbuzz.jpg) 18 | 19 | To be fair, I am sure he his a great guy and he was just breaking the ice. He certainly had the best intentions, but I think it could have been more productive for both of us to look at some real-world code I wrote in production instead! 20 | 21 | Nevertheless, this is the answer: 22 | 23 | ```C++ 24 | 25 | bool IsPalindrome(const std::string& str) 26 | { 27 | const size_t N = str.size(); 28 | const size_t N_half = N / 2; 29 | for(size_t i=0; i 63 | 64 | inline bool IsPalindromeWord(const std::string& str) 65 | { 66 | const size_t N = str.size(); 67 | const size_t N_half = (N/2); 68 | const size_t S = sizeof(uint32_t); 69 | // number of words of size S in N_half 70 | const size_t N_words = (N_half / S); 71 | 72 | // example: if N = 18, half string is 9 bytes and 73 | // we need to compare 2 pairs of words and 1 pair of chars 74 | 75 | size_t index = 0; 76 | 77 | for(size_t i=0; i (); 28 | range_cond->addComparison ( 29 | std::make_shared("z", ComparisonOps::GT, 0.0)); 30 | range_cond->addComparison ( 31 | std::make_shared("z", ComparisonOps::LT, 1.0))); 32 | 33 | // build the filter 34 | ConditionalRemoval condition_removal; 35 | condition_removal.setCondition (range_cond); 36 | condition_removal.setInputCloud (input_cloud); 37 | // apply filter 38 | condition_removal.filter (*cloud_filtered); 39 | ``` 40 | 41 | > Basically, we create the condition which a given point must satisfy for it to remain in our PointCloud. 42 | > In this example, we use add two comparisons to the condition: greater than (GT) 0.0 and less than (LT) 1.0. 43 | > This condition is then used to build the filter. 44 | 45 | Let me rephrase it for people that are not familiar with PCL: 46 | 47 | - An object is created that says: "the Z value of the point must be greater than 0.0". 48 | - Another object is created saying: "the Z value of the point must be less than 1.0". 49 | - They are both added to a `ConditionAnd`. 50 | - We tell `ConditionalRemoval` to use this combined condition. 51 | - Apply the filter to an input cloud to create a filtered one. 52 | 53 | Now think that the usal Point Cloud has a number of points in the order of 54 | **tens of thousands** or more. 55 | 56 | Think about it: 57 | 58 | ![](img/think_about_it.jpg) 59 | 60 | Seriously, take some minutes to think how, given a **vector** of points that looks like this: 61 | 62 | ```C++ 63 | // oversimplified, not the actual implementation 64 | struct PointXYZ{ 65 | float x; 66 | float y; 67 | float z; 68 | }; 69 | ``` 70 | 71 | You want to create another point cloud with all the points that pass this condition: 72 | 73 | 0.0 < point.z < 1.0 74 | 75 | I mean, if you ask **me**, this is what I would do, because I am not that smart: 76 | 77 | ```C++ 78 | auto cloud_filtered = std::make_shared>(); 79 | 80 | for (const auto& point: input_cloud->points) 81 | { 82 | if( point.z > 0.0 && point.z < 1.0 ) 83 | { 84 | cloud_filtered->push_back( point ); 85 | } 86 | } 87 | ``` 88 | This is what we will call the **"naive filter"**. 
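If you have read the page about [reserve](reserve.md), you can probably guess a small refinement of the naive filter: since the output can never contain more points than the input, we can reserve the output vector once and avoid any reallocation while filtering. This is just a sketch, and it is **not** part of the benchmark shown below:

```C++
auto cloud_filtered = std::make_shared<pcl::PointCloud<pcl::PointXYZ>>();
// the filtered cloud can never be larger than the input one
cloud_filtered->points.reserve( input_cloud->points.size() );

for (const auto& point: input_cloud->points)
{
  if( point.z > 0.0 && point.z < 1.0 )
  {
    cloud_filtered->push_back( point );
  }
}
```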
89 | 90 | Before showing you a benchmark that will knock your socks off (don't worry, I will), 91 | I must admit that it is an **unfair comparison**, because the `pcl` filters 92 | do many more checks when processing your data, to prevent weird corner cases. 93 | 94 | 95 | But do not forget that we expressed our conditions like this: 96 | 97 | ```C++ 98 | pcl::FieldComparison<pcl::PointXYZ>("z", pcl::ComparisonOps::GT, 0.0); 99 | pcl::FieldComparison<pcl::PointXYZ>("z", pcl::ComparisonOps::LT, 1.0); 100 | ``` 101 | 102 | If you think about it, there **must be** some kind of parser "somewhere". 103 | 104 | The easiest implementation of a parser is of course a `switch` 105 | statement, but no one would ever do that for **each** of these trillion points... 106 | 107 | [Oh, snap!](https://github.com/PointCloudLibrary/pcl/blob/pcl-1.11.0/filters/include/pcl/filters/impl/conditional_removal.hpp#L98-L127) 108 | 109 | Indeed, 2 switch statements are called for each point of the cloud. 110 | 111 | Summarizing: the very fact that these functions try to be "too clever", 112 | using these "composable rules", means that the implementation is **inherently slow**. 113 | 114 | There is nothing we can do to save them. Nevertheless, we can replace them ;) 115 | 116 | ## Davide, give me speed AND expressive code. 117 | 118 | Sure thing, my friend! 119 | 120 | Since `pcl::FieldComparison` is intrinsically broken (so are all the other Conditions in the library), 121 | because of their `switch` statements, let me write my own pcl::Condition (it must be derived from `pcl::ConditionBase`) like this: 122 | 123 | ```C++ 124 | template <typename PointT> 125 | class GenericCondition : public pcl::ConditionBase<PointT> 126 | { 127 | public: 128 | typedef std::shared_ptr<GenericCondition<PointT>> Ptr; 129 | typedef std::shared_ptr<const GenericCondition<PointT>> ConstPtr; 130 | typedef std::function<bool(const PointT&)> FunctorT; 131 | 132 | GenericCondition(FunctorT evaluator): 133 | pcl::ConditionBase<PointT>(), _evaluator( evaluator ) 134 | {} 135 | 136 | virtual bool evaluate (const PointT &point) const { 137 | // just delegate ALL the work to the injected std::function 138 | return _evaluator(point); 139 | } 140 | private: 141 | FunctorT _evaluator; 142 | }; 143 | ``` 144 | 145 | That is literally **all** the code you need, no omissions. 146 | 147 | I am simply wrapping a `std::function` inside `pcl::ConditionBase`. Nothing else. 148 | 149 | This is the **old** code: 150 | 151 | 152 | ```C++ 153 | auto range_cond = std::make_shared<ConditionAnd<PointXYZ>>(); 154 | range_cond->addComparison ( 155 | std::make_shared<FieldComparison<PointXYZ>>("z", ComparisonOps::GT, 0.0)); 156 | range_cond->addComparison ( 157 | std::make_shared<FieldComparison<PointXYZ>>("z", ComparisonOps::LT, 1.0)); 158 | ``` 159 | 160 | And this is the **new** one, where my condition is expressed in plain old code: 161 | 162 | ```C++ 163 | auto range_cond = std::make_shared<GenericCondition<PointXYZ>>( 164 | [](const PointXYZ& point){ 165 | return point.z > 0.0 && point.z < 1.0; 166 | }); 167 | ``` 168 | 169 | The rest of the code is unchanged!!! 170 | 171 | ![](img/beautiful.jpg) 172 | 173 | ## Let's talk about speed 174 | 175 | You may find the code to replicate my tests [here](https://github.com/facontidavide/CPP_Optimizations_Diary/tree/master/cpp/pcl_conditional_removal.cpp).
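If you want to reproduce something similar on your own data, the harness is the usual Google Benchmark pattern used throughout this diary. The snippet below is only a sketch: `LoadSampleCloud()` and `FilterWithGenericCondition()` are hypothetical helpers standing in for the code shown above.

```C++
#include <benchmark/benchmark.h>

static void PCL_Filter_Generic(benchmark::State& state)
{
  // load the sample cloud once, outside the measured loop
  auto input_cloud = LoadSampleCloud();                       // hypothetical helper
  for (auto _ : state)
  {
    auto filtered = FilterWithGenericCondition(input_cloud);  // hypothetical helper
    benchmark::DoNotOptimize(filtered);
  }
}
BENCHMARK(PCL_Filter_Generic);
BENCHMARK_MAIN();
```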
176 | 177 | These are the benchmarks based on my sample cloud and 4 filters (upper and lower bound in X and Y): 178 | 179 | ``` 180 | ------------------------------------------------------------- 181 | Benchmark Time CPU Iterations 182 | ------------------------------------------------------------- 183 | PCL_Filter 1403083 ns 1403084 ns 498 184 | Naive_Filter 107418 ns 107417 ns 6586 185 | PCL_Filter_Generic 668223 ns 668191 ns 1069 186 | ``` 187 | Your results may change a lot according to the number of conditions and the size of the point cloud. 188 | 189 | But the lessons to learn are: 190 | 191 | - The "naive" filter might be an option in many cases and it is blazing fast. 192 | - The "safe" `pcl::ConditionalRemoval` can still be used if you just ditch the builtin `pcl::Conditions` and use instead the much more concise and readable `GenericCondition`. 193 | 194 | -------------------------------------------------------------------------------- /docs/pcl_fromROS.md: -------------------------------------------------------------------------------- 1 | # Case study: convert ROS message to PCL 2 | 3 | [Point Cloud Library (PCL)](https://pointclouds.org/) 4 | seems to be a cornucopia of opportunities for optimizations. 5 | 6 | Even if this is a gratuitous criticism, let's remember what 7 | **Bjarne Stroustrup** said: 8 | 9 | > "There are only two kinds of languages: the ones people complain about and the ones nobody uses" 10 | 11 | So, let's keep in mind that PCL has a **huge** role in 12 | allowing everyone to process pointclouds easily. 13 | The developers and maintainers deserve all my respect for that! 14 | 15 | Said that, let's jump into my next rant. 16 | 17 | ![](img/davide_yells_at_PCL.jpg) 18 | 19 | 20 | ## Using pcl::fromROSMsg() 21 | 22 | If you use PCL in ROS, the following code is your bread and butter: 23 | 24 | ```c++ 25 | void cloudCallback(const sensor_msgs::PointCloud2ConstPtr& msg) 26 | { 27 | pcl::PointCloud cloud; 28 | pcl::fromROSMsg(*msg, cloud); 29 | 30 | //... 31 | } 32 | ``` 33 | 34 | Now, I can not count the number of people complaining that 35 | this conversion alone uses a lot of CPU! 36 | 37 | I look at its implementation and at the results of Hotspot 38 | (perf profiling) and a problem becomes immediately apparent: 39 | 40 | ```c++ 41 | template 42 | void fromROSMsg(const sensor_msgs::msg::PointCloud2 &cloud, 43 | pcl::PointCloud &pcl_cloud) 44 | { 45 | pcl::PCLPointCloud2 pcl_pc2; 46 | pcl_conversions::toPCL(cloud, pcl_pc2); 47 | pcl::fromPCLPointCloud2(pcl_pc2, pcl_cloud); 48 | } 49 | ``` 50 | 51 | We are transforming/copying the data twice: 52 | 53 | - first, we convert from `sensor_msgs::msg::PointCloud2` to 54 | `pcl::PCLPointCloud2` 55 | - then, from `pcl::PCLPointCloud2` to `pcl::PointCloud`. 56 | 57 | Digging into the implementation of `pcl_conversions::toPCL`, I found this: 58 | 59 | ```c++ 60 | void toPCL(const sensor_msgs::msg::PointCloud2 &pc2, 61 | pcl::PCLPointCloud2 &pcl_pc2) 62 | { 63 | copyPointCloud2MetaData(pc2, pcl_pc2); 64 | pcl_pc2.data = pc2.data; 65 | } 66 | ``` 67 | 68 | Copying that raw data from one type to the other is an overhead that can be easily avoided 69 | with some refactoring. 70 | 71 | This refactoring is not particularly interesting, because I basically "copied and pasted" 72 | the code of `pcl::fromPCLPointCloud2` to use a different input type. 73 | 74 | Fast-forwarding to the solution, let's have a look at the results: 75 | 76 | ![](img/pcl_fromros.png) 77 | 78 | ## What is the takeaway of this story? 
79 | 80 | **Measure, measure, measure**! Don't assume that the "smart people" implemented the best solution and that you can't actively do anything about it. 81 | 82 | In this case, code clarity and reusing existing functions were preferred over performance. 83 | 84 | But the impact of this decision is definitely too large to be ignored. 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /docs/prefer_references.md: -------------------------------------------------------------------------------- 1 | # Value semantics vs references 2 | 3 | What I am going to say here is so trivial that probably any seasoned developer 4 | knows it already. 5 | 6 | Nevertheless, I keep seeing people doing stuff like this: 7 | 8 | ```C++ 9 | bool OpenFile(std::string filename); 10 | 11 | void DrawPath(std::vector path); 12 | 13 | Pose DetectFace(Image image); 14 | 15 | Matrix3D Rotate(Matrix3D mat, AxisAngle axis_angle); 16 | 17 | ``` 18 | 19 | I made these functions up, but **I do see** code like this in production sometimes. 20 | 21 | What do these functions have in common? You are passing the arguments **by value**. 22 | 23 | In other words, whenever you call one of these functions, you make a copy of the input in your scope 24 | and pass the **copy** to the function. 25 | 26 | ![](img/why_copy.jpg) 27 | 28 | Copies may or may not be an expensive operation, depending on the size of the object and whether 29 | it requires dynamic heap memory allocation or not. 30 | 31 | In these examples, the objects that probably have a negligible overhead, 32 | when passed by value, are `Matrix3D` and `AxisAngle`, because we may assume that they don't require 33 | heap allocations. 34 | 35 | But, even if the overhead is small, is there any reason to waste CPU cycles, if we can avoid it? 36 | 37 | This is a better API: 38 | 39 | 40 | ```C++ 41 | bool OpenFile(const std::string& filename); // string_view is even better 42 | 43 | void DrawPath(const std::vector& path); 44 | 45 | Pose DetectFace(const Image& image); 46 | 47 | Matrix3D Rotate(const Matrix3D& mat, const AxisAngle& axis_angle); 48 | 49 | ``` 50 | 51 | In the latter version, we are using what is called **"reference semantics"**. 52 | 53 | You may use **C-style** (non-owning) pointers instead of references and get the same benefits in terms of 54 | performance, but here we are telling the compiler that the arguments are: 55 | 56 | - Constant. We won't change them, neither on the caller's side nor inside the called function. 57 | - Being a reference, the argument *refers* to an existing object. A raw pointer might have the value `nullptr`. 58 | - Not being a pointer, we are sure that we are not transferring the ownership of the object.
59 | 60 | The cost can be dramatically different, as you may see here: 61 | 62 | ```C++ 63 | size_t GetSpaces_Value(std::string str) 64 | { 65 | size_t spaces = 0; 66 | for(const char c: str){ 67 | if( c == ' ') spaces++; 68 | } 69 | return spaces; 70 | } 71 | 72 | size_t GetSpaces_Ref(const std::string& str) 73 | { 74 | size_t spaces = 0; 75 | for(const char c: str){ 76 | if( c == ' ') spaces++; 77 | } 78 | return spaces; 79 | } 80 | 81 | const std::string LONG_STR("a long string that can't use Small String Optimization"); 82 | 83 | void PassStringByValue(benchmark::State& state) { 84 | for (auto _ : state) { 85 | size_t n = GetSpaces_Value(LONG_STR); 86 | } 87 | } 88 | 89 | void PassStringByRef(benchmark::State& state) { 90 | for (auto _ : state) { 91 | size_t n = GetSpaces_Ref(LONG_STR); 92 | } 93 | } 94 | 95 | //---------------------------------- 96 | size_t Sum_Value(std::vector vect) 97 | { 98 | size_t sum = 0; 99 | for(unsigned val: vect) { sum += val; } 100 | return sum; 101 | } 102 | 103 | size_t Sum_Ref(const std::vector& vect) 104 | { 105 | size_t sum = 0; 106 | for(unsigned val: vect) { sum += val; } 107 | return sum; 108 | } 109 | 110 | const std::vector vect_in = { 1, 2, 3, 4, 5 }; 111 | 112 | void PassVectorByValue(benchmark::State& state) { 113 | for (auto _ : state) { 114 | size_t n = Sum_Value(vect_in); 115 | } 116 | } 117 | 118 | void PassVectorByRef(benchmark::State& state) { 119 | for (auto _ : state) { 120 | size_t n = Sum_Ref(vect_in); 121 | benchmark::DoNotOptimize(n); 122 | } 123 | } 124 | 125 | ``` 126 | 127 | ![](img/const_reference.png) 128 | 129 | 130 | Clearly, passing by reference wins hands down. 131 | 132 | ## Exceptions to the rule 133 | 134 | > "That is cool Davide, I will use `const&` everywhere". 135 | 136 | Let's have a look to another example, first. 137 | 138 | ```C++ 139 | struct Vector3D{ 140 | double x; 141 | double y; 142 | double z; 143 | }; 144 | 145 | Vector3D MultiplyByTwo_Value(Vector3D p){ 146 | return { p.x*2, p.y*2, p.z*2 }; 147 | } 148 | 149 | Vector3D MultiplyByTwo_Ref(const Vector3D& p){ 150 | return { p.x*2, p.y*2, p.z*2 }; 151 | } 152 | 153 | void MultiplyVector_Value(benchmark::State& state) { 154 | Vector3D in = {1,2,3}; 155 | for (auto _ : state) { 156 | Vector3D out = MultiplyByTwo_Value(in); 157 | } 158 | } 159 | 160 | void MultiplyVector_Ref(benchmark::State& state) { 161 | Vector3D in = {1,2,3}; 162 | for (auto _ : state) { 163 | Vector3D out = MultiplyByTwo_Ref(in); 164 | } 165 | } 166 | ``` 167 | 168 | ![](img/multiply_vector.png) 169 | 170 | 171 | Interesting! Using `const&` has no benefit at all, this time. 172 | 173 | When you copy an object that doesn't require heap allocation and is smaller than a few dozens of bytes, 174 | you won't notice any benefit passing them by reference. 175 | 176 | On the other hand, it will never be slower so, if you are in doubt, using `const&` is always a "safe bet". While passing primitive types by const references can be shown to generate an extra instruction (see https://godbolt.org/z/-rusab). That gets optimized out when compiling with `-O3`. 177 | 178 | My rule of thumb is: never pass by reference any argument with size 8 bytes or less (integers, doubles, chars, long, etc.). 
179 | 180 | Since we know for sure that there is 0% benefit, writing something like this **makes no sense** and it is "ugly": 181 | 182 | ```C++ 183 | void YouAreTryingTooHardDude(const int& a, const double& b); 184 | ``` 185 | -------------------------------------------------------------------------------- /docs/reserve.md: -------------------------------------------------------------------------------- 1 | # Vectors are awesome... 2 | 3 | `std::vector<>`s have a huge advantage when compared to other data structures: 4 | their elements are packed in memory one next to the other. 5 | 6 | We might have a long discussion about how this may affect performance, based on how memory 7 | works in modern processors. 8 | 9 | If you want to know more about it, just Google "C++ cache aware programming". For instance: 10 | 11 | - [CPU Caches and why you Care](https://www.aristeia.com/TalkNotes/codedive-CPUCachesHandouts.pdf) 12 | - [Writing cache friendly C++ (video)](https://www.youtube.com/watch?v=Nz9SiF0QVKY) 13 | 14 | Iterating through all the elements of a vector is very fast and they work really really well when we have to 15 | append or remove an element from the back of the structure. 16 | 17 | # ... when you use `reserve` 18 | 19 | We need to understand how vectors work under the hood. 20 | When you push an element into an empty or full vector, we need to: 21 | 22 | - allocate a new block of memory that is larger. 23 | - move all the elements we have already stored in the previous block into the new one. 24 | 25 | Both these operations are expensive and we want to avoid them as much as possible, if you can, 26 | sometimes you just accept things the way they are. 27 | 28 | The size of the new block is **2X the capacity**. Therefore, if you have 29 | a vector where both `size()` and `capacity()` are 100 elements and you `push_back()` element 101th, 30 | the block of memory (and the capacity) will jump to 200. 31 | 32 | To prevent these allocations, that may happen multiple times, we can **reserve** the capacity that 33 | we know (or believe) the vector needs. 34 | 35 | Let's have a look to a micro-benchmark. 36 | 37 | ```C++ 38 | static void NoReserve(benchmark::State& state) 39 | { 40 | for (auto _ : state) { 41 | // create a vector and add 100 elements 42 | std::vector v; 43 | for(size_t i=0; i<100; i++){ v.push_back(i); } 44 | } 45 | } 46 | 47 | static void WithReserve(benchmark::State& state) 48 | { 49 | for (auto _ : state) { 50 | // create a vector and add 100 elements, but reserve first 51 | std::vector v; 52 | v.reserve(100); 53 | for(size_t i=0; i<100; i++){ v.push_back(i); } 54 | } 55 | } 56 | 57 | 58 | static void ObsessiveRecycling(benchmark::State& state) { 59 | // create the vector only once 60 | std::vector v; 61 | for (auto _ : state) { 62 | // clear it. Capacity is still 100+ from previous run 63 | v.clear(); 64 | for(size_t i=0; i<100; i++){ v.push_back(i); } 65 | } 66 | } 67 | ``` 68 | 69 | ![](img/vector_reserve.png) 70 | 71 | Look at the difference! And these are only 100 elements. 72 | 73 | The number of elements influence the final performance gain a lot, but one thing is sure: it **will** be faster. 74 | 75 | Note also as the `ObsessiveRecycling` brings a performance gain that is probably visible for small vectors, but negligible with bigger ones. 76 | 77 | Don't take me wrong, though: `ObsessiveRecycling` will always be faster, even if according to the size of the object you are storing 78 | you may or may not notice that difference. 
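If you want to actually *see* those reallocations, a few lines are enough. This little sketch prints every capacity change while pushing 100 elements; the exact growth factor is implementation-defined (roughly 2x with libstdc++ and libc++, about 1.5x with MSVC), so your output may differ.

```C++
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> v;
  // v.reserve(100);  // uncomment this line and no reallocation happens inside the loop
  size_t last_capacity = v.capacity();
  for (int i = 0; i < 100; i++)
  {
    v.push_back(i);
    if (v.capacity() != last_capacity)
    {
      last_capacity = v.capacity();
      printf("size: %3zu -> new capacity: %zu\n", v.size(), v.capacity());
    }
  }
  return 0;
}
```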
79 | 80 | 81 | ## Recognizing a vector at first sight 82 | 83 | This is the amount of memory an application of mine was using over time (image obtained with **Heaptrack**): 84 | 85 | ![](img/growing_vector.png) 86 | 87 | Look at that! Something is doubling the amount of memory it uses every few seconds... 88 | 89 | I wonder what it could be? A vector, of course, because other data structures would have a more "linear" growth. 90 | 91 | That, by the way, **is a bug in the code that was found thanks to memory profiling**: that vector was not supposed to grow at all. 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /docs/small_strings.md: -------------------------------------------------------------------------------- 1 | # Small String Optimizations 2 | 3 | Remember when I said that "strings are `std::vector<char>` in disguise"? 4 | 5 | In practice, very smart folks realized that you may store 6 | small strings inside the string object itself, without allocating any memory. 7 | 8 | Given that the size of a `std::string` is **24 bytes** on a 64-bit 9 | platform (to store data pointer, size and capacity), some 10 | very cool tricks allow us to store **statically** up to 23 bytes 11 | before you need to allocate memory. 12 | 13 | That has a huge impact in terms of performance! 14 | 15 | ![](img/relax_sso.jpg) 16 | 17 | For the curious minds, here are some details about the implementation: 18 | 19 | - [SSO-23](https://github.com/elliotgoodrich/SSO-23) 20 | - [CppCon 2016: "The strange details of std::string at Facebook"](https://www.youtube.com/watch?v=kPR8h4-qZdk) 21 | 22 | Depending on your compiler version, you may get less than 23 bytes, which is 23 | the theoretical limit. 24 | 25 | ## Example 26 | 27 | ```C++ 28 | const char* SHORT_STR = "hello world"; 29 | 30 | void ShortStringCreation(benchmark::State& state) { 31 | // Create a string over and over again. 32 | // This is cheap, because "small string optimization" is active: 33 | // no memory allocations 34 | for (auto _ : state) { 35 | std::string created_string(SHORT_STR); 36 | } 37 | } 38 | 39 | void ShortStringCopy(benchmark::State& state) { 40 | // Here we create the string only once, but copy repeatedly. 41 | // Why is it much slower than ShortStringCreation? 42 | // The compiler, apparently, outsmarted me 43 | std::string x; // create once 44 | for (auto _ : state) { 45 | x = SHORT_STR; // copy 46 | } 47 | } 48 | 49 | const char* LONG_STR = "this will not fit into small string optimization"; 50 | 51 | void LongStringCreation(benchmark::State& state) { 52 | // The long string will trigger memory allocation for sure 53 | for (auto _ : state) { 54 | std::string created_string(LONG_STR); 55 | } 56 | } 57 | 58 | void LongStringCopy(benchmark::State& state) { 59 | // Now we do see an actual speed-up, when recycling 60 | // the same string multiple times 61 | std::string x; 62 | for (auto _ : state) { 63 | x = LONG_STR; 64 | } 65 | } 66 | ``` 67 | 68 | As you may notice, my attempt to be clever and say "I will not create a new string 69 | every time" fails miserably if the string is short, but has a huge impact if the string 70 | is allocating memory.
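By the way, if you are curious about the actual SSO threshold of your own standard library, the capacity of a default-constructed string tells you how many characters fit inline before the first heap allocation. The values in the comment below are typical ones, not guarantees.

```C++
#include <iostream>
#include <string>

int main()
{
  std::string s;
  std::cout << "sizeof(std::string): " << sizeof(std::string) << "\n";
  // typically 15 with libstdc++ and MSVC, 22 with libc++
  std::cout << "inline capacity: " << s.capacity() << "\n";
  return 0;
}
```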
71 | 72 | ![](img/sso_in_action.png) 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /docs/small_vectors.md: -------------------------------------------------------------------------------- 1 | # Small vector optimization 2 | 3 | By now, I hope I have convinced you that `std::vector` is the first data structure you should consider using, unless you need an associative container. 4 | 5 | But even when we cleverly use `reserve` to prevent superfluous heap allocations and copies, there will be at least **one** heap allocation at the beginning. Can we do better? 6 | 7 | Sure we can! If you have already read about the [small string optimization](small_strings.md), you know where this is going. 8 | 9 | # "Static" vectors and "Small" vectors 10 | 11 | When you are sure that your vector is small and will remain small-ish even in the worst-case scenario, you can allocate the entire array of elements on the stack and skip the expensive heap allocation. 12 | 13 | You may think that this is unlikely, but you would be surprised to know that it happens much more often than you expect. Just 2 weeks ago, I identified this very same pattern in one of our libraries, where the size of some vector could be any number between 0 and 8 at most. 14 | 15 | A 30-minute refactoring improved the overall speed of our software by 20%! 16 | 17 | Summarizing, you want the familiar API of this guy: 18 | ```C++ 19 | std::vector my_data; // at least one heap allocation, unless size is 0 20 | ``` 21 | When in fact, under the hood, you want this: 22 | ```C++ 23 | double my_data[MAX_SIZE]; // no heap allocations 24 | int size_my_data; 25 | ``` 26 | 27 | Let's see a simple and naive implementation of `StaticVector`: 28 | 29 | ```C++ 30 | #include <array> 31 | #include <cstdint> 32 | #include <stdexcept> 33 | template <typename T, size_t N> 34 | class StaticVector 35 | { 36 | public: 37 | 38 | using iterator = typename std::array<T,N>::iterator; 39 | using const_iterator = typename std::array<T,N>::const_iterator; 40 | 41 | StaticVector(uint8_t n=0): _size(n) { 42 | if( _size > N ){ 43 | throw std::runtime_error("SmallVector overflow"); 44 | } 45 | } 46 | 47 | StaticVector(const StaticVector& other) = default; 48 | StaticVector(StaticVector&& other) = default; 49 | 50 | StaticVector(std::initializer_list<T> init) 51 | { 52 | if( init.size() > N ){ throw std::runtime_error("SmallVector overflow"); } 53 | for(const T& val: init) { _storage[_size++] = val; } 54 | } 55 | 56 | void push_back(T val){ 57 | if( _size >= N ){ 58 | throw std::runtime_error("SmallVector overflow"); 59 | } 60 | _storage[_size++] = val; 61 | } 62 | 63 | void pop_back(){ 64 | if( _size == 0 ){ 65 | throw std::runtime_error("SmallVector underflow"); 66 | } 67 | // note: the element is not destroyed here; all N slots live inside the std::array 68 | _size--; 69 | } 70 | 71 | size_t size() const { return _size; } 72 | 73 | void clear(){ while(_size>0) { pop_back(); } } 74 | 75 | T& front() { return _storage.front(); } 76 | const T& front() const { return _storage.front(); } 77 | 78 | T& back() { return _storage[_size-1]; } 79 | const T& back() const { return _storage[_size-1]; } 80 | 81 | iterator begin() { return _storage.begin(); } 82 | const_iterator begin() const { return _storage.begin(); } 83 | 84 | iterator end() { return _storage.begin() + _size; } 85 | const_iterator end() const { return _storage.begin() + _size; } 86 | 87 | T& operator[](uint8_t index) { return _storage[index]; } 88 | const T& operator[](uint8_t index) const { return _storage[index]; } 89 | 90 | T* data() { return _storage.data(); } 91 | const T* data() const { return _storage.data(); } 92 | 93 | private: 94 | std::array<T,N> _storage; 95 | uint8_t _size = 0; 96 | }; 97 | ``` 98 | 99 | **StaticVector** looks like a `std::vector`, but is... 100 | 101 | ![](img/inconceivably.jpg) 102 | 103 | In some cases, there is a very high probability that a vector-like container will have at most **N** elements, but we are not "absolutely sure". 104 | 105 | We can still use a container, generally known as **SmallVector**, that will use the pre-allocated memory from the stack for its first N elements, and **only** when the container needs to grow further will it create a new storage block with a heap allocation. 106 | 107 | ## StaticVector and SmallVector in the wild 108 | 109 | It turns out that these tricks are well known and can be found implemented and ready to use in many popular libraries: 110 | 111 | - [Boost::container](https://www.boost.org/doc/libs/1_73_0/doc/html/container.html). If it exists, Boost has it of course. 112 | - [Abseil](https://github.com/abseil/abseil-cpp/tree/master/absl/container). They are called `fixed_array` and `inlined_vector`. 113 | - For didactic purposes, you may have a look at the [SmallVector used internally by LLVM](https://github.com/llvm/llvm-project/blob/master/llvm/include/llvm/ADT/SmallVector.h) 114 | 115 | -------------------------------------------------------------------------------- /docs/strings_are_vectors.md: -------------------------------------------------------------------------------- 1 | # It is just a string: should I worry? 2 | 3 | `std::string` is a wonderful abstraction, when compared to the awful mess of 4 | raw pointers and lengths that you have to deal with in **C**. 5 | 6 | I am kidding, C developers, we love you! 7 | > Or we sympathize with you, depending on how you want to look at it. 8 | 9 | If you think about it, it should be no more than a `std::vector<char>` in disguise, 10 | with some useful utilities that make sense for text, but not much more. 11 | 12 | On one hand, **it is**, but here comes what is called the **Small String Optimization (SSO)**. 13 | 14 | [Read more about SSO here](small_strings.md). 15 | 16 | What I want to show you here is that, as with any object that **might** 17 | require memory allocation, you should apply the same best practices you would use 18 | with similar containers (even if, arguably, you often need to worry less). 19 | 20 | ## ToString 21 | 22 | ```c++ 23 | enum Color{ 24 | BLUE, 25 | RED, 26 | YELLOW 27 | }; 28 | 29 | std::string ToStringBad(Color c) 30 | { 31 | switch(c) { 32 | case BLUE: return "BLUE"; 33 | case RED: return "RED"; 34 | case YELLOW: return "YELLOW"; 35 | } 36 | } 37 | 38 | const std::string& ToStringBetter(Color c) 39 | { 40 | static const std::string color_name[3] ={"BLUE", "RED", "YELLOW"}; 41 | switch(c) { 42 | case BLUE: return color_name[0]; 43 | case RED: return color_name[1]; 44 | case YELLOW: return color_name[2]; 45 | } 46 | } 47 | ``` 48 | 49 | This is just an example of how, if you can, you should not create the same string over and 50 | over. Of course, I can hear you arguing: 51 | 52 | "Davide, you are forgetting Return Value Optimization"? 53 | 54 | I am not. But a `const&` is **always** guaranteed to be the most 55 | performant option, so why try your luck? 56 | 57 | 58 | ![](img/tostring.png) 59 | 60 | ## Reuse temporary strings 61 | 62 | Here comes a similar example, in which we **potentially** 63 | recycle the memory already allocated in the past. 64 | 65 | You are not guaranteed to be faster with the latter version, but you 66 | might be.
67 | 68 | ```c++ 69 | // Create a new string every time (even if return value optimization may help) 70 | static std::string ModifyString(const std::string& input) 71 | { 72 | std::string output = input; 73 | output.append("... indeed"); 74 | return output; 75 | } 76 | // Reuse an existing string that MAYBE, have the space already reserved 77 | // (or maybe not..) 78 | static void ModifyStringBetter(const std::string& input, std::string& output) 79 | { 80 | output = input; 81 | output.append("... indeed"); 82 | } 83 | ``` 84 | 85 | And, as expected... 86 | 87 | ![](img/modifystring.png) 88 | 89 | -------------------------------------------------------------------------------- /docs/strings_concatenation.md: -------------------------------------------------------------------------------- 1 | # String concatenation 2 | 3 | Warning: before you read this, remember the rule #1 we mentioned at the beginning. 4 | 5 | **Optimize your code only if you can observe a visible overhead 6 | with you profiling tools**. Said that... 7 | 8 | As we said, strings are a little more than vectors of characters, therefore 9 | they may need heap allocations to store all their elements. 10 | 11 | Concatenating strings in C++ is very easy, but there is something we should 12 | be aware of. 13 | 14 | ## "Default" concatenation 15 | 16 | Look at this familiar line of code. 17 | 18 | ```C++ 19 | std:string big_string = first + " " + second + " " + third; 20 | 21 | // Where... 22 | // std::string first("This is my first string."); 23 | // std::string second("This is the second string I want to append."); 24 | // std::string third("This is the third and last string to append."); 25 | ``` 26 | 27 | Noticing anything suspicious? Think about heap allocations... 28 | 29 | ![](img/spider_senses.png) 30 | 31 | Let me rewrite it like this: 32 | 33 | ```C++ 34 | std:string big_string = (((first + " ") + second) + " ") + third; 35 | ``` 36 | 37 | Hopefully you got it. To concatenate strings of this length, you will 38 | need multiple heap allocations and copies from the old memory block to the new one. 39 | 40 | If only `std::string` had a method similar to `std::vector::reserve()` :( 41 | 42 | Hey, wait... [what is this](https://en.cppreference.com/w/cpp/string/basic_string/reserve)? 43 | 44 | 45 | ## "Manual" concatenation 46 | 47 | Let's use reserve to reduce the amount of heap allocations to exactly one. 48 | 49 | We can calculate the total amount of characters needed by `big_string` in 50 | advance and reserve it like this: 51 | 52 | ```C++ 53 | std::string big_one; 54 | big_one.reserve(first_str.size() + 55 | second_str.size() + 56 | third_str.size() + 57 | strlen(" ")*2 ); 58 | 59 | big_one += first; 60 | big_one += " "; 61 | big_one += second; 62 | big_one += " "; 63 | big_one += third; 64 | ``` 65 | 66 | I know what you are thinking and you are 100% right. 67 | 68 | ![](img/feel_bad.jpg) 69 | 70 | That is a horrible piece of code... that is **2.5 times faster** than the 71 | default string concatenation! 72 | 73 | ## Variadic concatenation 74 | 75 | Can we create a string concatenation function that is fast, reusable **and** 76 | readable? 77 | 78 | We do, but we need to use some heavy weapons of Modern C++: **variadic templates**. 79 | 80 | There is a very [nice article about variadic templates here](https://arne-mertz.de/2016/11/more-variadic-templates/), 81 | that you should probably read if you are not familiar with them. 
82 | 83 | 84 | ```C++ 85 | //--- functions to calculate the total size --- 86 | size_t StrSize(const char* str) { 87 | return strlen(str); 88 | } 89 | 90 | size_t StrSize(const std::string& str) { 91 | return str.size(); 92 | } 93 | 94 | template 95 | size_t StrSize(const Head& head, Tail const&... tail) { 96 | return StrSize(head) + StrSize(tail...); 97 | } 98 | 99 | //--- functions to append strings together --- 100 | template 101 | void StrAppend(std::string& out, const Head& head) { 102 | out += head; 103 | } 104 | 105 | template 106 | void StrAppend(std::string& out, const Head& head, Args const&... args) { 107 | out += head; 108 | StrAppend(out, args...); 109 | } 110 | 111 | //--- Finally, the function to concatenate strings --- 112 | template 113 | std::string StrCat(Args const&... args) { 114 | size_t tot_size = StrSize(args...); 115 | std::string out; 116 | out.reserve(tot_size); 117 | 118 | StrAppend(out, args...); 119 | return out; 120 | } 121 | ``` 122 | 123 | That was a lot of complex code, even for a trained eye. But the good news are 124 | that it is very easy to use: 125 | 126 | ```C++ 127 | std:string big_string = StrCat(first, " ", second, " ", third ); 128 | ``` 129 | 130 | So, how fast is that? 131 | 132 | ![](img/string_concatenation.png) 133 | 134 | The reason why the version with variadic templates is slightly slower than 135 | the "ugly" manual concatenation is... 136 | 137 | I have no idea! 138 | 139 | What I **do** know is that it is twice as fast as the default one and 140 | it is not an unreadable mess. 141 | 142 | ## Before you copy and paste my code... 143 | 144 | My implementation of `StrCat` is very limited and I just wanted to make a point: 145 | beware of string concatenations in C++. 146 | 147 | Nevertheless, don't think it twice and use [{fmt}](https://github.com/fmtlib/fmt) instead. 148 | 149 | Not only it is an easy to integrate, well documented and **very** 150 | fast library to format strings. 151 | 152 | It is also an implementation of [C++20 std::format](https://en.cppreference.com/w/cpp/utility/format). 153 | 154 | This means that you can write code that is readable, performant and future proof! 155 | 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: CPP Optimizations diary 2 | 3 | site_description: Tip and tricks to optimize your C++ code. 
4 | site_author: Davide Faconti 5 | 6 | copyright: 'Copyright © 2020 Davide Faconti' 7 | 8 | theme: 9 | name: 'material' 10 | custom_dir: overrides 11 | language: en 12 | logo: 'img/cpp.png' 13 | palette: 14 | primary: blue grey 15 | accent: purple 16 | font: 17 | text: Ubuntu 18 | code: Roboto Mono 19 | icon: 20 | logo: material/library 21 | repo: fontawesome/brands/git-alt 22 | 23 | repo_name: 'CPP_Optimizations_Diary' 24 | repo_url: 'https://github.com/facontidavide/CPP_Optimizations_Diary' 25 | 26 | extra: 27 | social: 28 | - icon: fontawesome/brands/twitter 29 | link: https://twitter.com/facontidavide 30 | 31 | markdown_extensions: 32 | - admonition 33 | - codehilite 34 | - pymdownx.highlight: 35 | anchor_linenums: true 36 | - pymdownx.inlinehilite 37 | - pymdownx.snippets 38 | - pymdownx.superfences 39 | 40 | 41 | extrahead: 42 | 43 | 44 | nav: 45 | - Home: index.md 46 | 47 | - Reference and move semantic: 48 | - Use references: prefer_references.md 49 | 50 | - Vectors are your best friend: 51 | - Use reserve: reserve.md 52 | - Avoid std::list : no_lists.md 53 | - You may not need std::map : dont_need_map.md 54 | - Small vector optimization: small_vectors.md 55 | 56 | - "It's just a string...": 57 | - Strings are vectors: strings_are_vectors.md 58 | - Small string optimization: small_strings.md 59 | - String concatenation: strings_concatenation.md 60 | 61 | - Don't compute it twice: 62 | - More efficient 2D transforms: 2d_transforms.md 63 | - Iterating over a 2D matrix: 2d_matrix_iteration.md 64 | 65 | - Fantastic data structures: 66 | - Boost flat_map to the rescue: boost_flatmap.md 67 | 68 | - Case studies: 69 | - Faster and simpler PCL filter: pcl_filter.md 70 | - More PCL optimizations: pcl_fromROS.md 71 | - Fast palindrome: palindrome.md 72 | 73 | - About me: about.md 74 | 75 | -------------------------------------------------------------------------------- /overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block extrahead %} 4 | 5 | 6 | 7 | {% endblock %} 8 | --------------------------------------------------------------------------------