├── docs
│   ├── chart.png
│   └── matrix_equivalent_array.PNG
├── .gitignore
├── LICENSE
├── src
│   └── main.cpp
└── README.md

/docs/chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mtrebi/matrix-multiplication-threading/HEAD/docs/chart.png
--------------------------------------------------------------------------------
/docs/matrix_equivalent_array.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mtrebi/matrix-multiplication-threading/HEAD/docs/matrix_equivalent_array.PNG
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Build
#VS
.vs/
Debug/
Release/
# Netbeans
nbproject/
*.user
*.filters
*.vcxproj
*.sln

# Cmake
build
CMakeCache.txt
CMakeFiles
CMakeScripts
Makefile
cmake_install.cmake
install_manifest.txt
CTestTestfile.cmake

# Compiled Object files
*.slo
*.lo
*.o
*.obj
!assets/models/*/*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Mariano Trebino

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <random>
#include <functional>
#include <thread>
#include <windows.h>


static const long MATRIX_SIZE = 1000;
static const int THREADS_NUMBER = 4;
static const long N_EXECUTIONS = 1e3;

struct Matrix {
    float ** elements;

    void initialize_zero() {
        elements = new float*[MATRIX_SIZE];
        for (int i = 0; i < MATRIX_SIZE; ++i) {
            elements[i] = new float[MATRIX_SIZE];
            for (int j = 0; j < MATRIX_SIZE; ++j) {
                elements[i][j] = 0.0f;
            }
        }
    }

    void initialize_random() {
        std::random_device rd;
        std::mt19937 mt(rd());
        std::uniform_real_distribution<float> dist(-1e9, 1e9);
        auto random = std::bind(dist, mt);
        elements = new float*[MATRIX_SIZE];
        for (int i = 0; i < MATRIX_SIZE; ++i) {
            elements[i] = new float[MATRIX_SIZE];
            for (int j = 0; j < MATRIX_SIZE; ++j) {
                elements[i][j] = random();
            }
        }
    }

    void print() {
        std::cout << std::endl;
        for (int i = 0; i < MATRIX_SIZE; ++i) {
            std::cout << "|\t";

            for (int j = 0; j < MATRIX_SIZE; ++j) {
                std::cout << elements[i][j] << "\t";
            }
            std::cout << "|" << std::endl;
        }
    }

};

void multiply(Matrix& r, const Matrix& m1, const Matrix& m2);
void single_execution(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2);
void multithreading_execution(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2);
void multiply_threading(Matrix& result, const int thread_number, const Matrix& m1, const Matrix& m2);
void benchmark_execution(void(*execution_function)(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2));
long long milliseconds_now();

int main() {
    std::cout << "Single execution" << std::endl;
    benchmark_execution(single_execution);
    std::cout << "Multi thread execution" << std::endl;
    benchmark_execution(multithreading_execution);
    Sleep(100000);  // Keep the console window open for a while (Windows-specific)
    std::cout << "End of program" << std::endl;
}

void benchmark_execution(void(*execution_function)(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2)) {
    Matrix m1, m2, r;

    long long total_time = 0;
    for (int i = 0; i < N_EXECUTIONS; ++i) {
        long long elapsed_time = 0;
        m1.initialize_random();
        m2.initialize_random();
        r.initialize_zero();

        execution_function(r, elapsed_time, m1, m2);
        total_time += elapsed_time;
    }
    std::cout << "\tAverage execution took\t" << (double) total_time / N_EXECUTIONS << " ms" << std::endl;
}

void multiply(Matrix& r, const Matrix& m1, const Matrix& m2) {
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        for (int j = 0; j < MATRIX_SIZE; ++j) {
            float result = 0.0f;
            for (int k = 0; k < MATRIX_SIZE; ++k) {
                const float e1 = m1.elements[i][k];
                const float e2 = m2.elements[k][j];
                result += e1 * e2;
            }
            r.elements[i][j] = result;
        }
    }
}

void single_execution(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2) {
    //std::cout << "Starting single thread execution..." << std::endl;
    long long start_time = milliseconds_now();

    //std::cout << "Calculating...." << std::endl;
    multiply(r, m1, m2);

    long long end_time = milliseconds_now();
    //std::cout << "Finishing single thread execution..." << std::endl;

    elapsed_time = end_time - start_time;
}

void multiply_threading(Matrix& result, const int thread_number, const Matrix& m1, const Matrix& m2) {
    // Calculate workload
    const int n_elements = (MATRIX_SIZE * MATRIX_SIZE);
    const int n_operations = n_elements / THREADS_NUMBER;
    const int rest_operations = n_elements % THREADS_NUMBER;

    int start_op, end_op;

    if (thread_number == 0) {
        // The first thread also takes the remainder
        start_op = n_operations * thread_number;
        end_op = (n_operations * (thread_number + 1)) + rest_operations;
    }
    else {
        start_op = n_operations * thread_number + rest_operations;
        end_op = (n_operations * (thread_number + 1)) + rest_operations;
    }

    for (int op = start_op; op < end_op; ++op) {
        // Translate the 1-D operation index into a (row, col) cell of the result
        const int row = op % MATRIX_SIZE;
        const int col = op / MATRIX_SIZE;
        float r = 0.0f;
        for (int i = 0; i < MATRIX_SIZE; ++i) {
            const float e1 = m1.elements[row][i];
            const float e2 = m2.elements[i][col];
            r += e1 * e2;
        }

        result.elements[row][col] = r;
    }
}

void multithreading_execution(Matrix& r, long long& elapsed_time, const Matrix& m1, const Matrix& m2) {
    //std::cout << "Starting multithreading execution..." << std::endl;
    long long start_time = milliseconds_now();

    std::thread threads[THREADS_NUMBER];

    for (int i = 0; i < THREADS_NUMBER; ++i) {
        //std::cout << "Starting thread " << i << std::endl;
        threads[i] = std::thread(multiply_threading, std::ref(r), i, std::ref(m1), std::ref(m2));
    }

    //std::cout << "Calculating...." << std::endl;

    for (int i = 0; i < THREADS_NUMBER; ++i) {
        //std::cout << "Joining thread " << i << std::endl;
        threads[i].join();
    }

    long long end_time = milliseconds_now();
    //std::cout << "Finishing multithreading execution..." << std::endl;

    elapsed_time = end_time - start_time;
}

long long milliseconds_now() {
    // Wall-clock time in milliseconds, using the Windows high-resolution counter when available
    static LARGE_INTEGER s_frequency;
    static BOOL s_use_qpc = QueryPerformanceFrequency(&s_frequency);
    if (s_use_qpc) {
        LARGE_INTEGER now;
        QueryPerformanceCounter(&now);
        return (1000LL * now.QuadPart) / s_frequency.QuadPart;
    }
    else {
        return GetTickCount();
    }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Introduction

I decided to do this simple project in order to get used to the new **thread** class in C++11. The idea is to take two matrices and multiply them using different threads. I want to see how the implementation differs, the problems that may arise, and how the execution time scales with the number of threads and the size of the matrices.

# Matrix structure

First of all, let's introduce the main structure used in this program, the Matrix:

```cpp
struct Matrix {
    float ** elements;
    void initialize_random();
};
```

It's as easy as that. One thing to note here is that I am using a two-dimensional array of **pointers** instead of just floats. There is a reason for this, and it has to do with threads.

All threads in a program share the heap, BUT each thread has its own memory reserved for its **stack**. The stack is quite small, and its size cannot be changed from within standard C++11. Since we are working with quite big matrices (1000x1000), it is better to keep the data on the heap rather than on the stack. That is why I use a two-dimensional array of pointers: the storage is allocated with `new` at run time, so it lives on the heap and not on a thread's stack.
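
Since all of this memory comes from `new`, it also has to be released by hand. The code in `main.cpp` never frees these buffers (every benchmark iteration simply allocates fresh ones), so here is a minimal cleanup sketch that matches the allocation pattern used there; the `destroy` helper is my addition and is not part of the repository:

```cpp
// Sketch: free a Matrix allocated by initialize_zero() / initialize_random().
// This helper is an addition for illustration; it does not exist in main.cpp.
void destroy(Matrix& m) {
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        delete[] m.elements[i];   // free each row
    }
    delete[] m.elements;          // free the array of row pointers
    m.elements = nullptr;
}
```

In modern C++ a `std::vector<float>` (or a single flat heap allocation indexed manually) would take care of this automatically, but raw pointers keep the example close to the original code.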

# Single thread solution

This code should be really straightforward: it is just the standard matrix multiplication, so there is not much to explain:

```cpp
void multiply(Matrix& r, const Matrix& m1, const Matrix& m2) {
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        for (int j = 0; j < MATRIX_SIZE; ++j) {
            float result = 0.0f;
            for (int k = 0; k < MATRIX_SIZE; ++k) {
                const float e1 = m1.elements[i][k];
                const float e2 = m2.elements[k][j];
                result += e1 * e2;
            }
            r.elements[i][j] = result;
        }
    }
}
```

# Multi thread solution

The initialization of the threads is also quite easy:

```cpp
// Create an array of threads
std::thread threads[THREADS_NUMBER];

for (int i = 0; i < THREADS_NUMBER; ++i) {
    // Initialize each thread with the function responsible for multiplying only a part of the matrices
    threads[i] = std::thread(multiply_threading, r, i, m1, m2);
}

for (int i = 0; i < THREADS_NUMBER; ++i) {
    // Wait until each thread has finished
    threads[i].join();
}
```

First we create an array where we'll store our threads. Then we initialize each thread with the function to execute, **multiply_threading**, which has the following signature:

```cpp
void multiply_threading(Matrix& result, const int thread_number, const Matrix& m1, const Matrix& m2);
```

* The first parameter is the output matrix.
* The second parameter is the thread number (more on this later).
* The third and fourth parameters are the matrices to be multiplied.

When a thread is initialized with a function like this, all parameters are, by default, copied and passed by value, even though I declared that I want them by reference (in fact, with non-const `Matrix&` parameters a call like the one above typically does not even compile). Copying is a sensible default because it keeps threads from accidentally writing to the same memory, but I don't want my program to spend time copying huge matrices; I really do want to pass them by reference. To do so, we need to wrap each argument in **std::ref(parameter)**, like this:

```cpp
threads[i] = std::thread(multiply_threading, std::ref(r), i, std::ref(m1), std::ref(m2));
```

Now all parameters are passed by reference, and we save a huge amount of time by avoiding the copies. Nevertheless, since all threads now share the same data, we have to make sure that they don't write to the same memory addresses. To do that, I've implemented a simple scheme in which each thread works only on a specific part of the result matrix, so one and only one thread modifies each value. This is implemented in the previously declared function multiply_threading.
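
As a side note (my addition, not something the repository does): the same launch can be written with a lambda that captures the matrices by reference, which avoids `std::ref` entirely. A minimal sketch, assuming the same `multiply_threading`, `Matrix` and `THREADS_NUMBER` as in `main.cpp`:

```cpp
// Sketch: launching the workers with reference-capturing lambdas instead of std::ref.
std::thread threads[THREADS_NUMBER];

for (int i = 0; i < THREADS_NUMBER; ++i) {
    threads[i] = std::thread([&r, &m1, &m2, i]() {
        multiply_threading(r, i, m1, m2);   // r, m1 and m2 are captured by reference
    });
}

for (int i = 0; i < THREADS_NUMBER; ++i) {
    threads[i].join();
}
```

Both versions behave the same; `std::ref` simply makes the by-reference intent explicit at the call site.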

Before getting into the code of multiply_threading, I am going to explain the idea with a drawing, because that is how I picture it:

![A matrix and its equivalent one-dimensional array](docs/matrix_equivalent_array.PNG)

As you can see, the matrix can easily be translated into a one-dimensional array. The data is the same; the only thing that changes is how we represent the structure that stores it. This idea was useful for me to see how to split the work between the different threads:

```cpp
// Nº of elements of the matrix (= size of the one-dimensional array)
const int n_elements = (MATRIX_SIZE * MATRIX_SIZE);
// Nº of operations that each specific thread has to do
const int n_operations = n_elements / THREADS_NUMBER;
// Nº of operations that are left over and that someone still has to do
const int rest_operations = n_elements % THREADS_NUMBER;
```

Quite easy, huh? We're just calculating each thread's load: the total number of operations divided by the number of workers (threads) we have. Because this division may leave a remainder, we also have to take that into account.

So far, so good. We know the **amount** of work each thread has to do, and also the "extra work" (rest_operations). Now we need to know in which part of the matrix each thread should do its work, without any overlap (because of what we said earlier). This is where the **thread_number** comes in. The idea is very simple: the first thread does its share of the work starting at the beginning of the matrix, the second thread starts where the first one ends, and so on. In code it looks even easier:

```cpp
int start_op = n_operations * thread_number;       // Inclusive
int end_op = n_operations * (thread_number + 1);   // Exclusive
```

Instead of working directly on the matrix, we calculate the indices in the one-dimensional array, because it's easier. Suppose we have a 3x3 matrix and 3 threads:

* Thread 0:
  * start_op = 3 * 0 = 0
  * end_op = 3 * (0 + 1) = 3
* Thread 1:
  * start_op = 3 * 1 = 3
  * end_op = 3 * (1 + 1) = 6
* Thread 2:
  * start_op = 3 * 2 = 6
  * end_op = 3 * (2 + 1) = 9

See? The first thread computes the first three elements, the second thread the next three, and so on... But what do we do with the remainder of operations? Well, in reality the code doesn't look exactly like the snippet above; we have to add an if statement to handle this edge case:

```cpp
int start_op, end_op;

if (thread_number == 0) {
    // The first thread also takes the remainder
    start_op = n_operations * thread_number;
    end_op = (n_operations * (thread_number + 1)) + rest_operations;
}
else {
    start_op = n_operations * thread_number + rest_operations;
    end_op = (n_operations * (thread_number + 1)) + rest_operations;
}
```

I've decided to give the remainder of the jobs to the first thread; since it's the one that gets created first, this seemed the most sensible choice. I've simply added the rest operations to its end index, and then added the same rest operations as an offset to the start and end indices of all the other threads. Now, assume that our matrix is 4x4 but we only have 3 threads (a short sketch to reproduce these numbers follows the list):

* Thread 0:
  * start_op = 5 * 0 = 0
  * end_op = 5 * (0 + 1) + 1 = 6
* Thread 1:
  * start_op = 5 * 1 + 1 = 6
  * end_op = 5 * (1 + 1) + 1 = 11
* Thread 2:
  * start_op = 5 * 2 + 1 = 11
  * end_op = 5 * (2 + 1) + 1 = 16
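
These ranges are easy to double-check. The following standalone sketch (my addition, not part of the repository) prints each thread's `[start_op, end_op)` range using exactly the formulas above, with the constants set to the 4x4 / 3-thread example:

```cpp
#include <iostream>

// Sketch: reproduce the per-thread [start_op, end_op) ranges from the text.
int main() {
    const int MATRIX_SIZE = 4;       // values from the example above
    const int THREADS_NUMBER = 3;

    const int n_elements = MATRIX_SIZE * MATRIX_SIZE;
    const int n_operations = n_elements / THREADS_NUMBER;
    const int rest_operations = n_elements % THREADS_NUMBER;

    for (int thread_number = 0; thread_number < THREADS_NUMBER; ++thread_number) {
        int start_op, end_op;
        if (thread_number == 0) {
            start_op = n_operations * thread_number;
            end_op = n_operations * (thread_number + 1) + rest_operations;
        } else {
            start_op = n_operations * thread_number + rest_operations;
            end_op = n_operations * (thread_number + 1) + rest_operations;
        }
        std::cout << "Thread " << thread_number
                  << ": [" << start_op << ", " << end_op << ")\n";
    }
}
```

It prints `[0, 6)`, `[6, 11)` and `[11, 16)`: the 16 elements are covered exactly once, with no overlap between threads.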

Cool, we know what each thread has to do, so let's do the actual multiplication:

```cpp
for (int op = start_op; op < end_op; ++op) {
    // Translate the 1-D operation index into a (row, col) cell of the result
    const int row = op % MATRIX_SIZE;
    const int col = op / MATRIX_SIZE;
    float r = 0.0f;
    for (int i = 0; i < MATRIX_SIZE; ++i) {
        const float e1 = m1.elements[row][i];
        const float e2 = m2.elements[i][col];
        r += e1 * e2;
    }
    result.elements[row][col] = r;
}
```

It's very simple: we walk from start_op to end_op, convert each 1-D index back into a 2-D (row, col) position, and compute that element as the dot product of the corresponding row of m1 and column of m2. (With this particular mapping, consecutive operations move down a column of the result rather than along a row, but any one-to-one mapping works: each element is still written by exactly one thread.)

The complete function code is displayed below:

```cpp
void multiply_threading(Matrix& result, const int thread_number, const Matrix& m1, const Matrix& m2) {
    // Calculate workload
    const int n_elements = (MATRIX_SIZE * MATRIX_SIZE);
    const int n_operations = n_elements / THREADS_NUMBER;
    const int rest_operations = n_elements % THREADS_NUMBER;

    int start_op, end_op;

    if (thread_number == 0) {
        // The first thread also takes the remainder
        start_op = n_operations * thread_number;
        end_op = (n_operations * (thread_number + 1)) + rest_operations;
    }
    else {
        start_op = n_operations * thread_number + rest_operations;
        end_op = (n_operations * (thread_number + 1)) + rest_operations;
    }

    for (int op = start_op; op < end_op; ++op) {
        const int row = op % MATRIX_SIZE;
        const int col = op / MATRIX_SIZE;
        float r = 0.0f;
        for (int i = 0; i < MATRIX_SIZE; ++i) {
            const float e1 = m1.elements[row][i];
            const float e2 = m2.elements[i][col];
            r += e1 * e2;
        }

        result.elements[row][col] = r;
    }
}
```

# Benchmarking

To measure the execution time of the two methods, I executed 1,000 multiplications with each of them and then calculated the average execution time.
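
The time itself is measured in `main.cpp` with the Windows-specific `QueryPerformanceCounter` (falling back to `GetTickCount`). As a portable alternative (my addition, not what the repository uses), the same `milliseconds_now()` helper could be written with `std::chrono`:

```cpp
#include <chrono>

// Sketch: a portable replacement for milliseconds_now() based on std::chrono.
long long milliseconds_now() {
    using namespace std::chrono;
    return duration_cast<milliseconds>(steady_clock::now().time_since_epoch()).count();
}
```

Because the signature is unchanged, the rest of the benchmarking code would work as-is, and `std::chrono::steady_clock` is monotonic, which is what you want when measuring durations.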

![Average execution time: single thread vs. multiple threads](docs/chart.png)

At the beginning of the chart, the single-threaded execution in the main thread is more efficient because the workload is very small. The multi-threaded execution is comparatively slow because, even though the threads work in parallel, the workload is so small that it doesn't compensate for the overhead of creating, initializing and joining the threads.

Nevertheless, as the workload increases (the matrices get bigger), the multi-threaded option gets better and better. This is, obviously, because more and more work can be performed in parallel, and the threading overhead becomes very small in comparison to the calculation time.

# References

C++ Multithreading, https://www.tutorialcup.com/cplusplus/multithreading.htm

MULTI-THREADED PROGRAMMING TERMINOLOGY in C++ - 2017, http://www.bogotobogo.com/cplusplus/multithreaded.php

--------------------------------------------------------------------------------