├── Code ├── coroutine │ ├── main.cpp │ └── scheduler.hpp ├── lecture10 │ └── lecture10-threadpool.cpp ├── lecture11 │ ├── task-parallelism.cpp │ ├── taskflow-matmul.cpp │ └── taskflow-parallelism.cpp ├── lecture12 │ ├── relaxed.cpp │ ├── release_acquire.cpp │ └── release_consume.cpp ├── lecture14 │ └── parallel-for.cpp ├── lecture15 │ └── parallel-for-guided.cpp ├── lecture16 │ └── parallel-reduction.cpp ├── lecture17 │ └── parallel-transform.cpp ├── lecture18 │ └── cuda.cu ├── lecture19 │ └── cuda.cu ├── lecture21 │ └── cuda.cu ├── lecture22 │ └── find_if.cu ├── lecture23 │ └── find_if.cu ├── lecture24 │ ├── reduce-slow.cu │ └── reduce.cu ├── lecture3 │ └── lecture3.cpp ├── lecture4 │ └── lecture4.cpp ├── lecture5 │ ├── lecture5-async.cpp │ ├── lecture5-custom-async.cpp │ ├── lecture5-promise-refactored.cpp │ └── lecture5-promise.cpp ├── lecture6 │ └── lecture6-job-queue.cpp ├── lecture7 │ └── lecture7-job-queue-cv.cpp ├── lecture8 │ └── lecture8-job-queue-cv-bug-free.cpp └── lecture9 │ └── lecture9-threadpool.cpp ├── Lectures ├── coroutine.pdf ├── lecture1.pdf ├── lecture2.pdf └── lecture3.pdf └── README.md /Code/coroutine/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "scheduler.hpp" 3 | 4 | Task TaskA(Scheduler& sched) { 5 | std::cout << "Hello from TaskA\n"; 6 | co_await sched.suspend(); 7 | std::cout << "Executing the TaskA\n"; 8 | co_await sched.suspend(); 9 | std::cout << "TaskA is finished\n"; 10 | } 11 | 12 | Task TaskB(Scheduler& sched) { 13 | std::cout << "Hello from TaskB\n"; 14 | co_await sched.suspend(); 15 | std::cout << "Executing the TaskB\n"; 16 | co_await sched.suspend(); 17 | std::cout << "TaskB is finished\n"; 18 | } 19 | 20 | 21 | int main() { 22 | 23 | Scheduler sched; 24 | 25 | TaskA(sched); 26 | TaskB(sched); 27 | 28 | while(sched.schedule()) {}; 29 | } 30 | 31 | 32 | -------------------------------------------------------------------------------- /Code/coroutine/scheduler.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | struct Task { 5 | 6 | struct promise_type { 7 | std::suspend_never initial_suspend() noexcept { return {}; } 8 | std::suspend_never final_suspend() noexcept { return {}; } 9 | 10 | // ignore 11 | Task get_return_object() { return Task{}; } 12 | void unhandled_exception() {} 13 | }; 14 | }; 15 | 16 | 17 | class Scheduler { 18 | 19 | std::list> _tasks; 20 | 21 | 22 | public: 23 | bool schedule() { 24 | auto task = _tasks.front(); 25 | _tasks.pop_front(); 26 | if(!task.done()) { task.resume(); } 27 | 28 | return !_tasks.empty(); 29 | } 30 | 31 | auto suspend() { 32 | struct Awaiter: std::suspend_always { 33 | Scheduler& scheduler; 34 | Awaiter(Scheduler& sched): scheduler{sched} {} 35 | void await_suspend(std::coroutine_handle<> coro) { 36 | scheduler._tasks.push_back(coro); 37 | } 38 | }; 39 | 40 | return Awaiter{*this}; 41 | } 42 | }; 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /Code/lecture10/lecture10-threadpool.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | template 13 | struct MoC { 14 | 15 | MoC(T&& rhs) : object(std::move(rhs)) {} 16 | MoC(const MoC& other) : object(std::move(other.object)) {} 17 | 18 | T& get() { return object; } 19 | 20 | mutable T object; 21 | }; 22 | 23 | // ---------------------------------------------------------------------------- 24 | // Class definition for Threadpool 25 | // ---------------------------------------------------------------------------- 26 | 27 | class Threadpool { 28 | 29 | public: 30 | 31 | // constructor tasks a unsigned integer representing the number of 32 | // workers you need 33 | Threadpool(size_t N) { 34 | 35 | for(size_t i=0; i task; 40 | // my job is to iteratively grab a task from the queue 41 | { 42 | // Best practice: anything that happens inside the while continuation check 43 | // should always be protected by lock 44 | std::unique_lock lock(mtx); 45 | while(queue.empty() && !stop) { 46 | cv.wait(lock); 47 | } 48 | if(!queue.empty()) { 49 | task = queue.front(); 50 | queue.pop(); 51 | } 52 | } 53 | // and run the task... 54 | if(task) { 55 | task(); 56 | } 57 | } 58 | }); 59 | } 60 | } 61 | 62 | // destructor will release all threading resources by joining all of them 63 | ~Threadpool() { 64 | // I need to join the threads to release their resources 65 | for(auto& t : threads) { 66 | t.join(); 67 | } 68 | } 69 | 70 | // shutdown the threadpool 71 | void shutdown() { 72 | std::scoped_lock lock(mtx); 73 | stop = true; 74 | cv.notify_all(); 75 | } 76 | 77 | // insert a task "callable object" into the threadpool 78 | template 79 | auto insert(C&& task) { 80 | std::promise promise; 81 | auto fu = promise.get_future(); 82 | { 83 | std::scoped_lock lock(mtx); 84 | queue.push( 85 | [moc=MoC{std::move(promise)}, task=std::forward(task)] () mutable { 86 | task(); 87 | moc.object.set_value(); 88 | } 89 | ); 90 | } 91 | cv.notify_one(); 92 | return fu; 93 | } 94 | 95 | // insert a task "callable object" into the threadpool 96 | template 97 | auto insert_with_return(C&& task) { 98 | using R = std::result_of_t; 99 | std::promise promise; 100 | auto fu = promise.get_future(); 101 | { 102 | std::scoped_lock lock(mtx); 103 | queue.push( 104 | [moc=MoC{std::move(promise)}, task=std::forward(task)] () mutable { 105 | moc.object.set_value( 106 | task() 107 | ); 108 | } 109 | ); 110 | } 111 | cv.notify_one(); 112 | return fu; 113 | } 114 | 115 | // insert a task "callable object" into the threadpool using a generic 116 | // function wrapper (instead of a template argument) 117 | auto insert_2(std::function task) { 118 | 119 | std::promise promise; 120 | auto fu = promise.get_future(); 121 | 122 | { 123 | std::scoped_lock lock(mtx); 124 | queue.push( 125 | [moc=MoC{std::move(promise)}, task=std::move(task)] () mutable { 126 | task(); 127 | moc.object.set_value(); 128 | } 129 | ); 130 | } 131 | cv.notify_one(); 132 | 133 | return fu; 134 | } 135 | 136 | private: 137 | 138 | std::mutex mtx; 139 | std::vector threads; 140 | std::condition_variable cv; 141 | 142 | bool stop {false}; 143 | std::queue< std::function > queue; 144 | 145 | }; 146 | 147 | 148 | // ---------------------------------------------------------------------------- 149 | // application code 150 | // 151 | // perform parallel matrix multiplication 152 | // A * B = C 153 | // A is NxK 154 | // B is KxM 155 | // C is NxM 156 | // ---------------------------------------------------------------------------- 157 | 158 | void matmul_seq( 159 | size_t N, size_t K, size_t M, 160 | const std::vector& A, 161 | const std::vector& B, 162 | std::vector& C, 163 | Threadpool& threadpool 164 | ) { 165 | // seq version of matmul 166 | for(size_t i=0; i& A, 179 | const std::vector& B, 180 | std::vector& C, 181 | Threadpool& threadpool 182 | ) { 183 | 184 | std::vector> futures; 185 | 186 | // this version has a serious issue of false sharing 187 | //for(size_t i=0; i A(N*K, 1), B(K*M, 2), C(N*M, 0); 236 | 237 | // create a thread pool of the maximum hardware concurrency 238 | Threadpool threadpool(T); 239 | 240 | // run matrix multiplication in parallel 241 | auto beg = std::chrono::steady_clock::now(); 242 | matmul(N, K, M, A, B, C, threadpool); 243 | auto end = std::chrono::steady_clock::now(); 244 | 245 | std::cout << "Parallel AxB=C takes " 246 | << std::chrono::duration_cast(end-beg).count() 247 | << " us\n"; 248 | 249 | // run matrix multiplication in sequential 250 | beg = std::chrono::steady_clock::now(); 251 | matmul_seq(N, K, M, A, B, C, threadpool); 252 | end = std::chrono::steady_clock::now(); 253 | 254 | std::cout << "Sequential AxB=C takes " 255 | << std::chrono::duration_cast(end-beg).count() 256 | << " us\n"; 257 | 258 | // shut down the threadpool 259 | threadpool.shutdown(); 260 | 261 | 262 | return 0; 263 | } 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | -------------------------------------------------------------------------------- /Code/lecture11/task-parallelism.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | template 13 | struct MoC { 14 | 15 | MoC(T&& rhs) : object(std::move(rhs)) {} 16 | MoC(const MoC& other) : object(std::move(other.object)) {} 17 | 18 | T& get() { return object; } 19 | 20 | mutable T object; 21 | }; 22 | 23 | // ---------------------------------------------------------------------------- 24 | // Class definition for Threadpool 25 | // ---------------------------------------------------------------------------- 26 | 27 | class Threadpool { 28 | 29 | public: 30 | 31 | // constructor tasks a unsigned integer representing the number of 32 | // workers you need 33 | Threadpool(size_t N) { 34 | 35 | for(size_t i=0; i task; 40 | // my job is to iteratively grab a task from the queue 41 | { 42 | // Best practice: anything that happens inside the while continuation check 43 | // should always be protected by lock 44 | std::unique_lock lock(mtx); 45 | while(queue.empty() && !stop) { 46 | cv.wait(lock); 47 | } 48 | if(!queue.empty()) { 49 | task = queue.front(); 50 | queue.pop(); 51 | } 52 | } 53 | // and run the task... 54 | if(task) { 55 | task(); 56 | } 57 | } 58 | }); 59 | } 60 | } 61 | 62 | // destructor will release all threading resources by joining all of them 63 | ~Threadpool() { 64 | // I need to join the threads to release their resources 65 | for(auto& t : threads) { 66 | t.join(); 67 | } 68 | } 69 | 70 | // shutdown the threadpool 71 | void shutdown() { 72 | std::scoped_lock lock(mtx); 73 | stop = true; 74 | cv.notify_all(); 75 | } 76 | 77 | // insert a task "callable object" into the threadpool 78 | template 79 | auto insert(C&& task) { 80 | std::promise promise; 81 | auto fu = promise.get_future(); 82 | { 83 | std::scoped_lock lock(mtx); 84 | queue.push( 85 | [moc=MoC{std::move(promise)}, task=std::forward(task)] () mutable { 86 | task(); 87 | moc.object.set_value(); 88 | } 89 | ); 90 | } 91 | cv.notify_one(); 92 | return fu; 93 | } 94 | 95 | // insert a task "callable object" into the threadpool 96 | template 97 | auto insert_with_return(C&& task) { 98 | using R = std::result_of_t; 99 | std::promise promise; 100 | auto fu = promise.get_future(); 101 | { 102 | std::scoped_lock lock(mtx); 103 | queue.push( 104 | [moc=MoC{std::move(promise)}, task=std::forward(task)] () mutable { 105 | moc.object.set_value( 106 | task() 107 | ); 108 | } 109 | ); 110 | } 111 | cv.notify_one(); 112 | return fu; 113 | } 114 | 115 | private: 116 | 117 | std::mutex mtx; 118 | std::vector threads; 119 | std::condition_variable cv; 120 | 121 | bool stop {false}; 122 | std::queue< std::function > queue; 123 | 124 | }; 125 | 126 | int main(int argc, char* argv[]) { 127 | 128 | // create a thread pool of the maximum hardware concurrency 129 | Threadpool threadpool(4); 130 | 131 | // Do something parallel... 132 | // create a task dependency graph 133 | // A->B 134 | // A->C 135 | // B->D 136 | // C->D 137 | 138 | std::future fu_A = threadpool.insert([](){ 139 | std::cout << "running task A\n"; 140 | }); 141 | 142 | auto shared_fu_A = fu_A.share(); 143 | 144 | std::future fu_B = threadpool.insert([&](){ 145 | shared_fu_A.get(); 146 | std::cout << "running task B\n"; 147 | }); 148 | 149 | std::future fu_C = threadpool.insert([&](){ 150 | shared_fu_A.get(); 151 | std::cout << "running task C\n"; 152 | }); 153 | 154 | std::future fu_D = threadpool.insert([&](){ 155 | fu_B.get(); 156 | fu_C.get(); 157 | std::cout << "running task D\n"; 158 | }); 159 | 160 | // wait for all the four tasks to finish 161 | fu_D.get(); 162 | 163 | // shut down the threadpool 164 | threadpool.shutdown(); 165 | 166 | 167 | return 0; 168 | } 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /Code/lecture11/taskflow-matmul.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void matmul( 4 | size_t N, size_t K, size_t M, 5 | const std::vector& A, 6 | const std::vector& B, 7 | std::vector& C, 8 | tf::Executor& executor 9 | ) { 10 | 11 | tf::Taskflow taskflow; 12 | 13 | //for(size_t i=0; i A(N*K, 1), B(K*M, 2), C(N*M, 0); 61 | 62 | matmul(N, K, M, A, B, C, executor); 63 | } 64 | 65 | 66 | -------------------------------------------------------------------------------- /Code/lecture11/taskflow-parallelism.cpp: -------------------------------------------------------------------------------- 1 | #include // Taskflow is header-only 2 | 3 | int main(){ 4 | 5 | tf::Executor executor(10); 6 | tf::Taskflow taskflow; 7 | 8 | tf::Task A = taskflow.emplace([](){ std::cout << "Task A\n"; }); 9 | tf::Task B = taskflow.emplace([](){ std::cout << "Task B\n"; }); 10 | tf::Task C = taskflow.emplace([](){ std::cout << "Task C\n"; }); 11 | tf::Task D = taskflow.emplace([](){ std::cout << "Task D\n"; }); 12 | 13 | A.precede(B, C); 14 | B.precede(D); 15 | C.precede(D); 16 | 17 | auto fu = executor.run(taskflow); 18 | 19 | fu.get(); 20 | 21 | return 0; 22 | } 23 | 24 | -------------------------------------------------------------------------------- /Code/lecture12/relaxed.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | std::atomic cnt = {0}; 7 | 8 | // compiler can freely reorder your instructions to optimize performance 9 | 10 | // original program 11 | //int a = 1; 12 | //int b = 2; 13 | //int c = a +1; 14 | // 15 | //// compiler can optimize the instruction order to improve data locality 16 | //int b = 2; 17 | //int a = 1; 18 | //int c = a + 1; 19 | // 20 | //// with atomic operation ... (original program forces compiler NOT to reorder a below cnt++) 21 | //int a = 1; 22 | //cnt.fetch_add(1, std::memory_order_seq_cst); // disallow compiler to reorder instructions 23 | // // before and after cnt 24 | //int b = 2; 25 | //int c = a +1; 26 | // 27 | //// with atomic operation ... (original program forces compiler NOT to reorder a below cnt++) 28 | //int a = 1; 29 | //cnt.fetch_add(1, std::memory_order_relaxed); // allow compiler to reorder instruction before 30 | // // and after cnt 31 | //int b = 2; 32 | //int c = a +1; 33 | 34 | 35 | void f() 36 | { 37 | for (int n = 0; n < 1000; ++n) { 38 | cnt.fetch_add(1, std::memory_order_relaxed); 39 | } 40 | } 41 | 42 | int main() 43 | { 44 | std::vector v; 45 | for (int n = 0; n < 10; ++n) { 46 | v.emplace_back(f); 47 | } 48 | for (auto& t : v) { 49 | t.join(); 50 | } 51 | std::cout << "Final counter value is " << cnt << '\n'; 52 | } 53 | -------------------------------------------------------------------------------- /Code/lecture12/release_acquire.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | std::atomic ptr {nullptr}; 7 | int data {0}; 8 | 9 | void producer() 10 | { 11 | std::string* p = new std::string("Hello"); 12 | data = 42; 13 | ptr.store(p, std::memory_order_release); 14 | int a = 100; 15 | int b = 1000; 16 | int c = a + b; 17 | //data = 42; compiler cannot reorder this instruction after ptr=p 18 | } 19 | 20 | void consumer() 21 | { 22 | std::string* p2 {nullptr}; 23 | while (!p2) { 24 | p2 = ptr.load(std::memory_order_acquire); 25 | } 26 | assert(*p2 == "Hello"); 27 | assert(data == 42); // this may not be true, since compiler can 28 | // reorder "data = 42" 29 | } 30 | 31 | int main() 32 | { 33 | //producer(); 34 | //consumer(); 35 | 36 | std::thread t1(producer()); 37 | std::thread t2(consumer()); 38 | t1.join(); 39 | t2.join(); 40 | } 41 | -------------------------------------------------------------------------------- /Code/lecture12/release_consume.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | std::string* ptr; 7 | int data; 8 | 9 | void producer() 10 | { 11 | std::string* p = new std::string("Hello"); 12 | data = 42; 13 | ptr = p; 14 | } 15 | 16 | void consumer() 17 | { 18 | std::string* p2; 19 | while (!(p2 = ptr)) 20 | ; 21 | assert(*p2 == "Hello"); 22 | assert(data == 42); 23 | } 24 | 25 | int main() 26 | { 27 | producer(); 28 | consumer(); 29 | } 30 | -------------------------------------------------------------------------------- /Code/lecture14/parallel-for.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | template 13 | struct MoC { 14 | 15 | MoC(T&& rhs) : object(std::move(rhs)) {} 16 | MoC(const MoC& other) : object(std::move(other.object)) {} 17 | 18 | T& get() { return object; } 19 | 20 | mutable T object; 21 | }; 22 | 23 | // ---------------------------------------------------------------------------- 24 | // Class definition for Threadpool 25 | // ---------------------------------------------------------------------------- 26 | 27 | class Threadpool { 28 | 29 | public: 30 | 31 | // constructor tasks a unsigned integer representing the number of 32 | // workers you need 33 | Threadpool(size_t N) { 34 | 35 | for(size_t i=0; i task; 40 | // my job is to iteratively grab a task from the queue 41 | { 42 | // Best practice: anything that happens inside the while continuation check 43 | // should always be protected by lock 44 | std::unique_lock lock(mtx); 45 | while(queue.empty() && !stop) { 46 | cv.wait(lock); 47 | } 48 | if(!queue.empty()) { 49 | task = queue.front(); 50 | queue.pop(); 51 | } 52 | } 53 | // and run the task... 54 | if(task) { 55 | task(); 56 | } 57 | } 58 | }); 59 | } 60 | } 61 | 62 | // destructor will release all threading resources by joining all of them 63 | ~Threadpool() { 64 | // I need to join the threads to release their resources 65 | for(auto& t : threads) { 66 | t.join(); 67 | } 68 | } 69 | 70 | // shutdown the threadpool 71 | void shutdown() { 72 | std::scoped_lock lock(mtx); 73 | stop = true; 74 | cv.notify_all(); 75 | } 76 | 77 | // insert a task "callable object" into the threadpool 78 | template 79 | auto insert(C&& task) { 80 | std::promise promise; 81 | auto fu = promise.get_future(); 82 | { 83 | std::scoped_lock lock(mtx); 84 | queue.push( 85 | [moc=MoC{std::move(promise)}, task=std::forward(task)] () mutable { 86 | task(); 87 | moc.object.set_value(); 88 | } 89 | ); 90 | } 91 | cv.notify_one(); 92 | return fu; 93 | } 94 | 95 | template 96 | void for_each(Input beg, Input end, F func, size_t chunk_size = 1) { 97 | 98 | // the total number of elements in the range [beg, end) 99 | size_t N = std::distance(beg, end); 100 | 101 | std::vector> futures; 102 | std::atomic tokens {0}; 103 | 104 | for(size_t i=0; i threads; 127 | std::condition_variable cv; 128 | 129 | bool stop {false}; 130 | std::queue< std::function > queue; 131 | 132 | }; 133 | 134 | // seq version of for_each based on STL implementation 135 | void seq_for_each(std::vector& vec) { 136 | std::for_each(vec.begin(), vec.end(), [](int& element){ 137 | element = element * 10; 138 | }); 139 | } 140 | 141 | void par_for_each(std::vector& vec, Threadpool& threadpool) { 142 | threadpool.for_each(vec.begin(), vec.end(), [](int& element){ 143 | element = element * 10; 144 | }); 145 | } 146 | 147 | int main(int argc, char* argv[]) { 148 | 149 | // usage: ./a.out T N 150 | if(argc != 3) { 151 | std::cerr << "usage: ./a.out T N\n"; 152 | std::exit(EXIT_FAILURE); 153 | } 154 | 155 | size_t T = std::atoi(argv[1]); 156 | size_t N = std::atoi(argv[2]); 157 | 158 | // create a thread pool of the maximum hardware concurrency 159 | Threadpool threadpool(T); 160 | 161 | std::vector vec(N); 162 | for(auto i : vec) { 163 | i = ::rand(); 164 | } 165 | 166 | // run for_each sequentially 167 | std::cout << "running seq_for_each ... "; 168 | auto beg = std::chrono::steady_clock::now(); 169 | seq_for_each(vec); 170 | auto end = std::chrono::steady_clock::now(); 171 | std::cout << std::chrono::duration_cast(end-beg).count() 172 | << "us\n"; 173 | 174 | // run for_each parallely 175 | std::cout << "running par_for_each ... "; 176 | beg = std::chrono::steady_clock::now(); 177 | par_for_each(vec, threadpool); 178 | end = std::chrono::steady_clock::now(); 179 | std::cout << std::chrono::duration_cast(end-beg).count() 180 | << "us\n"; 181 | 182 | 183 | // shut down the threadpool 184 | threadpool.shutdown(); 185 | 186 | 187 | return 0; 188 | } 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | -------------------------------------------------------------------------------- /Code/lecture15/parallel-for-guided.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | template 13 | struct MoC { 14 | 15 | MoC(T&& rhs) : object(std::move(rhs)) {} 16 | MoC(const MoC& other) : object(std::move(other.object)) {} 17 | 18 | T& get() { return object; } 19 | 20 | mutable T object; 21 | }; 22 | 23 | // ---------------------------------------------------------------------------- 24 | // Class definition for Threadpool 25 | // ---------------------------------------------------------------------------- 26 | 27 | class Threadpool { 28 | 29 | public: 30 | 31 | // constructor tasks a unsigned integer representing the number of 32 | // workers you need 33 | Threadpool(size_t N) { 34 | 35 | for(size_t i=0; i task; 40 | // my job is to iteratively grab a task from the queue 41 | { 42 | // Best practice: anything that happens inside the while continuation check 43 | // should always be protected by lock 44 | std::unique_lock lock(mtx); 45 | while(queue.empty() && !stop) { 46 | cv.wait(lock); 47 | } 48 | if(!queue.empty()) { 49 | task = queue.front(); 50 | queue.pop(); 51 | } 52 | } 53 | // and run the task... 54 | if(task) { 55 | task(); 56 | } 57 | } 58 | }); 59 | } 60 | } 61 | 62 | // destructor will release all threading resources by joining all of them 63 | ~Threadpool() { 64 | // I need to join the threads to release their resources 65 | for(auto& t : threads) { 66 | t.join(); 67 | } 68 | } 69 | 70 | // shutdown the threadpool 71 | void shutdown() { 72 | std::scoped_lock lock(mtx); 73 | stop = true; 74 | cv.notify_all(); 75 | } 76 | 77 | // insert a task "callable object" into the threadpool 78 | template 79 | auto insert(C&& task) { 80 | std::promise promise; 81 | auto fu = promise.get_future(); 82 | { 83 | std::scoped_lock lock(mtx); 84 | queue.push( 85 | [moc=MoC{std::move(promise)}, task=std::forward(task)] () mutable { 86 | task(); 87 | moc.object.set_value(); 88 | } 89 | ); 90 | } 91 | cv.notify_one(); 92 | return fu; 93 | } 94 | 95 | template 96 | void for_each(Input beg, Input end, F func, size_t chunk_size = 1) { 97 | 98 | // the total number of elements in the range [beg, end) 99 | size_t N = std::distance(beg, end); 100 | 101 | std::vector> futures; 102 | std::atomic tokens {0}; 103 | 104 | for(size_t i=0; i 124 | void for_each_guided(Input beg, Input end, F func, size_t chunk_size = 1) { 125 | 126 | // the total number of elements in the range [beg, end) 127 | size_t N = std::distance(beg, end); 128 | 129 | std::vector> futures; 130 | std::atomic takens {0}; 131 | 132 | for(size_t i=0; i= N) { 145 | return; 146 | } 147 | size_t curr_e = std::min(N, curr_b + chunk_size); 148 | std::for_each(beg + curr_b, beg + curr_e, func); 149 | } 150 | } 151 | // coarse-grained scheduling 152 | else { 153 | size_t q = R * p; 154 | if(q < chunk_size) { 155 | q = chunk_size; 156 | } 157 | size_t curr_e = std::min(N, curr_b + q); 158 | if(takens.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed, 159 | std::memory_order_relaxed)) { 160 | std::for_each(beg + curr_b, beg + curr_e, func); 161 | curr_b = takens.load(std::memory_order_relaxed); 162 | } 163 | } 164 | } 165 | })); 166 | } 167 | 168 | // caller thread to wait for all W tasks finish (futures) 169 | for(auto & fu : futures) { 170 | fu.get(); 171 | } 172 | } 173 | 174 | private: 175 | 176 | std::mutex mtx; 177 | std::vector threads; 178 | std::condition_variable cv; 179 | 180 | bool stop {false}; 181 | std::queue< std::function > queue; 182 | 183 | }; 184 | 185 | // seq version of for_each based on STL implementation 186 | void seq_for_each(std::vector& vec) { 187 | std::for_each(vec.begin(), vec.end(), [](int& element){ 188 | element = element * 10; 189 | }); 190 | } 191 | 192 | void par_for_each(std::vector& vec, Threadpool& threadpool) { 193 | threadpool.for_each(vec.begin(), vec.end(), [](int& element){ 194 | element = element * 10; 195 | }); 196 | } 197 | 198 | int main(int argc, char* argv[]) { 199 | 200 | // usage: ./a.out T N 201 | if(argc != 3) { 202 | std::cerr << "usage: ./a.out T N\n"; 203 | std::exit(EXIT_FAILURE); 204 | } 205 | 206 | size_t T = std::atoi(argv[1]); 207 | size_t N = std::atoi(argv[2]); 208 | 209 | // create a thread pool of the maximum hardware concurrency 210 | Threadpool threadpool(T); 211 | 212 | std::vector vec(N); 213 | for(auto i : vec) { 214 | i = ::rand(); 215 | } 216 | 217 | // run for_each sequentially 218 | std::cout << "running seq_for_each ... "; 219 | auto beg = std::chrono::steady_clock::now(); 220 | seq_for_each(vec); 221 | auto end = std::chrono::steady_clock::now(); 222 | std::cout << std::chrono::duration_cast(end-beg).count() 223 | << "us\n"; 224 | 225 | // run for_each parallely 226 | std::cout << "running par_for_each ... "; 227 | beg = std::chrono::steady_clock::now(); 228 | par_for_each(vec, threadpool); 229 | end = std::chrono::steady_clock::now(); 230 | std::cout << std::chrono::duration_cast(end-beg).count() 231 | << "us\n"; 232 | 233 | 234 | // shut down the threadpool 235 | threadpool.shutdown(); 236 | 237 | 238 | return 0; 239 | } 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /Code/lecture16/parallel-reduction.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | template 14 | struct MoC { 15 | 16 | MoC(T&& rhs) : object(std::move(rhs)) {} 17 | MoC(const MoC& other) : object(std::move(other.object)) {} 18 | 19 | T& get() { return object; } 20 | 21 | mutable T object; 22 | }; 23 | 24 | // ---------------------------------------------------------------------------- 25 | // Class definition for Threadpool 26 | // ---------------------------------------------------------------------------- 27 | 28 | class Threadpool { 29 | 30 | public: 31 | 32 | // constructor tasks a unsigned integer representing the number of 33 | // workers you need 34 | Threadpool(size_t N) { 35 | 36 | for(size_t i=0; i task; 41 | // my job is to iteratively grab a task from the queue 42 | { 43 | // Best practice: anything that happens inside the while continuation check 44 | // should always be protected by lock 45 | std::unique_lock lock(mtx); 46 | while(queue.empty() && !stop) { 47 | cv.wait(lock); 48 | } 49 | if(!queue.empty()) { 50 | task = queue.front(); 51 | queue.pop(); 52 | } 53 | } 54 | // and run the task... 55 | if(task) { 56 | task(); 57 | } 58 | } 59 | }); 60 | } 61 | } 62 | 63 | // destructor will release all threading resources by joining all of them 64 | ~Threadpool() { 65 | // I need to join the threads to release their resources 66 | for(auto& t : threads) { 67 | t.join(); 68 | } 69 | } 70 | 71 | // shutdown the threadpool 72 | void shutdown() { 73 | std::scoped_lock lock(mtx); 74 | stop = true; 75 | cv.notify_all(); 76 | } 77 | 78 | // insert a task "callable object" into the threadpool 79 | template 80 | auto insert(C&& task) { 81 | std::promise promise; 82 | auto fu = promise.get_future(); 83 | { 84 | std::scoped_lock lock(mtx); 85 | queue.push( 86 | [moc=MoC{std::move(promise)}, task=std::forward(task)] () mutable { 87 | task(); 88 | moc.object.set_value(); 89 | } 90 | ); 91 | } 92 | cv.notify_one(); 93 | return fu; 94 | } 95 | 96 | template 97 | void for_each(Input beg, Input end, F func, size_t chunk_size = 1) { 98 | 99 | // the total number of elements in the range [beg, end) 100 | size_t N = std::distance(beg, end); 101 | 102 | std::vector> futures; 103 | std::atomic takens {0}; 104 | 105 | for(size_t i=0; i 125 | T reduce(Input beg, Input end, T init, F bop, size_t chunk_size = 2) { 126 | size_t N = std::distance(beg, end); 127 | 128 | std::vector> futures; 129 | std::atomic takens {0}; 130 | 131 | std::mutex mutex; 132 | 133 | for(size_t i=0; i= N) { 140 | return; 141 | } 142 | // corner case #2: only one element left 143 | if(N - curr_b == 1) { 144 | std::scoped_lock lock(mutex); 145 | init = bop(init, *(beg + curr_b)); 146 | return; 147 | } 148 | // perform a reduction on these two elements 149 | T temp = bop( *(beg+curr_b), *(beg+curr_b+1) ); 150 | curr_b = takens.fetch_add(chunk_size, std::memory_order_relaxed); 151 | while(curr_b < N) { 152 | size_t curr_e = std::min(N, curr_b + chunk_size); 153 | // run a sequential reduction to the range specified by beg + [curr_b, curr_e) 154 | temp = std::accumulate(beg + curr_b, beg + curr_e, temp, bop); 155 | // get the next chunk 156 | curr_b = takens.fetch_add(chunk_size, std::memory_order_relaxed); 157 | } 158 | // perform a final reduction on temp with init 159 | { 160 | std::scoped_lock lock(mutex); 161 | init = bop(init, temp); 162 | } 163 | })); 164 | } 165 | 166 | // caller thread to wait for all W tasks finish (futures) 167 | for(auto & fu : futures) { 168 | fu.get(); 169 | } 170 | 171 | return init; 172 | } 173 | 174 | private: 175 | 176 | std::mutex mtx; 177 | std::vector threads; 178 | std::condition_variable cv; 179 | 180 | bool stop {false}; 181 | std::queue< std::function > queue; 182 | 183 | }; 184 | 185 | // seq version of for_each based on STL implementation 186 | auto seq_reduce(std::vector& vec) { 187 | return std::accumulate(vec.begin(), vec.end(), 0, [](int a, int b){ return a + b; }); 188 | } 189 | 190 | auto par_reduce(std::vector& vec, Threadpool& threadpool) { 191 | return threadpool.reduce( 192 | vec.begin(), vec.end(), 0, [](int a, int b){ return a+b; }, 1024 193 | ); 194 | } 195 | 196 | int main(int argc, char* argv[]) { 197 | 198 | // usage: ./a.out T N 199 | if(argc != 3) { 200 | std::cerr << "usage: ./a.out T N\n"; 201 | std::exit(EXIT_FAILURE); 202 | } 203 | 204 | size_t T = std::atoi(argv[1]); 205 | size_t N = std::atoi(argv[2]); 206 | 207 | // create a thread pool of the maximum hardware concurrency 208 | Threadpool threadpool(T); 209 | 210 | std::vector vec(N); 211 | for(auto& i : vec) { 212 | i = 1; 213 | } 214 | 215 | // run reduce sequentially 216 | std::cout << "running seq_reduce ... "; 217 | auto beg = std::chrono::steady_clock::now(); 218 | auto res1 = seq_reduce(vec); 219 | auto end = std::chrono::steady_clock::now(); 220 | std::cout << std::chrono::duration_cast(end-beg).count() 221 | << "ns\n"; 222 | 223 | // run reduce parallely 224 | std::cout << "running par_for_each ... "; 225 | beg = std::chrono::steady_clock::now(); 226 | auto res2 = par_reduce(vec, threadpool); 227 | end = std::chrono::steady_clock::now(); 228 | std::cout << std::chrono::duration_cast(end-beg).count() 229 | << "ns\n"; 230 | 231 | // shut down the threadpool 232 | threadpool.shutdown(); 233 | 234 | 235 | return 0; 236 | } 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | -------------------------------------------------------------------------------- /Code/lecture17/parallel-transform.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | template 14 | struct MoC { 15 | 16 | MoC(T&& rhs) : object(std::move(rhs)) {} 17 | MoC(const MoC& other) : object(std::move(other.object)) {} 18 | 19 | T& get() { return object; } 20 | 21 | mutable T object; 22 | }; 23 | 24 | // ---------------------------------------------------------------------------- 25 | // Class definition for Threadpool 26 | // ---------------------------------------------------------------------------- 27 | 28 | class Threadpool { 29 | 30 | public: 31 | 32 | // constructor tasks a unsigned integer representing the number of 33 | // workers you need 34 | Threadpool(size_t N) { 35 | 36 | for(size_t i=0; i task; 41 | // my job is to iteratively grab a task from the queue 42 | { 43 | // Best practice: anything that happens inside the while continuation check 44 | // should always be protected by lock 45 | std::unique_lock lock(mtx); 46 | while(queue.empty() && !stop) { 47 | cv.wait(lock); 48 | } 49 | if(!queue.empty()) { 50 | task = queue.front(); 51 | queue.pop(); 52 | } 53 | } 54 | // and run the task... 55 | if(task) { 56 | task(); 57 | } 58 | } 59 | }); 60 | } 61 | } 62 | 63 | // destructor will release all threading resources by joining all of them 64 | ~Threadpool() { 65 | // I need to join the threads to release their resources 66 | for(auto& t : threads) { 67 | t.join(); 68 | } 69 | } 70 | 71 | // shutdown the threadpool 72 | void shutdown() { 73 | std::scoped_lock lock(mtx); 74 | stop = true; 75 | cv.notify_all(); 76 | } 77 | 78 | // insert a task "callable object" into the threadpool 79 | template 80 | auto insert(C&& task) { 81 | std::promise promise; 82 | auto fu = promise.get_future(); 83 | { 84 | std::scoped_lock lock(mtx); 85 | queue.push( 86 | [moc=MoC{std::move(promise)}, task=std::forward(task)] () mutable { 87 | task(); 88 | moc.object.set_value(); 89 | } 90 | ); 91 | } 92 | cv.notify_one(); 93 | return fu; 94 | } 95 | 96 | template 97 | void for_each(Input beg, Input end, F func, size_t chunk_size = 1) { 98 | 99 | // the total number of elements in the range [beg, end) 100 | size_t N = std::distance(beg, end); 101 | 102 | std::vector> futures; 103 | std::atomic takens {0}; 104 | 105 | for(size_t i=0; i 125 | void transform(SrcItr first1, SrcItr last1, DesItr first2, F uop, size_t chunk_size = 1) { 126 | 127 | // the total number of elements in the range [beg, end) 128 | size_t N = std::distance(first1, last1); 129 | 130 | std::vector> futures; 131 | std::atomic takens {0}; 132 | 133 | for(size_t i=0; i 153 | T reduce(Input beg, Input end, T init, F bop, size_t chunk_size = 2) { 154 | size_t N = std::distance(beg, end); 155 | 156 | std::vector> futures; 157 | std::atomic takens {0}; 158 | 159 | std::mutex mutex; 160 | 161 | for(size_t i=0; i= N) { 168 | return; 169 | } 170 | // corner case #2: only one element left 171 | if(N - curr_b == 1) { 172 | std::scoped_lock lock(mutex); 173 | init = bop(init, *(beg + curr_b)); 174 | return; 175 | } 176 | // perform a reduction on these two elements 177 | T temp = bop( *(beg+curr_b), *(beg+curr_b+1) ); 178 | curr_b = takens.fetch_add(chunk_size, std::memory_order_relaxed); 179 | while(curr_b < N) { 180 | size_t curr_e = std::min(N, curr_b + chunk_size); 181 | // run a sequential reduction to the range specified by beg + [curr_b, curr_e) 182 | temp = std::accumulate(beg + curr_b, beg + curr_e, temp, bop); 183 | // get the next chunk 184 | curr_b = takens.fetch_add(chunk_size, std::memory_order_relaxed); 185 | } 186 | // perform a final reduction on temp with init 187 | { 188 | std::scoped_lock lock(mutex); 189 | init = bop(init, temp); 190 | } 191 | })); 192 | } 193 | 194 | // caller thread to wait for all W tasks finish (futures) 195 | for(auto & fu : futures) { 196 | fu.get(); 197 | } 198 | 199 | return init; 200 | } 201 | 202 | template 203 | T transform_reduce(Input1 beg1, Input end1, Input2 beg2, T init, R rop, B top, size_t chunk_size = 2) { 204 | 205 | size_t N = std::distance(beg, end); 206 | 207 | std::vector> futures; 208 | std::atomic takens {0}; 209 | 210 | std::mutex mutex; 211 | 212 | for(size_t i=0; i= N) { 219 | return; 220 | } 221 | // corner case #2: only one element left 222 | if(N - curr_b == 1) { 223 | std::scoped_lock lock(mutex); 224 | init = rop(init, top(*(beg1 + curr_b), *(beg2 + curr_b))); 225 | return; 226 | } 227 | 228 | // perform a reduction on these two elements 229 | T temp = rop( 230 | top(*(beg1 + curr_b), *(beg2 + curr_b)), 231 | top(*(beg1 + curr_b+1), *(beg2 + curr_b + 1)) 232 | ); 233 | curr_b = takens.fetch_add(chunk_size, std::memory_order_relaxed); 234 | while(curr_b < N) { 235 | size_t curr_e = std::min(N, curr_b + chunk_size); 236 | // run a sequential reduction to the range specified by beg + [curr_b, curr_e) 237 | temp = std::transform_reduce( 238 | beg1 + curr_b, beg1 + curr_e, beg2 + curr_b, temp, rop, top 239 | ); 240 | // get the next chunk 241 | curr_b = takens.fetch_add(chunk_size, std::memory_order_relaxed); 242 | } 243 | // perform a final reduction on temp with init 244 | { 245 | std::scoped_lock lock(mutex); 246 | init = rop(init, temp); 247 | } 248 | })); 249 | } 250 | 251 | // caller thread to wait for all W tasks finish (futures) 252 | for(auto & fu : futures) { 253 | fu.get(); 254 | } 255 | 256 | return init; 257 | } 258 | 259 | private: 260 | 261 | std::mutex mtx; 262 | std::vector threads; 263 | std::condition_variable cv; 264 | 265 | bool stop {false}; 266 | std::queue< std::function > queue; 267 | 268 | }; 269 | 270 | // seq version of for_each based on STL implementation 271 | auto seq_reduce(std::vector& vec) { 272 | return std::accumulate(vec.begin(), vec.end(), 0, [](int a, int b){ return a + b; }); 273 | } 274 | 275 | auto par_reduce(std::vector& vec, Threadpool& threadpool) { 276 | return threadpool.reduce( 277 | vec.begin(), vec.end(), 0, [](int a, int b){ return a+b; }, 1024 278 | ); 279 | } 280 | 281 | auto seq_transform(std::vector& src, std::vector& des) { 282 | std::transform( 283 | src.begin(), src.end(), des.begin(), 284 | [](int v){ 285 | return std::to_string(v) + " is the result"; 286 | } 287 | ); 288 | } 289 | 290 | auto par_transform(std::vector& src, std::vector& des, Threadpool& threadpool) { 291 | threadpool.transform( 292 | src.begin(), src.end(), des.begin(), 293 | [](int v){ 294 | return std::to_string(v) + " is the result"; 295 | }, 296 | 1024 297 | ); 298 | } 299 | 300 | int main(int argc, char* argv[]) { 301 | 302 | // usage: ./a.out T N 303 | if(argc != 3) { 304 | std::cerr << "usage: ./a.out T N\n"; 305 | std::exit(EXIT_FAILURE); 306 | } 307 | 308 | size_t T = std::atoi(argv[1]); 309 | size_t N = std::atoi(argv[2]); 310 | 311 | // create a thread pool of the maximum hardware concurrency 312 | Threadpool threadpool(T); 313 | 314 | std::vector src(N); 315 | for(auto& i : src) { 316 | i = ::rand() % 10; 317 | } 318 | 319 | std::vector des(N); 320 | 321 | // run reduce sequentially 322 | std::cout << "running seq_transform ... "; 323 | auto beg = std::chrono::steady_clock::now(); 324 | seq_transform(src, des); 325 | auto end = std::chrono::steady_clock::now(); 326 | std::cout << std::chrono::duration_cast(end-beg).count() 327 | << "ns\n"; 328 | 329 | // run reduce parallely 330 | std::cout << "running par_transform ... "; 331 | beg = std::chrono::steady_clock::now(); 332 | par_transform(src, des, threadpool); 333 | end = std::chrono::steady_clock::now(); 334 | std::cout << std::chrono::duration_cast(end-beg).count() 335 | << "ns\n"; 336 | 337 | 338 | // shut down the threadpool 339 | threadpool.shutdown(); 340 | 341 | 342 | return 0; 343 | } 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | -------------------------------------------------------------------------------- /Code/lecture18/cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define K 8 6 | 7 | // assume we have an input range of 1024 iterations 8 | 9 | // Idea #1: each thread does only one iteration of work 10 | // This kerne is going to be run by multiple GPU threads 11 | __global__ void kernel1(int* array, unsigned N, int value) { 12 | // the global index in array of this thread in this block 13 | unsigned gid = blockDim.x * blockIdx.x + threadIdx.x; 14 | 15 | // if the global index is not out of boundary 16 | if(gid < N) { 17 | array[gid] = value; 18 | } 19 | } 20 | 21 | // Idea #2: each thread does K iterations of work 22 | __global__ void kernel2(int* array, unsigned N, int value) { 23 | // How do we get a correct gid for this thread, assuming 24 | // each thread does K iterations of the work 25 | 26 | // assume we know the range is starting at gid 27 | array[gid] = value; 28 | array[gid+1] = value; 29 | array[gid+2] = value; 30 | array[gid+3] = value; 31 | array[gid+4] = value; 32 | array[gid+5] = value; 33 | array[gid+6] = value; 34 | array[gid+7] = value; 35 | 36 | } 37 | 38 | int main() { 39 | 40 | printf("First CUDA program\n"); 41 | 42 | // Goal: use GPU to initialize every element in an input range 43 | // to -1 44 | 45 | unsigned N = 1024; 46 | unsigned block_size = 512; 47 | unsigned grid_size = (N + block_size - 1) / block_size; // ceil(N/block_size) 48 | 49 | // step 1: allocate a GPU global memory 50 | std::vector cpu(N, 0); 51 | int* gpu; 52 | cudaMalloc(&gpu, sizeof(int)*N); 53 | 54 | // step 2: copy data from cpu to gpu 55 | cudaMemcpy(gpu, cpu.data(), sizeof(int)*N, cudaMemcpyDefault); 56 | 57 | // step 3: invoke the kernel 58 | // Idea #1: each thread does one iteration of work 59 | //kernel1<<>>(gpu, N, -1); 60 | 61 | // Idea #2: each thread does K iterations of work 62 | // each block is going to do K*512 iterations of work 63 | // assuming K is 8, each block can do 4096 iterations (much larger than N) 64 | kernel2<<< 65 | (N + block_size*K -1) / (block_size*K), block_size 66 | >>>(gpu, N, -1); 67 | 68 | // step 4: copy data from gpu back to cpu 69 | cudaMemcpy(cpu.data(), gpu, sizeof(int)*N, cudaMemcpyDefault); 70 | 71 | // show the result 72 | for(unsigned i=0; i 2 | #include 3 | #include 4 | #include 5 | 6 | #define K 11 7 | 8 | // assume we have an input range of 1024 iterations 9 | 10 | // Idea #1: each thread does only one iteration of work 11 | // This kerne is going to be run by multiple GPU threads 12 | __global__ void kernel1(int* array, unsigned N, int value) { 13 | // the global index in array of this thread in this block 14 | unsigned gid = blockDim.x * blockIdx.x + threadIdx.x; 15 | 16 | //printf("thread %u from block %d\n", threadIdx.x, blockIdx.x); 17 | 18 | // if the global index is not out of boundary 19 | if(gid < N) { 20 | array[gid] = value; 21 | } 22 | } 23 | 24 | // Idea #2: each thread does K iterations of work 25 | __global__ void kernel2(int* array, unsigned N, int value) { 26 | // How do we get a correct gid for this thread, assuming 27 | // each thread does K iterations of the work 28 | 29 | // what kind of begining position this thread should use??? 30 | //unsigned gid = blockDim.x * blockIdx.x + threadIdx.x; 31 | 32 | // solution #1 33 | //unsigned gid = blockDim.x * blockIdx.x + threadIdx.x; 34 | //for(int i=gid*K; i([](int i){ std::cout << i << std::endl;}); 89 | 90 | printf("First CUDA program\n"); 91 | 92 | // Goal: use GPU to initialize every element in an input range 93 | // to -1 94 | 95 | unsigned N = 1024; 96 | unsigned block_size = 512; 97 | unsigned grid_size = (N + block_size - 1) / block_size; // ceil(N/block_size) 98 | 99 | int value = -1; 100 | 101 | // step 1: allocate a GPU global memory 102 | std::vector cpu(N, 0); 103 | int* gpu; 104 | cudaMalloc(&gpu, sizeof(int)*N); 105 | 106 | // step 2: copy data from cpu to gpu 107 | cudaMemcpy(gpu, cpu.data(), sizeof(int)*N, cudaMemcpyDefault); 108 | 109 | // step 3: invoke the kernel 110 | // Idea #1: each thread does one iteration of work 111 | //kernel1<<>>(gpu, N, value); 112 | 113 | // Idea #2: each thread does K iterations of work 114 | // each block is going to do K*512 iterations of work 115 | // assuming K is 8, each block can do 4096 iterations (much larger than N) 116 | kernel2<<< 117 | (N + block_size*K -1) / (block_size*K), block_size 118 | >>>(gpu, N, value); 119 | 120 | // step 4: copy data from gpu back to cpu 121 | cudaMemcpy(cpu.data(), gpu, sizeof(int)*N, cudaMemcpyDefault); 122 | 123 | // show the result 124 | for(unsigned i=0; i 2 | #include 3 | #include 4 | #include 5 | 6 | const size_t K = 11; 7 | 8 | // ---------------------------------------------------------------------------- 9 | 10 | template 11 | struct Iterate { 12 | template 13 | static __device__ void eval(F f){ 14 | f(i); 15 | Iterate::eval(f); 16 | } 17 | }; 18 | 19 | // partial template specialization for c++ template when valid is false 20 | template 21 | struct Iterate { 22 | template 23 | static __device__ void eval(F f) {} 24 | }; 25 | 26 | template 27 | __device__ void static_iterate(F&& func) { 28 | Iterate<0, end-begin>::eval(func); 29 | } 30 | 31 | // ---------------------------------------------------------------------------- 32 | 33 | 34 | // assume we have an input range of 1024 iterations 35 | 36 | // Idea #1: each thread does only one iteration of work 37 | // This kerne is going to be run by multiple GPU threads 38 | __global__ void kernel1(int* array, unsigned N, int value) { 39 | // the global index in array of this thread in this block 40 | unsigned gid = blockDim.x * blockIdx.x + threadIdx.x; 41 | 42 | //printf("thread %u from block %d\n", threadIdx.x, blockIdx.x); 43 | 44 | // if the global index is not out of boundary 45 | if(gid < N) { 46 | array[gid] = value; 47 | } 48 | } 49 | 50 | // Idea #2: each thread does K iterations of work 51 | template 52 | __global__ void kernel2(int* array, unsigned N, int value) { 53 | 54 | // begining element of this thread in this block 55 | auto beg = blockIdx.x * (K*blockDim.x) + threadIdx.x; 56 | 57 | // for-loop version 58 | //for(int k=0; k([=](int k){ 67 | auto gid = beg + k*blockDim.x; 68 | if(gid < N) { 69 | array[gid] = value; 70 | } 71 | }); 72 | } 73 | 74 | template 75 | __global__ void for_each( InputIt first, InputIt last, F f ) { 76 | 77 | unsigned N = last - first; 78 | 79 | // begining element of this thread in this block 80 | auto beg = blockIdx.x * (K*blockDim.x) + threadIdx.x; 81 | 82 | // unrolled version 83 | static_iterate<0, K>([=] __device__ (int k){ 84 | auto gid = beg + k*blockDim.x; 85 | if(gid < N) { 86 | f(*(first + gid)); 87 | } 88 | }); 89 | } 90 | 91 | template 92 | __global__ void transform( 93 | InputIt first1, InputIt last1, OutputIt d_first, F f 94 | ) { 95 | 96 | unsigned N = last1 - first1; 97 | 98 | // begining element of this thread in this block 99 | auto beg = blockIdx.x * (K*blockDim.x) + threadIdx.x; 100 | 101 | // unrolled version 102 | static_iterate<0, K>([=] __device__ (int k){ 103 | auto gid = beg + k*blockDim.x; 104 | if(gid < N) { 105 | *(d_first + gid) = f(*(first1 + gid)); 106 | } 107 | }); 108 | } 109 | 110 | int main(int argc, char* argv[]) { 111 | 112 | printf("CPU-based for_each algorithm implementation\n"); 113 | 114 | unsigned N = 1000000; 115 | unsigned block_size = 512; 116 | unsigned grid_size = (N + block_size*K - 1) / (block_size*K); // ceil(N/(block_size*K)) 117 | 118 | cudaStream_t s1, s2; 119 | cudaStreamCreate(&s1); 120 | cudaStreamCreate(&s2); 121 | 122 | int value = 2; 123 | 124 | // step 1: allocate a GPU global memory 125 | std::vector cpu(N, 1); 126 | int* gpu; 127 | cudaMallocAsync(&gpu, sizeof(int)*N, s1); 128 | 129 | // step 2: copy data from cpu to gpu 130 | cudaMemcpyAsync(gpu, cpu.data(), sizeof(int)*N, cudaMemcpyDefault, s1); 131 | 132 | // step 3: invoke the kernel 133 | for_each <<< grid_size, block_size, 0, s1 >>>( 134 | gpu, gpu + N, [=] __device__ (int& item) { item = value; } 135 | ); 136 | 137 | //unsigned grid_size1 = (N/2 + block_size*K - 1) / (block_size*K); // ceil(N/(block_size*K)) 138 | //transform <<< grid_size1, block_size, 0, s1 >>>( 139 | // gpu, gpu + N/2, gpu, [=] __device__ (int item) { return 2*item; } 140 | //); 141 | // 142 | //unsigned grid_size2 = (N - N/2 + block_size*K - 1) / (block_size*K); // ceil(N/(block_size*K)) 143 | //transform <<< grid_size2, block_size, 0, s2 >>>( 144 | // gpu + N/2, gpu + N, gpu + N/2, [=] __device__ (int item) { return 2*item; } 145 | //); 146 | 147 | // step 4: copy data from gpu back to cpu 148 | cudaMemcpyAsync(cpu.data(), gpu, sizeof(int)*N, cudaMemcpyDefault, s1); 149 | 150 | cudaFreeAsync(gpu, s1); 151 | 152 | cudaStreamSynchronize(s1); 153 | 154 | // show the result 155 | for(unsigned i=0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | const size_t K = 11; 8 | 9 | // ---------------------------------------------------------------------------- 10 | 11 | template 12 | struct Iterate { 13 | template 14 | static __device__ void eval(F f){ 15 | f(i); 16 | Iterate::eval(f); 17 | } 18 | }; 19 | 20 | // partial template specialization for c++ template when valid is false 21 | template 22 | struct Iterate { 23 | template 24 | static __device__ void eval(F f) {} 25 | }; 26 | 27 | template 28 | __device__ void static_iterate(F&& func) { 29 | Iterate<0, end-begin>::eval(func); 30 | } 31 | 32 | // single task : invokes only one GPU thread to run the given function 33 | template 34 | __global__ void single_task(F f) { 35 | f(); 36 | } 37 | 38 | // GPU-based implementation of std::find_if, but asynchronously 39 | // the input range [first, last) 40 | template 41 | __global__ void find_if(Input first, Input last, unsigned* idx, F predicate) { 42 | 43 | unsigned N = last - first; 44 | 45 | /* 46 | wrong implementation... becase syncthreads sync threads on a per-block basis 47 | if(threadIdx.x == 0 && blockIdx.x == 0) { 48 | *idx = N; 49 | } 50 | __syncthreads(); 51 | */ 52 | 53 | // begining element of this thread in this block 54 | unsigned beg = blockIdx.x * (K*blockDim.x) + threadIdx.x; 55 | 56 | // unrolled version 57 | static_iterate<0, K>([=] __device__ (int k){ 58 | unsigned gid = beg + k*blockDim.x; 59 | // now, this thread find the element 60 | if(gid < N && predicate(*(first + gid))){ 61 | // store this gid into idx atomically 62 | atomicMin(idx, gid); // multiple threads can execute this line of update 63 | } 64 | }); 65 | } 66 | 67 | 68 | // ---------------------------------------------------------------------------- 69 | 70 | int main(int argc, char* argv[]) { 71 | 72 | printf("CPU-based find_if algorithm implementation\n"); 73 | 74 | unsigned N = 1000000; 75 | unsigned block_size = 512; 76 | unsigned grid_size = (N + block_size*K - 1) / (block_size*K); // ceil(N/(block_size*K)) 77 | 78 | std::vector cpu(N, 1); 79 | cpu[6778 ] = 5; 80 | cpu[99999] = 5; 81 | 82 | // use std::find_if to find the first element that is equal to 5 83 | auto sol = std::find_if(cpu.begin(), cpu.end(), []( int item ){ return item == 5; }); 84 | 85 | std::cout << "*sol = " << *sol << std::endl; 86 | std::cout << "distance(cpu.begin(), sol) = " << std::distance(cpu.begin(), sol) << std::endl; 87 | 88 | // step 1: allocate a GPU global memory 89 | cudaStream_t stream; 90 | cudaStreamCreate(&stream); 91 | 92 | int* gpu; 93 | unsigned* idx; 94 | unsigned res; // result we are going to store and use in cpu 95 | 96 | cudaMallocAsync(&gpu, sizeof(int)*N, stream); 97 | cudaMallocAsync(&idx, sizeof(unsigned), stream); 98 | 99 | // step 2: copy the data from cpu to gpu 100 | cudaMemcpyAsync(gpu, cpu.data(), sizeof(int)*N, cudaMemcpyDefault, stream); 101 | 102 | // step 3: invoke the find_if kernel 103 | //*idx = N; // cann't dereference a GPU variable in CPU scope... (seg fault) 104 | single_task <<< 1, 1, 0, stream >>> ([=]__device__() { *idx = N; }); 105 | 106 | find_if <<< grid_size, block_size, 0, stream >>>( 107 | gpu, gpu+N, idx, [=] __device__ (int item) { return item == 5; } 108 | ); 109 | 110 | // step 4: copy the solution from gpu to cpu 111 | cudaMemcpyAsync(&res, idx, sizeof(unsigned), cudaMemcpyDefault, stream); 112 | 113 | // step 5: synchronize the execution to get the result 114 | cudaStreamSynchronize(stream); 115 | 116 | std::cout << "res = " << res << std::endl; 117 | 118 | // deallocate all the storage we have allocated 119 | cudaFreeAsync(gpu, stream); 120 | cudaStreamDestroy(stream); 121 | 122 | return 0; 123 | } 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /Code/lecture23/find_if.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | const size_t K = 11; 8 | 9 | // ---------------------------------------------------------------------------- 10 | 11 | template 12 | struct Iterate { 13 | template 14 | static __device__ void eval(F f){ 15 | f(i); 16 | Iterate::eval(f); 17 | } 18 | }; 19 | 20 | // partial template specialization for c++ template when valid is false 21 | template 22 | struct Iterate { 23 | template 24 | static __device__ void eval(F f) {} 25 | }; 26 | 27 | template 28 | __device__ void static_iterate(F&& func) { 29 | Iterate<0, end-begin>::eval(func); 30 | } 31 | 32 | // single task : invokes only one GPU thread to run the given function 33 | template 34 | __global__ void single_task(F f) { 35 | f(); 36 | } 37 | 38 | // GPU-based implementation of std::find_if, but asynchronously 39 | // the input range [first, last) 40 | template 41 | __global__ void find_if(Input first, Input last, unsigned* idx, F predicate) { 42 | 43 | unsigned N = last - first; 44 | __shared__ unsigned block_idx; 45 | 46 | // only ask the first thread of this block to initialize the shared memory variable 47 | if(threadIdx.x == 0) { 48 | block_idx = N; 49 | } 50 | __syncthreads(); 51 | 52 | // begining element of this thread in this block 53 | unsigned beg = blockIdx.x * (K*blockDim.x) + threadIdx.x; 54 | unsigned local_idx = N; 55 | 56 | // unrolled version 57 | static_iterate<0, K>([=, &local_idx] __device__ (int k){ 58 | unsigned gid = beg + k*blockDim.x; 59 | // now, this thread find the element 60 | if(gid < N && predicate(*(first + gid))){ 61 | // store this gid into idx atomically 62 | if(gid < local_idx) { 63 | local_idx = gid; 64 | } 65 | } 66 | }); 67 | 68 | // this atomic operation is MUCH faster than running atomic operations 69 | // on global memory (i.e., idx) 70 | atomicMin(&block_idx, local_idx); 71 | 72 | // synchronize all threads to ensure local_idx are valid 73 | __syncthreads(); 74 | 75 | // Only the first thread of each block will perform atomic min operation 76 | // on the global memory (i.e., idx) 77 | if(threadIdx.x == 0) { 78 | atomicMin(idx, block_idx); 79 | } 80 | } 81 | 82 | 83 | // ---------------------------------------------------------------------------- 84 | 85 | int main(int argc, char* argv[]) { 86 | 87 | printf("CPU-based find_if algorithm implementation\n"); 88 | 89 | unsigned N = 1000000; 90 | unsigned block_size = 512; 91 | unsigned grid_size = (N + block_size*K - 1) / (block_size*K); // ceil(N/(block_size*K)) 92 | 93 | std::vector cpu(N, 1); 94 | cpu[6778 ] = 5; 95 | cpu[99999] = 5; 96 | 97 | // use std::find_if to find the first element that is equal to 5 98 | auto sol = std::find_if(cpu.begin(), cpu.end(), []( int item ){ return item == 5; }); 99 | 100 | std::cout << "*sol = " << *sol << std::endl; 101 | std::cout << "distance(cpu.begin(), sol) = " << std::distance(cpu.begin(), sol) << std::endl; 102 | 103 | // step 1: allocate a GPU global memory 104 | cudaStream_t stream; 105 | cudaStreamCreate(&stream); 106 | 107 | int* gpu; 108 | unsigned* idx; 109 | unsigned res; // result we are going to store and use in cpu 110 | 111 | cudaMallocAsync(&gpu, sizeof(int)*N, stream); 112 | cudaMallocAsync(&idx, sizeof(unsigned), stream); 113 | 114 | // step 2: copy the data from cpu to gpu 115 | cudaMemcpyAsync(gpu, cpu.data(), sizeof(int)*N, cudaMemcpyDefault, stream); 116 | 117 | // step 3: invoke the find_if kernel 118 | //*idx = N; // cann't dereference a GPU variable in CPU scope... (seg fault) 119 | single_task <<< 1, 1, 0, stream >>> ([=]__device__() { *idx = N; }); 120 | 121 | find_if <<< grid_size, block_size, 0, stream >>>( 122 | gpu, gpu+N, idx, [=] __device__ (int item) { return item == 5; } 123 | ); 124 | 125 | // step 4: copy the solution from gpu to cpu 126 | cudaMemcpyAsync(&res, idx, sizeof(unsigned), cudaMemcpyDefault, stream); 127 | 128 | // step 5: synchronize the execution to get the result 129 | cudaStreamSynchronize(stream); 130 | 131 | std::cout << "res = " << res << std::endl; 132 | 133 | // deallocate all the storage we have allocated 134 | cudaFreeAsync(gpu, stream); 135 | cudaStreamDestroy(stream); 136 | 137 | return 0; 138 | } 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /Code/lecture24/reduce-slow.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | const size_t K = 11; 7 | 8 | // ---------------------------------------------------------------------------- 9 | 10 | template 11 | struct Iterate { 12 | template 13 | static __device__ void eval(F f){ 14 | f(i); 15 | Iterate::eval(f); 16 | } 17 | }; 18 | 19 | // partial template specialization for c++ template when valid is false 20 | template 21 | struct Iterate { 22 | template 23 | static __device__ void eval(F f) {} 24 | }; 25 | 26 | template 27 | __device__ void static_iterate(F&& func) { 28 | Iterate<0, end-begin>::eval(func); 29 | } 30 | 31 | // ---------------------------------------------------------------------------- 32 | 33 | template 34 | __global__ void single_task(F func) { 35 | func(); 36 | } 37 | 38 | template 39 | __global__ void reduce(Input first, Input last, T* init) { 40 | 41 | unsigned N = last - first; 42 | 43 | // assume we have a block of 512 threads 44 | __shared__ T shm[512]; 45 | 46 | // begining element of this thread in this block 47 | auto beg = blockIdx.x * (K*blockDim.x) + threadIdx.x; 48 | 49 | // only ask the first thread of this block to initialize the shared memory variable 50 | shm[threadIdx.x] = 0; 51 | __syncthreads(); 52 | 53 | T local_sum {0}; 54 | // unrolled version 55 | static_iterate<0, K>([=, &local_sum](int k){ 56 | auto gid = beg + k*blockDim.x; 57 | if(gid < N) { 58 | local_sum += *(gid + first); 59 | } 60 | }); 61 | shm[threadIdx.x] = local_sum; 62 | __syncthreads(); 63 | 64 | for(unsigned stride = blockDim.x / 2; stride > 0; stride /= 2) { 65 | if(threadIdx.x < stride) { 66 | shm[threadIdx.x] += shm[threadIdx.x + stride]; 67 | } 68 | __syncthreads(); 69 | } 70 | 71 | // Only the first thread of each block will perform atomic add to the init 72 | if(threadIdx.x == 0) { 73 | atomicAdd(init, shm[0]); 74 | } 75 | } 76 | 77 | int main(int argc, char* argv[]) { 78 | 79 | printf("CPU-based reduce algorithm implementation\n"); 80 | 81 | unsigned N = 1000000; 82 | unsigned block_size = 512; 83 | unsigned grid_size = (N + block_size*K - 1) / (block_size*K); // ceil(N/(block_size*K)) 84 | 85 | cudaStream_t stream; 86 | cudaStreamCreate(&stream); 87 | 88 | // step 1: allocate a GPU global memory 89 | std::vector cpu(N, 1); 90 | 91 | int* gpu; 92 | int* sum; 93 | int sol; 94 | cudaMallocAsync(&gpu, sizeof(int)*N, stream); 95 | cudaMallocAsync(&sum, sizeof(int), stream); 96 | 97 | // step 2: copy data from cpu to gpu 98 | cudaMemcpyAsync(gpu, cpu.data(), sizeof(int)*N, cudaMemcpyDefault, stream); 99 | 100 | single_task <<< 1, 1, 0, stream >>>([=] __device__ () { *sum = 1; }); 101 | 102 | reduce <<< grid_size, block_size, 0, stream >>>( 103 | gpu, gpu + N, sum 104 | ); 105 | 106 | // step 4: copy data from gpu back to cpu 107 | cudaMemcpyAsync(&sol, sum, sizeof(int), cudaMemcpyDefault, stream); 108 | 109 | cudaFreeAsync(gpu, stream); 110 | cudaFreeAsync(sum, stream); 111 | 112 | cudaStreamSynchronize(stream); 113 | 114 | cudaStreamDestroy(stream); 115 | 116 | cudaDeviceSynchronize(); 117 | 118 | // show the solution 119 | printf("%d\n", sol); 120 | 121 | return 0; 122 | } 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /Code/lecture24/reduce.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | const size_t K = 11; 9 | 10 | // ---------------------------------------------------------------------------- 11 | 12 | template 13 | struct Iterate { 14 | template 15 | static __device__ void eval(F f){ 16 | f(i); 17 | Iterate::eval(f); 18 | } 19 | }; 20 | 21 | // partial template specialization for c++ template when valid is false 22 | template 23 | struct Iterate { 24 | template 25 | static __device__ void eval(F f) {} 26 | }; 27 | 28 | template 29 | __device__ void static_iterate(F&& func) { 30 | Iterate<0, end-begin>::eval(func); 31 | } 32 | 33 | // single task : invokes only one GPU thread to run the given function 34 | template 35 | __global__ void single_task(F f) { 36 | f(); 37 | } 38 | 39 | // GPU-based implementation of std::accumulate 40 | template 41 | __global__ void reduce(Input first, Input last, T* sum) { 42 | 43 | unsigned N = last - first; 44 | 45 | // begining element of this thread in this block 46 | unsigned beg = blockIdx.x * (K*blockDim.x) + threadIdx.x; 47 | unsigned local_sum = 0; 48 | 49 | // unrolled version 50 | static_iterate<0, K>([=, &local_sum] __device__ (int k){ 51 | unsigned gid = beg + k*blockDim.x; 52 | // now, this thread find the element 53 | if(gid < N){ 54 | local_sum += *(gid + first); 55 | } 56 | }); 57 | 58 | // this atomic operation is MUCH faster than running atomic operations 59 | // on global memory (i.e., idx) 60 | atomicAdd(sum, local_sum); 61 | } 62 | 63 | // GPU-based implementation of std::accumulate 64 | template 65 | __global__ void reduce_shm(Input first, Input last, T* sum) { 66 | 67 | unsigned N = last - first; 68 | 69 | __shared__ T shm[512]; 70 | 71 | // begining element of this thread in this block 72 | unsigned beg = blockIdx.x * (K*blockDim.x) + threadIdx.x; 73 | 74 | // Initialize local and shared storage 75 | unsigned local_sum = 0; 76 | shm[threadIdx.x] = 0; 77 | 78 | __syncthreads(); 79 | 80 | // unrolled version 81 | static_iterate<0, K>([=, &local_sum] __device__ (int k){ 82 | unsigned gid = beg + k*blockDim.x; 83 | // now, this thread find the element 84 | if(gid < N){ 85 | local_sum += *(gid + first); 86 | } 87 | }); 88 | 89 | shm[threadIdx.x] = local_sum; 90 | 91 | __syncthreads(); 92 | 93 | for(unsigned s = blockDim.x / 2; s>0; s >>= 1) { 94 | if(threadIdx.x < s) { 95 | shm[threadIdx.x] += shm[threadIdx.x + s]; 96 | } 97 | __syncthreads(); 98 | } 99 | 100 | // this atomic operation is MUCH faster than running atomic operations 101 | // on global memory (i.e., idx) 102 | if(threadIdx.x == 0) { 103 | atomicAdd(sum, shm[0]); 104 | } 105 | } 106 | 107 | 108 | // ---------------------------------------------------------------------------- 109 | 110 | int main(int argc, char* argv[]) { 111 | 112 | printf("CPU-based reduction algorithm implementation\n"); 113 | 114 | unsigned N = 1000000; 115 | unsigned block_size = 512; 116 | unsigned grid_size = (N + block_size*K - 1) / (block_size*K); // ceil(N/(block_size*K)) 117 | 118 | std::vector cpu(N, 1); 119 | 120 | // use std::find_if to find the first element that is equal to 5 121 | auto sol = std::accumulate(cpu.begin(), cpu.end(), 0); 122 | 123 | // step 1: allocate a GPU global memory 124 | cudaStream_t stream; 125 | cudaStreamCreate(&stream); 126 | 127 | int* gpu; 128 | int* sum; 129 | int res; // result we are going to store and use in cpu 130 | 131 | cudaMallocAsync(&gpu, sizeof(int)*N, stream); 132 | cudaMallocAsync(&sum, sizeof(unsigned), stream); 133 | 134 | // step 2: copy the data from cpu to gpu 135 | cudaMemcpyAsync(gpu, cpu.data(), sizeof(int)*N, cudaMemcpyDefault, stream); 136 | 137 | // step 3: invoke the find_if kernel 138 | //*sum = N; // cann't dereference a GPU variable in CPU scope... (seg fault) 139 | single_task <<< 1, 1, 0, stream >>> ([=]__device__() { *sum = 0; }); 140 | 141 | reduce_shm <<< grid_size, block_size, 0, stream >>>( 142 | gpu, gpu+N, sum 143 | ); 144 | 145 | // step 4: copy the solution from gpu to cpu 146 | cudaMemcpyAsync(&res, sum, sizeof(unsigned), cudaMemcpyDefault, stream); 147 | 148 | // step 5: synchronize the execution to get the result 149 | cudaStreamSynchronize(stream); 150 | 151 | std::cout << "CPU sum = " << sol << '\n'; 152 | std::cout << "GPU sum = " << res << '\n'; 153 | 154 | // deallocate all the storage we have allocated 155 | cudaFreeAsync(gpu, stream); 156 | cudaStreamDestroy(stream); 157 | 158 | return 0; 159 | } 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /Code/lecture3/lecture3.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | void func(int& a) { 6 | //printf("executing func by thread %p\n", std::this_thread::get_id()); 7 | a = 1000; 8 | } 9 | 10 | int main(int argc, char* argv[]) { 11 | 12 | std::cout << "main thread id is: " << std::this_thread::get_id() << '\n'; 13 | 14 | if(argc != 2) { 15 | std::cerr << "usage: ./a.out N\n"; 16 | std::exit(EXIT_FAILURE); 17 | } 18 | 19 | int N = std::atoi(argv[1]); 20 | 21 | std::vector data(N); 22 | std::vector threads; 23 | 24 | for(int i=0; i 2 | #include 3 | #include 4 | #include 5 | 6 | int main(int argc, char* argv[]) { 7 | 8 | if(argc != 2) { 9 | std::cerr << "usage: ./a.out N\n"; 10 | std::exit(EXIT_FAILURE); 11 | } 12 | 13 | int N = std::atoi(argv[1]); 14 | int T = 4; // number of threads I am going to parallelize reduction 15 | int C = (N + T - 1)/T; // number of elements (chunk size) each thread is going to take 16 | // (N+T-1)/T => std::ceil((float)N/T); 17 | 18 | std::vector data(N), sums(T); 19 | std::vector threads; 20 | 21 | // initialize data to random numbers 22 | for(int i=0; i 23 34 | 35 | // 1st thread (t=0): [0, 1, 2, ...C) => chunk size is C 36 | // 2nd thread (t=1): [C, C+1, C+2, ...2C) => Chunk size is C 37 | // ... 38 | // in general, for a thread with id = t, its partition is indexed 39 | // by [t*C, ... std::min((t+1)*C, N)) 40 | for(int t=0; t( 68 | end_t - beg_t 69 | ).count(); 70 | 71 | printf("final reduction result: %d (%lu ns)\n", res, time); 72 | 73 | 74 | return 0; 75 | } 76 | -------------------------------------------------------------------------------- /Code/lecture5/lecture5-async.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | int main(int argc, char* argv[]) { 8 | 9 | if(argc != 2) { 10 | std::cerr << "usage: ./a.out N\n"; 11 | std::exit(EXIT_FAILURE); 12 | } 13 | 14 | int N = std::atoi(argv[1]); 15 | int T = 4; // number of threads I am going to parallelize reduction 16 | int C = (N + T - 1)/T; // number of elements (chunk size) each thread is going to take 17 | // (N+T-1)/T => std::ceil((float)N/T); 18 | 19 | std::vector> futures; 20 | std::vector data(N); 21 | 22 | // initialize data to random numbers 23 | for(int i=0; i 23 35 | 36 | // 1st thread (t=0): [0, 1, 2, ...C) => chunk size is C 37 | // 2nd thread (t=1): [C, C+1, C+2, ...2C) => Chunk size is C 38 | // ... 39 | // in general, for a thread with id = t, its partition is indexed 40 | // by [t*C, ... std::min((t+1)*C, N)) 41 | for(int t=0; t( 65 | end_t - beg_t 66 | ).count(); 67 | 68 | printf("final reduction result: %d (%lu ns)\n", res, time); 69 | 70 | 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- /Code/lecture5/lecture5-custom-async.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // custom async function to launch a function object with 8 | // a new thread, and return a future object for the caller 9 | // to access the result whenever it is available 10 | // 11 | // example: 12 | // 13 | // std::future fu = my_async([](){ return 10; }); 14 | 15 | // 16 | template 17 | auto my_async(F func) { 18 | using R = std::result_of_t; 19 | std::promise promise; 20 | auto fu = promise.get_future(); 21 | 22 | std::thread thread([p=std::move(promise), func](){ 23 | R ret = func(); 24 | promise.set_value(ret); 25 | }); 26 | 27 | return fu; 28 | } 29 | 30 | 31 | int main(int argc, char* argv[]) { 32 | 33 | if(argc != 2) { 34 | std::cerr << "usage: ./a.out N\n"; 35 | std::exit(EXIT_FAILURE); 36 | } 37 | 38 | int N = std::atoi(argv[1]); 39 | int T = 4; // number of threads I am going to parallelize reduction 40 | int C = (N + T - 1)/T; // number of elements (chunk size) each thread is going to take 41 | // (N+T-1)/T => std::ceil((float)N/T); 42 | 43 | std::vector data(N); 44 | std::vector threads; 45 | std::vector> futures; 46 | 47 | // initialize data to random numbers 48 | for(int i=0; i 23 60 | 61 | // 1st thread (t=0): [0, 1, 2, ...C) => chunk size is C 62 | // 2nd thread (t=1): [C, C+1, C+2, ...2C) => Chunk size is C 63 | // ... 64 | // in general, for a thread with id = t, its partition is indexed 65 | // by [t*C, ... std::min((t+1)*C, N)) 66 | for(int t=0; t promise; 69 | // we need to get the future from the promise object here!!!! 70 | futures.emplace_back(promise.get_future()); 71 | 72 | // here, we are transferring the ownership of the promise local variable 73 | // to the lambda function object 74 | threads.emplace_back( 75 | [t, &data, p=std::move(promise), N, C]() mutable { 76 | int beg = t*C; 77 | int end = std::min(beg+C, N); 78 | int sum = 0; 79 | for(int i=beg; i( 98 | end_t - beg_t 99 | ).count(); 100 | 101 | printf("final reduction result: %d (%lu ns)\n", res, time); 102 | 103 | // join all threads 104 | for(auto& t : threads){ // for(size_t i=0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | int main(int argc, char* argv[]) { 8 | 9 | if(argc != 2) { 10 | std::cerr << "usage: ./a.out N\n"; 11 | std::exit(EXIT_FAILURE); 12 | } 13 | 14 | int N = std::atoi(argv[1]); 15 | int T = 4; // number of threads I am going to parallelize reduction 16 | int C = (N + T - 1)/T; // number of elements (chunk size) each thread is going to take 17 | // (N+T-1)/T => std::ceil((float)N/T); 18 | 19 | std::vector data(N); 20 | std::vector threads; 21 | std::vector> futures; 22 | 23 | // initialize data to random numbers 24 | for(int i=0; i 23 36 | 37 | // 1st thread (t=0): [0, 1, 2, ...C) => chunk size is C 38 | // 2nd thread (t=1): [C, C+1, C+2, ...2C) => Chunk size is C 39 | // ... 40 | // in general, for a thread with id = t, its partition is indexed 41 | // by [t*C, ... std::min((t+1)*C, N)) 42 | for(int t=0; t promise; 45 | // we need to get the future from the promise object here!!!! 46 | futures.emplace_back(promise.get_future()); 47 | 48 | // here, we are transferring the ownership of the promise local variable 49 | // to the lambda function object 50 | threads.emplace_back( 51 | [t, &data, p=std::move(promise), N, C]() mutable { 52 | int beg = t*C; 53 | int end = std::min(beg+C, N); 54 | int sum = 0; 55 | for(int i=beg; i( 74 | end_t - beg_t 75 | ).count(); 76 | 77 | printf("final reduction result: %d (%lu ns)\n", res, time); 78 | 79 | // join all threads 80 | for(auto& t : threads){ // for(size_t i=0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | int main(int argc, char* argv[]) { 8 | 9 | if(argc != 2) { 10 | std::cerr << "usage: ./a.out N\n"; 11 | std::exit(EXIT_FAILURE); 12 | } 13 | 14 | int N = std::atoi(argv[1]); 15 | int T = 4; // number of threads I am going to parallelize reduction 16 | int C = (N + T - 1)/T; // number of elements (chunk size) each thread is going to take 17 | // (N+T-1)/T => std::ceil((float)N/T); 18 | 19 | std::vector data(N); 20 | std::vector threads; 21 | std::vector> futures; 22 | std::vector> promises(T); 23 | 24 | // initialize data to random numbers 25 | for(int i=0; i 23 37 | 38 | // 1st thread (t=0): [0, 1, 2, ...C) => chunk size is C 39 | // 2nd thread (t=1): [C, C+1, C+2, ...2C) => Chunk size is C 40 | // ... 41 | // in general, for a thread with id = t, its partition is indexed 42 | // by [t*C, ... std::min((t+1)*C, N)) 43 | for(int t=0; t( 69 | end_t - beg_t 70 | ).count(); 71 | 72 | printf("final reduction result: %d (%lu ns)\n", res, time); 73 | 74 | // join all threads 75 | for(auto& t : threads){ // for(size_t i=0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | template 11 | struct MoC { 12 | 13 | MoC(T&& rhs) : object(std::move(rhs)) {} 14 | MoC(const MoC& other) : object(std::move(other.object)) {} 15 | 16 | T& get() { return object; } 17 | 18 | mutable T object; 19 | }; 20 | 21 | int main(int argc, char* argv[]) { 22 | 23 | std::mutex mtx; 24 | std::vector threads; 25 | std::vector> promises(1000); 26 | std::vector> futures; 27 | 28 | // stop signal sent by the main thread 29 | //bool stop = false; 30 | std::atomic stop {false}; 31 | std::queue< std::function > queue; 32 | 33 | for(int i=0; i<4; i++) { 34 | threads.emplace_back([&mtx, &queue, &stop](){ 35 | // keep doing my job until the main thread sends a stop signal 36 | while(!stop) { 37 | std::function task; 38 | // my job is to iteratively grab a task from the queue 39 | mtx.lock(); 40 | if(queue.empty() == false) { 41 | task = queue.front(); 42 | queue.pop(); 43 | } 44 | mtx.unlock(); 45 | // and run the task... 46 | if(task) { 47 | task(); 48 | } 49 | } 50 | }); 51 | } 52 | 53 | // main thread insert 1000 tasks into the queue 54 | for(int i=0; i<1000; i++) { 55 | //futures.emplace_back(promises[i].get_future()); 56 | //mtx.lock(); 57 | //queue.push( 58 | // [i, &p=promises[i]] () mutable { 59 | // printf("task %d finished by thread %p\n", i, std::this_thread::get_id()); 60 | // p.set_value(); 61 | // } 62 | //); 63 | //mtx.unlock(); 64 | 65 | // std::function requires the target to be copyable, so 66 | // we cannot just use the plain std::promise which is move-only 67 | //auto promise = std::make_shared>(); 68 | //futures.emplace_back(promise->get_future()); 69 | //mtx.lock(); 70 | //queue.push( 71 | // [i, promise] () mutable { 72 | // printf("task %d finished by thread %p\n", i, std::this_thread::get_id()); 73 | // promise->set_value(); 74 | // } 75 | //); 76 | //mtx.unlock(); 77 | 78 | std::promise promise; 79 | futures.emplace_back(promise.get_future()); 80 | mtx.lock(); 81 | queue.push( 82 | [i, moc=MoC{std::move(promise)}] () mutable { 83 | printf("task %d finished by thread %p\n", i, std::this_thread::get_id()); 84 | moc.object.set_value(); 85 | } 86 | ); 87 | mtx.unlock(); 88 | } 89 | 90 | // TODO: how does the main thread know when the 1000 tasks finish 91 | // and send the stop signal (stop = true) 92 | // Solution: main thread wait until all futures become available, 93 | // i.e., the corresponding promises have been carried out 94 | // by four threads 95 | for(auto& fu : futures) { 96 | fu.get(); 97 | } 98 | 99 | // now, I know all the 1000 tasks finish, so I can stop the job queue 100 | stop = true; 101 | 102 | // I need to join the threads to release their resources 103 | for(auto& t : threads) t.join(); 104 | 105 | return 0; 106 | } 107 | 108 | 109 | -------------------------------------------------------------------------------- /Code/lecture7/lecture7-job-queue-cv.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | template 12 | struct MoC { 13 | 14 | MoC(T&& rhs) : object(std::move(rhs)) {} 15 | MoC(const MoC& other) : object(std::move(other.object)) {} 16 | 17 | T& get() { return object; } 18 | 19 | mutable T object; 20 | }; 21 | 22 | int main(int argc, char* argv[]) { 23 | 24 | std::mutex mtx; 25 | std::vector threads; 26 | std::vector> futures; 27 | std::condition_variable cv; 28 | 29 | // stop signal sent by the main thread 30 | std::atomic stop = false; 31 | std::queue< std::function > queue; 32 | 33 | for(int i=0; i<4; i++) { 34 | threads.emplace_back([&mtx, &cv, &queue, &stop](){ 35 | // keep doing my job until the main thread sends a stop signal 36 | while(!stop) { 37 | std::function task; 38 | // my job is to iteratively grab a task from the queue 39 | { 40 | // this version is pretty bad because it forces all threads 41 | // to stay in a busy loop of getting tasks from the queue ... 42 | // and... we know most of the time the queue is empty ... 43 | //std::scoped_lock lock(mtx); 44 | //if(queue.empty() == false) { 45 | // task = queue.front(); 46 | // queue.pop(); 47 | //} 48 | 49 | // 50 | std::unique_lock lock(mtx); 51 | while(queue.empty() && !stop) { 52 | 53 | // TODO: bug here... the thread may miss the notification 54 | 55 | cv.wait(lock); 56 | } 57 | if(!queue.empty()) { 58 | task = queue.front(); 59 | queue.pop(); 60 | } 61 | } 62 | // and run the task... 63 | if(task) { 64 | task(); 65 | } 66 | } 67 | }); 68 | } 69 | 70 | // main thread insert 1000 tasks into the queue 71 | for(int i=0; i<1000; i++) { 72 | std::promise promise; 73 | futures.emplace_back(promise.get_future()); 74 | { 75 | std::scoped_lock lock(mtx); 76 | queue.push( 77 | [i, moc=MoC{std::move(promise)}] () mutable { 78 | printf("task %d finished by thread %p\n", i, std::this_thread::get_id()); 79 | moc.object.set_value(); 80 | } 81 | ); 82 | } 83 | cv.notify_one(); 84 | 85 | // do something else... 86 | //std::this_thread::sleep_for(std::chrono::seconds(1)); 87 | } 88 | 89 | // Solution: main thread wait until all futures become available, 90 | // i.e., the corresponding promises have been carried out 91 | // by four threads 92 | for(auto& fu : futures) { 93 | fu.get(); 94 | } 95 | 96 | // now, I know all the 1000 tasks finish, so I can stop the job queue 97 | stop = true; 98 | cv.notify_all(); 99 | 100 | // I need to join the threads to release their resources 101 | for(auto& t : threads) t.join(); 102 | 103 | return 0; 104 | } 105 | 106 | 107 | -------------------------------------------------------------------------------- /Code/lecture8/lecture8-job-queue-cv-bug-free.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include