├── README.md
├── chapter10
│   ├── axpy
│   │   ├── Makefile
│   │   └── axpy.cxx
│   ├── hello_world
│   │   ├── Makefile
│   │   └── hello_world.cxx
│   ├── histogram
│   │   ├── Makefile
│   │   ├── histo1.cxx
│   │   ├── histo2.cxx
│   │   └── histo3.cxx
│   ├── letter
│   │   ├── Makefile
│   │   ├── letter1.cxx
│   │   └── letter2.cxx
│   ├── mandelbrot
│   │   ├── Makefile
│   │   ├── mandel1.cxx
│   │   ├── mandel2.cxx
│   │   └── view.py
│   └── matrix_vector
│       ├── Makefile
│       └── matrix_vector.cxx
├── chapter3
│   ├── AVX
│   │   ├── Makefile
│   │   ├── matrix_matrix_mult.cpp
│   │   ├── pointwise_vector_max.cpp
│   │   ├── vector_max.cpp
│   │   ├── vector_norm_aos_avx.cpp
│   │   ├── vector_norm_aos_plain.cpp
│   │   ├── vector_norm_soa_avx.cpp
│   │   └── vector_norm_soa_plain.cpp
│   ├── include
│   └── matrix_matrix_mult_transposed
│       ├── Makefile
│       └── matrix_mult.cpp
├── chapter4
│   ├── all_pairs_distance_matrix
│   │   ├── Makefile
│   │   ├── all_pair.cpp
│   │   └── data
│   │       └── mnist_exporter.py
│   ├── condition_variables
│   │   ├── Makefile
│   │   ├── alarm_clock.cpp
│   │   ├── one_shot_alarm_clock.cpp
│   │   └── ping_pong.cpp
│   ├── false_sharing
│   │   ├── Makefile
│   │   └── false_sharing.cpp
│   ├── hello_world
│   │   ├── Makefile
│   │   └── hello_world.cpp
│   ├── include
│   ├── matrix_vector_mult
│   │   ├── Makefile
│   │   └── matrix_vector.cpp
│   ├── return_values
│   │   ├── Makefile
│   │   ├── async.cpp
│   │   ├── packaged_task.cpp
│   │   ├── promise_future.cpp
│   │   └── traditional.cpp
│   └── thread_pool
│       ├── Makefile
│       ├── main_basic.cpp
│       ├── main_basic_tree.cpp
│       └── threadpool_basic.hpp
├── chapter5
│   ├── atomics
│   │   ├── Makefile
│   │   ├── arbitrary_atomics.cpp
│   │   ├── atomic_counting.cpp
│   │   ├── atomic_max.cpp
│   │   ├── query_atomics.cpp
│   │   └── universal_atomics.cpp
│   ├── include
│   ├── knapsack
│   │   ├── Makefile
│   │   ├── knapsack.cpp
│   │   └── threadpool.hpp
│   └── thread_pool
│       ├── Makefile
│       ├── threadpool.hpp
│       └── tree.cpp
├── chapter6
│   ├── 1NN_classification
│   │   ├── 1NN.cpp
│   │   ├── Makefile
│   │   └── data
│   │       └── mnist_exporter.py
│   ├── advanced_reductions
│   │   ├── Makefile
│   │   ├── avx_reduction.cpp
│   │   ├── custom_reduction.cpp
│   │   └── string_reduction.cpp
│   ├── hello_world
│   │   ├── Makefile
│   │   └── hello_world.cpp
│   ├── include
│   ├── load_imbalance
│   │   ├── Makefile
│   │   ├── data
│   │   └── scheduling.cpp
│   ├── matrix_vector
│   │   ├── Makefile
│   │   └── matrix_vector.cpp
│   ├── softmax_regression
│   │   ├── Makefile
│   │   ├── data
│   │   │   └── mnist_softmax.py
│   │   └── softmax.cpp
│   └── vector_add
│       ├── Makefile
│       ├── vector_add.cpp
│       └── vector_add_scoped.cpp
├── chapter7
│   ├── dynamic_time_warping
│   │   ├── Makefile
│   │   ├── dtw_device.cu
│   │   └── dtw_host.cu
│   ├── eigenfaces
│   │   ├── Makefile
│   │   ├── covariance.cu
│   │   ├── data
│   │   │   ├── convert_images.py
│   │   │   └── img_align_celeba
│   │   │       └── README.md
│   │   ├── eigenfaces.cu
│   │   ├── mean_computation.cu
│   │   └── mean_correction.cu
│   ├── hello_world
│   │   ├── Makefile
│   │   └── hello_world.cu
│   └── include
├── chapter8
│   ├── include
│   ├── intrinsics_and_atomics
│   │   ├── Makefile
│   │   ├── atomics.cu
│   │   ├── cas.cu
│   │   └── znorm.cu
│   ├── multi_gpu
│   │   ├── Makefile
│   │   ├── multi_gpu.cu
│   │   ├── multi_streamed_gpu.cu
│   │   ├── single_gpu.cu
│   │   └── streamed_gpu.cu
│   └── uvm
│       ├── Makefile
│       └── uvm_minimal_example.cu
├── chapter9
│   ├── hello_world
│   │   ├── Makefile
│   │   └── hello_world.cpp
│   ├── jacobi_iteration
│   │   ├── Makefile
│   │   ├── jacobi_1D_block.cpp
│   │   ├── jacobi_1D_block_simple.cpp
│   │   ├── jacobi_1D_nonblock.cpp
│   │   └── jacobi_seq.cpp
│   ├── matrix_matrix_mult
│   │   ├── Makefile
│   │   ├── matrix_mult_2D.cpp
│   │   ├── matrix_mult_cols.cpp
│   │   ├── matrix_mult_rows.cpp
│   │   └── summa.cpp
│   ├── ping_pong
│   │   ├── Makefile
│   │   ├── ping_pong_ring.cpp
│   │   └── ping_pong_ring_nonblock.cpp
│   └── primes
│       ├── Makefile
│       ├── primes.cpp
│       └── primes_serialized_comm.cpp
└── include
    ├── binary_IO.hpp
    ├── bitmap_IO.hpp
    ├── cbf_generator.hpp
    ├── hpc_helpers.hpp
    └── svd.hpp

/README.md:
--------------------------------------------------------------------------------
# parallelprogrammingbook
supplementary material/programming exercises
--------------------------------------------------------------------------------
/chapter10/axpy/Makefile:
--------------------------------------------------------------------------------
UPCXXHOME= /opt/upcxx/
UPCXX= $(UPCXXHOME)/bin/upc++
UPCXXINC= $(UPCXXHOME)/include/upcxx/
UPCXXFLAGS= -O2 -std=c++11 -DGASNET_SEQ -DUSE_GASNET_FAST_SEGMENT -DONLY_MSPACES
GASNETRUN= /opt/gasnet/bin/gasnetrun_mpi -n 4 # install gasnet and choose backend

all: axpy

axpy: axpy.cxx
	$(UPCXX) $(UPCXXFLAGS) axpy.cxx -o axpy -I $(UPCXXINC)

axpy_run: axpy
	$(GASNETRUN) axpy 4 1

clean:
	rm -rf axpy
--------------------------------------------------------------------------------
/chapter10/axpy/axpy.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h> // header name assumed (old UPC++ API)
#include <cstdio>  // FILE, fopen, fwrite

void printOutput(int n, upcxx::shared_array<float> data){
    FILE *fp = fopen("outAXPY.txt", "wb");
    // Check if the file was opened
    if(fp == NULL){
        std::cout << "ERROR: Output file outAXPY.txt could not be opened" << std::endl;
        exit(1);
    }

    float aux;
    for(int i=0; i
...
    upcxx::shared_array<float> x(n);
    upcxx::shared_array<float> y(n);

    // To measure time
    upcxx::timer t;
    upcxx::barrier();
    t.start();

    // Example accessing memory without affinity
    // Initialize arrays
    for(int i=myId; i
...
--------------------------------------------------------------------------------
/chapter10/hello_world/hello_world.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

int main (int argc, char *argv[]){
    // Initialize UPC++
    upcxx::init(&argc, &argv);

    // Every process prints Hello
    std::cout << "Thread " << upcxx::myrank() << " of " << upcxx::ranks() << ": Hello, world!"
              << std::endl;

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/histogram/Makefile:
--------------------------------------------------------------------------------
UPCXX= /opt/upcxx/bin/upc++ # install UPC++ and specify binary
UPCXXFLAGS= -O2 -std=c++11 -DGASNET_SEQ -DUSE_GASNET_FAST_SEGMENT -DONLY_MSPACES
GASNETRUN= /opt/gasnet/bin/gasnetrun_mpi -n 4 # install gasnet and choose backend

all: histo1 histo2 histo3

histo1: histo1.cxx
	$(UPCXX) $(UPCXXFLAGS) histo1.cxx -o histo1

histo1_run: histo1
	$(GASNETRUN) histo1 16 16

histo2: histo2.cxx
	$(UPCXX) $(UPCXXFLAGS) histo2.cxx -o histo2

histo2_run: histo2
	$(GASNETRUN) histo2 16 16

histo3: histo3.cxx
	$(UPCXX) $(UPCXXFLAGS) histo3.cxx -o histo3

histo3_run: histo3
	$(GASNETRUN) histo3 16 16

clean:
	rm -rf histo1
	rm -rf histo2
	rm -rf histo3
--------------------------------------------------------------------------------
/chapter10/histogram/histo1.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

void readImage(int rows, int cols, int *image){

    for(int i=0; i
...
void printHistogram(upcxx::shared_array<int> h){

    for(int i=0; i<256; i++)
        std::cout << h[i] << " ";

    std::cout << std::endl;
}

upcxx::shared_lock l;

int main (int argc, char *argv[]){
    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(argc < 3){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is "
                << argv[0] << " rows cols" << std::endl;
        }
        exit(1);
    }

    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);

    if(rows < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be higher than 0" << std::endl;
        exit(1);
    }

    if(cols < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'cols' must be higher than 0" << std::endl;
        exit(1);
    }

    if(rows%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be a multiple of the number of processes" << std::endl;
        exit(1);
    }

    // Create the array of global pointers
    upcxx::shared_array<upcxx::global_ptr<int>> p(numT);

    // Each thread allocates the memory of its subspace
    int blockRows = rows/numT;
    p[myId] = upcxx::allocate(myId, blockRows*cols*sizeof(int));

    // Thread 0 reads the image and copies the fragments
    if(!myId){
        int *block = new int[blockRows*cols];
        int *block2 = new int[blockRows*cols];
        upcxx::event e;

        readImage(blockRows, cols, block);

        for(int i=0; i<numT-1; i++){
            upcxx::async_copy<int>(block, p[i], blockRows*cols, &e);

            // Overlap the copy with reading the next fragment
            // We cannot use "block" for the next fragment because it has not been sent
            readImage(blockRows, cols, block2);

            // The previous copy must have finished to reuse its buffer
            e.wait();
            int *aux = block;
            block = block2;
            block2 = aux;
        }

        // The last copy does not overlap
        upcxx::copy(block, p[numT-1], blockRows*cols);

        delete [] block;
        delete [] block2;
    }

    // Threads must wait until Thread 0 has copied the fragments of the image
    upcxx::barrier();

    // Privatize the pointer
    int *myImage = (int *) (upcxx::global_ptr<int>) p[myId];

    // Check whether it is really local
    if(!((upcxx::global_ptr<int>) p[myId]).is_local())
        std::cout << "Thread " << myId << " not accessing local memory" << std::endl;

    // Declare the histogram
    upcxx::shared_array<int> histogram(256);
    for(int i=myId; i<256; i+=numT)
        histogram[i] = 0;

    // Threads must wait until all locks and histogram have been initialized
    upcxx::barrier();

    // Examine the local image
    for(int i=0; i
...
    upcxx::deallocate<int>(p[myId]);

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/histogram/histo2.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

void readImage(int rows, int cols, int *image){

    for(int i=0; i
...
void printHistogram(upcxx::shared_array<int> h){

    for(int i=0; i<256; i++)
        std::cout << h[i] << " ";

    std::cout << std::endl;
}

upcxx::shared_array<upcxx::shared_lock> locks;

int main (int argc, char *argv[]){
    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(argc < 3){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is "
                << argv[0] << " rows cols" << std::endl;
        }
        exit(1);
    }

    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);

    if(rows < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be higher than 0" << std::endl;
        exit(1);
    }

    if(cols < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'cols' must be higher than 0" << std::endl;
        exit(1);
    }

    if(rows%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be a multiple of the number of processes" << std::endl;
        exit(1);
    }

    // Create the array of global pointers
    upcxx::shared_array<upcxx::global_ptr<int>> p(numT);

    // Each thread allocates the memory of its subspace
    int blockRows = rows/numT;
    p[myId] = upcxx::allocate(myId, blockRows*cols*sizeof(int));

    // Thread 0 reads the image and copies the fragments
    if(!myId){
        int *block = new int[blockRows*cols];
        int *block2 = new int[blockRows*cols];
        upcxx::event e;

        readImage(blockRows, cols, block);

        for(int i=0; i<numT-1; i++){
            upcxx::async_copy<int>(block, p[i], blockRows*cols, &e);

            // Overlap the copy with reading the next fragment
            // We cannot use "block" for the next fragment because it has not been sent
            readImage(blockRows, cols, block2);

            // The previous copy must have finished to reuse its buffer
            e.wait();
            int *aux = block;
            block = block2;
            block2 = aux;
        }

        // The last copy does not overlap
        upcxx::copy(block, p[numT-1], blockRows*cols);

        delete [] block;
        delete [] block2;
    }

    // Threads must wait until Thread 0 has copied the fragments of the image
    upcxx::barrier();

    // Privatize the pointer
    int *myImage = (int *) (upcxx::global_ptr<int>) p[myId];

    // Check whether it is really local
    if(!((upcxx::global_ptr<int>) p[myId]).is_local())
        std::cout << "Thread " << myId << " not accessing local memory" << std::endl;

    // Declare the histogram
    upcxx::shared_array<int> histogram(256);
    for(int i=myId; i<256; i+=numT)
        histogram[i] = 0;

    // Initialize the locks
    locks.init(256);
    for(int i=myId; i<256; i+=numT)
        new (locks[i].raw_ptr()) upcxx::shared_lock(myId);

    // Threads must wait until all locks and histogram have been initialized
    upcxx::barrier();

    // Examine the local image
    for(int i=0; i
...
    upcxx::deallocate<int>(p[myId]);

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/histogram/histo3.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

upcxx::shared_array<upcxx::atomic<int>> histogram;

void readImage(int rows, int cols, int *image){

    for(int i=0; i
...
void printHistogram(){

    for(int i=0; i<256; i++)
        std::cout << ((upcxx::atomic<int>) histogram[i]).load() << " ";

    std::cout << std::endl;
}

int main (int argc, char *argv[]){
    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(argc < 3){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is "
                << argv[0] << " rows cols" << std::endl;
        }
        exit(1);
    }

    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);

    if(rows < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be higher than 0" << std::endl;
        exit(1);
    }

    if(cols < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'cols' must be higher than 0" << std::endl;
        exit(1);
    }

    if(rows%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be a multiple of the number of processes" << std::endl;
        exit(1);
    }

    // Create the array of global pointers
    upcxx::shared_array<upcxx::global_ptr<int>> p(numT);

    // Each thread allocates the memory of its subspace
    int blockRows = rows/numT;
    p[myId] = upcxx::allocate(myId, blockRows*cols*sizeof(int));

    // Thread 0 reads the image and copies the fragments
    if(!myId){
        int *block = new int[blockRows*cols];
        int *block2 = new int[blockRows*cols];
        upcxx::event e;

        readImage(blockRows, cols, block);

        for(int i=0; i<numT-1; i++){
            upcxx::async_copy<int>(block, p[i], blockRows*cols, &e);

            // Overlap the copy with reading the next fragment
            // We cannot use "block" for the next fragment because it has not been sent
            readImage(blockRows, cols, block2);

            // The previous copy must have finished to reuse its buffer
            e.wait();
            int *aux = block;
            block = block2;
            block2 = aux;
        }

        // The last copy does not overlap
        upcxx::copy(block, p[numT-1], blockRows*cols);

        delete [] block;
        delete [] block2;
    }

    // Threads must wait until Thread 0 has copied the fragments of the image
    upcxx::barrier();

    // Privatize the pointer
    int *myImage = (int *) (upcxx::global_ptr<int>) p[myId];

    // Check whether it is really local
    if(!((upcxx::global_ptr<int>) p[myId]).is_local())
        std::cout << "Thread " << myId << " not accessing local memory" << std::endl;

std::cout << "To init histogram" << std::endl; 110 | 111 | // Initialize the histogram 112 | histogram.init(256); 113 | for(int i=myId; i<256; i+=numT){ 114 | std::cout << "Before, histogram[" << i << "] = " << histogram[i].get().load() << std::endl; 115 | //((upcxx::atomic) histogram[i]).store(1); 116 | histogram[i].get().store(1); 117 | std::cout << "After, histogram[" << i << "] = " << histogram[i].get().load() << std::endl; 118 | } 119 | 120 | std::cout << "histogram initialized" << std::endl; 121 | 122 | // Threads must wait until the histogram has been initialized 123 | upcxx::barrier(); 124 | 125 | // Examine the local image 126 | /*for(int i=0; i) histogram[myImage[i]]).fetch_add(1);*/ 129 | 130 | // All threads must have finished their local computation 131 | upcxx::barrier(); 132 | 133 | if(!myId) 134 | printHistogram(); 135 | 136 | // Deallocate the local memory 137 | upcxx::deallocate(p[myId]); 138 | 139 | // Terminate UPC++ 140 | upcxx::finalize(); 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /chapter10/letter/Makefile: -------------------------------------------------------------------------------- 1 | UPCXXHOME= /opt/upcxx/ 2 | UPCXX= $(UPCXXHOME)/bin/upc++ 3 | UPCXXINC= $(UPCXXHOME)/include/upcxx/ 4 | UPCXXFLAGS= -O2 -std=c++11 -DGASNET_SEQ -DUSE_GASNET_FAST_SEGMENT -DONLY_MSPACES 5 | GASNETRUN= /opt/gasnet/bin/gasnetrun_mpi -n 4 # install gasnet and choose backend 6 | 7 | all: letter1 letter2 8 | 9 | letter1: letter1.cxx 10 | $(UPCXX) $(UPCXXFLAGS) letter1.cxx -o letter1 -I $(UPCXXINC) 11 | 12 | letter1_run: letter1 13 | $(GASNETRUN) letter1 C 4 14 | 15 | letter2: letter2.cxx 16 | $(UPCXX) $(UPCXXFLAGS) letter2.cxx -o letter2 -I $(UPCXXINC) 17 | 18 | letter2_run: letter2 19 | $(GASNETRUN) letter2 C 4 20 | 21 | clean: 22 | rm -rf letter1 23 | rm -rf letter2 24 | -------------------------------------------------------------------------------- /chapter10/letter/letter1.cxx: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void readText(int n, char *text){ 4 | 5 | int i; 6 | for(i=0; i 1){ 16 | text[i*4+1] = 'C'; 17 | if((n%4) > 2){ 18 | text[i*4+2] = 'G'; 19 | } 20 | } 21 | } 22 | } 23 | 24 | int main (int argc, char *argv[]){ 25 | // Initialize UPC++ 26 | upcxx::init(&argc, &argv); 27 | 28 | int numT = upcxx::ranks(); 29 | int myId = upcxx::myrank(); 30 | 31 | if(argc < 3){ 32 | // Only the first process prints the output message 33 | if(!MYTHREAD){ 34 | std::cout << "ERROR: The syntax of the program is " 35 | << argv[0] << " l n" << std::endl; 36 | } 37 | exit(1); 38 | } 39 | 40 | char l = *argv[1]; 41 | int n = atoi(argv[2]); 42 | 43 | if(n < 0){ 44 | // Only the first process prints the output message 45 | if(!myId) 46 | std::cout << "ERROR: 'n' must be higher than 0" << std::endl; 47 | 48 | exit(1); 49 | } 50 | 51 | if(n%numT){ 52 | // Only the first process prints the output message 53 | if(!myId) 54 | std::cout << "ERROR: 'n' must multiple of the number of processes" << std::endl; 55 | 56 | exit(1); 57 | } 58 | 59 | // Create the array of global pointers 60 | upcxx::shared_array> p(numT); 61 | 62 | // Each thread allocates the memory of its subspace 63 | int blockFactor = n/numT; 64 | p[myId] = upcxx::allocate(myId, blockFactor*sizeof(char)); 65 | 66 | // Thread 0 reads the text and copy the fragments 67 | if(!myId){ 68 | char *text = new char[100]; 69 | readText(n, text); 70 | 71 | for(int i=0; i(&text[blockFactor*i], p[i], blockFactor); 73 | 74 | 
        delete [] text;
    }

    // Threads must wait until Thread 0 has copied the fragments of the text
    upcxx::barrier();

    // Privatize the pointer
    int myNumOcc = 0;
    char *myText = (char *) (upcxx::global_ptr<char>) p[myId];

    // Check whether it is really local
    if(!((upcxx::global_ptr<char>) p[myId]).is_local())
        std::cout << "Thread " << myId << " not accessing local memory" << std::endl;

    // Find the local occurrences
    for(int i=0; i
...
    upcxx::shared_array<int> occs(numT);
    occs[myId] = myNumOcc;

    // All threads must have made their local occurrences accessible
    upcxx::barrier();

    if(!myId){
        int numOcc = myNumOcc;
        for(int i=1; i
...
    upcxx::deallocate<char>(p[myId]);

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/letter/letter2.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

void readText(int n, char *text){

    int i;
    for(i=0; i
...
        if((n%4) > 1){
            text[i*4+1] = 'C';
            if((n%4) > 2){
                text[i*4+2] = 'G';
            }
        }
    }
}

int main (int argc, char *argv[]){
    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(argc < 3){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is ./letter l n"
                << std::endl;
        }
        exit(1);
    }

    char l = *argv[1];
    int n = atoi(argv[2]);

    if(n < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'n' must be higher than 0" << std::endl;

        exit(1);
    }

    if(n%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'n' must be a multiple of the number of processes" << std::endl;

        exit(1);
    }

    // Create the array of global pointers
    upcxx::shared_array<upcxx::global_ptr<char>> p(numT);

    // Each thread allocates the memory of its subspace
    int blockFactor = n/numT;
    p[myId] = upcxx::allocate(myId, blockFactor*sizeof(char));

    // Thread 0 reads the text and copies the fragments
    if(!myId){
        char *text = new char[blockFactor];
        char *text2 = new char[blockFactor];
        upcxx::event e;

        readText(blockFactor, text);

        for(int i=0; i<numT-1; i++){
            upcxx::async_copy<char>(text, p[i], blockFactor, &e);

            // Overlap the copy with reading the next fragment
            // We cannot use text for the next fragment because it has not been sent
            readText(blockFactor, text2);
            char *aux = text;
            text = text2;
            text2 = aux;

            // The previous copy must have finished to reuse its buffer
            e.wait();
        }

        // The last copy does not overlap
        upcxx::copy(text, p[numT-1], blockFactor);

        delete [] text;
        delete [] text2;
    }

    // Threads must wait until Thread 0 has copied the fragments of the text
    upcxx::barrier();

    // Privatize the pointer
    int myNumOcc = 0;
    char *myText = (char *) (upcxx::global_ptr<char>) p[myId];

    // Check whether it is really local
    if(!((upcxx::global_ptr<char>) p[myId]).is_local())
        std::cout << "Thread " << myId << " not accessing local memory" << std::endl;

    // Find the local occurrences
    for(int i=0; i
...
    upcxx::deallocate<char>(p[myId]);

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
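--------------------------------------------------------------------------------
The letter2.cxx listing above (like the histogram readers) relies on one reusable idiom: double buffering, where sending the current fragment overlaps with producing the next one. The following minimal sketch isolates that idiom in plain C++ so it compiles without UPC++; std::async and std::future merely stand in for upcxx::async_copy and upcxx::event, and read_fragment/send_fragment are hypothetical placeholders, not functions from the book's code.

#include <future>    // std::async, std::future
#include <iostream>  // std::cout
#include <utility>   // std::swap
#include <vector>    // std::vector
#include <algorithm> // std::fill

void read_fragment(int n, char *buf) {        // stands in for readText()
    std::fill(buf, buf + n, 'A');
}

void send_fragment(const char *buf, int n) {  // stands in for upcxx::async_copy()
    // pretend to push n bytes to a remote rank
    (void)buf; (void)n;
}

int main() {
    const int numT = 4, blockFactor = 1024;   // illustrative sizes
    std::vector<char> a(blockFactor), b(blockFactor);
    char *cur = a.data(), *next = b.data();

    read_fragment(blockFactor, cur);
    for (int i = 0; i < numT - 1; i++) {
        // launch the "copy" of the current buffer asynchronously ...
        auto e = std::async(std::launch::async, send_fragment, cur, blockFactor);
        // ... and overlap it with producing the next fragment in the other buffer
        read_fragment(blockFactor, next);
        e.wait();                 // corresponds to e.wait() on the upcxx::event
        std::swap(cur, next);     // only now may the sent buffer be reused
    }
    send_fragment(cur, blockFactor);  // the last copy is not overlapped
    std::cout << "all fragments sent" << std::endl;
}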
--------------------------------------------------------------------------------
/chapter10/mandelbrot/Makefile:
--------------------------------------------------------------------------------
UPCXXHOME= /opt/upcxx/
UPCXX= $(UPCXXHOME)/bin/upc++
UPCXXINC= $(UPCXXHOME)/include/upcxx/
UPCXXFLAGS= -O2 -std=c++11 -DGASNET_SEQ -DUSE_GASNET_FAST_SEGMENT -DONLY_MSPACES
GASNETRUN= /opt/gasnet/bin/gasnetrun_mpi -n 4 # install gasnet and choose backend

all: mandel1 mandel2

mandel1: mandel1.cxx
	$(UPCXX) $(UPCXXFLAGS) mandel1.cxx -o mandel1 -I $(UPCXXINC)

mandel1_run: mandel1
	$(GASNETRUN) mandel1 512 512 1024

mandel2: mandel2.cxx
	$(UPCXX) $(UPCXXFLAGS) mandel2.cxx -o mandel2 -I $(UPCXXINC)

mandel2_run: mandel2
	$(GASNETRUN) mandel2 512 512 1024

clean:
	rm -rf mandel1
	rm -rf mandel2
--------------------------------------------------------------------------------
/chapter10/mandelbrot/mandel1.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

void printMandel(int *image, int rows, int cols){

    for(int i=0; i
...
    if(k >= maxIter)
        return 0;

    return k;
}

int main (int argc, char *argv[]){

    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(argc < 4){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is "
                << argv[0] << " rows cols maxIter" << std::endl;
        }
        exit(1);
    }

    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);
    int maxIter = atoi(argv[3]);

    if(rows < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be higher than 0" << std::endl;
        exit(1);
    }

    if(cols < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'cols' must be higher than 0" << std::endl;
        exit(1);
    }

    if(maxIter < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'maxIter' must be higher than 0" << std::endl;
        exit(1);
    }

    if(rows%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be a multiple of the number of processes" << std::endl;
        exit(1);
    }

    // Output array
    int blockRows = rows/numT;
    int myImage[blockRows*cols];
    upcxx::shared_var<upcxx::global_ptr<int>> outImage;

    // Only the owner allocates the array to gather the output
    if(!myId){
        outImage.put(upcxx::allocate(0, rows*cols*sizeof(int)));
    }

    // To guarantee that memory is allocated
    upcxx::barrier();

    // Mandel computation of the block of rows
    for(int i=0; i
...
    upcxx::copy<int>(myImage, (upcxx::global_ptr<int>) &(outImage.get())[myId*blockRows*cols], blockRows*cols);

    // All threads must have finished their local computation
    upcxx::barrier();

    if(!myId){
        printMandel((int *) outImage.get(), rows, cols);
        // Deallocate the local memory
        upcxx::deallocate(outImage.get());
    }

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/mandelbrot/mandel2.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

// Output array
upcxx::shared_var<upcxx::global_ptr<int>> outImage;

// Array to know the busy threads
upcxx::shared_array<bool> busyTh;

void printMandel(int *image, int rows, int cols){

    for(int i=0; i
...
    if(k >= maxIter)
        return 0;

    return k;
}


void mandelRow(int iterRow, int th, int rows, int cols, int maxIter){
    int rowRes[cols];

    for(int j=0; j
...
    upcxx::copy<int>(rowRes, (upcxx::global_ptr<int>) &(outImage.get())[iterRow*cols], cols);

    busyTh[th] = false;
}

int main (int argc, char *argv[]){

    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(numT == 1){
        std::cout << "ERROR: More than 1 thread is required for this master-slave approach"
                  << std::endl;
        exit(1);
    }

    if(argc < 4){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is "
                << argv[0] << " rows cols maxIter" << std::endl;
        }
        exit(1);
    }

    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);
    int maxIter = atoi(argv[3]);

    if(rows < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be higher than 0" << std::endl;
        exit(1);
    }

    if(cols < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'cols' must be higher than 0" << std::endl;
        exit(1);
    }

    if(maxIter < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'maxIter' must be higher than 0" << std::endl;
        exit(1);
    }

    if(rows%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be a multiple of the number of processes" << std::endl;
        exit(1);
    }

    // Initialize the lazy array
    // All elements with affinity to Thread 0
    busyTh.init(numT);
    busyTh[myId] = false;

    // To guarantee that busyTh is initialized
    upcxx::barrier();

    // Thread 0 is the master
    if(!myId){
        outImage.put(upcxx::allocate(0, rows*cols*sizeof(int)));
        int nextTh = 1;

        // While there are more rows
        for(int i=0; i
...
        upcxx::deallocate<int>(outImage.get());
    }

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/mandelbrot/view.py:
--------------------------------------------------------------------------------
# use as follows:
# $ ./mandel > mandel.txt
# $ python2 view.py mandel.txt

# install numpy and matplotlib from standard repositories
# or locally with pip
# pip install --user numpy
# pip install --user matplotlib

import numpy as np
import matplotlib.pyplot as plt
import math
import sys

# Extract points from specified file
im = np.loadtxt( sys.argv[1] )

# Display
plt.imshow(im,cmap=plt.cm.flag)
plt.show()
--------------------------------------------------------------------------------
/chapter10/matrix_vector/Makefile:
--------------------------------------------------------------------------------
UPCXXHOME= /opt/upcxx/
UPCXX= $(UPCXXHOME)/bin/upc++
UPCXXINC= $(UPCXXHOME)/include/upcxx/
UPCXXFLAGS= -O2 -std=c++11 -DGASNET_SEQ -DUSE_GASNET_FAST_SEGMENT -DONLY_MSPACES
GASNETRUN= /opt/gasnet/bin/gasnetrun_mpi -n 4 # install gasnet and choose backend

all: matrix_vector

matrix_vector: matrix_vector.cxx
	$(UPCXX) $(UPCXXFLAGS) matrix_vector.cxx -o matrix_vector -I $(UPCXXINC)

matrix_vector_run: matrix_vector
	$(GASNETRUN) matrix_vector 128 256


clean:
	rm -rf matrix_vector
--------------------------------------------------------------------------------
/chapter10/matrix_vector/matrix_vector.cxx:
--------------------------------------------------------------------------------
#include
#include
#include
#include

#include
#include

void readInput(int m, int n, float *A, float *x){

    // checkerboard
    for(int i=0; i
...
    upcxx::shared_var<upcxx::global_ptr<float>> globalA, globalx, globaly;
    upcxx::global_ptr<float> A, x, y;

    if(!myId){
        // Allocate shared memory with affinity to process 0 to store the whole matrices
        A = upcxx::allocate(0, m*n);
        x = upcxx::allocate(0, n);
        y = upcxx::allocate(0, m);
        readInput(m, n, (float *)A, (float *)x);
        globalA = A;
        globalx = x;
        globaly = y;
    }

    size_t blockRows = m/numP;

    // To measure time
    upcxx::timer t;

    // Barrier to guarantee that 'A' and 'x' are initialized
    upcxx::barrier();
    t.start();

    A = globalA;
    x = globalx;
    y = globaly;

    // First option, directly access in computation to shared memory
    for(size_t i=myId*blockRows; i<(myId+1)*blockRows; i++){
        y[i] = 0;
        for(size_t j=0; j
...
    upcxx::global_ptr<float> privA = upcxx::allocate(myId, blockRows*n);
    upcxx::global_ptr<float> privX = upcxx::allocate(myId, n);
    upcxx::global_ptr<float> privY = upcxx::allocate(myId, blockRows);

    upcxx::copy(A+blockRows*n*myId, privA, blockRows*n);
    upcxx::copy(x, privX, n);

    for(size_t i=0; i
...
--------------------------------------------------------------------------------
/chapter3/AVX/matrix_matrix_mult.cpp:
--------------------------------------------------------------------------------
#include <random>      // prng
#include <cstdint>     // uint32_t
#include <iostream>    // std::cout
#include <immintrin.h> // AVX intrinsics

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void init(float * data, uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1, 1);

    for (uint64_t i = 0; i < length; i++)
        data[i] = density(engine);
}

inline float hsum_sse3(__m128 v) {
    __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0
    __m128 sums = _mm_add_ps(v, shuf);
    shuf = _mm_movehl_ps(shuf, sums); // high half -> low half
    sums = _mm_add_ss(sums, shuf);
    return _mm_cvtss_f32(sums);
}

inline float hsum_avx(__m256 v) {
    __m128 lo = _mm256_castps256_ps128(v);   // low 128
    __m128 hi = _mm256_extractf128_ps(v, 1); // high 128
    lo = _mm_add_ps(lo, hi);                 // add the low and high 128
    return hsum_sse3(lo);                    // and inline the sse3 version
}

void plain_dmm(float * A,
               float * B,
               float * C,
               uint64_t M,
               uint64_t L,
               uint64_t N,
               bool parallel) {

    #pragma omp parallel for collapse(2) if(parallel)
    for (uint64_t i = 0; i < M; i++)
        for (uint64_t j = 0; j < N; j++) {
            float accum = float(0);
            for (uint64_t k = 0; k < L; k++)
                accum += A[i*L+k]*B[j*L+k];
            C[i*N+j] = accum;
        }
}

void avx_dmm(float * A,
             float * B,
             float * C,
             uint64_t M,
             uint64_t L,
             uint64_t N,
             bool parallel) {

    #pragma omp parallel for collapse(2) if(parallel)
    for (uint64_t i = 0; i < M; i++)
        for (uint64_t j = 0; j < N; j++) {

            __m256 X = _mm256_setzero_ps();
            for (uint64_t k = 0; k < L; k += 8) {
                const __m256 AV = _mm256_load_ps(A+i*L+k);
                const __m256 BV = _mm256_load_ps(B+j*L+k);
                X = _mm256_add_ps(X, _mm256_mul_ps(AV, BV));
            }

            C[i*N+j] = hsum_avx(X);
        }
}

void avx_dmm_unroll_2(float * A,
                      float * B,
                      float * C,
                      uint64_t M,
                      uint64_t L,
                      uint64_t N,
                      bool parallel) {

    #pragma omp parallel for collapse(2) if(parallel)
    for (uint64_t i = 0; i < M; i++)
        for (uint64_t j = 0; j < N; j++) {

            __m256 X = _mm256_setzero_ps();
            __m256 Y = _mm256_setzero_ps();
            for (uint64_t k = 0; k < L; k += 16) {
                const __m256 AVX = _mm256_load_ps(A+i*L+k+0);
                const __m256 BVX = _mm256_load_ps(B+j*L+k+0);
                const __m256 AVY = _mm256_load_ps(A+i*L+k+8);
                const __m256 BVY = _mm256_load_ps(B+j*L+k+8);
                X = _mm256_add_ps(X, _mm256_mul_ps(AVX, BVX));
                Y = _mm256_add_ps(Y, _mm256_mul_ps(AVY, BVY));
            }

            C[i*N+j] = hsum_avx(X)+hsum_avx(Y);
        }
}

int main () {

    const uint64_t M = 1UL << 10;
    const uint64_t L = 1UL << 11;
    const uint64_t N = 1UL << 12;

    TIMERSTART(alloc_memory)
    auto A = static_cast<float*>(_mm_malloc(M*L*sizeof(float) , 32));
    auto B = static_cast<float*>(_mm_malloc(N*L*sizeof(float) , 32));
    auto C = static_cast<float*>(_mm_malloc(M*N*sizeof(float) , 32));
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    init(A, M*L);
    init(B, N*L);
    TIMERSTOP(init)

    TIMERSTART(plain_dmm_single)
    plain_dmm(A, B, C, M, L, N, false);
    TIMERSTOP(plain_dmm_single)

    TIMERSTART(plain_dmm_multi)
    plain_dmm(A, B, C, M, L, N, true);
    TIMERSTOP(plain_dmm_multi)

    TIMERSTART(avx_dmm_single)
    avx_dmm(A, B, C, M, L, N, false);
    TIMERSTOP(avx_dmm_single)

    TIMERSTART(avx_dmm_multi)
    avx_dmm(A, B, C, M, L, N, true);
    TIMERSTOP(avx_dmm_multi)

    TIMERSTART(avx_dmm_unroll_2_single)
    avx_dmm_unroll_2(A, B, C, M, L, N, false);
    TIMERSTOP(avx_dmm_unroll_2_single)

    TIMERSTART(avx_dmm_unroll_2_multi)
    avx_dmm_unroll_2(A, B, C, M, L, N, true);
    TIMERSTOP(avx_dmm_unroll_2_multi)

    TIMERSTART(free_memory)
    _mm_free(A);
    _mm_free(B);
    _mm_free(C);
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/pointwise_vector_max.cpp:
--------------------------------------------------------------------------------
#include <random>      // prng
#include <cstdint>     // uint32_t
#include <iostream>    // std::cout
#include <immintrin.h> // AVX intrinsics

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void init(float * data, uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1L<<28, 1L<<28);

    for (uint64_t i = 0; i < length; i++)
        data[i] = density(engine);
}

void plain_pointwise_max(float * x,
                         float * y,
                         float * z, uint64_t length) {

    for (uint64_t i = 0; i < length; i++)
        z[i] = std::max(x[i], y[i]);
}

void avx_pointwise_max(float * x,
                       float * y,
                       float * z, uint64_t length) {


    for (uint64_t i = 0; i < length; i += 8) {
        __m256 X = _mm256_load_ps(x+i);
        __m256 Y = _mm256_load_ps(y+i);
        _mm256_store_ps(z+i, _mm256_max_ps(X, Y));
    }
}


int main () {

    const uint64_t num_entries = 1UL << 28;
    const uint64_t num_bytes = num_entries*sizeof(float);

    TIMERSTART(alloc_memory)
    auto x = static_cast<float*>(_mm_malloc(num_bytes , 32));
    auto y = static_cast<float*>(_mm_malloc(num_bytes , 32));
    auto z = static_cast<float*>(_mm_malloc(num_bytes , 32));
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    init(x, num_entries);
    init(y, num_entries);
    TIMERSTOP(init)

    TIMERSTART(plain_pointwise_max)
    plain_pointwise_max(x, y, z, num_entries);
    TIMERSTOP(plain_pointwise_max)

    TIMERSTART(avx_pointwise_max)
    avx_pointwise_max(x, y, z, num_entries);
    TIMERSTOP(avx_pointwise_max)

    TIMERSTART(free_memory)
    _mm_free(x);
    _mm_free(y);
    _mm_free(z);
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/vector_max.cpp:
--------------------------------------------------------------------------------
#include <random>      // prng
#include <cstdint>     // uint32_t
#include <iostream>    // std::cout
#include <immintrin.h> // AVX intrinsics

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void init(float * data, uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1L<<28, 1L<<28);

    for (uint64_t i = 0; i < length; i++)
        data[i] = density(engine);
}

inline float hmax_sse3(__m128 v) {
    __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0
    __m128 maxs = _mm_max_ps(v, shuf);
    shuf = _mm_movehl_ps(shuf, maxs); // high half -> low half
    maxs = _mm_max_ss(maxs, shuf);
    return _mm_cvtss_f32(maxs);
}

inline float hmax_avx(__m256 v) {
    __m128 lo = _mm256_castps256_ps128(v);   // low 128
    __m128 hi = _mm256_extractf128_ps(v, 1); // high 128
    lo = _mm_max_ps(lo, hi);                 // max the low 128
    return hmax_sse3(lo);                    // and inline the sse3 version
}

float avx_max(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    const float e = -INFINITY;
    __m256 X = _mm256_set1_ps(e);

    for (uint64_t i = 0; i < length; i += 8) {
        __m256 DATA = _mm256_load_ps(data+i);
        X = _mm256_max_ps(X, DATA);
    }

    return hmax_avx(X);
}

float avx_max_unroll_2(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    const float e = -INFINITY;
    __m256 X = _mm256_set1_ps(e);
    __m256 Y = _mm256_set1_ps(e);

    for (uint64_t i = 0; i < length; i += 16) {
        __m256 DATA_X = _mm256_load_ps(data+i+0);
        __m256 DATA_Y = _mm256_load_ps(data+i+8);
        X = _mm256_max_ps(X, DATA_X);
        Y = _mm256_max_ps(Y, DATA_Y);
    }

    return std::max(hmax_avx(X), hmax_avx(Y));
}

float plain_max(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    float max = -INFINITY;

    for (uint64_t i = 0; i < length; i++)
        max = std::max(max, data[i]);

    return max;
}

float plain_max_unroll_2(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    float max_0 = -INFINITY;
    float max_1 = -INFINITY;

    for (uint64_t i = 0; i < length; i += 2) {
        max_0 = std::max(max_0, data[i+0]);
        max_1 = std::max(max_1, data[i+1]);
    }

    return std::max(max_0, max_1);
}

float plain_max_unroll_4(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    float max_0 = -INFINITY;
    float max_1 = -INFINITY;
    float max_2 = -INFINITY;
    float max_3 = -INFINITY;

    for (uint64_t i = 0; i < length; i += 4) {
        max_0 = std::max(max_0, data[i+0]);
        max_1 = std::max(max_1, data[i+1]);
        max_2 = std::max(max_2, data[i+2]);
        max_3 = std::max(max_3, data[i+3]);
    }

    return std::max(max_0,
           std::max(max_1,
           std::max(max_2, max_3)));
}

float plain_max_unroll_8(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    float max_0 = -INFINITY;
    float max_1 = -INFINITY;
    float max_2 = -INFINITY;
    float max_3 = -INFINITY;
    float max_4 = -INFINITY;
    float max_5 = -INFINITY;
    float max_6 = -INFINITY;
    float max_7 = -INFINITY;

    for (uint64_t i = 0; i < length; i += 8) {
        max_0 = std::max(max_0, data[i+0]);
        max_1 = std::max(max_1, data[i+1]);
        max_2 = std::max(max_2, data[i+2]);
        max_3 = std::max(max_3, data[i+3]);
        max_4 = std::max(max_4, data[i+4]);
        max_5 = std::max(max_5, data[i+5]);
        max_6 = std::max(max_6, data[i+6]);
        max_7 = std::max(max_7, data[i+7]);
    }

    return std::max(max_0,
           std::max(max_1,
           std::max(max_2,
           std::max(max_3,
           std::max(max_4,
           std::max(max_5,
           std::max(max_6, max_7)))))));
}

int main () {

    const uint64_t num_entries = 1UL << 28;
    const uint64_t num_bytes = num_entries*sizeof(float);

    TIMERSTART(alloc_memory)
    auto data = static_cast<float*>(_mm_malloc(num_bytes , 32));
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    init(data, num_entries);
    TIMERSTOP(init)

    TIMERSTART(plain_max)
    std::cout << plain_max(data, num_entries) << std::endl;
    TIMERSTOP(plain_max)

    TIMERSTART(plain_max_unroll_2)
    std::cout << plain_max_unroll_2(data, num_entries) << std::endl;
    TIMERSTOP(plain_max_unroll_2)

    TIMERSTART(plain_max_unroll_4)
    std::cout << plain_max_unroll_4(data, num_entries) << std::endl;
    TIMERSTOP(plain_max_unroll_4)

    TIMERSTART(plain_max_unroll_8)
    std::cout << plain_max_unroll_8(data, num_entries) << std::endl;
    TIMERSTOP(plain_max_unroll_8)

    TIMERSTART(avx_max)
    std::cout << avx_max(data, num_entries) << std::endl;
    TIMERSTOP(avx_max)

    TIMERSTART(avx_max_unroll_2)
    std::cout << avx_max_unroll_2(data, num_entries) << std::endl;
    TIMERSTOP(avx_max_unroll_2)

    TIMERSTART(free_memory)
    _mm_free(data);
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/vector_norm_aos_avx.cpp:
--------------------------------------------------------------------------------
#include <random>      // prng
#include <cstdint>     // uint32_t
#include <iostream>    // std::cout
#include <immintrin.h> // AVX intrinsics

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void aos_init(float * xyz, uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1, 1);

    for (uint64_t i = 0; i < 3*length; i++)
        xyz[i] = density(engine);
}

void avx_aos_norm(float * xyz, uint64_t length) {

    for (uint64_t i = 0; i < 3*length; i += 3*8) {

        /////////////////////////////////////////////////////////////////////
        // AOS2SOA: XYZXYZXY ZXYZXYZX YZXYZXYZ --> XXXXXXX YYYYYYY ZZZZZZZZ
        /////////////////////////////////////////////////////////////////////

        // registers: NOTE: M is an SSE pointer (length 4)
        __m128 *M = (__m128*) (xyz+i);
        __m256 M03;
        __m256 M14;
        __m256 M25;

        // load lower halves
        M03 = _mm256_castps128_ps256(M[0]);
        M14 = _mm256_castps128_ps256(M[1]);
        M25 = _mm256_castps128_ps256(M[2]);

        // load upper halves
        M03 = _mm256_insertf128_ps(M03 ,M[3],1);
        M14 = _mm256_insertf128_ps(M14 ,M[4],1);
        M25 = _mm256_insertf128_ps(M25 ,M[5],1);

        // everyday I am shuffling...
        __m256 XY = _mm256_shuffle_ps(M14, M25, _MM_SHUFFLE( 2,1,3,2));
        __m256 YZ = _mm256_shuffle_ps(M03, M14, _MM_SHUFFLE( 1,0,2,1));
        __m256 X  = _mm256_shuffle_ps(M03, XY , _MM_SHUFFLE( 2,0,3,0));
        __m256 Y  = _mm256_shuffle_ps(YZ , XY , _MM_SHUFFLE( 3,1,2,0));
        __m256 Z  = _mm256_shuffle_ps(YZ , M25, _MM_SHUFFLE( 3,0,3,1));

        /////////////////////////////////////////////////////////////////////
        // SOA computation
        /////////////////////////////////////////////////////////////////////

        // R <- X*X+Y*Y+Z*Z
        __m256 R = _mm256_add_ps(_mm256_mul_ps(X, X),
                   _mm256_add_ps(_mm256_mul_ps(Y, Y),
                                 _mm256_mul_ps(Z, Z)));
        // R <- 1/sqrt(R)
        R = _mm256_rsqrt_ps(R);

        // normalize vectors
        X = _mm256_mul_ps(X, R);
        Y = _mm256_mul_ps(Y, R);
        Z = _mm256_mul_ps(Z, R);

        /////////////////////////////////////////////////////////////////////
        // SOA2AOS: XXXXXXX YYYYYYY ZZZZZZZZ -> XYZXYZXY ZXYZXYZX YZXYZXYZ
        /////////////////////////////////////////////////////////////////////

        // everyday I am shuffling...
        __m256 RXY = _mm256_shuffle_ps(X,Y, _MM_SHUFFLE(2,0,2,0));
        __m256 RYZ = _mm256_shuffle_ps(Y,Z, _MM_SHUFFLE(3,1,3,1));
        __m256 RZX = _mm256_shuffle_ps(Z,X, _MM_SHUFFLE(3,1,2,0));
        __m256 R03 = _mm256_shuffle_ps(RXY, RZX, _MM_SHUFFLE(2,0,2,0));
        __m256 R14 = _mm256_shuffle_ps(RYZ, RXY, _MM_SHUFFLE(3,1,2,0));
        __m256 R25 = _mm256_shuffle_ps(RZX, RYZ, _MM_SHUFFLE(3,1,3,1));

        // store in AOS (6*4=24)
        M[0] = _mm256_castps256_ps128(R03);
        M[1] = _mm256_castps256_ps128(R14);
        M[2] = _mm256_castps256_ps128(R25);
        M[3] = _mm256_extractf128_ps(R03, 1);
        M[4] = _mm256_extractf128_ps(R14, 1);
        M[5] = _mm256_extractf128_ps(R25, 1);
    }
}

void aos_check(float * xyz, uint64_t length) {

    for (uint64_t i = 0; i < 3*length; i += 3) {

        const float x = xyz[i+0];
        const float y = xyz[i+1];
        const float z = xyz[i+2];

        float rho = x*x+y*y+z*z;

        if ((rho-1)*(rho-1) > 1E-6)
            std::cout << "error too big at position "
                      << i << std::endl;
    }
}

int main () {

    const uint64_t num_vectors = 1UL << 28;
    const uint64_t num_bytes = 3*num_vectors*sizeof(float);

    TIMERSTART(alloc_memory)
    auto xyz = static_cast<float*>(_mm_malloc(num_bytes , 32));
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    aos_init(xyz, num_vectors);
    TIMERSTOP(init)

    TIMERSTART(avx_aos_normalize)
    avx_aos_norm(xyz, num_vectors);
    TIMERSTOP(avx_aos_normalize)

    TIMERSTART(check)
    aos_check(xyz, num_vectors);
    TIMERSTOP(check)

    TIMERSTART(free_memory)
    _mm_free(xyz);
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/vector_norm_aos_plain.cpp:
--------------------------------------------------------------------------------
#include <cstdint>  // uint32_t
#include <iostream> // std::cout
#include <random>   // prng
#include <cmath>    // std::sqrt (added; needed below)

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void aos_init(float * xyz, uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1, 1);

    for (uint64_t i = 0; i < 3*length; i++)
        xyz[i] = density(engine);
}

void plain_aos_norm(float * xyz, uint64_t length) {

    for (uint64_t i = 0; i < 3*length; i += 3) {
        const float x = xyz[i+0];
        const float y = xyz[i+1];
        const float z = xyz[i+2];

        float irho = 1.0f/std::sqrt(x*x+y*y+z*z);

        xyz[i+0] *= irho;
        xyz[i+1] *= irho;
        xyz[i+2] *= irho;
    }
}

void aos_check(float * xyz, uint64_t length) {

    for (uint64_t i = 0; i < 3*length; i += 3) {

        const float x = xyz[i+0];
        const float y = xyz[i+1];
        const float z = xyz[i+2];

        float rho = x*x+y*y+z*z;

        if ((rho-1)*(rho-1) > 1E-6)
            std::cout << "error too big at position "
                      << i << std::endl;
    }
}

int main () {

    const uint64_t num_vectors = 1UL << 28;

    TIMERSTART(alloc_memory)
    auto xyz = new float[3*num_vectors];
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    aos_init(xyz, num_vectors);
    TIMERSTOP(init)

    TIMERSTART(plain_aos_normalize)
    plain_aos_norm(xyz, num_vectors);
    TIMERSTOP(plain_aos_normalize)

    TIMERSTART(check)
    aos_check(xyz, num_vectors);
    TIMERSTOP(check)

    TIMERSTART(free_memory)
    delete [] xyz;
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/vector_norm_soa_avx.cpp:
--------------------------------------------------------------------------------
#include <random>      // prng
#include <cstdint>     // uint32_t
#include <iostream>    // std::cout
#include <immintrin.h> // AVX intrinsics

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void soa_init(float * x,
              float * y,
              float * z,
              uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1, 1);

    for (uint64_t i = 0; i < length; i++) {
        x[i] = density(engine);
        y[i] = density(engine);
        z[i] = density(engine);
    }

}

void avx_soa_norm(float * x,
                  float * y,
                  float * z,
                  uint64_t length) {

    for (uint64_t i = 0; i < length; i += 8) {

        // aligned loads
        __m256 X = _mm256_load_ps(x+i);
        __m256 Y = _mm256_load_ps(y+i);
        __m256 Z = _mm256_load_ps(z+i);

        // R <- X*X+Y*Y+Z*Z
        __m256 R = _mm256_add_ps(_mm256_mul_ps(X, X),
                   _mm256_add_ps(_mm256_mul_ps(Y, Y),
                                 _mm256_mul_ps(Z, Z)));
        // R <- 1/sqrt(R)
        R = _mm256_rsqrt_ps(R);

        // aligned stores
        _mm256_store_ps(x+i, _mm256_mul_ps(X, R));
        _mm256_store_ps(y+i, _mm256_mul_ps(Y, R));
        _mm256_store_ps(z+i, _mm256_mul_ps(Z, R));
    }
}

void soa_check(float * x,
               float * y,
               float * z,
               uint64_t length) {

    for (uint64_t i = 0; i < length; i++) {
        float rho = x[i]*x[i]+y[i]*y[i]+z[i]*z[i];
        if ((rho-1)*(rho-1) > 1E-6)
            std::cout << "error too big at position "
                      << i << std::endl;
    }
}

int main () {

    const uint64_t num_vectors = 1UL << 28;
    const uint64_t num_bytes = num_vectors*sizeof(float);

    TIMERSTART(alloc_memory)
    auto x = static_cast<float*>(_mm_malloc(num_bytes , 32));
    auto y = static_cast<float*>(_mm_malloc(num_bytes , 32));
    auto z = static_cast<float*>(_mm_malloc(num_bytes , 32));
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    soa_init(x, y, z, num_vectors);
    TIMERSTOP(init)

    TIMERSTART(avx_soa_normalize)
    avx_soa_norm(x, y, z, num_vectors);
    TIMERSTOP(avx_soa_normalize)

    TIMERSTART(check)
    soa_check(x, y, z, num_vectors);
    TIMERSTOP(check)

    TIMERSTART(free_memory)
    _mm_free(x);
    _mm_free(y);
    _mm_free(z);
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/vector_norm_soa_plain.cpp:
--------------------------------------------------------------------------------
#include <cstdint>  // uint32_t
#include <iostream> // std::cout
#include <random>   // prng
#include <cmath>    // std::sqrt (added; needed below)

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void soa_init(float * x,
              float * y,
              float * z,
              uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1, 1);

    for (uint64_t i = 0; i < length; i++) {
        x[i] = density(engine);
        y[i] = density(engine);
        z[i] = density(engine);
    }

}

void plain_soa_norm(float * x,
                    float * y,
                    float * z,
                    uint64_t length) {

    for (uint64_t i = 0; i < length; i++) {
        float irho = 1.0f/std::sqrt(x[i]*x[i]+
                                    y[i]*y[i]+
                                    z[i]*z[i]);
        x[i] *= irho;
        y[i] *= irho;
        z[i] *= irho;
    }
}

void soa_check(float * x,
               float * y,
               float * z,
               uint64_t length) {

    for (uint64_t i = 0; i < length; i++) {
        float rho = x[i]*x[i]+y[i]*y[i]+z[i]*z[i];
        if ((rho-1)*(rho-1) > 1E-6)
            std::cout << "error too big at position "
                      << i << std::endl;
    }
}

int main () {

    const uint64_t num_vectors = 1UL << 28;

    TIMERSTART(alloc_memory)
    auto x = new float[num_vectors];
    auto y = new float[num_vectors];
    auto z = new float[num_vectors];
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    soa_init(x, y, z, num_vectors);
    TIMERSTOP(init)

    TIMERSTART(plain_soa_normalize)
    plain_soa_norm(x, y, z, num_vectors);
    TIMERSTOP(plain_soa_normalize)

    TIMERSTART(check)
    soa_check(x, y, z, num_vectors);
    TIMERSTOP(check)

    TIMERSTART(free_memory)
    delete [] x;
    delete [] y;
    delete [] z;
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/include:
--------------------------------------------------------------------------------
../include/
--------------------------------------------------------------------------------
/chapter3/matrix_matrix_mult_transposed/Makefile:
--------------------------------------------------------------------------------
CXX=g++
CXXFLAGS=-O2 -std=c++14 -Wall

all: matrix_mult_seq matrix_mult_omp

matrix_mult_seq: matrix_mult.cpp
	$(CXX) $(CXXFLAGS) matrix_mult.cpp -o matrix_mult_seq

matrix_mult_omp: matrix_mult.cpp
	$(CXX) $(CXXFLAGS) matrix_mult.cpp -fopenmp -o matrix_mult_omp

clean:
	rm -f matrix_mult_seq
	rm -f matrix_mult_omp
--------------------------------------------------------------------------------
/chapter3/matrix_matrix_mult_transposed/matrix_mult.cpp:
--------------------------------------------------------------------------------
#include <cstdint>
#include <vector>
#include <iostream>
#include "../include/hpc_helpers.hpp"

int main () {

    // matrix shapes
    const uint64_t m = 1 << 15;
    const uint64_t n = 1 << 15;
    const uint64_t l = 1 << 5;

    TIMERSTART(init)
    // sum_k A_ik * B_kj = sum_k A_ik * B^t_jk = C_ij
    std::vector<float> A (m*l, 0); // m x l
    std::vector<float> B (l*n, 0); // l x n
    std::vector<float> Bt(n*l, 0); // n x l
    std::vector<float> C (m*n, 0); // m x n
    TIMERSTOP(init)

    TIMERSTART(transpose_and_mult)
    TIMERSTART(transpose)
    #pragma omp parallel for collapse(2)
    for (uint64_t k = 0; k < l; k++)
        for (uint64_t j = 0; j < n; j++)
            Bt[j*l+k] = B[k*n+j];
    TIMERSTOP(transpose)

    TIMERSTART(transpose_mult)
    #pragma omp parallel for collapse(2)
    for (uint64_t i = 0; i < m; i++)
        for (uint64_t j = 0; j < n; j++) {
            float accum = 0;
            for (uint64_t k = 0; k < l; k++)
                accum += A[i*l+k]*Bt[j*l+k];
            C[i*n+j] = accum;
        }

    TIMERSTOP(transpose_mult)
    TIMERSTOP(transpose_and_mult)

    TIMERSTART(naive_mult)
    #pragma omp parallel for collapse(2)
    for (uint64_t i = 0; i < m; i++)
        for (uint64_t j = 0; j < n; j++) {
            float accum = 0;
            for (uint64_t k = 0; k < l; k++)
                accum += A[i*l+k]*B[k*n+j];
            C[i*n+j] = accum;
        }

    TIMERSTOP(naive_mult)
}
--------------------------------------------------------------------------------
/chapter4/all_pairs_distance_matrix/Makefile:
--------------------------------------------------------------------------------
CXX= g++
CXXFLAGS= -std=c++14 -O2 -pthread

all: all_pair

all_pair: all_pair.cpp
	$(CXX) all_pair.cpp $(CXXFLAGS) -o all_pair

clean:
	rm -rf all_pair
--------------------------------------------------------------------------------
/chapter4/all_pairs_distance_matrix/data/mnist_exporter.py:
--------------------------------------------------------------------------------
#####################################################################
# run __ONE__ of the following commands:
#
# pip install --user tensorflow (if you have no CUDA-enabled GPU)
# pip install --user tensorflow-gpu
#
# Numpy should come bundled with tensorflow. Run this file et voila!
#####################################################################

import array as ar
import numpy as np

# everyone has tensorflow installed nowadays :D
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=False)

merge = np.vstack(( mnist.train.images, mnist.test.images))

with open("mnist_all_65000_28_28_32.bin", "wb") as f:
    f.write(ar.array("f", merge.flatten()))
--------------------------------------------------------------------------------
/chapter4/condition_variables/Makefile:
--------------------------------------------------------------------------------
CXX= g++
CXXFLAGS= -std=c++14 -O2 -pthread

all: alarm_clock ping_pong one_shot_alarm_clock

alarm_clock: alarm_clock.cpp
	$(CXX) alarm_clock.cpp $(CXXFLAGS) -o alarm_clock

one_shot_alarm_clock: one_shot_alarm_clock.cpp
	$(CXX) one_shot_alarm_clock.cpp $(CXXFLAGS) -o one_shot_alarm_clock

ping_pong: ping_pong.cpp
	$(CXX) ping_pong.cpp $(CXXFLAGS) -o ping_pong

clean:
	rm -rf alarm_clock
	rm -rf one_shot_alarm_clock
	rm -rf ping_pong
--------------------------------------------------------------------------------
/chapter4/condition_variables/alarm_clock.cpp:
--------------------------------------------------------------------------------
#include <iostream>           // std::cout
#include <thread>             // std::thread
#include <mutex>              // std::mutex
#include <chrono>             // std::this_thread::sleep_for
#include <condition_variable> // std::condition_variable

// convenient time formats (C++14 required)
using namespace std::chrono_literals;

int main() {

    std::mutex mutex;
    std::condition_variable cv;
    bool time_for_breakfast = false; // globally shared state

    // to be called by thread
    auto student = [&] ( ) -> void {

        { // this is the scope of the lock
            std::unique_lock<std::mutex> unique_lock(mutex);

            // check the globally shared state;
            // the lock is released during wait
            while (!time_for_breakfast)
                cv.wait(unique_lock);

            // alternatively, you can specify the
            // predicate directly using a closure
            // cv.wait(unique_lock,
            //         [&](){ return time_for_breakfast; });
        } // lock is finally released

        std::cout << "Time to make coffee!"
<< std::endl; 35 | }; 36 | 37 | // create the waiting thread and wait for 2s 38 | std::thread my_thread(student); 39 | std::this_thread::sleep_for(2s); 40 | 41 | { // prepare the alarm clock 42 | std::lock_guard<std::mutex> lock_guard(mutex); 43 | time_for_breakfast = true; 44 | } // here the lock is released 45 | 46 | // ring the alarm clock 47 | cv.notify_one(); 48 | 49 | // wait until breakfast is finished 50 | my_thread.join(); 51 | } 52 | -------------------------------------------------------------------------------- /chapter4/condition_variables/one_shot_alarm_clock.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> // std::cout 2 | #include <thread> // std::thread 3 | #include <future> // std::future 4 | #include <chrono> // std::this_thread::sleep_for 5 | 6 | // convenient time formats (C++14 required) 7 | using namespace std::chrono_literals; 8 | 9 | int main() { 10 | 11 | // create pair (future, promise) 12 | std::promise<void> promise; 13 | auto shared_future = promise.get_future().share(); 14 | 15 | // to be called by thread 16 | auto students = [&] ( ) -> void { 17 | 18 | // blocks until fulfilling promise 19 | shared_future.get(); 20 | std::cout << "Time to make coffee!" << std::endl; 21 | }; 22 | 23 | // create the waiting thread and wait for 2s 24 | std::thread my_thread0(students); 25 | std::thread my_thread1(students); 26 | std::this_thread::sleep_for(2s); 27 | promise.set_value(); 28 | 29 | // wait until breakfast is finished 30 | my_thread0.join(); 31 | my_thread1.join(); 32 | } 33 | -------------------------------------------------------------------------------- /chapter4/condition_variables/ping_pong.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> // std::cout 2 | #include <thread> // std::thread 3 | #include <mutex> // std::mutex 4 | #include <chrono> // std::this_thread::sleep_for 5 | #include <condition_variable> // std::condition_variable 6 | 7 | // convenient time formats (C++14 required) 8 | using namespace std::chrono_literals; 9 | 10 | int main() { 11 | 12 | std::mutex mutex; 13 | std::condition_variable cv; 14 | bool is_ping = true; // globally shared state 15 | 16 | auto ping = [&] ( ) -> void { 17 | while (true) { 18 | 19 | // wait to be signalled 20 | std::unique_lock<std::mutex> unique_lock(mutex); 21 | cv.wait(unique_lock,[&](){return is_ping;}); 22 | 23 | // print "ping" to the command line 24 | std::this_thread::sleep_for(1s); 25 | std::cout << "ping" << std::endl; 26 | 27 | // alter state and notify other thread 28 | is_ping = !is_ping; 29 | cv.notify_one(); 30 | } 31 | }; 32 | 33 | auto pong = [&] ( ) -> void { 34 | while (true) { 35 | // wait to be signalled 36 | std::unique_lock<std::mutex> unique_lock(mutex); 37 | cv.wait(unique_lock,[&](){return !is_ping;}); 38 | 39 | // print "pong" to the command line 40 | std::this_thread::sleep_for(1s); 41 | std::cout << "pong" << std::endl; 42 | 43 | // alter state and notify other thread 44 | is_ping = !is_ping; 45 | cv.notify_one(); 46 | } 47 | }; 48 | 49 | std::thread ping_thread(ping); 50 | std::thread pong_thread(pong); 51 | ping_thread.join(); 52 | pong_thread.join(); 53 | } 54 | -------------------------------------------------------------------------------- /chapter4/false_sharing/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: false_sharing 5 | 6 | false_sharing: false_sharing.cpp 7 | $(CXX) false_sharing.cpp $(CXXFLAGS) -o false_sharing 8 | 9 | clean: 10 | rm -rf false_sharing 11 |
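The false_sharing.cpp file that follows puts two counters in one struct, so they share a cache line; each increment by one thread invalidates that line in the other core's cache. A common remedy, sketched here under the assumption of 64-byte cache lines (this padded variant is not part of the book's sources), is to align each member to its own line:

#include <cstdint>

// each counter occupies its own 64-byte cache line, so the two
// threads no longer invalidate each other's cached copies
struct padded_pack_t {
    alignas(64) uint64_t ying;
    alignas(64) uint64_t yang;

    padded_pack_t() : ying(0), yang(0) {}
};

static_assert(sizeof(padded_pack_t) == 128,
              "one cache line per member");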
-------------------------------------------------------------------------------- /chapter4/false_sharing/false_sharing.cpp: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | #include <thread> 4 | 5 | struct pack_t { 6 | uint64_t ying; 7 | uint64_t yang; 8 | 9 | pack_t() : ying(0), yang(0) {} 10 | }; 11 | 12 | void sequential_increment( 13 | volatile pack_t& pack) { 14 | 15 | for (uint64_t index = 0; index < 1UL << 30; index++) { 16 | pack.ying++; 17 | pack.yang++; 18 | } 19 | } 20 | 21 | 22 | void false_sharing_increment( 23 | volatile pack_t& pack) { 24 | 25 | auto eval_ying = [&pack] () -> void { 26 | for (uint64_t index = 0; index < 1UL << 30; index++) 27 | pack.ying++; 28 | }; 29 | 30 | auto eval_yang = [&pack] () -> void { 31 | for (uint64_t index = 0; index < 1UL << 30; index++) 32 | pack.yang++; 33 | }; 34 | 35 | std::thread ying_thread(eval_ying); 36 | std::thread yang_thread(eval_yang); 37 | ying_thread.join(); 38 | yang_thread.join(); 39 | } 40 | 41 | int main(int argc, char* argv[]) { 42 | 43 | pack_t seq_pack; 44 | 45 | TIMERSTART(sequential_increment) 46 | sequential_increment(seq_pack); 47 | TIMERSTOP(sequential_increment) 48 | 49 | std::cout << seq_pack.ying << " " << seq_pack.yang << std::endl; 50 | 51 | pack_t par_pack; 52 | 53 | TIMERSTART(false_sharing_increment) 54 | false_sharing_increment(par_pack); 55 | TIMERSTOP(false_sharing_increment) 56 | 57 | std::cout << par_pack.ying << " " << par_pack.yang << std::endl; 58 | } 59 | -------------------------------------------------------------------------------- /chapter4/hello_world/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: hello_world 5 | 6 | hello_world: hello_world.cpp 7 | $(CXX) hello_world.cpp $(CXXFLAGS) -o hello_world 8 | 9 | clean: 10 | rm -rf hello_world 11 | -------------------------------------------------------------------------------- /chapter4/hello_world/hello_world.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | 6 | template <typename index_t> 7 | void say_hello_template(index_t id) { 8 | std::cout << "Hello from thread: " << id << std::endl; 9 | } 10 | 11 | void say_hello(uint64_t id) { 12 | std::cout << "Hello from thread: " << id << std::endl; 13 | } 14 | 15 | int main(int argc, char * argv[]) { 16 | 17 | const uint64_t num_threads = 4; 18 | 19 | std::vector<std::thread> threads; 20 | 21 | for (uint64_t id = 0; id < num_threads; id++) 22 | threads.emplace_back( 23 | std::thread( 24 | say_hello, id 25 | ) 26 | ); 27 | 28 | for (auto& thread: threads) 29 | thread.join(); 30 | } 31 | -------------------------------------------------------------------------------- /chapter4/include: -------------------------------------------------------------------------------- 1 | ../include/ -------------------------------------------------------------------------------- /chapter4/matrix_vector_mult/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: matrix_vector 5 | 6 | matrix_vector: matrix_vector.cpp 7 | $(CXX) matrix_vector.cpp $(CXXFLAGS) -o matrix_vector 8 | 9 | clean: 10 | rm -rf matrix_vector 11 |
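The block decomposition in the following matrix_vector.cpp computes its chunk size with the SDIV macro from hpc_helpers.hpp, a ceiling integer division that guarantees num_threads chunks cover all m rows. Assuming a definition essentially like this (the header's exact spelling may differ):

// ceiling division: smallest integer >= x/y for positive integers
#define SDIV(x,y) (((x)+(y)-1)/(y))

// example: SDIV(10,4) == 3, so 4 threads cover 10 rows
// in chunks of 3, the last thread taking the single leftover row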
-------------------------------------------------------------------------------- /chapter4/matrix_vector_mult/matrix_vector.cpp: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | #include <iostream> 4 | #include <cstdint> 5 | #include <vector> 6 | #include <thread> 7 | 8 | template < 9 | typename value_t, 10 | typename index_t> 11 | void init( 12 | std::vector<value_t>& A, 13 | std::vector<value_t>& x, 14 | index_t m, 15 | index_t n) { 16 | 17 | for (index_t row = 0; row < m; row++) 18 | for (index_t col = 0; col < n; col++) 19 | A[row*n+col] = row >= col ? 1 : 0; 20 | 21 | for (index_t col = 0; col < m; col++) 22 | x[col] = col; 23 | } 24 | 25 | template < 26 | typename value_t, 27 | typename index_t> 28 | void sequential_mult( 29 | std::vector<value_t>& A, 30 | std::vector<value_t>& x, 31 | std::vector<value_t>& b, 32 | index_t m, 33 | index_t n) { 34 | 35 | for (index_t row = 0; row < m; row++) { 36 | value_t accum = value_t(0); 37 | for (index_t col = 0; col < n; col++) 38 | accum += A[row*n+col]*x[col]; 39 | b[row] = accum; 40 | } 41 | } 42 | 43 | template < 44 | typename value_t, 45 | typename index_t> 46 | void cyclic_parallel_mult( 47 | std::vector<value_t>& A, // linear memory for A 48 | std::vector<value_t>& x, // to be mapped vector 49 | std::vector<value_t>& b, // result vector 50 | index_t m, // number of rows 51 | index_t n, // number of cols 52 | index_t num_threads=8) { // number of threads p 53 | 54 | // this function is called by the threads 55 | auto cyclic = [&] (const index_t& id) -> void { 56 | 57 | // indices are incremented with a stride of p 58 | for (index_t row = id; row < m; row += num_threads) { 59 | value_t accum = value_t(0); 60 | for (index_t col = 0; col < n; col++) 61 | accum += A[row*n+col]*x[col]; 62 | b[row] = accum; 63 | } 64 | }; 65 | 66 | // business as usual 67 | std::vector<std::thread> threads; 68 | 69 | for (index_t id = 0; id < num_threads; id++) 70 | threads.emplace_back(cyclic, id); 71 | 72 | for (auto& thread : threads) 73 | thread.join(); 74 | } 75 | 76 | template < 77 | typename value_t, 78 | typename index_t> 79 | void block_parallel_mult( 80 | std::vector<value_t>& A, 81 | std::vector<value_t>& x, 82 | std::vector<value_t>& b, 83 | index_t m, 84 | index_t n, 85 | index_t num_threads=32) { 86 | 87 | // this function is called by the threads 88 | auto block = [&] (const index_t& id) -> void { 89 | // ^-- capture whole scope by reference 90 | 91 | // compute chunk size, lower and upper task id 92 | const index_t chunk = SDIV(m, num_threads); 93 | const index_t lower = id*chunk; 94 | const index_t upper = std::min(lower+chunk, m); 95 | 96 | // only computes rows between lower and upper 97 | for (index_t row = lower; row < upper; row++) { 98 | value_t accum = value_t(0); 99 | for (index_t col = 0; col < n; col++) 100 | accum += A[row*n+col]*x[col]; 101 | b[row] = accum; 102 | } 103 | }; 104 | 105 | // business as usual 106 | std::vector<std::thread> threads; 107 | 108 | for (index_t id = 0; id < num_threads; id++) 109 | threads.emplace_back(block, id); 110 | 111 | for (auto& thread : threads) 112 | thread.join(); 113 | } 114 | 115 | 116 | template < 117 | typename value_t, 118 | typename index_t> 119 | void block_cyclic_parallel_mult( 120 | std::vector<value_t>& A, 121 | std::vector<value_t>& x, 122 | std::vector<value_t>& b, 123 | index_t m, 124 | index_t n, 125 | index_t num_threads=8, 126 | index_t chunk_size=64/sizeof(value_t)) { 127 | 128 | 129 | // this function is called by the threads 130 | auto block_cyclic = [&] (const index_t& id) -> void { 131 | 132 | // precompute the stride 133 | const index_t stride = num_threads*chunk_size; 134 | const index_t offset = id*chunk_size; 135 | 136 | // for each block of size chunk_size in cyclic order 137 | for (index_t lower =
offset; lower < m; lower += stride) { 138 | 139 | // compute the upper border of the block 140 | const index_t upper = std::min(lower+chunk_size, m); 141 | 142 | // for each row in the block 143 | for (index_t row = lower; row < upper; row++) { 144 | 145 | // accumulate the contributions 146 | value_t accum = value_t(0); 147 | for (index_t col = 0; col < n; col++) 148 | accum += A[row*n+col]*x[col]; 149 | b[row] = accum; 150 | } 151 | } 152 | }; 153 | 154 | // business as usual 155 | std::vector<std::thread> threads; 156 | 157 | for (index_t id = 0; id < num_threads; id++) 158 | threads.emplace_back(block_cyclic, id); 159 | 160 | for (auto& thread : threads) 161 | thread.join(); 162 | } 163 | 164 | 165 | 166 | int main(int argc, char* argv[]) { 167 | 168 | const uint64_t n = 1UL << 15; 169 | const uint64_t m = 1UL << 15; 170 | 171 | TIMERSTART(overall) 172 | TIMERSTART(alloc) 173 | std::vector<no_init_t<uint64_t>> A(m*n); 174 | std::vector<no_init_t<uint64_t>> x(n); 175 | std::vector<no_init_t<uint64_t>> b(m); 176 | TIMERSTOP(alloc) 177 | 178 | TIMERSTART(init) 179 | init(A, x, m, n); 180 | TIMERSTOP(init) 181 | 182 | TIMERSTART(mult) 183 | block_cyclic_parallel_mult(A, x, b, m, n); 184 | TIMERSTOP(mult) 185 | 186 | TIMERSTOP(overall) 187 | 188 | //for (const auto& entry: b) 189 | // std::cout << entry << std::endl; 190 | 191 | for (uint64_t index = 0; index < m; index++) 192 | if (b[index] != index*(index+1)/2) 193 | std::cout << "error at position " << index << " " 194 | << b[index] << std::endl; 195 | 196 | } 197 | -------------------------------------------------------------------------------- /chapter4/return_values/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: traditional promise_future packaged_task async 5 | 6 | traditional: traditional.cpp 7 | $(CXX) traditional.cpp $(CXXFLAGS) -o traditional 8 | 9 | promise_future: promise_future.cpp 10 | $(CXX) promise_future.cpp $(CXXFLAGS) -o promise_future 11 | 12 | packaged_task: packaged_task.cpp 13 | $(CXX) packaged_task.cpp $(CXXFLAGS) -o packaged_task 14 | 15 | async: async.cpp 16 | $(CXX) async.cpp $(CXXFLAGS) -o async 17 | 18 | clean: 19 | rm -rf traditional 20 | rm -rf promise_future 21 | rm -rf packaged_task 22 | rm -rf async 23 | -------------------------------------------------------------------------------- /chapter4/return_values/async.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <future> 5 | 6 | uint64_t fibo(uint64_t n) { 7 | 8 | uint64_t a_0 = 0; 9 | uint64_t a_1 = 1; 10 | 11 | for (uint64_t index = 0; index < n; index++) { 12 | const uint64_t tmp = a_0; a_0 = a_1; a_1 += tmp; 13 | } 14 | 15 | return a_0; 16 | } 17 | 18 | int main(int argc, char * argv[]) { 19 | 20 | const uint64_t num_threads = 32; 21 | std::vector<std::future<uint64_t>> results; 22 | 23 | for (uint64_t id = 0; id < num_threads; id++) 24 | results.emplace_back( 25 | std::async( 26 | std::launch::async, fibo, id 27 | ) 28 | ); 29 | 30 | for (auto& result: results) 31 | std::cout << result.get() << std::endl; 32 | } 33 | -------------------------------------------------------------------------------- /chapter4/return_values/packaged_task.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <future> 6 | 7 | template < 8 | typename Func, // <-- type of function func 9 | typename ... Args, // <-- type of arguments arg0,arg1,...
10 | typename Rtrn=typename std::result_of<Func(Args...)>::type> 11 | auto create_task( // ^-- type of return value func(args) 12 | Func && func, 13 | Args && ...args) -> std::packaged_task<Rtrn(void)> { 14 | 15 | // basically build an auxiliary function aux(void) 16 | // without arguments returning func(arg0,arg1,...) 17 | auto aux = std::bind(std::forward<Func>(func), 18 | std::forward<Args>(args)...); 19 | 20 | 21 | // create a task wrapping the auxiliary function: 22 | // task() executes aux(void) := func(arg0,arg1,...) 23 | auto task = std::packaged_task<Rtrn(void)>(aux); 24 | 25 | // the return value of aux(void) is assigned to a 26 | // future object accessible via task.get_future() 27 | return task; 28 | } 29 | 30 | uint64_t fibo(uint64_t n) { 31 | 32 | uint64_t a_0 = 0; 33 | uint64_t a_1 = 1; 34 | 35 | for (uint64_t index = 0; index < n; index++) { 36 | const uint64_t tmp = a_0; a_0 = a_1; a_1 += tmp; 37 | } 38 | 39 | return a_0; 40 | } 41 | 42 | int main(int argc, char * argv[]) { 43 | 44 | const uint64_t num_threads = 32; 45 | 46 | std::vector<std::thread> threads; 47 | std::vector<std::future<uint64_t>> results; 48 | 49 | for (uint64_t id = 0; id < num_threads; id++) { 50 | auto task = create_task(fibo, id); 51 | results.emplace_back(task.get_future()); 52 | threads.emplace_back(std::move(task)); 53 | } 54 | 55 | for (auto& result: results) 56 | std::cout << result.get() << std::endl; 57 | 58 | for (auto& thread: threads) 59 | thread.detach(); 60 | } 61 | -------------------------------------------------------------------------------- /chapter4/return_values/promise_future.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <future> 6 | 7 | template < 8 | typename value_t, 9 | typename index_t> 10 | void fibo( 11 | value_t n, 12 | std::promise<value_t> && result) { 13 | 14 | value_t a_0 = 0; 15 | value_t a_1 = 1; 16 | 17 | for (index_t index = 0; index < n; index++) { 18 | const value_t tmp = a_0; a_0 = a_1; a_1 += tmp; 19 | } 20 | 21 | result.set_value(a_0); 22 | } 23 | 24 | int main(int argc, char * argv[]) { 25 | 26 | const uint64_t num_threads = 32; 27 | 28 | std::vector<std::thread> threads; 29 | std::vector<std::future<uint64_t>> results; 30 | 31 | for (uint64_t id = 0; id < num_threads; id++) { 32 | std::promise<uint64_t> promise; 33 | results.emplace_back(promise.get_future()); 34 | 35 | threads.emplace_back( 36 | std::thread( 37 | fibo<uint64_t, uint64_t>, id, std::move(promise) 38 | ) 39 | ); 40 | } 41 | 42 | 43 | for (auto& result: results) 44 | std::cout << result.get() << std::endl; 45 | 46 | for (auto& thread: threads) 47 | thread.detach(); 48 | 49 | } 50 | --------------------------------------------------------------------------------
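Besides values, a promise can also transport exceptions: if the worker calls set_exception instead of set_value, the corresponding future rethrows when get() is called. A minimal sketch of this pattern (not part of the book's sources):

#include <iostream>
#include <stdexcept>
#include <thread>
#include <future>

int main() {
    std::promise<int> promise;
    auto future = promise.get_future();

    std::thread worker([&promise] {
        try {
            throw std::runtime_error("computation failed");
        } catch (...) {
            // forward the caught exception to the future
            promise.set_exception(std::current_exception());
        }
    });

    try {
        future.get();             // rethrows the worker's exception
    } catch (const std::exception& e) {
        std::cout << e.what() << std::endl;
    }
    worker.join();
}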
/chapter4/return_values/traditional.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | 6 | template < 7 | typename value_t, 8 | typename index_t> 9 | void fibo( 10 | value_t n, 11 | value_t * result) { 12 | 13 | value_t a_0 = 0; 14 | value_t a_1 = 1; 15 | 16 | for (index_t index = 0; index < n; index++) { 17 | const value_t tmp = a_0; a_0 = a_1; a_1 += tmp; 18 | } 19 | 20 | *result = a_0; 21 | } 22 | 23 | int main(int argc, char * argv[]) { 24 | 25 | const uint64_t num_threads = 32; 26 | 27 | std::vector<std::thread> threads; 28 | std::vector<uint64_t> results(num_threads); 29 | 30 | for (uint64_t id = 0; id < num_threads; id++) { 31 | 32 | threads.emplace_back( 33 | std::thread( 34 | fibo<uint64_t, uint64_t>, id, &(results[id]) 35 | ) 36 | ); 37 | } 38 | 39 | for (auto& thread: threads) 40 | thread.join(); 41 | 42 | for (auto& result: results) 43 | std::cout << result << std::endl; 44 | } 45 | -------------------------------------------------------------------------------- /chapter4/thread_pool/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: main_basic main_basic_tree 5 | 6 | main_basic: main_basic.cpp 7 | $(CXX) main_basic.cpp $(CXXFLAGS) -o main_basic 8 | 9 | main_basic_tree: main_basic_tree.cpp 10 | $(CXX) main_basic_tree.cpp $(CXXFLAGS) -o main_basic_tree 11 | 12 | clean: 13 | rm -rf main_basic 14 | rm -rf main_basic_tree 15 | -------------------------------------------------------------------------------- /chapter4/thread_pool/main_basic.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include "threadpool_basic.hpp" 3 | 4 | ThreadPool TP(8); 5 | 6 | int main () { 7 | 8 | auto square = [](const uint64_t x) { 9 | return x*x; 10 | }; 11 | 12 | const uint64_t num_tasks = 32; 13 | std::vector<std::future<uint64_t>> futures; 14 | 15 | for (uint64_t task = 0; task < num_tasks; task++) { 16 | auto future = TP.enqueue(square, task); 17 | futures.emplace_back(std::move(future)); 18 | } 19 | 20 | for (auto& future : futures) 21 | std::cout << future.get() << std::endl; 22 | } 23 | -------------------------------------------------------------------------------- /chapter4/thread_pool/main_basic_tree.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include "threadpool_basic.hpp" 3 | 4 | ThreadPool TP(8); 5 | 6 | int main () { 7 | 8 | auto square = [](const uint64_t x) { 9 | return x*x; 10 | }; 11 | 12 | const uint64_t num_nodes = 32; 13 | std::vector<std::future<uint64_t>> futures; 14 | 15 | typedef std::function<void(uint64_t)> traverse_t; 16 | traverse_t traverse = [&] (uint64_t node){ 17 | if (node < num_nodes) { 18 | 19 | // submit the job 20 | auto future = TP.enqueue(square, node); 21 | futures.emplace_back(std::move(future)); 22 | 23 | // traverse a complete binary tree 24 | traverse(2*node+1); 25 | traverse(2*node+2); 26 | } 27 | }; 28 | 29 | // start at the root node 30 | traverse(0); 31 | 32 | // get the results 33 | for (auto& future : futures) 34 | std::cout << future.get() << std::endl; 35 | } 36 | -------------------------------------------------------------------------------- /chapter4/thread_pool/threadpool_basic.hpp: -------------------------------------------------------------------------------- 1 | #ifndef THREADPOOL_BASIC_HPP 2 | #define THREADPOOL_BASIC_HPP 3 | 4 | #include <cstdint> 5 | #include <future> 6 | #include <vector> 7 | #include <queue> 8 | #include <thread> 9 | #include <mutex> 10 | #include <functional> 11 | #include <condition_variable> 12 | 13 | class ThreadPool { 14 | 15 | private: 16 | 17 | // storage for threads and tasks 18 | std::vector<std::thread> threads; 19 | std::queue<std::function<void(void)>> tasks; 20 | 21 | // primitives for signaling 22 | std::mutex mutex; 23 | std::condition_variable cv; 24 | 25 | // the state of the thread pool 26 | bool stop_pool; 27 | uint32_t active_threads; 28 | const uint32_t capacity; 29 | 30 | // custom task factory 31 | template < 32 | typename Func, 33 | typename ...
Args, 34 | typename Rtrn=typename std::result_of<Func(Args...)>::type> 35 | auto make_task( 36 | Func && func, 37 | Args && ...args) -> std::packaged_task<Rtrn(void)> { 38 | 39 | auto aux = std::bind(std::forward<Func>(func), 40 | std::forward<Args>(args)...); 41 | 42 | return std::packaged_task<Rtrn(void)>(aux); 43 | } 44 | 45 | // will be executed before execution of a task 46 | void before_task_hook() { 47 | active_threads++; 48 | } 49 | 50 | // will be executed after execution of a task 51 | void after_task_hook() { 52 | active_threads--; 53 | } 54 | 55 | public: 56 | ThreadPool( 57 | uint64_t capacity_) : 58 | stop_pool(false), // pool is running 59 | active_threads(0), // no work to be done 60 | capacity(capacity_) { // remember size 61 | 62 | // this function is executed by the threads 63 | auto wait_loop = [this] ( ) -> void { 64 | 65 | // wait forever 66 | while (true) { 67 | 68 | // this is a placeholder task 69 | std::function<void(void)> task; 70 | 71 | { // lock this section for waiting 72 | std::unique_lock<std::mutex> 73 | unique_lock(mutex); 74 | 75 | // actions must be performed on 76 | // wake-up if (i) the thread pool 77 | // has been stopped or (ii) there 78 | // are still tasks to be processed 79 | auto predicate = [this] ( ) -> bool { 80 | return (stop_pool) || 81 | !(tasks.empty()); 82 | }; 83 | 84 | // wait to be woken up on 85 | // aforementioned conditions 86 | cv.wait(unique_lock, predicate); 87 | 88 | // exit if thread pool stopped 89 | // and no tasks to be performed 90 | if (stop_pool && tasks.empty()) 91 | return; 92 | 93 | // else extract task from queue 94 | task = std::move(tasks.front()); 95 | tasks.pop(); 96 | before_task_hook(); 97 | } // here we release the lock 98 | 99 | // execute the task in parallel 100 | task(); 101 | 102 | { // adjust the thread counter 103 | std::lock_guard<std::mutex> 104 | lock_guard(mutex); 105 | after_task_hook(); 106 | } // here we release the lock 107 | } 108 | }; 109 | 110 | // initially spawn capacity many threads 111 | for (uint64_t id = 0; id < capacity; id++) 112 | threads.emplace_back(wait_loop); 113 | } 114 | 115 | ~ThreadPool() { 116 | 117 | { // acquire a scoped lock 118 | std::lock_guard<std::mutex> 119 | lock_guard(mutex); 120 | 121 | // and subsequently alter 122 | // the global state to stop 123 | stop_pool = true; 124 | } // here we release the lock 125 | 126 | // signal all threads 127 | cv.notify_all(); 128 | 129 | // finally join all threads 130 | for (auto& thread : threads) 131 | thread.join(); 132 | } 133 | 134 | template < 135 | typename Func, 136 | typename ... Args, 137 | typename Pair=Func(Args...), 138 | typename Rtrn=typename std::result_of<Pair>::type> 139 | auto enqueue( 140 | Func && func, 141 | Args && ...
args) -> std::future<Rtrn> { 142 | 143 | // create the task, get the future 144 | // and wrap task in a shared pointer 145 | auto task = make_task(func, args...); 146 | auto future = task.get_future(); 147 | auto task_ptr = std::make_shared<std::packaged_task<Rtrn(void)>> 148 | (std::move(task)); 149 | 150 | { // lock the scope 151 | std::lock_guard<std::mutex> 152 | lock_guard(mutex); 153 | 154 | // you cannot reuse pool after being stopped 155 | if(stop_pool) 156 | throw std::runtime_error( 157 | "enqueue on stopped ThreadPool" 158 | ); 159 | 160 | // wrap the task in a generic void 161 | // function void -> void 162 | auto payload = [task_ptr] ( ) -> void { 163 | // basically call task() 164 | task_ptr->operator()(); 165 | }; 166 | 167 | // append the task to the queue 168 | tasks.emplace(payload); 169 | } 170 | 171 | // tell one thread to wake-up 172 | cv.notify_one(); 173 | 174 | return future; 175 | } 176 | }; 177 | 178 | #endif 179 | -------------------------------------------------------------------------------- /chapter5/atomics/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread -latomic -march=native 3 | 4 | all: query_atomics atomic_counting atomic_max arbitrary_atomics universal_atomics 5 | 6 | query_atomics: query_atomics.cpp 7 | $(CXX) query_atomics.cpp $(CXXFLAGS) -o query_atomics 8 | 9 | atomic_counting: atomic_counting.cpp 10 | $(CXX) atomic_counting.cpp $(CXXFLAGS) -o atomic_counting 11 | 12 | atomic_max: atomic_max.cpp 13 | $(CXX) atomic_max.cpp $(CXXFLAGS) -o atomic_max 14 | 15 | arbitrary_atomics: arbitrary_atomics.cpp 16 | $(CXX) arbitrary_atomics.cpp $(CXXFLAGS) -o arbitrary_atomics 17 | 18 | universal_atomics: universal_atomics.cpp 19 | $(CXX) universal_atomics.cpp $(CXXFLAGS) -o universal_atomics 20 | 21 | clean: 22 | rm -rf query_atomics 23 | rm -rf atomic_counting 24 | rm -rf atomic_max 25 | rm -rf arbitrary_atomics 26 | rm -rf universal_atomics 27 | --------------------------------------------------------------------------------
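The following sources build custom atomic operations from compare_exchange_weak: read the current value, compute a candidate, and retry the swap until no other thread has modified the value in between. As a warm-up, a hedged sketch of an atomic fetch-and-multiply, an operation std::atomic does not provide directly (this helper is illustrative, not from the book's sources):

#include <atomic>
#include <cstdint>

// multiply an atomic by 'factor' using a CAS loop; returns the old value
uint64_t fetch_mul(std::atomic<uint64_t>& value, uint64_t factor) {
    uint64_t expected = value.load();
    // compare_exchange_weak updates 'expected' with the current
    // value on failure, so the loop re-reads automatically
    while (!value.compare_exchange_weak(expected, expected*factor)) {}
    return expected;
}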
/chapter5/atomics/arbitrary_atomics.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <atomic> 6 | #include "../include/hpc_helpers.hpp" 7 | 8 | template < 9 | typename atomc_t, 10 | typename value_t, 11 | typename funct_t, 12 | typename predc_t> 13 | value_t binary_atomic( 14 | atomc_t& atomic, 15 | const value_t& operand, 16 | funct_t function, 17 | predc_t predicate) { 18 | 19 | value_t expect = atomic.load(); 20 | value_t target; 21 | 22 | do { 23 | // compute preliminary new value 24 | target = function(expect, operand); 25 | 26 | // immediately return if not fulfilling 27 | // the given constraint for a valid result 28 | if (!predicate(target)) 29 | return expect; 30 | 31 | // try to atomically swap new and old value 32 | } while (!atomic.compare_exchange_weak(expect, target)); 33 | 34 | // either new value if successful or the old 35 | // value for unsuccessful swap attempts: 36 | // in both cases it corresponds to atomic.load() 37 | return expect; 38 | } 39 | 40 | 41 | int main( ) { 42 | 43 | std::vector<std::thread> threads; 44 | const uint64_t num_threads = 10; 45 | const uint64_t num_iters = 100'000'000; 46 | 47 | auto even_max = 48 | [&] (volatile std::atomic<uint64_t>* counter, 49 | const auto& id) -> void { 50 | 51 | auto func = [] (const auto& lhs, 52 | const auto& rhs) { 53 | return lhs > rhs ? lhs : rhs; 54 | }; 55 | 56 | auto pred = [] (const auto& val) { 57 | return val % 2 == 0; 58 | }; 59 | 60 | for (uint64_t i = id; i < num_iters; i += num_threads) 61 | binary_atomic(*counter, i, func, pred); 62 | }; 63 | 64 | TIMERSTART(even_max) 65 | std::atomic<uint64_t> even_counter(0); 66 | for (uint64_t id = 0; id < num_threads; id++) 67 | threads.emplace_back(even_max, &even_counter, id); 68 | for (auto& thread : threads) 69 | thread.join(); 70 | TIMERSTOP(even_max) 71 | 72 | std::cout << even_counter << std::endl; 73 | } 74 | -------------------------------------------------------------------------------- /chapter5/atomics/atomic_counting.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <mutex> 6 | #include <atomic> 7 | #include "../include/hpc_helpers.hpp" 8 | 9 | int main( ) { 10 | 11 | std::mutex mutex; 12 | std::vector<std::thread> threads; 13 | const uint64_t num_threads = 10; 14 | const uint64_t num_iters = 100'000'000; 15 | 16 | auto lock_count = 17 | [&] (volatile uint64_t* counter, 18 | const auto& id) -> void { 19 | 20 | for (uint64_t i = id; i < num_iters; i += num_threads) { 21 | std::lock_guard<std::mutex> lock_guard(mutex); 22 | (*counter)++; 23 | } 24 | }; 25 | 26 | auto atomic_count = 27 | [&] (volatile std::atomic<uint64_t>* counter, 28 | const auto& id) -> void { 29 | 30 | for (uint64_t i = id; i < num_iters; i += num_threads) 31 | (*counter)++; 32 | }; 33 | 34 | TIMERSTART(mutex_multithreaded) 35 | uint64_t counter = 0; 36 | threads.clear(); 37 | for (uint64_t id = 0; id < num_threads; id++) 38 | threads.emplace_back(lock_count, &counter, id); 39 | for (auto& thread : threads) 40 | thread.join(); 41 | TIMERSTOP(mutex_multithreaded) 42 | 43 | TIMERSTART(atomic_multithreaded) 44 | std::atomic<uint64_t> atomic_counter(0); 45 | threads.clear(); 46 | for (uint64_t id = 0; id < num_threads; id++) 47 | threads.emplace_back(atomic_count, &atomic_counter, id); 48 | for (auto& thread : threads) 49 | thread.join(); 50 | TIMERSTOP(atomic_multithreaded) 51 | 52 | std::cout << counter << " " << atomic_counter << std::endl; 53 | } 54 | --------------------------------------------------------------------------------
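atomic_counting.cpp above uses sequentially consistent increments, the default for std::atomic. When the counter is only a statistic and imposes no ordering on surrounding memory operations, a relaxed increment is sufficient and often cheaper; a hedged variant of the counting closure (not in the book's sources):

#include <atomic>
#include <cstdint>

void relaxed_count(std::atomic<uint64_t>* counter, uint64_t id,
                   uint64_t num_iters, uint64_t num_threads) {
    for (uint64_t i = id; i < num_iters; i += num_threads)
        // no ordering constraints: only the final sum matters
        counter->fetch_add(1, std::memory_order_relaxed);
}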
/chapter5/atomics/atomic_max.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <atomic> 6 | #include "../include/hpc_helpers.hpp" 7 | 8 | int main( ) { 9 | 10 | std::vector<std::thread> threads; 11 | const uint64_t num_threads = 10; 12 | const uint64_t num_iters = 100'000'000; 13 | 14 | // WARNING: this closure produces incorrect results 15 | auto false_max = 16 | [&] (volatile std::atomic<uint64_t>* counter, 17 | const auto& id) -> void { 18 | 19 | for (uint64_t i = id; i < num_iters; i += num_threads) 20 | if(i > *counter) 21 | *counter = i; 22 | }; 23 | 24 | auto correct_max = 25 | [&] (volatile std::atomic<uint64_t>* counter, 26 | const auto& id) -> void { 27 | 28 | for (uint64_t i = id; i < num_iters; i += num_threads) { 29 | auto previous = counter->load(); 30 | while (previous < i && 31 | !counter->compare_exchange_weak(previous, i)) {} 32 | } 33 | }; 34 | 35 | TIMERSTART(incorrect_max) 36 | std::atomic<uint64_t> false_counter(0); 37 | threads.clear(); 38 | for (uint64_t id = 0; id < num_threads; id++) 39 | threads.emplace_back(false_max, &false_counter, id); 40 | for (auto& thread : threads) 41 | thread.join(); 42 | TIMERSTOP(incorrect_max) 43 | 44 | TIMERSTART(correct_max) 45 | std::atomic<uint64_t> correct_counter(0); 46 | threads.clear(); 47 | for (uint64_t id = 0; id < num_threads; id++) 48 | threads.emplace_back(correct_max, &correct_counter, id); 49 | for (auto& thread : threads) 50 | thread.join(); 51 | TIMERSTOP(correct_max) 52 | 53 | std::cout << false_counter << " " 54 | << correct_counter << std::endl; 55 | } 56 | -------------------------------------------------------------------------------- /chapter5/atomics/query_atomics.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <atomic> 3 | 4 | template < 5 | typename x_value_t, 6 | typename y_value_t, 7 | typename z_value_t> 8 | struct state_t { 9 | 10 | x_value_t x; 11 | y_value_t y; 12 | z_value_t z; 13 | }; 14 | 15 | template < 16 | typename R, 17 | typename S, 18 | typename T> 19 | void status() { // report size and if lock-free 20 | typedef std::atomic<state_t<R,S,T>> atomic_state_t; 21 | std::cout << sizeof(atomic_state_t) << "\t" 22 | << atomic_state_t().is_lock_free() 23 | << std::endl; 24 | } 25 | 26 | int main () { 27 | 28 | std::cout << "size\tlock_free?" << std::endl; 29 | 30 | status<uint8_t, uint8_t, uint8_t >(); // the template arguments on 31 | status<uint16_t, uint8_t, uint8_t >(); // these six calls were lost 32 | status<uint16_t, uint16_t, uint16_t>(); // in extraction; the combos 33 | status<uint32_t, uint16_t, uint16_t>(); // shown are plausible guesses 34 | status<uint32_t, uint32_t, uint32_t>(); // probing different sizes 35 | status<uint64_t, uint64_t, uint64_t>(); 36 | } 37 | -------------------------------------------------------------------------------- /chapter5/atomics/universal_atomics.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <atomic> 6 | #include "../include/hpc_helpers.hpp" 7 | 8 | template < 9 | typename atomc_t, 10 | typename value_t, 11 | typename funcp_t, 12 | typename funcn_t, 13 | typename predc_t> 14 | value_t ternary_atomic( 15 | atomc_t& atomic, 16 | const value_t& operand, 17 | funcp_t pos_function, 18 | funcn_t neg_function, 19 | predc_t predicate) { 20 | 21 | value_t expect = atomic.load(); 22 | value_t target; 23 | 24 | do { 25 | 26 | if (predicate(expect, operand)) 27 | target = pos_function(expect, operand); 28 | else 29 | target = neg_function(expect, operand); 30 | 31 | // try to atomically swap new and old value 32 | } while (!atomic.compare_exchange_weak(expect, target)); 33 | 34 | // either new value if successful or the old 35 | // value for unsuccessful swap attempts: 36 | // in both cases it corresponds to atomic.load() 37 | return expect; 38 | } 39 | 40 | 41 | int main( ) { 42 | 43 | std::vector<std::thread> threads; 44 | const uint64_t num_threads = 10; 45 | const uint64_t num_iters = 100'000'000; 46 | 47 | auto even_max = 48 | [&] (volatile std::atomic<uint64_t>* counter, 49 | const auto& id) -> void { 50 | 51 | auto pos_func = [] (const auto& lhs, 52 | const auto& rhs) { 53 | return lhs; 54 | }; 55 | 56 | auto neg_func = [] (const auto& lhs, 57 | const auto& rhs) { 58 | return rhs; 59 | }; 60 | 61 | auto pred = [] (const auto& lhs, 62 | const auto& rhs) { 63 | return lhs > rhs && lhs % 2 == 0; 64 | }; 65 | 66 | for (uint64_t i = id; i < num_iters; i += num_threads) 67 | ternary_atomic(*counter, i, pos_func, neg_func, pred); 68 | }; 69 | 70 | TIMERSTART(even_max) 71 | std::atomic<uint64_t> even_counter(0); 72 | for (uint64_t id = 0; id < num_threads; id++) 73 | threads.emplace_back(even_max, &even_counter, id); 74 | for (auto& thread : threads) 75 | thread.join(); 76 | TIMERSTOP(even_max) 77 | 78 | std::cout << even_counter << std::endl; 79 | } 80 | -------------------------------------------------------------------------------- /chapter5/include: -------------------------------------------------------------------------------- 1 | ../include/ --------------------------------------------------------------------------------
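query_atomics.cpp above matters for the knapsack program that follows: its global state packs a 32-bit bitmask and a 32-bit value into one 8-byte struct, which std::atomic can typically swap lock-free with a single CAS. A hedged sketch of why packing both fields into one atomic is essential (illustrative, not from the book's sources):

#include <atomic>
#include <cstdint>

struct packed_t { uint32_t bmask; uint32_t value; };  // 8 bytes, no padding

std::atomic<packed_t> state;  // static storage: zero-initialized

// update bitmask and value together: a reader can never observe a new
// value paired with a stale bitmask, a guarantee two separate atomics
// could not provide
void update(packed_t candidate) {
    packed_t current = state.load();
    while (current.value < candidate.value &&
           !state.compare_exchange_weak(current, candidate)) {}
}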
/chapter5/knapsack/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: knapsack 5 | 6 | knapsack: knapsack.cpp 7 | $(CXX) knapsack.cpp $(CXXFLAGS) -o knapsack 8 | 9 | clean: 10 | rm -rf knapsack 11 | -------------------------------------------------------------------------------- /chapter5/knapsack/knapsack.cpp: -------------------------------------------------------------------------------- 1 | #include <algorithm> // std::sort 2 | #include <iostream> // std::cout 3 | #include <vector> // std::vector 4 | #include <atomic> // std::atomic 5 | #include <random> // std::uniform_int_distribution 6 | #include "threadpool.hpp" // work sharing thread pool 7 | 8 | template < 9 | typename value_t_, 10 | typename weight_t_> 11 | struct generic_tuple_t { 12 | 13 | value_t_ value; 14 | weight_t_ weight; 15 | 16 | // expose types 17 | typedef value_t_ value_t; 18 | typedef weight_t_ weight_t; 19 | 20 | generic_tuple_t( 21 | value_t_ value_, 22 | weight_t_ weight_) : value (value_ ), 23 | weight(weight_) {} 24 | }; 25 | 26 | template < 27 | typename bmask_t_, 28 | typename value_t_> 29 | struct state_t { 30 | 31 | bmask_t_ bmask=0; 32 | value_t_ value=0; 33 | 34 | // expose template parameters 35 | typedef bmask_t_ bmask_t; 36 | typedef value_t_ value_t; 37 | 38 | // non-default constructors are not allowed 39 | // when wrapped with std::atomic 40 | }; 41 | 42 | // shortcuts for convenience 43 | typedef uint64_t index_t; 44 | typedef uint32_t bmask_t; 45 | typedef uint32_t value_t; 46 | typedef uint32_t weight_t; 47 | typedef generic_tuple_t<value_t, weight_t> tuple_t; 48 | 49 | // the global state encoding the mask and value 50 | std::atomic<state_t<bmask_t, value_t>> global_state; 51 | const value_t capacity (1500); 52 | const index_t num_items (32); 53 | std::vector<tuple_t> tuples; 54 | 55 | // our work-sharing thread pool 56 | ThreadPool TP(4); 57 | 58 | // initializes Knapsack problem 59 | template < 60 | typename tuple_t, 61 | typename index_t> 62 | void init_tuples( 63 | std::vector<tuple_t>& tuples, 64 | index_t num_entries) { 65 | 66 | // recover the types stored in tuple_t 67 | typedef typename tuple_t::value_t value_t; 68 | typedef typename tuple_t::weight_t weight_t; 69 | 70 | // C++11 random number generator 71 | std::mt19937 engine(0); // mersenne twister 72 | std::uniform_int_distribution<value_t> rho_v(80, 100); 73 | std::uniform_int_distribution<weight_t> rho_w(80, 100); 74 | 75 | // generate pairs of values and weights 76 | for (index_t index = 0; index < num_entries; index++) 77 | tuples.emplace_back(rho_v(engine), rho_w(engine)); 78 | 79 | // sort the pairs by value/weight density 80 | auto predicate = [] (const auto& lhs, 81 | const auto& rhs) -> bool { 82 | return lhs.value*rhs.weight > rhs.value*lhs.weight; 83 | }; 84 | 85 | std::sort(tuples.begin(), tuples.end(), predicate); 86 | } 87 | 88 | template < 89 | typename tuple_t, 90 | typename bmask_t> 91 | void atomic_update( 92 | tuple_t tuple, 93 | bmask_t bmask) { 94 | 95 | typedef typename tuple_t::value_t value_t; 96 | 97 | auto g_state = global_state.load(); 98 | auto l_value = tuple.value; 99 | state_t<bmask_t, value_t> target; 100 | 101 | do { 102 | 103 | // exit if solution is not optimal 104 | if (g_state.value > l_value) 105 | return; 106 | 107 | // construct the desired target 108 | target.value = l_value; 109 | target.bmask = bmask; 110 | 111 | } while (!global_state.compare_exchange_weak(g_state, target)); 112 | } 113 | 114 | template < 115 | typename index_t, 116 | typename tuple_t> 117 | typename tuple_t::value_t dantzig_bound( 118 | index_t height, 119 | tuple_t tuple) { 120 | 121 | auto
predicate = [&] (const index_t& i) { 122 | return i < num_items && 123 | tuple.weight < capacity; 124 | }; 125 | 126 | // greedily pack items until backpack full 127 | for (index_t i = height; predicate(i); i++) { 128 | tuple.value += tuples[i].value; 129 | tuple.weight += tuples[i].weight; 130 | } 131 | 132 | return tuple.value; 133 | } 134 | 135 | template < 136 | typename index_t, 137 | typename tuple_t, 138 | typename bmask_t> 139 | void traverse( 140 | index_t height, // height of the binary tree 141 | tuple_t tuple, // weight and value up to height 142 | bmask_t bmask) { // binary mask up to height 143 | 144 | // check whether item packed or not 145 | const bool bit = (bmask >> height) % 2; 146 | tuple.weight += bit*tuples[height].weight; 147 | tuple.value += bit*tuples[height].value; 148 | 149 | // check versus maximum capacity 150 | if (tuple.weight > capacity) 151 | return; // my backpack is full 152 | 153 | // update global lower bound if needed 154 | atomic_update(tuple, bmask); 155 | 156 | // calculate local Dantzig upper bound 157 | // and compare with global upper bound 158 | auto bsf = global_state.load().value; 159 | if (dantzig_bound(height+1, tuple) < bsf) 160 | return; 161 | 162 | // if everything was fine generate new candidate 163 | if (height+1 < num_items) { 164 | traverse(height+1, tuple, bmask+(1<<(height+1))); 165 | traverse(height+1, tuple, bmask); 166 | } 167 | } 168 | 169 | int main () { 170 | 171 | // initialize tuples with random values 172 | init_tuples(tuples, num_items); 173 | 174 | // traverse left and right branch 175 | TP.spawn(traverse<index_t, tuple_t, bmask_t>, 176 | 0, tuple_t(0, 0), 0); 177 | TP.spawn(traverse<index_t, tuple_t, bmask_t>, 178 | 0, tuple_t(0, 0), 1); 179 | 180 | // wait for all tasks to be finished 181 | TP.wait_and_stop(); 182 | 183 | // report the final solution 184 | auto g_state = global_state.load(); 185 | std::cout << "value " << g_state.value << std::endl; 186 | 187 | auto bmask = g_state.bmask; 188 | for (index_t i = 0; i < num_items; i++) { 189 | std::cout << bmask % 2 << " "; 190 | bmask >>= 1; 191 | } 192 | std::cout << std::endl; 193 | } 194 | -------------------------------------------------------------------------------- /chapter5/knapsack/threadpool.hpp: -------------------------------------------------------------------------------- 1 | ../thread_pool/threadpool.hpp -------------------------------------------------------------------------------- /chapter5/thread_pool/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: tree 5 | 6 | tree: tree.cpp 7 | $(CXX) tree.cpp $(CXXFLAGS) -o tree 8 | 9 | clean: 10 | rm -rf tree 11 | -------------------------------------------------------------------------------- /chapter5/thread_pool/tree.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include "threadpool.hpp" 4 | #include "../include/hpc_helpers.hpp" 5 | 6 | ThreadPool TP(8); 7 | 8 | void waste_cycles(uint64_t num_cycles) { 9 | 10 | volatile uint64_t counter = 0; 11 | for (uint64_t i = 0; i < num_cycles; i++) 12 | counter++; 13 | } 14 | 15 | void traverse(uint64_t node, uint64_t num_nodes) { 16 | 17 | if (node < num_nodes) { 18 | 19 | waste_cycles(1<<15); 20 | 21 | TP.spawn(traverse, 2*node+1, num_nodes); 22 | traverse(2*node+2, num_nodes); 23 | } 24 | } 25 | 26 | int main() { 27 | 28 | TIMERSTART(traverse) 29 | TP.spawn(traverse, 0, 1<<20); 30 | TP.wait_and_stop(); 31 | TIMERSTOP(traverse) 32 | 33 | } 34 |
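tree.cpp spawns only one child into the pool and recurses into the other on the current thread, which bounds queue growth while keeping all workers busy. The OpenMP runtime used from chapter6 onward provides the same pattern through tasks; a hedged sketch of the equivalent traversal (illustrative, not from the book's sources):

#include <cstdint>

void waste_cycles(uint64_t num_cycles) {
    volatile uint64_t counter = 0;
    for (uint64_t i = 0; i < num_cycles; i++)
        counter++;
}

void traverse(uint64_t node, uint64_t num_nodes) {
    if (node < num_nodes) {
        waste_cycles(1<<15);
        // enqueue the left child as a task, descend right directly
        #pragma omp task
        traverse(2*node+1, num_nodes);
        traverse(2*node+2, num_nodes);
    }
}

int main() {
    #pragma omp parallel
    #pragma omp single      // one thread seeds the task tree
    traverse(0, 1<<20);     // implicit barrier waits for all tasks
}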
-------------------------------------------------------------------------------- /chapter6/1NN_classification/1NN.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> // std::cout 2 | #include <limits> // std::numeric_limits 3 | #include <vector> // std::vector 4 | 5 | // hpc_helpers contains the TIMERSTART and TIMERSTOP macros 6 | // and the no_init_t template that disables implicit type 7 | // initialization 8 | #include "../include/hpc_helpers.hpp" 9 | // binary_IO contains the load_binary function to load 10 | // and store binary data from and to a file 11 | #include "../include/binary_IO.hpp" 12 | 13 | template <typename value_t, 14 | typename index_t> 15 | void all_vs_all(value_t* test, 16 | value_t* train, 17 | value_t* delta, 18 | index_t num_test, 19 | index_t num_train, 20 | index_t num_features, 21 | bool parallel) { 22 | 23 | // coarse-grained parallelism 24 | #pragma omp parallel for collapse(2) if(parallel) 25 | for (index_t i = 0; i < num_test; i++) 26 | for (index_t j = 0; j < num_train; j++) { 27 | value_t accum = value_t(0); 28 | // fine-grained parallelism 29 | // #pragma omp parallel for reduction(+:accum) 30 | for (index_t k = 0; k < num_features; k++) { 31 | const value_t residue = test [i*num_features+k] 32 | - train[j*num_features+k]; 33 | accum += residue*residue; 34 | } 35 | delta[i*num_train+j] = accum; 36 | } 37 | } 38 | 39 | template <typename label_t, 40 | typename value_t, 41 | typename index_t> 42 | value_t accuracy(label_t* label_test, 43 | label_t* label_train, 44 | value_t* delta, 45 | index_t num_test, 46 | index_t num_train, 47 | index_t num_classes, 48 | bool parallel) { 49 | 50 | index_t counter = index_t(0); 51 | 52 | #pragma omp parallel for reduction(+:counter) if(parallel) 53 | for (index_t i = 0; i < num_test; i++) { 54 | 55 | // the initial distance is float::max 56 | // the initial index j_star is some dummy value 57 | value_t bsf = std::numeric_limits<value_t>::max(); 58 | index_t jst = std::numeric_limits<index_t>::max(); 59 | 60 | // find training sample with smallest distance 61 | for (index_t j = 0; j < num_train; j++) { 62 | const value_t value = delta[i*num_train+j]; 63 | if (value < bsf) { 64 | bsf = value; 65 | jst = j; 66 | } 67 | } 68 | 69 | // compare predicted label with original label 70 | bool match = true; 71 | for (index_t k = 0; k < num_classes; k++) 72 | match &= label_test [i *num_classes+k] == 73 | label_train[jst*num_classes+k]; 74 | 75 | counter += match; 76 | } 77 | 78 | return value_t(counter)/value_t(num_test); 79 | } 80 | 81 | int main(int argc, char* argv[]) { 82 | 83 | // run parallelized when any command line argument given 84 | const bool parallel = argc > 1; 85 | 86 | std::cout << "running " 87 | << (parallel ?
"in parallel" : "sequentially") 88 | << std::endl; 89 | 90 | // the shape of the data matrices 91 | const uint64_t num_features = 28*28; 92 | const uint64_t num_classes = 10; 93 | const uint64_t num_entries = 65000; 94 | const uint64_t num_train = 55000; 95 | const uint64_t num_test = num_entries-num_train; 96 | 97 | // memory for the data matrices and all-pair matrix 98 | std::vector input(num_entries*num_features); 99 | std::vector label(num_entries*num_classes); 100 | std::vector delta(num_test*num_train); 101 | 102 | // get the images and labels from disk 103 | load_binary(input.data(), input.size(), "./data/X.bin"); 104 | load_binary(label.data(), label.size(), "./data/Y.bin"); 105 | 106 | TIMERSTART(all_vs_all) 107 | const uint64_t inp_off = num_train * num_features; 108 | all_vs_all(input.data() + inp_off, 109 | input.data(), 110 | delta.data(), 111 | num_test, num_train, 112 | num_features, parallel); 113 | TIMERSTOP(all_vs_all) 114 | 115 | TIMERSTART(classify) 116 | const uint64_t lbl_off = num_train * num_classes; 117 | auto acc = accuracy(label.data() + lbl_off, 118 | label.data(), 119 | delta.data(), 120 | num_test, num_train, 121 | num_classes, parallel); 122 | TIMERSTOP(classify) 123 | 124 | std::cout << "test accuracy: " << acc << std::endl; 125 | } 126 | -------------------------------------------------------------------------------- /chapter6/1NN_classification/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp 3 | 4 | all: 1NN 5 | 6 | 1NN: 1NN.cpp 7 | $(CXX) 1NN.cpp $(CXXFLAGS) -o 1NN 8 | 9 | clean: 10 | rm -rf 1NN 11 | -------------------------------------------------------------------------------- /chapter6/1NN_classification/data/mnist_exporter.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # run __ONE__ of the following commands: 3 | # pip install --user tensorflow (if you have no CUDA-enabled GPU) 4 | # pip install --user tensorflow-gpu 5 | # 6 | # afterwards install tflearn 7 | # pip install --user tflearn 8 | # 9 | # Numpy should come bundled with tensorflow. Run this file et voila! 
/chapter6/1NN_classification/data/mnist_exporter.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # run __ONE__ of the following commands: 3 | # pip install --user tensorflow (if you have no CUDA-enabled GPU) 4 | # pip install --user tensorflow-gpu 5 | # 6 | # afterwards install tflearn 7 | # pip install --user tflearn 8 | # 9 | # Numpy should come bundled with tensorflow. Run this file et voila! 10 | ##################################################################### 11 | 12 | import tflearn 13 | 14 | # Data loading and preprocessing 15 | import tflearn.datasets.mnist as mnist 16 | X, Y, testX, testY = mnist.load_data(one_hot=True) 17 | 18 | import array as ar 19 | import numpy as np 20 | 21 | with open("X.bin", "wb") as f: 22 | images = np.vstack((X, testX)) 23 | print(images.shape) 24 | f.write(ar.array("f", images.flatten())) 25 | 26 | with open("Y.bin", "wb") as f: 27 | labels = np.vstack((Y, testY)) 28 | print(labels.shape) 29 | f.write(ar.array("f", labels.flatten())) 30 | -------------------------------------------------------------------------------- /chapter6/advanced_reductions/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++-6 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp -mavx -march=native 3 | 4 | all: custom_reduction avx_reduction string_reduction 5 | 6 | custom_reduction: custom_reduction.cpp 7 | $(CXX) custom_reduction.cpp $(CXXFLAGS) -o custom_reduction 8 | 9 | avx_reduction: avx_reduction.cpp 10 | $(CXX) avx_reduction.cpp $(CXXFLAGS) -o avx_reduction 11 | 12 | string_reduction: string_reduction.cpp 13 | $(CXX) string_reduction.cpp $(CXXFLAGS) -o string_reduction 14 | 15 | clean: 16 | rm -rf custom_reduction 17 | rm -rf avx_reduction 18 | rm -rf string_reduction 19 | -------------------------------------------------------------------------------- /chapter6/advanced_reductions/avx_reduction.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> // std::cout 2 | #include <cstdint> // uint64_t 3 | #include <cmath> // INFINITY 4 | #include <random> // random 5 | #include <immintrin.h> // AVX intrinsics 6 | 7 | struct avxop { 8 | 9 | __m256 neutral; 10 | 11 | avxop() : neutral(_mm256_set1_ps(-INFINITY)) {} 12 | 13 | inline __m256 operator()( 14 | const __m256& lhs, 15 | const __m256& rhs) const { 16 | 17 | return _mm256_max_ps(lhs, rhs); 18 | } 19 | }; 20 | 21 | void init(float * data, uint64_t length) { 22 | 23 | std::mt19937 engine(42); 24 | std::uniform_real_distribution<float> density(-1L<<28, 1L<<28); 25 | 26 | for (uint64_t i = 0; i < length; i++) 27 | data[i] = density(engine); 28 | } 29 | 30 | inline float hmax_sse3(__m128 v) { 31 | __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0 32 | __m128 maxs = _mm_max_ps(v, shuf); 33 | shuf = _mm_movehl_ps(shuf, maxs); // high half -> low half 34 | maxs = _mm_max_ss(maxs, shuf); 35 | return _mm_cvtss_f32(maxs); 36 | } 37 | 38 | inline float hmax_avx(__m256 v) { 39 | __m128 lo = _mm256_castps256_ps128(v); // low 128 40 | __m128 hi = _mm256_extractf128_ps(v, 1); // high 128 41 | lo = _mm_max_ps(lo, hi); // max the low 128 42 | return hmax_sse3(lo); // and inline the sse3 version 43 | } 44 | 45 | int main () { 46 | 47 | const uint64_t num_entries = 1UL << 28; 48 | const uint64_t num_bytes = num_entries*sizeof(float); 49 | auto data = static_cast<float*>(_mm_malloc(num_bytes , 32)); 50 | init(data, num_entries); 51 | 52 | #pragma omp declare reduction(avx_max : __m256 : \ 53 | omp_out = avxop()(omp_out, omp_in)) \ 54 | initializer (omp_priv=avxop().neutral) 55 | 56 | auto result = avxop().neutral; 57 | 58 | # pragma omp parallel for reduction(avx_max:result) 59 | for (uint64_t i = 0; i < num_entries; i += 8) 60 | result = avxop()(result, _mm256_load_ps(data+i)); 61 | 62 | std::cout << hmax_avx(result) << std::endl; 63 | 64 | _mm_free(data); 65 | } 66 | --------------------------------------------------------------------------------
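_mm256_load_ps in avx_reduction.cpp requires 32-byte aligned addresses, which is why the buffer comes from _mm_malloc(num_bytes, 32) rather than new. When alignment cannot be guaranteed, the unaligned variant is the safe alternative at a small cost; a hedged sketch (illustrative):

#include <immintrin.h>

// aligned load: undefined behavior if 'ptr' is not 32-byte aligned
// __m256 v = _mm256_load_ps(ptr);

// unaligned load: works for any float* at a minor penalty
// on modern x86 cores
__m256 load_any(const float* ptr) {
    return _mm256_loadu_ps(ptr);
}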
/chapter6/advanced_reductions/custom_reduction.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | 4 | template < 5 | typename value_t> 6 | struct binop { 7 | 8 | constexpr static value_t neutral = 0; 9 | 10 | inline value_t operator()( 11 | const value_t& lhs, 12 | const value_t& rhs) const { 13 | 14 | const value_t ying = std::abs(lhs); 15 | const value_t yang = std::abs(rhs); 16 | 17 | return ying > yang ? lhs : rhs; 18 | } 19 | }; 20 | 21 | int main () { 22 | 23 | const uint64_t num_iters = 1UL << 20; 24 | int64_t result = binop<int64_t>::neutral; 25 | 26 | #pragma omp declare reduction(custom_op : int64_t : \ 27 | omp_out = binop<int64_t>()(omp_out, omp_in)) \ 28 | initializer (omp_priv=binop<int64_t>::neutral) 29 | 30 | 31 | # pragma omp parallel for reduction(custom_op:result) 32 | for (uint64_t i = 0; i < num_iters; i++) 33 | result = binop<int64_t>()(result, i&1 ? -i : i); 34 | 35 | std::cout << result << std::endl; 36 | 37 | 38 | } 39 | -------------------------------------------------------------------------------- /chapter6/advanced_reductions/string_reduction.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <string> 5 | 6 | int main () { 7 | 8 | std::string result("SIMON SAYS_"); 9 | std::vector<std::string> data {"p", "a", "r", "a", "l", "l", 10 | "e", "l", " ", "p", "r", "o", 11 | "g", "r", "a", "m", "m", "i", 12 | "n", "g", " ", "i", "s", " ", 13 | "f", "u", "n", "!"}; 14 | 15 | #pragma omp declare reduction(custom_op : std::string : \ 16 | omp_out = omp_out+omp_in) \ 17 | initializer (omp_priv=std::string("")) 18 | 19 | # pragma omp parallel for reduction(custom_op:result) num_threads(2) 20 | for (uint64_t i = 0; i < data.size(); i++) 21 | result = result+data[i]; 22 | 23 | std::cout << result << std::endl; 24 | 25 | 26 | } 27 | -------------------------------------------------------------------------------- /chapter6/hello_world/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp 3 | 4 | all: hello_world 5 | 6 | hello_world: hello_world.cpp 7 | $(CXX) hello_world.cpp $(CXXFLAGS) -o hello_world 8 | 9 | clean: 10 | rm -rf hello_world 11 | -------------------------------------------------------------------------------- /chapter6/hello_world/hello_world.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | 3 | int main() { 4 | // run the statement after the pragma in the current team 5 | #pragma omp parallel 6 | std::cout << "Hello world!"
<< std::endl; 7 | } 8 | -------------------------------------------------------------------------------- /chapter6/include: -------------------------------------------------------------------------------- 1 | ../include/ -------------------------------------------------------------------------------- /chapter6/load_imbalance/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp 3 | 4 | all: scheduling 5 | 6 | scheduling: scheduling.cpp 7 | $(CXX) scheduling.cpp $(CXXFLAGS) -o scheduling 8 | 9 | clean: 10 | rm -rf scheduling 11 | -------------------------------------------------------------------------------- /chapter6/load_imbalance/data: -------------------------------------------------------------------------------- 1 | ../1NN_classification/data/ -------------------------------------------------------------------------------- /chapter6/load_imbalance/scheduling.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> // std::cout 2 | #include <vector> // std::vector 3 | 4 | // hpc_helpers contains the TIMERSTART and TIMERSTOP macros 5 | #include "../include/hpc_helpers.hpp" 6 | // binary_IO contains the load_binary function to load 7 | // and store binary data from and to a file 8 | #include "../include/binary_IO.hpp" 9 | 10 | // we will change this mode later 11 | #define MODE dynamic 12 | 13 | template <typename value_t, 14 | typename index_t> 15 | void inner_product(value_t * data, 16 | value_t * delta, 17 | index_t num_entries, 18 | index_t num_features, 19 | bool parallel) { 20 | 21 | #pragma omp parallel for schedule(MODE) if(parallel) 22 | for (index_t i = 0; i < num_entries; i++) 23 | for (index_t j = i; j < num_entries; j++) { 24 | value_t accum = value_t(0); 25 | for (index_t k = 0; k < num_features; k++) 26 | accum += data[i*num_features+k] * 27 | data[j*num_features+k]; 28 | delta[i*num_entries+j] = 29 | delta[j*num_entries+i] = accum; 30 | } 31 | } 32 | 33 | int main(int argc, char* argv[]) { 34 | 35 | // run parallelized when any command line argument given 36 | const bool parallel = argc > 1; 37 | 38 | std::cout << "running " 39 | << (parallel ?
"in parallel" : "sequentially") 40 | << std::endl; 41 | 42 | // the shape of the data matrices 43 | const uint64_t num_features = 28*28; 44 | const uint64_t num_entries = 65000; 45 | 46 | TIMERSTART(alloc) 47 | // memory for the data matrices and all-pair matrix 48 | std::vector input(num_entries*num_features); 49 | std::vector delta(num_entries*num_entries); 50 | TIMERSTOP(alloc) 51 | 52 | TIMERSTART(read_data) 53 | // get the images and labels from disk 54 | load_binary(input.data(), input.size(), "./data/X.bin"); 55 | TIMERSTOP(read_data) 56 | 57 | TIMERSTART(inner_product) 58 | inner_product(input.data(), delta.data(), 59 | num_entries, num_features, parallel); 60 | TIMERSTOP(inner_product) 61 | } 62 | -------------------------------------------------------------------------------- /chapter6/matrix_vector/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp 3 | 4 | all: matrix_vector 5 | 6 | matrix_vector: matrix_vector.cpp 7 | $(CXX) matrix_vector.cpp $(CXXFLAGS) -o matrix_vector 8 | 9 | clean: 10 | rm -rf matrix_vector 11 | -------------------------------------------------------------------------------- /chapter6/matrix_vector/matrix_vector.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // hpc_helpers contains the TIMERSTART and TIMERSTOP macros 6 | // and the no_init_t template that disables implicit type 7 | // initialization 8 | #include "../include/hpc_helpers.hpp" 9 | 10 | template 12 | void init(std::vector& A, 13 | std::vector& x, 14 | index_t m, 15 | index_t n) { 16 | 17 | for (index_t row = 0; row < m; row++) 18 | for (index_t col = 0; col < n; col++) 19 | A[row*n+col] = row >= col ? 
1 : 0; 20 | 21 | for (index_t col = 0; col < m; col++) 22 | x[col] = col; 23 | } 24 | 25 | template <typename value_t, 26 | typename index_t> 27 | void mult(std::vector<value_t>& A, 28 | std::vector<value_t>& x, 29 | std::vector<value_t>& b, 30 | index_t m, 31 | index_t n, 32 | bool parallel) { 33 | 34 | #pragma omp parallel for if(parallel) 35 | for (index_t row = 0; row < m; row++) { 36 | value_t accum = value_t(0); 37 | for (index_t col = 0; col < n; col++) 38 | accum += A[row*n+col]*x[col]; 39 | b[row] = accum; 40 | } 41 | } 42 | 43 | int main() { 44 | const uint64_t n = 1UL << 15; 45 | const uint64_t m = 1UL << 15; 46 | 47 | TIMERSTART(overall) 48 | // memory allocation for the three vectors x, y, and z 49 | // with the no_init_t template as a wrapper for the actual type 50 | TIMERSTART(alloc) 51 | std::vector<no_init_t<uint64_t>> A(m*n); 52 | std::vector<no_init_t<uint64_t>> x(n); 53 | std::vector<no_init_t<uint64_t>> b(m); 54 | TIMERSTOP(alloc) 55 | 56 | // manually initialize the input matrix A and vector x 57 | TIMERSTART(init) 58 | init(A, x, m, n); 59 | TIMERSTOP(init) 60 | 61 | // compute A * x = b sequentially three times 62 | for (uint64_t k = 0; k < 3; k++) { 63 | TIMERSTART(mult_seq) 64 | mult(A, x, b, m, n, false); 65 | TIMERSTOP(mult_seq) 66 | } 67 | // compute A * x = b in parallel three times 68 | for (uint64_t k = 0; k < 3; k++) { 69 | TIMERSTART(mult_par) 70 | mult(A, x, b, m, n, true); 71 | TIMERSTOP(mult_par) 72 | } 73 | TIMERSTOP(overall) 74 | 75 | // check if (last) result is correct 76 | for (uint64_t index = 0; index < m; index++) 77 | if (b[index] != index*(index+1)/2) 78 | std::cout << "error at position " << index 79 | << " " << b[index] << std::endl; 80 | } 81 | 82 | -------------------------------------------------------------------------------- /chapter6/softmax_regression/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++-6 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp 3 | 4 | all: softmax 5 | 6 | softmax: softmax.cpp 7 | $(CXX) softmax.cpp $(CXXFLAGS) -o softmax 8 | 9 | clean: 10 | rm -rf softmax 11 | -------------------------------------------------------------------------------- /chapter6/softmax_regression/data/mnist_softmax.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # run __ONE__ of the following commands: 3 | # pip install --user tensorflow (if you have no CUDA-enabled GPU) 4 | # pip install --user tensorflow-gpu 5 | # 6 | # afterwards install tflearn 7 | # pip install --user tflearn 8 | # 9 | # Numpy should come bundled with tensorflow. Run this file et voila!
-------------------------------------------------------------------------------- /chapter6/vector_add/Makefile: --------------------------------------------------------------------------------
1 | CXX= g++
2 | CXXFLAGS= -std=c++14 -O2 -fopenmp
3 | 
4 | all: vector_add vector_add_scoped
5 | 
6 | vector_add: vector_add.cpp
7 | 	$(CXX) vector_add.cpp $(CXXFLAGS) -o vector_add
8 | 
9 | vector_add_scoped: vector_add_scoped.cpp
10 | 	$(CXX) vector_add_scoped.cpp $(CXXFLAGS) -o vector_add_scoped
11 | 
12 | clean:
13 | 	rm -rf vector_add
14 | 	rm -rf vector_add_scoped
15 | 
-------------------------------------------------------------------------------- /chapter6/vector_add/vector_add.cpp: --------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cstdint>
3 | #include <vector>
4 | #include <omp.h>
5 | 
6 | #include "../include/hpc_helpers.hpp"
7 | 
8 | int main() {
9 | 
10 |     TIMERSTART(alloc)
11 |     const uint64_t num_entries = 1UL << 30;
12 |     std::vector<no_init_t<uint64_t>> x(num_entries);
13 |     std::vector<no_init_t<uint64_t>> y(num_entries);
14 |     std::vector<no_init_t<uint64_t>> z(num_entries);
15 |     TIMERSTOP(alloc)
16 | 
17 |     TIMERSTART(init)
18 |     #pragma omp parallel for
19 |     for (uint64_t i = 0; i < num_entries; i++) {
20 |         x[i] = i;
21 |         y[i] = num_entries-i;
22 |     }
23 |     TIMERSTOP(init)
24 | 
25 |     TIMERSTART(add)
26 |     #pragma omp parallel for
27 |     for (uint64_t i = 0; i < num_entries; i++)
28 |         z[i] = x[i]+y[i];
29 |     TIMERSTOP(add)
30 | 
31 |     TIMERSTART(check)
32 |     #pragma omp parallel for
33 |     for (uint64_t i = 0; i < num_entries; i++)
34 |         if(z[i]-num_entries)
35 |             std::cout << "error at position "
36 |                       << i << std::endl;
37 |     TIMERSTOP(check)
38 | }
39 | 
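The alloc timer above is exactly where no_init_t earns its keep: a plain std::vector<T> value-initializes every element on construction, so the allocation alone already touches gigabytes of memory. A rough side-by-side sketch, assuming the chapter's include layout (shrink n if your machine has little RAM):

#include <cstdint>
#include <iostream>
#include <vector>
#include "../include/hpc_helpers.hpp"

int main() {
    const uint64_t n = 1UL << 28;            // 2 GiB per vector of uint64_t
    TIMERSTART(plain)                        // zero-initializes all n elements
    std::vector<uint64_t> a(n);
    TIMERSTOP(plain)
    TIMERSTART(no_init)                      // allocates only; elements stay uninitialized
    std::vector<no_init_t<uint64_t>> b(n);
    TIMERSTOP(no_init)
    std::cout << a.size() + b.size() << std::endl;
}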
"../include/hpc_helpers.hpp" 9 | 10 | int main() { 11 | // memory allocation for the three vectors x, y, and z 12 | // with the no_init_t template as a wrapper for the actual type 13 | TIMERSTART(alloc) 14 | const uint64_t num_entries = 1UL << 30; 15 | std::vector> x(num_entries); 16 | std::vector> y(num_entries); 17 | std::vector> z(num_entries); 18 | TIMERSTOP(alloc) 19 | 20 | TIMERSTART(alltogether) 21 | #pragma omp parallel 22 | { 23 | #pragma omp for 24 | for (uint64_t i = 0; i < num_entries; i++) { 25 | x[i] = i; 26 | y[i] = num_entries - i; 27 | } 28 | 29 | #pragma omp for 30 | for (uint64_t i = 0; i < num_entries; i++) 31 | z[i] = x[i] + y[i]; 32 | 33 | #pragma omp for 34 | for (uint64_t i = 0; i < num_entries; i++) 35 | if (z[i] - num_entries) 36 | std::cout << "error at position " 37 | << i << std::endl; 38 | } 39 | TIMERSTOP(alltogether) 40 | } 41 | -------------------------------------------------------------------------------- /chapter7/dynamic_time_warping/Makefile: -------------------------------------------------------------------------------- 1 | NVCC= nvcc 2 | NVCCFLAGS= -O2 -std=c++14 -arch=sm_61 3 | CXXFLAGS= -Xcompiler="-fopenmp -march=native" 4 | 5 | all: dtw_host dtw_device 6 | 7 | dtw_host: dtw_host.cu 8 | $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) dtw_host.cu -o dtw_host 9 | 10 | dtw_device: dtw_device.cu 11 | $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) dtw_device.cu -o dtw_device 12 | 13 | clean: 14 | rm -f dtw_host 15 | rm -f dtw_device 16 | -------------------------------------------------------------------------------- /chapter7/dynamic_time_warping/dtw_host.cu: -------------------------------------------------------------------------------- 1 | #include "../include/cbf_generator.hpp" 2 | #include "../include/hpc_helpers.hpp" 3 | #include "../include/binary_IO.hpp" 4 | 5 | typedef uint64_t index_t; 6 | typedef uint8_t label_t; 7 | typedef float value_t; 8 | 9 | template < 10 | typename index_t, 11 | typename value_t> 12 | value_t plain_dtw( 13 | value_t * query, 14 | value_t * subject, 15 | index_t num_features) { 16 | 17 | // for convenient indexing 18 | const index_t lane = num_features+1; 19 | 20 | // allocate the matrix of M 21 | value_t * penalty = new value_t[lane*lane]; 22 | 23 | // initialize the matrix M 24 | for (index_t index = 1; index < lane-1; index++) { 25 | penalty[index] = INFINITY; 26 | penalty[index*lane] = INFINITY; 27 | } 28 | penalty[0] = 0; 29 | 30 | // traverse graph in row-major order 31 | for (index_t row = 1; row < lane; row++) { 32 | 33 | const value_t q_value = query[row-1]; 34 | 35 | for (index_t col = 1; col < lane; col++) { 36 | 37 | // determine contribution from incoming edges 38 | const value_t diag = penalty[(row-1)*lane+col-1]; 39 | const value_t abve = penalty[(row-1)*lane+col+0]; 40 | const value_t left = penalty[(row+0)*lane+col-1]; 41 | 42 | // compute residue between query and subject 43 | const value_t residue = q_value-subject[col-1]; 44 | 45 | // relax node 46 | penalty[row*lane+col] = residue*residue + 47 | min(diag, 48 | min(abve, left)); 49 | } 50 | } 51 | 52 | // report the lower right cell and free memory 53 | const value_t result = penalty[lane*lane-1]; 54 | delete [] penalty; 55 | 56 | return result; 57 | } 58 | 59 | template < 60 | typename index_t, 61 | typename value_t> 62 | value_t dtw( 63 | value_t * query, 64 | value_t * subject, 65 | index_t num_features) { 66 | 67 | const index_t lane = num_features+1; 68 | value_t * penalty = new value_t[2*lane]; 69 | 70 | for (index_t index = 0; index < lane; index++) 71 | 
-------------------------------------------------------------------------------- /chapter7/dynamic_time_warping/Makefile: --------------------------------------------------------------------------------
1 | NVCC= nvcc
2 | NVCCFLAGS= -O2 -std=c++14 -arch=sm_61
3 | CXXFLAGS= -Xcompiler="-fopenmp -march=native"
4 | 
5 | all: dtw_host dtw_device
6 | 
7 | dtw_host: dtw_host.cu
8 | 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) dtw_host.cu -o dtw_host
9 | 
10 | dtw_device: dtw_device.cu
11 | 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) dtw_device.cu -o dtw_device
12 | 
13 | clean:
14 | 	rm -f dtw_host
15 | 	rm -f dtw_device
16 | 
-------------------------------------------------------------------------------- /chapter7/dynamic_time_warping/dtw_host.cu: --------------------------------------------------------------------------------
1 | #include "../include/cbf_generator.hpp"
2 | #include "../include/hpc_helpers.hpp"
3 | #include "../include/binary_IO.hpp"
4 | 
5 | typedef uint64_t index_t;
6 | typedef uint8_t label_t;
7 | typedef float value_t;
8 | 
9 | template <
10 |     typename index_t,
11 |     typename value_t>
12 | value_t plain_dtw(
13 |     value_t * query,
14 |     value_t * subject,
15 |     index_t num_features) {
16 | 
17 |     // for convenient indexing
18 |     const index_t lane = num_features+1;
19 | 
20 |     // allocate the penalty matrix M
21 |     value_t * penalty = new value_t[lane*lane];
22 | 
23 |     // initialize the matrix M
24 |     for (index_t index = 1; index < lane-1; index++) {
25 |         penalty[index] = INFINITY;
26 |         penalty[index*lane] = INFINITY;
27 |     }
28 |     penalty[0] = 0;
29 | 
30 |     // traverse graph in row-major order
31 |     for (index_t row = 1; row < lane; row++) {
32 | 
33 |         const value_t q_value = query[row-1];
34 | 
35 |         for (index_t col = 1; col < lane; col++) {
36 | 
37 |             // determine contribution from incoming edges
38 |             const value_t diag = penalty[(row-1)*lane+col-1];
39 |             const value_t abve = penalty[(row-1)*lane+col+0];
40 |             const value_t left = penalty[(row+0)*lane+col-1];
41 | 
42 |             // compute residue between query and subject
43 |             const value_t residue = q_value-subject[col-1];
44 | 
45 |             // relax node
46 |             penalty[row*lane+col] = residue*residue +
47 |                                     min(diag,
48 |                                     min(abve, left));
49 |         }
50 |     }
51 | 
52 |     // report the lower right cell and free memory
53 |     const value_t result = penalty[lane*lane-1];
54 |     delete [] penalty;
55 | 
56 |     return result;
57 | }
58 | 
59 | template <
60 |     typename index_t,
61 |     typename value_t>
62 | value_t dtw(
63 |     value_t * query,
64 |     value_t * subject,
65 |     index_t num_features) {
66 | 
67 |     const index_t lane = num_features+1;
68 |     value_t * penalty = new value_t[2*lane];
69 | 
70 |     for (index_t index = 0; index < lane; index++)
71 |         penalty[index+1] = INFINITY;
72 |     penalty[0] = 0;
73 | 
74 |     for (index_t row = 1; row < lane; row++) {
75 | 
76 |         const value_t q_value = query[row-1];
77 |         const index_t target_row = row & 1;
78 |         const index_t source_row = !target_row;
79 | 
80 |         if (row == 2)
81 |             penalty[target_row*lane] = INFINITY;
82 | 
83 |         for (index_t col = 1; col < lane; col++) {
84 | 
85 |             const value_t diag = penalty[source_row*lane+col-1];
86 |             const value_t abve = penalty[source_row*lane+col+0];
87 |             const value_t left = penalty[target_row*lane+col-1];
88 | 
89 |             const value_t residue = q_value-subject[col-1];
90 | 
91 |             penalty[target_row*lane+col] = residue*residue +
92 |                                            min(diag, min(abve, left));
93 |         }
94 |     }
95 | 
96 |     const index_t last_row = num_features & 1;
97 |     const value_t result = penalty[last_row*lane+num_features];
98 |     delete [] penalty;
99 | 
100 |     return result;
101 | }
102 | 
103 | #include <omp.h>
104 | template <
105 |     typename index_t,
106 |     typename value_t>
107 | void host_dtw(
108 |     value_t * query,
109 |     value_t * subject,
110 |     value_t * dist,
111 |     index_t num_entries,
112 |     index_t num_features) {
113 | 
114 |     # pragma omp parallel for
115 |     for (index_t entry = 0; entry < num_entries; entry++)
116 |         dist[entry] = dtw(query, subject+entry*num_features, num_features);
117 | }
118 | 
119 | int main () {
120 | 
121 |     constexpr index_t num_features = 128;
122 |     constexpr index_t num_entries = 1UL << 20;
123 | 
124 |     // small letters for hosts, capital letters for device
125 |     value_t * data = nullptr, * dist = nullptr;
126 |     label_t * labels = nullptr;
127 | 
128 |     // malloc memory
129 |     cudaMallocHost(&data, sizeof(value_t)*num_entries*num_features);   CUERR
130 |     cudaMallocHost(&dist, sizeof(value_t)*num_entries);                CUERR
131 |     cudaMallocHost(&labels, sizeof(label_t)*num_entries);              CUERR
132 | 
133 |     // create CBF data set on host
134 |     TIMERSTART(generate_data)
135 |     generate_cbf(data, labels, num_entries, num_features);
136 |     TIMERSTOP(generate_data)
137 | 
138 | 
139 |     TIMERSTART(DTW_openmp)
140 |     host_dtw(data, data, dist, num_entries, num_features);
141 |     TIMERSTOP(DTW_openmp)
142 | 
143 | 
144 |     for (index_t index = 0; index < 10; index++)
145 |         std::cout << index_t(labels[index]) << " " << dist[index] << std::endl;
146 | 
147 | 
148 |     // get rid of the memory
149 |     cudaFreeHost(labels);
150 |     cudaFreeHost(data);
151 |     cudaFreeHost(dist);
152 | }
153 | 
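The two-row variant dtw must return exactly the same penalty as the quadratic-memory plain_dtw for every input, which makes a toy comparison a handy smoke test. The snippet below is illustrative: it assumes it is pasted into dtw_host.cu (where the two templates, the typedefs, and nvcc's host-side min are available) and called from main:

// smoke test: the full-matrix and the two-row DTW must agree
void dtw_smoke_test() {
    value_t query[4]   = {0, 1, 2, 1};
    value_t subject[4] = {0, 1, 1, 1};
    std::cout << "plain: "    << plain_dtw(query, subject, index_t(4))
              << "  two-row: " << dtw(query, subject, index_t(4))
              << std::endl;   // the two printed scores must be identical
}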
-------------------------------------------------------------------------------- /chapter7/eigenfaces/Makefile: --------------------------------------------------------------------------------
1 | NVCC=nvcc
2 | NVCCFLAGS=-O3 -std=c++11 -D_FORCE_INLINES -arch=sm_61
3 | 
4 | all: mean_computation \
5 |      mean_correction_coalesced \
6 |      mean_correction_non_coalesced \
7 |      covariance_naive \
8 |      covariance_symmetric \
9 |      covariance_shared \
10 |      eigenfaces
11 | 
12 | mean_computation: mean_computation.cu
13 | 	$(NVCC) $(NVCCFLAGS) mean_computation.cu -o mean_computation
14 | 
15 | mean_correction_coalesced: mean_correction.cu
16 | 	$(NVCC) $(NVCCFLAGS) mean_correction.cu -o mean_correction_coalesced -DCOALESCED_ACCESS
17 | 
18 | mean_correction_non_coalesced: mean_correction.cu
19 | 	$(NVCC) $(NVCCFLAGS) mean_correction.cu -o mean_correction_non_coalesced
20 | 
21 | covariance_naive: covariance.cu
22 | 	$(NVCC) $(NVCCFLAGS) covariance.cu -o covariance_naive -DCOV_MODE_NAIVE
23 | 
24 | covariance_symmetric: covariance.cu
25 | 	$(NVCC) $(NVCCFLAGS) covariance.cu -o covariance_symmetric -DCOV_MODE_SYMMETRIC
26 | 
27 | covariance_shared: covariance.cu
28 | 	$(NVCC) $(NVCCFLAGS) covariance.cu -o covariance_shared
29 | 
30 | eigenfaces: eigenfaces.cu
31 | 	$(NVCC) $(NVCCFLAGS) eigenfaces.cu -o eigenfaces -lcusolver
32 | 
33 | clean:
34 | 	rm -f mean_computation \
35 | 	      mean_correction_coalesced \
36 | 	      mean_correction_non_coalesced \
37 | 	      covariance_naive \
38 | 	      covariance_symmetric \
39 | 	      covariance_shared \
40 | 	      eigenfaces
41 | 
-------------------------------------------------------------------------------- /chapter7/eigenfaces/data/convert_images.py: --------------------------------------------------------------------------------
1 | import os
2 | import array as ar
3 | import numpy as np
4 | from scipy.misc import imread   # note: removed in SciPy >= 1.2; imageio.imread is a drop-in replacement
5 | from scipy.linalg import svd
6 | 
7 | # specify the CelebA folder
8 | dirname = "./img_align_celeba/"
9 | 
10 | files = [filename for (dirpath, dirnames, filenames) in os.walk(dirname)
11 |          for filename in filenames if filename[-4:] == ".jpg"]
12 | 
13 | if len(files) == 0:
14 |     print("ERROR: goto folder img_align_celeba and inspect the README file")
15 |     import sys
16 |     sys.exit(1)
17 | 
18 | # if you want to subsample in index space
19 | # files = files[::10]
20 | 
21 | # downsample the resolution by 4
22 | subx, suby = 4, 4
23 | dimx, dimy = (218+subx-1)//subx, (178+suby-1)//suby   # ceil(218/4) = 55, ceil(178/4) = 45
24 | 
25 | data = np.zeros((len(files), dimx*dimy), dtype=np.float32)
26 | print(dimx*dimy)
27 | 
28 | for index, filename in enumerate(files):
29 |     if index % 1000 == 0:
30 |         print(index)
31 |     data[index] = np.mean(imread(dirname+filename), axis=2)[::subx,::suby].flatten()
32 | 
33 | with open("celebA_gray_lowres.%d_%d_%d_32.bin" % (data.shape[0], dimx, dimy), "wb") as f:
34 |     f.write(ar.array("f", data.flatten()))
35 | 
-------------------------------------------------------------------------------- /chapter7/eigenfaces/data/img_align_celeba/README.md: --------------------------------------------------------------------------------
1 | ### Instructions
2 | 
3 | - navigate to the [CelebA website](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)
4 | - download the "Align&Cropped Images" zip container
5 | - unzip it in this folder
6 | - run the conversion script in the parent dir
7 | 
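The converter encodes the array shape in the file name (202599 images of 55 x 45 float32 pixels). Before handing the file to the CUDA programs below, it is cheap to verify that the size on disk matches that shape; a small stand-alone sketch (the path is the one the converter writes):

#include <cstdint>
#include <fstream>
#include <iostream>

int main() {
    const uint64_t imgs = 202599, rows = 55, cols = 45;
    const uint64_t expected = imgs*rows*cols*sizeof(float);
    std::ifstream f("celebA_gray_lowres.202599_55_45_32.bin",
                    std::ios::binary | std::ios::ate);    // open positioned at the end
    if (!f || uint64_t(f.tellg()) != expected)
        std::cerr << "file missing or truncated; expected "
                  << expected << " bytes" << std::endl;
}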
-------------------------------------------------------------------------------- /chapter7/eigenfaces/mean_computation.cu: --------------------------------------------------------------------------------
1 | #include "../include/hpc_helpers.hpp"
2 | #include "../include/binary_IO.hpp"
3 | #include "../include/bitmap_IO.hpp"
4 | 
5 | template <
6 |     typename index_t,
7 |     typename value_t> __global__
8 | void compute_mean_kernel(
9 |     value_t * Data,
10 |     value_t * Mean,
11 |     index_t num_entries,
12 |     index_t num_features);
13 | 
14 | int main (int argc, char * argv[]) {
15 | 
16 |     // set the identifier of the used CUDA device
17 |     cudaSetDevice(0);
18 | 
19 |     // 202599 grayscale images each of shape 55 x 45
20 |     constexpr uint64_t imgs = 202599, rows = 55, cols = 45;
21 | 
22 |     // pointer for data matrix and mean vector
23 |     float * data = nullptr, * mean = nullptr;
24 |     cudaMallocHost(&data, sizeof(float)*imgs*rows*cols);               CUERR
25 |     cudaMallocHost(&mean, sizeof(float)*rows*cols);                    CUERR
26 | 
27 |     // allocate storage on GPU
28 |     float * Data = nullptr, * Mean = nullptr;
29 |     cudaMalloc(&Data, sizeof(float)*imgs*rows*cols);                   CUERR
30 |     cudaMalloc(&Mean, sizeof(float)*rows*cols);                        CUERR
31 | 
32 |     // load data matrix from disk
33 |     TIMERSTART(read_data_from_disk)
34 |     std::string file_name = "./data/celebA_gray_lowres.202599_55_45_32.bin";
35 |     load_binary(data, imgs*rows*cols, file_name);
36 |     TIMERSTOP(read_data_from_disk)
37 | 
38 |     // copy data to device and reset Mean
39 |     TIMERSTART(data_H2D)
40 |     cudaMemcpy(Data, data, sizeof(float)*imgs*rows*cols,
41 |                cudaMemcpyHostToDevice);                                CUERR
42 |     cudaMemset(Mean, 0, sizeof(float)*rows*cols);                      CUERR
43 |     TIMERSTOP(data_H2D)
44 | 
45 |     // compute mean: one thread per pixel (any grid covering rows*cols works)
46 |     TIMERSTART(compute_mean_kernel)
47 |     compute_mean_kernel<<<SDIV(rows*cols, 32), 32>>>
48 |         (Data, Mean, imgs, rows*cols);                                 CUERR
49 |     TIMERSTOP(compute_mean_kernel)
50 | 
51 | 
52 |     // transfer mean back to host
53 |     TIMERSTART(mean_D2H)
54 |     cudaMemcpy(mean, Mean, sizeof(float)*rows*cols,
55 |                cudaMemcpyDeviceToHost);                                CUERR
56 |     TIMERSTOP(mean_D2H)
57 | 
58 |     // write mean image to disk
59 |     TIMERSTART(write_mean_image_to_disk)
60 |     dump_bitmap(mean, rows, cols, "./imgs/celebA_mean.bmp");
61 |     TIMERSTOP(write_mean_image_to_disk)
62 | 
63 |     // get rid of the memory
64 |     cudaFreeHost(data);                                                CUERR
65 |     cudaFreeHost(mean);                                                CUERR
66 |     cudaFree(Data);                                                    CUERR
67 |     cudaFree(Mean);                                                    CUERR
68 | 
69 | }
70 | 
71 | template <
72 |     typename index_t,
73 |     typename value_t> __global__
74 | void compute_mean_kernel(
75 |     value_t * Data,
76 |     value_t * Mean,
77 |     index_t num_entries,
78 |     index_t num_features) {
79 | 
80 |     auto thid = blockDim.x*blockIdx.x + threadIdx.x;
81 | 
82 |     if (thid < num_features) {
83 | 
84 |         value_t accum = 0;
85 | 
86 |         # pragma unroll 32
87 |         for (index_t entry = 0; entry < num_entries; entry++)
88 |             accum += Data[entry*num_features+thid];
89 | 
90 |         Mean[thid] = accum/num_entries;
91 |     }
92 | }
93 | 
94 | 
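The launch configuration of compute_mean_kernel only has to cover one thread per pixel; the grid size uses the SDIV macro (ceiling division) from hpc_helpers.hpp, and the block size of 32 is an assumption — any multiple of the warp size works, since surplus threads are masked out by the kernel's if-guard. The arithmetic, spelled out as a host-side sketch:

#include <cstdint>
#include <iostream>

#define SDIV(x,y) (((x)+(y)-1)/(y))   // ceiling division, as defined in hpc_helpers.hpp

int main() {
    const uint64_t rows = 55, cols = 45;           // 2475 pixels per image
    const uint64_t block = 32;                     // assumed block size
    const uint64_t grid  = SDIV(rows*cols, block); // 78 blocks
    std::cout << grid << " blocks x " << block << " threads = "
              << grid*block << " >= " << rows*cols << std::endl;
}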
-------------------------------------------------------------------------------- /chapter7/eigenfaces/mean_correction.cu: --------------------------------------------------------------------------------
1 | #include "../include/hpc_helpers.hpp"
2 | #include "../include/binary_IO.hpp"
3 | 
4 | template <
5 |     typename index_t,
6 |     typename value_t> __global__
7 | void compute_mean_kernel(
8 |     value_t * Data,
9 |     value_t * Mean,
10 |     index_t num_entries,
11 |     index_t num_features) {
12 | 
13 |     auto thid = blockDim.x*blockIdx.x + threadIdx.x;
14 | 
15 |     if (thid < num_features) {
16 | 
17 |         value_t accum = 0;
18 | 
19 |         # pragma unroll 32
20 |         for (index_t entry = 0; entry < num_entries; entry++)
21 |             accum += Data[entry*num_features+thid];
22 | 
23 |         Mean[thid] = accum/num_entries;
24 |     }
25 | }
26 | 
27 | template <
28 |     typename index_t,
29 |     typename value_t> __global__
30 | void correction_kernel(
31 |     value_t * Data,
32 |     value_t * Mean,
33 |     index_t num_entries,
34 |     index_t num_features) {
35 | 
36 |     auto thid = blockDim.x*blockIdx.x + threadIdx.x;
37 | 
38 |     if (thid < num_features) {
39 | 
40 |         value_t value = Mean[thid];
41 | 
42 |         for (index_t entry = 0; entry < num_entries; entry++)
43 |             Data[entry*num_features+thid] -= value; // consecutive threads hit consecutive addresses: coalesced
44 | 
45 |     }
46 | }
47 | 
48 | template <
49 |     typename index_t,
50 |     typename value_t> __global__
51 | void correction_kernel_ortho(
52 |     value_t * Data,
53 |     value_t * Mean,
54 |     index_t num_entries,
55 |     index_t num_features) {
56 | 
57 |     auto thid = blockDim.x*blockIdx.x + threadIdx.x;
58 | 
59 |     if (thid < num_entries) {
60 | 
61 |         for (index_t feature = 0; feature < num_features; feature++)
62 |             Data[thid*num_features+feature] -= Mean[feature]; // one thread strides over a whole row: non-coalesced
63 |     }
64 | }
65 | 
66 | int main (int argc, char * argv[]) {
67 | 
68 |     // set the identifier of the used CUDA device
69 |     cudaSetDevice(0);
70 | 
71 |     // 202599 grayscale images each of shape 55 x 45
72 |     constexpr uint64_t imgs = 202599, rows = 55, cols = 45;
73 | 
74 |     // pointer for data matrix and mean vector
75 |     float * data = nullptr;
76 |     cudaMallocHost(&data, sizeof(float)*imgs*rows*cols);               CUERR
77 | 
78 |     // allocate storage on GPU
79 |     float * Data = nullptr, * Mean = nullptr;
80 |     cudaMalloc(&Data, sizeof(float)*imgs*rows*cols);                   CUERR
81 |     cudaMalloc(&Mean, sizeof(float)*rows*cols);                        CUERR
82 | 
83 |     // load data matrix from disk
84 |     TIMERSTART(read_data_from_disk)
85 |     auto file_name = "./data/celebA_gray_lowres.202599_55_45_32.bin";
86 |     load_binary(data, imgs*rows*cols, file_name);
87 |     TIMERSTOP(read_data_from_disk)
88 | 
89 |     // copy data to device and reset Mean
90 |     TIMERSTART(data_H2D)
91 |     cudaMemcpy(Data, data, sizeof(float)*imgs*rows*cols,
92 |                cudaMemcpyHostToDevice);                                CUERR
93 |     cudaMemset(Mean, 0, sizeof(float)*rows*cols);                      CUERR
94 |     TIMERSTOP(data_H2D)
95 | 
96 |     // compute mean: one thread per pixel (any grid covering rows*cols works)
97 |     TIMERSTART(compute_mean_kernel)
98 |     compute_mean_kernel<<<SDIV(rows*cols, 32), 32>>>
99 |         (Data, Mean, imgs, rows*cols);                                 CUERR
100 |     TIMERSTOP(compute_mean_kernel)
101 | 
102 |     // correct mean
103 |     TIMERSTART(correction_kernel)
104 |     #ifdef COALESCED_ACCESS
105 |     correction_kernel<<<SDIV(rows*cols, 32), 32>>>
106 |         (Data, Mean, imgs, rows*cols);                                 CUERR
107 |     #else
108 |     correction_kernel_ortho<<<SDIV(imgs, 32), 32>>>
109 |         (Data, Mean, imgs, rows*cols);                                 CUERR
110 |     #endif
111 |     TIMERSTOP(correction_kernel)
112 | 
113 |     // get rid of the memory
114 |     cudaFreeHost(data);                                                CUERR
115 |     cudaFree(Data);                                                    CUERR
116 |     cudaFree(Mean);                                                    CUERR
117 | 
118 | }
119 | 
-------------------------------------------------------------------------------- /chapter7/hello_world/Makefile: --------------------------------------------------------------------------------
1 | NVCC=nvcc
2 | 
3 | all: hello_world
4 | 
5 | hello_world: hello_world.cu
6 | 	$(NVCC) -O2 -std=c++11 hello_world.cu -o hello_world
7 | 
8 | clean:
9 | 	rm -f hello_world
10 | 
-------------------------------------------------------------------------------- /chapter7/hello_world/hello_world.cu: --------------------------------------------------------------------------------
1 | #include <cstdio>       // printf
2 | 
3 | __global__ void hello_kernel() {
4 | 
5 |     // calculate global thread identifier, note blockIdx.x=0 here
6 |     const auto thid = blockDim.x*blockIdx.x + threadIdx.x;
7 | 
8 |     // print a greeting message
9 |     printf("Hello from thread %d!\n", thid);
10 | }
11 | 
12 | // compile with: nvcc hello_world.cu -std=c++11 -O3
13 | // output:
14 | // Hello from thread 0!
15 | // Hello from thread 1!
16 | // Hello from thread 2!
17 | // Hello from thread 3!
18 | 19 | int main (int argc, char * argv[]) { 20 | 21 | // set the ID of the CUDA device 22 | cudaSetDevice(0); 23 | 24 | // invoke kernel using 4 threads executed in 1 thread block 25 | hello_kernel<<<1, 4>>>(); 26 | 27 | // synchronize the GPU preventing premature termination 28 | cudaDeviceSynchronize(); 29 | } 30 | -------------------------------------------------------------------------------- /chapter7/include: -------------------------------------------------------------------------------- 1 | ../include/ -------------------------------------------------------------------------------- /chapter8/include: -------------------------------------------------------------------------------- 1 | ../include/ -------------------------------------------------------------------------------- /chapter8/intrinsics_and_atomics/Makefile: -------------------------------------------------------------------------------- 1 | NVCC= nvcc 2 | NVCCFLAGS= -O2 -std=c++14 -arch=sm_61 3 | 4 | all: znorm atomics cas 5 | 6 | znorm: znorm.cu 7 | $(NVCC) $(NVCCFLAGS) znorm.cu -o znorm 8 | 9 | atomics: atomics.cu 10 | $(NVCC) $(NVCCFLAGS) atomics.cu -o atomics 11 | 12 | cas: cas.cu 13 | $(NVCC) $(NVCCFLAGS) cas.cu -o cas 14 | 15 | clean: 16 | rm -f znorm 17 | rm -f atomics 18 | rm -f cas 19 | -------------------------------------------------------------------------------- /chapter8/intrinsics_and_atomics/atomics.cu: -------------------------------------------------------------------------------- 1 | #include "../include/cbf_generator.hpp" 2 | #include "../include/hpc_helpers.hpp" 3 | 4 | typedef uint64_t index_t; 5 | typedef uint8_t label_t; 6 | typedef float value_t; 7 | 8 | template < 9 | typename index_t, 10 | typename value_t, 11 | index_t warp_size=32> __global__ 12 | void global_reduction_kernel( 13 | value_t * Input, // pointer to the data 14 | value_t * Output, // pointer to the result 15 | index_t length) { // number of entries (n) 16 | 17 | // get thread and block identifiers 18 | const index_t thid = threadIdx.x; 19 | const index_t blid = blockIdx.x; 20 | const index_t base = blid*warp_size; 21 | 22 | // store entries in registers 23 | value_t x = 0; 24 | if (base+thid < length) 25 | x = Input[base+thid]; 26 | 27 | // do the Kepler shuffle 28 | for (index_t offset = warp_size/2; offset > 0; offset /= 2) 29 | x += __shfl_down(x, offset, warp_size); 30 | 31 | // write down result 32 | if (thid == 0) 33 | atomicAdd(Output, x); 34 | } 35 | 36 | template < 37 | typename index_t, 38 | typename value_t, 39 | index_t warp_size=32> __global__ 40 | void static_reduction_kernel( 41 | value_t * Input, // pointer to the data 42 | value_t * Output, // pointer to the result 43 | index_t length) { // number of entries (n) 44 | 45 | // get global thread identifier 46 | const index_t thid = blockDim.x*blockIdx.x+threadIdx.x; 47 | 48 | // here we store the result 49 | value_t accum = value_t(0); 50 | 51 | // block-cyclic summation over all spawned blocks 52 | for (index_t i = thid; i < length; i += blockDim.x*gridDim.x) 53 | accum += Input[i]; 54 | 55 | // reduce all values within a warp 56 | for (index_t offset = warp_size/2; offset > 0; offset /= 2) 57 | accum += __shfl_down(accum, offset, warp_size); 58 | 59 | // first thread of every warp writes result 60 | if (thid % 32 == 0) 61 | atomicAdd(Output, accum); 62 | } 63 | 64 | 65 | int main () { 66 | 67 | constexpr index_t num_features = 32; 68 | constexpr index_t num_entries = 1UL << 10; 69 | 70 | // small letters for hosts, capital letters for device 71 | value_t * data 
= nullptr, * result = nullptr,
72 |             * Data = nullptr, * Result = nullptr;
73 |     label_t * labels = nullptr;
74 | 
75 |     // malloc memory
76 |     cudaMallocHost(&data, sizeof(value_t)*num_entries*num_features);   CUERR
77 |     cudaMalloc   (&Data, sizeof(value_t)*num_entries*num_features);    CUERR
78 |     cudaMallocHost(&result, sizeof(value_t));                          CUERR
79 |     cudaMalloc   (&Result, sizeof(value_t));                           CUERR
80 |     cudaMallocHost(&labels, sizeof(label_t)*num_entries);              CUERR
81 | 
82 |     // create CBF data set on host
83 |     TIMERSTART(generate_data)
84 |     generate_cbf(data, labels, num_entries, num_features);
85 |     TIMERSTOP(generate_data)
86 | 
87 |     TIMERSTART(copy_data_to_device)
88 |     cudaMemcpy(Data, data, sizeof(value_t)*num_entries*num_features, H2D); CUERR
89 |     cudaMemset(Result, 0, sizeof(value_t));
90 |     TIMERSTOP(copy_data_to_device)
91 | 
92 |     value_t accum = 0;
93 |     for (index_t i = 0; i < num_entries*num_features; i++)
94 |         accum += data[i];
95 |     std::cout << accum << std::endl;
96 | 
97 |     TIMERSTART(global_reduction)
98 |     global_reduction_kernel<<<SDIV(num_entries*num_features, 32), 32>>>
99 |         (Data, Result, num_entries*num_features);                      CUERR
100 |     TIMERSTOP(global_reduction)
101 | 
102 |     TIMERSTART(static_reduction)
103 |     static_reduction_kernel<<<32, 32>>>(Data, Result, num_entries*num_features); CUERR // note: Result still holds the first sum, so both kernels accumulate into it
104 |     TIMERSTOP(static_reduction)
105 | 
106 |     TIMERSTART(copy_data_to_host)
107 |     cudaMemcpy(result, Result, sizeof(value_t), D2H);                  CUERR
108 |     TIMERSTOP(copy_data_to_host)
109 | 
110 | 
111 |     std::cout << *result << std::endl;
112 | 
113 |     // get rid of the memory
114 |     cudaFreeHost(labels);
115 |     cudaFreeHost(result);
116 |     cudaFreeHost(data);
117 |     cudaFree(Result);
118 |     cudaFree(Data);
119 | 
120 | }
121 | 
-------------------------------------------------------------------------------- /chapter8/intrinsics_and_atomics/cas.cu: --------------------------------------------------------------------------------
1 | #include "../include/hpc_helpers.hpp"
2 | 
3 | __device__ __forceinline__
4 | int atomicUpdateResultBoundedByTwo(
5 |     int* address,
6 |     int value) {
7 | 
8 |     // get the source value stored at address
9 |     int source = *address, expected;
10 | 
11 |     do {
12 |         // we expect source
13 |         expected = source;
14 | 
15 |         // compute our custom binary operation
16 |         int target = expected+value+expected*value;
17 | 
18 |         // check the constraint: reject results outside [0, 10)
19 |         if (target < 0 || target >= 10)
20 |             return source;
21 | 
22 |         // try to swap the values
23 |         source = atomicCAS(address, expected, target);
24 | 
25 |         // (expected == source) on success
26 |     } while (expected != source);
27 | 
28 |     return source;
29 | }
30 | 
31 | __global__
32 | void apply_kernel(int * source_address, int value) {
33 |     if (blockIdx.x == 0 && threadIdx.x == 0)
34 |         atomicUpdateResultBoundedByTwo(source_address, value);
35 | 
36 | }
37 | 
38 | 
39 | int main () {
40 |     int * data = nullptr;
41 |     cudaMallocHost(&data, sizeof(int));                                CUERR
42 | 
43 |     *data = 0;
44 |     apply_kernel<<<1, 1>>> (data, 10);
45 | 
46 |     cudaDeviceSynchronize();
47 | 
48 |     std::cout << * data << std::endl;
49 | 
50 |     cudaFreeHost(data);   // allocated with cudaMallocHost, so free the pinned allocation accordingly
51 | }
52 | 
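atomicCAS returns the value it actually found at the address, so the loop above retries until no other thread has modified the slot between the read and the swap — the standard recipe for building arbitrary atomic updates. The same pattern on the host with std::atomic, as a sketch (not from the repository; the [0, 10) bound mirrors the device code):

#include <atomic>
#include <iostream>

int bounded_update(std::atomic<int>& slot, int value) {
    int expected = slot.load();
    int desired;
    do {
        desired = expected + value + expected*value;  // the custom binary operation
        if (desired < 0 || desired >= 10)             // constraint: stay inside [0, 10)
            return expected;
        // on failure, compare_exchange_strong reloads 'expected' with the current value
    } while (!slot.compare_exchange_strong(expected, desired));
    return expected;
}

int main() {
    std::atomic<int> slot(0);
    bounded_update(slot, 3);               // 0+3+0*3 = 3  -> accepted
    bounded_update(slot, 3);               // 3+3+3*3 = 15 -> rejected, slot keeps 3
    std::cout << slot.load() << std::endl; // prints 3
}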
-------------------------------------------------------------------------------- /chapter8/intrinsics_and_atomics/znorm.cu: --------------------------------------------------------------------------------
1 | #include "../include/cbf_generator.hpp"
2 | #include "../include/hpc_helpers.hpp"
3 | 
4 | typedef uint64_t index_t;
5 | typedef uint8_t label_t;
6 | typedef float value_t;
7 | 
8 | __forceinline__ __device__
9 | double cuda_rsqrt(const double& value) {
10 |     return rsqrt(value);
11 | }
12 | 
13 | __forceinline__ __device__
14 | float cuda_rsqrt(const float& value) {
15 |     return rsqrtf(value);
16 | }
17 | 
18 | template <
19 |     typename index_t,
20 |     typename value_t> __global__
21 | void znorm_kernel(
22 |     value_t * Subject,      // pointer to the subject
23 |     index_t num_entries,    // number of time series (m)
24 |     index_t num_features) { // number of time ticks (n)
25 | 
26 |     // get thread and block identifiers
27 |     const index_t blid = blockIdx.x;
28 |     const index_t thid = threadIdx.x;
29 |     const index_t base = blid*num_features;
30 | 
31 |     // 1. coalesced loading of entries
32 |     value_t v = Subject[base+thid];
33 |     value_t x = v; // copy for later
34 | 
35 |     // 2a. perform a warp reduction (sum stored in thread zero)
36 |     for (index_t offset = num_features/2; offset > 0; offset /= 2)
37 |         x += __shfl_down(x, offset, num_features); // pre-Volta intrinsic; __shfl_down_sync on newer GPUs
38 | 
39 |     // 2b. perform the first broadcast
40 |     value_t mu = __shfl(x, 0)/num_features;
41 | 
42 |     // define the square residues
43 |     value_t y = (v-mu)*(v-mu);
44 | 
45 |     // 3a. perform a warp reduction (sum stored in thread zero)
46 |     for (index_t offset = num_features/2; offset > 0; offset /= 2)
47 |         y += __shfl_down(y, offset, num_features);
48 | 
49 |     // 3b. perform the second broadcast
50 |     value_t sigma = __shfl(y, 0)/(num_features-1);
51 | 
52 |     // 4. write result back
53 |     Subject[base+thid] = (v-mu)*cuda_rsqrt(sigma);
54 | }
55 | 
56 | int main () {
57 | 
58 |     constexpr index_t num_features = 32;
59 |     constexpr index_t num_entries = 1UL << 20;
60 | 
61 |     // small letters for hosts, capital letters for device
62 |     value_t * data = nullptr, * Data = nullptr;
63 |     label_t * labels = nullptr;
64 | 
65 |     // malloc memory
66 |     cudaMallocHost(&data, sizeof(value_t)*num_entries*num_features);   CUERR
67 |     cudaMalloc   (&Data, sizeof(value_t)*num_entries*num_features);    CUERR
68 |     cudaMallocHost(&labels, sizeof(label_t)*num_entries);              CUERR
69 | 
70 |     // create CBF data set on host
71 |     TIMERSTART(generate_data)
72 |     generate_cbf(data, labels, num_entries, num_features);
73 |     TIMERSTOP(generate_data)
74 | 
75 |     TIMERSTART(copy_data_to_device)
76 |     cudaMemcpy(Data, data, sizeof(value_t)*num_entries*num_features, H2D); CUERR
77 |     TIMERSTOP(copy_data_to_device)
78 | 
79 | 
80 |     TIMERSTART(z_norm)
81 |     znorm_kernel<<<num_entries, num_features>>>(Data, num_entries, num_features); CUERR // one block per series; the kernel requires blockDim.x == num_features
82 |     TIMERSTOP(z_norm)
83 | 
84 |     TIMERSTART(copy_data_to_host)
85 |     cudaMemcpy(data, Data, sizeof(value_t)*num_entries*num_features, D2H); CUERR
86 |     TIMERSTOP(copy_data_to_host)
87 | 
88 | 
89 |     value_t accum = 0, accum2=0;
90 |     for (index_t i = 0; i < 32; i++) {
91 |         accum += data[i];
92 |         accum2 += data[i]*data[i];
93 |     }
94 | 
95 |     std::cout << accum << " " << accum2 << std::endl;
96 | 
97 |     // get rid of the memory
98 |     cudaFreeHost(labels);
99 |     cudaFreeHost(data);
100 |     cudaFree(Data);
101 | }
102 | 
-------------------------------------------------------------------------------- /chapter8/multi_gpu/Makefile: --------------------------------------------------------------------------------
1 | NVCC= nvcc
2 | NVCCFLAGS= -O2 -std=c++14 -arch=sm_61
3 | 
4 | all: single_gpu multi_gpu streamed_gpu multi_streamed_gpu
5 | 
6 | single_gpu: single_gpu.cu
7 | 	$(NVCC) $(NVCCFLAGS) single_gpu.cu -o single_gpu
8 | 
9 | multi_gpu: multi_gpu.cu
10 | 	$(NVCC) $(NVCCFLAGS) multi_gpu.cu -o multi_gpu
11 | 
12 | streamed_gpu: streamed_gpu.cu
13 | 	$(NVCC) $(NVCCFLAGS) streamed_gpu.cu -o streamed_gpu
14 | 
15 | multi_streamed_gpu: multi_streamed_gpu.cu
16 | 	$(NVCC) $(NVCCFLAGS) multi_streamed_gpu.cu -o multi_streamed_gpu
17 | 
18 | clean:
19 | 	rm -f
single_gpu 20 | rm -f multi_gpu 21 | rm -f streamed_gpu 22 | rm -f multi_streamed_gpu 23 | -------------------------------------------------------------------------------- /chapter8/multi_gpu/multi_gpu.cu: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | template < 4 | typename index_t, 5 | typename value_t, 6 | index_t num_iters=256> __global__ 7 | void square_root_kernel( 8 | value_t * Data, 9 | index_t length) { 10 | 11 | const index_t thid = blockDim.x*blockIdx.x+threadIdx.x; 12 | 13 | for (index_t i = thid; i < length; i += blockDim.x*gridDim.x){ 14 | 15 | value_t value = Data[i]; 16 | value_t root = value; 17 | 18 | # pragma unroll (32) 19 | for (index_t iters = 0; iters < num_iters && value; iters++) 20 | root = 0.5*(root+value/root); 21 | 22 | Data[i] = root; 23 | } 24 | } 25 | 26 | int main () { 27 | 28 | typedef float value_t; 29 | typedef uint64_t index_t; 30 | 31 | const index_t length = 1UL << 30; 32 | 33 | int num_gpus; 34 | cudaGetDeviceCount(&num_gpus); 35 | const index_t batch_size = length/num_gpus; 36 | 37 | value_t * data = nullptr, * Data[num_gpus]; 38 | 39 | cudaMallocHost(&data, sizeof(value_t)*length); CUERR 40 | 41 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 42 | cudaSetDevice(gpu); 43 | cudaMalloc(&Data[gpu], sizeof(value_t)*batch_size); CUERR 44 | } 45 | 46 | for (index_t index = 0; index < length; index++) 47 | data[index] = index; 48 | 49 | TIMERSTART(overall) 50 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 51 | const index_t offset = gpu*batch_size; 52 | cudaSetDevice(gpu); CUERR 53 | cudaMemcpy(Data[gpu], data+offset, sizeof(value_t)*batch_size, 54 | cudaMemcpyHostToDevice); CUERR 55 | } 56 | 57 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 58 | cudaSetDevice(gpu); CUERR 59 | square_root_kernel<<<1024, 1024>>>(Data[gpu], batch_size); CUERR 60 | } 61 | 62 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 63 | const index_t offset = gpu*batch_size; 64 | cudaSetDevice(gpu); CUERR 65 | cudaMemcpy(data+offset, Data[gpu], sizeof(value_t)*batch_size, 66 | cudaMemcpyDeviceToHost); CUERR 67 | } 68 | TIMERSTOP(overall) 69 | 70 | for (index_t index = 0; index < length; index += batch_size/10) 71 | std::cout << index << " " << data[index] << std::endl; 72 | 73 | cudaFreeHost(data); CUERR 74 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 75 | cudaSetDevice(gpu); 76 | cudaFree(Data[gpu]); CUERR 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /chapter8/multi_gpu/multi_streamed_gpu.cu: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | template < 4 | typename index_t, 5 | typename value_t, 6 | index_t num_iters=256> __global__ 7 | void square_root_kernel( 8 | value_t * Data, 9 | index_t length) { 10 | 11 | const index_t thid = blockDim.x*blockIdx.x+threadIdx.x; 12 | 13 | for (index_t i = thid; i < length; i += blockDim.x*gridDim.x){ 14 | 15 | value_t value = Data[i]; 16 | value_t root = value; 17 | 18 | # pragma unroll (32) 19 | for (index_t iters = 0; iters < num_iters && value; iters++) 20 | root = 0.5*(root+value/root); 21 | 22 | Data[i] = root; 23 | } 24 | } 25 | 26 | int main () { 27 | 28 | typedef float value_t; 29 | typedef uint64_t index_t; 30 | 31 | const index_t length = 1UL << 30; 32 | const index_t num_streams = 32; 33 | 34 | int num_gpus; 35 | cudaGetDeviceCount(&num_gpus); 36 | const index_t batch_size = length/(num_gpus*num_streams); 37 
| 38 | value_t * data = nullptr, * Data[num_gpus]; 39 | cudaStream_t streams[num_gpus][num_streams]; 40 | 41 | cudaMallocHost(&data, sizeof(value_t)*length); CUERR 42 | 43 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 44 | cudaSetDevice(gpu); 45 | cudaMalloc(&Data[gpu], 46 | sizeof(value_t)*batch_size*num_streams); CUERR 47 | 48 | for (index_t streamID = 0; streamID < num_streams; streamID++) 49 | cudaStreamCreate(&streams[gpu][streamID]); CUERR 50 | } 51 | 52 | for (index_t index = 0; index < length; index++) 53 | data[index] = index; 54 | 55 | TIMERSTART(overall) 56 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 57 | const index_t offset = gpu*num_streams*batch_size; 58 | cudaSetDevice(gpu); CUERR 59 | 60 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 61 | const index_t loc_off = streamID*batch_size; 62 | const index_t glb_off = loc_off+offset; 63 | cudaMemcpyAsync(Data[gpu]+loc_off, data+glb_off, 64 | sizeof(value_t)*batch_size, 65 | cudaMemcpyHostToDevice, 66 | streams[gpu][streamID]); CUERR 67 | } 68 | } 69 | 70 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 71 | cudaSetDevice(gpu); CUERR 72 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 73 | const index_t offset = streamID*batch_size; 74 | square_root_kernel<<<1024, 1024, 0, streams[gpu][streamID]>>> 75 | (Data[gpu]+offset, batch_size); CUERR 76 | } 77 | } 78 | 79 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 80 | const index_t offset = gpu*num_streams*batch_size; 81 | cudaSetDevice(gpu); CUERR 82 | 83 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 84 | const index_t loc_off = streamID*batch_size; 85 | const index_t glb_off = loc_off+offset; 86 | cudaMemcpyAsync(data+glb_off, Data[gpu]+loc_off, 87 | sizeof(value_t)*batch_size, 88 | cudaMemcpyDeviceToHost, 89 | streams[gpu][streamID]); CUERR 90 | } 91 | } 92 | 93 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 94 | cudaSetDevice(gpu); 95 | cudaDeviceSynchronize(); 96 | } 97 | TIMERSTOP(overall) 98 | 99 | 100 | for (index_t index = 0; index < length; index += batch_size/10) 101 | std::cout << index << " " << data[index] << std::endl; 102 | 103 | cudaFreeHost(data); CUERR 104 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 105 | cudaSetDevice(gpu); 106 | cudaFree(Data[gpu]); CUERR 107 | 108 | for (index_t streamID = 0; streamID < num_streams; streamID++) 109 | cudaStreamDestroy(streams[gpu][streamID]); CUERR 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /chapter8/multi_gpu/single_gpu.cu: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | template < 4 | typename index_t, 5 | typename value_t, 6 | index_t num_iters=256> __global__ 7 | void square_root_kernel( 8 | value_t * Data, 9 | index_t length) { 10 | 11 | const index_t thid = blockDim.x*blockIdx.x+threadIdx.x; 12 | 13 | for (index_t i = thid; i < length; i += blockDim.x*gridDim.x){ 14 | 15 | value_t value = Data[i]; 16 | value_t root = value; 17 | 18 | # pragma unroll (32) 19 | for (index_t iters = 0; iters < num_iters && value; iters++) 20 | root = 0.5*(root+value/root); 21 | 22 | Data[i] = root; 23 | } 24 | } 25 | 26 | int main () { 27 | 28 | typedef float value_t; 29 | typedef uint64_t index_t; 30 | 31 | const index_t length = 1UL << 30; 32 | 33 | value_t * data = nullptr, * Data = nullptr; 34 | 35 | cudaMallocHost(&data, sizeof(value_t)*length); CUERR 36 | cudaMalloc (&Data, sizeof(value_t)*length); CUERR 37 | 38 | for 
(index_t index = 0; index < length; index++) 39 | data[index] = index; 40 | 41 | TIMERSTART(overall) 42 | TIMERSTART(host_to_device) 43 | cudaMemcpy(Data, data, sizeof(value_t)*length, 44 | cudaMemcpyHostToDevice); CUERR 45 | TIMERSTOP(host_to_device) 46 | 47 | TIMERSTART(square_root_kernel) 48 | square_root_kernel<<<1024, 1024>>>(Data, length); CUERR 49 | TIMERSTOP(square_root_kernel) 50 | 51 | TIMERSTART(device_to_host) 52 | cudaMemcpy(data, Data, sizeof(value_t)*length, 53 | cudaMemcpyDeviceToHost); CUERR 54 | TIMERSTOP(device_to_host) 55 | TIMERSTOP(overall) 56 | 57 | for (index_t index = 0; index < 10; index++) 58 | std::cout << index << " " << data[index] << std::endl; 59 | 60 | cudaFreeHost(data); CUERR 61 | cudaFree(Data); CUERR 62 | } 63 | -------------------------------------------------------------------------------- /chapter8/multi_gpu/streamed_gpu.cu: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | template < 4 | typename index_t, 5 | typename value_t, 6 | index_t num_iters=256> __global__ 7 | void square_root_kernel( 8 | value_t * Data, 9 | index_t length) { 10 | 11 | const index_t thid = blockDim.x*blockIdx.x+threadIdx.x; 12 | 13 | for (index_t i = thid; i < length; i += blockDim.x*gridDim.x){ 14 | 15 | value_t value = Data[i]; 16 | value_t root = value; 17 | 18 | # pragma unroll (32) 19 | for (index_t iters = 0; iters < num_iters && value; iters++) 20 | root = 0.5*(root+value/root); 21 | 22 | Data[i] = root; 23 | } 24 | } 25 | 26 | int main () { 27 | 28 | typedef float value_t; 29 | typedef uint64_t index_t; 30 | 31 | const index_t length = 1UL << 30; 32 | const index_t num_streams = 32; 33 | const index_t batch_size = length/num_streams; 34 | 35 | cudaStream_t streams[num_streams]; 36 | for (index_t streamID = 0; streamID < num_streams; streamID++) 37 | cudaStreamCreate(streams+streamID); CUERR 38 | 39 | value_t * data = nullptr, * Data = nullptr; 40 | 41 | cudaMallocHost(&data, sizeof(value_t)*length); CUERR 42 | cudaMalloc (&Data, sizeof(value_t)*length); CUERR 43 | 44 | for (index_t index = 0; index < length; index++) 45 | data[index] = index; 46 | 47 | TIMERSTART(overall) 48 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 49 | const index_t offset = streamID*batch_size; 50 | cudaMemcpyAsync(Data+offset, data+offset, 51 | sizeof(value_t)*batch_size, 52 | cudaMemcpyHostToDevice, streams[streamID]); CUERR 53 | } 54 | 55 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 56 | const index_t offset = streamID*batch_size; 57 | square_root_kernel<<<1024, 1024, 0, streams[streamID]>>> 58 | (Data+offset, batch_size); CUERR 59 | } 60 | 61 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 62 | const index_t offset = streamID*batch_size; 63 | cudaMemcpyAsync(data+offset, Data+offset, 64 | sizeof(value_t)*batch_size, 65 | cudaMemcpyDeviceToHost, streams[streamID]); CUERR 66 | } 67 | 68 | cudaDeviceSynchronize(); 69 | TIMERSTOP(overall) 70 | 71 | 72 | 73 | for (index_t index = 0; index < 10; index++) 74 | std::cout << index << " " << data[index] << std::endl; 75 | 76 | for (index_t streamID = 0; streamID < num_streams; streamID++) 77 | cudaStreamDestroy(streams[streamID]); CUERR 78 | 79 | cudaFreeHost(data); CUERR 80 | cudaFree(Data); CUERR 81 | } 82 | -------------------------------------------------------------------------------- /chapter8/uvm/Makefile: -------------------------------------------------------------------------------- 1 | NVCC= nvcc 2 
| NVCCFLAGS= -O2 -std=c++11 -arch=sm_61
3 | 
4 | all: uvm_minimal_example
5 | 
6 | uvm_minimal_example: uvm_minimal_example.cu
7 | 	$(NVCC) $(NVCCFLAGS) uvm_minimal_example.cu -o uvm_minimal_example
8 | 
9 | clean:
10 | 	rm -f uvm_minimal_example
11 | 
-------------------------------------------------------------------------------- /chapter8/uvm/uvm_minimal_example.cu: --------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cstdint>
3 | 
4 | __global__ void iota_kernel(float * input, uint64_t size) {
5 | 
6 |     uint64_t thid = blockIdx.x*blockDim.x+threadIdx.x;
7 |     for (uint64_t i = thid; i < size; i += gridDim.x*blockDim.x)
8 |         input[i] = i;
9 | }
10 | 
11 | int main () {
12 | 
13 |     uint64_t size = 1UL << 20;
14 |     float * input = nullptr;
15 |     cudaMallocManaged(&input, sizeof(float)*size);   // unified memory: one pointer valid on host and device
16 |     iota_kernel<<<1024, 1024>>>(input, size);
17 | 
18 |     cudaDeviceSynchronize();   // wait for the kernel so the host reads consistent data
19 | 
20 |     for (uint64_t i = 0; i < 20; i++)
21 |         std::cout << input[i] << std::endl;
22 | 
23 |     cudaFree(input);           // managed allocations are released with cudaFree
24 | }
25 | 
-------------------------------------------------------------------------------- /chapter9/hello_world/Makefile: --------------------------------------------------------------------------------
1 | MPICXX= mpic++
2 | MPICXXFLAGS= -O2 -std=c++11
3 | 
4 | all: hello_world
5 | 
6 | hello_world: hello_world.cpp
7 | 	$(MPICXX) $(MPICXXFLAGS) hello_world.cpp -o hello_world
8 | 
9 | clean:
10 | 	rm -rf hello_world
11 | 
-------------------------------------------------------------------------------- /chapter9/hello_world/hello_world.cpp: --------------------------------------------------------------------------------
1 | #include "mpi.h"
2 | 
3 | int main (int argc, char *argv[]){
4 |     // Initialize MPI
5 |     MPI::Init(argc,argv);
6 | 
7 |     // Get the number of processes
8 |     int numP=MPI::COMM_WORLD.Get_size();
9 | 
10 |     // Get the ID of the process
11 |     int myId=MPI::COMM_WORLD.Get_rank();
12 | 
13 |     // Every process prints Hello
14 |     std::cout << "Process " << myId << " of " << numP << ": Hello, world!"
<< std::endl; 15 | 16 | // Terminate MPI 17 | MPI::Finalize(); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /chapter9/jacobi_iteration/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | CXXFLAGS = -O2 -std=c++11 3 | MPICXX= mpic++ 4 | MPICXXFLAGS= $(CXXFLAGS) 5 | 6 | all: jacobi_seq jacobi_1D_block_simple jacobi_1D_block jacobi_1D_nonblock 7 | 8 | jacobi_seq: jacobi_seq.cpp 9 | $(CXX) $(CXXFLAGS) jacobi_seq.cpp -o jacobi_seq 10 | 11 | jacobi_1D_block_simple: jacobi_1D_block_simple.cpp 12 | $(MPICXX) $(MPICXXFLAGS) jacobi_1D_block_simple.cpp -o jacobi_1D_block_simple 13 | 14 | jacobi_1D_block: jacobi_1D_block.cpp 15 | $(MPICXX) $(MPICXXFLAGS) jacobi_1D_block.cpp -o jacobi_1D_block 16 | 17 | jacobi_1D_nonblock: jacobi_1D_nonblock.cpp 18 | $(MPICXX) $(MPICXXFLAGS) jacobi_1D_nonblock.cpp -o jacobi_1D_nonblock 19 | 20 | clean: 21 | rm -rf jacobi_seq 22 | rm -rf jacobi_1D_block_simple 23 | rm -rf jacobi_1D_block 24 | rm -rf jacobi_1D_nonblock 25 | -------------------------------------------------------------------------------- /chapter9/jacobi_iteration/jacobi_1D_block_simple.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "mpi.h" 7 | 8 | void readInput(std::string file, int rows, int cols, float *data){ 9 | 10 | // Open the file pointer 11 | /*FILE* fp = fopen(file.c_str(), "rb"); 12 | 13 | // Check if the file exists 14 | if(fp == NULL){ 15 | std::cout << "ERROR: File " << file << " could not be opened" << std::endl; 16 | MPI::COMM_WORLD.Abort(1); 17 | } 18 | 19 | for(int i=0; i errThres){ 124 | if(myId > 0){ 125 | // Send the first row to the previous process 126 | MPI::COMM_WORLD.Send(myData, cols, MPI::FLOAT, myId-1, 0); 127 | } 128 | 129 | if(myId < numP-1){ 130 | // Receive the next row from the next process 131 | MPI::COMM_WORLD.Recv(nextRow, cols, MPI::FLOAT, myId+1, 0); 132 | 133 | // Send the last row to the next process 134 | MPI::COMM_WORLD.Send(&myData[(myRows-1)*cols], cols, MPI::FLOAT, myId+1, 0); 135 | } 136 | 137 | if(myId > 0){ 138 | // Receive the previous row from the previous process 139 | MPI::COMM_WORLD.Recv(prevRow, cols, MPI::FLOAT, myId-1, 0); 140 | } 141 | 142 | // Update the first row 143 | if((myId > 0) && (myRows>1)){ 144 | for(int j=1; j 1)){ 159 | for(int j=1; j 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | void readInput(std::string file, int rows, int cols, float *data){ 8 | 9 | // Open the file pointer 10 | /*FILE* fp = fopen(file.c_str(), "rb"); 11 | 12 | // Check if the file exists 13 | if(fp == NULL){ 14 | std::cout << "ERROR: File " << file << " could not be opened" << std::endl; 15 | exit(1); 16 | } 17 | 18 | for(int i=0; i start, end; 77 | start = std::chrono::system_clock::now(); 78 | 79 | float error = errThres + 1.0; 80 | 81 | while(error > errThres){ 82 | for(int i=1; i elapsed_seconds = end-start; 103 | 104 | std::cout << "Sequential Jacobi with dimensions " << rows << "x" << cols << " in " << elapsed_seconds.count() 105 | << " seconds" << std::endl; 106 | 107 | printOutput(outputFile, rows, cols, data); 108 | 109 | delete [] data; 110 | delete [] buff; 111 | 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /chapter9/matrix_matrix_mult/Makefile: -------------------------------------------------------------------------------- 1 | MPICXX= mpic++ 2 | 
MPICXXFLAGS= -O2 -std=c++11 3 | 4 | all: matrix_mult_2D matrix_mult_cols matrix_mult_rows summa 5 | 6 | matrix_mult_2D: matrix_mult_2D.cpp 7 | $(MPICXX) $(MPICXXFLAGS) matrix_mult_2D.cpp -o matrix_mult_2D 8 | 9 | matrix_mult_cols: matrix_mult_cols.cpp 10 | $(MPICXX) $(MPICXXFLAGS) matrix_mult_cols.cpp -o matrix_mult_cols 11 | 12 | matrix_mult_rows: matrix_mult_rows.cpp 13 | $(MPICXX) $(MPICXXFLAGS) matrix_mult_rows.cpp -o matrix_mult_rows 14 | 15 | summa: summa.cpp 16 | $(MPICXX) $(MPICXXFLAGS) summa.cpp -o summa 17 | 18 | clean: 19 | rm -rf matrix_mult_2D 20 | rm -rf matrix_mult_cols 21 | rm -rf matrix_mult_rows 22 | rm -rf summa 23 | -------------------------------------------------------------------------------- /chapter9/matrix_matrix_mult/matrix_mult_cols.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "mpi.h" 7 | 8 | void readInput(std::string file, int rows, int cols, float *data){ 9 | 10 | // Open the file pointer 11 | /*FILE* fp = fopen(file.c_str(), "rb"); 12 | 13 | // Check if the file exists 14 | if(fp == NULL){ 15 | std::cout << "ERROR: File " << file << " could not be opened" << std::endl; 16 | MPI::COMM_WORLD.Abort(1); 17 | } 18 | 19 | for(int i=0; i=0; i--){ 122 | req = MPI::COMM_WORLD.Isend(&B[i*blockCols], 1, colTypeB, i, 0); 123 | } 124 | } 125 | 126 | MPI::COMM_WORLD.Recv(myB, k*blockCols, MPI::FLOAT, 0, 0, status); 127 | 128 | // The multiplication of the submatrices 129 | for(int i=0; i=0; i--){ 143 | MPI::COMM_WORLD.Recv(&C[i*blockCols], 1, colTypeC, i, 0, status); 144 | } 145 | } 146 | 147 | // Measure the current time 148 | double end = MPI::Wtime(); 149 | 150 | colTypeB.Free(); 151 | colTypeC.Free(); 152 | 153 | if(!myId){ 154 | std::cout << "Time with " << numP << " processes: " << end-start << " seconds" << std::endl; 155 | printOutput(outputFile, m, n, C); 156 | delete [] B; 157 | delete [] C; 158 | } 159 | 160 | MPI::COMM_WORLD.Barrier(); 161 | 162 | delete [] A; 163 | delete [] myB; 164 | delete [] myC; 165 | 166 | // Terminate MPI 167 | MPI::Finalize(); 168 | return 0; 169 | } 170 | -------------------------------------------------------------------------------- /chapter9/matrix_matrix_mult/matrix_mult_rows.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "mpi.h" 7 | 8 | void readInput(std::string file, int rows, int cols, float *data){ 9 | 10 | // Open the file pointer 11 | /*FILE* fp = fopen(file.c_str(), "rb"); 12 | 13 | // Check if the file exists 14 | if(fp == NULL){ 15 | std::cout << "ERROR: File " << file << " could not be opened" << std::endl; 16 | MPI::COMM_WORLD.Abort(1); 17 | } 18 | 19 | for(int i=0; i0){ 125 | displs[i] = displs[i-1]+sendCounts[i-1]; 126 | } 127 | 128 | if(i < m%numP){ 129 | sendCounts[i] = (blockRows+1)*k; 130 | } else { 131 | sendCounts[i] = blockRows*k; 132 | } 133 | } 134 | } 135 | 136 | // Scatter the input matrix A 137 | MPI::COMM_WORLD.Scatterv(A, sendCounts, displs, MPI::FLOAT, myA, myRows*k, MPI::FLOAT, 0); 138 | // Broadcast the input matrix B 139 | MPI::COMM_WORLD.Bcast(B, k*n, MPI::FLOAT, 0); 140 | 141 | // The multiplication of the submatrices 142 | for(int i=0; i0){ 157 | displs[i] = displs[i-1]+sendCounts[i-1]; 158 | } 159 | 160 | if(i < m%numP){ 161 | sendCounts[i] = (blockRows+1)*n; 162 | } else { 163 | sendCounts[i] = blockRows*n; 164 | } 165 | } 166 | } 167 | MPI::COMM_WORLD.Gatherv(myC, myRows*n, 
MPI::FLOAT, C, sendCounts, displs, MPI::FLOAT, 0);
168 | 
169 |     // Measure the current time
170 |     double end = MPI::Wtime();
171 | 
172 |     if(!myId){
173 |         std::cout << "Time with " << numP << " processes: " << end-start << " seconds" << std::endl;
174 |         printOutput(outputFile, m, n, C);
175 |         delete [] A;
176 |         delete [] C;
177 |     }
178 | 
179 |     delete [] B;
180 |     delete [] myA;
181 |     delete [] myC;
182 | 
183 |     // Terminate MPI
184 |     MPI::Finalize();
185 |     return 0;
186 | }
187 | 
-------------------------------------------------------------------------------- /chapter9/ping_pong/Makefile: --------------------------------------------------------------------------------
1 | MPICXX= mpic++
2 | MPICXXFLAGS= -O2 -std=c++11
3 | 
4 | all: ping_pong_ring ping_pong_ring_nonblock
5 | 
6 | ping_pong_ring: ping_pong_ring.cpp
7 | 	$(MPICXX) $(MPICXXFLAGS) ping_pong_ring.cpp -o ping_pong_ring
8 | 
9 | ping_pong_ring_nonblock: ping_pong_ring_nonblock.cpp
10 | 	$(MPICXX) $(MPICXXFLAGS) ping_pong_ring_nonblock.cpp -o ping_pong_ring_nonblock
11 | 
12 | clean:
13 | 	rm -rf ping_pong_ring
14 | 	rm -rf ping_pong_ring_nonblock
15 | 
-------------------------------------------------------------------------------- /chapter9/ping_pong/ping_pong_ring.cpp: --------------------------------------------------------------------------------
1 | #include <iostream>
2 | 
3 | #include "mpi.h"
4 | 
5 | int main (int argc, char *argv[]){
6 |     // Initialize MPI
7 |     MPI::Init(argc,argv);
8 |     // Get the number of processes
9 |     int numP=MPI::COMM_WORLD.Get_size();
10 | 
11 |     // Get the ID of the process
12 |     int myId=MPI::COMM_WORLD.Get_rank();
13 | 
14 |     if(argc < 2){
15 |         // Only the first process prints the output message
16 |         if(!myId){
17 |             std::cout << "ERROR: The syntax of the program is " << argv[0]
18 |                       << " num_ping_pong" << std::endl;
19 |         }
20 |         MPI::COMM_WORLD.Abort(1);
21 |     }
22 | 
23 |     int num_ping_pong = atoi(argv[1]);
24 |     int ping_pong_count = 0;
25 |     int next_id = myId+1, prev_id=myId-1;
26 | 
27 |     if(next_id >= numP){
28 |         next_id = 0;
29 |     }
30 |     if(prev_id < 0){
31 |         prev_id = numP-1;
32 |     }
33 | 
34 |     while(ping_pong_count < num_ping_pong){
35 |         // Send the ping to the next process, then return the pong to the previous one
36 |         ping_pong_count++;
37 |         MPI::COMM_WORLD.Send(&ping_pong_count, 1, MPI::INT, next_id, 0);
38 |         std::cout << "Process " << myId << " sends PING number " << ping_pong_count
39 |                   << " to process " << next_id << std::endl;
40 |         MPI::COMM_WORLD.Recv(&ping_pong_count, 1, MPI::INT, prev_id, 0);
41 |         std::cout << "Process " << myId << " receives PING number " << ping_pong_count
42 |                   << " from process " << prev_id << std::endl;
43 | 
44 |         MPI::COMM_WORLD.Send(&ping_pong_count, 1, MPI::INT, prev_id, 0);
45 |         std::cout << "Process " << myId << " sends PONG number " << ping_pong_count
46 |                   << " to process " << prev_id << std::endl;
47 |         MPI::COMM_WORLD.Recv(&ping_pong_count, 1, MPI::INT, next_id, 0);
48 |         std::cout << "Process " << myId << " receives PONG number " << ping_pong_count
49 |                   << " from process " << next_id << std::endl;
50 |     }
51 | 
52 |     // Terminate MPI
53 |     MPI::Finalize();
54 | 
55 |     return 0;
56 | }
57 | 
-------------------------------------------------------------------------------- /chapter9/ping_pong/ping_pong_ring_nonblock.cpp: --------------------------------------------------------------------------------
1 | #include <iostream>
2 | 
3 | #include "mpi.h"
4 | 
5 | int main (int argc, char *argv[]){
6 |     // Initialize MPI
7 |     MPI::Init(argc,argv);
8 |     // Get the number of processes
9 |     int numP=MPI::COMM_WORLD.Get_size();
10 | 
11 |     // Get the ID of the process
12 | int myId=MPI::COMM_WORLD.Get_rank(); 13 | 14 | if(argc < 2){ 15 | // Only the first process prints the output message 16 | if(!myId){ 17 | std::cout << "ERROR: The syntax of the program is " << argv[0] 18 | << " num_ping_pong" << std::endl; 19 | } 20 | MPI::COMM_WORLD.Abort(1); 21 | } 22 | 23 | int num_ping_pong = atoi(argv[1]); 24 | int ping_pong_count = 0; 25 | int next_id = myId+1, prev_id=myId-1; 26 | 27 | if(next_id >= numP){ 28 | next_id = 0; 29 | } 30 | if(prev_id < 0){ 31 | prev_id = numP-1; 32 | } 33 | 34 | MPI::Request rq_send, rq_recv; 35 | 36 | while(ping_pong_count < num_ping_pong){ 37 | // First receive the ping and then send the pong 38 | ping_pong_count++; 39 | rq_send = MPI::COMM_WORLD.Isend(&ping_pong_count, 1, MPI::INT, next_id, 0); 40 | std::cout << "Process " << myId << " sends PING number " << ping_pong_count 41 | << " to process " << next_id << std::endl; 42 | rq_recv = MPI::COMM_WORLD.Irecv(&ping_pong_count, 1, MPI::INT, prev_id, 0); 43 | std::cout << "Process " << myId << " receives PING number " << ping_pong_count 44 | << " from process " << prev_id << std::endl; 45 | 46 | rq_recv.Wait(); 47 | 48 | rq_send = MPI::COMM_WORLD.Isend(&ping_pong_count, 1, MPI::INT, prev_id, 0); 49 | std::cout << "Process " << myId << " sends PONG number " << ping_pong_count 50 | << " to process " << prev_id << std::endl; 51 | rq_recv = MPI::COMM_WORLD.Irecv(&ping_pong_count, 1, MPI::INT, next_id, 0); 52 | std::cout << "Process " << myId << " receives PONG number " << ping_pong_count 53 | << " from process " << next_id << std::endl; 54 | 55 | rq_recv.Wait(); 56 | } 57 | 58 | // Terminate MPI 59 | MPI::Finalize(); 60 | 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /chapter9/primes/Makefile: -------------------------------------------------------------------------------- 1 | MPICXX= mpic++ 2 | MPICXXFLAGS= -O2 -std=c++11 3 | 4 | all: primes_serialized_comm primes 5 | 6 | primes_serialized_comm: primes_serialized_comm.cpp 7 | $(MPICXX) $(MPICXXFLAGS) primes_serialized_comm.cpp -o primes_serialized_comm 8 | 9 | primes: primes.cpp 10 | $(MPICXX) $(MPICXXFLAGS) primes.cpp -o primes 11 | 12 | clean: 13 | rm -rf primes_serialized_comm 14 | rm -rf primes 15 | -------------------------------------------------------------------------------- /chapter9/primes/primes.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "mpi.h" 4 | 5 | int main (int argc, char *argv[]){ 6 | // Initialize MPI 7 | MPI::Init(argc,argv); 8 | 9 | // Get the number of processes 10 | int numP=MPI::COMM_WORLD.Get_size(); 11 | 12 | // Get the ID of the process 13 | int myId=MPI::COMM_WORLD.Get_rank(); 14 | 15 | if(argc < 2){ 16 | // Only the first process prints the output message 17 | if(!myId){ 18 | std::cout << "ERROR: The syntax of the program is " 19 | << argv[0] << " n" << std::endl; 20 | } 21 | MPI::COMM_WORLD.Abort(1); 22 | } 23 | 24 | int n; 25 | 26 | if(!myId){ 27 | n = atoi(argv[1]); 28 | } 29 | 30 | // Barrier to synchronize the processes before measuring time 31 | MPI::COMM_WORLD.Barrier(); 32 | 33 | // Measure the current time 34 | double start = MPI::Wtime(); 35 | 36 | // Send the value of n to all processes 37 | MPI::COMM_WORLD.Bcast(&n, 1, MPI::INT, 0); 38 | 39 | if(n < 1){ 40 | // Only the first process prints the output message 41 | if(!myId){ 42 | std::cout << "ERROR: The parameter 'n' must be higher than 0" << std::endl; 43 | } 44 | MPI::COMM_WORLD.Abort(1); 45 | } 46 | 47 | // 
--------------------------------------------------------------------------------
/chapter9/primes/primes.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cstdlib>

#include "mpi.h"

int main (int argc, char *argv[]){
	// Initialize MPI
	MPI::Init(argc,argv);

	// Get the number of processes
	int numP = MPI::COMM_WORLD.Get_size();

	// Get the ID of the process
	int myId = MPI::COMM_WORLD.Get_rank();

	if(argc < 2){
		// Only the first process prints the output message
		if(!myId){
			std::cout << "ERROR: The syntax of the program is "
			          << argv[0] << " n" << std::endl;
		}
		MPI::COMM_WORLD.Abort(1);
	}

	int n;

	if(!myId){
		n = atoi(argv[1]);
	}

	// Barrier to synchronize the processes before measuring time
	MPI::COMM_WORLD.Barrier();

	// Measure the current time
	double start = MPI::Wtime();

	// Send the value of n to all processes
	MPI::COMM_WORLD.Bcast(&n, 1, MPI::INT, 0);

	if(n < 1){
		// Only the first process prints the output message
		if(!myId){
			std::cout << "ERROR: The parameter 'n' must be greater than 0" << std::endl;
		}
		MPI::COMM_WORLD.Abort(1);
	}

	// Perform the computation of the number of primes between 1 and n in parallel
	int myCount = 0;
	int total;
	bool prime;

	// Each process analyzes only part of the numbers below n
	// The distribution is cyclic for better workload balance
	for(int i=2+myId; i<=n; i=i+numP){
		prime = true;
		for(int j=2; j<i; j++){
			if(i%j == 0){
				prime = false;
				break;
			}
		}

		myCount += prime;
	}

	// Reduce the partial counts into 'total' on process 0
	MPI::COMM_WORLD.Reduce(&myCount, &total, 1, MPI::INT, MPI::SUM, 0);

	// Measure the current time
	double end = MPI::Wtime();

	if(!myId){
		std::cout << total << " primes between 1 and " << n << std::endl;
		std::cout << "Time with " << numP << " processes: " << end-start
		          << " seconds" << std::endl;
	}

	// Terminate MPI
	MPI::Finalize();

	return 0;
}
--------------------------------------------------------------------------------
/chapter9/primes/primes_serialized_comm.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cstdlib>

#include "mpi.h"

int main (int argc, char *argv[]){
	// Initialize MPI
	MPI::Init(argc,argv);

	// Get the number of processes
	int numP = MPI::COMM_WORLD.Get_size();

	// Get the ID of the process
	int myId = MPI::COMM_WORLD.Get_rank();

	if(argc < 2){
		// Only the first process prints the output message
		if(!myId){
			std::cout << "ERROR: The syntax of the program is "
			          << argv[0] << " n" << std::endl;
		}
		MPI::COMM_WORLD.Abort(1);
	}

	int n;

	if(!myId){
		n = atoi(argv[1]);
	}

	// Barrier to synchronize the processes before measuring time
	MPI::COMM_WORLD.Barrier();

	// Measure the current time
	double start = MPI::Wtime();

	// Send the value of n to all processes
	MPI::COMM_WORLD.Bcast(&n, 1, MPI::INT, 0);

	if(n < 1){
		// Only the first process prints the output message
		if(!myId){
			std::cout << "ERROR: The parameter 'n' must be greater than 0" << std::endl;
		}
		MPI::COMM_WORLD.Abort(1);
	}

	// Perform the computation of the number of primes between 1 and n in parallel
	int myCount = 0;
	int total;
	bool prime;

	// Each process analyzes only part of the numbers below n
	// Data to perform a block distribution
	int blockSize = (n-1)/numP;
	int myBlockSize = blockSize;
	int myStart = 2+myId*blockSize;

	// For the cases that n is not a multiple of numP
	if(myId < (n-1)%numP){
		myBlockSize++;
		myStart += myId;
	} else {
		myStart += (n-1)%numP;
	}

	int myEnd = myStart+myBlockSize;

	for(int i=myStart; i<myEnd; i++){
		prime = true;
		for(int j=2; j<i; j++){
			if(i%j == 0){
				prime = false;
				break;
			}
		}

		myCount += prime;
	}

	// The communication is serialized: process 0 gathers the partial counts
	// with blocking point-to-point messages, one sender at a time
	if(!myId){
		total = myCount;
		int value;
		for(int source=1; source<numP; source++){
			MPI::COMM_WORLD.Recv(&value, 1, MPI::INT, source, 0);
			total += value;
		}
	} else {
		MPI::COMM_WORLD.Send(&myCount, 1, MPI::INT, 0, 0);
	}

	// Measure the current time
	double end = MPI::Wtime();

	if(!myId){
		std::cout << total << " primes between 1 and " << n << std::endl;
		std::cout << "Time with " << numP << " processes: " << end-start
		          << " seconds" << std::endl;
	}

	// Terminate MPI
	MPI::Finalize();

	return 0;
}
--------------------------------------------------------------------------------
/include/binary_IO.hpp:
--------------------------------------------------------------------------------
#ifndef BINARY_IO_HPP
#define BINARY_IO_HPP

#include <fstream>
#include <string>

template <
    typename index_t,
    typename value_t>
void dump_binary(
    const value_t * data,
    const index_t length,
    std::string filename) {

    std::ofstream ofile(filename.c_str(), std::ios::binary);
    ofile.write((char*) data, sizeof(value_t)*length);
    ofile.close();
}

template <
    typename index_t,
    typename value_t>
void load_binary(
    value_t * data,         // written to, hence non-const
    const index_t length,
    std::string filename) {

    std::ifstream ifile(filename.c_str(), std::ios::binary);
    ifile.read((char*) data, sizeof(value_t)*length);
    ifile.close();
}

#endif
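A round trip through these helpers looks as follows. This is a minimal usage sketch; the buffer size and the file name example.bin are arbitrary.

#include <vector>
#include "binary_IO.hpp"

int main(){
    std::vector<float> out(1024, 1.0f), in(1024);
    dump_binary(out.data(), out.size(), "example.bin"); // write 1024 floats
    load_binary(in.data(), in.size(), "example.bin");   // read them back
    return 0;
}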
--------------------------------------------------------------------------------
/include/cbf_generator.hpp:
--------------------------------------------------------------------------------
#ifndef CBF_GENERATOR_HPP
#define CBF_GENERATOR_HPP

#include <random>
#include <cstdint>

template <
    typename index_t,
    typename value_t,
    typename label_t>
void generate_cbf(
    value_t * data,
    label_t * labels,
    index_t num_entries,
    index_t num_features) {

    std::mt19937 engine(42);
    std::uniform_int_distribution<index_t> lower_dist(0.125*num_features,
                                                      0.250*num_features);
    std::uniform_int_distribution<index_t> delta_dist(0.250*num_features,
                                                      0.750*num_features);
    std::uniform_real_distribution<value_t> normal_dist(0, 1);

    // create the labels (0: Cylinder, 1: Bell, 2: Funnel)
    for (index_t entry = 0; entry < num_entries; entry++)
        labels[entry] = entry % 3;

    for (index_t entry = 0; entry < num_entries; entry++) {

        const index_t a = lower_dist(engine);
        const index_t bma = delta_dist(engine);
        const value_t amp = normal_dist(engine)+6;

        // Cylinder
        if (labels[entry] == 0) {
            for (index_t index = 0; index < num_features; index++) {
                const value_t value = (index >= a && index < a+bma) ? amp : 0;
                data[entry*num_features+index] = value+normal_dist(engine);
            }
        }

        // Bell
        if (labels[entry] == 1) {
            for (index_t index = 0; index < num_features; index++) {
                const value_t delta = value_t(index)-value_t(a);
                const value_t value = (index >= a && index < a+bma) ?
                                      amp*delta/bma : 0;
                data[entry*num_features+index] = value+normal_dist(engine);
            }
        }

        // Funnel
        if (labels[entry] == 2) {
            for (index_t index = 0; index < num_features; index++) {
                const value_t delta = value_t(a+bma)-value_t(index);
                const value_t value = (index >= a && index < a+bma) ?
                                      amp*delta/bma : 0;
                data[entry*num_features+index] = value+normal_dist(engine);
            }
        }
    }
}

#endif
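The generator is typically called with one flat array holding all time series and one label per series. The sketch below shows the intended calling convention; the sizes (9 series of 128 features) are arbitrary example values.

#include <cstdint>
#include <vector>
#include "cbf_generator.hpp"

int main(){
    const uint64_t num_entries = 9, num_features = 128;
    std::vector<float> data(num_entries*num_features); // series stored row-wise
    std::vector<uint8_t> labels(num_entries);          // 0: Cylinder, 1: Bell, 2: Funnel
    generate_cbf(data.data(), labels.data(), num_entries, num_features);
    return 0;
}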
--------------------------------------------------------------------------------
/include/hpc_helpers.hpp:
--------------------------------------------------------------------------------
#ifndef HPC_HELPERS_HPP
#define HPC_HELPERS_HPP

#include <iostream>
#include <cstdint>

#ifndef __CUDACC__
#include <chrono>
#endif

#ifndef __CUDACC__
#define TIMERSTART(label)                                                      \
    std::chrono::time_point<std::chrono::system_clock> a##label, b##label;     \
    a##label = std::chrono::system_clock::now();
#else
#define TIMERSTART(label)                                                      \
    cudaEvent_t start##label, stop##label;                                     \
    float time##label;                                                         \
    cudaEventCreate(&start##label);                                            \
    cudaEventCreate(&stop##label);                                             \
    cudaEventRecord(start##label, 0);
#endif

#ifndef __CUDACC__
#define TIMERSTOP(label)                                                       \
    b##label = std::chrono::system_clock::now();                               \
    std::chrono::duration<double> delta##label = b##label-a##label;            \
    std::cout << "# elapsed time ("<< #label <<"): "                           \
              << delta##label.count() << "s" << std::endl;
#else
#define TIMERSTOP(label)                                                       \
    cudaEventRecord(stop##label, 0);                                           \
    cudaEventSynchronize(stop##label);                                         \
    cudaEventElapsedTime(&time##label, start##label, stop##label);             \
    std::cout << "TIMING: " << time##label << " ms (" << #label << ")"         \
              << std::endl;
#endif


#ifdef __CUDACC__
#define CUERR {                                                                \
    cudaError_t err;                                                           \
    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
        std::cout << "CUDA error: " << cudaGetErrorString(err) << " : "        \
                  << __FILE__ << ", line " << __LINE__ << std::endl;           \
        exit(1);                                                               \
    }                                                                          \
}

// transfer constants
#define H2D (cudaMemcpyHostToDevice)
#define D2H (cudaMemcpyDeviceToHost)
#define H2H (cudaMemcpyHostToHost)
#define D2D (cudaMemcpyDeviceToDevice)
#endif

// safe division
#define SDIV(x,y)(((x)+(y)-1)/(y))

// no_init_t
#include <type_traits>

template<class T>
class no_init_t {
public:

    static_assert(std::is_fundamental<T>::value &&
                  std::is_arithmetic<T>::value,
                  "wrapped type must be a fundamental, numeric type");

    //do nothing
    constexpr no_init_t() noexcept {}

    //convertible from a T
    constexpr no_init_t(T value) noexcept: v_(value) {}

    //act as a T in all conversion contexts
    constexpr operator T () const noexcept { return v_; }

    // negation on value and bit level
    constexpr no_init_t& operator - () noexcept { v_ = -v_; return *this; }
    constexpr no_init_t& operator ~ () noexcept { v_ = ~v_; return *this; }

    // prefix increment/decrement operators
    constexpr no_init_t& operator ++ () noexcept { v_++; return *this; }
    constexpr no_init_t& operator -- () noexcept { v_--; return *this; }

    // postfix increment/decrement operators
    constexpr no_init_t operator ++ (int) noexcept {
        auto old(*this);
        v_++;
        return old;
    }
    constexpr no_init_t operator -- (int) noexcept {
        auto old(*this);
        v_--;
        return old;
    }

    // assignment operators
    constexpr no_init_t& operator += (T v) noexcept { v_ += v; return *this; }
    constexpr no_init_t& operator -= (T v) noexcept { v_ -= v; return *this; }
    constexpr no_init_t& operator *= (T v) noexcept { v_ *= v; return *this; }
    constexpr no_init_t& operator /= (T v) noexcept { v_ /= v; return *this; }

    // bit-wise operators
    constexpr no_init_t& operator &= (T v) noexcept { v_ &= v; return *this; }
    constexpr no_init_t& operator |= (T v) noexcept { v_ |= v; return *this; }
    constexpr no_init_t& operator ^= (T v) noexcept { v_ ^= v; return *this; }
    constexpr no_init_t& operator >>= (T v) noexcept { v_ >>= v; return *this; }
    constexpr no_init_t& operator <<= (T v) noexcept { v_ <<= v; return *this; }

private:
    T v_;
};

#endif
--------------------------------------------------------------------------------
/include/svd.hpp:
--------------------------------------------------------------------------------
#ifndef HPC_BOOK_SVD_HPP
#define HPC_BOOK_SVD_HPP

#include <cusolverDn.h>
#include "hpc_helpers.hpp"

// wraps cuSOLVER's single-precision SVD;
// returns 0 (false) on success and 1 (true) if any call fails
bool svd_device(
    float * M,
    float * U,
    float * S,
    float * V,
    int height,
    int width,
    bool verbose=false) {

    cusolverDnHandle_t handle;
    cusolverDnCreate(&handle);

    int temp_storage_bytes = 0;
    if (cusolverDnSgesvd_bufferSize(handle, width, height, &temp_storage_bytes))
        return 1;

    float * temp_storage = nullptr;
    if (cudaMalloc(&temp_storage, sizeof(float)*temp_storage_bytes))
        return 1;

    if (verbose)
        std::cout << "CUSOLVER: allocated " << temp_storage_bytes
                  << " bytes of temporary storage." << std::endl;

    int * devInfo;
    if (cudaMalloc(&devInfo, sizeof(int)))
        return 1;

    if (cusolverDnSgesvd(handle, 'A', 'A', height, width,
                         M, height, S, U, height, V, width,
                         temp_storage, temp_storage_bytes, nullptr, devInfo))
        return 1;

    if (verbose)
        std::cout << "CUSOLVER: computed SVD." << std::endl;

    if (cusolverDnDestroy(handle))
        return 1;
    if (cudaFree(temp_storage))
        return 1;
    if (cudaFree(devInfo))
        return 1;

    if (verbose)
        std::cout << "CUSOLVER: freed " << temp_storage_bytes
                  << " bytes of temporary storage." << std::endl;

    return 0;
}

#endif
--------------------------------------------------------------------------------
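On the host, the TIMERSTART/TIMERSTOP macros from hpc_helpers.hpp are used as a matched pair around the code to be measured, as in this minimal sketch (the summation loop is just a dummy workload; the label dummy_loop is arbitrary):

#include <cstdint>
#include "hpc_helpers.hpp"

int main(){
    TIMERSTART(dummy_loop)
    volatile uint64_t sum = 0;                  // volatile keeps the loop
    for (uint64_t i = 0; i < (1UL << 26); i++)  // from being optimized away
        sum = sum + i;
    TIMERSTOP(dummy_loop)
    return 0;
}

Compiled with nvcc, the same pair of macros expands to cudaEvent_t-based timing instead, so the identical source measures kernel time on the device.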