├── README.md
├── chapter10
│   ├── axpy
│   │   ├── Makefile
│   │   └── axpy.cxx
│   ├── hello_world
│   │   ├── Makefile
│   │   └── hello_world.cxx
│   ├── histogram
│   │   ├── Makefile
│   │   ├── histo1.cxx
│   │   ├── histo2.cxx
│   │   └── histo3.cxx
│   ├── letter
│   │   ├── Makefile
│   │   ├── letter1.cxx
│   │   └── letter2.cxx
│   ├── mandelbrot
│   │   ├── Makefile
│   │   ├── mandel1.cxx
│   │   ├── mandel2.cxx
│   │   └── view.py
│   └── matrix_vector
│       ├── Makefile
│       └── matrix_vector.cxx
├── chapter3
│   ├── AVX
│   │   ├── Makefile
│   │   ├── matrix_matrix_mult.cpp
│   │   ├── pointwise_vector_max.cpp
│   │   ├── vector_max.cpp
│   │   ├── vector_norm_aos_avx.cpp
│   │   ├── vector_norm_aos_plain.cpp
│   │   ├── vector_norm_soa_avx.cpp
│   │   └── vector_norm_soa_plain.cpp
│   ├── include
│   └── matrix_matrix_mult_transposed
│       ├── Makefile
│       └── matrix_mult.cpp
├── chapter4
│   ├── all_pairs_distance_matrix
│   │   ├── Makefile
│   │   ├── all_pair.cpp
│   │   └── data
│   │       └── mnist_exporter.py
│   ├── condition_variables
│   │   ├── Makefile
│   │   ├── alarm_clock.cpp
│   │   ├── one_shot_alarm_clock.cpp
│   │   └── ping_pong.cpp
│   ├── false_sharing
│   │   ├── Makefile
│   │   └── false_sharing.cpp
│   ├── hello_world
│   │   ├── Makefile
│   │   └── hello_world.cpp
│   ├── include
│   ├── matrix_vector_mult
│   │   ├── Makefile
│   │   └── matrix_vector.cpp
│   ├── return_values
│   │   ├── Makefile
│   │   ├── async.cpp
│   │   ├── packaged_task.cpp
│   │   ├── promise_future.cpp
│   │   └── traditional.cpp
│   └── thread_pool
│       ├── Makefile
│       ├── main_basic.cpp
│       ├── main_basic_tree.cpp
│       └── threadpool_basic.hpp
├── chapter5
│   ├── atomics
│   │   ├── Makefile
│   │   ├── arbitrary_atomics.cpp
│   │   ├── atomic_counting.cpp
│   │   ├── atomic_max.cpp
│   │   ├── query_atomics.cpp
│   │   └── universal_atomics.cpp
│   ├── include
│   ├── knapsack
│   │   ├── Makefile
│   │   ├── knapsack.cpp
│   │   └── threadpool.hpp
│   └── thread_pool
│       ├── Makefile
│       ├── threadpool.hpp
│       └── tree.cpp
├── chapter6
│   ├── 1NN_classification
│   │   ├── 1NN.cpp
│   │   ├── Makefile
│   │   └── data
│   │       └── mnist_exporter.py
│   ├── advanced_reductions
│   │   ├── Makefile
│   │   ├── avx_reduction.cpp
│   │   ├── custom_reduction.cpp
│   │   └── string_reduction.cpp
│   ├── hello_world
│   │   ├── Makefile
│   │   └── hello_world.cpp
│   ├── include
│   ├── load_imbalance
│   │   ├── Makefile
│   │   ├── data
│   │   └── scheduling.cpp
│   ├── matrix_vector
│   │   ├── Makefile
│   │   └── matrix_vector.cpp
│   ├── softmax_regression
│   │   ├── Makefile
│   │   ├── data
│   │   │   └── mnist_softmax.py
│   │   └── softmax.cpp
│   └── vector_add
│       ├── Makefile
│       ├── vector_add.cpp
│       └── vector_add_scoped.cpp
├── chapter7
│   ├── dynamic_time_warping
│   │   ├── Makefile
│   │   ├── dtw_device.cu
│   │   └── dtw_host.cu
│   ├── eigenfaces
│   │   ├── Makefile
│   │   ├── covariance.cu
│   │   ├── data
│   │   │   ├── convert_images.py
│   │   │   └── img_align_celeba
│   │   │       └── README.md
│   │   ├── eigenfaces.cu
│   │   ├── mean_computation.cu
│   │   └── mean_correction.cu
│   ├── hello_world
│   │   ├── Makefile
│   │   └── hello_world.cu
│   └── include
├── chapter8
│   ├── include
│   ├── intrinsics_and_atomics
│   │   ├── Makefile
│   │   ├── atomics.cu
│   │   ├── cas.cu
│   │   └── znorm.cu
│   ├── multi_gpu
│   │   ├── Makefile
│   │   ├── multi_gpu.cu
│   │   ├── multi_streamed_gpu.cu
│   │   ├── single_gpu.cu
│   │   └── streamed_gpu.cu
│   └── uvm
│       ├── Makefile
│       └── uvm_minimal_example.cu
├── chapter9
│   ├── hello_world
│   │   ├── Makefile
│   │   └── hello_world.cpp
│   ├── jacobi_iteration
│   │   ├── Makefile
│   │   ├── jacobi_1D_block.cpp
│   │   ├── jacobi_1D_block_simple.cpp
│   │   ├── jacobi_1D_nonblock.cpp
│   │   └── jacobi_seq.cpp
│   ├── matrix_matrix_mult
│   │   ├── Makefile
│   │   ├── matrix_mult_2D.cpp
│   │   ├── matrix_mult_cols.cpp
│   │   ├── matrix_mult_rows.cpp
│   │   └── summa.cpp
│   ├── ping_pong
│   │   ├── Makefile
│   │   ├── ping_pong_ring.cpp
│   │   └── ping_pong_ring_nonblock.cpp
│   └── primes
│       ├── Makefile
│       ├── primes.cpp
│       └── primes_serialized_comm.cpp
└── include
    ├── binary_IO.hpp
    ├── bitmap_IO.hpp
    ├── cbf_generator.hpp
    ├── hpc_helpers.hpp
    └── svd.hpp

/README.md:
--------------------------------------------------------------------------------
# parallelprogrammingbook
supplementary material/programming exercises
--------------------------------------------------------------------------------
/chapter10/axpy/Makefile:
--------------------------------------------------------------------------------
UPCXXHOME= /opt/upcxx/
UPCXX= $(UPCXXHOME)/bin/upc++
UPCXXINC= $(UPCXXHOME)/include/upcxx/
UPCXXFLAGS= -O2 -std=c++11 -DGASNET_SEQ -DUSE_GASNET_FAST_SEGMENT -DONLY_MSPACES
GASNETRUN= /opt/gasnet/bin/gasnetrun_mpi -n 4 # install gasnet and choose backend

all: axpy

axpy: axpy.cxx
	$(UPCXX) $(UPCXXFLAGS) axpy.cxx -o axpy -I $(UPCXXINC)

axpy_run: axpy
	$(GASNETRUN) axpy 4 1

clean:
	rm -rf axpy
--------------------------------------------------------------------------------
/chapter10/axpy/axpy.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h> // header name assumed (old UPC++ API)
#include <cstdio>  // FILE, fopen, fwrite

void printOutput(int n, upcxx::shared_array<float> data){
    FILE *fp = fopen("outAXPY.txt", "wb");
    // Check if the file was opened
    if(fp == NULL){
        std::cout << "ERROR: Output file outAXPY.txt could not be opened" << std::endl;
        exit(1);
    }

    float aux;
    for(int i=0; i
...
    upcxx::shared_array<float> x(n);
    upcxx::shared_array<float> y(n);

    // To measure time
    upcxx::timer t;
    upcxx::barrier();
    t.start();

    // Example accessing memory without affinity
    // Initialize arrays
    for(int i=myId; i
...
--------------------------------------------------------------------------------
/chapter10/hello_world/hello_world.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

int main (int argc, char *argv[]){
    // Initialize UPC++
    upcxx::init(&argc, &argv);

    // Every process prints Hello
    std::cout << "Thread " << upcxx::myrank() << " of " << upcxx::ranks() << ": Hello, world!"
              << std::endl;

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/histogram/Makefile:
--------------------------------------------------------------------------------
UPCXX= /opt/upcxx/bin/upc++ # install UPC++ and specify binary
UPCXXFLAGS= -O2 -std=c++11 -DGASNET_SEQ -DUSE_GASNET_FAST_SEGMENT -DONLY_MSPACES
GASNETRUN= /opt/gasnet/bin/gasnetrun_mpi -n 4 # install gasnet and choose backend

all: histo1 histo2 histo3

histo1: histo1.cxx
	$(UPCXX) $(UPCXXFLAGS) histo1.cxx -o histo1

histo1_run: histo1
	$(GASNETRUN) histo1 16 16

histo2: histo2.cxx
	$(UPCXX) $(UPCXXFLAGS) histo2.cxx -o histo2

histo2_run: histo2
	$(GASNETRUN) histo2 16 16

histo3: histo3.cxx
	$(UPCXX) $(UPCXXFLAGS) histo3.cxx -o histo3

histo3_run: histo3
	$(GASNETRUN) histo3 16 16

clean:
	rm -rf histo1
	rm -rf histo2
	rm -rf histo3
--------------------------------------------------------------------------------
/chapter10/histogram/histo1.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

void readImage(int rows, int cols, int *image){

    for(int i=0; i
...
void printHistogram(upcxx::shared_array<int> h){

    for(int i=0; i<256; i++)
        std::cout << h[i] << " ";

    std::cout << std::endl;
}

upcxx::shared_lock l;

int main (int argc, char *argv[]){
    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(argc < 3){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is "
                << argv[0] << " rows cols" << std::endl;
        }
        exit(1);
    }

    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);

    if(rows < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be higher than 0" << std::endl;
        exit(1);
    }

    if(cols < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'cols' must be higher than 0" << std::endl;
        exit(1);
    }

    if(rows%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be a multiple of the number of processes" << std::endl;
        exit(1);
    }

    // Create the array of global pointers
    upcxx::shared_array<upcxx::global_ptr<int>> p(numT);

    // Each thread allocates the memory of its subspace
    int blockRows = rows/numT;
    p[myId] = upcxx::allocate(myId, blockRows*cols*sizeof(int));

    // Thread 0 reads the image and copies the fragments
    if(!myId){
        int *block = new int[blockRows*cols];
        int *block2 = new int[blockRows*cols];
        upcxx::event e;

        readImage(blockRows, cols, block);

        for(int i=0; i<numT-1; i++){
            upcxx::async_copy<int>(block, p[i], blockRows*cols, &e);

            // Overlap the copy with reading the next fragment
            // We cannot use "block" for the next fragment because it has not been sent
            readImage(blockRows, cols, block2);

            // The previous copy must have finished to reuse its buffer
            e.wait();
            int *aux = block;
            block = block2;
            block2 = aux;
        }

        // The last copy does not overlap
        upcxx::copy(block, p[numT-1], blockRows*cols);

        delete [] block;
        delete [] block2;
    }

    // Threads must wait until Thread 0 has copied the fragments of the image
    upcxx::barrier();

    // Privatize the pointer
    int *myImage = (int *) (upcxx::global_ptr<int>) p[myId];

    // Check whether it is really local
    if(!((upcxx::global_ptr<int>) p[myId]).is_local())
        std::cout << "Thread " << myId << " not accessing local memory" << std::endl;

    // Declare the histogram
    upcxx::shared_array<int> histogram(256);
    for(int i=myId; i<256; i+=numT)
        histogram[i] = 0;

    // Threads must wait until all locks and histogram have been initialized
    upcxx::barrier();

    // Examine the local image
    for(int i=0; i
...
    upcxx::deallocate<int>(p[myId]);

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/histogram/histo2.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

void readImage(int rows, int cols, int *image){

    for(int i=0; i
...
void printHistogram(upcxx::shared_array<int> h){

    for(int i=0; i<256; i++)
        std::cout << h[i] << " ";

    std::cout << std::endl;
}

upcxx::shared_array<upcxx::shared_lock> locks;

int main (int argc, char *argv[]){
    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(argc < 3){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is "
                << argv[0] << " rows cols" << std::endl;
        }
        exit(1);
    }

    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);

    if(rows < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be higher than 0" << std::endl;
        exit(1);
    }

    if(cols < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'cols' must be higher than 0" << std::endl;
        exit(1);
    }

    if(rows%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be a multiple of the number of processes" << std::endl;
        exit(1);
    }

    // Create the array of global pointers
    upcxx::shared_array<upcxx::global_ptr<int>> p(numT);

    // Each thread allocates the memory of its subspace
    int blockRows = rows/numT;
    p[myId] = upcxx::allocate(myId, blockRows*cols*sizeof(int));

    // Thread 0 reads the image and copies the fragments
    if(!myId){
        int *block = new int[blockRows*cols];
        int *block2 = new int[blockRows*cols];
        upcxx::event e;

        readImage(blockRows, cols, block);

        for(int i=0; i<numT-1; i++){
            upcxx::async_copy<int>(block, p[i], blockRows*cols, &e);

            // Overlap the copy with reading the next fragment
            // We cannot use "block" for the next fragment because it has not been sent
            readImage(blockRows, cols, block2);

            // The previous copy must have finished to reuse its buffer
            e.wait();
            int *aux = block;
            block = block2;
            block2 = aux;
        }

        // The last copy does not overlap
        upcxx::copy(block, p[numT-1], blockRows*cols);

        delete [] block;
        delete [] block2;
    }

    // Threads must wait until Thread 0 has copied the fragments of the image
    upcxx::barrier();

    // Privatize the pointer
    int *myImage = (int *) (upcxx::global_ptr<int>) p[myId];

    // Check whether it is really local
    if(!((upcxx::global_ptr<int>) p[myId]).is_local())
        std::cout << "Thread " << myId << " not accessing local memory" << std::endl;

    // Declare the histogram
    upcxx::shared_array<int> histogram(256);
    for(int i=myId; i<256; i+=numT)
        histogram[i] = 0;

    // Initialize the locks
    locks.init(256);
    for(int i=myId; i<256; i+=numT)
        new (locks[i].raw_ptr()) upcxx::shared_lock(myId);

    // Threads must wait until all locks and histogram have been initialized
    upcxx::barrier();

    // Examine the local image
    for(int i=0; i
...
    upcxx::deallocate<int>(p[myId]);

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/histogram/histo3.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

upcxx::shared_array<upcxx::atomic<int>> histogram;

void readImage(int rows, int cols, int *image){

    for(int i=0; i
...
void printHistogram(){

    for(int i=0; i<256; i++)
        std::cout << ((upcxx::atomic<int>) histogram[i]).load() << " ";

    std::cout << std::endl;
}

int main (int argc, char *argv[]){
    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(argc < 3){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is "
                << argv[0] << " rows cols" << std::endl;
        }
        exit(1);
    }

    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);

    if(rows < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be higher than 0" << std::endl;
        exit(1);
    }

    if(cols < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'cols' must be higher than 0" << std::endl;
        exit(1);
    }

    if(rows%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be a multiple of the number of processes" << std::endl;
        exit(1);
    }

    // Create the array of global pointers
    upcxx::shared_array<upcxx::global_ptr<int>> p(numT);

    // Each thread allocates the memory of its subspace
    int blockRows = rows/numT;
    p[myId] = upcxx::allocate(myId, blockRows*cols*sizeof(int));

    // Thread 0 reads the image and copies the fragments
    if(!myId){
        int *block = new int[blockRows*cols];
        int *block2 = new int[blockRows*cols];
        upcxx::event e;

        readImage(blockRows, cols, block);

        for(int i=0; i<numT-1; i++){
            upcxx::async_copy<int>(block, p[i], blockRows*cols, &e);

            // Overlap the copy with reading the next fragment
            // We cannot use "block" for the next fragment because it has not been sent
            readImage(blockRows, cols, block2);

            // The previous copy must have finished to reuse its buffer
            e.wait();
            int *aux = block;
            block = block2;
            block2 = aux;
        }

        // The last copy does not overlap
        upcxx::copy(block, p[numT-1], blockRows*cols);

        delete [] block;
        delete [] block2;
    }

    // Threads must wait until Thread 0 has copied the fragments of the image
    upcxx::barrier();

    // Privatize the pointer
    int *myImage = (int *) (upcxx::global_ptr<int>) p[myId];

    // Check whether it is really local
    if(!((upcxx::global_ptr<int>) p[myId]).is_local())
        std::cout << "Thread " << myId << " not accessing local memory" << std::endl;

std::cout << "To init histogram" << std::endl; 110 | 111 | // Initialize the histogram 112 | histogram.init(256); 113 | for(int i=myId; i<256; i+=numT){ 114 | std::cout << "Before, histogram[" << i << "] = " << histogram[i].get().load() << std::endl; 115 | //((upcxx::atomic) histogram[i]).store(1); 116 | histogram[i].get().store(1); 117 | std::cout << "After, histogram[" << i << "] = " << histogram[i].get().load() << std::endl; 118 | } 119 | 120 | std::cout << "histogram initialized" << std::endl; 121 | 122 | // Threads must wait until the histogram has been initialized 123 | upcxx::barrier(); 124 | 125 | // Examine the local image 126 | /*for(int i=0; i) histogram[myImage[i]]).fetch_add(1);*/ 129 | 130 | // All threads must have finished their local computation 131 | upcxx::barrier(); 132 | 133 | if(!myId) 134 | printHistogram(); 135 | 136 | // Deallocate the local memory 137 | upcxx::deallocate(p[myId]); 138 | 139 | // Terminate UPC++ 140 | upcxx::finalize(); 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /chapter10/letter/Makefile: -------------------------------------------------------------------------------- 1 | UPCXXHOME= /opt/upcxx/ 2 | UPCXX= $(UPCXXHOME)/bin/upc++ 3 | UPCXXINC= $(UPCXXHOME)/include/upcxx/ 4 | UPCXXFLAGS= -O2 -std=c++11 -DGASNET_SEQ -DUSE_GASNET_FAST_SEGMENT -DONLY_MSPACES 5 | GASNETRUN= /opt/gasnet/bin/gasnetrun_mpi -n 4 # install gasnet and choose backend 6 | 7 | all: letter1 letter2 8 | 9 | letter1: letter1.cxx 10 | $(UPCXX) $(UPCXXFLAGS) letter1.cxx -o letter1 -I $(UPCXXINC) 11 | 12 | letter1_run: letter1 13 | $(GASNETRUN) letter1 C 4 14 | 15 | letter2: letter2.cxx 16 | $(UPCXX) $(UPCXXFLAGS) letter2.cxx -o letter2 -I $(UPCXXINC) 17 | 18 | letter2_run: letter2 19 | $(GASNETRUN) letter2 C 4 20 | 21 | clean: 22 | rm -rf letter1 23 | rm -rf letter2 24 | -------------------------------------------------------------------------------- /chapter10/letter/letter1.cxx: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void readText(int n, char *text){ 4 | 5 | int i; 6 | for(i=0; i 1){ 16 | text[i*4+1] = 'C'; 17 | if((n%4) > 2){ 18 | text[i*4+2] = 'G'; 19 | } 20 | } 21 | } 22 | } 23 | 24 | int main (int argc, char *argv[]){ 25 | // Initialize UPC++ 26 | upcxx::init(&argc, &argv); 27 | 28 | int numT = upcxx::ranks(); 29 | int myId = upcxx::myrank(); 30 | 31 | if(argc < 3){ 32 | // Only the first process prints the output message 33 | if(!MYTHREAD){ 34 | std::cout << "ERROR: The syntax of the program is " 35 | << argv[0] << " l n" << std::endl; 36 | } 37 | exit(1); 38 | } 39 | 40 | char l = *argv[1]; 41 | int n = atoi(argv[2]); 42 | 43 | if(n < 0){ 44 | // Only the first process prints the output message 45 | if(!myId) 46 | std::cout << "ERROR: 'n' must be higher than 0" << std::endl; 47 | 48 | exit(1); 49 | } 50 | 51 | if(n%numT){ 52 | // Only the first process prints the output message 53 | if(!myId) 54 | std::cout << "ERROR: 'n' must multiple of the number of processes" << std::endl; 55 | 56 | exit(1); 57 | } 58 | 59 | // Create the array of global pointers 60 | upcxx::shared_array> p(numT); 61 | 62 | // Each thread allocates the memory of its subspace 63 | int blockFactor = n/numT; 64 | p[myId] = upcxx::allocate(myId, blockFactor*sizeof(char)); 65 | 66 | // Thread 0 reads the text and copy the fragments 67 | if(!myId){ 68 | char *text = new char[100]; 69 | readText(n, text); 70 | 71 | for(int i=0; i(&text[blockFactor*i], p[i], blockFactor); 73 | 74 | 
        delete [] text;
    }

    // Threads must wait until Thread 0 has copied the fragments of the text
    upcxx::barrier();

    // Privatize the pointer
    int myNumOcc = 0;
    char *myText = (char *) (upcxx::global_ptr<char>) p[myId];

    // Check whether it is really local
    if(!((upcxx::global_ptr<char>) p[myId]).is_local())
        std::cout << "Thread " << myId << " not accessing local memory" << std::endl;

    // Find the local occurrences
    for(int i=0; i
...
    upcxx::shared_array<int> occs(numT);
    occs[myId] = myNumOcc;

    // All threads must have made their local occurrences accessible
    upcxx::barrier();

    if(!myId){
        int numOcc = myNumOcc;
        for(int i=1; i
...
    upcxx::deallocate<char>(p[myId]);

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/letter/letter2.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

void readText(int n, char *text){

    int i;
    for(i=0; i
...
        if((n%4) > 1){
            text[i*4+1] = 'C';
            if((n%4) > 2){
                text[i*4+2] = 'G';
            }
        }
    }
}

int main (int argc, char *argv[]){
    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(argc < 3){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is ./letter l n"
                << std::endl;
        }
        exit(1);
    }

    char l = *argv[1];
    int n = atoi(argv[2]);

    if(n < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'n' must be higher than 0" << std::endl;

        exit(1);
    }

    if(n%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'n' must be a multiple of the number of processes" << std::endl;

        exit(1);
    }

    // Create the array of global pointers
    upcxx::shared_array<upcxx::global_ptr<char>> p(numT);

    // Each thread allocates the memory of its subspace
    int blockFactor = n/numT;
    p[myId] = upcxx::allocate(myId, blockFactor*sizeof(char));

    // Thread 0 reads the text and copies the fragments
    if(!myId){
        char *text = new char[blockFactor];
        char *text2 = new char[blockFactor];
        upcxx::event e;

        readText(blockFactor, text);

        for(int i=0; i<numT-1; i++){
            upcxx::async_copy<char>(text, p[i], blockFactor, &e);

            // Overlap the copy with reading the next fragment
            // We cannot use text for the next fragment because it has not been sent
            readText(blockFactor, text2);
            char *aux = text;
            text = text2;
            text2 = aux;

            // The previous copy must have finished to reuse its buffer
            e.wait();
        }

        // The last copy does not overlap
        upcxx::copy(text, p[numT-1], blockFactor);

        delete [] text;
        delete [] text2;
    }

    // Threads must wait until Thread 0 has copied the fragments of the text
    upcxx::barrier();

    // Privatize the pointer
    int myNumOcc = 0;
    char *myText = (char *) (upcxx::global_ptr<char>) p[myId];

    // Check whether it is really local
    if(!((upcxx::global_ptr<char>) p[myId]).is_local())
        std::cout << "Thread " << myId << " not accessing local memory" << std::endl;

    // Find the local occurrences
    for(int i=0; i
...
    upcxx::deallocate<char>(p[myId]);

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
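--------------------------------------------------------------------------------
The letter2.cxx listing above (like the histogram readers) relies on one reusable idiom: double buffering, where sending the current fragment overlaps with producing the next one. The following minimal sketch isolates that idiom in plain C++ so it compiles without UPC++; std::async and std::future merely stand in for upcxx::async_copy and upcxx::event, and read_fragment/send_fragment are hypothetical placeholders, not functions from the book's code.

#include <future>    // std::async, std::future
#include <iostream>  // std::cout
#include <utility>   // std::swap
#include <vector>    // std::vector
#include <algorithm> // std::fill

void read_fragment(int n, char *buf) {        // stands in for readText()
    std::fill(buf, buf + n, 'A');
}

void send_fragment(const char *buf, int n) {  // stands in for upcxx::async_copy()
    // pretend to push n bytes to a remote rank
    (void)buf; (void)n;
}

int main() {
    const int numT = 4, blockFactor = 1024;   // illustrative sizes
    std::vector<char> a(blockFactor), b(blockFactor);
    char *cur = a.data(), *next = b.data();

    read_fragment(blockFactor, cur);
    for (int i = 0; i < numT - 1; i++) {
        // launch the "copy" of the current buffer asynchronously ...
        auto e = std::async(std::launch::async, send_fragment, cur, blockFactor);
        // ... and overlap it with producing the next fragment in the other buffer
        read_fragment(blockFactor, next);
        e.wait();                 // corresponds to e.wait() on the upcxx::event
        std::swap(cur, next);     // only now may the sent buffer be reused
    }
    send_fragment(cur, blockFactor);  // the last copy is not overlapped
    std::cout << "all fragments sent" << std::endl;
}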
--------------------------------------------------------------------------------
/chapter10/mandelbrot/Makefile:
--------------------------------------------------------------------------------
UPCXXHOME= /opt/upcxx/
UPCXX= $(UPCXXHOME)/bin/upc++
UPCXXINC= $(UPCXXHOME)/include/upcxx/
UPCXXFLAGS= -O2 -std=c++11 -DGASNET_SEQ -DUSE_GASNET_FAST_SEGMENT -DONLY_MSPACES
GASNETRUN= /opt/gasnet/bin/gasnetrun_mpi -n 4 # install gasnet and choose backend

all: mandel1 mandel2

mandel1: mandel1.cxx
	$(UPCXX) $(UPCXXFLAGS) mandel1.cxx -o mandel1 -I $(UPCXXINC)

mandel1_run: mandel1
	$(GASNETRUN) mandel1 512 512 1024

mandel2: mandel2.cxx
	$(UPCXX) $(UPCXXFLAGS) mandel2.cxx -o mandel2 -I $(UPCXXINC)

mandel2_run: mandel2
	$(GASNETRUN) mandel2 512 512 1024

clean:
	rm -rf mandel1
	rm -rf mandel2
--------------------------------------------------------------------------------
/chapter10/mandelbrot/mandel1.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

void printMandel(int *image, int rows, int cols){

    for(int i=0; i
...
    if(k >= maxIter)
        return 0;

    return k;
}

int main (int argc, char *argv[]){

    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(argc < 4){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is "
                << argv[0] << " rows cols maxIter" << std::endl;
        }
        exit(1);
    }

    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);
    int maxIter = atoi(argv[3]);

    if(rows < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be higher than 0" << std::endl;
        exit(1);
    }

    if(cols < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'cols' must be higher than 0" << std::endl;
        exit(1);
    }

    if(maxIter < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'maxIter' must be higher than 0" << std::endl;
        exit(1);
    }

    if(rows%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be a multiple of the number of processes" << std::endl;
        exit(1);
    }

    // Output array
    int blockRows = rows/numT;
    int myImage[blockRows*cols];
    upcxx::shared_var<upcxx::global_ptr<int>> outImage;

    // Only the owner allocates the array to gather the output
    if(!myId){
        outImage.put(upcxx::allocate(0, rows*cols*sizeof(int)));
    }

    // To guarantee that memory is allocated
    upcxx::barrier();

    // Mandel computation of the block of rows
    for(int i=0; i
...
    upcxx::copy<int>(myImage, (upcxx::global_ptr<int>) &(outImage.get())[myId*blockRows*cols], blockRows*cols);

    // All threads must have finished their local computation
    upcxx::barrier();

    if(!myId){
        printMandel((int *) outImage.get(), rows, cols);
        // Deallocate the local memory
        upcxx::deallocate(outImage.get());
    }

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/mandelbrot/mandel2.cxx:
--------------------------------------------------------------------------------
#include <upcxx.h>

// Output array
upcxx::shared_var<upcxx::global_ptr<int>> outImage;

// Array to know the busy threads
upcxx::shared_array<bool> busyTh;

void printMandel(int *image, int rows, int cols){

    for(int i=0; i
...
    if(k >= maxIter)
        return 0;

    return k;
}


void mandelRow(int iterRow, int th, int rows, int cols, int maxIter){
    int rowRes[cols];

    for(int j=0; j
...
    upcxx::copy<int>(rowRes, (upcxx::global_ptr<int>) &(outImage.get())[iterRow*cols], cols);

    busyTh[th] = false;
}

int main (int argc, char *argv[]){

    // Initialize UPC++
    upcxx::init(&argc, &argv);

    int numT = upcxx::ranks();
    int myId = upcxx::myrank();

    if(numT == 1){
        std::cout << "ERROR: More than 1 thread is required for this master-slave approach"
                  << std::endl;
        exit(1);
    }

    if(argc < 4){
        // Only the first process prints the output message
        if(!MYTHREAD){
            std::cout << "ERROR: The syntax of the program is "
                << argv[0] << " rows cols maxIter" << std::endl;
        }
        exit(1);
    }

    int rows = atoi(argv[1]);
    int cols = atoi(argv[2]);
    int maxIter = atoi(argv[3]);

    if(rows < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be higher than 0" << std::endl;
        exit(1);
    }

    if(cols < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'cols' must be higher than 0" << std::endl;
        exit(1);
    }

    if(maxIter < 0){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'maxIter' must be higher than 0" << std::endl;
        exit(1);
    }

    if(rows%numT){
        // Only the first process prints the output message
        if(!myId)
            std::cout << "ERROR: 'rows' must be a multiple of the number of processes" << std::endl;
        exit(1);
    }

    // Initialize the lazy array
    // All elements with affinity to Thread 0
    busyTh.init(numT);
    busyTh[myId] = false;

    // To guarantee that busyTh is initialized
    upcxx::barrier();

    // Thread 0 is the master
    if(!myId){
        outImage.put(upcxx::allocate(0, rows*cols*sizeof(int)));
        int nextTh = 1;

        // While there are more rows
        for(int i=0; i
...
        upcxx::deallocate<int>(outImage.get());
    }

    // Terminate UPC++
    upcxx::finalize();
    return 0;
}
--------------------------------------------------------------------------------
/chapter10/mandelbrot/view.py:
--------------------------------------------------------------------------------
# use as follows:
# $ ./mandel > mandel.txt
# $ python2 view.py mandel.txt

# install numpy and matplotlib from standard repositories
# or locally with pip
# pip install --user numpy
# pip install --user matplotlib

import numpy as np
import matplotlib.pyplot as plt
import math
import sys

# Extract points from specified file
im = np.loadtxt( sys.argv[1] )

# Display
plt.imshow(im,cmap=plt.cm.flag)
plt.show()
--------------------------------------------------------------------------------
/chapter10/matrix_vector/Makefile:
--------------------------------------------------------------------------------
UPCXXHOME= /opt/upcxx/
UPCXX= $(UPCXXHOME)/bin/upc++
UPCXXINC= $(UPCXXHOME)/include/upcxx/
UPCXXFLAGS= -O2 -std=c++11 -DGASNET_SEQ -DUSE_GASNET_FAST_SEGMENT -DONLY_MSPACES
GASNETRUN= /opt/gasnet/bin/gasnetrun_mpi -n 4 # install gasnet and choose backend

all: matrix_vector

matrix_vector: matrix_vector.cxx
	$(UPCXX) $(UPCXXFLAGS) matrix_vector.cxx -o matrix_vector -I $(UPCXXINC)

matrix_vector_run: matrix_vector
	$(GASNETRUN) matrix_vector 128 256


clean:
	rm -rf matrix_vector
--------------------------------------------------------------------------------
/chapter10/matrix_vector/matrix_vector.cxx:
--------------------------------------------------------------------------------
#include
#include
#include
#include

#include
#include

void readInput(int m, int n, float *A, float *x){

    // checkerboard
    for(int i=0; i
...
    upcxx::shared_var<upcxx::global_ptr<float>> globalA, globalx, globaly;
    upcxx::global_ptr<float> A, x, y;

    if(!myId){
        // Allocate shared memory with affinity to process 0 to store the whole matrices
        A = upcxx::allocate(0, m*n);
        x = upcxx::allocate(0, n);
        y = upcxx::allocate(0, m);
        readInput(m, n, (float *)A, (float *)x);
        globalA = A;
        globalx = x;
        globaly = y;
    }

    size_t blockRows = m/numP;

    // To measure time
    upcxx::timer t;

    // Barrier to guarantee that 'A' and 'x' are initialized
    upcxx::barrier();
    t.start();

    A = globalA;
    x = globalx;
    y = globaly;

    // First option, directly access in computation to shared memory
    for(size_t i=myId*blockRows; i<(myId+1)*blockRows; i++){
        y[i] = 0;
        for(size_t j=0; j
...
    upcxx::global_ptr<float> privA = upcxx::allocate(myId, blockRows*n);
    upcxx::global_ptr<float> privX = upcxx::allocate(myId, n);
    upcxx::global_ptr<float> privY = upcxx::allocate(myId, blockRows);

    upcxx::copy(A+blockRows*n*myId, privA, blockRows*n);
    upcxx::copy(x, privX, n);

    for(size_t i=0; i
...
--------------------------------------------------------------------------------
/chapter3/AVX/matrix_matrix_mult.cpp:
--------------------------------------------------------------------------------
#include <random>      // prng
#include <cstdint>     // uint32_t
#include <iostream>    // std::cout
#include <immintrin.h> // AVX intrinsics

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void init(float * data, uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1, 1);

    for (uint64_t i = 0; i < length; i++)
        data[i] = density(engine);
}

inline float hsum_sse3(__m128 v) {
    __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0
    __m128 sums = _mm_add_ps(v, shuf);
    shuf = _mm_movehl_ps(shuf, sums); // high half -> low half
    sums = _mm_add_ss(sums, shuf);
    return _mm_cvtss_f32(sums);
}

inline float hsum_avx(__m256 v) {
    __m128 lo = _mm256_castps256_ps128(v);   // low 128
    __m128 hi = _mm256_extractf128_ps(v, 1); // high 128
    lo = _mm_add_ps(lo, hi);                 // add the low and high 128
    return hsum_sse3(lo);                    // and inline the sse3 version
}

void plain_dmm(float * A,
               float * B,
               float * C,
               uint64_t M,
               uint64_t L,
               uint64_t N,
               bool parallel) {

    #pragma omp parallel for collapse(2) if(parallel)
    for (uint64_t i = 0; i < M; i++)
        for (uint64_t j = 0; j < N; j++) {
            float accum = float(0);
            for (uint64_t k = 0; k < L; k++)
                accum += A[i*L+k]*B[j*L+k];
            C[i*N+j] = accum;
        }
}

void avx_dmm(float * A,
             float * B,
             float * C,
             uint64_t M,
             uint64_t L,
             uint64_t N,
             bool parallel) {

    #pragma omp parallel for collapse(2) if(parallel)
    for (uint64_t i = 0; i < M; i++)
        for (uint64_t j = 0; j < N; j++) {

            __m256 X = _mm256_setzero_ps();
            for (uint64_t k = 0; k < L; k += 8) {
                const __m256 AV = _mm256_load_ps(A+i*L+k);
                const __m256 BV = _mm256_load_ps(B+j*L+k);
                X = _mm256_add_ps(X, _mm256_mul_ps(AV, BV));
            }

            C[i*N+j] = hsum_avx(X);
        }
}

void avx_dmm_unroll_2(float * A,
                      float * B,
                      float * C,
                      uint64_t M,
                      uint64_t L,
                      uint64_t N,
                      bool parallel) {

    #pragma omp parallel for collapse(2) if(parallel)
    for (uint64_t i = 0; i < M; i++)
        for (uint64_t j = 0; j < N; j++) {

            __m256 X = _mm256_setzero_ps();
            __m256 Y = _mm256_setzero_ps();
            for (uint64_t k = 0; k < L; k += 16) {
                const __m256 AVX = _mm256_load_ps(A+i*L+k+0);
                const __m256 BVX = _mm256_load_ps(B+j*L+k+0);
                const __m256 AVY = _mm256_load_ps(A+i*L+k+8);
                const __m256 BVY = _mm256_load_ps(B+j*L+k+8);
                X = _mm256_add_ps(X, _mm256_mul_ps(AVX, BVX));
                Y = _mm256_add_ps(Y, _mm256_mul_ps(AVY, BVY));
            }

            C[i*N+j] = hsum_avx(X)+hsum_avx(Y);
        }
}

int main () {

    const uint64_t M = 1UL << 10;
    const uint64_t L = 1UL << 11;
    const uint64_t N = 1UL << 12;

    TIMERSTART(alloc_memory)
    auto A = static_cast<float*>(_mm_malloc(M*L*sizeof(float) , 32));
    auto B = static_cast<float*>(_mm_malloc(N*L*sizeof(float) , 32));
    auto C = static_cast<float*>(_mm_malloc(M*N*sizeof(float) , 32));
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    init(A, M*L);
    init(B, N*L);
    TIMERSTOP(init)

    TIMERSTART(plain_dmm_single)
    plain_dmm(A, B, C, M, L, N, false);
    TIMERSTOP(plain_dmm_single)

    TIMERSTART(plain_dmm_multi)
    plain_dmm(A, B, C, M, L, N, true);
    TIMERSTOP(plain_dmm_multi)

    TIMERSTART(avx_dmm_single)
    avx_dmm(A, B, C, M, L, N, false);
    TIMERSTOP(avx_dmm_single)

    TIMERSTART(avx_dmm_multi)
    avx_dmm(A, B, C, M, L, N, true);
    TIMERSTOP(avx_dmm_multi)

    TIMERSTART(avx_dmm_unroll_2_single)
    avx_dmm_unroll_2(A, B, C, M, L, N, false);
    TIMERSTOP(avx_dmm_unroll_2_single)

    TIMERSTART(avx_dmm_unroll_2_multi)
    avx_dmm_unroll_2(A, B, C, M, L, N, true);
    TIMERSTOP(avx_dmm_unroll_2_multi)

    TIMERSTART(free_memory)
    _mm_free(A);
    _mm_free(B);
    _mm_free(C);
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/pointwise_vector_max.cpp:
--------------------------------------------------------------------------------
#include <random>      // prng
#include <cstdint>     // uint32_t
#include <iostream>    // std::cout
#include <immintrin.h> // AVX intrinsics

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void init(float * data, uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1L<<28, 1L<<28);

    for (uint64_t i = 0; i < length; i++)
        data[i] = density(engine);
}

void plain_pointwise_max(float * x,
                         float * y,
                         float * z, uint64_t length) {

    for (uint64_t i = 0; i < length; i++)
        z[i] = std::max(x[i], y[i]);
}

void avx_pointwise_max(float * x,
                       float * y,
                       float * z, uint64_t length) {


    for (uint64_t i = 0; i < length; i += 8) {
        __m256 X = _mm256_load_ps(x+i);
        __m256 Y = _mm256_load_ps(y+i);
        _mm256_store_ps(z+i, _mm256_max_ps(X, Y));
    }
}


int main () {

    const uint64_t num_entries = 1UL << 28;
    const uint64_t num_bytes = num_entries*sizeof(float);

    TIMERSTART(alloc_memory)
    auto x = static_cast<float*>(_mm_malloc(num_bytes , 32));
    auto y = static_cast<float*>(_mm_malloc(num_bytes , 32));
    auto z = static_cast<float*>(_mm_malloc(num_bytes , 32));
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    init(x, num_entries);
    init(y, num_entries);
    TIMERSTOP(init)

    TIMERSTART(plain_pointwise_max)
    plain_pointwise_max(x, y, z, num_entries);
    TIMERSTOP(plain_pointwise_max)

    TIMERSTART(avx_pointwise_max)
    avx_pointwise_max(x, y, z, num_entries);
    TIMERSTOP(avx_pointwise_max)

    TIMERSTART(free_memory)
    _mm_free(x);
    _mm_free(y);
    _mm_free(z);
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/vector_max.cpp:
--------------------------------------------------------------------------------
#include <random>      // prng
#include <cstdint>     // uint32_t
#include <iostream>    // std::cout
#include <immintrin.h> // AVX intrinsics

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void init(float * data, uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1L<<28, 1L<<28);

    for (uint64_t i = 0; i < length; i++)
        data[i] = density(engine);
}

inline float hmax_sse3(__m128 v) {
    __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0
    __m128 maxs = _mm_max_ps(v, shuf);
    shuf = _mm_movehl_ps(shuf, maxs); // high half -> low half
    maxs = _mm_max_ss(maxs, shuf);
    return _mm_cvtss_f32(maxs);
}

inline float hmax_avx(__m256 v) {
    __m128 lo = _mm256_castps256_ps128(v);   // low 128
    __m128 hi = _mm256_extractf128_ps(v, 1); // high 128
    lo = _mm_max_ps(lo, hi);                 // max the low 128
    return hmax_sse3(lo);                    // and inline the sse3 version
}

float avx_max(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    const float e = -INFINITY;
    __m256 X = _mm256_set1_ps(e);

    for (uint64_t i = 0; i < length; i += 8) {
        __m256 DATA = _mm256_load_ps(data+i);
        X = _mm256_max_ps(X, DATA);
    }

    return hmax_avx(X);
}

float avx_max_unroll_2(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    const float e = -INFINITY;
    __m256 X = _mm256_set1_ps(e);
    __m256 Y = _mm256_set1_ps(e);

    for (uint64_t i = 0; i < length; i += 16) {
        __m256 DATA_X = _mm256_load_ps(data+i+0);
        __m256 DATA_Y = _mm256_load_ps(data+i+8);
        X = _mm256_max_ps(X, DATA_X);
        Y = _mm256_max_ps(Y, DATA_Y);
    }

    return std::max(hmax_avx(X), hmax_avx(Y));
}

float plain_max(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    float max = -INFINITY;

    for (uint64_t i = 0; i < length; i++)
        max = std::max(max, data[i]);

    return max;
}

float plain_max_unroll_2(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    float max_0 = -INFINITY;
    float max_1 = -INFINITY;

    for (uint64_t i = 0; i < length; i += 2) {
        max_0 = std::max(max_0, data[i+0]);
        max_1 = std::max(max_1, data[i+1]);
    }

    return std::max(max_0, max_1);
}

float plain_max_unroll_4(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    float max_0 = -INFINITY;
    float max_1 = -INFINITY;
    float max_2 = -INFINITY;
    float max_3 = -INFINITY;

    for (uint64_t i = 0; i < length; i += 4) {
        max_0 = std::max(max_0, data[i+0]);
        max_1 = std::max(max_1, data[i+1]);
        max_2 = std::max(max_2, data[i+2]);
        max_3 = std::max(max_3, data[i+3]);
    }

    return std::max(max_0,
           std::max(max_1,
           std::max(max_2, max_3)));
}

float plain_max_unroll_8(float * data, uint64_t length) {

    // neutral element "e" in monoid (|R, max) is -oo
    float max_0 = -INFINITY;
    float max_1 = -INFINITY;
    float max_2 = -INFINITY;
    float max_3 = -INFINITY;
    float max_4 = -INFINITY;
    float max_5 = -INFINITY;
    float max_6 = -INFINITY;
    float max_7 = -INFINITY;

    for (uint64_t i = 0; i < length; i += 8) {
        max_0 = std::max(max_0, data[i+0]);
        max_1 = std::max(max_1, data[i+1]);
        max_2 = std::max(max_2, data[i+2]);
        max_3 = std::max(max_3, data[i+3]);
        max_4 = std::max(max_4, data[i+4]);
        max_5 = std::max(max_5, data[i+5]);
        max_6 = std::max(max_6, data[i+6]);
        max_7 = std::max(max_7, data[i+7]);
    }

    return std::max(max_0,
           std::max(max_1,
           std::max(max_2,
           std::max(max_3,
           std::max(max_4,
           std::max(max_5,
           std::max(max_6, max_7)))))));
}

int main () {

    const uint64_t num_entries = 1UL << 28;
    const uint64_t num_bytes = num_entries*sizeof(float);

    TIMERSTART(alloc_memory)
    auto data = static_cast<float*>(_mm_malloc(num_bytes , 32));
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    init(data, num_entries);
    TIMERSTOP(init)

    TIMERSTART(plain_max)
    std::cout << plain_max(data, num_entries) << std::endl;
    TIMERSTOP(plain_max)

    TIMERSTART(plain_max_unroll_2)
    std::cout << plain_max_unroll_2(data, num_entries) << std::endl;
    TIMERSTOP(plain_max_unroll_2)

    TIMERSTART(plain_max_unroll_4)
    std::cout << plain_max_unroll_4(data, num_entries) << std::endl;
    TIMERSTOP(plain_max_unroll_4)

    TIMERSTART(plain_max_unroll_8)
    std::cout << plain_max_unroll_8(data, num_entries) << std::endl;
    TIMERSTOP(plain_max_unroll_8)

    TIMERSTART(avx_max)
    std::cout << avx_max(data, num_entries) << std::endl;
    TIMERSTOP(avx_max)

    TIMERSTART(avx_max_unroll_2)
    std::cout << avx_max_unroll_2(data, num_entries) << std::endl;
    TIMERSTOP(avx_max_unroll_2)

    TIMERSTART(free_memory)
    _mm_free(data);
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/vector_norm_aos_avx.cpp:
--------------------------------------------------------------------------------
#include <random>      // prng
#include <cstdint>     // uint32_t
#include <iostream>    // std::cout
#include <immintrin.h> // AVX intrinsics

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void aos_init(float * xyz, uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1, 1);

    for (uint64_t i = 0; i < 3*length; i++)
        xyz[i] = density(engine);
}

void avx_aos_norm(float * xyz, uint64_t length) {

    for (uint64_t i = 0; i < 3*length; i += 3*8) {

        /////////////////////////////////////////////////////////////////////
        // AOS2SOA: XYZXYZXY ZXYZXYZX YZXYZXYZ --> XXXXXXX YYYYYYY ZZZZZZZZ
        /////////////////////////////////////////////////////////////////////

        // registers: NOTE: M is an SSE pointer (length 4)
        __m128 *M = (__m128*) (xyz+i);
        __m256 M03;
        __m256 M14;
        __m256 M25;

        // load lower halves
        M03 = _mm256_castps128_ps256(M[0]);
        M14 = _mm256_castps128_ps256(M[1]);
        M25 = _mm256_castps128_ps256(M[2]);

        // load upper halves
        M03 = _mm256_insertf128_ps(M03 ,M[3],1);
        M14 = _mm256_insertf128_ps(M14 ,M[4],1);
        M25 = _mm256_insertf128_ps(M25 ,M[5],1);

        // everyday I am shuffling...
        __m256 XY = _mm256_shuffle_ps(M14, M25, _MM_SHUFFLE( 2,1,3,2));
        __m256 YZ = _mm256_shuffle_ps(M03, M14, _MM_SHUFFLE( 1,0,2,1));
        __m256 X  = _mm256_shuffle_ps(M03, XY , _MM_SHUFFLE( 2,0,3,0));
        __m256 Y  = _mm256_shuffle_ps(YZ , XY , _MM_SHUFFLE( 3,1,2,0));
        __m256 Z  = _mm256_shuffle_ps(YZ , M25, _MM_SHUFFLE( 3,0,3,1));

        /////////////////////////////////////////////////////////////////////
        // SOA computation
        /////////////////////////////////////////////////////////////////////

        // R <- X*X+Y*Y+Z*Z
        __m256 R = _mm256_add_ps(_mm256_mul_ps(X, X),
                   _mm256_add_ps(_mm256_mul_ps(Y, Y),
                                 _mm256_mul_ps(Z, Z)));
        // R <- 1/sqrt(R)
        R = _mm256_rsqrt_ps(R);

        // normalize vectors
        X = _mm256_mul_ps(X, R);
        Y = _mm256_mul_ps(Y, R);
        Z = _mm256_mul_ps(Z, R);

        /////////////////////////////////////////////////////////////////////
        // SOA2AOS: XXXXXXX YYYYYYY ZZZZZZZZ -> XYZXYZXY ZXYZXYZX YZXYZXYZ
        /////////////////////////////////////////////////////////////////////

        // everyday I am shuffling...
        __m256 RXY = _mm256_shuffle_ps(X,Y, _MM_SHUFFLE(2,0,2,0));
        __m256 RYZ = _mm256_shuffle_ps(Y,Z, _MM_SHUFFLE(3,1,3,1));
        __m256 RZX = _mm256_shuffle_ps(Z,X, _MM_SHUFFLE(3,1,2,0));
        __m256 R03 = _mm256_shuffle_ps(RXY, RZX, _MM_SHUFFLE(2,0,2,0));
        __m256 R14 = _mm256_shuffle_ps(RYZ, RXY, _MM_SHUFFLE(3,1,2,0));
        __m256 R25 = _mm256_shuffle_ps(RZX, RYZ, _MM_SHUFFLE(3,1,3,1));

        // store in AOS (6*4=24)
        M[0] = _mm256_castps256_ps128(R03);
        M[1] = _mm256_castps256_ps128(R14);
        M[2] = _mm256_castps256_ps128(R25);
        M[3] = _mm256_extractf128_ps(R03, 1);
        M[4] = _mm256_extractf128_ps(R14, 1);
        M[5] = _mm256_extractf128_ps(R25, 1);
    }
}

void aos_check(float * xyz, uint64_t length) {

    for (uint64_t i = 0; i < 3*length; i += 3) {

        const float x = xyz[i+0];
        const float y = xyz[i+1];
        const float z = xyz[i+2];

        float rho = x*x+y*y+z*z;

        if ((rho-1)*(rho-1) > 1E-6)
            std::cout << "error too big at position "
                      << i << std::endl;
    }
}

int main () {

    const uint64_t num_vectors = 1UL << 28;
    const uint64_t num_bytes = 3*num_vectors*sizeof(float);

    TIMERSTART(alloc_memory)
    auto xyz = static_cast<float*>(_mm_malloc(num_bytes , 32));
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    aos_init(xyz, num_vectors);
    TIMERSTOP(init)

    TIMERSTART(avx_aos_normalize)
    avx_aos_norm(xyz, num_vectors);
    TIMERSTOP(avx_aos_normalize)

    TIMERSTART(check)
    aos_check(xyz, num_vectors);
    TIMERSTOP(check)

    TIMERSTART(free_memory)
    _mm_free(xyz);
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/vector_norm_aos_plain.cpp:
--------------------------------------------------------------------------------
#include <cstdint>  // uint32_t
#include <iostream> // std::cout
#include <random>   // prng
#include <cmath>    // std::sqrt (added; needed below)

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void aos_init(float * xyz, uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1, 1);

    for (uint64_t i = 0; i < 3*length; i++)
        xyz[i] = density(engine);
}

void plain_aos_norm(float * xyz, uint64_t length) {

    for (uint64_t i = 0; i < 3*length; i += 3) {
        const float x = xyz[i+0];
        const float y = xyz[i+1];
        const float z = xyz[i+2];

        float irho = 1.0f/std::sqrt(x*x+y*y+z*z);

        xyz[i+0] *= irho;
        xyz[i+1] *= irho;
        xyz[i+2] *= irho;
    }
}

void aos_check(float * xyz, uint64_t length) {

    for (uint64_t i = 0; i < 3*length; i += 3) {

        const float x = xyz[i+0];
        const float y = xyz[i+1];
        const float z = xyz[i+2];

        float rho = x*x+y*y+z*z;

        if ((rho-1)*(rho-1) > 1E-6)
            std::cout << "error too big at position "
                      << i << std::endl;
    }
}

int main () {

    const uint64_t num_vectors = 1UL << 28;

    TIMERSTART(alloc_memory)
    auto xyz = new float[3*num_vectors];
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    aos_init(xyz, num_vectors);
    TIMERSTOP(init)

    TIMERSTART(plain_aos_normalize)
    plain_aos_norm(xyz, num_vectors);
    TIMERSTOP(plain_aos_normalize)

    TIMERSTART(check)
    aos_check(xyz, num_vectors);
    TIMERSTOP(check)

    TIMERSTART(free_memory)
    delete [] xyz;
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/vector_norm_soa_avx.cpp:
--------------------------------------------------------------------------------
#include <random>      // prng
#include <cstdint>     // uint32_t
#include <iostream>    // std::cout
#include <immintrin.h> // AVX intrinsics

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void soa_init(float * x,
              float * y,
              float * z,
              uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1, 1);

    for (uint64_t i = 0; i < length; i++) {
        x[i] = density(engine);
        y[i] = density(engine);
        z[i] = density(engine);
    }

}

void avx_soa_norm(float * x,
                  float * y,
                  float * z,
                  uint64_t length) {

    for (uint64_t i = 0; i < length; i += 8) {

        // aligned loads
        __m256 X = _mm256_load_ps(x+i);
        __m256 Y = _mm256_load_ps(y+i);
        __m256 Z = _mm256_load_ps(z+i);

        // R <- X*X+Y*Y+Z*Z
        __m256 R = _mm256_add_ps(_mm256_mul_ps(X, X),
                   _mm256_add_ps(_mm256_mul_ps(Y, Y),
                                 _mm256_mul_ps(Z, Z)));
        // R <- 1/sqrt(R)
        R = _mm256_rsqrt_ps(R);

        // aligned stores
        _mm256_store_ps(x+i, _mm256_mul_ps(X, R));
        _mm256_store_ps(y+i, _mm256_mul_ps(Y, R));
        _mm256_store_ps(z+i, _mm256_mul_ps(Z, R));
    }
}

void soa_check(float * x,
               float * y,
               float * z,
               uint64_t length) {

    for (uint64_t i = 0; i < length; i++) {
        float rho = x[i]*x[i]+y[i]*y[i]+z[i]*z[i];
        if ((rho-1)*(rho-1) > 1E-6)
            std::cout << "error too big at position "
                      << i << std::endl;
    }
}

int main () {

    const uint64_t num_vectors = 1UL << 28;
    const uint64_t num_bytes = num_vectors*sizeof(float);

    TIMERSTART(alloc_memory)
    auto x = static_cast<float*>(_mm_malloc(num_bytes , 32));
    auto y = static_cast<float*>(_mm_malloc(num_bytes , 32));
    auto z = static_cast<float*>(_mm_malloc(num_bytes , 32));
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    soa_init(x, y, z, num_vectors);
    TIMERSTOP(init)

    TIMERSTART(avx_soa_normalize)
    avx_soa_norm(x, y, z, num_vectors);
    TIMERSTOP(avx_soa_normalize)

    TIMERSTART(check)
    soa_check(x, y, z, num_vectors);
    TIMERSTOP(check)

    TIMERSTART(free_memory)
    _mm_free(x);
    _mm_free(y);
    _mm_free(z);
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/AVX/vector_norm_soa_plain.cpp:
--------------------------------------------------------------------------------
#include <cstdint>  // uint32_t
#include <iostream> // std::cout
#include <random>   // prng
#include <cmath>    // std::sqrt (added; needed below)

// timers distributed with this book
#include "../include/hpc_helpers.hpp"

void soa_init(float * x,
              float * y,
              float * z,
              uint64_t length) {

    std::mt19937 engine(42);
    std::uniform_real_distribution<float> density(-1, 1);

    for (uint64_t i = 0; i < length; i++) {
        x[i] = density(engine);
        y[i] = density(engine);
        z[i] = density(engine);
    }

}

void plain_soa_norm(float * x,
                    float * y,
                    float * z,
                    uint64_t length) {

    for (uint64_t i = 0; i < length; i++) {
        float irho = 1.0f/std::sqrt(x[i]*x[i]+
                                    y[i]*y[i]+
                                    z[i]*z[i]);
        x[i] *= irho;
        y[i] *= irho;
        z[i] *= irho;
    }
}

void soa_check(float * x,
               float * y,
               float * z,
               uint64_t length) {

    for (uint64_t i = 0; i < length; i++) {
        float rho = x[i]*x[i]+y[i]*y[i]+z[i]*z[i];
        if ((rho-1)*(rho-1) > 1E-6)
            std::cout << "error too big at position "
                      << i << std::endl;
    }
}

int main () {

    const uint64_t num_vectors = 1UL << 28;

    TIMERSTART(alloc_memory)
    auto x = new float[num_vectors];
    auto y = new float[num_vectors];
    auto z = new float[num_vectors];
    TIMERSTOP(alloc_memory)

    TIMERSTART(init)
    soa_init(x, y, z, num_vectors);
    TIMERSTOP(init)

    TIMERSTART(plain_soa_normalize)
    plain_soa_norm(x, y, z, num_vectors);
    TIMERSTOP(plain_soa_normalize)

    TIMERSTART(check)
    soa_check(x, y, z, num_vectors);
    TIMERSTOP(check)

    TIMERSTART(free_memory)
    delete [] x;
    delete [] y;
    delete [] z;
    TIMERSTOP(free_memory)
}
--------------------------------------------------------------------------------
/chapter3/include:
--------------------------------------------------------------------------------
../include/
--------------------------------------------------------------------------------
/chapter3/matrix_matrix_mult_transposed/Makefile:
--------------------------------------------------------------------------------
CXX=g++
CXXFLAGS=-O2 -std=c++14 -Wall

all: matrix_mult_seq matrix_mult_omp

matrix_mult_seq: matrix_mult.cpp
	$(CXX) $(CXXFLAGS) matrix_mult.cpp -o matrix_mult_seq

matrix_mult_omp: matrix_mult.cpp
	$(CXX) $(CXXFLAGS) matrix_mult.cpp -fopenmp -o matrix_mult_omp

clean:
	rm -f matrix_mult_seq
	rm -f matrix_mult_omp
--------------------------------------------------------------------------------
/chapter3/matrix_matrix_mult_transposed/matrix_mult.cpp:
--------------------------------------------------------------------------------
#include <cstdint>
#include <vector>
#include <iostream>
#include "../include/hpc_helpers.hpp"

int main () {

    // matrix shapes
    const uint64_t m = 1 << 15;
    const uint64_t n = 1 << 15;
    const uint64_t l = 1 << 5;

    TIMERSTART(init)
    // sum_k A_ik * B_kj = sum_k A_ik * B^t_jk = C_ij
    std::vector<float> A (m*l, 0); // m x l
    std::vector<float> B (l*n, 0); // l x n
    std::vector<float> Bt(n*l, 0); // n x l
    std::vector<float> C (m*n, 0); // m x n
    TIMERSTOP(init)

    TIMERSTART(transpose_and_mult)
    TIMERSTART(transpose)
    #pragma omp parallel for collapse(2)
    for (uint64_t k = 0; k < l; k++)
        for (uint64_t j = 0; j < n; j++)
            Bt[j*l+k] = B[k*n+j];
    TIMERSTOP(transpose)

    TIMERSTART(transpose_mult)
    #pragma omp parallel for collapse(2)
    for (uint64_t i = 0; i < m; i++)
        for (uint64_t j = 0; j < n; j++) {
            float accum = 0;
            for (uint64_t k = 0; k < l; k++)
                accum += A[i*l+k]*Bt[j*l+k];
            C[i*n+j] = accum;
        }

    TIMERSTOP(transpose_mult)
    TIMERSTOP(transpose_and_mult)

    TIMERSTART(naive_mult)
    #pragma omp parallel for collapse(2)
    for (uint64_t i = 0; i < m; i++)
        for (uint64_t j = 0; j < n; j++) {
            float accum = 0;
            for (uint64_t k = 0; k < l; k++)
                accum += A[i*l+k]*B[k*n+j];
            C[i*n+j] = accum;
        }

    TIMERSTOP(naive_mult)
}
--------------------------------------------------------------------------------
/chapter4/all_pairs_distance_matrix/Makefile:
--------------------------------------------------------------------------------
CXX= g++
CXXFLAGS= -std=c++14 -O2 -pthread

all: all_pair

all_pair: all_pair.cpp
	$(CXX) all_pair.cpp $(CXXFLAGS) -o all_pair

clean:
	rm -rf all_pair
--------------------------------------------------------------------------------
/chapter4/all_pairs_distance_matrix/data/mnist_exporter.py:
--------------------------------------------------------------------------------
#####################################################################
# run __ONE__ of the following commands:
#
# pip install --user tensorflow (if you have no CUDA-enabled GPU)
# pip install --user tensorflow-gpu
#
# Numpy should come bundled with tensorflow. Run this file et voila!
#####################################################################

import array as ar
import numpy as np

# everyone has tensorflow installed nowadays :D
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=False)

merge = np.vstack(( mnist.train.images, mnist.test.images))

with open("mnist_all_65000_28_28_32.bin", "wb") as f:
    f.write(ar.array("f", merge.flatten()))
--------------------------------------------------------------------------------
/chapter4/condition_variables/Makefile:
--------------------------------------------------------------------------------
CXX= g++
CXXFLAGS= -std=c++14 -O2 -pthread

all: alarm_clock ping_pong one_shot_alarm_clock

alarm_clock: alarm_clock.cpp
	$(CXX) alarm_clock.cpp $(CXXFLAGS) -o alarm_clock

one_shot_alarm_clock: one_shot_alarm_clock.cpp
	$(CXX) one_shot_alarm_clock.cpp $(CXXFLAGS) -o one_shot_alarm_clock

ping_pong: ping_pong.cpp
	$(CXX) ping_pong.cpp $(CXXFLAGS) -o ping_pong

clean:
	rm -rf alarm_clock
	rm -rf one_shot_alarm_clock
	rm -rf ping_pong
--------------------------------------------------------------------------------
/chapter4/condition_variables/alarm_clock.cpp:
--------------------------------------------------------------------------------
#include <iostream>           // std::cout
#include <thread>             // std::thread
#include <mutex>              // std::mutex
#include <chrono>             // std::this_thread::sleep_for
#include <condition_variable> // std::condition_variable

// convenient time formats (C++14 required)
using namespace std::chrono_literals;

int main() {

    std::mutex mutex;
    std::condition_variable cv;
    bool time_for_breakfast = false; // globally shared state

    // to be called by thread
    auto student = [&] ( ) -> void {

        { // this is the scope of the lock
            std::unique_lock<std::mutex> unique_lock(mutex);

            // check the globally shared state;
            // the lock is released during wait
            while (!time_for_breakfast)
                cv.wait(unique_lock);

            // alternatively, you can specify the
            // predicate directly using a closure
            // cv.wait(unique_lock,
            //         [&](){ return time_for_breakfast; });
        } // lock is finally released

        std::cout << "Time to make coffee!"
<< std::endl; 35 | }; 36 | 37 | // create the waiting thread and wait for 2s 38 | std::thread my_thread(student); 39 | std::this_thread::sleep_for(2s); 40 | 41 | { // prepare the alarm clock 42 | std::lock_guard<std::mutex> lock_guard(mutex); 43 | time_for_breakfast = true; 44 | } // here the lock is released 45 | 46 | // ring the alarm clock 47 | cv.notify_one(); 48 | 49 | // wait until breakfast is finished 50 | my_thread.join(); 51 | } 52 | -------------------------------------------------------------------------------- /chapter4/condition_variables/one_shot_alarm_clock.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> // std::cout 2 | #include <thread> // std::thread 3 | #include <future> // std::future 4 | #include <chrono> // std::this_thread::sleep_for 5 | 6 | // convenient time formats (C++14 required) 7 | using namespace std::chrono_literals; 8 | 9 | int main() { 10 | 11 | // create pair (future, promise) 12 | std::promise<void> promise; 13 | auto shared_future = promise.get_future().share(); 14 | 15 | // to be called by thread 16 | auto students = [&] ( ) -> void { 17 | 18 | // blocks until fulfilling promise 19 | shared_future.get(); 20 | std::cout << "Time to make coffee!" << std::endl; 21 | }; 22 | 23 | // create the waiting thread and wait for 2s 24 | std::thread my_thread0(students); 25 | std::thread my_thread1(students); 26 | std::this_thread::sleep_for(2s); 27 | promise.set_value(); 28 | 29 | // wait until breakfast is finished 30 | my_thread0.join(); 31 | my_thread1.join(); 32 | } 33 | -------------------------------------------------------------------------------- /chapter4/condition_variables/ping_pong.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> // std::cout 2 | #include <thread> // std::thread 3 | #include <mutex> // std::mutex 4 | #include <chrono> // std::this_thread::sleep_for 5 | #include <condition_variable> // std::condition_variable 6 | 7 | // convenient time formats (C++14 required) 8 | using namespace std::chrono_literals; 9 | 10 | int main() { 11 | 12 | std::mutex mutex; 13 | std::condition_variable cv; 14 | bool is_ping = true; // globally shared state 15 | 16 | auto ping = [&] ( ) -> void { 17 | while (true) { 18 | 19 | // wait to be signalled 20 | std::unique_lock<std::mutex> unique_lock(mutex); 21 | cv.wait(unique_lock,[&](){return is_ping;}); 22 | 23 | // print "ping" to the command line 24 | std::this_thread::sleep_for(1s); 25 | std::cout << "ping" << std::endl; 26 | 27 | // alter state and notify other thread 28 | is_ping = !is_ping; 29 | cv.notify_one(); 30 | } 31 | }; 32 | 33 | auto pong = [&] ( ) -> void { 34 | while (true) { 35 | // wait to be signalled 36 | std::unique_lock<std::mutex> unique_lock(mutex); 37 | cv.wait(unique_lock,[&](){return !is_ping;}); 38 | 39 | // print "pong" to the command line 40 | std::this_thread::sleep_for(1s); 41 | std::cout << "pong" << std::endl; 42 | 43 | // alter state and notify other thread 44 | is_ping = !is_ping; 45 | cv.notify_one(); 46 | } 47 | }; 48 | 49 | std::thread ping_thread(ping); 50 | std::thread pong_thread(pong); 51 | ping_thread.join(); 52 | pong_thread.join(); 53 | } 54 | -------------------------------------------------------------------------------- /chapter4/false_sharing/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: false_sharing 5 | 6 | false_sharing: false_sharing.cpp 7 | $(CXX) false_sharing.cpp $(CXXFLAGS) -o false_sharing 8 | 9 | clean: 10 | rm -rf false_sharing 11 |
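The false_sharing.cpp file that follows puts two counters in one struct, so they share a cache line; each increment by one thread invalidates that line in the other core's cache. A common remedy, sketched here under the assumption of 64-byte cache lines (this padded variant is not part of the book's sources), is to align each member to its own line:

#include <cstdint>

// each counter occupies its own 64-byte cache line, so the two
// threads no longer invalidate each other's cached copies
struct padded_pack_t {
    alignas(64) uint64_t ying;
    alignas(64) uint64_t yang;

    padded_pack_t() : ying(0), yang(0) {}
};

static_assert(sizeof(padded_pack_t) == 128,
              "one cache line per member");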
-------------------------------------------------------------------------------- /chapter4/false_sharing/false_sharing.cpp: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | #include <thread> 4 | 5 | struct pack_t { 6 | uint64_t ying; 7 | uint64_t yang; 8 | 9 | pack_t() : ying(0), yang(0) {} 10 | }; 11 | 12 | void sequential_increment( 13 | volatile pack_t& pack) { 14 | 15 | for (uint64_t index = 0; index < 1UL << 30; index++) { 16 | pack.ying++; 17 | pack.yang++; 18 | } 19 | } 20 | 21 | 22 | void false_sharing_increment( 23 | volatile pack_t& pack) { 24 | 25 | auto eval_ying = [&pack] () -> void { 26 | for (uint64_t index = 0; index < 1UL << 30; index++) 27 | pack.ying++; 28 | }; 29 | 30 | auto eval_yang = [&pack] () -> void { 31 | for (uint64_t index = 0; index < 1UL << 30; index++) 32 | pack.yang++; 33 | }; 34 | 35 | std::thread ying_thread(eval_ying); 36 | std::thread yang_thread(eval_yang); 37 | ying_thread.join(); 38 | yang_thread.join(); 39 | } 40 | 41 | int main(int argc, char* argv[]) { 42 | 43 | pack_t seq_pack; 44 | 45 | TIMERSTART(sequential_increment) 46 | sequential_increment(seq_pack); 47 | TIMERSTOP(sequential_increment) 48 | 49 | std::cout << seq_pack.ying << " " << seq_pack.yang << std::endl; 50 | 51 | pack_t par_pack; 52 | 53 | TIMERSTART(false_sharing_increment) 54 | false_sharing_increment(par_pack); 55 | TIMERSTOP(false_sharing_increment) 56 | 57 | std::cout << par_pack.ying << " " << par_pack.yang << std::endl; 58 | } 59 | -------------------------------------------------------------------------------- /chapter4/hello_world/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: hello_world 5 | 6 | hello_world: hello_world.cpp 7 | $(CXX) hello_world.cpp $(CXXFLAGS) -o hello_world 8 | 9 | clean: 10 | rm -rf hello_world 11 | -------------------------------------------------------------------------------- /chapter4/hello_world/hello_world.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | 6 | template <typename index_t> 7 | void say_hello_template(index_t id) { 8 | std::cout << "Hello from thread: " << id << std::endl; 9 | } 10 | 11 | void say_hello(uint64_t id) { 12 | std::cout << "Hello from thread: " << id << std::endl; 13 | } 14 | 15 | int main(int argc, char * argv[]) { 16 | 17 | const uint64_t num_threads = 4; 18 | 19 | std::vector<std::thread> threads; 20 | 21 | for (uint64_t id = 0; id < num_threads; id++) 22 | threads.emplace_back( 23 | std::thread( 24 | say_hello, id 25 | ) 26 | ); 27 | 28 | for (auto& thread: threads) 29 | thread.join(); 30 | } 31 | -------------------------------------------------------------------------------- /chapter4/include: -------------------------------------------------------------------------------- 1 | ../include/ -------------------------------------------------------------------------------- /chapter4/matrix_vector_mult/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: matrix_vector 5 | 6 | matrix_vector: matrix_vector.cpp 7 | $(CXX) matrix_vector.cpp $(CXXFLAGS) -o matrix_vector 8 | 9 | clean: 10 | rm -rf matrix_vector 11 |
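The block decomposition in the following matrix_vector.cpp computes its chunk size with the SDIV macro from hpc_helpers.hpp, a ceiling integer division that guarantees num_threads chunks cover all m rows. Assuming a definition essentially like this (the header's exact spelling may differ):

// ceiling division: smallest integer >= x/y for positive integers
#define SDIV(x,y) (((x)+(y)-1)/(y))

// example: SDIV(10,4) == 3, so 4 threads cover 10 rows
// in chunks of 3, the last thread taking the single leftover row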
-------------------------------------------------------------------------------- /chapter4/matrix_vector_mult/matrix_vector.cpp: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | #include <iostream> 4 | #include <cstdint> 5 | #include <vector> 6 | #include <thread> 7 | 8 | template < 9 | typename value_t, 10 | typename index_t> 11 | void init( 12 | std::vector<value_t>& A, 13 | std::vector<value_t>& x, 14 | index_t m, 15 | index_t n) { 16 | 17 | for (index_t row = 0; row < m; row++) 18 | for (index_t col = 0; col < n; col++) 19 | A[row*n+col] = row >= col ? 1 : 0; 20 | 21 | for (index_t col = 0; col < m; col++) 22 | x[col] = col; 23 | } 24 | 25 | template < 26 | typename value_t, 27 | typename index_t> 28 | void sequential_mult( 29 | std::vector<value_t>& A, 30 | std::vector<value_t>& x, 31 | std::vector<value_t>& b, 32 | index_t m, 33 | index_t n) { 34 | 35 | for (index_t row = 0; row < m; row++) { 36 | value_t accum = value_t(0); 37 | for (index_t col = 0; col < n; col++) 38 | accum += A[row*n+col]*x[col]; 39 | b[row] = accum; 40 | } 41 | } 42 | 43 | template < 44 | typename value_t, 45 | typename index_t> 46 | void cyclic_parallel_mult( 47 | std::vector<value_t>& A, // linear memory for A 48 | std::vector<value_t>& x, // to be mapped vector 49 | std::vector<value_t>& b, // result vector 50 | index_t m, // number of rows 51 | index_t n, // number of cols 52 | index_t num_threads=8) { // number of threads p 53 | 54 | // this function is called by the threads 55 | auto cyclic = [&] (const index_t& id) -> void { 56 | 57 | // indices are incremented with a stride of p 58 | for (index_t row = id; row < m; row += num_threads) { 59 | value_t accum = value_t(0); 60 | for (index_t col = 0; col < n; col++) 61 | accum += A[row*n+col]*x[col]; 62 | b[row] = accum; 63 | } 64 | }; 65 | 66 | // business as usual 67 | std::vector<std::thread> threads; 68 | 69 | for (index_t id = 0; id < num_threads; id++) 70 | threads.emplace_back(cyclic, id); 71 | 72 | for (auto& thread : threads) 73 | thread.join(); 74 | } 75 | 76 | template < 77 | typename value_t, 78 | typename index_t> 79 | void block_parallel_mult( 80 | std::vector<value_t>& A, 81 | std::vector<value_t>& x, 82 | std::vector<value_t>& b, 83 | index_t m, 84 | index_t n, 85 | index_t num_threads=32) { 86 | 87 | // this function is called by the threads 88 | auto block = [&] (const index_t& id) -> void { 89 | // ^-- capture whole scope by reference 90 | 91 | // compute chunk size, lower and upper task id 92 | const index_t chunk = SDIV(m, num_threads); 93 | const index_t lower = id*chunk; 94 | const index_t upper = std::min(lower+chunk, m); 95 | 96 | // only computes rows between lower and upper 97 | for (index_t row = lower; row < upper; row++) { 98 | value_t accum = value_t(0); 99 | for (index_t col = 0; col < n; col++) 100 | accum += A[row*n+col]*x[col]; 101 | b[row] = accum; 102 | } 103 | }; 104 | 105 | // business as usual 106 | std::vector<std::thread> threads; 107 | 108 | for (index_t id = 0; id < num_threads; id++) 109 | threads.emplace_back(block, id); 110 | 111 | for (auto& thread : threads) 112 | thread.join(); 113 | } 114 | 115 | 116 | template < 117 | typename value_t, 118 | typename index_t> 119 | void block_cyclic_parallel_mult( 120 | std::vector<value_t>& A, 121 | std::vector<value_t>& x, 122 | std::vector<value_t>& b, 123 | index_t m, 124 | index_t n, 125 | index_t num_threads=8, 126 | index_t chunk_size=64/sizeof(value_t)) { 127 | 128 | 129 | // this function is called by the threads 130 | auto block_cyclic = [&] (const index_t& id) -> void { 131 | 132 | // precompute the stride 133 | const index_t stride = num_threads*chunk_size; 134 | const index_t offset = id*chunk_size; 135 | 136 | // for each block of size chunk_size in cyclic order 137 | for (index_t lower =
offset; lower < m; lower += stride) { 138 | 139 | // compute the upper border of the block 140 | const index_t upper = std::min(lower+chunk_size, m); 141 | 142 | // for each row in the block 143 | for (index_t row = lower; row < upper; row++) { 144 | 145 | // accumulate the contributions 146 | value_t accum = value_t(0); 147 | for (index_t col = 0; col < n; col++) 148 | accum += A[row*n+col]*x[col]; 149 | b[row] = accum; 150 | } 151 | } 152 | }; 153 | 154 | // business as usual 155 | std::vector<std::thread> threads; 156 | 157 | for (index_t id = 0; id < num_threads; id++) 158 | threads.emplace_back(block_cyclic, id); 159 | 160 | for (auto& thread : threads) 161 | thread.join(); 162 | } 163 | 164 | 165 | 166 | int main(int argc, char* argv[]) { 167 | 168 | const uint64_t n = 1UL << 15; 169 | const uint64_t m = 1UL << 15; 170 | 171 | TIMERSTART(overall) 172 | TIMERSTART(alloc) 173 | std::vector<no_init_t<uint64_t>> A(m*n); 174 | std::vector<no_init_t<uint64_t>> x(n); 175 | std::vector<no_init_t<uint64_t>> b(m); 176 | TIMERSTOP(alloc) 177 | 178 | TIMERSTART(init) 179 | init(A, x, m, n); 180 | TIMERSTOP(init) 181 | 182 | TIMERSTART(mult) 183 | block_cyclic_parallel_mult(A, x, b, m, n); 184 | TIMERSTOP(mult) 185 | 186 | TIMERSTOP(overall) 187 | 188 | //for (const auto& entry: b) 189 | // std::cout << entry << std::endl; 190 | 191 | for (uint64_t index = 0; index < m; index++) 192 | if (b[index] != index*(index+1)/2) 193 | std::cout << "error at position " << index << " " 194 | << b[index] << std::endl; 195 | 196 | } 197 | -------------------------------------------------------------------------------- /chapter4/return_values/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: traditional promise_future packaged_task async 5 | 6 | traditional: traditional.cpp 7 | $(CXX) traditional.cpp $(CXXFLAGS) -o traditional 8 | 9 | promise_future: promise_future.cpp 10 | $(CXX) promise_future.cpp $(CXXFLAGS) -o promise_future 11 | 12 | packaged_task: packaged_task.cpp 13 | $(CXX) packaged_task.cpp $(CXXFLAGS) -o packaged_task 14 | 15 | async: async.cpp 16 | $(CXX) async.cpp $(CXXFLAGS) -o async 17 | 18 | clean: 19 | rm -rf traditional 20 | rm -rf promise_future 21 | rm -rf packaged_task 22 | rm -rf async 23 | -------------------------------------------------------------------------------- /chapter4/return_values/async.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <future> 5 | 6 | uint64_t fibo(uint64_t n) { 7 | 8 | uint64_t a_0 = 0; 9 | uint64_t a_1 = 1; 10 | 11 | for (uint64_t index = 0; index < n; index++) { 12 | const uint64_t tmp = a_0; a_0 = a_1; a_1 += tmp; 13 | } 14 | 15 | return a_0; 16 | } 17 | 18 | int main(int argc, char * argv[]) { 19 | 20 | const uint64_t num_threads = 32; 21 | std::vector<std::future<uint64_t>> results; 22 | 23 | for (uint64_t id = 0; id < num_threads; id++) 24 | results.emplace_back( 25 | std::async( 26 | std::launch::async, fibo, id 27 | ) 28 | ); 29 | 30 | for (auto& result: results) 31 | std::cout << result.get() << std::endl; 32 | } 33 | -------------------------------------------------------------------------------- /chapter4/return_values/packaged_task.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <future> 6 | 7 | template < 8 | typename Func, // <-- type of function func 9 | typename ... Args, // <-- type of arguments arg0,arg1,...
10 | typename Rtrn=typename std::result_of<Func(Args...)>::type> 11 | auto create_task( // ^-- type of return value func(args) 12 | Func && func, 13 | Args && ...args) -> std::packaged_task<Rtrn(void)> { 14 | 15 | // basically build an auxiliary function aux(void) 16 | // without arguments returning func(arg0,arg1,...) 17 | auto aux = std::bind(std::forward<Func>(func), 18 | std::forward<Args>(args)...); 19 | 20 | 21 | // create a task wrapping the auxiliary function: 22 | // task() executes aux(void) := func(arg0,arg1,...) 23 | auto task = std::packaged_task<Rtrn(void)>(aux); 24 | 25 | // the return value of aux(void) is assigned to a 26 | // future object accessible via task.get_future() 27 | return task; 28 | } 29 | 30 | uint64_t fibo(uint64_t n) { 31 | 32 | uint64_t a_0 = 0; 33 | uint64_t a_1 = 1; 34 | 35 | for (uint64_t index = 0; index < n; index++) { 36 | const uint64_t tmp = a_0; a_0 = a_1; a_1 += tmp; 37 | } 38 | 39 | return a_0; 40 | } 41 | 42 | int main(int argc, char * argv[]) { 43 | 44 | const uint64_t num_threads = 32; 45 | 46 | std::vector<std::thread> threads; 47 | std::vector<std::future<uint64_t>> results; 48 | 49 | for (uint64_t id = 0; id < num_threads; id++) { 50 | auto task = create_task(fibo, id); 51 | results.emplace_back(task.get_future()); 52 | threads.emplace_back(std::move(task)); 53 | } 54 | 55 | for (auto& result: results) 56 | std::cout << result.get() << std::endl; 57 | 58 | for (auto& thread: threads) 59 | thread.detach(); 60 | } 61 | -------------------------------------------------------------------------------- /chapter4/return_values/promise_future.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <future> 6 | 7 | template < 8 | typename value_t, 9 | typename index_t> 10 | void fibo( 11 | value_t n, 12 | std::promise<value_t> && result) { 13 | 14 | value_t a_0 = 0; 15 | value_t a_1 = 1; 16 | 17 | for (index_t index = 0; index < n; index++) { 18 | const value_t tmp = a_0; a_0 = a_1; a_1 += tmp; 19 | } 20 | 21 | result.set_value(a_0); 22 | } 23 | 24 | int main(int argc, char * argv[]) { 25 | 26 | const uint64_t num_threads = 32; 27 | 28 | std::vector<std::thread> threads; 29 | std::vector<std::future<uint64_t>> results; 30 | 31 | for (uint64_t id = 0; id < num_threads; id++) { 32 | std::promise<uint64_t> promise; 33 | results.emplace_back(promise.get_future()); 34 | 35 | threads.emplace_back( 36 | std::thread( 37 | fibo<uint64_t, uint64_t>, id, std::move(promise) 38 | ) 39 | ); 40 | } 41 | 42 | 43 | for (auto& result: results) 44 | std::cout << result.get() << std::endl; 45 | 46 | for (auto& thread: threads) 47 | thread.detach(); 48 | 49 | } 50 | --------------------------------------------------------------------------------
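Besides values, a promise can also transport exceptions: if the worker calls set_exception instead of set_value, the corresponding future rethrows when get() is called. A minimal sketch of this pattern (not part of the book's sources):

#include <iostream>
#include <stdexcept>
#include <thread>
#include <future>

int main() {
    std::promise<int> promise;
    auto future = promise.get_future();

    std::thread worker([&promise] {
        try {
            throw std::runtime_error("computation failed");
        } catch (...) {
            // forward the caught exception to the future
            promise.set_exception(std::current_exception());
        }
    });

    try {
        future.get();             // rethrows the worker's exception
    } catch (const std::exception& e) {
        std::cout << e.what() << std::endl;
    }
    worker.join();
}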
/chapter4/return_values/traditional.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | 6 | template < 7 | typename value_t, 8 | typename index_t> 9 | void fibo( 10 | value_t n, 11 | value_t * result) { 12 | 13 | value_t a_0 = 0; 14 | value_t a_1 = 1; 15 | 16 | for (index_t index = 0; index < n; index++) { 17 | const value_t tmp = a_0; a_0 = a_1; a_1 += tmp; 18 | } 19 | 20 | *result = a_0; 21 | } 22 | 23 | int main(int argc, char * argv[]) { 24 | 25 | const uint64_t num_threads = 32; 26 | 27 | std::vector<std::thread> threads; 28 | std::vector<uint64_t> results(num_threads); 29 | 30 | for (uint64_t id = 0; id < num_threads; id++) { 31 | 32 | threads.emplace_back( 33 | std::thread( 34 | fibo<uint64_t, uint64_t>, id, &(results[id]) 35 | ) 36 | ); 37 | } 38 | 39 | for (auto& thread: threads) 40 | thread.join(); 41 | 42 | for (auto& result: results) 43 | std::cout << result << std::endl; 44 | } 45 | -------------------------------------------------------------------------------- /chapter4/thread_pool/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: main_basic main_basic_tree 5 | 6 | main_basic: main_basic.cpp 7 | $(CXX) main_basic.cpp $(CXXFLAGS) -o main_basic 8 | 9 | main_basic_tree: main_basic_tree.cpp 10 | $(CXX) main_basic_tree.cpp $(CXXFLAGS) -o main_basic_tree 11 | 12 | clean: 13 | rm -rf main_basic 14 | rm -rf main_basic_tree 15 | -------------------------------------------------------------------------------- /chapter4/thread_pool/main_basic.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include "threadpool_basic.hpp" 3 | 4 | ThreadPool TP(8); 5 | 6 | int main () { 7 | 8 | auto square = [](const uint64_t x) { 9 | return x*x; 10 | }; 11 | 12 | const uint64_t num_tasks = 32; 13 | std::vector<std::future<uint64_t>> futures; 14 | 15 | for (uint64_t task = 0; task < num_tasks; task++) { 16 | auto future = TP.enqueue(square, task); 17 | futures.emplace_back(std::move(future)); 18 | } 19 | 20 | for (auto& future : futures) 21 | std::cout << future.get() << std::endl; 22 | } 23 | -------------------------------------------------------------------------------- /chapter4/thread_pool/main_basic_tree.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include "threadpool_basic.hpp" 3 | 4 | ThreadPool TP(8); 5 | 6 | int main () { 7 | 8 | auto square = [](const uint64_t x) { 9 | return x*x; 10 | }; 11 | 12 | const uint64_t num_nodes = 32; 13 | std::vector<std::future<uint64_t>> futures; 14 | 15 | typedef std::function<void(uint64_t)> traverse_t; 16 | traverse_t traverse = [&] (uint64_t node){ 17 | if (node < num_nodes) { 18 | 19 | // submit the job 20 | auto future = TP.enqueue(square, node); 21 | futures.emplace_back(std::move(future)); 22 | 23 | // traverse a complete binary tree 24 | traverse(2*node+1); 25 | traverse(2*node+2); 26 | } 27 | }; 28 | 29 | // start at the root node 30 | traverse(0); 31 | 32 | // get the results 33 | for (auto& future : futures) 34 | std::cout << future.get() << std::endl; 35 | } 36 | -------------------------------------------------------------------------------- /chapter4/thread_pool/threadpool_basic.hpp: -------------------------------------------------------------------------------- 1 | #ifndef THREADPOOL_BASIC_HPP 2 | #define THREADPOOL_BASIC_HPP 3 | 4 | #include <cstdint> 5 | #include <future> 6 | #include <vector> 7 | #include <queue> 8 | #include <thread> 9 | #include <mutex> 10 | #include <functional> 11 | #include <condition_variable> 12 | 13 | class ThreadPool { 14 | 15 | private: 16 | 17 | // storage for threads and tasks 18 | std::vector<std::thread> threads; 19 | std::queue<std::function<void(void)>> tasks; 20 | 21 | // primitives for signaling 22 | std::mutex mutex; 23 | std::condition_variable cv; 24 | 25 | // the state of the thread pool 26 | bool stop_pool; 27 | uint32_t active_threads; 28 | const uint32_t capacity; 29 | 30 | // custom task factory 31 | template < 32 | typename Func, 33 | typename ...
Args, 34 | typename Rtrn=typename std::result_of<Func(Args...)>::type> 35 | auto make_task( 36 | Func && func, 37 | Args && ...args) -> std::packaged_task<Rtrn(void)> { 38 | 39 | auto aux = std::bind(std::forward<Func>(func), 40 | std::forward<Args>(args)...); 41 | 42 | return std::packaged_task<Rtrn(void)>(aux); 43 | } 44 | 45 | // will be executed before execution of a task 46 | void before_task_hook() { 47 | active_threads++; 48 | } 49 | 50 | // will be executed after execution of a task 51 | void after_task_hook() { 52 | active_threads--; 53 | } 54 | 55 | public: 56 | ThreadPool( 57 | uint64_t capacity_) : 58 | stop_pool(false), // pool is running 59 | active_threads(0), // no work to be done 60 | capacity(capacity_) { // remember size 61 | 62 | // this function is executed by the threads 63 | auto wait_loop = [this] ( ) -> void { 64 | 65 | // wait forever 66 | while (true) { 67 | 68 | // this is a placeholder task 69 | std::function<void(void)> task; 70 | 71 | { // lock this section for waiting 72 | std::unique_lock<std::mutex> 73 | unique_lock(mutex); 74 | 75 | // actions must be performed on 76 | // wake-up if (i) the thread pool 77 | // has been stopped or (ii) there 78 | // are still tasks to be processed 79 | auto predicate = [this] ( ) -> bool { 80 | return (stop_pool) || 81 | !(tasks.empty()); 82 | }; 83 | 84 | // wait to be woken up on 85 | // aforementioned conditions 86 | cv.wait(unique_lock, predicate); 87 | 88 | // exit if thread pool stopped 89 | // and no tasks to be performed 90 | if (stop_pool && tasks.empty()) 91 | return; 92 | 93 | // else extract task from queue 94 | task = std::move(tasks.front()); 95 | tasks.pop(); 96 | before_task_hook(); 97 | } // here we release the lock 98 | 99 | // execute the task in parallel 100 | task(); 101 | 102 | { // adjust the thread counter 103 | std::lock_guard<std::mutex> 104 | lock_guard(mutex); 105 | after_task_hook(); 106 | } // here we release the lock 107 | } 108 | }; 109 | 110 | // initially spawn capacity many threads 111 | for (uint64_t id = 0; id < capacity; id++) 112 | threads.emplace_back(wait_loop); 113 | } 114 | 115 | ~ThreadPool() { 116 | 117 | { // acquire a scoped lock 118 | std::lock_guard<std::mutex> 119 | lock_guard(mutex); 120 | 121 | // and subsequently alter 122 | // the global state to stop 123 | stop_pool = true; 124 | } // here we release the lock 125 | 126 | // signal all threads 127 | cv.notify_all(); 128 | 129 | // finally join all threads 130 | for (auto& thread : threads) 131 | thread.join(); 132 | } 133 | 134 | template < 135 | typename Func, 136 | typename ... Args, 137 | typename Pair=Func(Args...), 138 | typename Rtrn=typename std::result_of<Pair>::type> 139 | auto enqueue( 140 | Func && func, 141 | Args && ...
args) -> std::future<Rtrn> { 142 | 143 | // create the task, get the future 144 | // and wrap task in a shared pointer 145 | auto task = make_task(func, args...); 146 | auto future = task.get_future(); 147 | auto task_ptr = std::make_shared<std::packaged_task<Rtrn(void)>> 148 | (std::move(task)); 149 | 150 | { // lock the scope 151 | std::lock_guard<std::mutex> 152 | lock_guard(mutex); 153 | 154 | // you cannot reuse pool after being stopped 155 | if(stop_pool) 156 | throw std::runtime_error( 157 | "enqueue on stopped ThreadPool" 158 | ); 159 | 160 | // wrap the task in a generic void 161 | // function void -> void 162 | auto payload = [task_ptr] ( ) -> void { 163 | // basically call task() 164 | task_ptr->operator()(); 165 | }; 166 | 167 | // append the task to the queue 168 | tasks.emplace(payload); 169 | } 170 | 171 | // tell one thread to wake-up 172 | cv.notify_one(); 173 | 174 | return future; 175 | } 176 | }; 177 | 178 | #endif 179 | -------------------------------------------------------------------------------- /chapter5/atomics/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread -latomic -march=native 3 | 4 | all: query_atomics atomic_counting atomic_max arbitrary_atomics universal_atomics 5 | 6 | query_atomics: query_atomics.cpp 7 | $(CXX) query_atomics.cpp $(CXXFLAGS) -o query_atomics 8 | 9 | atomic_counting: atomic_counting.cpp 10 | $(CXX) atomic_counting.cpp $(CXXFLAGS) -o atomic_counting 11 | 12 | atomic_max: atomic_max.cpp 13 | $(CXX) atomic_max.cpp $(CXXFLAGS) -o atomic_max 14 | 15 | arbitrary_atomics: arbitrary_atomics.cpp 16 | $(CXX) arbitrary_atomics.cpp $(CXXFLAGS) -o arbitrary_atomics 17 | 18 | universal_atomics: universal_atomics.cpp 19 | $(CXX) universal_atomics.cpp $(CXXFLAGS) -o universal_atomics 20 | 21 | clean: 22 | rm -rf query_atomics 23 | rm -rf atomic_counting 24 | rm -rf atomic_max 25 | rm -rf arbitrary_atomics 26 | rm -rf universal_atomics 27 | --------------------------------------------------------------------------------
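The following sources build custom atomic operations from compare_exchange_weak: read the current value, compute a candidate, and retry the swap until no other thread has modified the value in between. As a warm-up, a hedged sketch of an atomic fetch-and-multiply, an operation std::atomic does not provide directly (this helper is illustrative, not from the book's sources):

#include <atomic>
#include <cstdint>

// multiply an atomic by 'factor' using a CAS loop; returns the old value
uint64_t fetch_mul(std::atomic<uint64_t>& value, uint64_t factor) {
    uint64_t expected = value.load();
    // compare_exchange_weak updates 'expected' with the current
    // value on failure, so the loop re-reads automatically
    while (!value.compare_exchange_weak(expected, expected*factor)) {}
    return expected;
}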
/chapter5/atomics/arbitrary_atomics.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <atomic> 6 | #include "../include/hpc_helpers.hpp" 7 | 8 | template < 9 | typename atomc_t, 10 | typename value_t, 11 | typename funct_t, 12 | typename predc_t> 13 | value_t binary_atomic( 14 | atomc_t& atomic, 15 | const value_t& operand, 16 | funct_t function, 17 | predc_t predicate) { 18 | 19 | value_t expect = atomic.load(); 20 | value_t target; 21 | 22 | do { 23 | // compute preliminary new value 24 | target = function(expect, operand); 25 | 26 | // immediately return if not fulfilling 27 | // the given constraint for a valid result 28 | if (!predicate(target)) 29 | return expect; 30 | 31 | // try to atomically swap new and old value 32 | } while (!atomic.compare_exchange_weak(expect, target)); 33 | 34 | // either new value if successful or the old 35 | // value for unsuccessful swap attempts: 36 | // in both cases it corresponds to atomic.load() 37 | return expect; 38 | } 39 | 40 | 41 | int main( ) { 42 | 43 | std::vector<std::thread> threads; 44 | const uint64_t num_threads = 10; 45 | const uint64_t num_iters = 100'000'000; 46 | 47 | auto even_max = 48 | [&] (volatile std::atomic<uint64_t>* counter, 49 | const auto& id) -> void { 50 | 51 | auto func = [] (const auto& lhs, 52 | const auto& rhs) { 53 | return lhs > rhs ? lhs : rhs; 54 | }; 55 | 56 | auto pred = [] (const auto& val) { 57 | return val % 2 == 0; 58 | }; 59 | 60 | for (uint64_t i = id; i < num_iters; i += num_threads) 61 | binary_atomic(*counter, i, func, pred); 62 | }; 63 | 64 | TIMERSTART(even_max) 65 | std::atomic<uint64_t> even_counter(0); 66 | for (uint64_t id = 0; id < num_threads; id++) 67 | threads.emplace_back(even_max, &even_counter, id); 68 | for (auto& thread : threads) 69 | thread.join(); 70 | TIMERSTOP(even_max) 71 | 72 | std::cout << even_counter << std::endl; 73 | } 74 | -------------------------------------------------------------------------------- /chapter5/atomics/atomic_counting.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <mutex> 6 | #include <atomic> 7 | #include "../include/hpc_helpers.hpp" 8 | 9 | int main( ) { 10 | 11 | std::mutex mutex; 12 | std::vector<std::thread> threads; 13 | const uint64_t num_threads = 10; 14 | const uint64_t num_iters = 100'000'000; 15 | 16 | auto lock_count = 17 | [&] (volatile uint64_t* counter, 18 | const auto& id) -> void { 19 | 20 | for (uint64_t i = id; i < num_iters; i += num_threads) { 21 | std::lock_guard<std::mutex> lock_guard(mutex); 22 | (*counter)++; 23 | } 24 | }; 25 | 26 | auto atomic_count = 27 | [&] (volatile std::atomic<uint64_t>* counter, 28 | const auto& id) -> void { 29 | 30 | for (uint64_t i = id; i < num_iters; i += num_threads) 31 | (*counter)++; 32 | }; 33 | 34 | TIMERSTART(mutex_multithreaded) 35 | uint64_t counter = 0; 36 | threads.clear(); 37 | for (uint64_t id = 0; id < num_threads; id++) 38 | threads.emplace_back(lock_count, &counter, id); 39 | for (auto& thread : threads) 40 | thread.join(); 41 | TIMERSTOP(mutex_multithreaded) 42 | 43 | TIMERSTART(atomic_multithreaded) 44 | std::atomic<uint64_t> atomic_counter(0); 45 | threads.clear(); 46 | for (uint64_t id = 0; id < num_threads; id++) 47 | threads.emplace_back(atomic_count, &atomic_counter, id); 48 | for (auto& thread : threads) 49 | thread.join(); 50 | TIMERSTOP(atomic_multithreaded) 51 | 52 | std::cout << counter << " " << atomic_counter << std::endl; 53 | } 54 | --------------------------------------------------------------------------------
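atomic_counting.cpp above uses sequentially consistent increments, the default for std::atomic. When the counter is only a statistic and imposes no ordering on surrounding memory operations, a relaxed increment is sufficient and often cheaper; a hedged variant of the counting closure (not in the book's sources):

#include <atomic>
#include <cstdint>

void relaxed_count(std::atomic<uint64_t>* counter, uint64_t id,
                   uint64_t num_iters, uint64_t num_threads) {
    for (uint64_t i = id; i < num_iters; i += num_threads)
        // no ordering constraints: only the final sum matters
        counter->fetch_add(1, std::memory_order_relaxed);
}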
/chapter5/atomics/atomic_max.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <atomic> 6 | #include "../include/hpc_helpers.hpp" 7 | 8 | int main( ) { 9 | 10 | std::vector<std::thread> threads; 11 | const uint64_t num_threads = 10; 12 | const uint64_t num_iters = 100'000'000; 13 | 14 | // WARNING: this closure produces incorrect results 15 | auto false_max = 16 | [&] (volatile std::atomic<uint64_t>* counter, 17 | const auto& id) -> void { 18 | 19 | for (uint64_t i = id; i < num_iters; i += num_threads) 20 | if(i > *counter) 21 | *counter = i; 22 | }; 23 | 24 | auto correct_max = 25 | [&] (volatile std::atomic<uint64_t>* counter, 26 | const auto& id) -> void { 27 | 28 | for (uint64_t i = id; i < num_iters; i += num_threads) { 29 | auto previous = counter->load(); 30 | while (previous < i && 31 | !counter->compare_exchange_weak(previous, i)) {} 32 | } 33 | }; 34 | 35 | TIMERSTART(incorrect_max) 36 | std::atomic<uint64_t> false_counter(0); 37 | threads.clear(); 38 | for (uint64_t id = 0; id < num_threads; id++) 39 | threads.emplace_back(false_max, &false_counter, id); 40 | for (auto& thread : threads) 41 | thread.join(); 42 | TIMERSTOP(incorrect_max) 43 | 44 | TIMERSTART(correct_max) 45 | std::atomic<uint64_t> correct_counter(0); 46 | threads.clear(); 47 | for (uint64_t id = 0; id < num_threads; id++) 48 | threads.emplace_back(correct_max, &correct_counter, id); 49 | for (auto& thread : threads) 50 | thread.join(); 51 | TIMERSTOP(correct_max) 52 | 53 | std::cout << false_counter << " " 54 | << correct_counter << std::endl; 55 | } 56 | -------------------------------------------------------------------------------- /chapter5/atomics/query_atomics.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <atomic> 3 | 4 | template < 5 | typename x_value_t, 6 | typename y_value_t, 7 | typename z_value_t> 8 | struct state_t { 9 | 10 | x_value_t x; 11 | y_value_t y; 12 | z_value_t z; 13 | }; 14 | 15 | template < 16 | typename R, 17 | typename S, 18 | typename T> 19 | void status() { // report size and if lock-free 20 | typedef std::atomic<state_t<R,S,T>> atomic_state_t; 21 | std::cout << sizeof(atomic_state_t) << "\t" 22 | << atomic_state_t().is_lock_free() 23 | << std::endl; 24 | } 25 | 26 | int main () { 27 | 28 | std::cout << "size\tlock_free?" << std::endl; 29 | 30 | status<uint8_t, uint8_t, uint8_t >(); // the template arguments on 31 | status<uint16_t, uint8_t, uint8_t >(); // these six calls were lost 32 | status<uint16_t, uint16_t, uint16_t>(); // in extraction; the combos 33 | status<uint32_t, uint16_t, uint16_t>(); // shown are plausible guesses 34 | status<uint32_t, uint32_t, uint32_t>(); // probing different sizes 35 | status<uint64_t, uint64_t, uint64_t>(); 36 | } 37 | -------------------------------------------------------------------------------- /chapter5/atomics/universal_atomics.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <thread> 5 | #include <atomic> 6 | #include "../include/hpc_helpers.hpp" 7 | 8 | template < 9 | typename atomc_t, 10 | typename value_t, 11 | typename funcp_t, 12 | typename funcn_t, 13 | typename predc_t> 14 | value_t ternary_atomic( 15 | atomc_t& atomic, 16 | const value_t& operand, 17 | funcp_t pos_function, 18 | funcn_t neg_function, 19 | predc_t predicate) { 20 | 21 | value_t expect = atomic.load(); 22 | value_t target; 23 | 24 | do { 25 | 26 | if (predicate(expect, operand)) 27 | target = pos_function(expect, operand); 28 | else 29 | target = neg_function(expect, operand); 30 | 31 | // try to atomically swap new and old value 32 | } while (!atomic.compare_exchange_weak(expect, target)); 33 | 34 | // either new value if successful or the old 35 | // value for unsuccessful swap attempts: 36 | // in both cases it corresponds to atomic.load() 37 | return expect; 38 | } 39 | 40 | 41 | int main( ) { 42 | 43 | std::vector<std::thread> threads; 44 | const uint64_t num_threads = 10; 45 | const uint64_t num_iters = 100'000'000; 46 | 47 | auto even_max = 48 | [&] (volatile std::atomic<uint64_t>* counter, 49 | const auto& id) -> void { 50 | 51 | auto pos_func = [] (const auto& lhs, 52 | const auto& rhs) { 53 | return lhs; 54 | }; 55 | 56 | auto neg_func = [] (const auto& lhs, 57 | const auto& rhs) { 58 | return rhs; 59 | }; 60 | 61 | auto pred = [] (const auto& lhs, 62 | const auto& rhs) { 63 | return lhs > rhs && lhs % 2 == 0; 64 | }; 65 | 66 | for (uint64_t i = id; i < num_iters; i += num_threads) 67 | ternary_atomic(*counter, i, pos_func, neg_func, pred); 68 | }; 69 | 70 | TIMERSTART(even_max) 71 | std::atomic<uint64_t> even_counter(0); 72 | for (uint64_t id = 0; id < num_threads; id++) 73 | threads.emplace_back(even_max, &even_counter, id); 74 | for (auto& thread : threads) 75 | thread.join(); 76 | TIMERSTOP(even_max) 77 | 78 | std::cout << even_counter << std::endl; 79 | } 80 | -------------------------------------------------------------------------------- /chapter5/include: -------------------------------------------------------------------------------- 1 | ../include/ --------------------------------------------------------------------------------
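query_atomics.cpp above matters for the knapsack program that follows: its global state packs a 32-bit bitmask and a 32-bit value into one 8-byte struct, which std::atomic can typically swap lock-free with a single CAS. A hedged sketch of why packing both fields into one atomic is essential (illustrative, not from the book's sources):

#include <atomic>
#include <cstdint>

struct packed_t { uint32_t bmask; uint32_t value; };  // 8 bytes, no padding

std::atomic<packed_t> state;  // static storage: zero-initialized

// update bitmask and value together: a reader can never observe a new
// value paired with a stale bitmask, a guarantee two separate atomics
// could not provide
void update(packed_t candidate) {
    packed_t current = state.load();
    while (current.value < candidate.value &&
           !state.compare_exchange_weak(current, candidate)) {}
}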
/chapter5/knapsack/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: knapsack 5 | 6 | knapsack: knapsack.cpp 7 | $(CXX) knapsack.cpp $(CXXFLAGS) -o knapsack 8 | 9 | clean: 10 | rm -rf knapsack 11 | -------------------------------------------------------------------------------- /chapter5/knapsack/knapsack.cpp: -------------------------------------------------------------------------------- 1 | #include <algorithm> // std::sort 2 | #include <iostream> // std::cout 3 | #include <vector> // std::vector 4 | #include <atomic> // std::atomic 5 | #include <random> // std::uniform_int_distribution 6 | #include "threadpool.hpp" // work sharing thread pool 7 | 8 | template < 9 | typename value_t_, 10 | typename weight_t_> 11 | struct generic_tuple_t { 12 | 13 | value_t_ value; 14 | weight_t_ weight; 15 | 16 | // expose types 17 | typedef value_t_ value_t; 18 | typedef weight_t_ weight_t; 19 | 20 | generic_tuple_t( 21 | value_t_ value_, 22 | weight_t_ weight_) : value (value_ ), 23 | weight(weight_) {} 24 | }; 25 | 26 | template < 27 | typename bmask_t_, 28 | typename value_t_> 29 | struct state_t { 30 | 31 | bmask_t_ bmask=0; 32 | value_t_ value=0; 33 | 34 | // expose template parameters 35 | typedef bmask_t_ bmask_t; 36 | typedef value_t_ value_t; 37 | 38 | // non-default constructors are not allowed 39 | // when wrapped with std::atomic 40 | }; 41 | 42 | // shortcuts for convenience 43 | typedef uint64_t index_t; 44 | typedef uint32_t bmask_t; 45 | typedef uint32_t value_t; 46 | typedef uint32_t weight_t; 47 | typedef generic_tuple_t<value_t, weight_t> tuple_t; 48 | 49 | // the global state encoding the mask and value 50 | std::atomic<state_t<bmask_t, value_t>> global_state; 51 | const value_t capacity (1500); 52 | const index_t num_items (32); 53 | std::vector<tuple_t> tuples; 54 | 55 | // our work-sharing thread pool 56 | ThreadPool TP(4); 57 | 58 | // initializes Knapsack problem 59 | template < 60 | typename tuple_t, 61 | typename index_t> 62 | void init_tuples( 63 | std::vector<tuple_t>& tuples, 64 | index_t num_entries) { 65 | 66 | // recover the types stored in tuple_t 67 | typedef typename tuple_t::value_t value_t; 68 | typedef typename tuple_t::weight_t weight_t; 69 | 70 | // C++11 random number generator 71 | std::mt19937 engine(0); // mersenne twister 72 | std::uniform_int_distribution<value_t> rho_v(80, 100); 73 | std::uniform_int_distribution<weight_t> rho_w(80, 100); 74 | 75 | // generate pairs of values and weights 76 | for (index_t index = 0; index < num_entries; index++) 77 | tuples.emplace_back(rho_v(engine), rho_w(engine)); 78 | 79 | // sort the pairs by value/weight density 80 | auto predicate = [] (const auto& lhs, 81 | const auto& rhs) -> bool { 82 | return lhs.value*rhs.weight > rhs.value*lhs.weight; 83 | }; 84 | 85 | std::sort(tuples.begin(), tuples.end(), predicate); 86 | } 87 | 88 | template < 89 | typename tuple_t, 90 | typename bmask_t> 91 | void atomic_update( 92 | tuple_t tuple, 93 | bmask_t bmask) { 94 | 95 | typedef typename tuple_t::value_t value_t; 96 | 97 | auto g_state = global_state.load(); 98 | auto l_value = tuple.value; 99 | state_t<bmask_t, value_t> target; 100 | 101 | do { 102 | 103 | // exit if solution is not optimal 104 | if (g_state.value > l_value) 105 | return; 106 | 107 | // construct the desired target 108 | target.value = l_value; 109 | target.bmask = bmask; 110 | 111 | } while (!global_state.compare_exchange_weak(g_state, target)); 112 | } 113 | 114 | template < 115 | typename index_t, 116 | typename tuple_t> 117 | typename tuple_t::value_t dantzig_bound( 118 | index_t height, 119 | tuple_t tuple) { 120 | 121 | auto
predicate = [&] (const index_t& i) { 122 | return i < num_items && 123 | tuple.weight < capacity; 124 | }; 125 | 126 | // greedily pack items until backpack full 127 | for (index_t i = height; predicate(i); i++) { 128 | tuple.value += tuples[i].value; 129 | tuple.weight += tuples[i].weight; 130 | } 131 | 132 | return tuple.value; 133 | } 134 | 135 | template < 136 | typename index_t, 137 | typename tuple_t, 138 | typename bmask_t> 139 | void traverse( 140 | index_t height, // height of the binary tree 141 | tuple_t tuple, // weight and value up to height 142 | bmask_t bmask) { // binary mask up to height 143 | 144 | // check whether item packed or not 145 | const bool bit = (bmask >> height) % 2; 146 | tuple.weight += bit*tuples[height].weight; 147 | tuple.value += bit*tuples[height].value; 148 | 149 | // check versus maximum capacity 150 | if (tuple.weight > capacity) 151 | return; // my backpack is full 152 | 153 | // update global lower bound if needed 154 | atomic_update(tuple, bmask); 155 | 156 | // calculate local Dantzig upper bound 157 | // and compare with global upper bound 158 | auto bsf = global_state.load().value; 159 | if (dantzig_bound(height+1, tuple) < bsf) 160 | return; 161 | 162 | // if everything was fine generate new candidate 163 | if (height+1 < num_items) { 164 | traverse(height+1, tuple, bmask+(1<<(height+1))); 165 | traverse(height+1, tuple, bmask); 166 | } 167 | } 168 | 169 | int main () { 170 | 171 | // initialize tuples with random values 172 | init_tuples(tuples, num_items); 173 | 174 | // traverse left and right branch 175 | TP.spawn(traverse<index_t, tuple_t, bmask_t>, 176 | 0, tuple_t(0, 0), 0); 177 | TP.spawn(traverse<index_t, tuple_t, bmask_t>, 178 | 0, tuple_t(0, 0), 1); 179 | 180 | // wait for all tasks to be finished 181 | TP.wait_and_stop(); 182 | 183 | // report the final solution 184 | auto g_state = global_state.load(); 185 | std::cout << "value " << g_state.value << std::endl; 186 | 187 | auto bmask = g_state.bmask; 188 | for (index_t i = 0; i < num_items; i++) { 189 | std::cout << bmask % 2 << " "; 190 | bmask >>= 1; 191 | } 192 | std::cout << std::endl; 193 | } 194 | -------------------------------------------------------------------------------- /chapter5/knapsack/threadpool.hpp: -------------------------------------------------------------------------------- 1 | ../thread_pool/threadpool.hpp -------------------------------------------------------------------------------- /chapter5/thread_pool/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -pthread 3 | 4 | all: tree 5 | 6 | tree: tree.cpp 7 | $(CXX) tree.cpp $(CXXFLAGS) -o tree 8 | 9 | clean: 10 | rm -rf tree 11 | -------------------------------------------------------------------------------- /chapter5/thread_pool/tree.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include "threadpool.hpp" 4 | #include "../include/hpc_helpers.hpp" 5 | 6 | ThreadPool TP(8); 7 | 8 | void waste_cycles(uint64_t num_cycles) { 9 | 10 | volatile uint64_t counter = 0; 11 | for (uint64_t i = 0; i < num_cycles; i++) 12 | counter++; 13 | } 14 | 15 | void traverse(uint64_t node, uint64_t num_nodes) { 16 | 17 | if (node < num_nodes) { 18 | 19 | waste_cycles(1<<15); 20 | 21 | TP.spawn(traverse, 2*node+1, num_nodes); 22 | traverse(2*node+2, num_nodes); 23 | } 24 | } 25 | 26 | int main() { 27 | 28 | TIMERSTART(traverse) 29 | TP.spawn(traverse, 0, 1<<20); 30 | TP.wait_and_stop(); 31 | TIMERSTOP(traverse) 32 | 33 | } 34 |
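tree.cpp spawns only one child into the pool and recurses into the other on the current thread, which bounds queue growth while keeping all workers busy. The OpenMP runtime used from chapter6 onward provides the same pattern through tasks; a hedged sketch of the equivalent traversal (illustrative, not from the book's sources):

#include <cstdint>

void waste_cycles(uint64_t num_cycles) {
    volatile uint64_t counter = 0;
    for (uint64_t i = 0; i < num_cycles; i++)
        counter++;
}

void traverse(uint64_t node, uint64_t num_nodes) {
    if (node < num_nodes) {
        waste_cycles(1<<15);
        // enqueue the left child as a task, descend right directly
        #pragma omp task
        traverse(2*node+1, num_nodes);
        traverse(2*node+2, num_nodes);
    }
}

int main() {
    #pragma omp parallel
    #pragma omp single      // one thread seeds the task tree
    traverse(0, 1<<20);     // implicit barrier waits for all tasks
}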
-------------------------------------------------------------------------------- /chapter6/1NN_classification/1NN.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> // std::cout 2 | #include <limits> // std::numeric_limits 3 | #include <vector> // std::vector 4 | 5 | // hpc_helpers contains the TIMERSTART and TIMERSTOP macros 6 | // and the no_init_t template that disables implicit type 7 | // initialization 8 | #include "../include/hpc_helpers.hpp" 9 | // binary_IO contains the load_binary function to load 10 | // and store binary data from and to a file 11 | #include "../include/binary_IO.hpp" 12 | 13 | template <typename value_t, 14 | typename index_t> 15 | void all_vs_all(value_t* test, 16 | value_t* train, 17 | value_t* delta, 18 | index_t num_test, 19 | index_t num_train, 20 | index_t num_features, 21 | bool parallel) { 22 | 23 | // coarse-grained parallelism 24 | #pragma omp parallel for collapse(2) if(parallel) 25 | for (index_t i = 0; i < num_test; i++) 26 | for (index_t j = 0; j < num_train; j++) { 27 | value_t accum = value_t(0); 28 | // fine-grained parallelism 29 | // #pragma omp parallel for reduction(+:accum) 30 | for (index_t k = 0; k < num_features; k++) { 31 | const value_t residue = test [i*num_features+k] 32 | - train[j*num_features+k]; 33 | accum += residue*residue; 34 | } 35 | delta[i*num_train+j] = accum; 36 | } 37 | } 38 | 39 | template <typename label_t, 40 | typename value_t, 41 | typename index_t> 42 | value_t accuracy(label_t* label_test, 43 | label_t* label_train, 44 | value_t* delta, 45 | index_t num_test, 46 | index_t num_train, 47 | index_t num_classes, 48 | bool parallel) { 49 | 50 | index_t counter = index_t(0); 51 | 52 | #pragma omp parallel for reduction(+:counter) if(parallel) 53 | for (index_t i = 0; i < num_test; i++) { 54 | 55 | // the initial distance is float::max 56 | // the initial index j_star is some dummy value 57 | value_t bsf = std::numeric_limits<value_t>::max(); 58 | index_t jst = std::numeric_limits<index_t>::max(); 59 | 60 | // find training sample with smallest distance 61 | for (index_t j = 0; j < num_train; j++) { 62 | const value_t value = delta[i*num_train+j]; 63 | if (value < bsf) { 64 | bsf = value; 65 | jst = j; 66 | } 67 | } 68 | 69 | // compare predicted label with original label 70 | bool match = true; 71 | for (index_t k = 0; k < num_classes; k++) 72 | match &= label_test [i *num_classes+k] == 73 | label_train[jst*num_classes+k]; 74 | 75 | counter += match; 76 | } 77 | 78 | return value_t(counter)/value_t(num_test); 79 | } 80 | 81 | int main(int argc, char* argv[]) { 82 | 83 | // run parallelized when any command line argument given 84 | const bool parallel = argc > 1; 85 | 86 | std::cout << "running " 87 | << (parallel ?
"in parallel" : "sequentially") 88 | << std::endl; 89 | 90 | // the shape of the data matrices 91 | const uint64_t num_features = 28*28; 92 | const uint64_t num_classes = 10; 93 | const uint64_t num_entries = 65000; 94 | const uint64_t num_train = 55000; 95 | const uint64_t num_test = num_entries-num_train; 96 | 97 | // memory for the data matrices and all-pair matrix 98 | std::vector input(num_entries*num_features); 99 | std::vector label(num_entries*num_classes); 100 | std::vector delta(num_test*num_train); 101 | 102 | // get the images and labels from disk 103 | load_binary(input.data(), input.size(), "./data/X.bin"); 104 | load_binary(label.data(), label.size(), "./data/Y.bin"); 105 | 106 | TIMERSTART(all_vs_all) 107 | const uint64_t inp_off = num_train * num_features; 108 | all_vs_all(input.data() + inp_off, 109 | input.data(), 110 | delta.data(), 111 | num_test, num_train, 112 | num_features, parallel); 113 | TIMERSTOP(all_vs_all) 114 | 115 | TIMERSTART(classify) 116 | const uint64_t lbl_off = num_train * num_classes; 117 | auto acc = accuracy(label.data() + lbl_off, 118 | label.data(), 119 | delta.data(), 120 | num_test, num_train, 121 | num_classes, parallel); 122 | TIMERSTOP(classify) 123 | 124 | std::cout << "test accuracy: " << acc << std::endl; 125 | } 126 | -------------------------------------------------------------------------------- /chapter6/1NN_classification/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp 3 | 4 | all: 1NN 5 | 6 | 1NN: 1NN.cpp 7 | $(CXX) 1NN.cpp $(CXXFLAGS) -o 1NN 8 | 9 | clean: 10 | rm -rf 1NN 11 | -------------------------------------------------------------------------------- /chapter6/1NN_classification/data/mnist_exporter.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # run __ONE__ of the following commands: 3 | # pip install --user tensorflow (if you have no CUDA-enabled GPU) 4 | # pip install --user tensorflow-gpu 5 | # 6 | # afterwards install tflearn 7 | # pip install --user tflearn 8 | # 9 | # Numpy should come bundled with tensorflow. Run this file et voila! 
/chapter6/1NN_classification/data/mnist_exporter.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # run __ONE__ of the following commands: 3 | # pip install --user tensorflow (if you have no CUDA-enabled GPU) 4 | # pip install --user tensorflow-gpu 5 | # 6 | # afterwards install tflearn 7 | # pip install --user tflearn 8 | # 9 | # Numpy should come bundled with tensorflow. Run this file et voila! 10 | ##################################################################### 11 | 12 | import tflearn 13 | 14 | # Data loading and preprocessing 15 | import tflearn.datasets.mnist as mnist 16 | X, Y, testX, testY = mnist.load_data(one_hot=True) 17 | 18 | import array as ar 19 | import numpy as np 20 | 21 | with open("X.bin", "wb") as f: 22 | images = np.vstack((X, testX)) 23 | print(images.shape) 24 | f.write(ar.array("f", images.flatten())) 25 | 26 | with open("Y.bin", "wb") as f: 27 | labels = np.vstack((Y, testY)) 28 | print(labels.shape) 29 | f.write(ar.array("f", labels.flatten())) 30 | -------------------------------------------------------------------------------- /chapter6/advanced_reductions/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++-6 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp -mavx -march=native 3 | 4 | all: custom_reduction avx_reduction string_reduction 5 | 6 | custom_reduction: custom_reduction.cpp 7 | $(CXX) custom_reduction.cpp $(CXXFLAGS) -o custom_reduction 8 | 9 | avx_reduction: avx_reduction.cpp 10 | $(CXX) avx_reduction.cpp $(CXXFLAGS) -o avx_reduction 11 | 12 | string_reduction: string_reduction.cpp 13 | $(CXX) string_reduction.cpp $(CXXFLAGS) -o string_reduction 14 | 15 | clean: 16 | rm -rf custom_reduction 17 | rm -rf avx_reduction 18 | rm -rf string_reduction 19 | -------------------------------------------------------------------------------- /chapter6/advanced_reductions/avx_reduction.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> // std::cout 2 | #include <cstdint> // uint64_t 3 | #include <cmath> // INFINITY 4 | #include <random> // random 5 | #include <immintrin.h> // AVX intrinsics 6 | 7 | struct avxop { 8 | 9 | __m256 neutral; 10 | 11 | avxop() : neutral(_mm256_set1_ps(-INFINITY)) {} 12 | 13 | inline __m256 operator()( 14 | const __m256& lhs, 15 | const __m256& rhs) const { 16 | 17 | return _mm256_max_ps(lhs, rhs); 18 | } 19 | }; 20 | 21 | void init(float * data, uint64_t length) { 22 | 23 | std::mt19937 engine(42); 24 | std::uniform_real_distribution<float> density(-1L<<28, 1L<<28); 25 | 26 | for (uint64_t i = 0; i < length; i++) 27 | data[i] = density(engine); 28 | } 29 | 30 | inline float hmax_sse3(__m128 v) { 31 | __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0 32 | __m128 maxs = _mm_max_ps(v, shuf); 33 | shuf = _mm_movehl_ps(shuf, maxs); // high half -> low half 34 | maxs = _mm_max_ss(maxs, shuf); 35 | return _mm_cvtss_f32(maxs); 36 | } 37 | 38 | inline float hmax_avx(__m256 v) { 39 | __m128 lo = _mm256_castps256_ps128(v); // low 128 40 | __m128 hi = _mm256_extractf128_ps(v, 1); // high 128 41 | lo = _mm_max_ps(lo, hi); // max the low 128 42 | return hmax_sse3(lo); // and inline the sse3 version 43 | } 44 | 45 | int main () { 46 | 47 | const uint64_t num_entries = 1UL << 28; 48 | const uint64_t num_bytes = num_entries*sizeof(float); 49 | auto data = static_cast<float*>(_mm_malloc(num_bytes , 32)); 50 | init(data, num_entries); 51 | 52 | #pragma omp declare reduction(avx_max : __m256 : \ 53 | omp_out = avxop()(omp_out, omp_in)) \ 54 | initializer (omp_priv=avxop().neutral) 55 | 56 | auto result = avxop().neutral; 57 | 58 | # pragma omp parallel for reduction(avx_max:result) 59 | for (uint64_t i = 0; i < num_entries; i += 8) 60 | result = avxop()(result, _mm256_load_ps(data+i)); 61 | 62 | std::cout << hmax_avx(result) << std::endl; 63 | 64 | _mm_free(data); 65 | } 66 | --------------------------------------------------------------------------------
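_mm256_load_ps in avx_reduction.cpp requires 32-byte aligned addresses, which is why the buffer comes from _mm_malloc(num_bytes, 32) rather than new. When alignment cannot be guaranteed, the unaligned variant is the safe alternative at a small cost; a hedged sketch (illustrative):

#include <immintrin.h>

// aligned load: undefined behavior if 'ptr' is not 32-byte aligned
// __m256 v = _mm256_load_ps(ptr);

// unaligned load: works for any float* at a minor penalty
// on modern x86 cores
__m256 load_any(const float* ptr) {
    return _mm256_loadu_ps(ptr);
}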
/chapter6/advanced_reductions/custom_reduction.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | 4 | template < 5 | typename value_t> 6 | struct binop { 7 | 8 | constexpr static value_t neutral = 0; 9 | 10 | inline value_t operator()( 11 | const value_t& lhs, 12 | const value_t& rhs) const { 13 | 14 | const value_t ying = std::abs(lhs); 15 | const value_t yang = std::abs(rhs); 16 | 17 | return ying > yang ? lhs : rhs; 18 | } 19 | }; 20 | 21 | int main () { 22 | 23 | const uint64_t num_iters = 1UL << 20; 24 | int64_t result = binop<int64_t>::neutral; 25 | 26 | #pragma omp declare reduction(custom_op : int64_t : \ 27 | omp_out = binop<int64_t>()(omp_out, omp_in)) \ 28 | initializer (omp_priv=binop<int64_t>::neutral) 29 | 30 | 31 | # pragma omp parallel for reduction(custom_op:result) 32 | for (uint64_t i = 0; i < num_iters; i++) 33 | result = binop<int64_t>()(result, i&1 ? -i : i); 34 | 35 | std::cout << result << std::endl; 36 | 37 | 38 | } 39 | -------------------------------------------------------------------------------- /chapter6/advanced_reductions/string_reduction.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <cstdint> 3 | #include <vector> 4 | #include <string> 5 | 6 | int main () { 7 | 8 | std::string result("SIMON SAYS_"); 9 | std::vector<std::string> data {"p", "a", "r", "a", "l", "l", 10 | "e", "l", " ", "p", "r", "o", 11 | "g", "r", "a", "m", "m", "i", 12 | "n", "g", " ", "i", "s", " ", 13 | "f", "u", "n", "!"}; 14 | 15 | #pragma omp declare reduction(custom_op : std::string : \ 16 | omp_out = omp_out+omp_in) \ 17 | initializer (omp_priv=std::string("")) 18 | 19 | # pragma omp parallel for reduction(custom_op:result) num_threads(2) 20 | for (uint64_t i = 0; i < data.size(); i++) 21 | result = result+data[i]; 22 | 23 | std::cout << result << std::endl; 24 | 25 | 26 | } 27 | -------------------------------------------------------------------------------- /chapter6/hello_world/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp 3 | 4 | all: hello_world 5 | 6 | hello_world: hello_world.cpp 7 | $(CXX) hello_world.cpp $(CXXFLAGS) -o hello_world 8 | 9 | clean: 10 | rm -rf hello_world 11 | -------------------------------------------------------------------------------- /chapter6/hello_world/hello_world.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | 3 | int main() { 4 | // run the statement after the pragma in the current team 5 | #pragma omp parallel 6 | std::cout << "Hello world!"
<< std::endl; 7 | } 8 | -------------------------------------------------------------------------------- /chapter6/include: -------------------------------------------------------------------------------- 1 | ../include/ -------------------------------------------------------------------------------- /chapter6/load_imbalance/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp 3 | 4 | all: scheduling 5 | 6 | scheduling: scheduling.cpp 7 | $(CXX) scheduling.cpp $(CXXFLAGS) -o scheduling 8 | 9 | clean: 10 | rm -rf scheduling 11 | -------------------------------------------------------------------------------- /chapter6/load_imbalance/data: -------------------------------------------------------------------------------- 1 | ../1NN_classification/data/ -------------------------------------------------------------------------------- /chapter6/load_imbalance/scheduling.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> // std::cout 2 | #include <vector> // std::vector 3 | 4 | // hpc_helpers contains the TIMERSTART and TIMERSTOP macros 5 | #include "../include/hpc_helpers.hpp" 6 | // binary_IO contains the load_binary function to load 7 | // and store binary data from and to a file 8 | #include "../include/binary_IO.hpp" 9 | 10 | // we will change this mode later 11 | #define MODE dynamic 12 | 13 | template <typename value_t, 14 | typename index_t> 15 | void inner_product(value_t * data, 16 | value_t * delta, 17 | index_t num_entries, 18 | index_t num_features, 19 | bool parallel) { 20 | 21 | #pragma omp parallel for schedule(MODE) if(parallel) 22 | for (index_t i = 0; i < num_entries; i++) 23 | for (index_t j = i; j < num_entries; j++) { 24 | value_t accum = value_t(0); 25 | for (index_t k = 0; k < num_features; k++) 26 | accum += data[i*num_features+k] * 27 | data[j*num_features+k]; 28 | delta[i*num_entries+j] = 29 | delta[j*num_entries+i] = accum; 30 | } 31 | } 32 | 33 | int main(int argc, char* argv[]) { 34 | 35 | // run parallelized when any command line argument given 36 | const bool parallel = argc > 1; 37 | 38 | std::cout << "running " 39 | << (parallel ?
"in parallel" : "sequentially") 40 | << std::endl; 41 | 42 | // the shape of the data matrices 43 | const uint64_t num_features = 28*28; 44 | const uint64_t num_entries = 65000; 45 | 46 | TIMERSTART(alloc) 47 | // memory for the data matrices and all-pair matrix 48 | std::vector input(num_entries*num_features); 49 | std::vector delta(num_entries*num_entries); 50 | TIMERSTOP(alloc) 51 | 52 | TIMERSTART(read_data) 53 | // get the images and labels from disk 54 | load_binary(input.data(), input.size(), "./data/X.bin"); 55 | TIMERSTOP(read_data) 56 | 57 | TIMERSTART(inner_product) 58 | inner_product(input.data(), delta.data(), 59 | num_entries, num_features, parallel); 60 | TIMERSTOP(inner_product) 61 | } 62 | -------------------------------------------------------------------------------- /chapter6/matrix_vector/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++ 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp 3 | 4 | all: matrix_vector 5 | 6 | matrix_vector: matrix_vector.cpp 7 | $(CXX) matrix_vector.cpp $(CXXFLAGS) -o matrix_vector 8 | 9 | clean: 10 | rm -rf matrix_vector 11 | -------------------------------------------------------------------------------- /chapter6/matrix_vector/matrix_vector.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // hpc_helpers contains the TIMERSTART and TIMERSTOP macros 6 | // and the no_init_t template that disables implicit type 7 | // initialization 8 | #include "../include/hpc_helpers.hpp" 9 | 10 | template 12 | void init(std::vector& A, 13 | std::vector& x, 14 | index_t m, 15 | index_t n) { 16 | 17 | for (index_t row = 0; row < m; row++) 18 | for (index_t col = 0; col < n; col++) 19 | A[row*n+col] = row >= col ? 
1 : 0; 20 | 21 | for (index_t col = 0; col < m; col++) 22 | x[col] = col; 23 | } 24 | 25 | template <typename value_t, 26 | typename index_t> 27 | void mult(std::vector<value_t>& A, 28 | std::vector<value_t>& x, 29 | std::vector<value_t>& b, 30 | index_t m, 31 | index_t n, 32 | bool parallel) { 33 | 34 | #pragma omp parallel for if(parallel) 35 | for (index_t row = 0; row < m; row++) { 36 | value_t accum = value_t(0); 37 | for (index_t col = 0; col < n; col++) 38 | accum += A[row*n+col]*x[col]; 39 | b[row] = accum; 40 | } 41 | } 42 | 43 | int main() { 44 | const uint64_t n = 1UL << 15; 45 | const uint64_t m = 1UL << 15; 46 | 47 | TIMERSTART(overall) 48 | // memory allocation for the three vectors x, y, and z 49 | // with the no_init_t template as a wrapper for the actual type 50 | TIMERSTART(alloc) 51 | std::vector<no_init_t<uint64_t>> A(m*n); 52 | std::vector<no_init_t<uint64_t>> x(n); 53 | std::vector<no_init_t<uint64_t>> b(m); 54 | TIMERSTOP(alloc) 55 | 56 | // manually initialize the input matrix A and vector x 57 | TIMERSTART(init) 58 | init(A, x, m, n); 59 | TIMERSTOP(init) 60 | 61 | // compute A * x = b sequentially three times 62 | for (uint64_t k = 0; k < 3; k++) { 63 | TIMERSTART(mult_seq) 64 | mult(A, x, b, m, n, false); 65 | TIMERSTOP(mult_seq) 66 | } 67 | // compute A * x = b in parallel three times 68 | for (uint64_t k = 0; k < 3; k++) { 69 | TIMERSTART(mult_par) 70 | mult(A, x, b, m, n, true); 71 | TIMERSTOP(mult_par) 72 | } 73 | TIMERSTOP(overall) 74 | 75 | // check if (last) result is correct 76 | for (uint64_t index = 0; index < m; index++) 77 | if (b[index] != index*(index+1)/2) 78 | std::cout << "error at position " << index 79 | << " " << b[index] << std::endl; 80 | } 81 | 82 | -------------------------------------------------------------------------------- /chapter6/softmax_regression/Makefile: -------------------------------------------------------------------------------- 1 | CXX= g++-6 2 | CXXFLAGS= -std=c++14 -O2 -fopenmp 3 | 4 | all: softmax 5 | 6 | softmax: softmax.cpp 7 | $(CXX) softmax.cpp $(CXXFLAGS) -o softmax 8 | 9 | clean: 10 | rm -rf softmax 11 | -------------------------------------------------------------------------------- /chapter6/softmax_regression/data/mnist_softmax.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # run __ONE__ of the following commands: 3 | # pip install --user tensorflow (if you have no CUDA-enabled GPU) 4 | # pip install --user tensorflow-gpu 5 | # 6 | # afterwards install tflearn 7 | # pip install --user tflearn 8 | # 9 | # Numpy should come bundled with tensorflow. Run this file et voila!
-------------------------------------------------------------------------------- /chapter6/vector_add/Makefile: --------------------------------------------------------------------------------
1 | CXX= g++
2 | CXXFLAGS= -std=c++14 -O2 -fopenmp
3 | 
4 | all: vector_add vector_add_scoped
5 | 
6 | vector_add: vector_add.cpp
7 | 	$(CXX) vector_add.cpp $(CXXFLAGS) -o vector_add
8 | 
9 | vector_add_scoped: vector_add_scoped.cpp
10 | 	$(CXX) vector_add_scoped.cpp $(CXXFLAGS) -o vector_add_scoped
11 | 
12 | clean:
13 | 	rm -rf vector_add
14 | 	rm -rf vector_add_scoped
15 | 
-------------------------------------------------------------------------------- /chapter6/vector_add/vector_add.cpp: --------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cstdint>
3 | #include <vector>
4 | #include <omp.h>
5 | 
6 | #include "../include/hpc_helpers.hpp"
7 | 
8 | int main() {
9 | 
10 |     TIMERSTART(alloc)
11 |     const uint64_t num_entries = 1UL << 30;
12 |     std::vector<no_init_t<uint64_t>> x(num_entries);
13 |     std::vector<no_init_t<uint64_t>> y(num_entries);
14 |     std::vector<no_init_t<uint64_t>> z(num_entries);
15 |     TIMERSTOP(alloc)
16 | 
17 |     TIMERSTART(init)
18 |     #pragma omp parallel for
19 |     for (uint64_t i = 0; i < num_entries; i++) {
20 |         x[i] = i;
21 |         y[i] = num_entries-i;
22 |     }
23 |     TIMERSTOP(init)
24 | 
25 |     TIMERSTART(add)
26 |     #pragma omp parallel for
27 |     for (uint64_t i = 0; i < num_entries; i++)
28 |         z[i] = x[i]+y[i];
29 |     TIMERSTOP(add)
30 | 
31 |     TIMERSTART(check)
32 |     #pragma omp parallel for
33 |     for (uint64_t i = 0; i < num_entries; i++)
34 |         if(z[i]-num_entries)
35 |             std::cout << "error at position "
36 |                       << i << std::endl;
37 |     TIMERSTOP(check)
38 | }
39 | 
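The alloc timer above is exactly where no_init_t earns its keep: a plain std::vector<T> value-initializes every element on construction, so the allocation alone already touches gigabytes of memory. A rough side-by-side sketch, assuming the chapter's include layout (shrink n if your machine has little RAM):

#include <cstdint>
#include <iostream>
#include <vector>
#include "../include/hpc_helpers.hpp"

int main() {
    const uint64_t n = 1UL << 28;            // 2 GiB per vector of uint64_t
    TIMERSTART(plain)                        // zero-initializes all n elements
    std::vector<uint64_t> a(n);
    TIMERSTOP(plain)
    TIMERSTART(no_init)                      // allocates only; elements stay uninitialized
    std::vector<no_init_t<uint64_t>> b(n);
    TIMERSTOP(no_init)
    std::cout << a.size() + b.size() << std::endl;
}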
"../include/hpc_helpers.hpp" 9 | 10 | int main() { 11 | // memory allocation for the three vectors x, y, and z 12 | // with the no_init_t template as a wrapper for the actual type 13 | TIMERSTART(alloc) 14 | const uint64_t num_entries = 1UL << 30; 15 | std::vector> x(num_entries); 16 | std::vector> y(num_entries); 17 | std::vector> z(num_entries); 18 | TIMERSTOP(alloc) 19 | 20 | TIMERSTART(alltogether) 21 | #pragma omp parallel 22 | { 23 | #pragma omp for 24 | for (uint64_t i = 0; i < num_entries; i++) { 25 | x[i] = i; 26 | y[i] = num_entries - i; 27 | } 28 | 29 | #pragma omp for 30 | for (uint64_t i = 0; i < num_entries; i++) 31 | z[i] = x[i] + y[i]; 32 | 33 | #pragma omp for 34 | for (uint64_t i = 0; i < num_entries; i++) 35 | if (z[i] - num_entries) 36 | std::cout << "error at position " 37 | << i << std::endl; 38 | } 39 | TIMERSTOP(alltogether) 40 | } 41 | -------------------------------------------------------------------------------- /chapter7/dynamic_time_warping/Makefile: -------------------------------------------------------------------------------- 1 | NVCC= nvcc 2 | NVCCFLAGS= -O2 -std=c++14 -arch=sm_61 3 | CXXFLAGS= -Xcompiler="-fopenmp -march=native" 4 | 5 | all: dtw_host dtw_device 6 | 7 | dtw_host: dtw_host.cu 8 | $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) dtw_host.cu -o dtw_host 9 | 10 | dtw_device: dtw_device.cu 11 | $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) dtw_device.cu -o dtw_device 12 | 13 | clean: 14 | rm -f dtw_host 15 | rm -f dtw_device 16 | -------------------------------------------------------------------------------- /chapter7/dynamic_time_warping/dtw_host.cu: -------------------------------------------------------------------------------- 1 | #include "../include/cbf_generator.hpp" 2 | #include "../include/hpc_helpers.hpp" 3 | #include "../include/binary_IO.hpp" 4 | 5 | typedef uint64_t index_t; 6 | typedef uint8_t label_t; 7 | typedef float value_t; 8 | 9 | template < 10 | typename index_t, 11 | typename value_t> 12 | value_t plain_dtw( 13 | value_t * query, 14 | value_t * subject, 15 | index_t num_features) { 16 | 17 | // for convenient indexing 18 | const index_t lane = num_features+1; 19 | 20 | // allocate the matrix of M 21 | value_t * penalty = new value_t[lane*lane]; 22 | 23 | // initialize the matrix M 24 | for (index_t index = 1; index < lane-1; index++) { 25 | penalty[index] = INFINITY; 26 | penalty[index*lane] = INFINITY; 27 | } 28 | penalty[0] = 0; 29 | 30 | // traverse graph in row-major order 31 | for (index_t row = 1; row < lane; row++) { 32 | 33 | const value_t q_value = query[row-1]; 34 | 35 | for (index_t col = 1; col < lane; col++) { 36 | 37 | // determine contribution from incoming edges 38 | const value_t diag = penalty[(row-1)*lane+col-1]; 39 | const value_t abve = penalty[(row-1)*lane+col+0]; 40 | const value_t left = penalty[(row+0)*lane+col-1]; 41 | 42 | // compute residue between query and subject 43 | const value_t residue = q_value-subject[col-1]; 44 | 45 | // relax node 46 | penalty[row*lane+col] = residue*residue + 47 | min(diag, 48 | min(abve, left)); 49 | } 50 | } 51 | 52 | // report the lower right cell and free memory 53 | const value_t result = penalty[lane*lane-1]; 54 | delete [] penalty; 55 | 56 | return result; 57 | } 58 | 59 | template < 60 | typename index_t, 61 | typename value_t> 62 | value_t dtw( 63 | value_t * query, 64 | value_t * subject, 65 | index_t num_features) { 66 | 67 | const index_t lane = num_features+1; 68 | value_t * penalty = new value_t[2*lane]; 69 | 70 | for (index_t index = 0; index < lane; index++) 71 | 
-------------------------------------------------------------------------------- /chapter7/dynamic_time_warping/Makefile: --------------------------------------------------------------------------------
1 | NVCC= nvcc
2 | NVCCFLAGS= -O2 -std=c++14 -arch=sm_61
3 | CXXFLAGS= -Xcompiler="-fopenmp -march=native"
4 | 
5 | all: dtw_host dtw_device
6 | 
7 | dtw_host: dtw_host.cu
8 | 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) dtw_host.cu -o dtw_host
9 | 
10 | dtw_device: dtw_device.cu
11 | 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) dtw_device.cu -o dtw_device
12 | 
13 | clean:
14 | 	rm -f dtw_host
15 | 	rm -f dtw_device
16 | 
-------------------------------------------------------------------------------- /chapter7/dynamic_time_warping/dtw_host.cu: --------------------------------------------------------------------------------
1 | #include "../include/cbf_generator.hpp"
2 | #include "../include/hpc_helpers.hpp"
3 | #include "../include/binary_IO.hpp"
4 | 
5 | typedef uint64_t index_t;
6 | typedef uint8_t label_t;
7 | typedef float value_t;
8 | 
9 | template <
10 |     typename index_t,
11 |     typename value_t>
12 | value_t plain_dtw(
13 |     value_t * query,
14 |     value_t * subject,
15 |     index_t num_features) {
16 | 
17 |     // for convenient indexing
18 |     const index_t lane = num_features+1;
19 | 
20 |     // allocate the penalty matrix M
21 |     value_t * penalty = new value_t[lane*lane];
22 | 
23 |     // initialize the matrix M
24 |     for (index_t index = 1; index < lane-1; index++) {
25 |         penalty[index] = INFINITY;
26 |         penalty[index*lane] = INFINITY;
27 |     }
28 |     penalty[0] = 0;
29 | 
30 |     // traverse graph in row-major order
31 |     for (index_t row = 1; row < lane; row++) {
32 | 
33 |         const value_t q_value = query[row-1];
34 | 
35 |         for (index_t col = 1; col < lane; col++) {
36 | 
37 |             // determine contribution from incoming edges
38 |             const value_t diag = penalty[(row-1)*lane+col-1];
39 |             const value_t abve = penalty[(row-1)*lane+col+0];
40 |             const value_t left = penalty[(row+0)*lane+col-1];
41 | 
42 |             // compute residue between query and subject
43 |             const value_t residue = q_value-subject[col-1];
44 | 
45 |             // relax node
46 |             penalty[row*lane+col] = residue*residue +
47 |                                     min(diag,
48 |                                     min(abve, left));
49 |         }
50 |     }
51 | 
52 |     // report the lower right cell and free memory
53 |     const value_t result = penalty[lane*lane-1];
54 |     delete [] penalty;
55 | 
56 |     return result;
57 | }
58 | 
59 | template <
60 |     typename index_t,
61 |     typename value_t>
62 | value_t dtw(
63 |     value_t * query,
64 |     value_t * subject,
65 |     index_t num_features) {
66 | 
67 |     const index_t lane = num_features+1;
68 |     value_t * penalty = new value_t[2*lane];
69 | 
70 |     for (index_t index = 0; index < lane; index++)
71 |         penalty[index+1] = INFINITY;
72 |     penalty[0] = 0;
73 | 
74 |     for (index_t row = 1; row < lane; row++) {
75 | 
76 |         const value_t q_value = query[row-1];
77 |         const index_t target_row = row & 1;
78 |         const index_t source_row = !target_row;
79 | 
80 |         if (row == 2)
81 |             penalty[target_row*lane] = INFINITY;
82 | 
83 |         for (index_t col = 1; col < lane; col++) {
84 | 
85 |             const value_t diag = penalty[source_row*lane+col-1];
86 |             const value_t abve = penalty[source_row*lane+col+0];
87 |             const value_t left = penalty[target_row*lane+col-1];
88 | 
89 |             const value_t residue = q_value-subject[col-1];
90 | 
91 |             penalty[target_row*lane+col] = residue*residue +
92 |                                            min(diag, min(abve, left));
93 |         }
94 |     }
95 | 
96 |     const index_t last_row = num_features & 1;
97 |     const value_t result = penalty[last_row*lane+num_features];
98 |     delete [] penalty;
99 | 
100 |     return result;
101 | }
102 | 
103 | #include <omp.h>
104 | template <
105 |     typename index_t,
106 |     typename value_t>
107 | void host_dtw(
108 |     value_t * query,
109 |     value_t * subject,
110 |     value_t * dist,
111 |     index_t num_entries,
112 |     index_t num_features) {
113 | 
114 |     # pragma omp parallel for
115 |     for (index_t entry = 0; entry < num_entries; entry++)
116 |         dist[entry] = dtw(query, subject+entry*num_features, num_features);
117 | }
118 | 
119 | int main () {
120 | 
121 |     constexpr index_t num_features = 128;
122 |     constexpr index_t num_entries = 1UL << 20;
123 | 
124 |     // small letters for hosts, capital letters for device
125 |     value_t * data = nullptr, * dist = nullptr;
126 |     label_t * labels = nullptr;
127 | 
128 |     // malloc memory
129 |     cudaMallocHost(&data, sizeof(value_t)*num_entries*num_features);   CUERR
130 |     cudaMallocHost(&dist, sizeof(value_t)*num_entries);                CUERR
131 |     cudaMallocHost(&labels, sizeof(label_t)*num_entries);              CUERR
132 | 
133 |     // create CBF data set on host
134 |     TIMERSTART(generate_data)
135 |     generate_cbf(data, labels, num_entries, num_features);
136 |     TIMERSTOP(generate_data)
137 | 
138 | 
139 |     TIMERSTART(DTW_openmp)
140 |     host_dtw(data, data, dist, num_entries, num_features);
141 |     TIMERSTOP(DTW_openmp)
142 | 
143 | 
144 |     for (index_t index = 0; index < 10; index++)
145 |         std::cout << index_t(labels[index]) << " " << dist[index] << std::endl;
146 | 
147 | 
148 |     // get rid of the memory
149 |     cudaFreeHost(labels);
150 |     cudaFreeHost(data);
151 |     cudaFreeHost(dist);
152 | }
153 | 
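The two-row variant dtw must return exactly the same penalty as the quadratic-memory plain_dtw for every input, which makes a toy comparison a handy smoke test. The snippet below is illustrative: it assumes it is pasted into dtw_host.cu (where the two templates, the typedefs, and nvcc's host-side min are available) and called from main:

// smoke test: the full-matrix and the two-row DTW must agree
void dtw_smoke_test() {
    value_t query[4]   = {0, 1, 2, 1};
    value_t subject[4] = {0, 1, 1, 1};
    std::cout << "plain: "    << plain_dtw(query, subject, index_t(4))
              << "  two-row: " << dtw(query, subject, index_t(4))
              << std::endl;   // the two printed scores must be identical
}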
-------------------------------------------------------------------------------- /chapter7/eigenfaces/Makefile: --------------------------------------------------------------------------------
1 | NVCC=nvcc
2 | NVCCFLAGS=-O3 -std=c++11 -D_FORCE_INLINES -arch=sm_61
3 | 
4 | all: mean_computation \
5 |      mean_correction_coalesced \
6 |      mean_correction_non_coalesced \
7 |      covariance_naive \
8 |      covariance_symmetric \
9 |      covariance_shared \
10 |      eigenfaces
11 | 
12 | mean_computation: mean_computation.cu
13 | 	$(NVCC) $(NVCCFLAGS) mean_computation.cu -o mean_computation
14 | 
15 | mean_correction_coalesced: mean_correction.cu
16 | 	$(NVCC) $(NVCCFLAGS) mean_correction.cu -o mean_correction_coalesced -DCOALESCED_ACCESS
17 | 
18 | mean_correction_non_coalesced: mean_correction.cu
19 | 	$(NVCC) $(NVCCFLAGS) mean_correction.cu -o mean_correction_non_coalesced
20 | 
21 | covariance_naive: covariance.cu
22 | 	$(NVCC) $(NVCCFLAGS) covariance.cu -o covariance_naive -DCOV_MODE_NAIVE
23 | 
24 | covariance_symmetric: covariance.cu
25 | 	$(NVCC) $(NVCCFLAGS) covariance.cu -o covariance_symmetric -DCOV_MODE_SYMMETRIC
26 | 
27 | covariance_shared: covariance.cu
28 | 	$(NVCC) $(NVCCFLAGS) covariance.cu -o covariance_shared
29 | 
30 | eigenfaces: eigenfaces.cu
31 | 	$(NVCC) $(NVCCFLAGS) eigenfaces.cu -o eigenfaces -lcusolver
32 | 
33 | clean:
34 | 	rm -f mean_computation \
35 | 	      mean_correction_coalesced \
36 | 	      mean_correction_non_coalesced \
37 | 	      covariance_naive \
38 | 	      covariance_symmetric \
39 | 	      covariance_shared \
40 | 	      eigenfaces
41 | 
-------------------------------------------------------------------------------- /chapter7/eigenfaces/data/convert_images.py: --------------------------------------------------------------------------------
1 | import os
2 | import array as ar
3 | import numpy as np
4 | from scipy.misc import imread   # note: removed in SciPy >= 1.2; imageio.imread is a drop-in replacement
5 | from scipy.linalg import svd
6 | 
7 | # specify the CelebA folder
8 | dirname = "./img_align_celeba/"
9 | 
10 | files = [filename for (dirpath, dirnames, filenames) in os.walk(dirname)
11 |          for filename in filenames if filename[-4:] == ".jpg"]
12 | 
13 | if len(files) == 0:
14 |     print("ERROR: goto folder img_align_celeba and inspect the README file")
15 |     import sys
16 |     sys.exit(1)
17 | 
18 | # if you want to subsample in index space
19 | # files = files[::10]
20 | 
21 | # downsample the resolution by 4
22 | subx, suby = 4, 4
23 | dimx, dimy = (218+subx-1)//subx, (178+suby-1)//suby   # ceil(218/4) = 55, ceil(178/4) = 45
24 | 
25 | data = np.zeros((len(files), dimx*dimy), dtype=np.float32)
26 | print(dimx*dimy)
27 | 
28 | for index, filename in enumerate(files):
29 |     if index % 1000 == 0:
30 |         print(index)
31 |     data[index] = np.mean(imread(dirname+filename), axis=2)[::subx,::suby].flatten()
32 | 
33 | with open("celebA_gray_lowres.%d_%d_%d_32.bin" % (data.shape[0], dimx, dimy), "wb") as f:
34 |     f.write(ar.array("f", data.flatten()))
35 | 
-------------------------------------------------------------------------------- /chapter7/eigenfaces/data/img_align_celeba/README.md: --------------------------------------------------------------------------------
1 | ### Instructions
2 | 
3 | - navigate to the [CelebA website](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)
4 | - download the "Align&Cropped Images" zip container
5 | - unzip it in this folder
6 | - run the conversion script in the parent dir
7 | 
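The converter encodes the array shape in the file name (202599 images of 55 x 45 float32 pixels). Before handing the file to the CUDA programs below, it is cheap to verify that the size on disk matches that shape; a small stand-alone sketch (the path is the one the converter writes):

#include <cstdint>
#include <fstream>
#include <iostream>

int main() {
    const uint64_t imgs = 202599, rows = 55, cols = 45;
    const uint64_t expected = imgs*rows*cols*sizeof(float);
    std::ifstream f("celebA_gray_lowres.202599_55_45_32.bin",
                    std::ios::binary | std::ios::ate);    // open positioned at the end
    if (!f || uint64_t(f.tellg()) != expected)
        std::cerr << "file missing or truncated; expected "
                  << expected << " bytes" << std::endl;
}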
-------------------------------------------------------------------------------- /chapter7/eigenfaces/mean_computation.cu: --------------------------------------------------------------------------------
1 | #include "../include/hpc_helpers.hpp"
2 | #include "../include/binary_IO.hpp"
3 | #include "../include/bitmap_IO.hpp"
4 | 
5 | template <
6 |     typename index_t,
7 |     typename value_t> __global__
8 | void compute_mean_kernel(
9 |     value_t * Data,
10 |     value_t * Mean,
11 |     index_t num_entries,
12 |     index_t num_features);
13 | 
14 | int main (int argc, char * argv[]) {
15 | 
16 |     // set the identifier of the used CUDA device
17 |     cudaSetDevice(0);
18 | 
19 |     // 202599 grayscale images each of shape 55 x 45
20 |     constexpr uint64_t imgs = 202599, rows = 55, cols = 45;
21 | 
22 |     // pointer for data matrix and mean vector
23 |     float * data = nullptr, * mean = nullptr;
24 |     cudaMallocHost(&data, sizeof(float)*imgs*rows*cols);               CUERR
25 |     cudaMallocHost(&mean, sizeof(float)*rows*cols);                    CUERR
26 | 
27 |     // allocate storage on GPU
28 |     float * Data = nullptr, * Mean = nullptr;
29 |     cudaMalloc(&Data, sizeof(float)*imgs*rows*cols);                   CUERR
30 |     cudaMalloc(&Mean, sizeof(float)*rows*cols);                        CUERR
31 | 
32 |     // load data matrix from disk
33 |     TIMERSTART(read_data_from_disk)
34 |     std::string file_name = "./data/celebA_gray_lowres.202599_55_45_32.bin";
35 |     load_binary(data, imgs*rows*cols, file_name);
36 |     TIMERSTOP(read_data_from_disk)
37 | 
38 |     // copy data to device and reset Mean
39 |     TIMERSTART(data_H2D)
40 |     cudaMemcpy(Data, data, sizeof(float)*imgs*rows*cols,
41 |                cudaMemcpyHostToDevice);                                CUERR
42 |     cudaMemset(Mean, 0, sizeof(float)*rows*cols);                      CUERR
43 |     TIMERSTOP(data_H2D)
44 | 
45 |     // compute mean: one thread per pixel (any grid covering rows*cols works)
46 |     TIMERSTART(compute_mean_kernel)
47 |     compute_mean_kernel<<<SDIV(rows*cols, 32), 32>>>
48 |         (Data, Mean, imgs, rows*cols);                                 CUERR
49 |     TIMERSTOP(compute_mean_kernel)
50 | 
51 | 
52 |     // transfer mean back to host
53 |     TIMERSTART(mean_D2H)
54 |     cudaMemcpy(mean, Mean, sizeof(float)*rows*cols,
55 |                cudaMemcpyDeviceToHost);                                CUERR
56 |     TIMERSTOP(mean_D2H)
57 | 
58 |     // write mean image to disk
59 |     TIMERSTART(write_mean_image_to_disk)
60 |     dump_bitmap(mean, rows, cols, "./imgs/celebA_mean.bmp");
61 |     TIMERSTOP(write_mean_image_to_disk)
62 | 
63 |     // get rid of the memory
64 |     cudaFreeHost(data);                                                CUERR
65 |     cudaFreeHost(mean);                                                CUERR
66 |     cudaFree(Data);                                                    CUERR
67 |     cudaFree(Mean);                                                    CUERR
68 | 
69 | }
70 | 
71 | template <
72 |     typename index_t,
73 |     typename value_t> __global__
74 | void compute_mean_kernel(
75 |     value_t * Data,
76 |     value_t * Mean,
77 |     index_t num_entries,
78 |     index_t num_features) {
79 | 
80 |     auto thid = blockDim.x*blockIdx.x + threadIdx.x;
81 | 
82 |     if (thid < num_features) {
83 | 
84 |         value_t accum = 0;
85 | 
86 |         # pragma unroll 32
87 |         for (index_t entry = 0; entry < num_entries; entry++)
88 |             accum += Data[entry*num_features+thid];
89 | 
90 |         Mean[thid] = accum/num_entries;
91 |     }
92 | }
93 | 
94 | 
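The launch configuration of compute_mean_kernel only has to cover one thread per pixel; the grid size uses the SDIV macro (ceiling division) from hpc_helpers.hpp, and the block size of 32 is an assumption — any multiple of the warp size works, since surplus threads are masked out by the kernel's if-guard. The arithmetic, spelled out as a host-side sketch:

#include <cstdint>
#include <iostream>

#define SDIV(x,y) (((x)+(y)-1)/(y))   // ceiling division, as defined in hpc_helpers.hpp

int main() {
    const uint64_t rows = 55, cols = 45;           // 2475 pixels per image
    const uint64_t block = 32;                     // assumed block size
    const uint64_t grid  = SDIV(rows*cols, block); // 78 blocks
    std::cout << grid << " blocks x " << block << " threads = "
              << grid*block << " >= " << rows*cols << std::endl;
}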
-------------------------------------------------------------------------------- /chapter7/eigenfaces/mean_correction.cu: --------------------------------------------------------------------------------
1 | #include "../include/hpc_helpers.hpp"
2 | #include "../include/binary_IO.hpp"
3 | 
4 | template <
5 |     typename index_t,
6 |     typename value_t> __global__
7 | void compute_mean_kernel(
8 |     value_t * Data,
9 |     value_t * Mean,
10 |     index_t num_entries,
11 |     index_t num_features) {
12 | 
13 |     auto thid = blockDim.x*blockIdx.x + threadIdx.x;
14 | 
15 |     if (thid < num_features) {
16 | 
17 |         value_t accum = 0;
18 | 
19 |         # pragma unroll 32
20 |         for (index_t entry = 0; entry < num_entries; entry++)
21 |             accum += Data[entry*num_features+thid];
22 | 
23 |         Mean[thid] = accum/num_entries;
24 |     }
25 | }
26 | 
27 | template <
28 |     typename index_t,
29 |     typename value_t> __global__
30 | void correction_kernel(
31 |     value_t * Data,
32 |     value_t * Mean,
33 |     index_t num_entries,
34 |     index_t num_features) {
35 | 
36 |     auto thid = blockDim.x*blockIdx.x + threadIdx.x;
37 | 
38 |     if (thid < num_features) {
39 | 
40 |         value_t value = Mean[thid];
41 | 
42 |         for (index_t entry = 0; entry < num_entries; entry++)
43 |             Data[entry*num_features+thid] -= value; // consecutive threads hit consecutive addresses: coalesced
44 | 
45 |     }
46 | }
47 | 
48 | template <
49 |     typename index_t,
50 |     typename value_t> __global__
51 | void correction_kernel_ortho(
52 |     value_t * Data,
53 |     value_t * Mean,
54 |     index_t num_entries,
55 |     index_t num_features) {
56 | 
57 |     auto thid = blockDim.x*blockIdx.x + threadIdx.x;
58 | 
59 |     if (thid < num_entries) {
60 | 
61 |         for (index_t feature = 0; feature < num_features; feature++)
62 |             Data[thid*num_features+feature] -= Mean[feature]; // one thread strides over a whole row: non-coalesced
63 |     }
64 | }
65 | 
66 | int main (int argc, char * argv[]) {
67 | 
68 |     // set the identifier of the used CUDA device
69 |     cudaSetDevice(0);
70 | 
71 |     // 202599 grayscale images each of shape 55 x 45
72 |     constexpr uint64_t imgs = 202599, rows = 55, cols = 45;
73 | 
74 |     // pointer for data matrix and mean vector
75 |     float * data = nullptr;
76 |     cudaMallocHost(&data, sizeof(float)*imgs*rows*cols);               CUERR
77 | 
78 |     // allocate storage on GPU
79 |     float * Data = nullptr, * Mean = nullptr;
80 |     cudaMalloc(&Data, sizeof(float)*imgs*rows*cols);                   CUERR
81 |     cudaMalloc(&Mean, sizeof(float)*rows*cols);                        CUERR
82 | 
83 |     // load data matrix from disk
84 |     TIMERSTART(read_data_from_disk)
85 |     auto file_name = "./data/celebA_gray_lowres.202599_55_45_32.bin";
86 |     load_binary(data, imgs*rows*cols, file_name);
87 |     TIMERSTOP(read_data_from_disk)
88 | 
89 |     // copy data to device and reset Mean
90 |     TIMERSTART(data_H2D)
91 |     cudaMemcpy(Data, data, sizeof(float)*imgs*rows*cols,
92 |                cudaMemcpyHostToDevice);                                CUERR
93 |     cudaMemset(Mean, 0, sizeof(float)*rows*cols);                      CUERR
94 |     TIMERSTOP(data_H2D)
95 | 
96 |     // compute mean: one thread per pixel (any grid covering rows*cols works)
97 |     TIMERSTART(compute_mean_kernel)
98 |     compute_mean_kernel<<<SDIV(rows*cols, 32), 32>>>
99 |         (Data, Mean, imgs, rows*cols);                                 CUERR
100 |     TIMERSTOP(compute_mean_kernel)
101 | 
102 |     // correct mean
103 |     TIMERSTART(correction_kernel)
104 |     #ifdef COALESCED_ACCESS
105 |     correction_kernel<<<SDIV(rows*cols, 32), 32>>>
106 |         (Data, Mean, imgs, rows*cols);                                 CUERR
107 |     #else
108 |     correction_kernel_ortho<<<SDIV(imgs, 32), 32>>>
109 |         (Data, Mean, imgs, rows*cols);                                 CUERR
110 |     #endif
111 |     TIMERSTOP(correction_kernel)
112 | 
113 |     // get rid of the memory
114 |     cudaFreeHost(data);                                                CUERR
115 |     cudaFree(Data);                                                    CUERR
116 |     cudaFree(Mean);                                                    CUERR
117 | 
118 | }
119 | 
-------------------------------------------------------------------------------- /chapter7/hello_world/Makefile: --------------------------------------------------------------------------------
1 | NVCC=nvcc
2 | 
3 | all: hello_world
4 | 
5 | hello_world: hello_world.cu
6 | 	$(NVCC) -O2 -std=c++11 hello_world.cu -o hello_world
7 | 
8 | clean:
9 | 	rm -f hello_world
10 | 
-------------------------------------------------------------------------------- /chapter7/hello_world/hello_world.cu: --------------------------------------------------------------------------------
1 | #include <cstdio>       // printf
2 | 
3 | __global__ void hello_kernel() {
4 | 
5 |     // calculate global thread identifier, note blockIdx.x=0 here
6 |     const auto thid = blockDim.x*blockIdx.x + threadIdx.x;
7 | 
8 |     // print a greeting message
9 |     printf("Hello from thread %d!\n", thid);
10 | }
11 | 
12 | // compile with: nvcc hello_world.cu -std=c++11 -O3
13 | // output:
14 | // Hello from thread 0!
15 | // Hello from thread 1!
16 | // Hello from thread 2!
17 | // Hello from thread 3!
18 | 19 | int main (int argc, char * argv[]) { 20 | 21 | // set the ID of the CUDA device 22 | cudaSetDevice(0); 23 | 24 | // invoke kernel using 4 threads executed in 1 thread block 25 | hello_kernel<<<1, 4>>>(); 26 | 27 | // synchronize the GPU preventing premature termination 28 | cudaDeviceSynchronize(); 29 | } 30 | -------------------------------------------------------------------------------- /chapter7/include: -------------------------------------------------------------------------------- 1 | ../include/ -------------------------------------------------------------------------------- /chapter8/include: -------------------------------------------------------------------------------- 1 | ../include/ -------------------------------------------------------------------------------- /chapter8/intrinsics_and_atomics/Makefile: -------------------------------------------------------------------------------- 1 | NVCC= nvcc 2 | NVCCFLAGS= -O2 -std=c++14 -arch=sm_61 3 | 4 | all: znorm atomics cas 5 | 6 | znorm: znorm.cu 7 | $(NVCC) $(NVCCFLAGS) znorm.cu -o znorm 8 | 9 | atomics: atomics.cu 10 | $(NVCC) $(NVCCFLAGS) atomics.cu -o atomics 11 | 12 | cas: cas.cu 13 | $(NVCC) $(NVCCFLAGS) cas.cu -o cas 14 | 15 | clean: 16 | rm -f znorm 17 | rm -f atomics 18 | rm -f cas 19 | -------------------------------------------------------------------------------- /chapter8/intrinsics_and_atomics/atomics.cu: -------------------------------------------------------------------------------- 1 | #include "../include/cbf_generator.hpp" 2 | #include "../include/hpc_helpers.hpp" 3 | 4 | typedef uint64_t index_t; 5 | typedef uint8_t label_t; 6 | typedef float value_t; 7 | 8 | template < 9 | typename index_t, 10 | typename value_t, 11 | index_t warp_size=32> __global__ 12 | void global_reduction_kernel( 13 | value_t * Input, // pointer to the data 14 | value_t * Output, // pointer to the result 15 | index_t length) { // number of entries (n) 16 | 17 | // get thread and block identifiers 18 | const index_t thid = threadIdx.x; 19 | const index_t blid = blockIdx.x; 20 | const index_t base = blid*warp_size; 21 | 22 | // store entries in registers 23 | value_t x = 0; 24 | if (base+thid < length) 25 | x = Input[base+thid]; 26 | 27 | // do the Kepler shuffle 28 | for (index_t offset = warp_size/2; offset > 0; offset /= 2) 29 | x += __shfl_down(x, offset, warp_size); 30 | 31 | // write down result 32 | if (thid == 0) 33 | atomicAdd(Output, x); 34 | } 35 | 36 | template < 37 | typename index_t, 38 | typename value_t, 39 | index_t warp_size=32> __global__ 40 | void static_reduction_kernel( 41 | value_t * Input, // pointer to the data 42 | value_t * Output, // pointer to the result 43 | index_t length) { // number of entries (n) 44 | 45 | // get global thread identifier 46 | const index_t thid = blockDim.x*blockIdx.x+threadIdx.x; 47 | 48 | // here we store the result 49 | value_t accum = value_t(0); 50 | 51 | // block-cyclic summation over all spawned blocks 52 | for (index_t i = thid; i < length; i += blockDim.x*gridDim.x) 53 | accum += Input[i]; 54 | 55 | // reduce all values within a warp 56 | for (index_t offset = warp_size/2; offset > 0; offset /= 2) 57 | accum += __shfl_down(accum, offset, warp_size); 58 | 59 | // first thread of every warp writes result 60 | if (thid % 32 == 0) 61 | atomicAdd(Output, accum); 62 | } 63 | 64 | 65 | int main () { 66 | 67 | constexpr index_t num_features = 32; 68 | constexpr index_t num_entries = 1UL << 10; 69 | 70 | // small letters for hosts, capital letters for device 71 | value_t * data 
= nullptr, * result = nullptr,
72 |             * Data = nullptr, * Result = nullptr;
73 |     label_t * labels = nullptr;
74 | 
75 |     // malloc memory
76 |     cudaMallocHost(&data, sizeof(value_t)*num_entries*num_features);   CUERR
77 |     cudaMalloc   (&Data, sizeof(value_t)*num_entries*num_features);    CUERR
78 |     cudaMallocHost(&result, sizeof(value_t));                          CUERR
79 |     cudaMalloc   (&Result, sizeof(value_t));                           CUERR
80 |     cudaMallocHost(&labels, sizeof(label_t)*num_entries);              CUERR
81 | 
82 |     // create CBF data set on host
83 |     TIMERSTART(generate_data)
84 |     generate_cbf(data, labels, num_entries, num_features);
85 |     TIMERSTOP(generate_data)
86 | 
87 |     TIMERSTART(copy_data_to_device)
88 |     cudaMemcpy(Data, data, sizeof(value_t)*num_entries*num_features, H2D); CUERR
89 |     cudaMemset(Result, 0, sizeof(value_t));
90 |     TIMERSTOP(copy_data_to_device)
91 | 
92 |     value_t accum = 0;
93 |     for (index_t i = 0; i < num_entries*num_features; i++)
94 |         accum += data[i];
95 |     std::cout << accum << std::endl;
96 | 
97 |     TIMERSTART(global_reduction)
98 |     global_reduction_kernel<<<SDIV(num_entries*num_features, 32), 32>>>
99 |         (Data, Result, num_entries*num_features);                      CUERR
100 |     TIMERSTOP(global_reduction)
101 | 
102 |     TIMERSTART(static_reduction)
103 |     static_reduction_kernel<<<32, 32>>>(Data, Result, num_entries*num_features); CUERR // note: Result still holds the first sum, so both kernels accumulate into it
104 |     TIMERSTOP(static_reduction)
105 | 
106 |     TIMERSTART(copy_data_to_host)
107 |     cudaMemcpy(result, Result, sizeof(value_t), D2H);                  CUERR
108 |     TIMERSTOP(copy_data_to_host)
109 | 
110 | 
111 |     std::cout << *result << std::endl;
112 | 
113 |     // get rid of the memory
114 |     cudaFreeHost(labels);
115 |     cudaFreeHost(result);
116 |     cudaFreeHost(data);
117 |     cudaFree(Result);
118 |     cudaFree(Data);
119 | 
120 | }
121 | 
-------------------------------------------------------------------------------- /chapter8/intrinsics_and_atomics/cas.cu: --------------------------------------------------------------------------------
1 | #include "../include/hpc_helpers.hpp"
2 | 
3 | __device__ __forceinline__
4 | int atomicUpdateResultBoundedByTwo(
5 |     int* address,
6 |     int value) {
7 | 
8 |     // get the source value stored at address
9 |     int source = *address, expected;
10 | 
11 |     do {
12 |         // we expect source
13 |         expected = source;
14 | 
15 |         // compute our custom binary operation
16 |         int target = expected+value+expected*value;
17 | 
18 |         // check the constraint: reject results outside [0, 10)
19 |         if (target < 0 || target >= 10)
20 |             return source;
21 | 
22 |         // try to swap the values
23 |         source = atomicCAS(address, expected, target);
24 | 
25 |         // (expected == source) on success
26 |     } while (expected != source);
27 | 
28 |     return source;
29 | }
30 | 
31 | __global__
32 | void apply_kernel(int * source_address, int value) {
33 |     if (blockIdx.x == 0 && threadIdx.x == 0)
34 |         atomicUpdateResultBoundedByTwo(source_address, value);
35 | 
36 | }
37 | 
38 | 
39 | int main () {
40 |     int * data = nullptr;
41 |     cudaMallocHost(&data, sizeof(int));                                CUERR
42 | 
43 |     *data = 0;
44 |     apply_kernel<<<1, 1>>> (data, 10);
45 | 
46 |     cudaDeviceSynchronize();
47 | 
48 |     std::cout << * data << std::endl;
49 | 
50 |     cudaFreeHost(data);   // allocated with cudaMallocHost, so free the pinned allocation accordingly
51 | }
52 | 
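atomicCAS returns the value it actually found at the address, so the loop above retries until no other thread has modified the slot between the read and the swap — the standard recipe for building arbitrary atomic updates. The same pattern on the host with std::atomic, as a sketch (not from the repository; the [0, 10) bound mirrors the device code):

#include <atomic>
#include <iostream>

int bounded_update(std::atomic<int>& slot, int value) {
    int expected = slot.load();
    int desired;
    do {
        desired = expected + value + expected*value;  // the custom binary operation
        if (desired < 0 || desired >= 10)             // constraint: stay inside [0, 10)
            return expected;
        // on failure, compare_exchange_strong reloads 'expected' with the current value
    } while (!slot.compare_exchange_strong(expected, desired));
    return expected;
}

int main() {
    std::atomic<int> slot(0);
    bounded_update(slot, 3);               // 0+3+0*3 = 3  -> accepted
    bounded_update(slot, 3);               // 3+3+3*3 = 15 -> rejected, slot keeps 3
    std::cout << slot.load() << std::endl; // prints 3
}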
-------------------------------------------------------------------------------- /chapter8/intrinsics_and_atomics/znorm.cu: --------------------------------------------------------------------------------
1 | #include "../include/cbf_generator.hpp"
2 | #include "../include/hpc_helpers.hpp"
3 | 
4 | typedef uint64_t index_t;
5 | typedef uint8_t label_t;
6 | typedef float value_t;
7 | 
8 | __forceinline__ __device__
9 | double cuda_rsqrt(const double& value) {
10 |     return rsqrt(value);
11 | }
12 | 
13 | __forceinline__ __device__
14 | float cuda_rsqrt(const float& value) {
15 |     return rsqrtf(value);
16 | }
17 | 
18 | template <
19 |     typename index_t,
20 |     typename value_t> __global__
21 | void znorm_kernel(
22 |     value_t * Subject,      // pointer to the subject
23 |     index_t num_entries,    // number of time series (m)
24 |     index_t num_features) { // number of time ticks (n)
25 | 
26 |     // get thread and block identifiers
27 |     const index_t blid = blockIdx.x;
28 |     const index_t thid = threadIdx.x;
29 |     const index_t base = blid*num_features;
30 | 
31 |     // 1. coalesced loading of entries
32 |     value_t v = Subject[base+thid];
33 |     value_t x = v; // copy for later
34 | 
35 |     // 2a. perform a warp reduction (sum stored in thread zero)
36 |     for (index_t offset = num_features/2; offset > 0; offset /= 2)
37 |         x += __shfl_down(x, offset, num_features); // pre-Volta intrinsic; __shfl_down_sync on newer GPUs
38 | 
39 |     // 2b. perform the first broadcast
40 |     value_t mu = __shfl(x, 0)/num_features;
41 | 
42 |     // define the square residues
43 |     value_t y = (v-mu)*(v-mu);
44 | 
45 |     // 3a. perform a warp reduction (sum stored in thread zero)
46 |     for (index_t offset = num_features/2; offset > 0; offset /= 2)
47 |         y += __shfl_down(y, offset, num_features);
48 | 
49 |     // 3b. perform the second broadcast
50 |     value_t sigma = __shfl(y, 0)/(num_features-1);
51 | 
52 |     // 4. write result back
53 |     Subject[base+thid] = (v-mu)*cuda_rsqrt(sigma);
54 | }
55 | 
56 | int main () {
57 | 
58 |     constexpr index_t num_features = 32;
59 |     constexpr index_t num_entries = 1UL << 20;
60 | 
61 |     // small letters for hosts, capital letters for device
62 |     value_t * data = nullptr, * Data = nullptr;
63 |     label_t * labels = nullptr;
64 | 
65 |     // malloc memory
66 |     cudaMallocHost(&data, sizeof(value_t)*num_entries*num_features);   CUERR
67 |     cudaMalloc   (&Data, sizeof(value_t)*num_entries*num_features);    CUERR
68 |     cudaMallocHost(&labels, sizeof(label_t)*num_entries);              CUERR
69 | 
70 |     // create CBF data set on host
71 |     TIMERSTART(generate_data)
72 |     generate_cbf(data, labels, num_entries, num_features);
73 |     TIMERSTOP(generate_data)
74 | 
75 |     TIMERSTART(copy_data_to_device)
76 |     cudaMemcpy(Data, data, sizeof(value_t)*num_entries*num_features, H2D); CUERR
77 |     TIMERSTOP(copy_data_to_device)
78 | 
79 | 
80 |     TIMERSTART(z_norm)
81 |     znorm_kernel<<<num_entries, num_features>>>(Data, num_entries, num_features); CUERR // one block per series; the kernel requires blockDim.x == num_features
82 |     TIMERSTOP(z_norm)
83 | 
84 |     TIMERSTART(copy_data_to_host)
85 |     cudaMemcpy(data, Data, sizeof(value_t)*num_entries*num_features, D2H); CUERR
86 |     TIMERSTOP(copy_data_to_host)
87 | 
88 | 
89 |     value_t accum = 0, accum2=0;
90 |     for (index_t i = 0; i < 32; i++) {
91 |         accum += data[i];
92 |         accum2 += data[i]*data[i];
93 |     }
94 | 
95 |     std::cout << accum << " " << accum2 << std::endl;
96 | 
97 |     // get rid of the memory
98 |     cudaFreeHost(labels);
99 |     cudaFreeHost(data);
100 |     cudaFree(Data);
101 | }
102 | 
-------------------------------------------------------------------------------- /chapter8/multi_gpu/Makefile: --------------------------------------------------------------------------------
1 | NVCC= nvcc
2 | NVCCFLAGS= -O2 -std=c++14 -arch=sm_61
3 | 
4 | all: single_gpu multi_gpu streamed_gpu multi_streamed_gpu
5 | 
6 | single_gpu: single_gpu.cu
7 | 	$(NVCC) $(NVCCFLAGS) single_gpu.cu -o single_gpu
8 | 
9 | multi_gpu: multi_gpu.cu
10 | 	$(NVCC) $(NVCCFLAGS) multi_gpu.cu -o multi_gpu
11 | 
12 | streamed_gpu: streamed_gpu.cu
13 | 	$(NVCC) $(NVCCFLAGS) streamed_gpu.cu -o streamed_gpu
14 | 
15 | multi_streamed_gpu: multi_streamed_gpu.cu
16 | 	$(NVCC) $(NVCCFLAGS) multi_streamed_gpu.cu -o multi_streamed_gpu
17 | 
18 | clean:
19 | 	rm -f
single_gpu 20 | rm -f multi_gpu 21 | rm -f streamed_gpu 22 | rm -f multi_streamed_gpu 23 | -------------------------------------------------------------------------------- /chapter8/multi_gpu/multi_gpu.cu: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | template < 4 | typename index_t, 5 | typename value_t, 6 | index_t num_iters=256> __global__ 7 | void square_root_kernel( 8 | value_t * Data, 9 | index_t length) { 10 | 11 | const index_t thid = blockDim.x*blockIdx.x+threadIdx.x; 12 | 13 | for (index_t i = thid; i < length; i += blockDim.x*gridDim.x){ 14 | 15 | value_t value = Data[i]; 16 | value_t root = value; 17 | 18 | # pragma unroll (32) 19 | for (index_t iters = 0; iters < num_iters && value; iters++) 20 | root = 0.5*(root+value/root); 21 | 22 | Data[i] = root; 23 | } 24 | } 25 | 26 | int main () { 27 | 28 | typedef float value_t; 29 | typedef uint64_t index_t; 30 | 31 | const index_t length = 1UL << 30; 32 | 33 | int num_gpus; 34 | cudaGetDeviceCount(&num_gpus); 35 | const index_t batch_size = length/num_gpus; 36 | 37 | value_t * data = nullptr, * Data[num_gpus]; 38 | 39 | cudaMallocHost(&data, sizeof(value_t)*length); CUERR 40 | 41 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 42 | cudaSetDevice(gpu); 43 | cudaMalloc(&Data[gpu], sizeof(value_t)*batch_size); CUERR 44 | } 45 | 46 | for (index_t index = 0; index < length; index++) 47 | data[index] = index; 48 | 49 | TIMERSTART(overall) 50 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 51 | const index_t offset = gpu*batch_size; 52 | cudaSetDevice(gpu); CUERR 53 | cudaMemcpy(Data[gpu], data+offset, sizeof(value_t)*batch_size, 54 | cudaMemcpyHostToDevice); CUERR 55 | } 56 | 57 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 58 | cudaSetDevice(gpu); CUERR 59 | square_root_kernel<<<1024, 1024>>>(Data[gpu], batch_size); CUERR 60 | } 61 | 62 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 63 | const index_t offset = gpu*batch_size; 64 | cudaSetDevice(gpu); CUERR 65 | cudaMemcpy(data+offset, Data[gpu], sizeof(value_t)*batch_size, 66 | cudaMemcpyDeviceToHost); CUERR 67 | } 68 | TIMERSTOP(overall) 69 | 70 | for (index_t index = 0; index < length; index += batch_size/10) 71 | std::cout << index << " " << data[index] << std::endl; 72 | 73 | cudaFreeHost(data); CUERR 74 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 75 | cudaSetDevice(gpu); 76 | cudaFree(Data[gpu]); CUERR 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /chapter8/multi_gpu/multi_streamed_gpu.cu: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | template < 4 | typename index_t, 5 | typename value_t, 6 | index_t num_iters=256> __global__ 7 | void square_root_kernel( 8 | value_t * Data, 9 | index_t length) { 10 | 11 | const index_t thid = blockDim.x*blockIdx.x+threadIdx.x; 12 | 13 | for (index_t i = thid; i < length; i += blockDim.x*gridDim.x){ 14 | 15 | value_t value = Data[i]; 16 | value_t root = value; 17 | 18 | # pragma unroll (32) 19 | for (index_t iters = 0; iters < num_iters && value; iters++) 20 | root = 0.5*(root+value/root); 21 | 22 | Data[i] = root; 23 | } 24 | } 25 | 26 | int main () { 27 | 28 | typedef float value_t; 29 | typedef uint64_t index_t; 30 | 31 | const index_t length = 1UL << 30; 32 | const index_t num_streams = 32; 33 | 34 | int num_gpus; 35 | cudaGetDeviceCount(&num_gpus); 36 | const index_t batch_size = length/(num_gpus*num_streams); 37 
| 38 | value_t * data = nullptr, * Data[num_gpus]; 39 | cudaStream_t streams[num_gpus][num_streams]; 40 | 41 | cudaMallocHost(&data, sizeof(value_t)*length); CUERR 42 | 43 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 44 | cudaSetDevice(gpu); 45 | cudaMalloc(&Data[gpu], 46 | sizeof(value_t)*batch_size*num_streams); CUERR 47 | 48 | for (index_t streamID = 0; streamID < num_streams; streamID++) 49 | cudaStreamCreate(&streams[gpu][streamID]); CUERR 50 | } 51 | 52 | for (index_t index = 0; index < length; index++) 53 | data[index] = index; 54 | 55 | TIMERSTART(overall) 56 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 57 | const index_t offset = gpu*num_streams*batch_size; 58 | cudaSetDevice(gpu); CUERR 59 | 60 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 61 | const index_t loc_off = streamID*batch_size; 62 | const index_t glb_off = loc_off+offset; 63 | cudaMemcpyAsync(Data[gpu]+loc_off, data+glb_off, 64 | sizeof(value_t)*batch_size, 65 | cudaMemcpyHostToDevice, 66 | streams[gpu][streamID]); CUERR 67 | } 68 | } 69 | 70 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 71 | cudaSetDevice(gpu); CUERR 72 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 73 | const index_t offset = streamID*batch_size; 74 | square_root_kernel<<<1024, 1024, 0, streams[gpu][streamID]>>> 75 | (Data[gpu]+offset, batch_size); CUERR 76 | } 77 | } 78 | 79 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 80 | const index_t offset = gpu*num_streams*batch_size; 81 | cudaSetDevice(gpu); CUERR 82 | 83 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 84 | const index_t loc_off = streamID*batch_size; 85 | const index_t glb_off = loc_off+offset; 86 | cudaMemcpyAsync(data+glb_off, Data[gpu]+loc_off, 87 | sizeof(value_t)*batch_size, 88 | cudaMemcpyDeviceToHost, 89 | streams[gpu][streamID]); CUERR 90 | } 91 | } 92 | 93 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 94 | cudaSetDevice(gpu); 95 | cudaDeviceSynchronize(); 96 | } 97 | TIMERSTOP(overall) 98 | 99 | 100 | for (index_t index = 0; index < length; index += batch_size/10) 101 | std::cout << index << " " << data[index] << std::endl; 102 | 103 | cudaFreeHost(data); CUERR 104 | for (index_t gpu = 0; gpu < num_gpus; gpu++) { 105 | cudaSetDevice(gpu); 106 | cudaFree(Data[gpu]); CUERR 107 | 108 | for (index_t streamID = 0; streamID < num_streams; streamID++) 109 | cudaStreamDestroy(streams[gpu][streamID]); CUERR 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /chapter8/multi_gpu/single_gpu.cu: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | template < 4 | typename index_t, 5 | typename value_t, 6 | index_t num_iters=256> __global__ 7 | void square_root_kernel( 8 | value_t * Data, 9 | index_t length) { 10 | 11 | const index_t thid = blockDim.x*blockIdx.x+threadIdx.x; 12 | 13 | for (index_t i = thid; i < length; i += blockDim.x*gridDim.x){ 14 | 15 | value_t value = Data[i]; 16 | value_t root = value; 17 | 18 | # pragma unroll (32) 19 | for (index_t iters = 0; iters < num_iters && value; iters++) 20 | root = 0.5*(root+value/root); 21 | 22 | Data[i] = root; 23 | } 24 | } 25 | 26 | int main () { 27 | 28 | typedef float value_t; 29 | typedef uint64_t index_t; 30 | 31 | const index_t length = 1UL << 30; 32 | 33 | value_t * data = nullptr, * Data = nullptr; 34 | 35 | cudaMallocHost(&data, sizeof(value_t)*length); CUERR 36 | cudaMalloc (&Data, sizeof(value_t)*length); CUERR 37 | 38 | for 
(index_t index = 0; index < length; index++) 39 | data[index] = index; 40 | 41 | TIMERSTART(overall) 42 | TIMERSTART(host_to_device) 43 | cudaMemcpy(Data, data, sizeof(value_t)*length, 44 | cudaMemcpyHostToDevice); CUERR 45 | TIMERSTOP(host_to_device) 46 | 47 | TIMERSTART(square_root_kernel) 48 | square_root_kernel<<<1024, 1024>>>(Data, length); CUERR 49 | TIMERSTOP(square_root_kernel) 50 | 51 | TIMERSTART(device_to_host) 52 | cudaMemcpy(data, Data, sizeof(value_t)*length, 53 | cudaMemcpyDeviceToHost); CUERR 54 | TIMERSTOP(device_to_host) 55 | TIMERSTOP(overall) 56 | 57 | for (index_t index = 0; index < 10; index++) 58 | std::cout << index << " " << data[index] << std::endl; 59 | 60 | cudaFreeHost(data); CUERR 61 | cudaFree(Data); CUERR 62 | } 63 | -------------------------------------------------------------------------------- /chapter8/multi_gpu/streamed_gpu.cu: -------------------------------------------------------------------------------- 1 | #include "../include/hpc_helpers.hpp" 2 | 3 | template < 4 | typename index_t, 5 | typename value_t, 6 | index_t num_iters=256> __global__ 7 | void square_root_kernel( 8 | value_t * Data, 9 | index_t length) { 10 | 11 | const index_t thid = blockDim.x*blockIdx.x+threadIdx.x; 12 | 13 | for (index_t i = thid; i < length; i += blockDim.x*gridDim.x){ 14 | 15 | value_t value = Data[i]; 16 | value_t root = value; 17 | 18 | # pragma unroll (32) 19 | for (index_t iters = 0; iters < num_iters && value; iters++) 20 | root = 0.5*(root+value/root); 21 | 22 | Data[i] = root; 23 | } 24 | } 25 | 26 | int main () { 27 | 28 | typedef float value_t; 29 | typedef uint64_t index_t; 30 | 31 | const index_t length = 1UL << 30; 32 | const index_t num_streams = 32; 33 | const index_t batch_size = length/num_streams; 34 | 35 | cudaStream_t streams[num_streams]; 36 | for (index_t streamID = 0; streamID < num_streams; streamID++) 37 | cudaStreamCreate(streams+streamID); CUERR 38 | 39 | value_t * data = nullptr, * Data = nullptr; 40 | 41 | cudaMallocHost(&data, sizeof(value_t)*length); CUERR 42 | cudaMalloc (&Data, sizeof(value_t)*length); CUERR 43 | 44 | for (index_t index = 0; index < length; index++) 45 | data[index] = index; 46 | 47 | TIMERSTART(overall) 48 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 49 | const index_t offset = streamID*batch_size; 50 | cudaMemcpyAsync(Data+offset, data+offset, 51 | sizeof(value_t)*batch_size, 52 | cudaMemcpyHostToDevice, streams[streamID]); CUERR 53 | } 54 | 55 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 56 | const index_t offset = streamID*batch_size; 57 | square_root_kernel<<<1024, 1024, 0, streams[streamID]>>> 58 | (Data+offset, batch_size); CUERR 59 | } 60 | 61 | for (index_t streamID = 0; streamID < num_streams; streamID++) { 62 | const index_t offset = streamID*batch_size; 63 | cudaMemcpyAsync(data+offset, Data+offset, 64 | sizeof(value_t)*batch_size, 65 | cudaMemcpyDeviceToHost, streams[streamID]); CUERR 66 | } 67 | 68 | cudaDeviceSynchronize(); 69 | TIMERSTOP(overall) 70 | 71 | 72 | 73 | for (index_t index = 0; index < 10; index++) 74 | std::cout << index << " " << data[index] << std::endl; 75 | 76 | for (index_t streamID = 0; streamID < num_streams; streamID++) 77 | cudaStreamDestroy(streams[streamID]); CUERR 78 | 79 | cudaFreeHost(data); CUERR 80 | cudaFree(Data); CUERR 81 | } 82 | -------------------------------------------------------------------------------- /chapter8/uvm/Makefile: -------------------------------------------------------------------------------- 1 | NVCC= nvcc 2 
| NVCCFLAGS= -O2 -std=c++11 -arch=sm_61
3 | 
4 | all: uvm_minimal_example
5 | 
6 | uvm_minimal_example: uvm_minimal_example.cu
7 | 	$(NVCC) $(NVCCFLAGS) uvm_minimal_example.cu -o uvm_minimal_example
8 | 
9 | clean:
10 | 	rm -f uvm_minimal_example
11 | 
-------------------------------------------------------------------------------- /chapter8/uvm/uvm_minimal_example.cu: --------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <cstdint>
3 | 
4 | __global__ void iota_kernel(float * input, uint64_t size) {
5 | 
6 |     uint64_t thid = blockIdx.x*blockDim.x+threadIdx.x;
7 |     for (uint64_t i = thid; i < size; i += gridDim.x*blockDim.x)
8 |         input[i] = i;
9 | }
10 | 
11 | int main () {
12 | 
13 |     uint64_t size = 1UL << 20;
14 |     float * input = nullptr;
15 |     cudaMallocManaged(&input, sizeof(float)*size);   // unified memory: one pointer valid on host and device
16 |     iota_kernel<<<1024, 1024>>>(input, size);
17 | 
18 |     cudaDeviceSynchronize();   // wait for the kernel so the host reads consistent data
19 | 
20 |     for (uint64_t i = 0; i < 20; i++)
21 |         std::cout << input[i] << std::endl;
22 | 
23 |     cudaFree(input);           // managed allocations are released with cudaFree
24 | }
25 | 
-------------------------------------------------------------------------------- /chapter9/hello_world/Makefile: --------------------------------------------------------------------------------
1 | MPICXX= mpic++
2 | MPICXXFLAGS= -O2 -std=c++11
3 | 
4 | all: hello_world
5 | 
6 | hello_world: hello_world.cpp
7 | 	$(MPICXX) $(MPICXXFLAGS) hello_world.cpp -o hello_world
8 | 
9 | clean:
10 | 	rm -rf hello_world
11 | 
-------------------------------------------------------------------------------- /chapter9/hello_world/hello_world.cpp: --------------------------------------------------------------------------------
1 | #include "mpi.h"
2 | 
3 | int main (int argc, char *argv[]){
4 |     // Initialize MPI
5 |     MPI::Init(argc,argv);
6 | 
7 |     // Get the number of processes
8 |     int numP=MPI::COMM_WORLD.Get_size();
9 | 
10 |     // Get the ID of the process
11 |     int myId=MPI::COMM_WORLD.Get_rank();
12 | 
13 |     // Every process prints Hello
14 |     std::cout << "Process " << myId << " of " << numP << ": Hello, world!"
<< std::endl; 15 | 16 | // Terminate MPI 17 | MPI::Finalize(); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /chapter9/jacobi_iteration/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | CXXFLAGS = -O2 -std=c++11 3 | MPICXX= mpic++ 4 | MPICXXFLAGS= $(CXXFLAGS) 5 | 6 | all: jacobi_seq jacobi_1D_block_simple jacobi_1D_block jacobi_1D_nonblock 7 | 8 | jacobi_seq: jacobi_seq.cpp 9 | $(CXX) $(CXXFLAGS) jacobi_seq.cpp -o jacobi_seq 10 | 11 | jacobi_1D_block_simple: jacobi_1D_block_simple.cpp 12 | $(MPICXX) $(MPICXXFLAGS) jacobi_1D_block_simple.cpp -o jacobi_1D_block_simple 13 | 14 | jacobi_1D_block: jacobi_1D_block.cpp 15 | $(MPICXX) $(MPICXXFLAGS) jacobi_1D_block.cpp -o jacobi_1D_block 16 | 17 | jacobi_1D_nonblock: jacobi_1D_nonblock.cpp 18 | $(MPICXX) $(MPICXXFLAGS) jacobi_1D_nonblock.cpp -o jacobi_1D_nonblock 19 | 20 | clean: 21 | rm -rf jacobi_seq 22 | rm -rf jacobi_1D_block_simple 23 | rm -rf jacobi_1D_block 24 | rm -rf jacobi_1D_nonblock 25 | -------------------------------------------------------------------------------- /chapter9/jacobi_iteration/jacobi_1D_block_simple.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "mpi.h" 7 | 8 | void readInput(std::string file, int rows, int cols, float *data){ 9 | 10 | // Open the file pointer 11 | /*FILE* fp = fopen(file.c_str(), "rb"); 12 | 13 | // Check if the file exists 14 | if(fp == NULL){ 15 | std::cout << "ERROR: File " << file << " could not be opened" << std::endl; 16 | MPI::COMM_WORLD.Abort(1); 17 | } 18 | 19 | for(int i=0; i errThres){ 124 | if(myId > 0){ 125 | // Send the first row to the previous process 126 | MPI::COMM_WORLD.Send(myData, cols, MPI::FLOAT, myId-1, 0); 127 | } 128 | 129 | if(myId < numP-1){ 130 | // Receive the next row from the next process 131 | MPI::COMM_WORLD.Recv(nextRow, cols, MPI::FLOAT, myId+1, 0); 132 | 133 | // Send the last row to the next process 134 | MPI::COMM_WORLD.Send(&myData[(myRows-1)*cols], cols, MPI::FLOAT, myId+1, 0); 135 | } 136 | 137 | if(myId > 0){ 138 | // Receive the previous row from the previous process 139 | MPI::COMM_WORLD.Recv(prevRow, cols, MPI::FLOAT, myId-1, 0); 140 | } 141 | 142 | // Update the first row 143 | if((myId > 0) && (myRows>1)){ 144 | for(int j=1; j 1)){ 159 | for(int j=1; j 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | void readInput(std::string file, int rows, int cols, float *data){ 8 | 9 | // Open the file pointer 10 | /*FILE* fp = fopen(file.c_str(), "rb"); 11 | 12 | // Check if the file exists 13 | if(fp == NULL){ 14 | std::cout << "ERROR: File " << file << " could not be opened" << std::endl; 15 | exit(1); 16 | } 17 | 18 | for(int i=0; i start, end; 77 | start = std::chrono::system_clock::now(); 78 | 79 | float error = errThres + 1.0; 80 | 81 | while(error > errThres){ 82 | for(int i=1; i elapsed_seconds = end-start; 103 | 104 | std::cout << "Sequential Jacobi with dimensions " << rows << "x" << cols << " in " << elapsed_seconds.count() 105 | << " seconds" << std::endl; 106 | 107 | printOutput(outputFile, rows, cols, data); 108 | 109 | delete [] data; 110 | delete [] buff; 111 | 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /chapter9/matrix_matrix_mult/Makefile: -------------------------------------------------------------------------------- 1 | MPICXX= mpic++ 2 | 
MPICXXFLAGS= -O2 -std=c++11 3 | 4 | all: matrix_mult_2D matrix_mult_cols matrix_mult_rows summa 5 | 6 | matrix_mult_2D: matrix_mult_2D.cpp 7 | $(MPICXX) $(MPICXXFLAGS) matrix_mult_2D.cpp -o matrix_mult_2D 8 | 9 | matrix_mult_cols: matrix_mult_cols.cpp 10 | $(MPICXX) $(MPICXXFLAGS) matrix_mult_cols.cpp -o matrix_mult_cols 11 | 12 | matrix_mult_rows: matrix_mult_rows.cpp 13 | $(MPICXX) $(MPICXXFLAGS) matrix_mult_rows.cpp -o matrix_mult_rows 14 | 15 | summa: summa.cpp 16 | $(MPICXX) $(MPICXXFLAGS) summa.cpp -o summa 17 | 18 | clean: 19 | rm -rf matrix_mult_2D 20 | rm -rf matrix_mult_cols 21 | rm -rf matrix_mult_rows 22 | rm -rf summa 23 | -------------------------------------------------------------------------------- /chapter9/matrix_matrix_mult/matrix_mult_cols.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "mpi.h" 7 | 8 | void readInput(std::string file, int rows, int cols, float *data){ 9 | 10 | // Open the file pointer 11 | /*FILE* fp = fopen(file.c_str(), "rb"); 12 | 13 | // Check if the file exists 14 | if(fp == NULL){ 15 | std::cout << "ERROR: File " << file << " could not be opened" << std::endl; 16 | MPI::COMM_WORLD.Abort(1); 17 | } 18 | 19 | for(int i=0; i=0; i--){ 122 | req = MPI::COMM_WORLD.Isend(&B[i*blockCols], 1, colTypeB, i, 0); 123 | } 124 | } 125 | 126 | MPI::COMM_WORLD.Recv(myB, k*blockCols, MPI::FLOAT, 0, 0, status); 127 | 128 | // The multiplication of the submatrices 129 | for(int i=0; i=0; i--){ 143 | MPI::COMM_WORLD.Recv(&C[i*blockCols], 1, colTypeC, i, 0, status); 144 | } 145 | } 146 | 147 | // Measure the current time 148 | double end = MPI::Wtime(); 149 | 150 | colTypeB.Free(); 151 | colTypeC.Free(); 152 | 153 | if(!myId){ 154 | std::cout << "Time with " << numP << " processes: " << end-start << " seconds" << std::endl; 155 | printOutput(outputFile, m, n, C); 156 | delete [] B; 157 | delete [] C; 158 | } 159 | 160 | MPI::COMM_WORLD.Barrier(); 161 | 162 | delete [] A; 163 | delete [] myB; 164 | delete [] myC; 165 | 166 | // Terminate MPI 167 | MPI::Finalize(); 168 | return 0; 169 | } 170 | -------------------------------------------------------------------------------- /chapter9/matrix_matrix_mult/matrix_mult_rows.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "mpi.h" 7 | 8 | void readInput(std::string file, int rows, int cols, float *data){ 9 | 10 | // Open the file pointer 11 | /*FILE* fp = fopen(file.c_str(), "rb"); 12 | 13 | // Check if the file exists 14 | if(fp == NULL){ 15 | std::cout << "ERROR: File " << file << " could not be opened" << std::endl; 16 | MPI::COMM_WORLD.Abort(1); 17 | } 18 | 19 | for(int i=0; i0){ 125 | displs[i] = displs[i-1]+sendCounts[i-1]; 126 | } 127 | 128 | if(i < m%numP){ 129 | sendCounts[i] = (blockRows+1)*k; 130 | } else { 131 | sendCounts[i] = blockRows*k; 132 | } 133 | } 134 | } 135 | 136 | // Scatter the input matrix A 137 | MPI::COMM_WORLD.Scatterv(A, sendCounts, displs, MPI::FLOAT, myA, myRows*k, MPI::FLOAT, 0); 138 | // Broadcast the input matrix B 139 | MPI::COMM_WORLD.Bcast(B, k*n, MPI::FLOAT, 0); 140 | 141 | // The multiplication of the submatrices 142 | for(int i=0; i0){ 157 | displs[i] = displs[i-1]+sendCounts[i-1]; 158 | } 159 | 160 | if(i < m%numP){ 161 | sendCounts[i] = (blockRows+1)*n; 162 | } else { 163 | sendCounts[i] = blockRows*n; 164 | } 165 | } 166 | } 167 | MPI::COMM_WORLD.Gatherv(myC, myRows*n, 
MPI::FLOAT, C, sendCounts, displs, MPI::FLOAT, 0);
168 | 
169 |     // Measure the current time
170 |     double end = MPI::Wtime();
171 | 
172 |     if(!myId){
173 |         std::cout << "Time with " << numP << " processes: " << end-start << " seconds" << std::endl;
174 |         printOutput(outputFile, m, n, C);
175 |         delete [] A;
176 |         delete [] C;
177 |     }
178 | 
179 |     delete [] B;
180 |     delete [] myA;
181 |     delete [] myC;
182 | 
183 |     // Terminate MPI
184 |     MPI::Finalize();
185 |     return 0;
186 | }
187 | 
-------------------------------------------------------------------------------- /chapter9/ping_pong/Makefile: --------------------------------------------------------------------------------
1 | MPICXX= mpic++
2 | MPICXXFLAGS= -O2 -std=c++11
3 | 
4 | all: ping_pong_ring ping_pong_ring_nonblock
5 | 
6 | ping_pong_ring: ping_pong_ring.cpp
7 | 	$(MPICXX) $(MPICXXFLAGS) ping_pong_ring.cpp -o ping_pong_ring
8 | 
9 | ping_pong_ring_nonblock: ping_pong_ring_nonblock.cpp
10 | 	$(MPICXX) $(MPICXXFLAGS) ping_pong_ring_nonblock.cpp -o ping_pong_ring_nonblock
11 | 
12 | clean:
13 | 	rm -rf ping_pong_ring
14 | 	rm -rf ping_pong_ring_nonblock
15 | 
-------------------------------------------------------------------------------- /chapter9/ping_pong/ping_pong_ring.cpp: --------------------------------------------------------------------------------
1 | #include <iostream>
2 | 
3 | #include "mpi.h"
4 | 
5 | int main (int argc, char *argv[]){
6 |     // Initialize MPI
7 |     MPI::Init(argc,argv);
8 |     // Get the number of processes
9 |     int numP=MPI::COMM_WORLD.Get_size();
10 | 
11 |     // Get the ID of the process
12 |     int myId=MPI::COMM_WORLD.Get_rank();
13 | 
14 |     if(argc < 2){
15 |         // Only the first process prints the output message
16 |         if(!myId){
17 |             std::cout << "ERROR: The syntax of the program is " << argv[0]
18 |                       << " num_ping_pong" << std::endl;
19 |         }
20 |         MPI::COMM_WORLD.Abort(1);
21 |     }
22 | 
23 |     int num_ping_pong = atoi(argv[1]);
24 |     int ping_pong_count = 0;
25 |     int next_id = myId+1, prev_id=myId-1;
26 | 
27 |     if(next_id >= numP){
28 |         next_id = 0;
29 |     }
30 |     if(prev_id < 0){
31 |         prev_id = numP-1;
32 |     }
33 | 
34 |     while(ping_pong_count < num_ping_pong){
35 |         // Send the ping to the next process, then return the pong to the previous one
36 |         ping_pong_count++;
37 |         MPI::COMM_WORLD.Send(&ping_pong_count, 1, MPI::INT, next_id, 0);
38 |         std::cout << "Process " << myId << " sends PING number " << ping_pong_count
39 |                   << " to process " << next_id << std::endl;
40 |         MPI::COMM_WORLD.Recv(&ping_pong_count, 1, MPI::INT, prev_id, 0);
41 |         std::cout << "Process " << myId << " receives PING number " << ping_pong_count
42 |                   << " from process " << prev_id << std::endl;
43 | 
44 |         MPI::COMM_WORLD.Send(&ping_pong_count, 1, MPI::INT, prev_id, 0);
45 |         std::cout << "Process " << myId << " sends PONG number " << ping_pong_count
46 |                   << " to process " << prev_id << std::endl;
47 |         MPI::COMM_WORLD.Recv(&ping_pong_count, 1, MPI::INT, next_id, 0);
48 |         std::cout << "Process " << myId << " receives PONG number " << ping_pong_count
49 |                   << " from process " << next_id << std::endl;
50 |     }
51 | 
52 |     // Terminate MPI
53 |     MPI::Finalize();
54 | 
55 |     return 0;
56 | }
57 | 
-------------------------------------------------------------------------------- /chapter9/ping_pong/ping_pong_ring_nonblock.cpp: --------------------------------------------------------------------------------
1 | #include <iostream>
2 | 
3 | #include "mpi.h"
4 | 
5 | int main (int argc, char *argv[]){
6 |     // Initialize MPI
7 |     MPI::Init(argc,argv);
8 |     // Get the number of processes
9 |     int numP=MPI::COMM_WORLD.Get_size();
10 | 
11 |     // Get the ID of the process
12 | int myId=MPI::COMM_WORLD.Get_rank(); 13 | 14 | if(argc < 2){ 15 | // Only the first process prints the output message 16 | if(!myId){ 17 | std::cout << "ERROR: The syntax of the program is " << argv[0] 18 | << " num_ping_pong" << std::endl; 19 | } 20 | MPI::COMM_WORLD.Abort(1); 21 | } 22 | 23 | int num_ping_pong = atoi(argv[1]); 24 | int ping_pong_count = 0; 25 | int next_id = myId+1, prev_id=myId-1; 26 | 27 | if(next_id >= numP){ 28 | next_id = 0; 29 | } 30 | if(prev_id < 0){ 31 | prev_id = numP-1; 32 | } 33 | 34 | MPI::Request rq_send, rq_recv; 35 | 36 | while(ping_pong_count < num_ping_pong){ 37 | // First receive the ping and then send the pong 38 | ping_pong_count++; 39 | rq_send = MPI::COMM_WORLD.Isend(&ping_pong_count, 1, MPI::INT, next_id, 0); 40 | std::cout << "Process " << myId << " sends PING number " << ping_pong_count 41 | << " to process " << next_id << std::endl; 42 | rq_recv = MPI::COMM_WORLD.Irecv(&ping_pong_count, 1, MPI::INT, prev_id, 0); 43 | std::cout << "Process " << myId << " receives PING number " << ping_pong_count 44 | << " from process " << prev_id << std::endl; 45 | 46 | rq_recv.Wait(); 47 | 48 | rq_send = MPI::COMM_WORLD.Isend(&ping_pong_count, 1, MPI::INT, prev_id, 0); 49 | std::cout << "Process " << myId << " sends PONG number " << ping_pong_count 50 | << " to process " << prev_id << std::endl; 51 | rq_recv = MPI::COMM_WORLD.Irecv(&ping_pong_count, 1, MPI::INT, next_id, 0); 52 | std::cout << "Process " << myId << " receives PONG number " << ping_pong_count 53 | << " from process " << next_id << std::endl; 54 | 55 | rq_recv.Wait(); 56 | } 57 | 58 | // Terminate MPI 59 | MPI::Finalize(); 60 | 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /chapter9/primes/Makefile: -------------------------------------------------------------------------------- 1 | MPICXX= mpic++ 2 | MPICXXFLAGS= -O2 -std=c++11 3 | 4 | all: primes_serialized_comm primes 5 | 6 | primes_serialized_comm: primes_serialized_comm.cpp 7 | $(MPICXX) $(MPICXXFLAGS) primes_serialized_comm.cpp -o primes_serialized_comm 8 | 9 | primes: primes.cpp 10 | $(MPICXX) $(MPICXXFLAGS) primes.cpp -o primes 11 | 12 | clean: 13 | rm -rf primes_serialized_comm 14 | rm -rf primes 15 | -------------------------------------------------------------------------------- /chapter9/primes/primes.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "mpi.h" 4 | 5 | int main (int argc, char *argv[]){ 6 | // Initialize MPI 7 | MPI::Init(argc,argv); 8 | 9 | // Get the number of processes 10 | int numP=MPI::COMM_WORLD.Get_size(); 11 | 12 | // Get the ID of the process 13 | int myId=MPI::COMM_WORLD.Get_rank(); 14 | 15 | if(argc < 2){ 16 | // Only the first process prints the output message 17 | if(!myId){ 18 | std::cout << "ERROR: The syntax of the program is " 19 | << argv[0] << " n" << std::endl; 20 | } 21 | MPI::COMM_WORLD.Abort(1); 22 | } 23 | 24 | int n; 25 | 26 | if(!myId){ 27 | n = atoi(argv[1]); 28 | } 29 | 30 | // Barrier to synchronize the processes before measuring time 31 | MPI::COMM_WORLD.Barrier(); 32 | 33 | // Measure the current time 34 | double start = MPI::Wtime(); 35 | 36 | // Send the value of n to all processes 37 | MPI::COMM_WORLD.Bcast(&n, 1, MPI::INT, 0); 38 | 39 | if(n < 1){ 40 | // Only the first process prints the output message 41 | if(!myId){ 42 | std::cout << "ERROR: The parameter 'n' must be higher than 0" << std::endl; 43 | } 44 | MPI::COMM_WORLD.Abort(1); 45 | } 46 | 47 | // 
--------------------------------------------------------------------------------
/chapter9/primes/primes.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cstdlib>

#include "mpi.h"

int main (int argc, char *argv[]){
	// Initialize MPI
	MPI::Init(argc,argv);

	// Get the number of processes
	int numP = MPI::COMM_WORLD.Get_size();

	// Get the ID of the process
	int myId = MPI::COMM_WORLD.Get_rank();

	if(argc < 2){
		// Only the first process prints the output message
		if(!myId){
			std::cout << "ERROR: The syntax of the program is "
			          << argv[0] << " n" << std::endl;
		}
		MPI::COMM_WORLD.Abort(1);
	}

	int n;

	if(!myId){
		n = atoi(argv[1]);
	}

	// Barrier to synchronize the processes before measuring time
	MPI::COMM_WORLD.Barrier();

	// Measure the current time
	double start = MPI::Wtime();

	// Send the value of n to all processes
	MPI::COMM_WORLD.Bcast(&n, 1, MPI::INT, 0);

	if(n < 1){
		// Only the first process prints the output message
		if(!myId){
			std::cout << "ERROR: The parameter 'n' must be greater than 0" << std::endl;
		}
		MPI::COMM_WORLD.Abort(1);
	}

	// Perform the computation of the number of primes between 1 and n in parallel
	int myCount = 0;
	int total;
	bool prime;

	// Each process analyzes only part of the numbers below n
	// The distribution is cyclic for better workload balance
	for(int i=2+myId; i<=n; i=i+numP){
		prime = true;
		for(int j=2; j<i; j++){
			if(i%j == 0){
				prime = false;
				break;
			}
		}

		myCount += prime;
	}

	// Reduce the partial counts into 'total' on process 0
	MPI::COMM_WORLD.Reduce(&myCount, &total, 1, MPI::INT, MPI::SUM, 0);

	// Measure the current time
	double end = MPI::Wtime();

	if(!myId){
		std::cout << total << " primes between 1 and " << n << std::endl;
		std::cout << "Time with " << numP << " processes: " << end-start
		          << " seconds" << std::endl;
	}

	// Terminate MPI
	MPI::Finalize();

	return 0;
}
--------------------------------------------------------------------------------
/chapter9/primes/primes_serialized_comm.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <cstdlib>

#include "mpi.h"

int main (int argc, char *argv[]){
	// Initialize MPI
	MPI::Init(argc,argv);

	// Get the number of processes
	int numP = MPI::COMM_WORLD.Get_size();

	// Get the ID of the process
	int myId = MPI::COMM_WORLD.Get_rank();

	if(argc < 2){
		// Only the first process prints the output message
		if(!myId){
			std::cout << "ERROR: The syntax of the program is "
			          << argv[0] << " n" << std::endl;
		}
		MPI::COMM_WORLD.Abort(1);
	}

	int n;

	if(!myId){
		n = atoi(argv[1]);
	}

	// Barrier to synchronize the processes before measuring time
	MPI::COMM_WORLD.Barrier();

	// Measure the current time
	double start = MPI::Wtime();

	// Send the value of n to all processes
	MPI::COMM_WORLD.Bcast(&n, 1, MPI::INT, 0);

	if(n < 1){
		// Only the first process prints the output message
		if(!myId){
			std::cout << "ERROR: The parameter 'n' must be greater than 0" << std::endl;
		}
		MPI::COMM_WORLD.Abort(1);
	}

	// Perform the computation of the number of primes between 1 and n in parallel
	int myCount = 0;
	int total;
	bool prime;

	// Each process analyzes only part of the numbers below n
	// Data to perform a block distribution
	int blockSize = (n-1)/numP;
	int myBlockSize = blockSize;
	int myStart = 2+myId*blockSize;

	// For the cases that n is not a multiple of numP
	if(myId < (n-1)%numP){
		myBlockSize++;
		myStart += myId;
	} else {
		myStart += (n-1)%numP;
	}

	int myEnd = myStart+myBlockSize;

	for(int i=myStart; i<myEnd; i++){
		prime = true;
		for(int j=2; j<i; j++){
			if(i%j == 0){
				prime = false;
				break;
			}
		}

		myCount += prime;
	}

	// The communication is serialized: process 0 gathers the partial counts
	// with blocking point-to-point messages, one sender at a time
	if(!myId){
		total = myCount;
		int value;
		for(int source=1; source<numP; source++){
			MPI::COMM_WORLD.Recv(&value, 1, MPI::INT, source, 0);
			total += value;
		}
	} else {
		MPI::COMM_WORLD.Send(&myCount, 1, MPI::INT, 0, 0);
	}

	// Measure the current time
	double end = MPI::Wtime();

	if(!myId){
		std::cout << total << " primes between 1 and " << n << std::endl;
		std::cout << "Time with " << numP << " processes: " << end-start
		          << " seconds" << std::endl;
	}

	// Terminate MPI
	MPI::Finalize();

	return 0;
}
--------------------------------------------------------------------------------
/include/binary_IO.hpp:
--------------------------------------------------------------------------------
#ifndef BINARY_IO_HPP
#define BINARY_IO_HPP

#include <fstream>
#include <string>

template <
    typename index_t,
    typename value_t>
void dump_binary(
    const value_t * data,
    const index_t length,
    std::string filename) {

    std::ofstream ofile(filename.c_str(), std::ios::binary);
    ofile.write((char*) data, sizeof(value_t)*length);
    ofile.close();
}

template <
    typename index_t,
    typename value_t>
void load_binary(
    value_t * data,         // written to, hence non-const
    const index_t length,
    std::string filename) {

    std::ifstream ifile(filename.c_str(), std::ios::binary);
    ifile.read((char*) data, sizeof(value_t)*length);
    ifile.close();
}

#endif
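A round trip through these helpers looks as follows. This is a minimal usage sketch; the buffer size and the file name example.bin are arbitrary.

#include <vector>
#include "binary_IO.hpp"

int main(){
    std::vector<float> out(1024, 1.0f), in(1024);
    dump_binary(out.data(), out.size(), "example.bin"); // write 1024 floats
    load_binary(in.data(), in.size(), "example.bin");   // read them back
    return 0;
}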
--------------------------------------------------------------------------------
/include/cbf_generator.hpp:
--------------------------------------------------------------------------------
#ifndef CBF_GENERATOR_HPP
#define CBF_GENERATOR_HPP

#include <random>
#include <cstdint>

template <
    typename index_t,
    typename value_t,
    typename label_t>
void generate_cbf(
    value_t * data,
    label_t * labels,
    index_t num_entries,
    index_t num_features) {

    std::mt19937 engine(42);
    std::uniform_int_distribution<index_t> lower_dist(0.125*num_features,
                                                      0.250*num_features);
    std::uniform_int_distribution<index_t> delta_dist(0.250*num_features,
                                                      0.750*num_features);
    std::uniform_real_distribution<value_t> normal_dist(0, 1);

    // create the labels (0: Cylinder, 1: Bell, 2: Funnel)
    for (index_t entry = 0; entry < num_entries; entry++)
        labels[entry] = entry % 3;

    for (index_t entry = 0; entry < num_entries; entry++) {

        const index_t a = lower_dist(engine);
        const index_t bma = delta_dist(engine);
        const value_t amp = normal_dist(engine)+6;

        // Cylinder
        if (labels[entry] == 0) {
            for (index_t index = 0; index < num_features; index++) {
                const value_t value = (index >= a && index < a+bma) ? amp : 0;
                data[entry*num_features+index] = value+normal_dist(engine);
            }
        }

        // Bell
        if (labels[entry] == 1) {
            for (index_t index = 0; index < num_features; index++) {
                const value_t delta = value_t(index)-value_t(a);
                const value_t value = (index >= a && index < a+bma) ?
                                      amp*delta/bma : 0;
                data[entry*num_features+index] = value+normal_dist(engine);
            }
        }

        // Funnel
        if (labels[entry] == 2) {
            for (index_t index = 0; index < num_features; index++) {
                const value_t delta = value_t(a+bma)-value_t(index);
                const value_t value = (index >= a && index < a+bma) ?
                                      amp*delta/bma : 0;
                data[entry*num_features+index] = value+normal_dist(engine);
            }
        }
    }
}

#endif
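The generator is typically called with one flat array holding all time series and one label per series. The sketch below shows the intended calling convention; the sizes (9 series of 128 features) are arbitrary example values.

#include <cstdint>
#include <vector>
#include "cbf_generator.hpp"

int main(){
    const uint64_t num_entries = 9, num_features = 128;
    std::vector<float> data(num_entries*num_features); // series stored row-wise
    std::vector<uint8_t> labels(num_entries);          // 0: Cylinder, 1: Bell, 2: Funnel
    generate_cbf(data.data(), labels.data(), num_entries, num_features);
    return 0;
}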
--------------------------------------------------------------------------------
/include/hpc_helpers.hpp:
--------------------------------------------------------------------------------
#ifndef HPC_HELPERS_HPP
#define HPC_HELPERS_HPP

#include <iostream>
#include <cstdint>

#ifndef __CUDACC__
#include <chrono>
#endif

#ifndef __CUDACC__
#define TIMERSTART(label)                                                      \
    std::chrono::time_point<std::chrono::system_clock> a##label, b##label;     \
    a##label = std::chrono::system_clock::now();
#else
#define TIMERSTART(label)                                                      \
    cudaEvent_t start##label, stop##label;                                     \
    float time##label;                                                         \
    cudaEventCreate(&start##label);                                            \
    cudaEventCreate(&stop##label);                                             \
    cudaEventRecord(start##label, 0);
#endif

#ifndef __CUDACC__
#define TIMERSTOP(label)                                                       \
    b##label = std::chrono::system_clock::now();                               \
    std::chrono::duration<double> delta##label = b##label-a##label;            \
    std::cout << "# elapsed time ("<< #label <<"): "                           \
              << delta##label.count() << "s" << std::endl;
#else
#define TIMERSTOP(label)                                                       \
    cudaEventRecord(stop##label, 0);                                           \
    cudaEventSynchronize(stop##label);                                         \
    cudaEventElapsedTime(&time##label, start##label, stop##label);             \
    std::cout << "TIMING: " << time##label << " ms (" << #label << ")"         \
              << std::endl;
#endif


#ifdef __CUDACC__
#define CUERR {                                                                \
    cudaError_t err;                                                           \
    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
        std::cout << "CUDA error: " << cudaGetErrorString(err) << " : "        \
                  << __FILE__ << ", line " << __LINE__ << std::endl;           \
        exit(1);                                                               \
    }                                                                          \
}

// transfer constants
#define H2D (cudaMemcpyHostToDevice)
#define D2H (cudaMemcpyDeviceToHost)
#define H2H (cudaMemcpyHostToHost)
#define D2D (cudaMemcpyDeviceToDevice)
#endif

// safe division
#define SDIV(x,y)(((x)+(y)-1)/(y))

// no_init_t
#include <type_traits>

template<class T>
class no_init_t {
public:

    static_assert(std::is_fundamental<T>::value &&
                  std::is_arithmetic<T>::value,
                  "wrapped type must be a fundamental, numeric type");

    //do nothing
    constexpr no_init_t() noexcept {}

    //convertible from a T
    constexpr no_init_t(T value) noexcept: v_(value) {}

    //act as a T in all conversion contexts
    constexpr operator T () const noexcept { return v_; }

    // negation on value and bit level
    constexpr no_init_t& operator - () noexcept { v_ = -v_; return *this; }
    constexpr no_init_t& operator ~ () noexcept { v_ = ~v_; return *this; }

    // prefix increment/decrement operators
    constexpr no_init_t& operator ++ () noexcept { v_++; return *this; }
    constexpr no_init_t& operator -- () noexcept { v_--; return *this; }

    // postfix increment/decrement operators
    constexpr no_init_t operator ++ (int) noexcept {
        auto old(*this);
        v_++;
        return old;
    }
    constexpr no_init_t operator -- (int) noexcept {
        auto old(*this);
        v_--;
        return old;
    }

    // assignment operators
    constexpr no_init_t& operator += (T v) noexcept { v_ += v; return *this; }
    constexpr no_init_t& operator -= (T v) noexcept { v_ -= v; return *this; }
    constexpr no_init_t& operator *= (T v) noexcept { v_ *= v; return *this; }
    constexpr no_init_t& operator /= (T v) noexcept { v_ /= v; return *this; }

    // bit-wise operators
    constexpr no_init_t& operator &= (T v) noexcept { v_ &= v; return *this; }
    constexpr no_init_t& operator |= (T v) noexcept { v_ |= v; return *this; }
    constexpr no_init_t& operator ^= (T v) noexcept { v_ ^= v; return *this; }
    constexpr no_init_t& operator >>= (T v) noexcept { v_ >>= v; return *this; }
    constexpr no_init_t& operator <<= (T v) noexcept { v_ <<= v; return *this; }

private:
    T v_;
};

#endif
--------------------------------------------------------------------------------
/include/svd.hpp:
--------------------------------------------------------------------------------
#ifndef HPC_BOOK_SVD_HPP
#define HPC_BOOK_SVD_HPP

#include <cusolverDn.h>
#include "hpc_helpers.hpp"

// wraps cuSOLVER's single-precision SVD;
// returns 0 (false) on success and 1 (true) if any call fails
bool svd_device(
    float * M,
    float * U,
    float * S,
    float * V,
    int height,
    int width,
    bool verbose=false) {

    cusolverDnHandle_t handle;
    cusolverDnCreate(&handle);

    int temp_storage_bytes = 0;
    if (cusolverDnSgesvd_bufferSize(handle, width, height, &temp_storage_bytes))
        return 1;

    float * temp_storage = nullptr;
    if (cudaMalloc(&temp_storage, sizeof(float)*temp_storage_bytes))
        return 1;

    if (verbose)
        std::cout << "CUSOLVER: allocated " << temp_storage_bytes
                  << " bytes of temporary storage." << std::endl;

    int * devInfo;
    if (cudaMalloc(&devInfo, sizeof(int)))
        return 1;

    if (cusolverDnSgesvd(handle, 'A', 'A', height, width,
                         M, height, S, U, height, V, width,
                         temp_storage, temp_storage_bytes, nullptr, devInfo))
        return 1;

    if (verbose)
        std::cout << "CUSOLVER: computed SVD." << std::endl;

    if (cusolverDnDestroy(handle))
        return 1;
    if (cudaFree(temp_storage))
        return 1;
    if (cudaFree(devInfo))
        return 1;

    if (verbose)
        std::cout << "CUSOLVER: freed " << temp_storage_bytes
                  << " bytes of temporary storage." << std::endl;

    return 0;
}

#endif
--------------------------------------------------------------------------------
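On the host, the TIMERSTART/TIMERSTOP macros from hpc_helpers.hpp are used as a matched pair around the code to be measured, as in this minimal sketch (the summation loop is just a dummy workload; the label dummy_loop is arbitrary):

#include <cstdint>
#include "hpc_helpers.hpp"

int main(){
    TIMERSTART(dummy_loop)
    volatile uint64_t sum = 0;                  // volatile keeps the loop
    for (uint64_t i = 0; i < (1UL << 26); i++)  // from being optimized away
        sum = sum + i;
    TIMERSTOP(dummy_loop)
    return 0;
}

Compiled with nvcc, the same pair of macros expands to cudaEvent_t-based timing instead, so the identical source measures kernel time on the device.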