├── README.md ├── lec04 └── hello_world_0.c ├── lec05 └── ring.c ├── lec06 ├── README.md ├── argv.c ├── pi_parallel.c ├── pi_seq.c └── rand.c ├── lec08 └── sum_vectors.c ├── lec12 ├── README.md ├── branch_prediction_fast.c ├── branch_prediction_slow.c ├── cache_01_fast.c ├── cache_01_fast_broken.c ├── cache_01_fast_broken_fixed.c ├── cache_01_slow.c ├── cache_fs_fast.c ├── cache_fs_slow.c ├── compile.sh └── my_timer.h ├── lec13 ├── README.md ├── blackscholes.c ├── compile.sh ├── inputgen.c ├── inputs │ ├── input_1000.txt │ ├── input_10000.txt │ └── input_100000.txt ├── my_timer.h ├── optionData.txt └── solution │ ├── blackscholes_omp │ ├── blackscholes_omp.c │ ├── blackscholes_pthreads │ └── blackscholes_pthreads.c ├── lec15 ├── Makefile ├── README.md ├── histogram.c ├── histogram_solution_ato_local.c ├── histogram_solution_ato_local_better.c ├── histogram_solution_red.c ├── histogram_solution_trivial_ato.c ├── histogram_solution_trivial_crit.c ├── matmul.c ├── matmul_solution.c ├── pi.c ├── pi_solution_ato.c ├── pi_solution_crit.c ├── pi_solution_crit_rand.c ├── pi_solution_red.c └── rand_vs_rand_r.c ├── lec19 ├── Makefile ├── README.md ├── cuda_job.sub ├── error_checks.h ├── image_blur.cu ├── image_blur_shared.cu ├── jacobi.cu ├── jacobi_solution.cu ├── test_cuda.cu └── vector_add.cu ├── projects ├── README.md ├── kmeans │ ├── KMEANS.c │ ├── KMEANS_cuda.cu │ ├── KMEANS_mpi.c │ ├── KMEANS_omp.c │ ├── LICENSE │ ├── Makefile │ ├── README │ ├── handout.pdf │ └── test_files │ │ ├── input100D.inp │ │ ├── input100D2.inp │ │ ├── input10D.inp │ │ ├── input20D.inp │ │ ├── input2D.inp │ │ └── input2D2.inp ├── sequence │ ├── LICENSE │ ├── Makefile │ ├── README │ ├── align.c │ ├── align_cuda.cu │ ├── align_mpi.c │ ├── align_omp.c │ ├── handout.pdf │ └── rng.c └── wind │ ├── LICENSE │ ├── Makefile │ ├── README │ ├── handout.pdf │ ├── wind.c │ ├── wind_cuda.cu │ ├── wind_mpi.c │ └── wind_omp.c └── utils ├── create_users_pmc.sh └── openmpiscript.sh /README.md: 
-------------------------------------------------------------------------------- 1 | This repository contains the code samples shown and discussed in class. -------------------------------------------------------------------------------- /lec04/hello_world_0.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(void){ 5 | int r = MPI_Init(NULL, NULL); 6 | int size, rank; 7 | MPI_Comm_size(MPI_COMM_WORLD, &size); 8 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 9 | if(r != MPI_SUCCESS){ 10 | printf("Error starting MPI program. Terminating.\n"); 11 | MPI_Abort(MPI_COMM_WORLD, r); 12 | } 13 | char str[256]; 14 | if(rank == 0){ 15 | printf("Hello, World! I am process %d of %d.\n", rank, size); 16 | int i; 17 | for(i = 1; i < size; i++){ 18 | 19 | MPI_Recv(str, 256, MPI_CHAR, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); 20 | printf("%s", str); 21 | } 22 | }else{ 23 | sprintf(str, "Hello, World! I am process %d of %d.\n", rank, size); 24 | MPI_Send(str, 256, MPI_CHAR, 0, 0, MPI_COMM_WORLD); 25 | } 26 | 27 | MPI_Finalize(); 28 | return 0; 29 | } -------------------------------------------------------------------------------- /lec05/ring.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(void){ 5 | int rank, size; 6 | int send_right = 19; 7 | int send_left = 23; 8 | int recv_left, recv_right; 9 | MPI_Init(NULL, NULL); 10 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 11 | MPI_Comm_size(MPI_COMM_WORLD, &size); 12 | MPI_Request requests[4]; 13 | // Send right 14 | MPI_Isend(&send_right, 1, MPI_INT, (rank + 1) % size, 0, MPI_COMM_WORLD, &requests[0]); 15 | // Send left 16 | MPI_Isend(&send_left, 1, MPI_INT, (rank - 1 + size) % size, 0, MPI_COMM_WORLD, &requests[1]); 17 | // Recv from right 18 | MPI_Irecv(&recv_right, 1, MPI_INT, (rank + 1) % size, 0, MPI_COMM_WORLD, &requests[2]); 19 | // Recv from left 20 | MPI_Irecv(&recv_left, 1, MPI_INT, 
(rank - 1 + size) % size, 0, MPI_COMM_WORLD, &requests[3]); 21 | // Compute anything 22 | // ... 23 | MPI_Waitall(4, requests, MPI_STATUSES_IGNORE); 24 | 25 | MPI_Finalize(); 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /lec06/README.md: -------------------------------------------------------------------------------- 1 | Exercises shown during lecture 6. 2 | - rand.c: A simple program in which each process prints a random number. You can see that each rank prints the same number, since each one uses the same seed. Now, try to add the following code after the MPI_Init: 3 | ```c 4 | int rank; 5 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 6 | srand(rank); 7 | ``` 8 | If you run the program again, you will see that each rank prints a different number. This is because each rank is using a different seed. 9 | 10 | - pi_seq.c: A program that calculates the value of pi (sequentially) using the process we have seen in the slides. 11 | - pi_parallel.c: A program that calculates the value of pi (in parallel) using the process we have seen in the slides. 12 | - argv.c: A program that shows how to use the argv parameter in the main function. 
-------------------------------------------------------------------------------- /lec06/argv.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(int argc, char** argv){ 5 | MPI_Init(&argc, &argv); 6 | int rank; 7 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 8 | printf("I am rank %d and argv[0] is %s argv[1] is %s\n", 9 | rank, argv[0], argv[1]); 10 | MPI_Finalize(); 11 | return 0; 12 | } -------------------------------------------------------------------------------- /lec06/pi_parallel.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | double get_rand_minus_one_one(){ 7 | return 2 * (rand() / (double)RAND_MAX) - 1; 8 | } 9 | 10 | int main(int argc, char** argv){ 11 | int num_tosses = atoi(argv[1]); 12 | int toss; 13 | int num_hits = 0; 14 | MPI_Init(NULL, NULL); 15 | double start_time = MPI_Wtime(); 16 | int world_size, rank; 17 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 18 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 19 | srand(time(NULL)*rank); 20 | int local_tosses = num_tosses / world_size; 21 | for(toss = 0; toss < local_tosses; toss++){ 22 | double x = get_rand_minus_one_one(); 23 | double y = get_rand_minus_one_one(); 24 | if(x*x + y*y <= 1){ 25 | num_hits++; 26 | } 27 | } 28 | int total_hits; 29 | MPI_Reduce(&num_hits, &total_hits, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); 30 | if(rank == 0){ 31 | double pi_estimate = 4 * total_hits / ((double)num_tosses); 32 | printf("Estimate of pi = %f Computed in %f seconds\n", pi_estimate, MPI_Wtime() - start_time); 33 | } 34 | MPI_Finalize(); 35 | return 0; 36 | } -------------------------------------------------------------------------------- /lec06/pi_seq.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | double get_rand_minus_one_one(){ 6 | return 2 * (rand() / (double)RAND_MAX) 
- 1; 7 | } 8 | 9 | int main(int argc, char** argv){ 10 | int num_tosses = atoi(argv[1]); 11 | srand(time(NULL)); 12 | int toss; 13 | int num_hits = 0; 14 | for(toss = 0; toss < num_tosses; toss++){ 15 | double x = get_rand_minus_one_one(); 16 | double y = get_rand_minus_one_one(); 17 | if(x*x + y*y <= 1){ 18 | num_hits++; 19 | } 20 | } 21 | double pi_estimate = 4 * num_hits / ((double)num_tosses); 22 | printf("Estimate of pi = %f\n", pi_estimate); 23 | return 0; 24 | } -------------------------------------------------------------------------------- /lec06/rand.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | int main(int argc, char** argv){ 7 | MPI_Init(NULL, NULL); 8 | printf("Rand %d\n", rand()); 9 | MPI_Finalize(); 10 | return 0; 11 | } 12 | -------------------------------------------------------------------------------- /lec08/sum_vectors.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int* create_random_vector(int n){ 6 | int* vec = (int*) malloc(n * sizeof(int)); 7 | for(int i = 0; i < n; i++){ 8 | vec[i] = rand() % 10; 9 | } 10 | return vec; 11 | } 12 | 13 | void print_vector(int* vec, int n){ 14 | for(int i = 0; i < n; i++){ 15 | printf("%d ", vec[i]); 16 | } 17 | printf("\n"); 18 | } 19 | 20 | int main(int argc, char** argv){ 21 | MPI_Init(&argc, &argv); 22 | int rank, size; 23 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 24 | MPI_Comm_size(MPI_COMM_WORLD, &size); 25 | int *a, *b; 26 | int n = atoi(argv[1]); 27 | if(n % size != 0){ 28 | printf("n must be divisible by the number of processes\n"); 29 | MPI_Abort(MPI_COMM_WORLD, 1); 30 | } 31 | if(rank == 0){ 32 | a = create_random_vector(n); 33 | b = create_random_vector(n); 34 | printf("Rank 0: a = "); 35 | print_vector(a, n); 36 | printf("Rank 0: b = "); 37 | print_vector(b, n); 38 | MPI_Scatter(a, n/size, MPI_INT, MPI_IN_PLACE, n/size, 
MPI_INT, 0, MPI_COMM_WORLD); 39 | MPI_Scatter(b, n/size, MPI_INT, MPI_IN_PLACE, n/size, MPI_INT, 0, MPI_COMM_WORLD); 40 | }else{ 41 | a = (int*) malloc(n/size * sizeof(int)); 42 | b = (int*) malloc(n/size * sizeof(int)); 43 | MPI_Scatter(NULL, n/size, MPI_INT, a, n/size, MPI_INT, 0, MPI_COMM_WORLD); 44 | MPI_Scatter(NULL, n/size, MPI_INT, b, n/size, MPI_INT, 0, MPI_COMM_WORLD); 45 | } 46 | int* c = (int*) malloc(n/size * sizeof(int)); 47 | for(int i = 0; i < n/size; i++){ 48 | c[i] = a[i] + b[i]; 49 | } 50 | int* c_finale = NULL; 51 | if(rank == 0){ 52 | c_finale = (int*) malloc(n * sizeof(int)); 53 | } 54 | MPI_Gather(c, n/size, MPI_INT, c_finale, n/size, MPI_INT, 0, MPI_COMM_WORLD); 55 | if(rank == 0){ 56 | printf("Rank 0: c = "); 57 | print_vector(c_finale, n); 58 | } 59 | MPI_Finalize(); 60 | return 0; 61 | } -------------------------------------------------------------------------------- /lec12/README.md: -------------------------------------------------------------------------------- 1 | - cache_01_slow.c: The program does not perform well because the matrix is read by column rather than by row 2 | - cache_01_fast.c: Same as cache_01_slow.c, but the matrix is read by row rather than by column, thus outperforming the previous version 3 | - cache_01_fast_broken.c: Same as cache_01_fast.c, but the vector declaration is within the main body. This enables GCC to apply Dead Code Elimination (DCE), and remove basically all the code. The application would then just not compute anything. Beware, some compilers can apply dead code elimination also to global variables, and in that case the cache_01_slow.c and cache_01_fast.c would also not compute anything. 4 | - cache_01_fast_broken_fixed.c: Same as cache_01_fast_broken.c, but now we do something with the result of the calculation (e.g., print the sum of the elements of y), so that the compiler does not eliminate the code. 
Alternatively, you can remove DCE by adding the following flags when compiling: -fno-dce -fno-dse -fno-tree-dce -fno-tree-dse 5 | - cache_fs_slow.c: It shows the false sharing problem 6 | - cache_fs_fast.c: It solves the false sharing problem by padding the structure 7 | - branch_prediction_slow.c: Fills an array with random elements between 0 and 9, and then counts the number of elements that are smaller than 5. The program is slow because the branch predictor is not able to predict the outcome of the if statement. 8 | - branch_prediction_fast.c: Same as branch_prediction_slow.c, but it sorts the array before doing the check, so that (approximately) the first n/2 elements are smaller and the remaining n/2 larger. In this way, the branch predictor is able to predict the outcome of the if statement more effectively. 9 | 10 | 11 | For more examples, check https://github.com/Kobzol/hardware-effects 12 | 13 | To install perf on WSL2: 14 | ```bash 15 | sudo apt install linux-tools-generic 16 | ``` -------------------------------------------------------------------------------- /lec12/branch_prediction_fast.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "my_timer.h" 6 | 7 | double x[MAX]; 8 | 9 | int compare(const void* a, const void* b) { 10 | return (*(double*)a - *(double*)b); 11 | } 12 | 13 | // Computes matrix-vector multiplication sequentially 14 | int main(int argc, char** argv) { 15 | int i,iter; 16 | srand(time(NULL)); 17 | for (i = 0; i < MAX; i++) { 18 | x[i] = rand() % 10; 19 | } 20 | qsort(x, MAX, sizeof(double), compare); 21 | 22 | double total_time = 0.0; 23 | int total_smaller = 0; 24 | for(iter = 0; iter < ITER; iter++){ 25 | double start, stop; 26 | int smaller = 0; 27 | GET_TIME(start); 28 | for(i = 0; i < MAX; i++){ 29 | if(x[i] < 5){ 30 | smaller++; 31 | } 32 | } 33 | GET_TIME(stop); 34 | total_smaller += smaller; 35 | total_time += stop-start; 36 
| } 37 | printf("Total smaller %d\n", total_smaller); 38 | printf("Average runtime %f sec\n", total_time/ITER); 39 | } 40 | -------------------------------------------------------------------------------- /lec12/branch_prediction_slow.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "my_timer.h" 6 | 7 | double x[MAX]; 8 | 9 | // Computes matrix-vector multiplication sequentially 10 | int main(int argc, char** argv) { 11 | int i,iter; 12 | srand(time(NULL)); 13 | for (i = 0; i < MAX; i++) { 14 | x[i] = rand() % 10; 15 | } 16 | 17 | double total_time = 0.0; 18 | int total_smaller = 0; 19 | for(iter = 0; iter < ITER; iter++){ 20 | double start, stop; 21 | int smaller = 0; 22 | GET_TIME(start); 23 | for(i = 0; i < MAX; i++){ 24 | if(x[i] < 5){ 25 | smaller++; 26 | } 27 | } 28 | GET_TIME(stop); 29 | total_smaller += smaller; 30 | total_time += stop-start; 31 | } 32 | printf("Total smaller %d\n", total_smaller); 33 | printf("Average runtime %f sec\n", total_time/ITER); 34 | } 35 | -------------------------------------------------------------------------------- /lec12/cache_01_fast.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "my_timer.h" 5 | 6 | double A[MAX][MAX]; 7 | double x[MAX]; 8 | double y[MAX]; 9 | 10 | // Computes matrix-vector multiplication sequentially 11 | int main(int argc, char** argv) { 12 | int i,j,iter; 13 | srand(time(NULL)); 14 | /* Initialize A and x with random values, and y to 0s*/ 15 | for (i = 0; i < MAX; i++) { 16 | x[i] = (double) rand() / RAND_MAX; // Random number between 0 and 1 17 | y[i] = 0.0; 18 | for (j = 0; j < MAX; j++) 19 | A[i][j] = (double) rand() / RAND_MAX; // Random number between 0 and 1 20 | } 21 | 22 | double total_time = 0.0; 23 | for(iter = 0; iter < ITER; iter++){ 24 | double start, stop; 25 | GET_TIME(start); 26 | for (i = 0; i < MAX; 
i++) 27 | for (j = 0; j < MAX; j++) 28 | y[i] += A[i][j]*x[j]; 29 | GET_TIME(stop); 30 | total_time += stop-start; 31 | } 32 | 33 | /** 34 | for (i = 0; i < MAX; i++) 35 | printf("%f\n", y[i]); 36 | **/ 37 | 38 | printf("Average runtime %f sec\n", total_time/ITER); 39 | } 40 | -------------------------------------------------------------------------------- /lec12/cache_01_fast_broken.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "my_timer.h" 5 | 6 | double A[MAX][MAX]; 7 | double x[MAX]; 8 | 9 | // Computes matrix-vector multiplication sequentially 10 | int main(int argc, char** argv) { 11 | double y[MAX]; 12 | int i,j,iter; 13 | srand(time(NULL)); 14 | /* Initialize A and x with random values, and y to 0s*/ 15 | for (i = 0; i < MAX; i++) { 16 | x[i] = (double) rand() / RAND_MAX; // Random number between 0 and 1 17 | y[i] = 0.0; 18 | for (j = 0; j < MAX; j++) 19 | A[i][j] = (double) rand() / RAND_MAX; // Random number between 0 and 1 20 | } 21 | 22 | double total_time = 0.0; 23 | for(iter = 0; iter < ITER; iter++){ 24 | double start, stop; 25 | GET_TIME(start); 26 | for (i = 0; i < MAX; i++) 27 | for (j = 0; j < MAX; j++) 28 | y[i] += A[i][j]*x[j]; 29 | GET_TIME(stop); 30 | total_time += stop-start; 31 | } 32 | 33 | /** 34 | for (i = 0; i < MAX; i++) 35 | printf("%f\n", y[i]); 36 | **/ 37 | 38 | printf("Average runtime %f sec\n", total_time/ITER); 39 | } 40 | -------------------------------------------------------------------------------- /lec12/cache_01_fast_broken_fixed.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "my_timer.h" 5 | 6 | double A[MAX][MAX]; 7 | double x[MAX]; 8 | 9 | // Computes matrix-vector multiplication sequentially 10 | int main(int argc, char** argv) { 11 | double y[MAX]; 12 | int i,j,iter; 13 | double dummy = 0.0; 14 | srand(time(NULL)); 15 | /* Initialize A and x 
with random values, and y to 0s*/ 16 | for (i = 0; i < MAX; i++) { 17 | x[i] = (double) rand() / RAND_MAX; // Random number between 0 and 1 18 | y[i] = 0.0; 19 | for (j = 0; j < MAX; j++) 20 | A[i][j] = (double) rand() / RAND_MAX; // Random number between 0 and 1 21 | } 22 | 23 | double total_time = 0.0; 24 | for(iter = 0; iter < ITER; iter++){ 25 | double start, stop; 26 | GET_TIME(start); 27 | for (i = 0; i < MAX; i++) 28 | for (j = 0; j < MAX; j++) 29 | y[i] += A[i][j]*x[j]; 30 | GET_TIME(stop); 31 | total_time += stop-start; 32 | 33 | // Do something with the values of y to avoid dead code elimination 34 | for (i = 0; i < MAX; i++) 35 | dummy += y[i]; 36 | } 37 | printf("Dummy value to avoid dead code elimination: %f\n", dummy); 38 | printf("Average runtime %f sec\n", total_time/ITER); 39 | } 40 | -------------------------------------------------------------------------------- /lec12/cache_01_slow.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "my_timer.h" 5 | 6 | double A[MAX][MAX]; 7 | double x[MAX]; 8 | double y[MAX]; 9 | 10 | // Computes matrix-vector multiplication sequentially 11 | int main(int argc, char** argv) { 12 | int i,j,iter; 13 | srand(time(NULL)); 14 | /* Initialize A and x with random values, and y to 0s*/ 15 | for (i = 0; i < MAX; i++) { 16 | x[i] = (double) rand() / RAND_MAX; // Random number between 0 and 1 17 | y[i] = 0.0; 18 | for (j = 0; j < MAX; j++) 19 | A[i][j] = (double) rand() / RAND_MAX; // Random number between 0 and 1 20 | } 21 | 22 | double total_time = 0.0; 23 | for(iter = 0; iter < ITER; iter++){ 24 | double start, stop; 25 | GET_TIME(start); 26 | for (j = 0; j < MAX; j++) 27 | for (i = 0; i < MAX; i++) 28 | y[i] += A[i][j]*x[j]; 29 | GET_TIME(stop); 30 | total_time += stop-start; 31 | } 32 | 33 | /** 34 | for (i = 0; i < MAX; i++) 35 | printf("%f\n", y[i]); 36 | **/ 37 | 38 | printf("Average runtime %f sec\n", total_time/ITER); 39 | } 40 | 
-------------------------------------------------------------------------------- /lec12/cache_fs_fast.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "my_timer.h" 9 | 10 | 11 | #define CLS 16 12 | 13 | float data[NUM_THREADS*CLS]; 14 | 15 | void* thread_fun(void* arg){ 16 | int thread_id = *((int*) arg); 17 | // Pin 18 | cpu_set_t cpuset; 19 | pthread_t thread = pthread_self(); 20 | CPU_ZERO(&cpuset); 21 | CPU_SET(thread_id, &cpuset); 22 | pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset); 23 | 24 | for(int i = 0; i < 100000; i++){ 25 | data[thread_id*CLS] += i; 26 | } 27 | return NULL; 28 | } 29 | 30 | // Computes matrix-vector multiplication sequentially 31 | int main(int argc, char** argv) { 32 | int iter; 33 | srand(time(NULL)); 34 | for(int i = 0; i < NUM_THREADS; i++){ 35 | data[i] = rand(); 36 | } 37 | 38 | int ids[NUM_THREADS]; 39 | for(int i = 0; i < NUM_THREADS; i++){ 40 | ids[i] = i; 41 | } 42 | pthread_t threads[NUM_THREADS]; 43 | 44 | double total_time = 0.0; 45 | for(iter = 0; iter < ITER; iter++){ 46 | double start, stop; 47 | GET_TIME(start); 48 | // Create threads 49 | for(int i = 0; i < NUM_THREADS; i++){ 50 | pthread_create(&threads[i], NULL, thread_fun, (void*) &ids[i]); 51 | } 52 | 53 | // Join threads 54 | for(int i = 0; i < NUM_THREADS; i++){ 55 | pthread_join(threads[i], NULL); 56 | } 57 | GET_TIME(stop); 58 | total_time += stop-start; 59 | } 60 | 61 | /** 62 | for (i = 0; i < MAX; i++) 63 | printf("%f\n", y[i]); 64 | **/ 65 | 66 | printf("Average runtime %f sec\n", total_time/ITER); 67 | } 68 | -------------------------------------------------------------------------------- /lec12/cache_fs_slow.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 
#include "my_timer.h" 9 | 10 | 11 | 12 | float data[NUM_THREADS]; 13 | 14 | void* thread_fun(void* arg){ 15 | int thread_id = *((int*) arg); 16 | // Pin 17 | cpu_set_t cpuset; 18 | pthread_t thread = pthread_self(); 19 | CPU_ZERO(&cpuset); 20 | CPU_SET(thread_id, &cpuset); 21 | pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset); 22 | int t = data[thread_id]; 23 | for(int i = 0; i < 100000; i++){ 24 | t += i; 25 | } 26 | data[thread_id] = t; 27 | return NULL; 28 | } 29 | 30 | // Computes matrix-vector multiplication sequentially 31 | int main(int argc, char** argv) { 32 | int iter; 33 | srand(time(NULL)); 34 | for(int i = 0; i < NUM_THREADS; i++){ 35 | data[i] = rand(); 36 | } 37 | 38 | int ids[NUM_THREADS]; 39 | for(int i = 0; i < NUM_THREADS; i++){ 40 | ids[i] = i; 41 | } 42 | pthread_t threads[NUM_THREADS]; 43 | 44 | double total_time = 0.0; 45 | for(iter = 0; iter < ITER; iter++){ 46 | double start, stop; 47 | GET_TIME(start); 48 | // Create threads 49 | for(int i = 0; i < NUM_THREADS; i++){ 50 | pthread_create(&threads[i], NULL, thread_fun, (void*) &ids[i]); 51 | } 52 | 53 | // Join threads 54 | for(int i = 0; i < NUM_THREADS; i++){ 55 | pthread_join(threads[i], NULL); 56 | } 57 | GET_TIME(stop); 58 | total_time += stop-start; 59 | } 60 | 61 | printf("Average runtime %f sec\n", total_time/ITER); 62 | } 63 | -------------------------------------------------------------------------------- /lec12/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # You should use Makefiles instead ;) 3 | 4 | # Examples on profiling 5 | gcc -Wall -pg -O3 -o mat_vec_p mat_vec.c -pthread 6 | 7 | # Examples on row-major access 8 | MATRIX_SIZE=10000 9 | NUM_ITERATIONS=10 10 | OPT=-O3 11 | 12 | gcc -Wall -g ${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_01_slow cache_01_slow.c 13 | gcc -Wall -g ${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_01_fast cache_01_fast.c 14 | gcc -Wall -g 
${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_01_fast_broken cache_01_fast_broken.c 15 | gcc -Wall -g ${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_01_fast_broken_fixed cache_01_fast_broken_fixed.c 16 | 17 | # Examples on false sharing 18 | NUM_ITERATIONS=1000 19 | OPT=-O0 20 | NUM_THREADS=4 21 | gcc -Wall -g ${OPT} -D NUM_THREADS=${NUM_THREADS} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_fs_slow cache_fs_slow.c -pthread 22 | gcc -Wall -g ${OPT} -D NUM_THREADS=${NUM_THREADS} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_fs_fast cache_fs_fast.c -pthread 23 | 24 | # Examples on branch prediction 25 | MATRIX_SIZE=1000000 26 | NUM_ITERATIONS=10 27 | OPT=-O0 28 | gcc -Wall -g ${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o branch_prediction_slow branch_prediction_slow.c 29 | gcc -Wall -g ${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o branch_prediction_fast branch_prediction_fast.c 30 | 31 | # Matrix-vector mul 32 | gcc -Wall -g -O3 -o mat_vec mat_vec.c -pthread 33 | -------------------------------------------------------------------------------- /lec12/my_timer.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define GET_TIME(now) { \ 4 | struct timeval t; \ 5 | gettimeofday(&t, NULL); \ 6 | now = t.tv_sec + t.tv_usec/1000000.0; \ 7 | } 8 | -------------------------------------------------------------------------------- /lec13/README.md: -------------------------------------------------------------------------------- 1 | In this exercise, you will parallelize a sequential program that applies the Black-Scholes option pricing formula to a large dataset. The Black-Scholes formula is a mathematical model for the dynamics of a financial market containing derivative investment instruments. The formula calculates the price of a financial option comprising a stock and an option to buy or sell the stock at a specified price at a future date. 
2 | Regardless of the specific details of the formula, in this exercise you are supposed to parallelize the sequential code available in the file `blackscholes.c`. 3 | The code reads a dataset from a file and applies the Black-Scholes formula to each record in the dataset. Sample input datasets can be found in the `inputs` directory. The code writes the results to an output file. 4 | You can generate new datasets (e.g., if you want to generate bigger datasets), by running the `inputgen.c` program. The program takes two arguments: the number of records in the dataset and the output file. 5 | You can compile all the code available in this directory by running the `compile.sh` script. 6 | 7 | The `blackscholes.c` application takes two arguments from command line: the input file and the output file. The program runs some correctness checks. You should implement two parallel versions of this application, one using OpenMP and another using Pthreads. The parallel versions should read the input file and write the output file in the same format as the sequential version. The parallel versions should also produce the same results as the sequential version (if not, the program will print error messages). 8 | 9 | In the `solution` folder, you will find the proposed solutions. 10 | 11 | The code for both the sequential version and the parallel solution has been adapted from the PARSEC benchmark. -------------------------------------------------------------------------------- /lec13/blackscholes.c: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2007 Intel Corp. 2 | 3 | // Black-Scholes 4 | // Analytical method for calculating European Options 5 | // 6 | // 7 | // Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice 8 | // Hall, John C. 
Hull, 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "my_timer.h" 15 | 16 | //Precision to use for calculations 17 | #define fptype float 18 | 19 | #define NUM_RUNS 1000 20 | 21 | typedef struct OptionData_ { 22 | fptype s; // spot price 23 | fptype strike; // strike price 24 | fptype r; // risk-free interest rate 25 | fptype divq; // dividend rate 26 | fptype v; // volatility 27 | fptype t; // time to maturity or option expiration in years 28 | // (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc) 29 | char OptionType; // Option type. "P"=PUT, "C"=CALL 30 | fptype divs; // dividend vals (not used in this test) 31 | fptype DGrefval; // DerivaGem Reference Value 32 | } OptionData; 33 | 34 | OptionData *data; 35 | fptype *prices; 36 | int numOptions; 37 | 38 | int * otype; 39 | fptype * sptprice; 40 | fptype * strike; 41 | fptype * rate; 42 | fptype * volatility; 43 | fptype * otime; 44 | int numError = 0; 45 | 46 | //////////////////////////////////////////////////////////////////////////////// 47 | //////////////////////////////////////////////////////////////////////////////// 48 | /////////////////////////////////////////////////////////////////////////////// 49 | //////////////////////////////////////////////////////////////////////////////// 50 | // Cumulative Normal Distribution Function 51 | // See Hull, Section 11.8, P.243-244 52 | #define inv_sqrt_2xPI 0.39894228040143270286 53 | 54 | fptype CNDF ( fptype InputX ) 55 | { 56 | int sign; 57 | 58 | fptype OutputX; 59 | fptype xInput; 60 | fptype xNPrimeofX; 61 | fptype expValues; 62 | fptype xK2; 63 | fptype xK2_2, xK2_3; 64 | fptype xK2_4, xK2_5; 65 | fptype xLocal, xLocal_1; 66 | fptype xLocal_2, xLocal_3; 67 | 68 | // Check for negative value of InputX 69 | if (InputX < 0.0) { 70 | InputX = -InputX; 71 | sign = 1; 72 | } else 73 | sign = 0; 74 | 75 | xInput = InputX; 76 | 77 | // Compute NPrimeX term common to both four & six decimal accuracy calcs 78 | expValues = exp(-0.5f * InputX 
* InputX); 79 | xNPrimeofX = expValues; 80 | xNPrimeofX = xNPrimeofX * inv_sqrt_2xPI; 81 | 82 | xK2 = 0.2316419 * xInput; 83 | xK2 = 1.0 + xK2; 84 | xK2 = 1.0 / xK2; 85 | xK2_2 = xK2 * xK2; 86 | xK2_3 = xK2_2 * xK2; 87 | xK2_4 = xK2_3 * xK2; 88 | xK2_5 = xK2_4 * xK2; 89 | 90 | xLocal_1 = xK2 * 0.319381530; 91 | xLocal_2 = xK2_2 * (-0.356563782); 92 | xLocal_3 = xK2_3 * 1.781477937; 93 | xLocal_2 = xLocal_2 + xLocal_3; 94 | xLocal_3 = xK2_4 * (-1.821255978); 95 | xLocal_2 = xLocal_2 + xLocal_3; 96 | xLocal_3 = xK2_5 * 1.330274429; 97 | xLocal_2 = xLocal_2 + xLocal_3; 98 | 99 | xLocal_1 = xLocal_2 + xLocal_1; 100 | xLocal = xLocal_1 * xNPrimeofX; 101 | xLocal = 1.0 - xLocal; 102 | 103 | OutputX = xLocal; 104 | 105 | if (sign) { 106 | OutputX = 1.0 - OutputX; 107 | } 108 | 109 | return OutputX; 110 | } 111 | 112 | ////////////////////////////////////////////////////////////////////////////////////// 113 | ////////////////////////////////////////////////////////////////////////////////////// 114 | ////////////////////////////////////////////////////////////////////////////////////// 115 | ////////////////////////////////////////////////////////////////////////////////////// 116 | fptype BlkSchlsEqEuroNoDiv( fptype sptprice, 117 | fptype strike, fptype rate, fptype volatility, 118 | fptype time, int otype, float timet ) 119 | { 120 | fptype OptionPrice; 121 | 122 | // local private working variables for the calculation 123 | fptype xRiskFreeRate; 124 | fptype xVolatility; 125 | fptype xTime; 126 | fptype xSqrtTime; 127 | 128 | fptype logValues; 129 | fptype xLogTerm; 130 | fptype xD1; 131 | fptype xD2; 132 | fptype xPowerTerm; 133 | fptype xDen; 134 | fptype d1; 135 | fptype d2; 136 | fptype FutureValueX; 137 | fptype NofXd1; 138 | fptype NofXd2; 139 | fptype NegNofXd1; 140 | fptype NegNofXd2; 141 | 142 | xRiskFreeRate = rate; 143 | xVolatility = volatility; 144 | 145 | xTime = time; 146 | xSqrtTime = sqrt(xTime); 147 | 148 | logValues = log( sptprice / strike ); 149 | 
150 | xLogTerm = logValues; 151 | 152 | 153 | xPowerTerm = xVolatility * xVolatility; 154 | xPowerTerm = xPowerTerm * 0.5; 155 | 156 | xD1 = xRiskFreeRate + xPowerTerm; 157 | xD1 = xD1 * xTime; 158 | xD1 = xD1 + xLogTerm; 159 | 160 | xDen = xVolatility * xSqrtTime; 161 | xD1 = xD1 / xDen; 162 | xD2 = xD1 - xDen; 163 | 164 | d1 = xD1; 165 | d2 = xD2; 166 | 167 | NofXd1 = CNDF( d1 ); 168 | NofXd2 = CNDF( d2 ); 169 | 170 | FutureValueX = strike * ( exp( -(rate)*(time) ) ); 171 | if (otype == 0) { 172 | OptionPrice = (sptprice * NofXd1) - (FutureValueX * NofXd2); 173 | } else { 174 | NegNofXd1 = (1.0 - NofXd1); 175 | NegNofXd2 = (1.0 - NofXd2); 176 | OptionPrice = (FutureValueX * NegNofXd2) - (sptprice * NegNofXd1); 177 | } 178 | 179 | return OptionPrice; 180 | } 181 | 182 | int main (int argc, char **argv) 183 | { 184 | FILE *file; 185 | int i; 186 | int loopnum; 187 | int rv; 188 | 189 | if (argc != 3) { 190 | printf("Usage:\n\t%s \n", argv[0]); 191 | exit(1); 192 | } 193 | char *inputFile = argv[1]; 194 | char *outputFile = argv[2]; 195 | 196 | //Read input data from file 197 | file = fopen(inputFile, "r"); 198 | if(file == NULL) { 199 | printf("ERROR: Unable to open file `%s'.\n", inputFile); 200 | exit(1); 201 | } 202 | rv = fscanf(file, "%i", &numOptions); 203 | if(rv != 1) { 204 | printf("ERROR: Unable to read from file `%s'.\n", inputFile); 205 | fclose(file); 206 | exit(1); 207 | } 208 | 209 | // alloc spaces for the option data 210 | data = (OptionData*)malloc(numOptions*sizeof(OptionData)); 211 | prices = (fptype*)malloc(numOptions*sizeof(fptype)); 212 | for ( loopnum = 0; loopnum < numOptions; ++ loopnum ) 213 | { 214 | rv = fscanf(file, "%f %f %f %f %f %f %c %f %f", &data[loopnum].s, &data[loopnum].strike, &data[loopnum].r, &data[loopnum].divq, &data[loopnum].v, &data[loopnum].t, &data[loopnum].OptionType, &data[loopnum].divs, &data[loopnum].DGrefval); 215 | if(rv != 9) { 216 | printf("ERROR: Unable to read from file `%s'.\n", inputFile); 217 | 
fclose(file); 218 | exit(1); 219 | } 220 | } 221 | rv = fclose(file); 222 | if(rv != 0) { 223 | printf("ERROR: Unable to close file `%s'.\n", inputFile); 224 | exit(1); 225 | } 226 | 227 | printf("Num of Options: %d\n", numOptions); 228 | printf("Num of Runs: %d\n", NUM_RUNS); 229 | 230 | sptprice = (fptype *) malloc(5 * numOptions * sizeof(fptype)); 231 | strike = sptprice + numOptions; 232 | rate = strike + numOptions; 233 | volatility = rate + numOptions; 234 | otime = volatility + numOptions; 235 | 236 | otype = (int *) malloc(numOptions * sizeof(fptype)); 237 | 238 | for (i=0; i= 1e-4 ){ 271 | printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n", 272 | i, price, data[i].DGrefval, priceDelta); 273 | numError ++; 274 | } 275 | #endif 276 | } 277 | } 278 | GET_TIME(stop); 279 | end = stop - start; 280 | printf("Time: %f seconds\n", end); 281 | 282 | 283 | //Write prices to output file 284 | file = fopen(outputFile, "w"); 285 | if(file == NULL) { 286 | printf("ERROR: Unable to open file `%s'.\n", outputFile); 287 | exit(1); 288 | } 289 | rv = fprintf(file, "%i\n", numOptions); 290 | if(rv < 0) { 291 | printf("ERROR: Unable to write to file `%s'.\n", outputFile); 292 | fclose(file); 293 | exit(1); 294 | } 295 | for(i=0; i 6 | #include 7 | 8 | 9 | 10 | //Precision to use 11 | #define fptype double 12 | 13 | typedef struct OptionData_ { 14 | fptype s; // spot price 15 | fptype strike; // strike price 16 | fptype r; // risk-free interest rate 17 | fptype divq; // dividend rate 18 | fptype v; // volatility 19 | fptype t; // time to maturity or option expiration in years 20 | // (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc) 21 | const char *OptionType; // Option type. 
"P"=PUT, "C"=CALL 22 | fptype divs; // dividend vals (not used in this test) 23 | fptype DGrefval; // DerivaGem Reference Value 24 | } OptionData; 25 | 26 | //Total number of options in optionData.txt 27 | #define MAX_OPTIONS 1000 28 | 29 | OptionData data_init[] = { 30 | #include "optionData.txt" 31 | }; 32 | 33 | 34 | 35 | int main (int argc, char **argv) { 36 | int numOptions; 37 | char *fileName; 38 | int rv; 39 | int i; 40 | 41 | if (argc != 3) { 42 | printf("Usage:\n\t%s \n", argv[0]); 43 | exit(1); 44 | } 45 | numOptions = atoi(argv[1]); 46 | fileName = argv[2]; 47 | if(numOptions < 1) { 48 | printf("ERROR: Number of options must at least be 1.\n"); 49 | exit(1); 50 | } 51 | 52 | FILE *file; 53 | file = fopen(fileName, "w"); 54 | if(file == NULL) { 55 | printf("ERROR: Unable to open file `%s'.\n", fileName); 56 | exit(1); 57 | } 58 | 59 | //write number of options 60 | rv = fprintf(file, "%i\n", numOptions); 61 | if(rv < 0) { 62 | printf("ERROR: Unable to write to file `%s'.\n", fileName); 63 | fclose(file); 64 | exit(1); 65 | } 66 | 67 | //write values for options 68 | for(i=0; i 2 | 3 | #define GET_TIME(now) { \ 4 | struct timeval t; \ 5 | gettimeofday(&t, NULL); \ 6 | now = t.tv_sec + t.tv_usec/1000000.0; \ 7 | } 8 | -------------------------------------------------------------------------------- /lec13/solution/blackscholes_omp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanieleDeSensi/multicore-programming/d8d141d0d4e11d67f6474db3fde2ed0cf4c588ca/lec13/solution/blackscholes_omp -------------------------------------------------------------------------------- /lec13/solution/blackscholes_omp.c: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2007 Intel Corp. 
2 | 3 | // Black-Scholes 4 | // Analytical method for calculating European Options 5 | // 6 | // 7 | // Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice 8 | // Hall, John C. Hull, 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "../my_timer.h" 15 | 16 | // Multi-threaded OpenMP header 17 | #include 18 | 19 | //Precision to use for calculations 20 | #define fptype float 21 | 22 | #define NUM_RUNS 1000 23 | 24 | typedef struct OptionData_ { 25 | fptype s; // spot price 26 | fptype strike; // strike price 27 | fptype r; // risk-free interest rate 28 | fptype divq; // dividend rate 29 | fptype v; // volatility 30 | fptype t; // time to maturity or option expiration in years 31 | // (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc) 32 | char OptionType; // Option type. "P"=PUT, "C"=CALL 33 | fptype divs; // dividend vals (not used in this test) 34 | fptype DGrefval; // DerivaGem Reference Value 35 | } OptionData; 36 | 37 | OptionData *data; 38 | fptype *prices; 39 | int numOptions; 40 | 41 | int * otype; 42 | fptype * sptprice; 43 | fptype * strike; 44 | fptype * rate; 45 | fptype * volatility; 46 | fptype * otime; 47 | int numError = 0; 48 | int nThreads; 49 | 50 | //////////////////////////////////////////////////////////////////////////////// 51 | //////////////////////////////////////////////////////////////////////////////// 52 | /////////////////////////////////////////////////////////////////////////////// 53 | //////////////////////////////////////////////////////////////////////////////// 54 | // Cumulative Normal Distribution Function 55 | // See Hull, Section 11.8, P.243-244 56 | #define inv_sqrt_2xPI 0.39894228040143270286 57 | 58 | fptype CNDF ( fptype InputX ) 59 | { 60 | int sign; 61 | 62 | fptype OutputX; 63 | fptype xInput; 64 | fptype xNPrimeofX; 65 | fptype expValues; 66 | fptype xK2; 67 | fptype xK2_2, xK2_3; 68 | fptype xK2_4, xK2_5; 69 | fptype xLocal, xLocal_1; 70 | fptype xLocal_2, xLocal_3; 
71 | 72 | // Check for negative value of InputX 73 | if (InputX < 0.0) { 74 | InputX = -InputX; 75 | sign = 1; 76 | } else 77 | sign = 0; 78 | 79 | xInput = InputX; 80 | 81 | // Compute NPrimeX term common to both four & six decimal accuracy calcs 82 | expValues = exp(-0.5f * InputX * InputX); 83 | xNPrimeofX = expValues; 84 | xNPrimeofX = xNPrimeofX * inv_sqrt_2xPI; 85 | 86 | xK2 = 0.2316419 * xInput; 87 | xK2 = 1.0 + xK2; 88 | xK2 = 1.0 / xK2; 89 | xK2_2 = xK2 * xK2; 90 | xK2_3 = xK2_2 * xK2; 91 | xK2_4 = xK2_3 * xK2; 92 | xK2_5 = xK2_4 * xK2; 93 | 94 | xLocal_1 = xK2 * 0.319381530; 95 | xLocal_2 = xK2_2 * (-0.356563782); 96 | xLocal_3 = xK2_3 * 1.781477937; 97 | xLocal_2 = xLocal_2 + xLocal_3; 98 | xLocal_3 = xK2_4 * (-1.821255978); 99 | xLocal_2 = xLocal_2 + xLocal_3; 100 | xLocal_3 = xK2_5 * 1.330274429; 101 | xLocal_2 = xLocal_2 + xLocal_3; 102 | 103 | xLocal_1 = xLocal_2 + xLocal_1; 104 | xLocal = xLocal_1 * xNPrimeofX; 105 | xLocal = 1.0 - xLocal; 106 | 107 | OutputX = xLocal; 108 | 109 | if (sign) { 110 | OutputX = 1.0 - OutputX; 111 | } 112 | 113 | return OutputX; 114 | } 115 | 116 | ////////////////////////////////////////////////////////////////////////////////////// 117 | ////////////////////////////////////////////////////////////////////////////////////// 118 | ////////////////////////////////////////////////////////////////////////////////////// 119 | ////////////////////////////////////////////////////////////////////////////////////// 120 | fptype BlkSchlsEqEuroNoDiv( fptype sptprice, 121 | fptype strike, fptype rate, fptype volatility, 122 | fptype time, int otype, float timet ) 123 | { 124 | fptype OptionPrice; 125 | 126 | // local private working variables for the calculation 127 | fptype xRiskFreeRate; 128 | fptype xVolatility; 129 | fptype xTime; 130 | fptype xSqrtTime; 131 | 132 | fptype logValues; 133 | fptype xLogTerm; 134 | fptype xD1; 135 | fptype xD2; 136 | fptype xPowerTerm; 137 | fptype xDen; 138 | fptype d1; 139 | fptype d2; 140 | 
fptype FutureValueX; 141 | fptype NofXd1; 142 | fptype NofXd2; 143 | fptype NegNofXd1; 144 | fptype NegNofXd2; 145 | 146 | xRiskFreeRate = rate; 147 | xVolatility = volatility; 148 | 149 | xTime = time; 150 | xSqrtTime = sqrt(xTime); 151 | 152 | logValues = log( sptprice / strike ); 153 | 154 | xLogTerm = logValues; 155 | 156 | 157 | xPowerTerm = xVolatility * xVolatility; 158 | xPowerTerm = xPowerTerm * 0.5; 159 | 160 | xD1 = xRiskFreeRate + xPowerTerm; 161 | xD1 = xD1 * xTime; 162 | xD1 = xD1 + xLogTerm; 163 | 164 | xDen = xVolatility * xSqrtTime; 165 | xD1 = xD1 / xDen; 166 | xD2 = xD1 - xDen; 167 | 168 | d1 = xD1; 169 | d2 = xD2; 170 | 171 | NofXd1 = CNDF( d1 ); 172 | NofXd2 = CNDF( d2 ); 173 | 174 | FutureValueX = strike * ( exp( -(rate)*(time) ) ); 175 | if (otype == 0) { 176 | OptionPrice = (sptprice * NofXd1) - (FutureValueX * NofXd2); 177 | } else { 178 | NegNofXd1 = (1.0 - NofXd1); 179 | NegNofXd2 = (1.0 - NofXd2); 180 | OptionPrice = (FutureValueX * NegNofXd2) - (sptprice * NegNofXd1); 181 | } 182 | 183 | return OptionPrice; 184 | } 185 | 186 | int main (int argc, char **argv) 187 | { 188 | FILE *file; 189 | int i; 190 | int loopnum; 191 | int rv; 192 | 193 | if (argc != 4) 194 | { 195 | printf("Usage:\n\t%s \n", argv[0]); 196 | exit(1); 197 | } 198 | nThreads = atoi(argv[1]); 199 | char *inputFile = argv[2]; 200 | char *outputFile = argv[3]; 201 | 202 | //Read input data from file 203 | file = fopen(inputFile, "r"); 204 | if(file == NULL) { 205 | printf("ERROR: Unable to open file `%s'.\n", inputFile); 206 | exit(1); 207 | } 208 | rv = fscanf(file, "%i", &numOptions); 209 | if(rv != 1) { 210 | printf("ERROR: Unable to read from file `%s'.\n", inputFile); 211 | fclose(file); 212 | exit(1); 213 | } 214 | if(nThreads > numOptions) { 215 | printf("WARNING: Not enough work, reducing number of threads to match number of options.\n"); 216 | nThreads = numOptions; 217 | } 218 | 219 | // alloc spaces for the option data 220 | data = 
(OptionData*)malloc(numOptions*sizeof(OptionData)); 221 | prices = (fptype*)malloc(numOptions*sizeof(fptype)); 222 | for ( loopnum = 0; loopnum < numOptions; ++ loopnum ) 223 | { 224 | rv = fscanf(file, "%f %f %f %f %f %f %c %f %f", &data[loopnum].s, &data[loopnum].strike, &data[loopnum].r, &data[loopnum].divq, &data[loopnum].v, &data[loopnum].t, &data[loopnum].OptionType, &data[loopnum].divs, &data[loopnum].DGrefval); 225 | if(rv != 9) { 226 | printf("ERROR: Unable to read from file `%s'.\n", inputFile); 227 | fclose(file); 228 | exit(1); 229 | } 230 | } 231 | rv = fclose(file); 232 | if(rv != 0) { 233 | printf("ERROR: Unable to close file `%s'.\n", inputFile); 234 | exit(1); 235 | } 236 | 237 | printf("Num of Options: %d\n", numOptions); 238 | printf("Num of Runs: %d\n", NUM_RUNS); 239 | 240 | 241 | sptprice = (fptype *) malloc(5 * numOptions * sizeof(fptype)); 242 | strike = sptprice + numOptions; 243 | rate = strike + numOptions; 244 | volatility = rate + numOptions; 245 | otime = volatility + numOptions; 246 | 247 | 248 | otype = (int *) malloc(numOptions * sizeof(fptype)); 249 | 250 | for (i=0; i= 1e-4 ){ 287 | printf("Error on %d. 
Computed=%.5f, Ref=%.5f, Delta=%.5f\n", 288 | i, price, data[i].DGrefval, priceDelta); 289 | numError ++; 290 | } 291 | #endif 292 | } 293 | } 294 | } 295 | GET_TIME(stop); 296 | end = stop - start; 297 | printf("Time: %f seconds\n", end); 298 | 299 | //Write prices to output file 300 | file = fopen(outputFile, "w"); 301 | if(file == NULL) { 302 | printf("ERROR: Unable to open file `%s'.\n", outputFile); 303 | exit(1); 304 | } 305 | rv = fprintf(file, "%i\n", numOptions); 306 | if(rv < 0) { 307 | printf("ERROR: Unable to write to file `%s'.\n", outputFile); 308 | fclose(file); 309 | exit(1); 310 | } 311 | for(i=0; i 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "../my_timer.h" 16 | 17 | //Precision to use for calculations 18 | #define fptype float 19 | 20 | #define NUM_RUNS 1000 21 | 22 | typedef struct OptionData_ { 23 | fptype s; // spot price 24 | fptype strike; // strike price 25 | fptype r; // risk-free interest rate 26 | fptype divq; // dividend rate 27 | fptype v; // volatility 28 | fptype t; // time to maturity or option expiration in years 29 | // (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc) 30 | char OptionType; // Option type. 
"P"=PUT, "C"=CALL 31 | fptype divs; // dividend vals (not used in this test) 32 | fptype DGrefval; // DerivaGem Reference Value 33 | } OptionData; 34 | 35 | OptionData *data; 36 | fptype *prices; 37 | int numOptions; 38 | 39 | int * otype; 40 | fptype * sptprice; 41 | fptype * strike; 42 | fptype * rate; 43 | fptype * volatility; 44 | fptype * otime; 45 | int numError = 0; 46 | int nThreads; 47 | 48 | //////////////////////////////////////////////////////////////////////////////// 49 | //////////////////////////////////////////////////////////////////////////////// 50 | /////////////////////////////////////////////////////////////////////////////// 51 | //////////////////////////////////////////////////////////////////////////////// 52 | // Cumulative Normal Distribution Function 53 | // See Hull, Section 11.8, P.243-244 54 | #define inv_sqrt_2xPI 0.39894228040143270286 55 | 56 | fptype CNDF ( fptype InputX ) 57 | { 58 | int sign; 59 | 60 | fptype OutputX; 61 | fptype xInput; 62 | fptype xNPrimeofX; 63 | fptype expValues; 64 | fptype xK2; 65 | fptype xK2_2, xK2_3; 66 | fptype xK2_4, xK2_5; 67 | fptype xLocal, xLocal_1; 68 | fptype xLocal_2, xLocal_3; 69 | 70 | // Check for negative value of InputX 71 | if (InputX < 0.0) { 72 | InputX = -InputX; 73 | sign = 1; 74 | } else 75 | sign = 0; 76 | 77 | xInput = InputX; 78 | 79 | // Compute NPrimeX term common to both four & six decimal accuracy calcs 80 | expValues = exp(-0.5f * InputX * InputX); 81 | xNPrimeofX = expValues; 82 | xNPrimeofX = xNPrimeofX * inv_sqrt_2xPI; 83 | 84 | xK2 = 0.2316419 * xInput; 85 | xK2 = 1.0 + xK2; 86 | xK2 = 1.0 / xK2; 87 | xK2_2 = xK2 * xK2; 88 | xK2_3 = xK2_2 * xK2; 89 | xK2_4 = xK2_3 * xK2; 90 | xK2_5 = xK2_4 * xK2; 91 | 92 | xLocal_1 = xK2 * 0.319381530; 93 | xLocal_2 = xK2_2 * (-0.356563782); 94 | xLocal_3 = xK2_3 * 1.781477937; 95 | xLocal_2 = xLocal_2 + xLocal_3; 96 | xLocal_3 = xK2_4 * (-1.821255978); 97 | xLocal_2 = xLocal_2 + xLocal_3; 98 | xLocal_3 = xK2_5 * 1.330274429; 99 | 
xLocal_2 = xLocal_2 + xLocal_3; 100 | 101 | xLocal_1 = xLocal_2 + xLocal_1; 102 | xLocal = xLocal_1 * xNPrimeofX; 103 | xLocal = 1.0 - xLocal; 104 | 105 | OutputX = xLocal; 106 | 107 | if (sign) { 108 | OutputX = 1.0 - OutputX; 109 | } 110 | 111 | return OutputX; 112 | } 113 | 114 | ////////////////////////////////////////////////////////////////////////////////////// 115 | ////////////////////////////////////////////////////////////////////////////////////// 116 | ////////////////////////////////////////////////////////////////////////////////////// 117 | ////////////////////////////////////////////////////////////////////////////////////// 118 | fptype BlkSchlsEqEuroNoDiv( fptype sptprice, 119 | fptype strike, fptype rate, fptype volatility, 120 | fptype time, int otype, float timet ) 121 | { 122 | fptype OptionPrice; 123 | 124 | // local private working variables for the calculation 125 | fptype xRiskFreeRate; 126 | fptype xVolatility; 127 | fptype xTime; 128 | fptype xSqrtTime; 129 | 130 | fptype logValues; 131 | fptype xLogTerm; 132 | fptype xD1; 133 | fptype xD2; 134 | fptype xPowerTerm; 135 | fptype xDen; 136 | fptype d1; 137 | fptype d2; 138 | fptype FutureValueX; 139 | fptype NofXd1; 140 | fptype NofXd2; 141 | fptype NegNofXd1; 142 | fptype NegNofXd2; 143 | 144 | xRiskFreeRate = rate; 145 | xVolatility = volatility; 146 | 147 | xTime = time; 148 | xSqrtTime = sqrt(xTime); 149 | 150 | logValues = log( sptprice / strike ); 151 | 152 | xLogTerm = logValues; 153 | 154 | 155 | xPowerTerm = xVolatility * xVolatility; 156 | xPowerTerm = xPowerTerm * 0.5; 157 | 158 | xD1 = xRiskFreeRate + xPowerTerm; 159 | xD1 = xD1 * xTime; 160 | xD1 = xD1 + xLogTerm; 161 | 162 | xDen = xVolatility * xSqrtTime; 163 | xD1 = xD1 / xDen; 164 | xD2 = xD1 - xDen; 165 | 166 | d1 = xD1; 167 | d2 = xD2; 168 | 169 | NofXd1 = CNDF( d1 ); 170 | NofXd2 = CNDF( d2 ); 171 | 172 | FutureValueX = strike * ( exp( -(rate)*(time) ) ); 173 | if (otype == 0) { 174 | OptionPrice = (sptprice * NofXd1) 
- (FutureValueX * NofXd2); 175 | } else { 176 | NegNofXd1 = (1.0 - NofXd1); 177 | NegNofXd2 = (1.0 - NofXd2); 178 | OptionPrice = (FutureValueX * NegNofXd2) - (sptprice * NegNofXd1); 179 | } 180 | 181 | return OptionPrice; 182 | } 183 | 184 | void* bs_thread(void *tid_ptr) { 185 | int i, j; 186 | fptype price; 187 | fptype priceDelta; 188 | int tid = *(int *)tid_ptr; 189 | int start = tid * (numOptions / nThreads); 190 | int end = start + (numOptions / nThreads); 191 | 192 | for (j=0; j= 1e-4 ){ 205 | printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n", 206 | i, price, data[i].DGrefval, priceDelta); 207 | numError ++; 208 | } 209 | #endif 210 | } 211 | } 212 | 213 | return NULL; 214 | } 215 | 216 | int main (int argc, char **argv) 217 | { 218 | FILE *file; 219 | int i; 220 | int loopnum; 221 | int rv; 222 | 223 | if (argc != 4) 224 | { 225 | printf("Usage:\n\t%s \n", argv[0]); 226 | exit(1); 227 | } 228 | nThreads = atoi(argv[1]); 229 | char *inputFile = argv[2]; 230 | char *outputFile = argv[3]; 231 | 232 | //Read input data from file 233 | file = fopen(inputFile, "r"); 234 | if(file == NULL) { 235 | printf("ERROR: Unable to open file `%s'.\n", inputFile); 236 | exit(1); 237 | } 238 | rv = fscanf(file, "%i", &numOptions); 239 | if(rv != 1) { 240 | printf("ERROR: Unable to read from file `%s'.\n", inputFile); 241 | fclose(file); 242 | exit(1); 243 | } 244 | if(nThreads > numOptions) { 245 | printf("WARNING: Not enough work, reducing number of threads to match number of options.\n"); 246 | nThreads = numOptions; 247 | } 248 | 249 | // alloc spaces for the option data 250 | data = (OptionData*)malloc(numOptions*sizeof(OptionData)); 251 | prices = (fptype*)malloc(numOptions*sizeof(fptype)); 252 | for ( loopnum = 0; loopnum < numOptions; ++ loopnum ) 253 | { 254 | rv = fscanf(file, "%f %f %f %f %f %f %c %f %f", &data[loopnum].s, &data[loopnum].strike, &data[loopnum].r, &data[loopnum].divq, &data[loopnum].v, &data[loopnum].t, &data[loopnum].OptionType, 
&data[loopnum].divs, &data[loopnum].DGrefval); 255 | if(rv != 9) { 256 | printf("ERROR: Unable to read from file `%s'.\n", inputFile); 257 | fclose(file); 258 | exit(1); 259 | } 260 | } 261 | rv = fclose(file); 262 | if(rv != 0) { 263 | printf("ERROR: Unable to close file `%s'.\n", inputFile); 264 | exit(1); 265 | } 266 | 267 | printf("Num of Options: %d\n", numOptions); 268 | printf("Num of Runs: %d\n", NUM_RUNS); 269 | 270 | sptprice = (fptype *) malloc(5 * numOptions * sizeof(fptype)); 271 | strike = sptprice + numOptions; 272 | rate = strike + numOptions; 273 | volatility = rate + numOptions; 274 | otime = volatility + numOptions; 275 | 276 | otype = (int *) malloc(numOptions * sizeof(fptype)); 277 | 278 | for (i=0; i 6 | #include 7 | #include 8 | 9 | #define ARRAY_SIZE 100000000 10 | 11 | int main(int argc, char** argv){ 12 | int max = atoi(argv[1]); 13 | int* array = (int*)malloc(ARRAY_SIZE * sizeof(int)); 14 | int* counts = (int*)malloc(max * sizeof(int)); 15 | // Generate random array 16 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 17 | array[i] = rand() % max; 18 | } 19 | 20 | double start = omp_get_wtime(); 21 | for(unsigned long i = 0; i < max; i++){ 22 | counts[i] = 0; 23 | } 24 | 25 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 26 | counts[array[i]]++; 27 | } 28 | double stop = omp_get_wtime(); 29 | 30 | for(unsigned long i = 0; i < max; i++){ 31 | printf("%d elements with value %ld\n", counts[i], i); 32 | } 33 | printf("Total runtime: %f secs\n", stop - start); 34 | 35 | free(counts); 36 | free(array); 37 | return 0; 38 | } -------------------------------------------------------------------------------- /lec15/histogram_solution_ato_local.c: -------------------------------------------------------------------------------- 1 | // Implements counting sort 2 | // First argument from command line is the maximum value that each element in the array can have. 
3 | // e.g., if the argument is 100, then each element of the array can contain a value between 0 and 100. 4 | #include 5 | #include 6 | #include 7 | 8 | #define ARRAY_SIZE 100000000 9 | 10 | int main(int argc, char** argv){ 11 | int max = atoi(argv[1]); 12 | int* array = (int*)malloc(ARRAY_SIZE * sizeof(int)); 13 | int* counts = (int*)malloc(max * sizeof(int)); 14 | int* counts_reference = (int*)malloc(max * sizeof(int)); // Compute sequentially and use for error checks 15 | // Generate random array 16 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 17 | array[i] = rand() % max; 18 | } 19 | 20 | // Compute reference counts for error check 21 | for(unsigned long i = 0; i < max; i++){ 22 | counts_reference[i] = 0; 23 | } 24 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 25 | counts_reference[array[i]]++; 26 | } 27 | 28 | // Create local counts -- ATTENTION: We need to do it for omp_get_max_threads (we do not know how many threads will be used) 29 | // Alternatively, we could check the value of the OMP_NUM_THREADS env variable 30 | int** counts_local = (int**)malloc(omp_get_max_threads() * sizeof(int*)); 31 | for(int i = 0; i < omp_get_max_threads(); i++){ 32 | counts_local[i] = (int*)malloc(max * sizeof(int)); 33 | for(unsigned long j = 0; j < max; j++){ 34 | counts_local[i][j] = 0; 35 | } 36 | } 37 | 38 | double start = omp_get_wtime(); 39 | for(unsigned long i = 0; i < max; i++){ 40 | counts[i] = 0; 41 | } 42 | 43 | #pragma omp parallel 44 | { 45 | int tid = omp_get_thread_num(); 46 | #pragma omp for 47 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 48 | counts_local[tid][array[i]]++; // ATTENTION: Still some false sharing might happen here 49 | } 50 | 51 | #pragma omp for 52 | for(int t = 0; t < omp_get_num_threads(); t++){ 53 | for(unsigned long i = 0; i < max; i++){ 54 | #pragma omp atomic 55 | counts[i] += counts_local[t][i]; 56 | } 57 | } 58 | } 59 | 60 | double stop = omp_get_wtime(); 61 | 62 | for(unsigned long i = 0; i < max; i++){ 63 | if(counts[i] 
!= counts_reference[i]){ 64 | fprintf(stderr, "Error: counts[%lu] = %d, counts_reference[%lu] = %d\n", i, counts[i], i, counts_reference[i]); 65 | return 1; 66 | } 67 | printf("%d elements with value %ld\n", counts[i], i); 68 | } 69 | printf("Total runtime: %f secs\n", stop - start); 70 | 71 | for(int i = 0; i < omp_get_max_threads(); i++){ 72 | free(counts_local[i]); 73 | } 74 | free(counts_local); 75 | free(counts_reference); 76 | free(counts); 77 | free(array); 78 | return 0; 79 | } -------------------------------------------------------------------------------- /lec15/histogram_solution_ato_local_better.c: -------------------------------------------------------------------------------- 1 | // Implements counting sort 2 | // First argument from command line is the maximum value that each element in the array can have. 3 | // e.g., if the argument is 100, then each element of the array can contain a value between 0 and 100. 4 | #include 5 | #include 6 | #include 7 | 8 | #define ARRAY_SIZE 100000000 9 | 10 | int main(int argc, char** argv){ 11 | int max = atoi(argv[1]); 12 | int* array = (int*)malloc(ARRAY_SIZE * sizeof(int)); 13 | int* counts = (int*)malloc(max * sizeof(int)); 14 | int* counts_reference = (int*)malloc(max * sizeof(int)); // Compute sequentially and use for error checks 15 | // Generate random array 16 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 17 | array[i] = rand() % max; 18 | } 19 | 20 | // Compute reference counts for error check 21 | for(unsigned long i = 0; i < max; i++){ 22 | counts_reference[i] = 0; 23 | } 24 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 25 | counts_reference[array[i]]++; 26 | } 27 | 28 | // Create local counts -- ATTENTION: We need to do it for omp_get_max_threads (we do not know how many threads will be used) 29 | // Alternatively, we could check the value of the OMP_NUM_THREADS env variable 30 | int** counts_local = (int**)malloc(omp_get_max_threads() * sizeof(int*)); 31 | for(int i = 0; i < 
omp_get_max_threads(); i++){ 32 | int adjusted_size = (max*sizeof(int) + 64); // Add padding (assuming 64 byte cache line size) 33 | counts_local[i] = (int*)malloc(adjusted_size); 34 | for(unsigned long j = 0; j < max; j++){ 35 | counts_local[i][j] = 0; 36 | } 37 | } 38 | 39 | double start = omp_get_wtime(); 40 | for(unsigned long i = 0; i < max; i++){ 41 | counts[i] = 0; 42 | } 43 | 44 | #pragma omp parallel 45 | { 46 | int tid = omp_get_thread_num(); 47 | #pragma omp for 48 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 49 | counts_local[tid][array[i]]++; 50 | } 51 | 52 | #pragma omp for 53 | for(int t = 0; t < omp_get_num_threads(); t++){ 54 | for(unsigned long i = 0; i < max; i++){ 55 | #pragma omp atomic 56 | counts[i] += counts_local[t][i]; 57 | } 58 | } 59 | } 60 | 61 | double stop = omp_get_wtime(); 62 | 63 | for(unsigned long i = 0; i < max; i++){ 64 | if(counts[i] != counts_reference[i]){ 65 | fprintf(stderr, "Error: counts[%lu] = %d, counts_reference[%lu] = %d\n", i, counts[i], i, counts_reference[i]); 66 | return 1; 67 | } 68 | printf("%d elements with value %ld\n", counts[i], i); 69 | } 70 | printf("Total runtime: %f secs\n", stop - start); 71 | 72 | for(int i = 0; i < omp_get_max_threads(); i++){ 73 | free(counts_local[i]); 74 | } 75 | free(counts_local); 76 | free(counts_reference); 77 | free(counts); 78 | free(array); 79 | return 0; 80 | } -------------------------------------------------------------------------------- /lec15/histogram_solution_red.c: -------------------------------------------------------------------------------- 1 | // Implements counting sort 2 | // First argument from command line is the maximum value that each element in the array can have. 3 | // e.g., if the argument is 100, then each element of the array can contain a value between 0 and 100. 
4 | #include 5 | #include 6 | #include 7 | 8 | #define ARRAY_SIZE 100000000 9 | 10 | int main(int argc, char** argv){ 11 | int max = atoi(argv[1]); 12 | int* array = (int*)malloc(ARRAY_SIZE * sizeof(int)); 13 | int* counts = (int*)malloc(max * sizeof(int)); 14 | int* counts_reference = (int*)malloc(max * sizeof(int)); // Compute sequentially and use for error checks 15 | // Generate random array 16 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 17 | array[i] = rand() % max; 18 | } 19 | 20 | // Compute reference counts for error check 21 | for(unsigned long i = 0; i < max; i++){ 22 | counts_reference[i] = 0; 23 | } 24 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 25 | counts_reference[array[i]]++; 26 | } 27 | 28 | double start = omp_get_wtime(); 29 | for(unsigned long i = 0; i < max; i++){ 30 | counts[i] = 0; 31 | } 32 | 33 | #pragma omp parallel for reduction(+:counts[:max]) 34 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 35 | counts[array[i]]++; 36 | } 37 | double stop = omp_get_wtime(); 38 | 39 | for(unsigned long i = 0; i < max; i++){ 40 | if(counts[i] != counts_reference[i]){ 41 | fprintf(stderr, "Error: counts[%lu] = %d, counts_reference[%lu] = %d\n", i, counts[i], i, counts_reference[i]); 42 | return 1; 43 | } 44 | printf("%d elements with value %ld\n", counts[i], i); 45 | } 46 | printf("Total runtime: %f secs\n", stop - start); 47 | 48 | free(counts_reference); 49 | free(counts); 50 | free(array); 51 | return 0; 52 | } -------------------------------------------------------------------------------- /lec15/histogram_solution_trivial_ato.c: -------------------------------------------------------------------------------- 1 | // Implements counting sort 2 | // First argument from command line is the maximum value that each element in the array can have. 3 | // e.g., if the argument is 100, then each element of the array can contain a value between 0 and 100. 
4 | #include 5 | #include 6 | #include 7 | 8 | #define ARRAY_SIZE 100000000 9 | 10 | int main(int argc, char** argv){ 11 | int max = atoi(argv[1]); 12 | int* array = (int*)malloc(ARRAY_SIZE * sizeof(int)); 13 | int* counts = (int*)malloc(max * sizeof(int)); 14 | int* counts_reference = (int*)malloc(max * sizeof(int)); // Compute sequentially and use for error checks 15 | // Generate random array 16 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 17 | array[i] = rand() % max; 18 | } 19 | 20 | // Compute reference counts for error check 21 | for(unsigned long i = 0; i < max; i++){ 22 | counts_reference[i] = 0; 23 | } 24 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 25 | counts_reference[array[i]]++; 26 | } 27 | 28 | double start = omp_get_wtime(); 29 | for(unsigned long i = 0; i < max; i++){ 30 | counts[i] = 0; 31 | } 32 | 33 | #pragma omp parallel for 34 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 35 | #pragma omp atomic 36 | counts[array[i]]++; 37 | } 38 | double stop = omp_get_wtime(); 39 | 40 | for(unsigned long i = 0; i < max; i++){ 41 | if(counts[i] != counts_reference[i]){ 42 | fprintf(stderr, "Error: counts[%lu] = %d, counts_reference[%lu] = %d\n", i, counts[i], i, counts_reference[i]); 43 | return 1; 44 | } 45 | printf("%d elements with value %ld\n", counts[i], i); 46 | } 47 | printf("Total runtime: %f secs\n", stop - start); 48 | 49 | free(counts_reference); 50 | free(counts); 51 | free(array); 52 | return 0; 53 | } -------------------------------------------------------------------------------- /lec15/histogram_solution_trivial_crit.c: -------------------------------------------------------------------------------- 1 | // Implements counting sort 2 | // First argument from command line is the maximum value that each element in the array can have. 3 | // e.g., if the argument is 100, then each element of the array can contain a value between 0 and 100. 
4 | #include 5 | #include 6 | #include 7 | 8 | #define ARRAY_SIZE 100000000 9 | 10 | int main(int argc, char** argv){ 11 | int max = atoi(argv[1]); 12 | int* array = (int*)malloc(ARRAY_SIZE * sizeof(int)); 13 | int* counts = (int*)malloc(max * sizeof(int)); 14 | int* counts_reference = (int*)malloc(max * sizeof(int)); // Compute sequentially and use for error checks 15 | // Generate random array 16 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 17 | array[i] = rand() % max; 18 | } 19 | 20 | // Compute reference counts for error check 21 | for(unsigned long i = 0; i < max; i++){ 22 | counts_reference[i] = 0; 23 | } 24 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 25 | counts_reference[array[i]]++; 26 | } 27 | 28 | double start = omp_get_wtime(); 29 | for(unsigned long i = 0; i < max; i++){ 30 | counts[i] = 0; 31 | } 32 | 33 | #pragma omp parallel for 34 | for(unsigned long i = 0; i < ARRAY_SIZE; i++){ 35 | #pragma omp critical 36 | counts[array[i]]++; 37 | } 38 | double stop = omp_get_wtime(); 39 | 40 | for(unsigned long i = 0; i < max; i++){ 41 | if(counts[i] != counts_reference[i]){ 42 | fprintf(stderr, "Error: counts[%lu] = %d, counts_reference[%lu] = %d\n", i, counts[i], i, counts_reference[i]); 43 | return 1; 44 | } 45 | printf("%d elements with value %ld\n", counts[i], i); 46 | } 47 | printf("Total runtime: %f secs\n", stop - start); 48 | 49 | free(counts_reference); 50 | free(counts); 51 | free(array); 52 | return 0; 53 | } -------------------------------------------------------------------------------- /lec15/matmul.c: -------------------------------------------------------------------------------- 1 | /* 2 | ** PROGRAM: Matrix Multiply 3 | ** 4 | ** PURPOSE: This is a simple matrix multiply program. 5 | ** It will compute the product 6 | ** 7 | ** C = A * B 8 | ** 9 | ** A and B are set to constant matrices so we 10 | ** can make a quick test of the multiplication. 11 | ** 12 | ** USAGE: Right now, I hardwire the martix dimensions. 
13 | ** later, I'll take them from the command line. 14 | ** 15 | ** HISTORY: Written by Tim Mattson, Nov 1999. 16 | */ 17 | #include 18 | #include 19 | #include 20 | 21 | #define ORDER 1000 22 | #define AVAL 3.0 23 | #define BVAL 5.0 24 | #define TOL 0.001 25 | 26 | int main(int argc, char **argv) 27 | { 28 | int Ndim, Pdim, Mdim; /* A[N][P], B[P][M], C[N][M] */ 29 | int i,j,k; 30 | double *A, *B, *C, cval, err, errsq; 31 | double dN, mflops; 32 | double start_time, run_time; 33 | 34 | 35 | Ndim = ORDER; 36 | Pdim = ORDER; 37 | Mdim = ORDER; 38 | 39 | A = (double *)malloc(Ndim*Pdim*sizeof(double)); 40 | B = (double *)malloc(Pdim*Mdim*sizeof(double)); 41 | C = (double *)malloc(Ndim*Mdim*sizeof(double)); 42 | 43 | /* Initialize matrices */ 44 | 45 | for (i=0; i TOL) 89 | printf("\n Errors in multiplication: %f",errsq); 90 | else 91 | printf("\n Hey, it worked"); 92 | 93 | printf("\n all done \n"); 94 | } 95 | -------------------------------------------------------------------------------- /lec15/matmul_solution.c: -------------------------------------------------------------------------------- 1 | /* 2 | ** PROGRAM: Parallel Matrix Multiply (using OpenMP) 3 | ** 4 | ** PURPOSE: This is a simple matrix multiply program. 5 | ** It will compute the product 6 | ** 7 | ** C = A * B 8 | ** 9 | ** A and B are set to constant matrices so we 10 | ** can make a quick test of the multiplication. 11 | ** 12 | ** USAGE: Right now, I hardwire the martix dimensions. 13 | ** later, I'll take them from the command line. 14 | ** 15 | ** HISTORY: Written by Tim Mattson, Nov 1999. 
16 | */ 17 | #include 18 | #include 19 | #include 20 | 21 | #define ORDER 1000 22 | #define AVAL 3.0 23 | #define BVAL 5.0 24 | #define TOL 0.001 25 | 26 | int main(int argc, char *argv[]) 27 | { 28 | int Ndim, Pdim, Mdim; /* A[N][P], B[P][M], C[N][M] */ 29 | int i,j,k; 30 | 31 | double *A, *B, *C, cval, err, errsq; 32 | double dN, mflops; 33 | double start_time, run_time; 34 | 35 | 36 | Ndim = ORDER; 37 | Pdim = ORDER; 38 | Mdim = ORDER; 39 | 40 | A = (double *)malloc(Ndim*Pdim*sizeof(double)); 41 | B = (double *)malloc(Pdim*Mdim*sizeof(double)); 42 | C = (double *)malloc(Ndim*Mdim*sizeof(double)); 43 | 44 | /* Initialize matrices */ 45 | 46 | for (i=0; i TOL) 92 | printf("\n Errors in multiplication: %f",errsq); 93 | else 94 | printf("\n Hey, it worked"); 95 | 96 | printf("\n all done \n"); 97 | } 98 | -------------------------------------------------------------------------------- /lec15/pi.c: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | NAME: 4 | Pi_mc: PI Monte Carlo 5 | 6 | Purpose: 7 | This program uses a Monte Carlo algorithm to compute PI as an 8 | example of how random number generators are used to solve problems. 9 | Note that if your goal is to find digits of pi, there are much 10 | better algorithms you could use. 11 | 12 | Usage: 13 | To keep the program as simple as possible, you must edit the file 14 | and change the value of num_trials to change the number of samples 15 | used. Then compile and run the program. 16 | 17 | Algorithm: 18 | The basic idea behind the algorithm is easy to visualize. Draw a 19 | square on a wall. Inside the square, draw a circle. Now randomly throw 20 | darts at the wall. some darts will land inside the square. Of those, 21 | some will fall inside the circle. The probability of landing inside 22 | the circle or the square is proportional to their areas. 
23 | 24 | We can use a random number generator to "throw the darts" and count 25 | how many "darts" fall inside the square and how many inside the 26 | circle. Dividing these two numbers gives us the ratio of their areas 27 | and from that we can compute pi. 28 | 29 | Algorithm details: 30 | To turn this into code, I need a bit more detail. Assume the circle 31 | is centered inside the square. The circle will have a radius of r and 32 | each side of the square will be of length 2*r (i.e. the diameter of the 33 | circle). 34 | 35 | A(circle) = pi * r^2 36 | A(square) = (2*r)*(2*r) = 4*r^2 37 | 38 | ratio = A(circle)/A(square) = pi/4 39 | 40 | Since the probability (P) of a dart falling inside a figure (i.e. the square 41 | or the circle) is proportional to the area, we have 42 | 43 | ratio = P(circle)/P(square) = pi/4 44 | 45 | If I throw N darts as computed by random numbers evenly distributed 46 | over the area of the square 47 | 48 | P(square) = N/N .... i.e. every dart lands in the square 49 | P(circle) = N(circle)/N 50 | 51 | ratio = (N(circle)/N)/(N/N) = N(circle)/N 52 | 53 | Hence, to find the area, I compute N random "darts" and count how many fall 54 | inside the circle. The equation for a circle is 55 | 56 | x^2 + y^2 = r^2 57 | 58 | So I randomly compute "x" and "y" evenly distributed from -r to r and 59 | count the "dart" as falling inside the circle if 60 | 61 | x^2 + y^2 <= r^2 62 | 63 | Results: 64 | Remember, our goal is to demonstrate a simple monte carlo algorithm, 65 | not compute pi. But just for the record, here are some results (Intel compiler 66 | version 10.0, Windows XP, core duo laptop) 67 | 68 | 100 3.160000 69 | 1000 3.148000 70 | 10000 3.154000 71 | 100000 3.139920 72 | 1000000 3.141456 73 | 10000000 3.141590 74 | 100000000 3.141581 75 | 76 | As a point of reference, the first 7 digits of the true value of pi 77 | are 3.141592 78 | 79 | 80 | History: 81 | Written by Tim Mattson, 9/2007.
82 | 83 | */ 84 | #include 85 | #include 86 | #include 87 | #include 88 | 89 | // 90 | // The monte carlo pi program 91 | // 92 | 93 | static long num_trials = 100000000; 94 | 95 | double get_rand_minus_one_one(){ 96 | return 2 * (rand() / (double)RAND_MAX) - 1; 97 | } 98 | 99 | int main () 100 | { 101 | long i; long Ncirc = 0; 102 | double pi, x, y, test, total_time; 103 | double r = 1.0; // radius of circle. Side of squrare is 2*r 104 | srand(time(NULL)); 105 | 106 | total_time = omp_get_wtime(); 107 | for(i=0;i 3 | #include 4 | #include 5 | #include 6 | 7 | static long num_trials = 100000000; 8 | 9 | double get_rand_minus_one_one(unsigned int* seed){ 10 | return 2 * (rand_r(seed) / (double)RAND_MAX) - 1; 11 | } 12 | 13 | int main () 14 | { 15 | long i; long Ncirc = 0; 16 | double pi, x, y, test, total_time; 17 | double r = 1.0; // radius of circle. Side of squrare is 2*r 18 | srand(time(NULL)); 19 | 20 | total_time = omp_get_wtime(); 21 | #pragma omp parallel 22 | { 23 | unsigned int seed = omp_get_thread_num(); 24 | #pragma omp single 25 | printf(" %d threads ",omp_get_num_threads()); 26 | 27 | #pragma omp for private(x,y,test) 28 | for(i=0;i 3 | #include 4 | #include 5 | #include 6 | 7 | static long num_trials = 100000000; 8 | 9 | double get_rand_minus_one_one(unsigned int* seed){ 10 | return 2 * (rand_r(seed) / (double)RAND_MAX) - 1; 11 | } 12 | 13 | int main () 14 | { 15 | long i; long Ncirc = 0; 16 | double pi, x, y, test, total_time; 17 | double r = 1.0; // radius of circle. 
Side of squrare is 2*r 18 | srand(time(NULL)); 19 | 20 | total_time = omp_get_wtime(); 21 | #pragma omp parallel 22 | { 23 | unsigned int seed = omp_get_thread_num(); 24 | #pragma omp single 25 | printf(" %d threads ",omp_get_num_threads()); 26 | 27 | #pragma omp for private(x,y,test) 28 | for(i=0;i 3 | #include 4 | #include 5 | #include 6 | 7 | static long num_trials = 100000000; 8 | 9 | double get_rand_minus_one_one(){ 10 | return 2 * (rand() / (double)RAND_MAX) - 1; 11 | } 12 | 13 | int main () 14 | { 15 | long i; long Ncirc = 0; 16 | double pi, x, y, test, total_time; 17 | double r = 1.0; // radius of circle. Side of squrare is 2*r 18 | srand(time(NULL)); 19 | 20 | total_time = omp_get_wtime(); 21 | #pragma omp parallel 22 | { 23 | 24 | #pragma omp single 25 | printf(" %d threads ",omp_get_num_threads()); 26 | 27 | #pragma omp for private(x,y,test) 28 | for(i=0;i 3 | #include 4 | #include 5 | #include 6 | 7 | static long num_trials = 100000000; 8 | 9 | double get_rand_minus_one_one(unsigned int* seed){ 10 | return 2 * (rand_r(seed) / (double)RAND_MAX) - 1; 11 | } 12 | 13 | int main () 14 | { 15 | long i; long Ncirc = 0; 16 | double pi, x, y, test, total_time; 17 | double r = 1.0; // radius of circle. 
Side of squrare is 2*r 18 | srand(time(NULL)); 19 | 20 | total_time = omp_get_wtime(); 21 | #pragma omp parallel 22 | { 23 | unsigned int seed = omp_get_thread_num(); 24 | #pragma omp single 25 | printf(" %d threads ",omp_get_num_threads()); 26 | 27 | #pragma omp for reduction(+:Ncirc) private(x,y,test) 28 | for(i=0;i 4 | #include 5 | 6 | #define ITER 100000000 7 | 8 | int main(int argc, char** argv){ 9 | double start, stop; 10 | GET_TIME(start); 11 | double dummy = 0; 12 | for(int i = 0; i < ITER; i++){ 13 | dummy += rand(); 14 | } 15 | GET_TIME(stop); 16 | printf("rand() time: %lf sec\n", stop - start); 17 | unsigned int s = 0; 18 | GET_TIME(start); 19 | for(int i = 0; i < ITER; i++){ 20 | dummy += rand_r(&s); 21 | } 22 | GET_TIME(stop); 23 | printf("rand_r() time: %lf sec\n", stop - start); 24 | return 0; 25 | } -------------------------------------------------------------------------------- /lec19/Makefile: -------------------------------------------------------------------------------- 1 | CC = nvcc 2 | LIBS = 3 | CFLAGS = -arch=compute_80 -DBLUR_SIZE=1 -DBLOCK_SIZE=16 4 | 5 | RM=rm -f 6 | 7 | EXES=vector_add test_cuda image_blur image_blur_shared image_blur_shared_all jacobi jacobi_solution 8 | 9 | all: $(EXES) 10 | 11 | %: %.cu 12 | $(CC) $(CFLAGS) -o $@ $@.cu $(LIBS) 13 | 14 | clean: 15 | $(RM) $(EXES) 16 | -------------------------------------------------------------------------------- /lec19/README.md: -------------------------------------------------------------------------------- 1 | - vector_add.cu: Example on vectors addition 2 | - image_blur.cu: Example on image blurring with data in global memory 3 | - image_blur_shared.cu: Example on image blurring with (part of the) data in shared memory 4 | - jacobi.cu: Example on Jacobi solver (taken from https://github.com/csc-training/CUDA/tree/master/exercises/jacobi). 5 | It computes it both on CPU and GPU, comparing the runtime and the result. 
6 | This file only contains the CPU implementation, you are supposed to implement the GPU part. 7 | - jacobi_solution.cu: Solution of jacobi.cu -------------------------------------------------------------------------------- /lec19/cuda_job.sub: -------------------------------------------------------------------------------- 1 | universe = vanilla 2 | 3 | log = cuda_job.log 4 | output = cuda_job.out 5 | error = cuda_job.err 6 | 7 | # Request GPU resources 8 | request_gpus = 1 9 | 10 | # Specify any environment setup if needed 11 | getenv = True 12 | 13 | queue 14 | 15 | -------------------------------------------------------------------------------- /lec19/error_checks.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples. 3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include 8 | #include 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | // Backs CUDA_CHECK: if errarg is non-zero, prints file/line plus the CUDA error string and exits with EXIT_FAILURE. 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i): %s\n", file, line, cudaGetErrorString(errarg)); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | // Backs CHECK_ERROR_MSG: queries cudaGetLastError(); on failure prints errstr, file/line and the error string, then exits. 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /lec19/image_blur.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "../lec13/my_timer.h" 5 | 6 | #define HEIGHT 8192 7 |
#define WIDTH 8192 8 | #define NUM_PIXELS (HEIGHT*WIDTH) 9 | #define NUM_CHANNELS 1 10 | 11 | __global__ void blurKernel(unsigned char* in, unsigned char* out, int w, int h){ 12 | int col = blockIdx.x*blockDim.x + threadIdx.x; 13 | int row = blockIdx.y*blockDim.y + threadIdx.y; 14 | 15 | if(col < w && row < h){ 16 | int pixVal = 0; 17 | int pixels = 0; 18 | for(int blurRow = -BLUR_SIZE; blurRow <= BLUR_SIZE; ++blurRow){ 19 | for(int blurCol = -BLUR_SIZE; blurCol <= BLUR_SIZE; ++blurCol){ 20 | int curRow = row + blurRow; 21 | int curCol = col + blurCol; 22 | if(curRow >= 0 && curRow < h && curCol >= 0 && curCol < w){ 23 | pixVal += in[curRow*w + curCol]; 24 | pixels++; 25 | } 26 | } 27 | 28 | } 29 | out[row*w + col] = (unsigned char) (pixVal / pixels); 30 | } 31 | } 32 | 33 | int main(int argc, char** argv){ 34 | // We do not actually load an image 35 | // In principle, we should load an image from a file into a host buffer 36 | // and then copy it to a device buffer. 37 | // Instead, we create an 'image' made of random bytes. 
38 | 39 | size_t numBytes = NUM_PIXELS*NUM_CHANNELS*sizeof(unsigned char); 40 | 41 | // Allocate host input/output vectors 42 | unsigned char *h_input, *h_output, *h_output_ref; 43 | h_input = (unsigned char*) malloc(numBytes); 44 | h_output = (unsigned char*) malloc(numBytes); 45 | h_output_ref = (unsigned char*) malloc(numBytes); 46 | 47 | 48 | printf("Initializing input.\n"); fflush(stdout); 49 | // Initialize the input with random stuff 50 | for(size_t i = 0; i < NUM_PIXELS*NUM_CHANNELS; i++){ 51 | h_input[i] = rand() % 256; 52 | } 53 | printf("Input initialized.\n"); fflush(stdout); 54 | 55 | // Allocate device input/output vectors and copy the input data to the device 56 | unsigned char *d_input, *d_output; 57 | cudaMalloc((void**) &d_input, numBytes); 58 | cudaMalloc((void**) &d_output, numBytes); 59 | cudaMemcpy(d_input, h_input, numBytes, cudaMemcpyHostToDevice); 60 | 61 | printf("Data copied to device.\n"); fflush(stdout); 62 | 63 | dim3 dimGrid(ceil(WIDTH/(float) BLOCK_SIZE), ceil(HEIGHT/ (float) BLOCK_SIZE)); 64 | dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); 65 | double start, stop; 66 | 67 | // In principle we should also include in the timing the time to copy data to/from the device, 68 | // we do not do it here mostly to show the difference in time between the version with/without shared memory 69 | GET_TIME(start); 70 | blurKernel<<>>(d_input, d_output, WIDTH, HEIGHT); 71 | cudaDeviceSynchronize(); 72 | GET_TIME(stop); 73 | printf("Runtime: %lf seconds\n", stop - start); fflush(stdout); 74 | 75 | // Copy the output data from the device to the host 76 | cudaMemcpy(h_output, d_output, numBytes, cudaMemcpyDeviceToHost); 77 | printf("Output retrieved.\n"); fflush(stdout); 78 | 79 | // Now we check that the result computed by the device is correct 80 | // Do the same blurring on the host 81 | for(int row = 0; row < HEIGHT; row++){ 82 | for(int col = 0; col < WIDTH; col++){ 83 | int pixVal = 0; 84 | int pixels = 0; 85 | for(int blurRow = -BLUR_SIZE; blurRow <= 
BLUR_SIZE; ++blurRow){ 86 | for(int blurCol = -BLUR_SIZE; blurCol <= BLUR_SIZE; ++blurCol){ 87 | int curRow = row + blurRow; 88 | int curCol = col + blurCol; 89 | if(curRow >= 0 && curRow < HEIGHT && curCol >= 0 && curCol < WIDTH){ 90 | pixVal += h_input[curRow*WIDTH + curCol]; 91 | pixels++; 92 | } 93 | } 94 | } 95 | h_output_ref[row*WIDTH + col] = (unsigned char) (pixVal / pixels); 96 | } 97 | } 98 | printf("Reference result computed\n"); fflush(stdout); 99 | // Now check that the content of h_output is equal to h_output_ref 100 | for(size_t i = 0; i < NUM_PIXELS*NUM_CHANNELS; i++){ 101 | if(h_output_ref[i] != h_output[i]){ 102 | fprintf(stderr, "Outputs differ at index %zu (%d vs. %d)\n", i, h_output_ref[i], h_output[i]); fflush(stderr); // i is size_t, so it needs %zu 103 | exit(-1); 104 | } 105 | 106 | } 107 | 108 | printf("Everything is fine\n"); fflush(stdout); 109 | free(h_input); free(h_output); free(h_output_ref); // release the host buffers as well as the device ones 110 | cudaFree(d_input); 111 | cudaFree(d_output); 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /lec19/image_blur_shared.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "../lec13/my_timer.h" 5 | 6 | #define HEIGHT 8192 7 | #define WIDTH 8192 8 | #define NUM_PIXELS (HEIGHT*WIDTH) 9 | #define NUM_CHANNELS 1 10 | 11 | __global__ void blurKernel(unsigned char* in, unsigned char* out, int w, int h){ 12 | __device__ __shared__ unsigned char in_shared[BLOCK_SIZE][BLOCK_SIZE]; // ATTENTION: I expressed it as a static 2D array, so I can use 2D indexing 13 | int col = blockIdx.x*blockDim.x + threadIdx.x; 14 | int row = blockIdx.y*blockDim.y + threadIdx.y; 15 | 16 | if(col < w && row < h){ 17 | // Each thread copies its element from global to shared memory 18 | in_shared[threadIdx.y][threadIdx.x] = in[row*w + col]; 19 | __syncthreads(); 20 | 21 | int pixVal = 0; 22 | int pixels = 0; 23 | for(int blurRow = -BLUR_SIZE; blurRow <= BLUR_SIZE; ++blurRow){ 24 | for(int blurCol = -BLUR_SIZE;
blurCol <= BLUR_SIZE; ++blurCol){ 25 | int curRow = row + blurRow; 26 | int curCol = col + blurCol; 27 | if(curRow >= 0 && curRow < h && curCol >= 0 && curCol < w){ 28 | // We need to check if the element in[curRow][curCol] 29 | // is one of those managed by this block (and then we 30 | // find the data in shared memory), or managed by a different 31 | // block (and then we need to get the data from global memory). 32 | // To do so, we can just divide the curRow by the BLOCK_SIZE. 33 | // If that's different from row/BLOCK_SIZE, then this thread 34 | // and the element it wants to read now are in two different 35 | // blocks, and it is read from global memory (the same for curCol). 36 | if(curRow / BLOCK_SIZE != row / BLOCK_SIZE || curCol / BLOCK_SIZE != col / BLOCK_SIZE){ 37 | pixVal += in[curRow*w + curCol]; 38 | }else{ 39 | int curRowShared = threadIdx.y + blurRow; 40 | int curColShared = threadIdx.x + blurCol; 41 | pixVal += in_shared[curRowShared][curColShared]; 42 | } 43 | pixels++; 44 | } 45 | } 46 | 47 | } 48 | out[row*w + col] = (unsigned char) (pixVal / pixels); 49 | } 50 | } 51 | 52 | int main(int argc, char** argv){ 53 | // We do not actually load an image 54 | // In principle, we should load an image from a file into a host buffer 55 | // and then copy it to a device buffer. 56 | // Instead, we create an 'image' made of random bytes. 
57 | 58 | size_t numBytes = NUM_PIXELS*NUM_CHANNELS*sizeof(unsigned char); 59 | 60 | // Allocate host input/output vectors 61 | unsigned char *h_input, *h_output, *h_output_ref; 62 | h_input = (unsigned char*) malloc(numBytes); 63 | h_output = (unsigned char*) malloc(numBytes); 64 | h_output_ref = (unsigned char*) malloc(numBytes); 65 | 66 | 67 | printf("Initializing input.\n"); fflush(stdout); 68 | // Initialize the input with random stuff 69 | for(size_t i = 0; i < NUM_PIXELS*NUM_CHANNELS; i++){ 70 | h_input[i] = rand() % 256; 71 | } 72 | printf("Input initialized.\n"); fflush(stdout); 73 | 74 | // Allocate device input/output vectors and copy the input data to the device 75 | unsigned char *d_input, *d_output; 76 | cudaMalloc((void**) &d_input, numBytes); 77 | cudaMalloc((void**) &d_output, numBytes); 78 | cudaMemcpy(d_input, h_input, numBytes, cudaMemcpyHostToDevice); 79 | 80 | printf("Data copied to device.\n"); fflush(stdout); 81 | 82 | dim3 dimGrid(ceil(WIDTH/(float) BLOCK_SIZE), ceil(HEIGHT/(float) BLOCK_SIZE)); 83 | dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); 84 | double start, stop; 85 | 86 | // In principle we should also include in the timing the time to copy data to/from the device, 87 | // we do not do it here mostly to show the difference in time between the version with/without shared memory 88 | GET_TIME(start); 89 | blurKernel<<>>(d_input, d_output, WIDTH, HEIGHT); 90 | cudaDeviceSynchronize(); 91 | GET_TIME(stop); 92 | printf("Runtime: %lf seconds\n", stop - start); fflush(stdout); 93 | 94 | // Copy the output data from the device to the host 95 | cudaMemcpy(h_output, d_output, numBytes, cudaMemcpyDeviceToHost); 96 | printf("Output retrieved.\n"); fflush(stdout); 97 | 98 | // Now we check that the result computed by the device is correct 99 | // Do the same blurring on the host 100 | for(int row = 0; row < HEIGHT; row++){ 101 | for(int col = 0; col < WIDTH; col++){ 102 | int pixVal = 0; 103 | int pixels = 0; 104 | for(int blurRow = -BLUR_SIZE; blurRow 
<= BLUR_SIZE; ++blurRow){ 105 | for(int blurCol = -BLUR_SIZE; blurCol <= BLUR_SIZE; ++blurCol){ 106 | int curRow = row + blurRow; 107 | int curCol = col + blurCol; 108 | if(curRow >= 0 && curRow < HEIGHT && curCol >= 0 && curCol < WIDTH){ 109 | pixVal += h_input[curRow*WIDTH + curCol]; 110 | pixels++; 111 | } 112 | } 113 | } 114 | h_output_ref[row*WIDTH + col] = (unsigned char) (pixVal / pixels); 115 | } 116 | } 117 | printf("Reference result computed\n"); fflush(stdout); 118 | // Now check that the content of h_output is equal to h_output_ref 119 | for(size_t i = 0; i < NUM_PIXELS*NUM_CHANNELS; i++){ 120 | if(h_output_ref[i] != h_output[i]){ 121 | fprintf(stderr, "Outputs differ at index %zu (%d vs. %d)\n", i, h_output_ref[i], h_output[i]); fflush(stderr); // i is size_t, so it needs %zu 122 | exit(-1); 123 | } 124 | 125 | } 126 | 127 | printf("Everything is fine\n"); fflush(stdout); 128 | free(h_input); free(h_output); free(h_output_ref); // release the host buffers as well as the device ones 129 | cudaFree(d_input); 130 | cudaFree(d_output); 131 | return 0; 132 | } 133 | -------------------------------------------------------------------------------- /lec19/jacobi.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "error_checks.h" 4 | 5 | // Change this to 0 if CPU reference result is not needed 6 | #define COMPUTE_CPU_REFERENCE 1 7 | #define MAX_ITERATIONS 3000 8 | #define GPU_VERSION 0 // TODO: Change this to 1 to enable the GPU version 9 | 10 | // CPU kernel 11 | void sweepCPU(double* phi, const double *phiPrev, const double *source, 12 | double h2, int N) 13 | { 14 | int i, j; 15 | int index, i1, i2, i3, i4; 16 | 17 | for (j = 1; j < N-1; j++) { 18 | for (i = 1; i < N-1; i++) { 19 | index = i + j*N; 20 | i1 = (i-1) + j * N; 21 | i2 = (i+1) + j * N; 22 | i3 = i + (j-1) * N; 23 | i4 = i + (j+1) * N; 24 | phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + 25 | phiPrev[i3] + phiPrev[i4] - 26 | h2 * source[index]); 27 | } 28 | } 29 | } 30 | 31 | // GPU kernel 32 | __global__ 33 | void sweepGPU(double *phi, const double *phiPrev,
const double *source, 34 | double h2, int N) 35 | { 36 | // TODO: Add here the GPU implementation 37 | } 38 | 39 | 40 | double compareArrays(const double *a, const double *b, int N) 41 | { 42 | double error = 0.0; 43 | int i; 44 | for (i = 0; i < N*N; i++) { 45 | error += fabs(a[i] - b[i]); 46 | } 47 | return error/(N*N); 48 | } 49 | 50 | 51 | double diffCPU(const double *phi, const double *phiPrev, int N) 52 | { 53 | int i; 54 | double sum = 0; 55 | double diffsum = 0; 56 | 57 | for (i = 0; i < N*N; i++) { 58 | diffsum += (phi[i] - phiPrev[i]) * (phi[i] - phiPrev[i]); 59 | sum += phi[i] * phi[i]; 60 | } 61 | 62 | return sqrt(diffsum/sum); 63 | } 64 | 65 | 66 | int main() 67 | { 68 | timeval t1, t2; // Structs for timing 69 | const int N = 512; 70 | double h = 1.0 / (N - 1); 71 | int iterations; 72 | const double tolerance = 5e-4; // Stopping condition 73 | int i, j, index; 74 | 75 | const int blocksize = 16; 76 | 77 | double *phi = new double[N*N]; 78 | double *phiPrev = new double[N*N]; 79 | double *source = new double[N*N]; 80 | double *phi_cuda = new double[N*N]; 81 | 82 | double *phi_d, *phiPrev_d, *source_d; 83 | // Size of the arrays in bytes 84 | const int size = N*N*sizeof(double); 85 | double diff; 86 | 87 | // Source initialization 88 | for (i = 0; i < N; i++) { 89 | for (j = 0; j < N; j++) { 90 | double x, y; 91 | x = (i - N / 2) * h; 92 | y = (j - N / 2) * h; 93 | index = j + i * N; 94 | if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1) 95 | source[index] = 1e10*h*h; 96 | else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1) 97 | source[index] = -1e10*h*h; 98 | else 99 | source[index] = 0.0; 100 | } 101 | } 102 | 103 | CUDA_CHECK( cudaMalloc( (void**)&source_d, size) ); 104 | CUDA_CHECK( cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice) ); 105 | 106 | // Reset values to zero 107 | for (i = 0; i < N; i++) { 108 | for (j = 0; j < N; j++) { 109 | index = j + i * N; 110 | phi[index] = 0.0; 111 | phiPrev[index] = 0.0; 112 | } 113 | } 114 | 115 | 
CUDA_CHECK( cudaMalloc( (void**)&phi_d, size) ); 116 | CUDA_CHECK( cudaMalloc( (void**)&phiPrev_d, size) ); 117 | CUDA_CHECK( cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice) ); 118 | CUDA_CHECK( cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice) ); 119 | 120 | // CPU version 121 | if(COMPUTE_CPU_REFERENCE) { 122 | gettimeofday(&t1, NULL); 123 | 124 | // Do sweeps untill difference is under the tolerance 125 | diff = tolerance * 2; 126 | iterations = 0; 127 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 128 | sweepCPU(phiPrev, phi, source, h * h, N); 129 | sweepCPU(phi, phiPrev, source, h * h, N); 130 | 131 | iterations += 2; 132 | if (iterations % 100 == 0) { 133 | diff = diffCPU(phi, phiPrev, N); 134 | printf("%d %g\n", iterations, diff); 135 | } 136 | } 137 | gettimeofday(&t2, NULL); 138 | printf("CPU Jacobi: %g seconds, %d iterations\n", 139 | t2.tv_sec - t1.tv_sec + 140 | (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 141 | } 142 | 143 | 144 | #if GPU_VERSION 145 | // GPU version 146 | 147 | dim3 dimBlock(blocksize, blocksize); 148 | dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize); 149 | 150 | //do sweeps until diff under tolerance 151 | diff = tolerance * 2; 152 | iterations = 0; 153 | 154 | gettimeofday(&t1, NULL); 155 | 156 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 157 | // See above how the CPU update kernel is called 158 | // and implement similar calling sequence for the GPU code 159 | 160 | //// Add routines here 161 | // TODO: Add GPU kernel calls here (see CPU version above) 162 | 163 | iterations += 2; 164 | 165 | if (iterations % 100 == 0) { 166 | // TODO: Add GPU kernel calls here (see CPU version above) 167 | CHECK_ERROR_MSG("Difference computation"); 168 | printf("%d %g\n", iterations, diff); 169 | } 170 | } 171 | 172 | //// Add here the routine to copy back the results 173 | //TODO: Copy back the results 174 | 175 | gettimeofday(&t2, NULL); 176 | printf("GPU Jacobi: %g 
seconds, %d iterations\n", 177 | t2.tv_sec - t1.tv_sec + 178 | (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 179 | 180 | //// Add here the clean up code for all allocated CUDA resources 181 | // TODO: Add here the clean up code 182 | #endif 183 | 184 | if (COMPUTE_CPU_REFERENCE) { 185 | printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N)); 186 | } 187 | 188 | delete[] phi; 189 | delete[] phi_cuda; 190 | delete[] phiPrev; 191 | delete[] source; 192 | 193 | return EXIT_SUCCESS; 194 | } -------------------------------------------------------------------------------- /lec19/jacobi_solution.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "error_checks.h" 4 | 5 | // Change this to 0 if CPU reference result is not needed 6 | #define COMPUTE_CPU_REFERENCE 1 7 | #define MAX_ITERATIONS 3000 8 | 9 | // CPU kernel 10 | void sweepCPU(double* phi, const double *phiPrev, const double *source, 11 | double h2, int N) 12 | { 13 | int i, j; 14 | int index, i1, i2, i3, i4; 15 | 16 | for (j = 1; j < N-1; j++) { 17 | for (i = 1; i < N-1; i++) { 18 | index = i + j*N; 19 | i1 = (i-1) + j * N; 20 | i2 = (i+1) + j * N; 21 | i3 = i + (j-1) * N; 22 | i4 = i + (j+1) * N; 23 | phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + 24 | phiPrev[i3] + phiPrev[i4] - 25 | h2 * source[index]); 26 | } 27 | } 28 | } 29 | 30 | // GPU kernel 31 | __global__ 32 | void sweepGPU(double *phi, const double *phiPrev, const double *source, 33 | double h2, int N) 34 | { 35 | int i = blockIdx.x * blockDim.x + threadIdx.x; 36 | int j = blockIdx.y * blockDim.y + threadIdx.y; 37 | int index = i + j*N; 38 | int i1, i2, i3, i4; 39 | 40 | i1 = (i-1) + j * N; 41 | i2 = (i+1) + j * N; 42 | i3 = i + (j-1) * N; 43 | i4 = i + (j+1) * N; 44 | 45 | if (i > 0 && j > 0 && i < N-1 && j < N-1) 46 | phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + 47 | phiPrev[i3] + phiPrev[i4] - 48 | h2 * source[index]); 49 | } 50 | 51 | 52 | double 
compareArrays(const double *a, const double *b, int N) 53 | { 54 | double error = 0.0; 55 | int i; 56 | for (i = 0; i < N*N; i++) { 57 | error += fabs(a[i] - b[i]); 58 | } 59 | return error/(N*N); 60 | } 61 | 62 | 63 | double diffCPU(const double *phi, const double *phiPrev, int N) 64 | { 65 | int i; 66 | double sum = 0; 67 | double diffsum = 0; 68 | 69 | for (i = 0; i < N*N; i++) { 70 | diffsum += (phi[i] - phiPrev[i]) * (phi[i] - phiPrev[i]); 71 | sum += phi[i] * phi[i]; 72 | } 73 | 74 | return sqrt(diffsum/sum); 75 | } 76 | // GPU counterpart of diffCPU: each thread handles one (i, j) point and atomically adds (phi-phiPrev)^2 to *diffsum and phi^2 to *sum. The caller zeroes both accumulators first. 77 | __global__ 78 | void diffGPU(const double *phi, const double *phiPrev, int N, double* sum, double* diffsum) 79 | { 80 | int i = blockIdx.x * blockDim.x + threadIdx.x; 81 | int j = blockIdx.y * blockDim.y + threadIdx.y; 82 | int index = i + j*N; if (i >= N || j >= N) return; // the grid may overhang the N x N domain; such threads must not read or accumulate 83 | atomicAdd(diffsum, (phi[index] - phiPrev[index]) * (phi[index] - phiPrev[index])); 84 | atomicAdd(sum, phi[index] * phi[index]); 85 | } 86 | 87 | 88 | int main() 89 | { 90 | timeval t1, t2; // Structs for timing 91 | const int N = 512; 92 | double h = 1.0 / (N - 1); 93 | int iterations; 94 | const double tolerance = 5e-4; // Stopping condition 95 | int i, j, index; 96 | 97 | const int blocksize = 16; 98 | 99 | double *phi = new double[N*N]; 100 | double *phiPrev = new double[N*N]; 101 | double *source = new double[N*N]; 102 | double *phi_cuda = new double[N*N]; 103 | 104 | double *phi_d, *phiPrev_d, *source_d; 105 | // Size of the arrays in bytes 106 | const int size = N*N*sizeof(double); 107 | double diff; 108 | 109 | // Source initialization 110 | for (i = 0; i < N; i++) { 111 | for (j = 0; j < N; j++) { 112 | double x, y; 113 | x = (i - N / 2) * h; 114 | y = (j - N / 2) * h; 115 | index = j + i * N; 116 | if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1) 117 | source[index] = 1e10*h*h; 118 | else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1) 119 | source[index] = -1e10*h*h; 120 | else 121 | source[index] = 0.0; 122 | } 123 | } 124 | 125 | CUDA_CHECK( cudaMalloc( (void**)&source_d,
size) ); 126 | CUDA_CHECK( cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice) ); 127 | double sum_h, diffsum_h, *sum_d, *diffsum_d; 128 | CUDA_CHECK( cudaMalloc( (void**)&sum_d, sizeof(double)) ); 129 | CUDA_CHECK( cudaMalloc( (void**)&diffsum_d, sizeof(double)) ); 130 | 131 | // Reset values to zero 132 | for (i = 0; i < N; i++) { 133 | for (j = 0; j < N; j++) { 134 | index = j + i * N; 135 | phi[index] = 0.0; 136 | phiPrev[index] = 0.0; 137 | } 138 | } 139 | 140 | CUDA_CHECK( cudaMalloc( (void**)&phi_d, size) ); 141 | CUDA_CHECK( cudaMalloc( (void**)&phiPrev_d, size) ); 142 | CUDA_CHECK( cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice) ); 143 | CUDA_CHECK( cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice) ); 144 | 145 | // CPU version 146 | if(COMPUTE_CPU_REFERENCE) { 147 | gettimeofday(&t1, NULL); 148 | 149 | // Do sweeps untill difference is under the tolerance 150 | diff = tolerance * 2; 151 | iterations = 0; 152 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 153 | sweepCPU(phiPrev, phi, source, h * h, N); 154 | sweepCPU(phi, phiPrev, source, h * h, N); 155 | 156 | iterations += 2; 157 | if (iterations % 100 == 0) { 158 | diff = diffCPU(phi, phiPrev, N); 159 | printf("%d %g\n", iterations, diff); 160 | } 161 | } 162 | gettimeofday(&t2, NULL); 163 | printf("CPU Jacobi: %g seconds, %d iterations\n", 164 | t2.tv_sec - t1.tv_sec + 165 | (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 166 | } 167 | 168 | // GPU version 169 | 170 | dim3 dimBlock(blocksize, blocksize); 171 | dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize); 172 | 173 | //do sweeps until diff under tolerance 174 | diff = tolerance * 2; 175 | iterations = 0; 176 | 177 | gettimeofday(&t1, NULL); 178 | 179 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 180 | // See above how the CPU update kernel is called 181 | // and implement similar calling sequence for the GPU code 182 | 183 | //// Add routines here 184 | 
sweepGPU<<>>(phiPrev_d, phi_d, source_d, h*h, N); 185 | sweepGPU<<>>(phi_d, phiPrev_d, source_d, h*h, N); 186 | CHECK_ERROR_MSG("Jacobi kernels"); 187 | iterations += 2; 188 | 189 | if (iterations % 100 == 0) { 190 | // Reinitialize sum_d and diffsum_d to 0 191 | sum_h = 0; diffsum_h = 0; 192 | CUDA_CHECK( cudaMemcpy(sum_d, &sum_h, sizeof(double), cudaMemcpyHostToDevice) ); 193 | CUDA_CHECK( cudaMemcpy(diffsum_d, &diffsum_h, sizeof(double), cudaMemcpyHostToDevice) ); 194 | 195 | diffGPU<<>>(phiPrev_d, phi_d, N, sum_d, diffsum_d); 196 | CUDA_CHECK( cudaMemcpy(&sum_h, sum_d, sizeof(double), cudaMemcpyDeviceToHost) ); 197 | CUDA_CHECK( cudaMemcpy(&diffsum_h, diffsum_d, sizeof(double), cudaMemcpyDeviceToHost) ); 198 | diff = sqrt(diffsum_h/sum_h); 199 | CHECK_ERROR_MSG("Difference computation"); 200 | printf("%d %g\n", iterations, diff); 201 | } 202 | } 203 | 204 | CUDA_CHECK( cudaMemcpy(phi_cuda, phi_d, size, cudaMemcpyDeviceToHost) ); 205 | 206 | gettimeofday(&t2, NULL); 207 | printf("GPU Jacobi: %g seconds, %d iterations\n", 208 | t2.tv_sec - t1.tv_sec + 209 | (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 210 | 211 | //// Add here the clean up code for all allocated CUDA resources 212 | CUDA_CHECK( cudaFree(phi_d) ); 213 | CUDA_CHECK( cudaFree(phiPrev_d) ); 214 | CUDA_CHECK( cudaFree(source_d) ); 215 | CUDA_CHECK( cudaFree(sum_d) ); 216 | CUDA_CHECK( cudaFree(diffsum_d) ); 217 | 218 | if (COMPUTE_CPU_REFERENCE) { 219 | printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N)); 220 | } 221 | 222 | delete[] phi; 223 | delete[] phi_cuda; 224 | delete[] phiPrev; 225 | delete[] source; 226 | 227 | return EXIT_SUCCESS; 228 | } -------------------------------------------------------------------------------- /lec19/test_cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main() { 5 | int count; 6 | cudaError_t err = cudaGetDeviceCount(&count); 7 | 8 | if (err != cudaSuccess) { 9 | 
printf("CUDA Error: %s\n", cudaGetErrorString(err)); 10 | return -1; 11 | } 12 | 13 | printf("Number of CUDA devices: %d\n", count); 14 | return 0; 15 | } 16 | 17 | -------------------------------------------------------------------------------- /lec19/vector_add.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | /** 29 | * Vector addition: C = A + B. 30 | * 31 | * This sample is a very basic sample that implements element by element 32 | * vector addition. It is the same as the sample illustrating Chapter 2 33 | * of the programming guide with some additions like error checking. 34 | */ 35 | 36 | #include 37 | #include 38 | 39 | /** 40 | * CUDA Kernel Device code 41 | * 42 | * Computes the vector addition of A and B into C. The 3 vectors have the same 43 | * number of elements numElements. 44 | */ 45 | __global__ void vectorAdd(const float *A, const float *B, float *C, 46 | int numElements) { 47 | int i = blockDim.x * blockIdx.x + threadIdx.x; 48 | 49 | if (i < numElements) { 50 | C[i] = A[i] + B[i] + 0.0f; 51 | } 52 | } 53 | 54 | /** 55 | * Host main routine 56 | */ 57 | int main(void) { 58 | // Error code to check return values for CUDA calls 59 | cudaError_t err = cudaSuccess; 60 | 61 | // Print the vector length to be used, and compute its size 62 | int numElements = 50000; 63 | size_t size = numElements * sizeof(float); 64 | printf("[Vector addition of %d elements]\n", numElements); 65 | 66 | // Allocate the host input vector A 67 | float *h_A = (float *)malloc(size); 68 | 69 | // Allocate the host input vector B 70 | float *h_B = (float *)malloc(size); 71 | 72 | // Allocate the host output vector C 73 | float *h_C = (float *)malloc(size); 74 | 75 | // Verify that allocations succeeded 76 | if (h_A == NULL || h_B == NULL || h_C == NULL) { 77 | fprintf(stderr, "Failed to allocate host vectors!\n"); 78 | exit(EXIT_FAILURE); 79 | } 80 | 81 | // Initialize the host input vectors 82 | for (int i = 0; i < numElements; ++i) { 83 | h_A[i] = rand() / (float)RAND_MAX; 84 | h_B[i] = rand() / (float)RAND_MAX; 85 | } 86 | 87 | // Allocate the device input vector A 88 | float *d_A = NULL; 89 | err = cudaMalloc((void **)&d_A, size); 90 | 91 | if (err != cudaSuccess) { 92 | fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", 93 | 
cudaGetErrorString(err)); 94 | exit(EXIT_FAILURE); 95 | } 96 | 97 | // Allocate the device input vector B 98 | float *d_B = NULL; 99 | err = cudaMalloc((void **)&d_B, size); 100 | 101 | if (err != cudaSuccess) { 102 | fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", 103 | cudaGetErrorString(err)); 104 | exit(EXIT_FAILURE); 105 | } 106 | 107 | // Allocate the device output vector C 108 | float *d_C = NULL; 109 | err = cudaMalloc((void **)&d_C, size); 110 | 111 | if (err != cudaSuccess) { 112 | fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", 113 | cudaGetErrorString(err)); 114 | exit(EXIT_FAILURE); 115 | } 116 | 117 | // Copy the host input vectors A and B in host memory to the device input 118 | // vectors in 119 | // device memory 120 | printf("Copy input data from the host memory to the CUDA device\n"); 121 | err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); 122 | 123 | if (err != cudaSuccess) { 124 | fprintf(stderr, 125 | "Failed to copy vector A from host to device (error code %s)!\n", 126 | cudaGetErrorString(err)); 127 | exit(EXIT_FAILURE); 128 | } 129 | 130 | err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); 131 | 132 | if (err != cudaSuccess) { 133 | fprintf(stderr, 134 | "Failed to copy vector B from host to device (error code %s)!\n", 135 | cudaGetErrorString(err)); 136 | exit(EXIT_FAILURE); 137 | } 138 | 139 | // Launch the Vector Add CUDA Kernel 140 | int threadsPerBlock = 256; 141 | int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; 142 | printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, 143 | threadsPerBlock); 144 | vectorAdd<<>>(d_A, d_B, d_C, numElements); 145 | err = cudaGetLastError(); 146 | 147 | if (err != cudaSuccess) { 148 | fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", 149 | cudaGetErrorString(err)); 150 | exit(EXIT_FAILURE); 151 | } 152 | 153 | // Copy the device result vector in device memory to the host 
result vector 154 | // in host memory. 155 | printf("Copy output data from the CUDA device to the host memory\n"); 156 | err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); 157 | 158 | if (err != cudaSuccess) { 159 | fprintf(stderr, 160 | "Failed to copy vector C from device to host (error code %s)!\n", 161 | cudaGetErrorString(err)); 162 | exit(EXIT_FAILURE); 163 | } 164 | 165 | // Verify that the result vector is correct 166 | for (int i = 0; i < numElements; ++i) { 167 | if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { 168 | fprintf(stderr, "Result verification failed at element %d!\n", i); 169 | exit(EXIT_FAILURE); 170 | } 171 | } 172 | 173 | printf("Test PASSED\n"); 174 | 175 | // Free device global memory 176 | err = cudaFree(d_A); 177 | 178 | if (err != cudaSuccess) { 179 | fprintf(stderr, "Failed to free device vector A (error code %s)!\n", 180 | cudaGetErrorString(err)); 181 | exit(EXIT_FAILURE); 182 | } 183 | 184 | err = cudaFree(d_B); 185 | 186 | if (err != cudaSuccess) { 187 | fprintf(stderr, "Failed to free device vector B (error code %s)!\n", 188 | cudaGetErrorString(err)); 189 | exit(EXIT_FAILURE); 190 | } 191 | 192 | err = cudaFree(d_C); 193 | 194 | if (err != cudaSuccess) { 195 | fprintf(stderr, "Failed to free device vector C (error code %s)!\n", 196 | cudaGetErrorString(err)); 197 | exit(EXIT_FAILURE); 198 | } 199 | 200 | // Free host memory 201 | free(h_A); 202 | free(h_B); 203 | free(h_C); 204 | 205 | printf("Done\n"); 206 | return 0; 207 | } 208 | -------------------------------------------------------------------------------- /projects/README.md: -------------------------------------------------------------------------------- 1 | The projects proposed in this folder have been taken, respectively, from: 2 | 3 | - https://trasgo.infor.uva.es/sdm_downloads/k-means/ 4 | - https://trasgo.infor.uva.es/sdm_downloads/dna-sequence-alignment/ 5 | - https://trasgo.infor.uva.es/sdm_downloads/wind-tunnel-peachy-assignment/ 6 | 7 | 8 | You can find 
instructions/rules for the exam in the PMC20 slides deck. 9 | If you have any doubts, send me an email. -------------------------------------------------------------------------------- /projects/kmeans/KMEANS.c: -------------------------------------------------------------------------------- 1 | /* 2 | * k-Means clustering algorithm 3 | * 4 | * Reference sequential version (Do not modify this code) 5 | * 6 | * Parallel computing (Degree in Computer Engineering) 7 | * 2022/2023 8 | * 9 | * Version: 1.0 10 | * 11 | * (c) 2022 Diego García-Álvarez, Arturo Gonzalez-Escribano 12 | * Grupo Trasgo, Universidad de Valladolid (Spain) 13 | * 14 | * This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. 15 | * https://creativecommons.org/licenses/by-sa/4.0/ 16 | */ 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #define MAXLINE 2000 26 | #define MAXCAD 200 27 | 28 | //Macros 29 | #define MIN(a,b) ((a) < (b) ? (a) : (b)) 30 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 31 | 32 | /* 33 | Function showFileError: It displays the corresponding error during file reading. 34 | */ 35 | void showFileError(int error, char* filename) 36 | { 37 | printf("Error\n"); 38 | switch (error) 39 | { 40 | case -1: 41 | fprintf(stderr,"\tFile %s has too many columns.\n", filename); 42 | fprintf(stderr,"\tThe maximum number of columns has been exceeded. MAXLINE: %d.\n", MAXLINE); 43 | break; 44 | case -2: 45 | fprintf(stderr,"Error reading file: %s.\n", filename); 46 | break; 47 | case -3: 48 | fprintf(stderr,"Error writing file: %s.\n", filename); 49 | break; 50 | } 51 | fflush(stderr); 52 | } 53 | 54 | /* 55 | Function readInput: It reads the file to determine the number of rows and columns. 
56 | */ 57 | int readInput(char* filename, int *lines, int *samples) 58 | { 59 | FILE *fp; 60 | char line[MAXLINE] = ""; 61 | char *ptr; 62 | const char *delim = "\t"; 63 | int contlines, contsamples = 0; 64 | 65 | contlines = 0; 66 | 67 | if ((fp=fopen(filename,"r"))!=NULL) 68 | { 69 | while(fgets(line, MAXLINE, fp)!= NULL) 70 | { 71 | if (strchr(line, '\n') == NULL) 72 | { 73 | return -1; 74 | } 75 | contlines++; 76 | ptr = strtok(line, delim); 77 | contsamples = 0; 78 | while(ptr != NULL) 79 | { 80 | contsamples++; 81 | ptr = strtok(NULL, delim); 82 | } 83 | } 84 | fclose(fp); 85 | *lines = contlines; 86 | *samples = contsamples; 87 | return 0; 88 | } 89 | else 90 | { 91 | return -2; 92 | } 93 | } 94 | 95 | /* 96 | Function readInput2: It loads data from file. 97 | */ 98 | int readInput2(char* filename, float* data) 99 | { 100 | FILE *fp; 101 | char line[MAXLINE] = ""; 102 | char *ptr; 103 | const char *delim = "\t"; 104 | int i = 0; 105 | 106 | if ((fp=fopen(filename,"rt"))!=NULL) 107 | { 108 | while(fgets(line, MAXLINE, fp)!= NULL) 109 | { 110 | ptr = strtok(line, delim); 111 | while(ptr != NULL) 112 | { 113 | data[i] = atof(ptr); 114 | i++; 115 | ptr = strtok(NULL, delim); 116 | } 117 | } 118 | fclose(fp); 119 | return 0; 120 | } 121 | else 122 | { 123 | return -2; //No file found 124 | } 125 | } 126 | 127 | /* 128 | Function writeResult: It writes in the output file the cluster of each sample (point). 129 | */ 130 | int writeResult(int *classMap, int lines, const char* filename) 131 | { 132 | FILE *fp; 133 | 134 | if ((fp=fopen(filename,"wt"))!=NULL) 135 | { 136 | for(int i=0; imaxDist) { 379 | maxDist=distCentroids[i]; 380 | } 381 | } 382 | memcpy(centroids, auxCentroids, (K*samples*sizeof(float))); 383 | 384 | sprintf(line,"\n[%d] Cluster changes: %d\tMax. 
centroid distance: %f", it, changes, maxDist); 385 | outputMsg = strcat(outputMsg,line); 386 | 387 | } while((changes>minChanges) && (itmaxThreshold)); 388 | 389 | /* 390 | * 391 | * STOP HERE: DO NOT CHANGE THE CODE BELOW THIS POINT 392 | * 393 | */ 394 | // Output and termination conditions 395 | printf("%s",outputMsg); 396 | 397 | //END CLOCK***************************************** 398 | end = clock(); 399 | printf("\nComputation: %f seconds", (double)(end - start) / CLOCKS_PER_SEC); 400 | fflush(stdout); 401 | //************************************************** 402 | //START CLOCK*************************************** 403 | start = clock(); 404 | //************************************************** 405 | 406 | 407 | 408 | if (changes <= minChanges) { 409 | printf("\n\nTermination condition:\nMinimum number of changes reached: %d [%d]", changes, minChanges); 410 | } 411 | else if (it >= maxIterations) { 412 | printf("\n\nTermination condition:\nMaximum number of iterations reached: %d [%d]", it, maxIterations); 413 | } 414 | else { 415 | printf("\n\nTermination condition:\nCentroid update precision reached: %g [%g]", maxDist, maxThreshold); 416 | } 417 | 418 | // Writing the classification of each point to the output file. 
419 | error = writeResult(classMap, lines, argv[6]); 420 | if(error != 0) 421 | { 422 | showFileError(error, argv[6]); 423 | exit(error); 424 | } 425 | 426 | //Free memory 427 | free(data); 428 | free(classMap); 429 | free(centroidPos); 430 | free(centroids); 431 | free(distCentroids); 432 | free(pointsPerClass); 433 | free(auxCentroids); 434 | 435 | //END CLOCK***************************************** 436 | end = clock(); 437 | printf("\n\nMemory deallocation: %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC); 438 | fflush(stdout); 439 | //***************************************************/ 440 | return 0; 441 | } 442 | -------------------------------------------------------------------------------- /projects/kmeans/KMEANS_cuda.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * k-Means clustering algorithm 3 | * 4 | * CUDA version 5 | * 6 | * Parallel computing (Degree in Computer Engineering) 7 | * 2022/2023 8 | * 9 | * Version: 1.0 10 | * 11 | * (c) 2022 Diego García-Álvarez, Arturo Gonzalez-Escribano 12 | * Grupo Trasgo, Universidad de Valladolid (Spain) 13 | * 14 | * This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. 15 | * https://creativecommons.org/licenses/by-sa/4.0/ 16 | */ 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | 27 | #define MAXLINE 2000 28 | #define MAXCAD 200 29 | 30 | //Macros 31 | #define MIN(a,b) ((a) < (b) ? (a) : (b)) 32 | #define MAX(a,b) ((a) > (b) ? 
(a) : (b)) 33 | 34 | /* 35 | * Macros to show errors when calling a CUDA library function, 36 | * or after launching a kernel 37 | */ 38 | #define CHECK_CUDA_CALL( a ) { \ 39 | cudaError_t ok = a; \ 40 | if ( ok != cudaSuccess ) \ 41 | fprintf(stderr, "-- Error CUDA call in line %d: %s\n", __LINE__, cudaGetErrorString( ok ) ); \ 42 | } 43 | #define CHECK_CUDA_LAST() { \ 44 | cudaError_t ok = cudaGetLastError(); \ 45 | if ( ok != cudaSuccess ) \ 46 | fprintf(stderr, "-- Error CUDA last in line %d: %s\n", __LINE__, cudaGetErrorString( ok ) ); \ 47 | } 48 | 49 | /* 50 | Function showFileError: It displays the corresponding error during file reading. 51 | */ 52 | void showFileError(int error, char* filename) 53 | { 54 | printf("Error\n"); 55 | switch (error) 56 | { 57 | case -1: 58 | fprintf(stderr,"\tFile %s has too many columns.\n", filename); 59 | fprintf(stderr,"\tThe maximum number of columns has been exceeded. MAXLINE: %d.\n", MAXLINE); 60 | break; 61 | case -2: 62 | fprintf(stderr,"Error reading file: %s.\n", filename); 63 | break; 64 | case -3: 65 | fprintf(stderr,"Error writing file: %s.\n", filename); 66 | break; 67 | } 68 | fflush(stderr); 69 | } 70 | 71 | /* 72 | Function readInput: It reads the file to determine the number of rows and columns. 
73 | */ 74 | int readInput(char* filename, int *lines, int *samples) 75 | { 76 | FILE *fp; 77 | char line[MAXLINE] = ""; 78 | char *ptr; 79 | const char *delim = "\t"; 80 | int contlines, contsamples = 0; 81 | 82 | contlines = 0; 83 | 84 | if ((fp=fopen(filename,"r"))!=NULL) 85 | { 86 | while(fgets(line, MAXLINE, fp)!= NULL) 87 | { 88 | if (strchr(line, '\n') == NULL) 89 | { 90 | return -1; 91 | } 92 | contlines++; 93 | ptr = strtok(line, delim); 94 | contsamples = 0; 95 | while(ptr != NULL) 96 | { 97 | contsamples++; 98 | ptr = strtok(NULL, delim); 99 | } 100 | } 101 | fclose(fp); 102 | *lines = contlines; 103 | *samples = contsamples; 104 | return 0; 105 | } 106 | else 107 | { 108 | return -2; 109 | } 110 | } 111 | 112 | /* 113 | Function readInput2: It loads data from file. 114 | */ 115 | int readInput2(char* filename, float* data) 116 | { 117 | FILE *fp; 118 | char line[MAXLINE] = ""; 119 | char *ptr; 120 | const char *delim = "\t"; 121 | int i = 0; 122 | 123 | if ((fp=fopen(filename,"rt"))!=NULL) 124 | { 125 | while(fgets(line, MAXLINE, fp)!= NULL) 126 | { 127 | ptr = strtok(line, delim); 128 | while(ptr != NULL) 129 | { 130 | data[i] = atof(ptr); 131 | i++; 132 | ptr = strtok(NULL, delim); 133 | } 134 | } 135 | fclose(fp); 136 | return 0; 137 | } 138 | else 139 | { 140 | return -2; //No file found 141 | } 142 | } 143 | 144 | /* 145 | Function writeResult: It writes in the output file the cluster of each sample (point). 146 | */ 147 | int writeResult(int *classMap, int lines, const char* filename) 148 | { 149 | FILE *fp; 150 | 151 | if ((fp=fopen(filename,"wt"))!=NULL) 152 | { 153 | for(int i=0; imaxDist) { 399 | maxDist=distCentroids[i]; 400 | } 401 | } 402 | memcpy(centroids, auxCentroids, (K*samples*sizeof(float))); 403 | 404 | sprintf(line,"\n[%d] Cluster changes: %d\tMax. 
centroid distance: %f", it, changes, maxDist); 405 | outputMsg = strcat(outputMsg,line); 406 | 407 | } while((changes>minChanges) && (itmaxThreshold)); 408 | 409 | /* 410 | * 411 | * STOP HERE: DO NOT CHANGE THE CODE BELOW THIS POINT 412 | * 413 | */ 414 | // Output and termination conditions 415 | printf("%s",outputMsg); 416 | 417 | CHECK_CUDA_CALL( cudaDeviceSynchronize() ); 418 | 419 | //END CLOCK***************************************** 420 | end = omp_get_wtime(); 421 | printf("\nComputation: %f seconds", end - start); 422 | fflush(stdout); 423 | //************************************************** 424 | //START CLOCK*************************************** 425 | start = omp_get_wtime(); 426 | //************************************************** 427 | 428 | 429 | 430 | if (changes <= minChanges) { 431 | printf("\n\nTermination condition:\nMinimum number of changes reached: %d [%d]", changes, minChanges); 432 | } 433 | else if (it >= maxIterations) { 434 | printf("\n\nTermination condition:\nMaximum number of iterations reached: %d [%d]", it, maxIterations); 435 | } 436 | else { 437 | printf("\n\nTermination condition:\nCentroid update precision reached: %g [%g]", maxDist, maxThreshold); 438 | } 439 | 440 | // Writing the classification of each point to the output file. 
441 | error = writeResult(classMap, lines, argv[6]); 442 | if(error != 0) 443 | { 444 | showFileError(error, argv[6]); 445 | exit(error); 446 | } 447 | 448 | //Free memory 449 | free(data); 450 | free(classMap); 451 | free(centroidPos); 452 | free(centroids); 453 | free(distCentroids); 454 | free(pointsPerClass); 455 | free(auxCentroids); 456 | 457 | //END CLOCK***************************************** 458 | end = omp_get_wtime(); 459 | printf("\n\nMemory deallocation: %f seconds\n", end - start); 460 | fflush(stdout); 461 | //***************************************************/ 462 | return 0; 463 | } 464 | -------------------------------------------------------------------------------- /projects/kmeans/KMEANS_mpi.c: -------------------------------------------------------------------------------- 1 | /* 2 | * k-Means clustering algorithm 3 | * 4 | * MPI version 5 | * 6 | * Parallel computing (Degree in Computer Engineering) 7 | * 2022/2023 8 | * 9 | * Version: 1.0 10 | * 11 | * (c) 2022 Diego García-Álvarez, Arturo Gonzalez-Escribano 12 | * Grupo Trasgo, Universidad de Valladolid (Spain) 13 | * 14 | * This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. 15 | * https://creativecommons.org/licenses/by-sa/4.0/ 16 | */ 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #define MAXLINE 2000 27 | #define MAXCAD 200 28 | 29 | //Macros 30 | #define MIN(a,b) ((a) < (b) ? (a) : (b)) 31 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 32 | 33 | /* 34 | Function showFileError: It displays the corresponding error during file reading. 35 | */ 36 | void showFileError(int error, char* filename) 37 | { 38 | printf("Error\n"); 39 | switch (error) 40 | { 41 | case -1: 42 | fprintf(stderr,"\tFile %s has too many columns.\n", filename); 43 | fprintf(stderr,"\tThe maximum number of columns has been exceeded. 
MAXLINE: %d.\n", MAXLINE); 44 | break; 45 | case -2: 46 | fprintf(stderr,"Error reading file: %s.\n", filename); 47 | break; 48 | case -3: 49 | fprintf(stderr,"Error writing file: %s.\n", filename); 50 | break; 51 | } 52 | fflush(stderr); 53 | } 54 | 55 | /* 56 | Function readInput: It reads the file to determine the number of rows and columns. 57 | */ 58 | int readInput(char* filename, int *lines, int *samples) 59 | { 60 | FILE *fp; 61 | char line[MAXLINE] = ""; 62 | char *ptr; 63 | const char *delim = "\t"; 64 | int contlines, contsamples = 0; 65 | 66 | contlines = 0; 67 | 68 | if ((fp=fopen(filename,"r"))!=NULL) 69 | { 70 | while(fgets(line, MAXLINE, fp)!= NULL) 71 | { 72 | if (strchr(line, '\n') == NULL) 73 | { 74 | return -1; 75 | } 76 | contlines++; 77 | ptr = strtok(line, delim); 78 | contsamples = 0; 79 | while(ptr != NULL) 80 | { 81 | contsamples++; 82 | ptr = strtok(NULL, delim); 83 | } 84 | } 85 | fclose(fp); 86 | *lines = contlines; 87 | *samples = contsamples; 88 | return 0; 89 | } 90 | else 91 | { 92 | return -2; 93 | } 94 | } 95 | 96 | /* 97 | Function readInput2: It loads data from file. 98 | */ 99 | int readInput2(char* filename, float* data) 100 | { 101 | FILE *fp; 102 | char line[MAXLINE] = ""; 103 | char *ptr; 104 | const char *delim = "\t"; 105 | int i = 0; 106 | 107 | if ((fp=fopen(filename,"rt"))!=NULL) 108 | { 109 | while(fgets(line, MAXLINE, fp)!= NULL) 110 | { 111 | ptr = strtok(line, delim); 112 | while(ptr != NULL) 113 | { 114 | data[i] = atof(ptr); 115 | i++; 116 | ptr = strtok(NULL, delim); 117 | } 118 | } 119 | fclose(fp); 120 | return 0; 121 | } 122 | else 123 | { 124 | return -2; //No file found 125 | } 126 | } 127 | 128 | /* 129 | Function writeResult: It writes in the output file the cluster of each sample (point). 
130 | */ 131 | int writeResult(int *classMap, int lines, const char* filename) 132 | { 133 | FILE *fp; 134 | 135 | if ((fp=fopen(filename,"wt"))!=NULL) 136 | { 137 | for(int i=0; imaxDist) { 383 | maxDist=distCentroids[i]; 384 | } 385 | } 386 | memcpy(centroids, auxCentroids, (K*samples*sizeof(float))); 387 | 388 | sprintf(line,"\n[%d] Cluster changes: %d\tMax. centroid distance: %f", it, changes, maxDist); 389 | outputMsg = strcat(outputMsg,line); 390 | 391 | } while((changes>minChanges) && (itmaxThreshold)); 392 | 393 | /* 394 | * 395 | * STOP HERE: DO NOT CHANGE THE CODE BELOW THIS POINT 396 | * 397 | */ 398 | // Output and termination conditions 399 | printf("%s",outputMsg); 400 | 401 | //END CLOCK***************************************** 402 | end = MPI_Wtime(); 403 | printf("\nComputation: %f seconds", end - start); 404 | fflush(stdout); 405 | //************************************************** 406 | //START CLOCK*************************************** 407 | start = MPI_Wtime(); 408 | //************************************************** 409 | 410 | 411 | 412 | if (changes <= minChanges) { 413 | printf("\n\nTermination condition:\nMinimum number of changes reached: %d [%d]", changes, minChanges); 414 | } 415 | else if (it >= maxIterations) { 416 | printf("\n\nTermination condition:\nMaximum number of iterations reached: %d [%d]", it, maxIterations); 417 | } 418 | else { 419 | printf("\n\nTermination condition:\nCentroid update precision reached: %g [%g]", maxDist, maxThreshold); 420 | } 421 | 422 | // Writing the classification of each point to the output file. 
423 | error = writeResult(classMap, lines, argv[6]); 424 | if(error != 0) 425 | { 426 | showFileError(error, argv[6]); 427 | MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE ); 428 | } 429 | 430 | //Free memory 431 | free(data); 432 | free(classMap); 433 | free(centroidPos); 434 | free(centroids); 435 | free(distCentroids); 436 | free(pointsPerClass); 437 | free(auxCentroids); 438 | 439 | //END CLOCK***************************************** 440 | end = MPI_Wtime(); 441 | printf("\n\nMemory deallocation: %f seconds\n", end - start); 442 | fflush(stdout); 443 | //***************************************************/ 444 | MPI_Finalize(); 445 | return 0; 446 | } 447 | -------------------------------------------------------------------------------- /projects/kmeans/KMEANS_omp.c: -------------------------------------------------------------------------------- 1 | /* 2 | * k-Means clustering algorithm 3 | * 4 | * OpenMP version 5 | * 6 | * Parallel computing (Degree in Computer Engineering) 7 | * 2022/2023 8 | * 9 | * Version: 1.0 10 | * 11 | * (c) 2022 Diego García-Álvarez, Arturo Gonzalez-Escribano 12 | * Grupo Trasgo, Universidad de Valladolid (Spain) 13 | * 14 | * This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. 15 | * https://creativecommons.org/licenses/by-sa/4.0/ 16 | */ 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #define MAXLINE 2000 27 | #define MAXCAD 200 28 | 29 | //Macros 30 | #define MIN(a,b) ((a) < (b) ? (a) : (b)) 31 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 32 | 33 | /* 34 | Function showFileError: It displays the corresponding error during file reading. 35 | */ 36 | void showFileError(int error, char* filename) 37 | { 38 | printf("Error\n"); 39 | switch (error) 40 | { 41 | case -1: 42 | fprintf(stderr,"\tFile %s has too many columns.\n", filename); 43 | fprintf(stderr,"\tThe maximum number of columns has been exceeded. 
MAXLINE: %d.\n", MAXLINE); 44 | break; 45 | case -2: 46 | fprintf(stderr,"Error reading file: %s.\n", filename); 47 | break; 48 | case -3: 49 | fprintf(stderr,"Error writing file: %s.\n", filename); 50 | break; 51 | } 52 | fflush(stderr); 53 | } 54 | 55 | /* 56 | Function readInput: It reads the file to determine the number of rows and columns. 57 | */ 58 | int readInput(char* filename, int *lines, int *samples) 59 | { 60 | FILE *fp; 61 | char line[MAXLINE] = ""; 62 | char *ptr; 63 | const char *delim = "\t"; 64 | int contlines, contsamples = 0; 65 | 66 | contlines = 0; 67 | 68 | if ((fp=fopen(filename,"r"))!=NULL) 69 | { 70 | while(fgets(line, MAXLINE, fp)!= NULL) 71 | { 72 | if (strchr(line, '\n') == NULL) 73 | { 74 | return -1; 75 | } 76 | contlines++; 77 | ptr = strtok(line, delim); 78 | contsamples = 0; 79 | while(ptr != NULL) 80 | { 81 | contsamples++; 82 | ptr = strtok(NULL, delim); 83 | } 84 | } 85 | fclose(fp); 86 | *lines = contlines; 87 | *samples = contsamples; 88 | return 0; 89 | } 90 | else 91 | { 92 | return -2; 93 | } 94 | } 95 | 96 | /* 97 | Function readInput2: It loads data from file. 98 | */ 99 | int readInput2(char* filename, float* data) 100 | { 101 | FILE *fp; 102 | char line[MAXLINE] = ""; 103 | char *ptr; 104 | const char *delim = "\t"; 105 | int i = 0; 106 | 107 | if ((fp=fopen(filename,"rt"))!=NULL) 108 | { 109 | while(fgets(line, MAXLINE, fp)!= NULL) 110 | { 111 | ptr = strtok(line, delim); 112 | while(ptr != NULL) 113 | { 114 | data[i] = atof(ptr); 115 | i++; 116 | ptr = strtok(NULL, delim); 117 | } 118 | } 119 | fclose(fp); 120 | return 0; 121 | } 122 | else 123 | { 124 | return -2; //No file found 125 | } 126 | } 127 | 128 | /* 129 | Function writeResult: It writes in the output file the cluster of each sample (point). 
130 | */ 131 | int writeResult(int *classMap, int lines, const char* filename) 132 | { 133 | FILE *fp; 134 | 135 | if ((fp=fopen(filename,"wt"))!=NULL) 136 | { 137 | for(int i=0; imaxDist) { 380 | maxDist=distCentroids[i]; 381 | } 382 | } 383 | memcpy(centroids, auxCentroids, (K*samples*sizeof(float))); 384 | 385 | sprintf(line,"\n[%d] Cluster changes: %d\tMax. centroid distance: %f", it, changes, maxDist); 386 | outputMsg = strcat(outputMsg,line); 387 | 388 | } while((changes>minChanges) && (itmaxThreshold)); 389 | 390 | /* 391 | * 392 | * STOP HERE: DO NOT CHANGE THE CODE BELOW THIS POINT 393 | * 394 | */ 395 | // Output and termination conditions 396 | printf("%s",outputMsg); 397 | 398 | //END CLOCK***************************************** 399 | end = omp_get_wtime(); 400 | printf("\nComputation: %f seconds", end - start); 401 | fflush(stdout); 402 | //************************************************** 403 | //START CLOCK*************************************** 404 | start = omp_get_wtime(); 405 | //************************************************** 406 | 407 | 408 | 409 | if (changes <= minChanges) { 410 | printf("\n\nTermination condition:\nMinimum number of changes reached: %d [%d]", changes, minChanges); 411 | } 412 | else if (it >= maxIterations) { 413 | printf("\n\nTermination condition:\nMaximum number of iterations reached: %d [%d]", it, maxIterations); 414 | } 415 | else { 416 | printf("\n\nTermination condition:\nCentroid update precision reached: %g [%g]", maxDist, maxThreshold); 417 | } 418 | 419 | // Writing the classification of each point to the output file. 
420 | error = writeResult(classMap, lines, argv[6]); 421 | if(error != 0) 422 | { 423 | showFileError(error, argv[6]); 424 | exit(error); 425 | } 426 | 427 | //Free memory 428 | free(data); 429 | free(classMap); 430 | free(centroidPos); 431 | free(centroids); 432 | free(distCentroids); 433 | free(pointsPerClass); 434 | free(auxCentroids); 435 | 436 | //END CLOCK***************************************** 437 | end = omp_get_wtime(); 438 | printf("\n\nMemory deallocation: %f seconds\n", end - start); 439 | fflush(stdout); 440 | //***************************************************/ 441 | return 0; 442 | } 443 | -------------------------------------------------------------------------------- /projects/kmeans/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | k-Means clustering algorithm 3 | 4 | Parallel computing (Degree in Computer Engineering) 5 | 2022/2023 6 | 7 | EduHPC 2023: Peachy assignment 8 | 9 | (c) 2022-2023 Diego García-Álvarez, Arturo Gonzalez-Escribano 10 | Group Trasgo, Universidad de Valladolid (Spain) 11 | 12 | This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. 13 | https://creativecommons.org/licenses/by-sa/4.0/ 14 | 15 | -------------------------------------------------------------------------------- /projects/kmeans/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # K-means 3 | # 4 | # Parallel computing (Degree in Computer Engineering) 5 | # 2022/2023 6 | # 7 | # (c) 2023 Diego Garcia-Alvarez and Arturo Gonzalez-Escribano 8 | # Grupo Trasgo, Universidad de Valladolid (Spain) 9 | # 10 | 11 | # Compilers 12 | CC=gcc 13 | OMPFLAG=-fopenmp 14 | MPICC=mpicc 15 | CUDACC=nvcc 16 | 17 | # Flags for optimization and libs 18 | FLAGS=-O3 -Wall 19 | LIBS=-lm 20 | 21 | # Targets to build 22 | OBJS=KMEANS_seq KMEANS_omp KMEANS_mpi KMEANS_cuda 23 | 24 | # Rules. 
By default show help 25 | help: 26 | @echo 27 | @echo "K-means clustering method" 28 | @echo 29 | @echo "Group Trasgo, Universidad de Valladolid (Spain)" 30 | @echo 31 | @echo "make KMEANS_seq Build only the sequential version" 32 | @echo "make KMEANS_omp Build only the OpenMP version" 33 | @echo "make KMEANS_mpi Build only the MPI version" 34 | @echo "make KMEANS_cuda Build only the CUDA version" 35 | @echo 36 | @echo "make all Build all versions (Sequential, OpenMP, MPI, CUDA)" 37 | @echo "make debug Build all versions with demo output for small surfaces" 38 | @echo "make clean Remove targets" 39 | @echo 40 | 41 | all: $(OBJS) 42 | 43 | KMEANS_seq: KMEANS.c 44 | $(CC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@ 45 | 46 | KMEANS_omp: KMEANS_omp.c 47 | $(CC) $(FLAGS) $(DEBUG) $(OMPFLAG) $< $(LIBS) -o $@ 48 | 49 | KMEANS_mpi: KMEANS_mpi.c 50 | $(MPICC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@ 51 | 52 | KMEANS_cuda: KMEANS_cuda.cu 53 | $(CUDACC) $(DEBUG) $< $(LIBS) -o $@ 54 | 55 | 56 | # Remove the target files 57 | clean: 58 | rm -rf $(OBJS) 59 | 60 | # Compile in debug mode 61 | debug: 62 | make DEBUG="-DDEBUG -g" FLAGS= all 63 | 64 | -------------------------------------------------------------------------------- /projects/kmeans/README: -------------------------------------------------------------------------------- 1 | 2 | k-Means clustering algorithm 3 | 4 | EduHPC 2023: Peachy assignment 5 | 6 | (c) 2022 Diego García-Álvarez, Arturo Gonzalez-Escribano 7 | Group Trasgo, Universidad de Valladolid (Spain) 8 | 9 | -------------------------------------------------------------- 10 | 11 | Read the handout and use the sequential code as reference to study. 12 | Use the other source files to parallelize with the proper programming model. 13 | 14 | Edit the first lines in the Makefile to set your preferred compilers and flags 15 | for both the sequential code and for each parallel programming model: 16 | OpenMP, MPI, and CUDA. 
17 | 18 | To see a description of the Makefile options execute: 19 | $ make help 20 | 21 | Use the input files in the test_files directory for your first tests. 22 | Students are encouraged to manually write or automatically generate 23 | their own input files for more complete tests. See a description of 24 | the input files format in the handout. 25 | 26 | -------------------------------------------------------------------------------- /projects/kmeans/handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanieleDeSensi/multicore-programming/d8d141d0d4e11d67f6474db3fde2ed0cf4c588ca/projects/kmeans/handout.pdf -------------------------------------------------------------------------------- /projects/kmeans/test_files/input2D2.inp: -------------------------------------------------------------------------------- 1 | -81 0 2 | 47 84 3 | 55 -3 4 | -33 29 5 | 5 -93 6 | -47 72 7 | 34 -15 8 | 43 0 9 | 98 -73 10 | -9 -18 11 | -44 67 12 | 86 -94 13 | -77 -59 14 | 82 -90 15 | 60 -21 16 | 61 29 17 | 80 -43 18 | -38 -16 19 | 54 30 20 | 63 -42 21 | -------------------------------------------------------------------------------- /projects/sequence/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Exact DNA sequence alignment for multiple patterns 3 | 4 | Parallel computing (Degree in Computer Engineering) 5 | 2023/2024 6 | 7 | EduHPC 2024: Peachy assignment 8 | 9 | (c) 2023-2024 Arturo Gonzalez-Escribano, Diego García-Álvarez, Jesús Cámara 10 | Group Trasgo, Grupo GAMUVa, Universidad de Valladolid (Spain) 11 | 12 | This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. 
13 | https://creativecommons.org/licenses/by-sa/4.0/ 14 | 15 | -------------------------------------------------------------------------------- /projects/sequence/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Exact genetic sequence alignment 3 | # 4 | # Parallel computing (Degree in Computer Engineering) 5 | # 2023/2024 6 | # 7 | # (c) 2024 Arturo Gonzalez-Escribano 8 | # Grupo Trasgo, Universidad de Valladolid (Spain) 9 | # 10 | 11 | # Compilers 12 | CC=gcc 13 | OMPFLAG=-fopenmp 14 | MPICC=mpicc 15 | CUDACC=nvcc 16 | 17 | # Flags for optimization and external libs 18 | LIBS=-lm 19 | FLAGS=-O3 -Wall 20 | CUDAFLAGS=-O3 -Xcompiler -Wall 21 | 22 | # Targets to build 23 | OBJS=align_seq align_omp align_mpi align_cuda 24 | 25 | # Rules. By default show help 26 | help: 27 | @echo 28 | @echo "Exact genetic sequence alignment" 29 | @echo 30 | @echo "Group Trasgo, Universidad de Valladolid (Spain)" 31 | @echo 32 | @echo "make align_seq Build only the sequential version" 33 | @echo "make align_omp Build only the OpenMP version" 34 | @echo "make align_mpi Build only the MPI version" 35 | @echo "make align_cuda Build only the CUDA version" 36 | @echo 37 | @echo "make all Build all versions (Sequential, OpenMP, MPI, CUDA)" 38 | @echo "make debug Build all version with demo output for small sequences" 39 | @echo "make clean Remove targets" 40 | @echo 41 | 42 | all: $(OBJS) 43 | 44 | align_seq: align.c rng.c 45 | $(CC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@ 46 | 47 | align_omp: align_omp.c rng.c 48 | $(CC) $(FLAGS) $(DEBUG) $(OMPFLAG) $< $(LIBS) -o $@ 49 | 50 | align_mpi: align_mpi.c rng.c 51 | $(MPICC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@ 52 | 53 | align_cuda: align_cuda.cu rng.c 54 | $(CUDACC) $(CUDAFLAGS) $(DEBUG) $< $(LIBS) -o $@ 55 | 56 | 57 | # Remove the target files 58 | clean: 59 | rm -rf $(OBJS) 60 | 61 | # Compile in debug mode 62 | debug: 63 | make DEBUG="-DDEBUG -g" all 64 | 
-------------------------------------------------------------------------------- /projects/sequence/README: -------------------------------------------------------------------------------- 1 | 2 | EduHPC 2024: Peachy assignment 3 | 4 | (c) 2023-2024 Arturo Gonzalez-Escribano, Diego García-Álvarez, Jesús Cámara 5 | Group Trasgo, Grupo GAMUVa, Universidad de Valladolid (Spain) 6 | 7 | -------------------------------------------------------------- 8 | 9 | Read the handout and use the sequential code as reference to study. 10 | Use the other source files to parallelize with the proper programming model. 11 | 12 | Edit the first lines in the Makefile to set your preferred compilers and flags 13 | for both the sequential code and for each parallel programming model: 14 | OpenMP, MPI, and CUDA. 15 | 16 | To see a description of the Makefile options execute: 17 | $ make help 18 | 19 | Use the following program arguments for your first tests. 20 | Students are encouraged to generate their own program arguments for more 21 | complete tests. See a description of the program arguments in the handout. 
22 | 23 | 24 | Example tests 25 | ============== 26 | 27 | 1) Basic test: 28 | -------------- 29 | 300 0.1 0.3 0.35 100 5 5 300 150 50 150 80 M 609823 30 | 31 | 32 | 2) Simple tests for race conditions: 33 | ------------------------------------ 34 | 1000 0.35 0.2 0.25 0 0 0 20000 10 0 500 0 M 4353435 35 | 36 | 10000 0.35 0.2 0.25 0 0 0 10000 9000 9000 50 100 M 4353435 37 | 38 | 39 | 3) Check that the program works for sequences longer than INT_MAX: 40 | ------------------------------------------------------------------- 41 | 4294967300 0.35 0.2 0.25 0 0 0 1 1 0 4294967298 0 M 683224 42 | 43 | 44 | -------------------------------------------------------------------------------- /projects/sequence/align.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Exact genetic sequence alignment 3 | * (Using brute force) 4 | * 5 | * Reference sequential version (Do not modify this code) 6 | * 7 | * Computacion Paralela, Grado en Informatica (Universidad de Valladolid) 8 | * 2023/2024 9 | * 10 | * v1.2 11 | * 12 | * (c) 2024, Arturo Gonzalez-Escribano 13 | */ 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | 21 | /* Arbitrary value to indicate that no matches are found */ 22 | #define NOT_FOUND -1 23 | 24 | /* Arbitrary value to restrict the checksums period */ 25 | #define CHECKSUM_MAX 65535 26 | 27 | 28 | /* 29 | * Utils: Function to get wall time 30 | */ 31 | double cp_Wtime(){ 32 | struct timeval tv; 33 | gettimeofday(&tv, NULL); 34 | return tv.tv_sec + 1.0e-6 * tv.tv_usec; 35 | } 36 | 37 | /* 38 | * Utils: Random generator 39 | */ 40 | #include "rng.c" 41 | 42 | 43 | /* 44 | * 45 | * START HERE: DO NOT CHANGE THE CODE ABOVE THIS POINT 46 | * 47 | */ 48 | 49 | /* 50 | * Function: Increment the number of pattern matches on the sequence positions 51 | * This function can be changed and/or optimized by the students 52 | */ 53 | void increment_matches( int pat, unsigned long *pat_found, unsigned long 
*pat_length, int *seq_matches ) { 54 | unsigned long ind; 55 | for( ind=0; ind seq_length - length ) location = seq_length - length; 84 | if ( location <= 0 ) location = 0; 85 | 86 | /* Copy sample */ 87 | unsigned long ind; 88 | for( ind=0; ind seq_length ) length = seq_length; 106 | if ( length <= 0 ) length = 1; 107 | 108 | /* Allocate pattern */ 109 | char *pattern = (char *)malloc( sizeof(char) * length ); 110 | if ( pattern == NULL ) { 111 | fprintf(stderr,"\n-- Error allocating a pattern of size: %lu\n", length ); 112 | exit( EXIT_FAILURE ); 113 | } 114 | 115 | /* Return results */ 116 | *new_length = length; 117 | return pattern; 118 | } 119 | 120 | 121 | /* 122 | * Function: Regenerate a sample of the sequence 123 | */ 124 | void generate_sample_sequence( rng_t *random, rng_t random_seq, float prob_G, float prob_C, float prob_A, unsigned long seq_length, unsigned long pat_samp_loc_mean, unsigned long pat_samp_loc_dev, char *pattern, unsigned long length ) { 125 | /* Choose location */ 126 | unsigned long location = (unsigned long)rng_next_normal( random, (double)pat_samp_loc_mean, (double)pat_samp_loc_dev ); 127 | if ( location > seq_length - length ) location = seq_length - length; 128 | if ( location <= 0 ) location = 0; 129 | 130 | /* Regenerate sample */ 131 | rng_t local_random = random_seq; 132 | rng_skip( &local_random, location ); 133 | generate_rng_sequence( &local_random, prob_G, prob_C, prob_A, pattern, length); 134 | } 135 | 136 | 137 | /* 138 | * Function: Print usage line in stderr 139 | */ 140 | void show_usage( char *program_name ) { 141 | fprintf(stderr,"Usage: %s ", program_name ); 142 | fprintf(stderr," \n"); 143 | fprintf(stderr,"\n"); 144 | } 145 | 146 | 147 | 148 | /* 149 | * MAIN PROGRAM 150 | */ 151 | int main(int argc, char *argv[]) { 152 | /* 0. Default output and error without buffering, forces to write immediately */ 153 | setbuf(stdout, NULL); 154 | setbuf(stderr, NULL); 155 | 156 | /* 1. Read scenary arguments */ 157 | /* 1.1. 
Check minimum number of arguments */ 158 | if (argc < 15) { 159 | fprintf(stderr, "\n-- Error: Not enough arguments when reading configuration from the command line\n\n"); 160 | show_usage( argv[0] ); 161 | exit( EXIT_FAILURE ); 162 | } 163 | 164 | /* 1.2. Read argument values */ 165 | unsigned long seq_length = atol( argv[1] ); 166 | float prob_G = atof( argv[2] ); 167 | float prob_C = atof( argv[3] ); 168 | float prob_A = atof( argv[4] ); 169 | if ( prob_G + prob_C + prob_A > 1 ) { 170 | fprintf(stderr, "\n-- Error: The sum of G,C,A,T nucleotid probabilities cannot be higher than 1\n\n"); 171 | show_usage( argv[0] ); 172 | exit( EXIT_FAILURE ); 173 | } 174 | prob_C += prob_G; 175 | prob_A += prob_C; 176 | 177 | int pat_rng_num = atoi( argv[5] ); 178 | unsigned long pat_rng_length_mean = atol( argv[6] ); 179 | unsigned long pat_rng_length_dev = atol( argv[7] ); 180 | 181 | int pat_samp_num = atoi( argv[8] ); 182 | unsigned long pat_samp_length_mean = atol( argv[9] ); 183 | unsigned long pat_samp_length_dev = atol( argv[10] ); 184 | unsigned long pat_samp_loc_mean = atol( argv[11] ); 185 | unsigned long pat_samp_loc_dev = atol( argv[12] ); 186 | 187 | char pat_samp_mix = argv[13][0]; 188 | if ( pat_samp_mix != 'B' && pat_samp_mix != 'A' && pat_samp_mix != 'M' ) { 189 | fprintf(stderr, "\n-- Error: Incorrect first character of pat_samp_mix: %c\n\n", pat_samp_mix); 190 | show_usage( argv[0] ); 191 | exit( EXIT_FAILURE ); 192 | } 193 | 194 | unsigned long seed = atol( argv[14] ); 195 | 196 | #ifdef DEBUG 197 | /* DEBUG: Print arguments */ 198 | printf("\nArguments: seq_length=%lu\n", seq_length ); 199 | printf("Arguments: Accumulated probabilitiy G=%f, C=%f, A=%f, T=1\n", prob_G, prob_C, prob_A ); 200 | printf("Arguments: Random patterns number=%d, length_mean=%lu, length_dev=%lu\n", pat_rng_num, pat_rng_length_mean, pat_rng_length_dev ); 201 | printf("Arguments: Sample patterns number=%d, length_mean=%lu, length_dev=%lu, loc_mean=%lu, loc_dev=%lu\n", pat_samp_num, 
pat_samp_length_mean, pat_samp_length_dev, pat_samp_loc_mean, pat_samp_loc_dev ); 202 | printf("Arguments: Type of mix: %c, Random seed: %lu\n", pat_samp_mix, seed ); 203 | printf("\n"); 204 | #endif // DEBUG 205 | 206 | /* 2. Initialize data structures */ 207 | /* 2.1. Allocate and fill sequence */ 208 | char *sequence = (char *)malloc( sizeof(char) * seq_length ); 209 | if ( sequence == NULL ) { 210 | fprintf(stderr,"\n-- Error allocating the sequence for size: %lu\n", seq_length ); 211 | exit( EXIT_FAILURE ); 212 | } 213 | rng_t random = rng_new( seed ); 214 | generate_rng_sequence( &random, prob_G, prob_C, prob_A, sequence, seq_length); 215 | 216 | /* 2.2. Allocate and fill patterns */ 217 | /* 2.2.1 Allocate main structures */ 218 | int pat_number = pat_rng_num + pat_samp_num; 219 | unsigned long *pat_length = (unsigned long *)malloc( sizeof(unsigned long) * pat_number ); 220 | char **pattern = (char **)malloc( sizeof(char*) * pat_number ); 221 | if ( pattern == NULL || pat_length == NULL ) { 222 | fprintf(stderr,"\n-- Error allocating the basic patterns structures for size: %d\n", pat_number ); 223 | exit( EXIT_FAILURE ); 224 | } 225 | 226 | /* 2.2.2 Allocate and initialize ancillary structure for pattern types */ 227 | int ind; 228 | unsigned long lind; 229 | #define PAT_TYPE_NONE 0 230 | #define PAT_TYPE_RNG 1 231 | #define PAT_TYPE_SAMP 2 232 | char *pat_type = (char *)malloc( sizeof(char) * pat_number ); 233 | if ( pat_type == NULL ) { 234 | fprintf(stderr,"\n-- Error allocating ancillary structure for pattern of size: %d\n", pat_number ); 235 | exit( EXIT_FAILURE ); 236 | } 237 | for( ind=0; ind 13 | #include 14 | 15 | /* 16 | * Constants 17 | */ 18 | #define RNG_MULTIPLIER 6364136223846793005ULL 19 | #define RNG_INCREMENT 1442695040888963407ULL 20 | 21 | /* 22 | * Type for random sequences state 23 | */ 24 | typedef uint64_t rng_t; 25 | 26 | /* 27 | * Constructor: Create a new state from a seed 28 | */ 29 | #ifdef __CUDACC__ 30 | __host__ __device__ 31 
| #endif 32 | rng_t rng_new(uint64_t seed) { 33 | uint64_t hash = seed; 34 | hash = (hash ^ (hash >> 30)) * 0xbf58476d1ce4e5b9ULL; 35 | hash = (hash ^ (hash >> 27)) * 0x94d049bb133111ebULL; 36 | hash = hash ^ (hash >> 31); 37 | return hash; // initial state 38 | } 39 | 40 | /* 41 | * Next: Advance state and return a double number uniformely distributed 42 | * Adapted from the implementation on PCG (https://www.pcg-random.org/) 43 | */ 44 | #ifdef __CUDACC__ 45 | __host__ __device__ 46 | #endif 47 | double rng_next(rng_t *seq) { 48 | *seq = ( *seq * RNG_MULTIPLIER + RNG_INCREMENT); 49 | return (double) ldexpf( *seq, -64 ); 50 | } 51 | 52 | /* 53 | * Next Normal: Advance state and return a double number distributed with a normal(mu,sigma) 54 | */ 55 | #ifdef __CUDACC__ 56 | __host__ __device__ 57 | #endif 58 | double rng_next_normal( rng_t *seq, double mu, double sigma) { 59 | double u1 = rng_next(seq); 60 | double u2 = rng_next(seq); 61 | 62 | double z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2); 63 | // double z1 = sqrt(-2.0 * log(u1)) * sin(2.0 * M_PI * u2); 64 | 65 | return mu + sigma * z0; 66 | } 67 | 68 | /* 69 | * Skip ahead: Advance state with an arbitrary jump in log time 70 | * Adapted from the implementation on PCG (https://www.pcg-random.org/) 71 | */ 72 | #ifdef __CUDACC__ 73 | __host__ __device__ 74 | #endif 75 | void rng_skip( rng_t *seq, uint64_t steps ) { 76 | uint64_t cur_mult = RNG_MULTIPLIER; 77 | uint64_t cur_plus = RNG_INCREMENT; 78 | 79 | uint64_t acc_mult = 1u; 80 | uint64_t acc_plus = 0u; 81 | while (steps > 0) { 82 | if (steps & 1) { 83 | acc_mult *= cur_mult; 84 | acc_plus = acc_plus * cur_mult + cur_plus; 85 | } 86 | cur_plus = (cur_mult + 1) * cur_plus; 87 | cur_mult *= cur_mult; 88 | steps /= 2; 89 | } 90 | *seq = acc_mult * (*seq) + acc_plus; 91 | } 92 | 93 | -------------------------------------------------------------------------------- /projects/wind/LICENSE: 
-------------------------------------------------------------------------------- 1 | 2 | Simulation of a Wind Tunnel 3 | 4 | Parallel computing (Degree in Computer Engineering) 5 | 2020/2021 6 | 7 | EduHPC 2021: Peachy assignment 8 | 9 | (c) 2021 Arturo Gonzalez-Escribano, Yuri Torres 10 | Group Trasgo, Universidad de Valladolid (Spain) 11 | 12 | This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. 13 | https://creativecommons.org/licenses/by-sa/4.0/ 14 | 15 | -------------------------------------------------------------------------------- /projects/wind/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Wind-tunnel 3 | # 4 | # Parallel computing (Degree in Computer Engineering) 5 | # 2020/2021 6 | # 7 | # (c) 2021 Arturo Gonzalez-Escribano 8 | # Grupo Trasgo, Universidad de Valladolid (Spain) 9 | # 10 | 11 | # Compilers 12 | CC=gcc 13 | OMPFLAG=-fopenmp 14 | MPICC=mpicc 15 | CUDACC=nvcc 16 | 17 | # Flags for optimization and libs 18 | FLAGS=-O3 -Wall 19 | LIBS=-lm 20 | 21 | # Targets to build 22 | OBJS=wind_seq wind_omp wind_mpi wind_cuda 23 | 24 | # Rules. 
By default show help 25 | help: 26 | @echo 27 | @echo "Wind tunnel" 28 | @echo 29 | @echo "Group Trasgo, Universidad de Valladolid (Spain)" 30 | @echo 31 | @echo "make wind_seq Build only the reference sequential version" 32 | @echo "make wind_omp Build only the OpenMP version" 33 | @echo "make wind_mpi Build only the MPI version" 34 | @echo "make wind_cuda Build only the CUDA version" 35 | @echo 36 | @echo "make all Build all versions (Sequential, OpenMP, MPI, CUDA)" 37 | @echo "make debug Build all versions with demo output for small surfaces" 38 | @echo "make clean Remove targets" 39 | @echo 40 | 41 | all: $(OBJS) 42 | 43 | wind_seq: wind.c 44 | $(CC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@ 45 | 46 | wind_omp: wind_omp.c 47 | $(CC) $(FLAGS) $(DEBUG) $(OMPFLAG) $< $(LIBS) -o $@ 48 | 49 | wind_mpi: wind_mpi.c 50 | $(MPICC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@ 51 | 52 | wind_cuda: wind_cuda.cu 53 | $(CUDACC) $(DEBUG) $< $(LIBS) -o $@ 54 | 55 | # Remove the target files 56 | clean: 57 | rm -rf $(OBJS) 58 | 59 | # Compile in debug mode 60 | debug: 61 | make DEBUG="-DDEBUG -g" FLAGS= all 62 | 63 | -------------------------------------------------------------------------------- /projects/wind/README: -------------------------------------------------------------------------------- 1 | 2 | Simulation of a Wind Tunnel 3 | 4 | EduHPC 2021: Peachy assignment 5 | 6 | (c) 2021 Arturo Gonzalez-Escribano, Yuri Torres 7 | Group Trasgo, Universidad de Valladolid (Spain) 8 | 9 | -------------------------------------------------------------- 10 | 11 | Read the handout and use the sequential code as reference to study. 12 | Use the other source files to parallelize with the proper programming model. 13 | 14 | Edit the first lines in the Makefile to set your preferred compilers and flags 15 | for both the sequential code and for each parallel programming model: 16 | OpenMP, MPI, and CUDA. 
17 | 18 | To see a description of the Makefile options execute: 19 | $ make help 20 | 21 | Examples: 22 | 23 | Use the following combinations of arguments for your first tests. 24 | You will discover that they represent different classes of scenarios 25 | and problems when the code is parallelized. 26 | Students are encouraged to design and use their own scenarios for more 27 | complete tests. See a description of the arguments in the handout. 28 | 29 | Examples: 30 | 31 | Only propagation: 32 | ./wind_seq 538 60 1397 0.5 30 29 0 0 0 0 0 0 3431 9012 6432 33 | 34 | ./wind_seq 456 812 1004 2.2 21 745 0 0 0 0 0 0 684 384 1292 35 | 36 | ./wind_seq 38000 32 31000 0.5 3 24 0 0 0 0 0 0 583 1943 2345 37 | 38 | ./wind_seq 32 2100000 118 0.1 0 2100000 0 0 0 0 0 0 673 3902 43 39 | 40 | Fixed particles with chosen positions: 41 | ./wind_seq 102 80 352 0.1 10 50 0 0 0 0 0 0 3431 9012 12432 20 12 0.712 20 13 0.713 20 14 0.714 20 15 0.715 20 16 0.716 20 17 0.717 20 18 0.718 20 19 0.719 20 20 0.720 30 16 0.516 30 18 0.518 30 20 0.520 30 22 0.522 40 20 0.420 40 30 0.430 40 40 0.440 40 50 0.450 40 60 0.460 40 70 0.470 42 | 43 | Fixed particles with random and chosen positions: 44 | ./wind_seq 102 80 352 0.1 10 50 15 16 0.1 0 0 0 3431 9012 12432 20 12 0.712 20 13 0.713 20 14 0.714 20 15 0.715 20 16 0.716 20 17 0.717 20 18 0.718 20 19 0.719 20 20 0.720 30 16 0.516 30 18 0.518 30 20 0.520 30 22 0.522 40 20 0.420 40 30 0.430 40 40 0.440 40 50 0.450 40 60 0.460 40 70 0.470 45 | 46 | Fixed and moving particles with initial random positions: 47 | ./wind_seq 2100 457 6300 0.4 1 452 20 2000 0.001 16 50 0.2 583 223 712 48 | 49 | -------------------------------------------------------------------------------- /projects/wind/handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanieleDeSensi/multicore-programming/d8d141d0d4e11d67f6474db3fde2ed0cf4c588ca/projects/wind/handout.pdf 
-------------------------------------------------------------------------------- /utils/create_users_pmc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | create_user_no_prompt () { 4 | echo ========$1 5 | echo ========$2 6 | 7 | local USERNAME="$1" 8 | local PASSWORD="$2" 9 | local GROUP="$3" 10 | local FULLNAME="$4" 11 | local EMAIL="$5" 12 | 13 | local GECOS="" 14 | echo "Creating user ${USERNAME} ${FULLNAME} ${EMAIL}..." 15 | adduser --ingroup ${GROUP} --disabled-login --gecos "" ${USERNAME} 16 | echo "Setting password..." 17 | echo "${USERNAME}:${PASSWORD}" | chpasswd 18 | echo "Adding user to docker group..." 19 | usermod -aG docker ${USERNAME} 20 | echo "Running make..." 21 | ( cd /var/yp/; make) 22 | } 23 | 24 | [ $SUDO_USER ] && user=$SUDO_USER || user=$(whoami) 25 | 26 | GROUP="studenti_psmc" 27 | 28 | while IFS=$'\t' read -r EMAIL PASSWORD NAME SURNAME; do 29 | USERNAME=$(echo $EMAIL | cut -d '@' -f 1 | tr '.' '_') 30 | FULLNAME="${NAME} ${SURNAME}" 31 | #echo "----" 32 | #echo ${USERNAME} 33 | #echo ${EMAIL} 34 | #echo ${PASSWORD} 35 | #echo ${GROUP} 36 | #echo ${FULLNAME} 37 | #echo ${EMAIL} 38 | create_user_no_prompt "${USERNAME}" "${PASSWORD}" "${GROUP}" "${FULLNAME}" "${EMAIL}" 39 | done < users_list.csv 40 | -------------------------------------------------------------------------------- /utils/openmpiscript.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ##************************************************************** 4 | ## 5 | ## Copyright (C) 1990-2018, Condor Team, Computer Sciences Department, 6 | ## University of Wisconsin-Madison, WI. 7 | ## 8 | ## Licensed under the Apache License, Version 2.0 (the "License"); you 9 | ## may not use this file except in compliance with the License. 
You may 10 | ## obtain a copy of the License at 11 | ## 12 | ## http://www.apache.org/licenses/LICENSE-2.0 13 | ## 14 | ## Unless required by applicable law or agreed to in writing, software 15 | ## distributed under the License is distributed on an "AS IS" BASIS, 16 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | ## See the License for the specific language governing permissions and 18 | ## limitations under the License. 19 | ## 20 | ##************************************************************** 21 | 22 | # This is a script to run OpenMPI jobs under the HTCondor parallel universe. 23 | # OpenMPI assumes that a full install is available on all execute nodes. 24 | 25 | ## sample submit script 26 | #universe = parallel 27 | #executable = openmpiscript 28 | #arguments = actual_mpi_job arg1 arg2 arg3 29 | #getenv = true 30 | # 31 | #should_transfer_files = yes 32 | #transfer_input_files = actual_mpi_job 33 | #when_to_transfer_output = on_exit_or_evict 34 | # 35 | #output = out.$(NODE) 36 | #error = err.$(NODE) 37 | #log = log 38 | # 39 | #machine_count = 8 40 | #queue 41 | ## 42 | 43 | ## configuration options 44 | # $USE_OPENMP should be set to true if using OpenMP with your OpenMPI executable (not typical). 45 | USE_OPENMP=false 46 | 47 | # Set the paths to the helper scripts 48 | # Get them from the HTCondor libexec directory 49 | ORTED_LAUNCHER=$(condor_config_val libexec)/orted_launcher.sh 50 | GET_ORTED_CMD=$(condor_config_val libexec)/get_orted_cmd.sh 51 | # Or set a custom path (e.g. 
the local directory if transferring the scripts) 52 | #ORTED_LAUNCHER=./orted_launcher.sh 53 | #GET_ORTED_CMD=./get_orted_cmd.sh 54 | 55 | # $MPDIR points to the location of the OpenMPI install 56 | # The pool admin may set it via OPENMPI_INSTALL_PATH in the condor_config (recommended) 57 | 58 | 59 | MPDIR=/usr 60 | #MPDIR=$(condor_config_val OPENMPI_INSTALL_PATH) 61 | 62 | 63 | 64 | 65 | 66 | # Or set it manually 67 | #MPDIR=/usr/lib64/openmpi 68 | 69 | # $EXINT is a comma-delimited list of excluded network interfaces. 70 | # If your mpi jobs are hanging, OpenMPI may be trying to use too many 71 | # network interfaces to communicate between nodes. 72 | # The pool admin may set it via OPENMPI_EXCLUDE_NETWORK_INTERFACES in the condor_config (recommended) 73 | EXINT=$(condor_config_val OPENMPI_EXCLUDE_NETWORK_INTERFACES) 74 | # Or set it manually 75 | #EXINT="docker0,virbr0" 76 | ## 77 | 78 | ## configuration check 79 | # We recommend that your pool admin use MOUNT_UNDER_SCRATCH = /tmp 80 | # so that OpenMPI caches all data under the user's scratch directory. 81 | # Not having /tmp mounted under scratch may hang mpi jobs. 
82 | _USE_SCRATCH=$(condor_config_val MOUNT_UNDER_SCRATCH) 83 | if [ -z $_USE_SCRATCH ]; then 84 | >&2 echo "WARNING: MOUNT_UNDER_SCRATCH not set in condor_config" 85 | elif test "${_USE_SCRATCH#*/tmp}" == "$_USE_SCRATCH"; then 86 | >&2 echo "WARNING: /tmp not included in MOUNT_UNDER_SCRATCH" 87 | fi 88 | 89 | # If MPDIR is not set, then use a default value 90 | if [ -z $MPDIR ]; then 91 | >&2 echo "WARNING: Using default value for \$MPDIR in openmpiscript" 92 | MPDIR=/usr/lib64/openmpi 93 | fi 94 | PATH=$MPDIR/bin:.:$PATH 95 | export PATH 96 | 97 | # If EXINT is not set, then use some default values 98 | if [ -z $EXINT ]; then 99 | >&2 echo "WARNING: Using default values for \$EXINT in openmpiscript" 100 | EXINT="docker0,virbr0" 101 | fi 102 | ## 103 | 104 | ## cleanup function 105 | _orted_launcher_pid=0 106 | _mpirun_pid=0 107 | CONDOR_CHIRP=$(condor_config_val libexec)/condor_chirp 108 | force_cleanup() { 109 | # Forward SIGTERM to the orted launcher 110 | if [ $_orted_launcher_pid -ne 0 ]; then 111 | kill -s SIGTERM $_orted_launcher_pid 112 | fi 113 | 114 | # Cleanup mpirun 115 | if [ $_CONDOR_PROCNO -eq 0 ] && [ $_mpirun_pid -ne 0 ]; then 116 | $CONDOR_CHIRP ulog "Node $_CONDOR_PROCNO caught SIGTERM, cleaning up mpirun" 117 | rm $HOSTFILE 118 | 119 | # Send SIGTERM to mpirun and the orted launcher 120 | kill -s SIGTERM $_mpirun_pid 121 | 122 | # Give mpirun 30 seconds to terminate nicely 123 | for i in {1..30}; do 124 | kill -0 $_mpirun_pid 2> /dev/null # returns 0 if running 125 | _mpirun_killed=$? 126 | if [ $_mpirun_killed -ne 0 ]; then 127 | break 128 | fi 129 | sleep 1 130 | done 131 | 132 | # If mpirun is still running, send SIGKILL 133 | if [ $_mpirun_killed -eq 0 ]; then 134 | $CONDOR_CHIRP ulog "mpirun hung on Node ${_CONDOR_PROCNO}, sending SIGKILL!" 
135 | kill -s SIGKILL $_mpirun_pid 136 | fi 137 | 138 | fi 139 | exit 1 140 | } 141 | trap force_cleanup SIGTERM 142 | ## 143 | 144 | ## execute node setup 145 | export PATH=$MPDIR/bin:$PATH 146 | 147 | # Run the orted launcher (gets orted command from condor_chirp) 148 | $ORTED_LAUNCHER & 149 | _orted_launcher_pid=$! 150 | if [ $_CONDOR_PROCNO -ne 0 ]; then 151 | # If not on node 0, wait for orted 152 | wait $_orted_launcher_pid 153 | exit $? 154 | fi 155 | ## 156 | 157 | ## head node (node 0) setup 158 | # Build the hostfile 159 | HOSTFILE=hosts 160 | while [ -f $_CONDOR_SCRATCH_DIR/$HOSTFILE ]; do 161 | HOSTFILE=x$HOSTFILE 162 | done 163 | HOSTFILE=$_CONDOR_SCRATCH_DIR/$HOSTFILE 164 | REQUEST_CPUS=$(condor_q -jobads $_CONDOR_JOB_AD -af RequestCpus) 165 | 166 | for node in $(seq 0 $(( $_CONDOR_NPROCS - 1 ))); do 167 | if $USE_OPENMP; then 168 | # OpenMP will do the threading on the execute node 169 | echo "$node slots=1" >> $HOSTFILE 170 | else 171 | # OpenMPI will do the threading on the execute node 172 | echo "$node slots=$REQUEST_CPUS" >> $HOSTFILE 173 | fi 174 | done 175 | 176 | # Make sure the executable is executable 177 | EXECUTABLE=$1 178 | shift 179 | chmod +x $EXECUTABLE 180 | ## 181 | 182 | ## run mpirun 183 | # Set MCA values for running on HTCondor 184 | export OMPI_MCA_plm_rsh_agent=$GET_ORTED_CMD # use the helper script instead of ssh 185 | export OMPI_MCA_plm_rsh_no_tree_spawn=1 # disable ssh tree spawn 186 | export OMPI_MCA_orte_hetero_nodes=1 # do not assume same hardware on each node 187 | export OMPI_MCA_orte_startup_timeout=120 # allow two minutes before failing 188 | export OMPI_MCA_hwloc_base_binding_policy="none" # do not bind to cpu cores 189 | export OMPI_MCA_btl_tcp_if_exclude="lo,$EXINT" # exclude unused tcp network interfaces 190 | 191 | # Optional MCA values to set for firewalled setups 192 | #export OMPI_MCA_btl_tcp_port_min_v4=1024 # lowest port number that can be used 193 | #export OMPI_MCA_btl_tcp_port_range_v4=64511 # range of 
ports above lowest that can be used 194 | 195 | # Optionally set MCA values for increasing mpirun verbosity per component 196 | # (see ompi_info for more components) 197 | #export OMPI_MCA_plm_base_verbose=30 198 | #export OMPI_MCA_orte_base_verbose=30 199 | #export OMPI_MCA_hwloc_base_verbose=30 200 | #export OMPI_MCA_btl_base_verbose=30 201 | 202 | # Run mpirun in the background and wait for it to exit 203 | mpirun -v --prefix $MPDIR -hostfile $HOSTFILE $EXECUTABLE $@ & 204 | _mpirun_pid=$! 205 | wait $_mpirun_pid 206 | _mpirun_exit=$? 207 | 208 | ## clean up 209 | # Wait for orted to finish 210 | wait $_orted_launcher_pid 211 | rm $HOSTFILE 212 | exit $_mpirun_exit 213 | --------------------------------------------------------------------------------