├── README.md
├── lec04
    └── hello_world_0.c
├── lec05
    └── ring.c
├── lec06
    ├── README.md
    ├── argv.c
    ├── pi_parallel.c
    ├── pi_seq.c
    └── rand.c
├── lec08
    └── sum_vectors.c
├── lec12
    ├── README.md
    ├── branch_prediction_fast.c
    ├── branch_prediction_slow.c
    ├── cache_01_fast.c
    ├── cache_01_fast_broken.c
    ├── cache_01_fast_broken_fixed.c
    ├── cache_01_slow.c
    ├── cache_fs_fast.c
    ├── cache_fs_slow.c
    ├── compile.sh
    └── my_timer.h
├── lec13
    ├── README.md
    ├── blackscholes.c
    ├── compile.sh
    ├── inputgen.c
    ├── inputs
    │   ├── input_1000.txt
    │   ├── input_10000.txt
    │   └── input_100000.txt
    ├── my_timer.h
    ├── optionData.txt
    └── solution
    │   ├── blackscholes_omp
    │   ├── blackscholes_omp.c
    │   ├── blackscholes_pthreads
    │   └── blackscholes_pthreads.c
├── lec15
    ├── Makefile
    ├── README.md
    ├── histogram.c
    ├── histogram_solution_ato_local.c
    ├── histogram_solution_ato_local_better.c
    ├── histogram_solution_red.c
    ├── histogram_solution_trivial_ato.c
    ├── histogram_solution_trivial_crit.c
    ├── matmul.c
    ├── matmul_solution.c
    ├── pi.c
    ├── pi_solution_ato.c
    ├── pi_solution_crit.c
    ├── pi_solution_crit_rand.c
    ├── pi_solution_red.c
    └── rand_vs_rand_r.c
├── lec19
    ├── Makefile
    ├── README.md
    ├── cuda_job.sub
    ├── error_checks.h
    ├── image_blur.cu
    ├── image_blur_shared.cu
    ├── jacobi.cu
    ├── jacobi_solution.cu
    ├── test_cuda.cu
    └── vector_add.cu
├── projects
    ├── README.md
    ├── kmeans
    │   ├── KMEANS.c
    │   ├── KMEANS_cuda.cu
    │   ├── KMEANS_mpi.c
    │   ├── KMEANS_omp.c
    │   ├── LICENSE
    │   ├── Makefile
    │   ├── README
    │   ├── handout.pdf
    │   └── test_files
    │   │   ├── input100D.inp
    │   │   ├── input100D2.inp
    │   │   ├── input10D.inp
    │   │   ├── input20D.inp
    │   │   ├── input2D.inp
    │   │   └── input2D2.inp
    ├── sequence
    │   ├── LICENSE
    │   ├── Makefile
    │   ├── README
    │   ├── align.c
    │   ├── align_cuda.cu
    │   ├── align_mpi.c
    │   ├── align_omp.c
    │   ├── handout.pdf
    │   └── rng.c
    └── wind
    │   ├── LICENSE
    │   ├── Makefile
    │   ├── README
    │   ├── handout.pdf
    │   ├── wind.c
    │   ├── wind_cuda.cu
    │   ├── wind_mpi.c
    │   └── wind_omp.c
└── utils
    ├── create_users_pmc.sh
    └── openmpiscript.sh


/README.md:
--------------------------------------------------------------------------------
1 | This repository contains the code samples shown and discussed in class.


--------------------------------------------------------------------------------
/lec04/hello_world_0.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <mpi.h>
 3 | 
 4 | /*
 5 |  * Rank 0 prints its own greeting, then receives one greeting string from
 6 |  * every other rank (in increasing rank order) and prints it. All other
 7 |  * ranks only format their greeting and send it to rank 0.
 8 |  */
 9 | int main(void){
10 |     int r = MPI_Init(NULL, NULL);
11 |     // Check the outcome of MPI_Init before making any other MPI call.
12 |     if(r != MPI_SUCCESS){
13 |         printf("Error starting MPI program. Terminating.\n");
14 |         MPI_Abort(MPI_COMM_WORLD, r);
15 |     }
16 |     int size, rank;
17 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
18 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
19 |     char str[256];
20 |     if(rank == 0){
21 |         printf("Hello, World! I am process %d of %d.\n", rank, size);
22 |         int i;
23 |         for(i = 1; i < size; i++){
24 |             // Blocking receive from rank i, so greetings are printed in rank order.
25 |             MPI_Recv(str, 256, MPI_CHAR, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
26 |             printf("%s", str);
27 |         }
28 |     }else{
29 |         // snprintf bounds the write to the buffer and returns the string length.
30 |         int len = snprintf(str, sizeof(str), "Hello, World! I am process %d of %d.\n", rank, size);
31 |         // Send only the string and its terminator, not uninitialised buffer bytes;
32 |         // the receive on rank 0 accepts any message of up to 256 chars.
33 |         MPI_Send(str, len + 1, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
34 |     }
35 |     
36 |     MPI_Finalize();
37 |     return 0;
38 | }


--------------------------------------------------------------------------------
/lec05/ring.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <mpi.h>
 3 | 
 4 | /* Every rank exchanges one int with both of its ring neighbours using non-blocking calls. */
 5 | int main(void){
 6 |     int send_right = 19, send_left = 23;
 7 |     int recv_left, recv_right;
 8 |     int rank, size;
 9 |     MPI_Request requests[4];
10 | 
11 |     MPI_Init(NULL, NULL);
12 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
13 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
14 |     // Ring neighbours, with wrap-around at both ends
15 |     const int right = (rank + 1) % size;
16 |     const int left = (rank - 1 + size) % size;
17 |     // Post both sends first (right, then left), then both receives (right, then left)
18 |     MPI_Isend(&send_right, 1, MPI_INT, right, 0, MPI_COMM_WORLD, &requests[0]);
19 |     MPI_Isend(&send_left, 1, MPI_INT, left, 0, MPI_COMM_WORLD, &requests[1]);
20 |     MPI_Irecv(&recv_right, 1, MPI_INT, right, 0, MPI_COMM_WORLD, &requests[2]);
21 |     MPI_Irecv(&recv_left, 1, MPI_INT, left, 0, MPI_COMM_WORLD, &requests[3]);
22 |     // Compute anything while the four requests are in flight
23 |     // ...
24 |     MPI_Waitall(4, requests, MPI_STATUSES_IGNORE);
25 | 
26 |     MPI_Finalize();
27 |     return 0;
28 | }


--------------------------------------------------------------------------------
/lec06/README.md:
--------------------------------------------------------------------------------
 1 | Exercises shown during lecture 6.
 2 | - rand.c: A simple program in which each process prints a random number. You can see that every rank prints the same number, since they all use the same (default) seed. Now, try adding the following code after the call to MPI_Init:
 3 | ```c
 4 |     int rank;
 5 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 6 |     srand(rank);
 7 | ```
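 8 | If you run the program again, you will see that the ranks print different numbers, because each rank now uses a different seed. (Note: some C libraries, e.g. glibc, treat seed 0 like seed 1, so ranks 0 and 1 may still print the same number; seeding with `rank + 1` avoids this.)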
 9 | 
10 | - pi_seq.c: A program that calculates the value of pi (sequentially) using the process we have seen in the slides.
11 | - pi_parallel.c: A program that calculates the value of pi (in parallel) using the process we have seen in the slides.
12 | - argv.c: A program that shows how to use the argv parameter in the main function.


--------------------------------------------------------------------------------
/lec06/argv.c:
--------------------------------------------------------------------------------
 1 | #include <mpi.h>
 2 | #include <stdio.h>
 3 | 
 4 | /* Prints this rank's id together with argv[0] and the first user-supplied argument. */
 5 | int main(int argc, char** argv){
 6 |     MPI_Init(&argc, &argv);
 7 |     int rank;
 8 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 9 |     // argv[1] is NULL when no argument is given; passing NULL to %s is undefined behaviour
10 |     printf("I am rank %d and argv[0] is %s argv[1] is %s\n", 
11 |            rank, argv[0], argc > 1 ? argv[1] : "(none)");
12 |     MPI_Finalize();
13 |     return 0;
14 | }


--------------------------------------------------------------------------------
/lec06/pi_parallel.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <mpi.h>
 4 | #include <time.h>
 5 | 
 6 | double get_rand_minus_one_one(){ // pseudo-random double in [-1, 1], drawn with rand()
 7 |     return (double)rand() / RAND_MAX * 2 - 1;
 8 | }
 9 | 
10 | /*
11 |  * Monte Carlo estimate of pi: every rank throws its share of random points
12 |  * into the square [-1, 1] x [-1, 1] and counts those that fall inside the
13 |  * unit circle; rank 0 sums the counts and prints the estimate.
14 |  * Usage: pi_parallel <num_tosses>
15 |  */
16 | int main(int argc, char** argv){
17 |     MPI_Init(&argc, &argv);
18 |     double start_time = MPI_Wtime();
19 |     int world_size, rank;
20 |     MPI_Comm_size(MPI_COMM_WORLD, &world_size);
21 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
22 |     // Without an argument argv[1] is NULL and must not reach atoi().
23 |     int num_tosses = (argc > 1) ? atoi(argv[1]) : 0;
24 |     int local_tosses = num_tosses / world_size;
25 |     if(local_tosses <= 0){
26 |         if(rank == 0){
27 |             fprintf(stderr, "Usage: pi_parallel <num_tosses> (at least one toss per process)\n");
28 |         }
29 |         MPI_Finalize();
30 |         return 1;
31 |     }
32 |     srand(time(NULL)*rank);
33 |     int toss;
34 |     int num_hits = 0;
35 |     for(toss = 0; toss < local_tosses; toss++){
36 |         double x = get_rand_minus_one_one();
37 |         double y = get_rand_minus_one_one();
38 |         if(x*x + y*y <= 1){
39 |             num_hits++;
40 |         }
41 |     }
42 |     int total_hits = 0;
43 |     MPI_Reduce(&num_hits, &total_hits, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
44 |     if(rank == 0){
45 |         // Divide by the tosses actually performed: the remainder of
46 |         // num_tosses / world_size is never thrown.
47 |         double performed = (double)local_tosses * world_size;
48 |         // 4.0 keeps the product in double; 4 * total_hits could overflow int.
49 |         double pi_estimate = 4.0 * total_hits / performed;
50 |         printf("Estimate of pi = %f Computed in %f seconds\n", pi_estimate, MPI_Wtime() - start_time);
51 |     }
52 |     MPI_Finalize();
53 |     return 0;
54 | }


--------------------------------------------------------------------------------
/lec06/pi_seq.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <time.h>
 4 | 
 5 | double get_rand_minus_one_one(){ // pseudo-random double in [-1, 1], drawn with rand()
 6 |     return (double)rand() / RAND_MAX * 2 - 1;
 7 | }
 8 | 
 9 | /*
10 |  * Sequential Monte Carlo estimate of pi: throws num_tosses random points into
11 |  * the square [-1, 1] x [-1, 1] and counts those inside the unit circle.
12 |  * Usage: pi_seq <num_tosses>
13 |  */
14 | int main(int argc, char** argv){
15 |     // Without an argument argv[1] is NULL and must not reach atoi().
16 |     int num_tosses = (argc > 1) ? atoi(argv[1]) : 0;
17 |     if(num_tosses <= 0){
18 |         // Zero tosses would make the estimate 0/0.
19 |         fprintf(stderr, "Usage: pi_seq <num_tosses> (positive integer)\n");
20 |         return 1;
21 |     }
22 |     srand(time(NULL));
23 |     int toss;
24 |     int num_hits = 0;
25 |     for(toss = 0; toss < num_tosses; toss++){
26 |         double x = get_rand_minus_one_one();
27 |         double y = get_rand_minus_one_one();
28 |         if(x*x + y*y <= 1){
29 |             num_hits++;
30 |         }
31 |     }
32 |     // 4.0 keeps the product in double; 4 * num_hits could overflow int.
33 |     double pi_estimate = 4.0 * num_hits / ((double)num_tosses);
34 |     printf("Estimate of pi = %f\n", pi_estimate);
35 |     return 0;
36 | }


--------------------------------------------------------------------------------
/lec06/rand.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <math.h>
 3 | #include <stdio.h>
 4 | #include <mpi.h>
 5 | 
 6 | int main(int argc, char** argv){ // every rank prints the first value returned by rand(); srand() is never called
 7 |     MPI_Init(NULL, NULL);
 8 |     const int value = rand();
 9 |     printf("Rand %d\n", value);
10 |     MPI_Finalize();
11 |     return 0;
12 | }


--------------------------------------------------------------------------------
/lec08/sum_vectors.c:
--------------------------------------------------------------------------------
 1 | #include <mpi.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | 
 5 | int* create_random_vector(int n){ // n random ints in [0, 9]; the caller frees; NULL if malloc fails
 6 |     int* vec = malloc(n * sizeof *vec);
 7 |     for(int i = 0; vec != NULL && i < n; i++){ // skip the fill when allocation failed
 8 |         vec[i] = rand() % 10;
 9 |     }
10 |     return vec;
11 | }
12 | 
13 | void print_vector(int* vec, int n){ // writes the n ints to stdout, each followed by a space, then a newline
14 |     for(int k = 0; k < n; ++k){
15 |         fprintf(stdout, "%d ", vec[k]);
16 |     }
17 |     fputc('\n', stdout);
18 | }
19 | 
20 | /*
21 |  * Adds two random vectors of length n (given as argv[1]) in parallel: rank 0
22 |  * creates a and b and scatters equal chunks, every rank adds its chunk, and
23 |  * the partial sums are gathered and printed on rank 0.
24 |  */
25 | int main(int argc, char** argv){
26 |     MPI_Init(&argc, &argv);
27 |     int rank, size;
28 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
29 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
30 |     int *a, *b;
31 |     // Without an argument argv[1] is NULL and must not reach atoi().
32 |     if(argc < 2){
33 |         printf("Usage: sum_vectors <n>\n");
34 |         MPI_Abort(MPI_COMM_WORLD, 1);
35 |     }
36 |     int n = atoi(argv[1]);
37 |     if(n <= 0 || n % size != 0){
38 |         printf("n must be positive and divisible by the number of processes\n");
39 |         MPI_Abort(MPI_COMM_WORLD, 1);
40 |     }
41 |     int chunk = n / size;
42 |     if(rank == 0){
43 |         a = create_random_vector(n);
44 |         b = create_random_vector(n);
45 |         if(a == NULL || b == NULL){
46 |             printf("Out of memory\n");
47 |             MPI_Abort(MPI_COMM_WORLD, 1);
48 |         }
49 |         printf("Rank 0: a = ");
50 |         print_vector(a, n);
51 |         printf("Rank 0: b = ");
52 |         print_vector(b, n);
53 |         // The root passes MPI_IN_PLACE as receive buffer: its own chunk stays
54 |         // at the beginning of a and b.
55 |         MPI_Scatter(a, chunk, MPI_INT, MPI_IN_PLACE, chunk, MPI_INT, 0, MPI_COMM_WORLD);
56 |         MPI_Scatter(b, chunk, MPI_INT, MPI_IN_PLACE, chunk, MPI_INT, 0, MPI_COMM_WORLD);
57 |     }else{
58 |         a = (int*) malloc(chunk * sizeof(int));
59 |         b = (int*) malloc(chunk * sizeof(int));
60 |         if(a == NULL || b == NULL){
61 |             printf("Out of memory\n");
62 |             MPI_Abort(MPI_COMM_WORLD, 1);
63 |         }
64 |         MPI_Scatter(NULL, chunk, MPI_INT, a, chunk, MPI_INT, 0, MPI_COMM_WORLD);
65 |         MPI_Scatter(NULL, chunk, MPI_INT, b, chunk, MPI_INT, 0, MPI_COMM_WORLD);
66 |     }
67 |     int* c = (int*) malloc(chunk * sizeof(int));
68 |     int* c_finale = NULL;
69 |     if(rank == 0){
70 |         // Only the root needs room for the full result.
71 |         c_finale = (int*) malloc(n * sizeof(int));
72 |     }
73 |     if(c == NULL || (rank == 0 && c_finale == NULL)){
74 |         printf("Out of memory\n");
75 |         MPI_Abort(MPI_COMM_WORLD, 1);
76 |     }
77 |     for(int i = 0; i < chunk; i++){
78 |         c[i] = a[i] + b[i];
79 |     }
80 |     MPI_Gather(c, chunk, MPI_INT, c_finale, chunk, MPI_INT, 0, MPI_COMM_WORLD);
81 |     if(rank == 0){
82 |         printf("Rank 0: c = ");
83 |         print_vector(c_finale, n);
84 |     }
85 |     // Release every buffer before shutting MPI down.
86 |     free(a);
87 |     free(b);
88 |     free(c);
89 |     free(c_finale);
90 |     MPI_Finalize();
91 |     return 0;
92 | }


--------------------------------------------------------------------------------
/lec12/README.md:
--------------------------------------------------------------------------------
 1 | - cache_01_slow.c: The program does not perform well because the matrix is read by column rather than by row
 2 | - cache_01_fast.c: Same as cache_01_slow.c, but the matrix is read by row rather than by column, thus outperforming the previous version
 3 | - cache_01_fast_broken.c: Same as cache_01_fast.c, but the vector declaration is within the main body. This enables GCC to apply Dead Code Elimination (DCE) and remove basically all the code. The application would then simply not compute anything. Beware: some compilers can apply dead code elimination to global variables as well, in which case cache_01_slow.c and cache_01_fast.c would also not compute anything.
 4 | - cache_01_fast_broken_fixed.c: Same as cache_01_fast_broken.c, but now we do something with the result of the calculation (e.g., print the sum of the elements of y), so that the compiler does not eliminate the code. Alternatively, you can disable DCE by adding the following flags when compiling: -fno-dce -fno-dse -fno-tree-dce -fno-tree-dse
 5 | - cache_fs_slow.c: It shows the false sharing problem
 6 | - cache_fs_fast.c: It solves the false sharing problem by padding the structure
 7 | - branch_prediction_slow.c: Fills an array with random elements between 0 and 9, and then counts the number of elements that are smaller than 5. The program is slow because the branch predictor is not able to predict the outcome of the if statement.
 8 | - branch_prediction_fast.c: Same as branch_prediction_slow.c, but it sorts the array before doing the check, so that (approximately) the first n/2 elements are smaller than 5 and the remaining n/2 are not. In this way, the branch predictor is able to predict the outcome of the if statement more effectively.
 9 | 
10 | 
11 | For more examples, check https://github.com/Kobzol/hardware-effects
12 | 
13 | To install perf on WSL2:
14 | ```bash
15 | sudo apt install linux-tools-generic
16 | ```


--------------------------------------------------------------------------------
/lec12/branch_prediction_fast.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <time.h>
 4 | #include <math.h>
 5 | #include "my_timer.h"
 6 | 
 7 | double x[MAX];   // array scanned by the timed loop; MAX is supplied on the compiler command line (-D MAX=...)
 8 | 
 9 | int compare(const void* a, const void* b) { // qsort comparator: ascending order of doubles, returns -1, 0 or 1
10 |    const double lhs = *(const double*)a, rhs = *(const double*)b;
11 |    return (lhs > rhs) - (lhs < rhs); // compare, do not subtract: (int)(lhs - rhs) reports values less than 1 apart as equal
12 | }
13 | // Counts the elements of the sorted array x that are smaller than 5, ITER times, and prints the average time of one pass
14 | int main(int argc, char** argv) {
15 |     int i,iter;
16 |     srand(time(NULL));
17 |     for (i = 0; i < MAX; i++) {
18 |         x[i] = rand() % 10; // random integer value in 0..9
19 |     }
20 |     qsort(x, MAX, sizeof(double), compare); // sorted once, before any timing starts
21 | 
22 |     double total_time = 0.0;
23 |     int total_smaller = 0;
24 |     for(iter = 0; iter < ITER; iter++){
25 |         double start, stop;
26 |         int smaller = 0;
27 |         GET_TIME(start);
28 |         for(i = 0; i < MAX; i++){ // timed region: one full pass over x
29 |             if(x[i] < 5){
30 |                 smaller++;
31 |             }
32 |         }
33 |         GET_TIME(stop);
34 |         total_smaller += smaller; // accumulated so the count can be printed below
35 |         total_time += stop-start;
36 |     }
37 |     printf("Total smaller %d\n", total_smaller);
38 |     printf("Average runtime %f sec\n", total_time/ITER);
39 | }
40 | 


--------------------------------------------------------------------------------
/lec12/branch_prediction_slow.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <time.h>
 4 | #include <math.h>
 5 | #include "my_timer.h"
 6 | 
 7 | double x[MAX];   // array scanned by the timed loop; MAX is supplied on the compiler command line (-D MAX=...)
 8 | 
 9 | // Counts the elements of the unsorted array x that are smaller than 5, ITER times, and prints the average time of one pass
10 | int main(int argc, char** argv) {
11 |     int i,iter;
12 |     srand(time(NULL));
13 |     for (i = 0; i < MAX; i++) {
14 |         x[i] = rand() % 10; // random integer value in 0..9, left in random order
15 |     }
16 | 
17 |     double total_time = 0.0;
18 |     int total_smaller = 0;
19 |     for(iter = 0; iter < ITER; iter++){
20 |         double start, stop;
21 |         int smaller = 0;
22 |         GET_TIME(start);
23 |         for(i = 0; i < MAX; i++){ // timed region: one full pass over x
24 |             if(x[i] < 5){
25 |                 smaller++;
26 |             }
27 |         }
28 |         GET_TIME(stop);
29 |         total_smaller += smaller; // accumulated so the count can be printed below
30 |         total_time += stop-start;
31 |     }
32 |     printf("Total smaller %d\n", total_smaller);
33 |     printf("Average runtime %f sec\n", total_time/ITER);
34 | }
35 | 


--------------------------------------------------------------------------------
/lec12/cache_01_fast.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <time.h>
 4 | #include "my_timer.h"
 5 | 
 6 | double A[MAX][MAX]; // MAX x MAX matrix; MAX is supplied on the compiler command line (-D MAX=...)
 7 | double x[MAX]; // input vector
 8 | double y[MAX];   // result vector, accumulated across all ITER repetitions
 9 | 
10 | // Computes matrix-vector multiplication sequentially, ITER times, and prints the average time of one multiplication
11 | int main(int argc, char** argv) {
12 |     int i,j,iter;
13 |     srand(time(NULL));
14 |     /* Initialize A and x with random values, and y to 0s*/
15 |     for (i = 0; i < MAX; i++) {
16 |         x[i] = (double) rand() / RAND_MAX; // Random number between 0 and 1
17 |         y[i] = 0.0;
18 |         for (j = 0; j < MAX; j++)
19 |             A[i][j] = (double) rand() / RAND_MAX; // Random number between 0 and 1
20 |     }
21 | 
22 |     double total_time = 0.0;
23 |     for(iter = 0; iter < ITER; iter++){
24 |         double start, stop;
25 |         GET_TIME(start);
26 |         for (i = 0; i < MAX; i++)        // outer loop over rows
27 |             for (j = 0; j < MAX; j++)            // inner loop over columns: consecutive elements of row i
28 |                 y[i] += A[i][j]*x[j];
29 |         GET_TIME(stop);
30 |         total_time += stop-start;
31 |     }
32 | 
33 |     /**
34 |     for (i = 0; i < MAX; i++)
35 |         printf("%f\n", y[i]);
36 |     **/
37 | 
38 |     printf("Average runtime %f sec\n", total_time/ITER);
39 | }
40 | 


--------------------------------------------------------------------------------
/lec12/cache_01_fast_broken.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <time.h>
 4 | #include "my_timer.h"
 5 | 
 6 | double A[MAX][MAX]; // MAX x MAX matrix; MAX is supplied on the compiler command line (-D MAX=...)
 7 | double x[MAX]; // input vector
 8 | 
 9 | // Computes matrix-vector multiplication sequentially; y is a local array whose values are never read afterwards
10 | int main(int argc, char** argv) {
11 |     double y[MAX]; // local result vector: per the lecture README this lets the compiler eliminate the computation as dead code
12 |     int i,j,iter;
13 |     srand(time(NULL));
14 |     /* Initialize A and x with random values, and y to 0s*/
15 |     for (i = 0; i < MAX; i++) {
16 |         x[i] = (double) rand() / RAND_MAX; // Random number between 0 and 1
17 |         y[i] = 0.0;
18 |         for (j = 0; j < MAX; j++)
19 |             A[i][j] = (double) rand() / RAND_MAX; // Random number between 0 and 1
20 |     }
21 | 
22 |     double total_time = 0.0;
23 |     for(iter = 0; iter < ITER; iter++){
24 |         double start, stop;
25 |         GET_TIME(start);
26 |         for (i = 0; i < MAX; i++)        // outer loop over rows
27 |             for (j = 0; j < MAX; j++)            // inner loop over columns: consecutive elements of row i
28 |                 y[i] += A[i][j]*x[j];
29 |         GET_TIME(stop);
30 |         total_time += stop-start;
31 |     }
32 | 
33 |     /**
34 |     for (i = 0; i < MAX; i++)
35 |         printf("%f\n", y[i]);
36 |     **/
37 | 
38 |     printf("Average runtime %f sec\n", total_time/ITER);
39 | }
40 | 


--------------------------------------------------------------------------------
/lec12/cache_01_fast_broken_fixed.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <time.h>
 4 | #include "my_timer.h"
 5 | 
 6 | double A[MAX][MAX]; // MAX x MAX matrix; MAX is supplied on the compiler command line (-D MAX=...)
 7 | double x[MAX]; // input vector
 8 | 
 9 | // Computes matrix-vector multiplication sequentially and prints a sum of the results so that they are actually used
10 | int main(int argc, char** argv) {
11 |     double y[MAX]; // local result vector, accumulated across all ITER repetitions
12 |     int i,j,iter;
13 |     double dummy = 0.0; // running sum of y, printed at the end
14 |     srand(time(NULL));
15 |     /* Initialize A and x with random values, and y to 0s*/
16 |     for (i = 0; i < MAX; i++) {
17 |         x[i] = (double) rand() / RAND_MAX; // Random number between 0 and 1
18 |         y[i] = 0.0;
19 |         for (j = 0; j < MAX; j++)
20 |             A[i][j] = (double) rand() / RAND_MAX; // Random number between 0 and 1
21 |     }
22 | 
23 |     double total_time = 0.0;
24 |     for(iter = 0; iter < ITER; iter++){
25 |         double start, stop;
26 |         GET_TIME(start);
27 |         for (i = 0; i < MAX; i++)        // outer loop over rows
28 |             for (j = 0; j < MAX; j++)            // inner loop over columns: consecutive elements of row i
29 |                 y[i] += A[i][j]*x[j];
30 |         GET_TIME(stop);
31 |         total_time += stop-start;
32 | 
33 |         // Do something with the values of y to avoid dead code elimination (done outside the timed region)
34 |         for (i = 0; i < MAX; i++)
35 |             dummy += y[i];        
36 |     }
37 |     printf("Dummy value to avoid dead code elimination: %f\n", dummy);
38 |     printf("Average runtime %f sec\n", total_time/ITER);
39 | }
40 | 


--------------------------------------------------------------------------------
/lec12/cache_01_slow.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <time.h>
 4 | #include "my_timer.h"
 5 | 
 6 | double A[MAX][MAX]; // MAX x MAX matrix; MAX is supplied on the compiler command line (-D MAX=...)
 7 | double x[MAX]; // input vector
 8 | double y[MAX];   // result vector, accumulated across all ITER repetitions
 9 | 
10 | // Computes matrix-vector multiplication sequentially, walking A column by column, and prints the average time of one multiplication
11 | int main(int argc, char** argv) {
12 |     int i,j,iter;
13 |     srand(time(NULL));
14 |     /* Initialize A and x with random values, and y to 0s*/
15 |     for (i = 0; i < MAX; i++) {
16 |         x[i] = (double) rand() / RAND_MAX; // Random number between 0 and 1
17 |         y[i] = 0.0;
18 |         for (j = 0; j < MAX; j++)
19 |             A[i][j] = (double) rand() / RAND_MAX; // Random number between 0 and 1
20 |     }
21 | 
22 |     double total_time = 0.0;
23 |     for(iter = 0; iter < ITER; iter++){
24 |         double start, stop;
25 |         GET_TIME(start);
26 |         for (j = 0; j < MAX; j++) // outer loop over columns
27 |             for (i = 0; i < MAX; i++)        // inner loop over rows: successive accesses to A are MAX doubles apart
28 |                 y[i] += A[i][j]*x[j];
29 |         GET_TIME(stop);
30 |         total_time += stop-start;
31 |     }
32 | 
33 |     /**
34 |     for (i = 0; i < MAX; i++)
35 |         printf("%f\n", y[i]);
36 |     **/
37 | 
38 |     printf("Average runtime %f sec\n", total_time/ITER);
39 | }
40 | 


--------------------------------------------------------------------------------
/lec12/cache_fs_fast.c:
--------------------------------------------------------------------------------
 1 | #define _GNU_SOURCE
 2 | #include <pthread.h>
 3 | #include <sched.h>
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <time.h>
 7 | #include <math.h>
 8 | #include "my_timer.h"
 9 | 
10 | 
11 | #define CLS 16 // distance, in floats, between the slots of two threads; NOTE(review): presumably one 64-byte cache line with 4-byte floats — confirm for the target CPU
12 | 
13 | float data[NUM_THREADS*CLS]; // thread i only updates data[i*CLS]
14 | 
15 | void* thread_fun(void* arg){ // arg points to this thread's int id; always returns NULL
16 |     int thread_id = *((int*) arg);
17 |     // Pin the calling thread to the CPU whose index equals thread_id (the call's result is not checked)
18 |     cpu_set_t cpuset;
19 |     pthread_t thread = pthread_self();
20 |     CPU_ZERO(&cpuset);
21 |     CPU_SET(thread_id, &cpuset);
22 |     pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset);
23 | 
24 |     for(int i = 0; i < 100000; i++){
25 |         data[thread_id*CLS] += i; // the shared array is written on every iteration, at this thread's own slot
26 |     }
27 |     return NULL;
28 | }
29 | 
30 | // Runs NUM_THREADS threads ITER times and prints the average wall-clock time
31 | // of one create/join round.
32 | int main(int argc, char** argv) {
33 |     int iter;
34 |     srand(time(NULL));
35 |     // Initialise the padded slots the threads actually update (stride CLS),
36 |     // not the first NUM_THREADS consecutive floats.
37 |     for(int i = 0; i < NUM_THREADS; i++){
38 |         data[i*CLS] = rand();
39 |     }
40 | 
41 |     // Each thread receives a pointer to its own id, which must outlive the thread.
42 |     int ids[NUM_THREADS];
43 |     for(int i = 0; i < NUM_THREADS; i++){
44 |         ids[i] = i;
45 |     }
46 |     pthread_t threads[NUM_THREADS];
47 | 
48 |     double total_time = 0.0;
49 |     for(iter = 0; iter < ITER; iter++){
50 |         double start, stop;
51 |         GET_TIME(start);
52 |         // Create threads
53 |         for(int i = 0; i < NUM_THREADS; i++){
54 |             pthread_create(&threads[i], NULL, thread_fun, (void*) &ids[i]);
55 |         }
56 |         
57 |         // Join threads
58 |         for(int i = 0; i < NUM_THREADS; i++){
59 |             pthread_join(threads[i], NULL);
60 |         }
61 |         GET_TIME(stop);
62 |         // The measured interval includes thread creation and joining.
63 |         total_time += stop-start;
64 |     }
65 | 
66 |     printf("Average runtime %f sec\n", total_time/ITER);
67 |     return 0;
68 | }


--------------------------------------------------------------------------------
/lec12/cache_fs_slow.c:
--------------------------------------------------------------------------------
 1 | #define _GNU_SOURCE
 2 | #include <pthread.h>
 3 | #include <sched.h>
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <time.h>
 7 | #include <math.h>
 8 | #include "my_timer.h"
 9 | 
10 | 
11 | 
12 | float data[NUM_THREADS]; // one unpadded float per thread: neighbouring threads update adjacent elements
13 | 
14 | void* thread_fun(void* arg){ // arg points to this thread's int id; always returns NULL
15 |     int thread_id = *((int*) arg);
16 |     // Pin the calling thread to the CPU whose index equals thread_id
17 |     cpu_set_t cpuset;
18 |     pthread_t thread = pthread_self();
19 |     CPU_ZERO(&cpuset);
20 |     CPU_SET(thread_id, &cpuset);
21 |     pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset);
22 |     // Update the shared float on every iteration, as cache_fs_fast.c does but
23 |     // without padding. A local int accumulator would overflow here
24 |     // (0 + 1 + ... + 99999 exceeds INT_MAX), which is undefined behaviour.
25 |     for(int i = 0; i < 100000; i++){
26 |         data[thread_id] += i;
27 |     }
28 |     return NULL;
29 | }
30 | // Runs NUM_THREADS threads ITER times and prints the average wall-clock time of one create/join round
31 | int main(int argc, char** argv) {
32 |     int iter;
33 |     srand(time(NULL));
34 |     for(int i = 0; i < NUM_THREADS; i++){
35 |         data[i] = rand(); // random starting value for each thread's element
36 |     }
37 | 
38 |     int ids[NUM_THREADS]; // each thread receives a pointer to its own entry
39 |     for(int i = 0; i < NUM_THREADS; i++){
40 |         ids[i] = i;
41 |     }
42 |     pthread_t threads[NUM_THREADS];
43 | 
44 |     double total_time = 0.0;
45 |     for(iter = 0; iter < ITER; iter++){
46 |         double start, stop;
47 |         GET_TIME(start);
48 |         // Create threads
49 |         for(int i = 0; i < NUM_THREADS; i++){
50 |             pthread_create(&threads[i], NULL, thread_fun, (void*) &ids[i]);
51 |         }
52 |         
53 |         // Join threads
54 |         for(int i = 0; i < NUM_THREADS; i++){
55 |             pthread_join(threads[i], NULL);
56 |         }
57 |         GET_TIME(stop);
58 |         total_time += stop-start; // the measured interval includes thread creation and joining
59 |     }
60 | 
61 |     printf("Average runtime %f sec\n", total_time/ITER);
62 | }
63 | 


--------------------------------------------------------------------------------
/lec12/compile.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # You should use Makefiles instead ;)
 3 | 
 4 | # Examples on profiling (NOTE: mat_vec.c is not among the files listed for this directory)
 5 | gcc -Wall -pg -O3 -o mat_vec_p mat_vec.c -pthread
 6 | 
 7 | # Examples on row-major access (MAX and ITER are passed to the sources as preprocessor macros)
 8 | MATRIX_SIZE=10000
 9 | NUM_ITERATIONS=10
10 | OPT=-O3
11 | 
12 | gcc -Wall -g ${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_01_slow cache_01_slow.c
13 | gcc -Wall -g ${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_01_fast cache_01_fast.c
14 | gcc -Wall -g ${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_01_fast_broken cache_01_fast_broken.c
15 | gcc -Wall -g ${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_01_fast_broken_fixed cache_01_fast_broken_fixed.c
16 | 
17 | # Examples on false sharing (MATRIX_SIZE keeps the value set above; optimization is switched off)
18 | NUM_ITERATIONS=1000
19 | OPT=-O0
20 | NUM_THREADS=4
21 | gcc -Wall -g ${OPT} -D NUM_THREADS=${NUM_THREADS} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_fs_slow cache_fs_slow.c -pthread
22 | gcc -Wall -g ${OPT} -D NUM_THREADS=${NUM_THREADS} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o cache_fs_fast cache_fs_fast.c -pthread
23 | 
24 | # Examples on branch prediction (here MAX is the length of the array; optimization is switched off)
25 | MATRIX_SIZE=1000000
26 | NUM_ITERATIONS=10
27 | OPT=-O0
28 | gcc -Wall -g ${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o branch_prediction_slow branch_prediction_slow.c
29 | gcc -Wall -g ${OPT} -D MAX=${MATRIX_SIZE} -D ITER=${NUM_ITERATIONS} -o branch_prediction_fast branch_prediction_fast.c
30 | 
31 | # Matrix-vector mul (NOTE: mat_vec.c is not among the files listed for this directory)
32 | gcc -Wall -g -O3 -o mat_vec mat_vec.c -pthread
33 | 


--------------------------------------------------------------------------------
/lec12/my_timer.h:
--------------------------------------------------------------------------------
1 | #include <sys/time.h>
2 | 
3 | #define GET_TIME(now) {                  \
4 |    struct timeval t;                     \
5 |    gettimeofday(&t, NULL);               \
6 |    now = t.tv_sec + t.tv_usec/1000000.0; \
7 | }
8 | 


--------------------------------------------------------------------------------
/lec13/README.md:
--------------------------------------------------------------------------------
 1 | In this exercise, you will parallelize a sequential program that applies the Black-Scholes option pricing formula to a large dataset. The Black-Scholes formula is a mathematical model for the dynamics of a financial market containing derivative investment instruments. The formula calculates the price of a financial option comprising a stock and an option to buy or sell the stock at a specified price at a future date. 
 2 | Regardless of the specific details of the formula, in this exercise you are supposed to parallelize the sequential code available in the file `blackscholes.c`. 
 3 | The code reads a dataset from a file and applies the Black-Scholes formula to each record in the dataset. Sample input datasets can be found in the `inputs` directory. The code writes the results to an output file.
 4 | You can generate new datasets (e.g., if you want bigger ones) by running the `inputgen.c` program. The program takes two arguments: the number of records in the dataset and the output file.
 5 | You can compile all the code available in this directory by running the `compile.sh` script. 
 6 | 
 7 | The `blackscholes.c` application takes two arguments from command line: the input file and the output file. The program runs some correctness checks. You should implement two parallel versions of this application, one using OpenMP and another using Pthreads. The parallel versions should read the input file and write the output file in the same format as the sequential version. The parallel versions should also produce the same results as the sequential version (if not, the program will print error messages).
 8 | 
 9 | In the `solution` folder, you will find the proposed solutions.
10 | 
11 | The code for both the sequential version and the parallel solution has been adapted from the PARSEC benchmark.


--------------------------------------------------------------------------------
/lec13/blackscholes.c:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2007 Intel Corp.
  2 | 
  3 | // Black-Scholes
  4 | // Analytical method for calculating European Options
  5 | //
  6 | // 
  7 | // Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice 
  8 | // Hall, John C. Hull,
  9 | 
 10 | #include <stdio.h>
 11 | #include <stdlib.h>
 12 | #include <math.h>
 13 | #include <string.h>
 14 | #include "my_timer.h"
 15 | 
 16 | //Precision to use for calculations
 17 | #define fptype float
 18 | 
 19 | #define NUM_RUNS 1000 // NOTE(review): not referenced in the part of the file shown here — presumably a repetition count; confirm
 20 | 
 21 | typedef struct OptionData_ { // one option record
 22 |         fptype s;          // spot price
 23 |         fptype strike;     // strike price
 24 |         fptype r;          // risk-free interest rate
 25 |         fptype divq;       // dividend rate
 26 |         fptype v;          // volatility
 27 |         fptype t;          // time to maturity or option expiration in years 
 28 |                            //     (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc)  
 29 |         char OptionType;   // Option type.  'P'=PUT, 'C'=CALL (a single character)
 30 |         fptype divs;       // dividend vals (not used in this test)
 31 |         fptype DGrefval;   // DerivaGem Reference Value
 32 | } OptionData;
 33 | 
 34 | OptionData *data; // NOTE(review): presumably the option records read from the input file — filled outside this excerpt; confirm
 35 | fptype *prices; // NOTE(review): presumably one computed price per option — confirm
 36 | int numOptions; // NOTE(review): presumably the number of records in data — confirm
 37 | 
 38 | int    * otype; // NOTE(review): the arrays below look like per-field copies of the records (type, spot price, strike, rate, volatility, time) — confirm
 39 | fptype * sptprice;
 40 | fptype * strike;
 41 | fptype * rate;
 42 | fptype * volatility;
 43 | fptype * otime;
 44 | int numError = 0; // starts at zero; NOTE(review): presumably counts failed correctness checks — confirm
 45 | 
 46 | ////////////////////////////////////////////////////////////////////////////////
 47 | ////////////////////////////////////////////////////////////////////////////////
 48 | ///////////////////////////////////////////////////////////////////////////////
 49 | ////////////////////////////////////////////////////////////////////////////////
 50 | // Cumulative Normal Distribution Function
 51 | // See Hull, Section 11.8, P.243-244
 52 | #define inv_sqrt_2xPI 0.39894228040143270286
 53 | 
 54 | // CNDF: polynomial approximation of the standard normal CDF at InputX.
 55 | // The polynomial is evaluated for |InputX|; a negative argument is handled
 56 | // by mirroring the result (1 - value). Each intermediate is stored in an
 57 | // fptype variable before it is reused, so every step is rounded to fptype.
 58 | // Returns the approximated probability.
 59 | fptype CNDF ( fptype InputX )
 60 | {
 61 |     // Coefficients applied to k^1 .. k^5
 62 |     static const double coeff[5] = {
 63 |         0.319381530, -0.356563782, 1.781477937, -1.821255978, 1.330274429
 64 |     };
 65 | 
 66 |     fptype magnitude;     // |InputX|
 67 |     fptype density;       // exp(-x*x/2) * inv_sqrt_2xPI
 68 |     fptype k;             // 1 / (1 + 0.2316419 * |InputX|)
 69 |     fptype kPower;        // running power of k
 70 |     fptype linearTerm;    // coeff[0] * k
 71 |     fptype higherTerms;   // sum of coeff[n] * k^(n+1) for n = 1..4
 72 |     fptype term;
 73 |     fptype result;
 74 |     int negative;
 75 |     int n;
 76 | 
 77 |     // Work on the absolute value and remember the original sign.
 78 |     negative = (InputX < 0.0) ? 1 : 0;
 79 |     magnitude = negative ? -InputX : InputX;
 80 | 
 81 |     // Density term shared by the whole polynomial
 82 |     density = exp(-0.5f * magnitude * magnitude);
 83 |     density = density * inv_sqrt_2xPI;
 84 | 
 85 |     k = 0.2316419 * magnitude;
 86 |     k = 1.0 + k;
 87 |     k = 1.0 / k;
 88 | 
 89 |     linearTerm = k * coeff[0];
 90 | 
 91 |     // Accumulate the k^2 .. k^5 terms in increasing order of power.
 92 |     kPower = k * k;
 93 |     higherTerms = kPower * coeff[1];
 94 |     for (n = 2; n < 5; n++) {
 95 |         kPower = kPower * k;
 96 |         term = kPower * coeff[n];
 97 |         higherTerms = higherTerms + term;
 98 |     }
 99 | 
100 |     result = higherTerms + linearTerm;
101 |     result = result * density;
102 |     result = 1.0 - result;
103 | 
104 |     // Mirror the result for negative arguments.
105 |     if (negative) {
106 |         result = 1.0 - result;
107 |     }
108 | 
109 |     return result;
110 | }
111 | 
112 | //////////////////////////////////////////////////////////////////////////////////////
113 | //////////////////////////////////////////////////////////////////////////////////////
114 | //////////////////////////////////////////////////////////////////////////////////////
115 | //////////////////////////////////////////////////////////////////////////////////////
116 | // BlkSchlsEqEuroNoDiv: Black-Scholes price of one European option
117 | // without dividends (closed-form expression; see the reference cited in
118 | // the file header).
119 | //
120 | // Inputs:
121 | //   sptprice   - spot price
122 | //   strike     - strike price
123 | //   rate       - risk-free interest rate
124 | //   volatility - volatility
125 | //   time       - time to maturity, in years
126 | //   otype      - 0 selects the call formula, any other value the put formula
127 | //   timet      - unused
128 | //
129 | // Each arithmetic step is stored in its own fptype variable before the next
130 | // step uses it.
131 | //
132 | // Returns:
133 | //   the option price
134 | fptype BlkSchlsEqEuroNoDiv( fptype sptprice,
135 |                             fptype strike, fptype rate, fptype volatility,
136 |                             fptype time, int otype, float timet )
137 | {
138 |     fptype price;
139 | 
140 |     // ln(S / K)
141 |     fptype logMoneyness = log( sptprice / strike );
142 | 
143 |     // sigma * sqrt(T): divisor of d1, and the gap between d1 and d2
144 |     fptype sqrtTime = sqrt(time);
145 |     fptype volSqrtTime = volatility * sqrtTime;
146 | 
147 |     // sigma^2 / 2
148 |     fptype halfVariance = volatility * volatility;
149 |     halfVariance = halfVariance * 0.5;
150 | 
151 |     // d1 = (ln(S/K) + (r + sigma^2/2) * T) / (sigma * sqrt(T)),
152 |     // built up one operation per statement
153 |     fptype d1 = rate + halfVariance;
154 |     d1 = d1 * time;
155 |     d1 = d1 + logMoneyness;
156 |     d1 = d1 / volSqrtTime;
157 | 
158 |     // d2 = d1 - sigma * sqrt(T)
159 |     fptype d2 = d1 - volSqrtTime;
160 | 
161 |     // N(d1) and N(d2)
162 |     fptype cdfD1 = CNDF( d1 );
163 |     fptype cdfD2 = CNDF( d2 );
164 | 
165 |     // Discounted strike: K * exp(-r * T)
166 |     fptype discountedStrike = strike * ( exp( -(rate)*(time) ) );
167 | 
168 |     // Pick the formula from the option type.
169 |     if (otype != 0) {
170 |         // Put: K * exp(-rT) * (1 - N(d2)) - S * (1 - N(d1))
171 |         fptype complementD1 = (1.0 - cdfD1);
172 |         fptype complementD2 = (1.0 - cdfD2);
173 |         price = (discountedStrike * complementD2) - (sptprice * complementD1);
174 |     } else {
175 |         // Call: S * N(d1) - K * exp(-rT) * N(d2)
176 |         price = (sptprice * cdfD1) - (discountedStrike * cdfD2);
177 |     }
178 | 
179 |     return price;
180 | }
181 | 
182 | // main: driver of the sequential benchmark.
183 | //   argv[1] - input file: the option count, then one record per option
184 | //   argv[2] - output file: the option count, then one price per line
185 | // Every option is priced NUM_RUNS times and the elapsed wall-clock time is
186 | // printed. Exits with status 1 on a usage, I/O, or allocation error.
187 | int main (int argc, char **argv)
188 | {
189 |     FILE *file;
190 |     int i;
191 |     int loopnum;
192 |     int rv;
193 | 
194 |     if (argc != 3) {
195 |         printf("Usage:\n\t%s <inputFile> <outputFile>\n", argv[0]);
196 |         exit(1);
197 |     }
198 |     char *inputFile = argv[1];
199 |     char *outputFile = argv[2];
200 | 
201 |     //Read input data from file
202 |     file = fopen(inputFile, "r");
203 |     if(file == NULL) {
204 |       printf("ERROR: Unable to open file `%s'.\n", inputFile);
205 |       exit(1);
206 |     }
207 |     rv = fscanf(file, "%i", &numOptions);
208 |     if(rv != 1) {
209 |       printf("ERROR: Unable to read from file `%s'.\n", inputFile);
210 |       fclose(file);
211 |       exit(1);
212 |     }
213 |     // A zero or negative count would reach malloc as a bogus (huge) size.
214 |     if(numOptions < 1) {
215 |       printf("ERROR: Number of options must at least be 1.\n");
216 |       fclose(file);
217 |       exit(1);
218 |     }
219 | 
220 |     // alloc spaces for the option data
221 |     data = (OptionData*)malloc(numOptions*sizeof(OptionData));
222 |     prices = (fptype*)malloc(numOptions*sizeof(fptype));
223 |     if(data == NULL || prices == NULL) {
224 |       printf("ERROR: Unable to allocate memory for %d options.\n", numOptions);
225 |       fclose(file);
226 |       exit(1);
227 |     }
228 |     for ( loopnum = 0; loopnum < numOptions; ++ loopnum )
229 |     {
230 |         rv = fscanf(file, "%f %f %f %f %f %f %c %f %f", &data[loopnum].s, &data[loopnum].strike, &data[loopnum].r, &data[loopnum].divq, &data[loopnum].v, &data[loopnum].t, &data[loopnum].OptionType, &data[loopnum].divs, &data[loopnum].DGrefval);
231 |         if(rv != 9) {
232 |           printf("ERROR: Unable to read from file `%s'.\n", inputFile);
233 |           fclose(file);
234 |           exit(1);
235 |         }
236 |     }
237 |     rv = fclose(file);
238 |     if(rv != 0) {
239 |       printf("ERROR: Unable to close file `%s'.\n", inputFile);
240 |       exit(1);
241 |     }
242 | 
243 |     printf("Num of Options: %d\n", numOptions);
244 |     printf("Num of Runs: %d\n", NUM_RUNS);
245 | 
246 |     // The five fptype input arrays share one allocation, laid out back to back.
247 |     sptprice = (fptype *) malloc(5 * numOptions * sizeof(fptype));
248 |     // otype stores ints, so it is sized with sizeof(int), not sizeof(fptype).
249 |     otype = (int *) malloc(numOptions * sizeof(int));
250 |     if(sptprice == NULL || otype == NULL) {
251 |       printf("ERROR: Unable to allocate memory for %d options.\n", numOptions);
252 |       exit(1);
253 |     }
254 |     strike = sptprice + numOptions;
255 |     rate = strike + numOptions;
256 |     volatility = rate + numOptions;
257 |     otime = volatility + numOptions;
258 | 
259 |     for (i=0; i<numOptions; i++) {
260 |         otype[i]      = (data[i].OptionType == 'P') ? 1 : 0;
261 |         sptprice[i]   = data[i].s;
262 |         strike[i]     = data[i].strike;
263 |         rate[i]       = data[i].r;
264 |         volatility[i] = data[i].v;
265 |         otime[i]      = data[i].t;
266 |     }
267 | 
268 |     // The size expression has type size_t, hence %zu.
269 |     printf("Size of data: %zu\n", numOptions * (sizeof(OptionData) + sizeof(int)));
270 | 
271 |     //serial version
272 |     int j;
273 |     fptype price;
274 | #ifdef ERR_CHK
275 |     fptype priceDelta;
276 | #endif
277 | 
278 |     double start, stop, end;
279 |     GET_TIME(start);
280 |     for (j=0; j<NUM_RUNS; j++) {
281 |         for (i=0; i<numOptions; i++) {
282 |             /* Calling main function to calculate option value based on
283 |              * Black & Scholes's equation.
284 |              */
285 |             price = BlkSchlsEqEuroNoDiv( sptprice[i], strike[i],
286 |                                          rate[i], volatility[i], otime[i],
287 |                                          otype[i], 0);
288 |             prices[i] = price;
289 | 
290 | #ifdef ERR_CHK
291 |             priceDelta = data[i].DGrefval - price;
292 |             if( fabs(priceDelta) >= 1e-4 ){
293 |                 printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
294 |                        i, price, data[i].DGrefval, priceDelta);
295 |                 numError ++;
296 |             }
297 | #endif
298 |         }
299 |     }
300 |     GET_TIME(stop);
301 |     end = stop - start;
302 |     printf("Time: %f seconds\n", end);
303 | 
304 |     //Write prices to output file
305 |     file = fopen(outputFile, "w");
306 |     if(file == NULL) {
307 |       printf("ERROR: Unable to open file `%s'.\n", outputFile);
308 |       exit(1);
309 |     }
310 |     rv = fprintf(file, "%i\n", numOptions);
311 |     if(rv < 0) {
312 |       printf("ERROR: Unable to write to file `%s'.\n", outputFile);
313 |       fclose(file);
314 |       exit(1);
315 |     }
316 |     for(i=0; i<numOptions; i++) {
317 |       rv = fprintf(file, "%.18f\n", prices[i]);
318 |       if(rv < 0) {
319 |         printf("ERROR: Unable to write to file `%s'.\n", outputFile);
320 |         fclose(file);
321 |         exit(1);
322 |       }
323 |     }
324 |     rv = fclose(file);
325 |     if(rv != 0) {
326 |       printf("ERROR: Unable to close file `%s'.\n", outputFile);
327 |       exit(1);
328 |     }
329 | 
330 | #ifdef ERR_CHK
331 |     printf("Num Errors: %d\n", numError);
332 | #endif
333 |     free(data);
334 |     free(prices);
335 |     free(sptprice);   // also releases strike, rate, volatility and otime
336 |     free(otype);
337 | 
338 |     return 0;
339 | }
340 | 


--------------------------------------------------------------------------------
/lec13/compile.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # You should use Makefiles instead ;)
 3 | # Builds the input generator, the sequential benchmark and both parallel solutions with gcc.
 4 | OPT=-O3                 # optimization level passed to every gcc invocation below
 5 | FLAGS=-DERR_CHK         # defines ERR_CHK for every target
 6 | 
 7 | gcc -Wall -g ${OPT} ${FLAGS} -o inputgen inputgen.c -lm
 8 | gcc -Wall -g ${OPT} ${FLAGS} -o blackscholes blackscholes.c -lm
 9 | gcc -Wall -g ${OPT} ${FLAGS} -o solution/blackscholes_omp solution/blackscholes_omp.c -lm -fopenmp
10 | gcc -Wall -g ${OPT} ${FLAGS} -o solution/blackscholes_pthreads solution/blackscholes_pthreads.c -lm -pthread
11 | 
12 | 


--------------------------------------------------------------------------------
/lec13/inputgen.c:
--------------------------------------------------------------------------------
 1 | //Copyright (c) 2009 Princeton University
 2 | //Written by Christian Bienia
 3 | //Generate input files for blackscholes benchmark
 4 | 
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | 
 8 | 
 9 | 
10 | //Precision to use
11 | #define fptype double
12 | 
13 | typedef struct OptionData_ {   // one row of the built-in option table (data_init)
14 |         fptype s;          // spot price
15 |         fptype strike;     // strike price
16 |         fptype r;          // risk-free interest rate
17 |         fptype divq;       // dividend rate
18 |         fptype v;          // volatility
19 |         fptype t;          // time to maturity or option expiration in years
20 |                            //     (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc)
21 |         const char *OptionType;  // Option type string, "P"=PUT, "C"=CALL; main() writes only its first character
22 |         fptype divs;       // dividend vals (not used in this test)
23 |         fptype DGrefval;   // DerivaGem Reference Value
24 | } OptionData;
25 | 
26 | //Total number of options in optionData.txt; used by main() to bound the index into data_init
27 | #define MAX_OPTIONS 1000
28 | 
29 | OptionData data_init[] = {   // option table; its initializer list is the text of optionData.txt
30 |     #include "optionData.txt"
31 | };
32 | 
33 | 
34 | 
35 | // main: writes an input file for the blackscholes benchmark.
36 | //   argv[1] - number of options to write (at least 1)
37 | //   argv[2] - name of the file to create
38 | // The file holds the option count followed by one record per option. Records
39 | // are taken from data_init in order, starting over at the first entry when
40 | // the table runs out. Exits with status 1 on a usage or I/O error.
41 | int main (int argc, char **argv) {
42 |   int numOptions;
43 |   char *fileName;
44 |   int rv;
45 |   int i;
46 | 
47 |   // Wrap the table index at the number of entries data_init really has,
48 |   // capped at MAX_OPTIONS, so a shorter optionData.txt is never read out of bounds.
49 |   int tableSize = (int)(sizeof(data_init) / sizeof(data_init[0]));
50 |   if(tableSize > MAX_OPTIONS) {
51 |     tableSize = MAX_OPTIONS;
52 |   }
53 | 
54 |   if (argc != 3) {
55 |     printf("Usage:\n\t%s <numOptions> <fileName>\n", argv[0]);
56 |     exit(1);
57 |   }
58 |   numOptions = atoi(argv[1]);
59 |   fileName = argv[2];
60 |   if(numOptions < 1) {
61 |     printf("ERROR: Number of options must at least be 1.\n");
62 |     exit(1);
63 |   }
64 | 
65 |   FILE *file;
66 |   file = fopen(fileName, "w");
67 |   if(file == NULL) {
68 |     printf("ERROR: Unable to open file `%s'.\n", fileName);
69 |     exit(1);
70 |   }
71 | 
72 |   //write number of options
73 |   rv = fprintf(file, "%i\n", numOptions);
74 |   if(rv < 0) {
75 |     printf("ERROR: Unable to write to file `%s'.\n", fileName);
76 |     fclose(file);
77 |     exit(1);
78 |   }
79 | 
80 |   //write values for options
81 |   for(i=0; i<numOptions; i++) {
82 |     // Table row used for this record
83 |     const OptionData *opt = &data_init[i % tableSize];
84 |     //NOTE: DG RefValues specified exceed double precision, output will deviate
85 |     rv = fprintf(file, "%.2f %.2f %.4f %.2f %.2f %.2f %c %.2f %.18f\n", opt->s, opt->strike, opt->r, opt->divq, opt->v, opt->t, opt->OptionType[0], opt->divs, opt->DGrefval);
86 |     if(rv < 0) {
87 |       printf("ERROR: Unable to write to file `%s'.\n", fileName);
88 |       fclose(file);
89 |       exit(1);
90 |     }
91 |   }
92 | 
93 |   rv = fclose(file);
94 |   if(rv != 0) {
95 |     printf("ERROR: Unable to close file `%s'.\n", fileName);
96 |     exit(1);
97 |   }
98 | 
99 |   return 0;
100 | }


--------------------------------------------------------------------------------
/lec13/my_timer.h:
--------------------------------------------------------------------------------
1 | #include <sys/time.h>
2 | 
3 | #define GET_TIME(now) {                  \
4 |    struct timeval t;                     \
5 |    gettimeofday(&t, NULL);               \
6 |    now = t.tv_sec + t.tv_usec/1000000.0; \
7 | }
8 | 


--------------------------------------------------------------------------------
/lec13/solution/blackscholes_omp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanieleDeSensi/multicore-programming/d8d141d0d4e11d67f6474db3fde2ed0cf4c588ca/lec13/solution/blackscholes_omp


--------------------------------------------------------------------------------
/lec13/solution/blackscholes_omp.c:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2007 Intel Corp.
  2 | 
  3 | // Black-Scholes
  4 | // Analytical method for calculating European Options
  5 | //
  6 | // 
  7 | // Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice 
  8 | // Hall, John C. Hull,
  9 | 
 10 | #include <stdio.h>
 11 | #include <stdlib.h>
 12 | #include <math.h>
 13 | #include <string.h>
 14 | #include "../my_timer.h"
 15 | 
 16 | // Multi-threaded OpenMP header
 17 | #include <omp.h>
 18 | 
 19 | //Precision to use for calculations
 20 | #define fptype float
 21 | 
 22 | #define NUM_RUNS 1000
 23 | 
 24 | typedef struct OptionData_ {   // one option record, filled from the input file by main()
 25 |         fptype s;          // spot price
 26 |         fptype strike;     // strike price
 27 |         fptype r;          // risk-free interest rate
 28 |         fptype divq;       // dividend rate (read from the input; not passed to the pricing function)
 29 |         fptype v;          // volatility
 30 |         fptype t;          // time to maturity or option expiration in years
 31 |                            //     (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc)
 32 |         char OptionType;   // option type character: 'P' = put; main() maps any other value to a call
 33 |         fptype divs;       // dividend vals (not used in this test)
 34 |         fptype DGrefval;   // DerivaGem Reference Value; compared with the computed price under ERR_CHK
 35 | } OptionData;
 36 | 
 37 | OptionData *data;      // option records read from the input file (numOptions entries)
 38 | fptype *prices;        // computed price per option, written to the output file
 39 | int numOptions;        // option count, read from the start of the input file
 40 | 
 41 | int    * otype;        // per-option type flag: 1 when OptionType is 'P', 0 otherwise
 42 | fptype * sptprice;     // spot prices; start of the one allocation that also backs the four arrays below
 43 | fptype * strike;       // strike prices
 44 | fptype * rate;         // risk-free interest rates
 45 | fptype * volatility;   // volatilities
 46 | fptype * otime;        // times to maturity
 47 | int numError = 0;      // number of ERR_CHK mismatches against DGrefval
 48 | int nThreads;          // thread count from argv[1], reduced to numOptions when larger
 49 | 
 50 | ////////////////////////////////////////////////////////////////////////////////
 51 | ////////////////////////////////////////////////////////////////////////////////
 52 | ///////////////////////////////////////////////////////////////////////////////
 53 | ////////////////////////////////////////////////////////////////////////////////
 54 | // Cumulative Normal Distribution Function
 55 | // See Hull, Section 11.8, P.243-244
 56 | #define inv_sqrt_2xPI 0.39894228040143270286
 57 | 
 58 | // CNDF: polynomial approximation of the standard normal CDF at InputX.
 59 | // The polynomial is evaluated for |InputX|; a negative argument is handled
 60 | // by mirroring the result (1 - value). Each intermediate is stored in an
 61 | // fptype variable before it is reused, so every step is rounded to fptype.
 62 | // Returns the approximated probability.
 63 | fptype CNDF ( fptype InputX )
 64 | {
 65 |     // Coefficients applied to k^1 .. k^5
 66 |     static const double coeff[5] = {
 67 |         0.319381530, -0.356563782, 1.781477937, -1.821255978, 1.330274429
 68 |     };
 69 | 
 70 |     fptype magnitude;     // |InputX|
 71 |     fptype density;       // exp(-x*x/2) * inv_sqrt_2xPI
 72 |     fptype k;             // 1 / (1 + 0.2316419 * |InputX|)
 73 |     fptype kPower;        // running power of k
 74 |     fptype linearTerm;    // coeff[0] * k
 75 |     fptype higherTerms;   // sum of coeff[n] * k^(n+1) for n = 1..4
 76 |     fptype term;
 77 |     fptype result;
 78 |     int negative;
 79 |     int n;
 80 | 
 81 |     // Work on the absolute value and remember the original sign.
 82 |     negative = (InputX < 0.0) ? 1 : 0;
 83 |     magnitude = negative ? -InputX : InputX;
 84 | 
 85 |     // Density term shared by the whole polynomial
 86 |     density = exp(-0.5f * magnitude * magnitude);
 87 |     density = density * inv_sqrt_2xPI;
 88 | 
 89 |     k = 0.2316419 * magnitude;
 90 |     k = 1.0 + k;
 91 |     k = 1.0 / k;
 92 | 
 93 |     linearTerm = k * coeff[0];
 94 | 
 95 |     // Accumulate the k^2 .. k^5 terms in increasing order of power.
 96 |     kPower = k * k;
 97 |     higherTerms = kPower * coeff[1];
 98 |     for (n = 2; n < 5; n++) {
 99 |         kPower = kPower * k;
100 |         term = kPower * coeff[n];
101 |         higherTerms = higherTerms + term;
102 |     }
103 | 
104 |     result = higherTerms + linearTerm;
105 |     result = result * density;
106 |     result = 1.0 - result;
107 | 
108 |     // Mirror the result for negative arguments.
109 |     if (negative) {
110 |         result = 1.0 - result;
111 |     }
112 | 
113 |     return result;
114 | }
115 | 
116 | //////////////////////////////////////////////////////////////////////////////////////
117 | //////////////////////////////////////////////////////////////////////////////////////
118 | //////////////////////////////////////////////////////////////////////////////////////
119 | //////////////////////////////////////////////////////////////////////////////////////
120 | // BlkSchlsEqEuroNoDiv: Black-Scholes price of one European option
121 | // without dividends (closed-form expression; see the reference cited in
122 | // the file header).
123 | //
124 | // Inputs:
125 | //   sptprice   - spot price
126 | //   strike     - strike price
127 | //   rate       - risk-free interest rate
128 | //   volatility - volatility
129 | //   time       - time to maturity, in years
130 | //   otype      - 0 selects the call formula, any other value the put formula
131 | //   timet      - unused
132 | //
133 | // Each arithmetic step is stored in its own fptype variable before the next
134 | // step uses it.
135 | //
136 | // Returns:
137 | //   the option price
138 | fptype BlkSchlsEqEuroNoDiv( fptype sptprice,
139 |                             fptype strike, fptype rate, fptype volatility,
140 |                             fptype time, int otype, float timet )
141 | {
142 |     fptype price;
143 | 
144 |     // ln(S / K)
145 |     fptype logMoneyness = log( sptprice / strike );
146 | 
147 |     // sigma * sqrt(T): divisor of d1, and the gap between d1 and d2
148 |     fptype sqrtTime = sqrt(time);
149 |     fptype volSqrtTime = volatility * sqrtTime;
150 | 
151 |     // sigma^2 / 2
152 |     fptype halfVariance = volatility * volatility;
153 |     halfVariance = halfVariance * 0.5;
154 | 
155 |     // d1 = (ln(S/K) + (r + sigma^2/2) * T) / (sigma * sqrt(T)),
156 |     // built up one operation per statement
157 |     fptype d1 = rate + halfVariance;
158 |     d1 = d1 * time;
159 |     d1 = d1 + logMoneyness;
160 |     d1 = d1 / volSqrtTime;
161 | 
162 |     // d2 = d1 - sigma * sqrt(T)
163 |     fptype d2 = d1 - volSqrtTime;
164 | 
165 |     // N(d1) and N(d2)
166 |     fptype cdfD1 = CNDF( d1 );
167 |     fptype cdfD2 = CNDF( d2 );
168 | 
169 |     // Discounted strike: K * exp(-r * T)
170 |     fptype discountedStrike = strike * ( exp( -(rate)*(time) ) );
171 | 
172 |     // Pick the formula from the option type.
173 |     if (otype != 0) {
174 |         // Put: K * exp(-rT) * (1 - N(d2)) - S * (1 - N(d1))
175 |         fptype complementD1 = (1.0 - cdfD1);
176 |         fptype complementD2 = (1.0 - cdfD2);
177 |         price = (discountedStrike * complementD2) - (sptprice * complementD1);
178 |     } else {
179 |         // Call: S * N(d1) - K * exp(-rT) * N(d2)
180 |         price = (sptprice * cdfD1) - (discountedStrike * cdfD2);
181 |     }
182 | 
183 |     return price;
184 | }
185 | 
186 | // main: driver of the OpenMP benchmark.
187 | //   argv[1] - number of threads (at least 1; capped at the option count)
188 | //   argv[2] - input file: the option count, then one record per option
189 | //   argv[3] - output file: the option count, then one price per line
190 | // Every option is priced NUM_RUNS times and the elapsed wall-clock time is
191 | // printed. Exits with status 1 on a usage, I/O, or allocation error.
192 | int main (int argc, char **argv)
193 | {
194 |     FILE *file;
195 |     int i;
196 |     int loopnum;
197 |     int rv;
198 | 
199 |     if (argc != 4) {
200 |         printf("Usage:\n\t%s <nthreads> <inputFile> <outputFile>\n", argv[0]);
201 |         exit(1);
202 |     }
203 |     nThreads = atoi(argv[1]);
204 |     char *inputFile = argv[2];
205 |     char *outputFile = argv[3];
206 |     // atoi yields 0 for non-numeric text; reject anything below one thread.
207 |     if(nThreads < 1) {
208 |       printf("ERROR: Number of threads must at least be 1.\n");
209 |       exit(1);
210 |     }
211 | 
212 |     //Read input data from file
213 |     file = fopen(inputFile, "r");
214 |     if(file == NULL) {
215 |       printf("ERROR: Unable to open file `%s'.\n", inputFile);
216 |       exit(1);
217 |     }
218 |     rv = fscanf(file, "%i", &numOptions);
219 |     if(rv != 1) {
220 |       printf("ERROR: Unable to read from file `%s'.\n", inputFile);
221 |       fclose(file);
222 |       exit(1);
223 |     }
224 |     // A zero or negative count would reach malloc as a bogus (huge) size.
225 |     if(numOptions < 1) {
226 |       printf("ERROR: Number of options must at least be 1.\n");
227 |       fclose(file);
228 |       exit(1);
229 |     }
230 |     if(nThreads > numOptions) {
231 |       printf("WARNING: Not enough work, reducing number of threads to match number of options.\n");
232 |       nThreads = numOptions;
233 |     }
234 | 
235 |     // alloc spaces for the option data
236 |     data = (OptionData*)malloc(numOptions*sizeof(OptionData));
237 |     prices = (fptype*)malloc(numOptions*sizeof(fptype));
238 |     if(data == NULL || prices == NULL) {
239 |       printf("ERROR: Unable to allocate memory for %d options.\n", numOptions);
240 |       fclose(file);
241 |       exit(1);
242 |     }
243 |     for ( loopnum = 0; loopnum < numOptions; ++ loopnum )
244 |     {
245 |         rv = fscanf(file, "%f %f %f %f %f %f %c %f %f", &data[loopnum].s, &data[loopnum].strike, &data[loopnum].r, &data[loopnum].divq, &data[loopnum].v, &data[loopnum].t, &data[loopnum].OptionType, &data[loopnum].divs, &data[loopnum].DGrefval);
246 |         if(rv != 9) {
247 |           printf("ERROR: Unable to read from file `%s'.\n", inputFile);
248 |           fclose(file);
249 |           exit(1);
250 |         }
251 |     }
252 |     rv = fclose(file);
253 |     if(rv != 0) {
254 |       printf("ERROR: Unable to close file `%s'.\n", inputFile);
255 |       exit(1);
256 |     }
257 | 
258 |     printf("Num of Options: %d\n", numOptions);
259 |     printf("Num of Runs: %d\n", NUM_RUNS);
260 | 
261 |     // The five fptype input arrays share one allocation, laid out back to back.
262 |     sptprice = (fptype *) malloc(5 * numOptions * sizeof(fptype));
263 |     // otype stores ints, so it is sized with sizeof(int), not sizeof(fptype).
264 |     otype = (int *) malloc(numOptions * sizeof(int));
265 |     if(sptprice == NULL || otype == NULL) {
266 |       printf("ERROR: Unable to allocate memory for %d options.\n", numOptions);
267 |       exit(1);
268 |     }
269 |     strike = sptprice + numOptions;
270 |     rate = strike + numOptions;
271 |     volatility = rate + numOptions;
272 |     otime = volatility + numOptions;
273 | 
274 |     for (i=0; i<numOptions; i++) {
275 |         otype[i]      = (data[i].OptionType == 'P') ? 1 : 0;
276 |         sptprice[i]   = data[i].s;
277 |         strike[i]     = data[i].strike;
278 |         rate[i]       = data[i].r;
279 |         volatility[i] = data[i].v;
280 |         otime[i]      = data[i].t;
281 |     }
282 | 
283 |     // The size expression has type size_t, hence %zu.
284 |     printf("Size of data: %zu\n", numOptions * (sizeof(OptionData) + sizeof(int)));
285 | 
286 |     int j;
287 |     double start, stop, end;
288 |     GET_TIME(start);
289 | 
290 |     for (j=0; j<NUM_RUNS; j++) {
291 |         // The work-sharing loop covers every index in [0, numOptions), including
292 |         // the numOptions % nThreads tail that an even manual split would drop.
293 |         // price and priceDelta are declared inside the loop body, so they are
294 |         // private to each thread whether or not ERR_CHK is defined; numError is
295 |         // combined through a reduction instead of being incremented concurrently.
296 |         #pragma omp parallel for num_threads(nThreads) schedule(static) reduction(+:numError)
297 |         for (i=0; i<numOptions; i++) {
298 |             /* Calling main function to calculate option value based on
299 |              * Black & Scholes's equation.
300 |              */
301 |             fptype price = BlkSchlsEqEuroNoDiv( sptprice[i], strike[i],
302 |                                                 rate[i], volatility[i], otime[i],
303 |                                                 otype[i], 0);
304 |             prices[i] = price;
305 | 
306 | #ifdef ERR_CHK
307 |             fptype priceDelta = data[i].DGrefval - price;
308 |             if( fabs(priceDelta) >= 1e-4 ){
309 |                 printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
310 |                        i, price, data[i].DGrefval, priceDelta);
311 |                 numError ++;
312 |             }
313 | #endif
314 |         }
315 |     }
316 |     GET_TIME(stop);
317 |     end = stop - start;
318 |     printf("Time: %f seconds\n", end);
319 | 
320 |     //Write prices to output file
321 |     file = fopen(outputFile, "w");
322 |     if(file == NULL) {
323 |       printf("ERROR: Unable to open file `%s'.\n", outputFile);
324 |       exit(1);
325 |     }
326 |     rv = fprintf(file, "%i\n", numOptions);
327 |     if(rv < 0) {
328 |       printf("ERROR: Unable to write to file `%s'.\n", outputFile);
329 |       fclose(file);
330 |       exit(1);
331 |     }
332 |     for(i=0; i<numOptions; i++) {
333 |       rv = fprintf(file, "%.18f\n", prices[i]);
334 |       if(rv < 0) {
335 |         printf("ERROR: Unable to write to file `%s'.\n", outputFile);
336 |         fclose(file);
337 |         exit(1);
338 |       }
339 |     }
340 |     rv = fclose(file);
341 |     if(rv != 0) {
342 |       printf("ERROR: Unable to close file `%s'.\n", outputFile);
343 |       exit(1);
344 |     }
345 | 
346 | #ifdef ERR_CHK
347 |     printf("Num Errors: %d\n", numError);
348 | #endif
349 |     free(data);
350 |     free(prices);
351 |     free(sptprice);   // also releases strike, rate, volatility and otime
352 |     free(otype);
353 | 
354 |     return 0;
355 | }
356 | 


--------------------------------------------------------------------------------
/lec13/solution/blackscholes_pthreads:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanieleDeSensi/multicore-programming/d8d141d0d4e11d67f6474db3fde2ed0cf4c588ca/lec13/solution/blackscholes_pthreads


--------------------------------------------------------------------------------
/lec13/solution/blackscholes_pthreads.c:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2007 Intel Corp.
  2 | 
  3 | // Black-Scholes
  4 | // Analytical method for calculating European Options
  5 | //
  6 | // 
  7 | // Reference Source: Options, Futures, and Other Derivatives, 3rd Edition, Prentice 
  8 | // Hall, John C. Hull,
  9 | 
 10 | #include <stdio.h>
 11 | #include <stdlib.h>
 12 | #include <math.h>
 13 | #include <string.h>
 14 | #include <pthread.h>
 15 | #include "../my_timer.h"
 16 | 
 17 | //Precision to use for calculations
 18 | #define fptype float
 19 | 
 20 | #define NUM_RUNS 1000
 21 | 
 22 | typedef struct OptionData_ {   // one option record, filled from the input file by main()
 23 |         fptype s;          // spot price
 24 |         fptype strike;     // strike price
 25 |         fptype r;          // risk-free interest rate
 26 |         fptype divq;       // dividend rate
 27 |         fptype v;          // volatility
 28 |         fptype t;          // time to maturity or option expiration in years
 29 |                            //     (1yr = 1.0, 6mos = 0.5, 3mos = 0.25, ..., etc)
 30 |         char OptionType;   // option type character: 'P' = PUT, 'C' = CALL
 31 |         fptype divs;       // dividend vals (not used in this test)
 32 |         fptype DGrefval;   // DerivaGem Reference Value; compared with the computed price under ERR_CHK
 33 | } OptionData;
 34 | 
 35 | OptionData *data;      // option records read from the input file (numOptions entries)
 36 | fptype *prices;        // computed price per option, filled in by bs_thread
 37 | int numOptions;        // option count, read from the start of the input file
 38 | 
 39 | int    * otype;        // per-option type flag; 0 selects the call formula in BlkSchlsEqEuroNoDiv
 40 | fptype * sptprice;     // spot price per option
 41 | fptype * strike;       // strike price per option
 42 | fptype * rate;         // risk-free interest rate per option
 43 | fptype * volatility;   // volatility per option
 44 | fptype * otime;        // time to maturity per option
 45 | int numError = 0;      // number of ERR_CHK mismatches against DGrefval
 46 | int nThreads;          // thread count from argv[1], reduced to numOptions when larger
 47 | 
 48 | ////////////////////////////////////////////////////////////////////////////////
 49 | ////////////////////////////////////////////////////////////////////////////////
 50 | ///////////////////////////////////////////////////////////////////////////////
 51 | ////////////////////////////////////////////////////////////////////////////////
 52 | // Cumulative Normal Distribution Function
 53 | // See Hull, Section 11.8, P.243-244
 54 | #define inv_sqrt_2xPI 0.39894228040143270286
 55 | 
 56 | // CNDF: polynomial approximation of the standard normal CDF at InputX.
 57 | // The polynomial is evaluated for |InputX|; a negative argument is handled
 58 | // by mirroring the result (1 - value). Each intermediate is stored in an
 59 | // fptype variable before it is reused, so every step is rounded to fptype.
 60 | // Returns the approximated probability.
 61 | fptype CNDF ( fptype InputX )
 62 | {
 63 |     // Coefficients applied to k^1 .. k^5
 64 |     static const double coeff[5] = {
 65 |         0.319381530, -0.356563782, 1.781477937, -1.821255978, 1.330274429
 66 |     };
 67 | 
 68 |     fptype magnitude;     // |InputX|
 69 |     fptype density;       // exp(-x*x/2) * inv_sqrt_2xPI
 70 |     fptype k;             // 1 / (1 + 0.2316419 * |InputX|)
 71 |     fptype kPower;        // running power of k
 72 |     fptype linearTerm;    // coeff[0] * k
 73 |     fptype higherTerms;   // sum of coeff[n] * k^(n+1) for n = 1..4
 74 |     fptype term;
 75 |     fptype result;
 76 |     int negative;
 77 |     int n;
 78 | 
 79 |     // Work on the absolute value and remember the original sign.
 80 |     negative = (InputX < 0.0) ? 1 : 0;
 81 |     magnitude = negative ? -InputX : InputX;
 82 | 
 83 |     // Density term shared by the whole polynomial
 84 |     density = exp(-0.5f * magnitude * magnitude);
 85 |     density = density * inv_sqrt_2xPI;
 86 | 
 87 |     k = 0.2316419 * magnitude;
 88 |     k = 1.0 + k;
 89 |     k = 1.0 / k;
 90 | 
 91 |     linearTerm = k * coeff[0];
 92 | 
 93 |     // Accumulate the k^2 .. k^5 terms in increasing order of power.
 94 |     kPower = k * k;
 95 |     higherTerms = kPower * coeff[1];
 96 |     for (n = 2; n < 5; n++) {
 97 |         kPower = kPower * k;
 98 |         term = kPower * coeff[n];
 99 |         higherTerms = higherTerms + term;
100 |     }
101 | 
102 |     result = higherTerms + linearTerm;
103 |     result = result * density;
104 |     result = 1.0 - result;
105 | 
106 |     // Mirror the result for negative arguments.
107 |     if (negative) {
108 |         result = 1.0 - result;
109 |     }
110 | 
111 |     return result;
112 | }
113 | 
114 | //////////////////////////////////////////////////////////////////////////////////////
115 | //////////////////////////////////////////////////////////////////////////////////////
116 | //////////////////////////////////////////////////////////////////////////////////////
117 | //////////////////////////////////////////////////////////////////////////////////////
118 | // BlkSchlsEqEuroNoDiv: Black-Scholes price of one European option
119 | // without dividends (closed-form expression; see the reference cited in
120 | // the file header).
121 | //
122 | // Inputs:
123 | //   sptprice   - spot price
124 | //   strike     - strike price
125 | //   rate       - risk-free interest rate
126 | //   volatility - volatility
127 | //   time       - time to maturity, in years
128 | //   otype      - 0 selects the call formula, any other value the put formula
129 | //   timet      - unused
130 | //
131 | // Each arithmetic step is stored in its own fptype variable before the next
132 | // step uses it.
133 | //
134 | // Returns:
135 | //   the option price
136 | fptype BlkSchlsEqEuroNoDiv( fptype sptprice,
137 |                             fptype strike, fptype rate, fptype volatility,
138 |                             fptype time, int otype, float timet )
139 | {
140 |     fptype price;
141 | 
142 |     // ln(S / K)
143 |     fptype logMoneyness = log( sptprice / strike );
144 | 
145 |     // sigma * sqrt(T): divisor of d1, and the gap between d1 and d2
146 |     fptype sqrtTime = sqrt(time);
147 |     fptype volSqrtTime = volatility * sqrtTime;
148 | 
149 |     // sigma^2 / 2
150 |     fptype halfVariance = volatility * volatility;
151 |     halfVariance = halfVariance * 0.5;
152 | 
153 |     // d1 = (ln(S/K) + (r + sigma^2/2) * T) / (sigma * sqrt(T)),
154 |     // built up one operation per statement
155 |     fptype d1 = rate + halfVariance;
156 |     d1 = d1 * time;
157 |     d1 = d1 + logMoneyness;
158 |     d1 = d1 / volSqrtTime;
159 | 
160 |     // d2 = d1 - sigma * sqrt(T)
161 |     fptype d2 = d1 - volSqrtTime;
162 | 
163 |     // N(d1) and N(d2)
164 |     fptype cdfD1 = CNDF( d1 );
165 |     fptype cdfD2 = CNDF( d2 );
166 | 
167 |     // Discounted strike: K * exp(-r * T)
168 |     fptype discountedStrike = strike * ( exp( -(rate)*(time) ) );
169 | 
170 |     // Pick the formula from the option type.
171 |     if (otype != 0) {
172 |         // Put: K * exp(-rT) * (1 - N(d2)) - S * (1 - N(d1))
173 |         fptype complementD1 = (1.0 - cdfD1);
174 |         fptype complementD2 = (1.0 - cdfD2);
175 |         price = (discountedStrike * complementD2) - (sptprice * complementD1);
176 |     } else {
177 |         // Call: S * N(d1) - K * exp(-rT) * N(d2)
178 |         price = (sptprice * cdfD1) - (discountedStrike * cdfD2);
179 |     }
180 | 
181 |     return price;
182 | }
183 | 
184 | // bs_thread: worker that prices the options in [start, end) NUM_RUNS times.
185 | // tid_ptr points to this worker's int index (presumably 0..nThreads-1; confirm in main).
186 | // The highest index also takes the numOptions % nThreads leftover options.
187 | void* bs_thread(void *tid_ptr) {
188 |     static pthread_mutex_t errLock = PTHREAD_MUTEX_INITIALIZER;  // guards numError
189 |     int i, j, localErrors = 0;
190 |     fptype price;
191 |     int tid = *(int *)tid_ptr;
192 |     int start = tid * (numOptions / nThreads);
193 |     int end = (tid == nThreads - 1) ? numOptions : start + (numOptions / nThreads);
194 |     for (j=0; j<NUM_RUNS; j++) {
195 |         for (i=start; i<end; i++) {
196 |             price = BlkSchlsEqEuroNoDiv( sptprice[i], strike[i], rate[i],
197 |                                          volatility[i], otime[i], otype[i], 0);
198 |             prices[i] = price;
199 | #ifdef ERR_CHK
200 |             fptype priceDelta = data[i].DGrefval - price;
201 |             if( fabs(priceDelta) >= 1e-4 ){
202 |                 printf("Error on %d. Computed=%.5f, Ref=%.5f, Delta=%.5f\n",
203 |                        i, price, data[i].DGrefval, priceDelta);
204 |                 localErrors ++;
205 |             }
206 | #endif
207 |         }
208 |     }
209 |     // Add the local count once, under the lock, rather than racing on numError.
210 |     pthread_mutex_lock(&errLock);
211 |     numError += localErrors;
212 |     pthread_mutex_unlock(&errLock);
213 |     return NULL;
214 | }
215 | 
216 | /*
217 |  * Allocates a zero-filled array of `count` elements of `size` bytes each.
218 |  * Prints an error and exits with status 1 if the allocation fails.
219 |  */
220 | static void *alloc_or_die(size_t count, size_t size)
221 | {
222 |     void *ptr = calloc(count, size);
223 |     // calloc may legitimately return NULL for an empty request
224 |     if(ptr == NULL && count != 0) {
225 |       printf("ERROR: Unable to allocate memory.\n");
226 |       exit(1);
227 |     }
228 |     return ptr;
229 | }
230 | 
231 | /*
232 |  * Usage: <program> <nthreads> <inputFile> <outputFile>
233 |  *
234 |  * Reads the option records from inputFile, prices them with nThreads worker
235 |  * threads running bs_thread, prints the elapsed time of the parallel section
236 |  * and writes the option count followed by one price per line to outputFile.
237 |  * Exits with status 1 on bad arguments, I/O errors or failed allocations.
238 |  */
239 | int main (int argc, char **argv)
240 | {
241 |     FILE *file;
242 |     int i;
243 |     int loopnum;
244 |     int rv;
245 | 
246 |     if (argc != 4) {
247 |         printf("Usage:\n\t%s <nthreads> <inputFile> <outputFile>\n", argv[0]);
248 |         exit(1);
249 |     }
250 |     nThreads = atoi(argv[1]);
251 |     char *inputFile = argv[2];
252 |     char *outputFile = argv[3];
253 |     if(nThreads < 1) { // bs_thread divides by nThreads
254 |       printf("ERROR: <nthreads> must be a positive integer.\n");
255 |       exit(1);
256 |     }
257 | 
258 |     //Read input data from file
259 |     file = fopen(inputFile, "r");
260 |     if(file == NULL) {
261 |       printf("ERROR: Unable to open file `%s'.\n", inputFile);
262 |       exit(1);
263 |     }
264 |     rv = fscanf(file, "%i", &numOptions);
265 |     if(rv != 1 || numOptions < 0) { // a negative count cannot be allocated
266 |       printf("ERROR: Unable to read from file `%s'.\n", inputFile);
267 |       fclose(file);
268 |       exit(1);
269 |     }
270 |     if(nThreads > numOptions) {
271 |       printf("WARNING: Not enough work, reducing number of threads to match number of options.\n");
272 |       nThreads = numOptions;
273 |     }
274 | 
275 |     // alloc spaces for the option data
276 |     data = (OptionData*)alloc_or_die(numOptions, sizeof(OptionData));
277 |     prices = (fptype*)alloc_or_die(numOptions, sizeof(fptype));
278 |     for ( loopnum = 0; loopnum < numOptions; ++ loopnum )
279 |     {
280 |         OptionData *opt = &data[loopnum];
281 |         rv = fscanf(file, "%f %f %f %f %f %f %c %f %f",
282 |                     &opt->s, &opt->strike, &opt->r, &opt->divq, &opt->v,
283 |                     &opt->t, &opt->OptionType, &opt->divs, &opt->DGrefval);
284 |         if(rv != 9) {
285 |           printf("ERROR: Unable to read from file `%s'.\n", inputFile);
286 |           fclose(file);
287 |           exit(1);
288 |         }
289 |     }
290 |     rv = fclose(file);
291 |     if(rv != 0) {
292 |       printf("ERROR: Unable to close file `%s'.\n", inputFile);
293 |       exit(1);
294 |     }
295 | 
296 |     printf("Num of Options: %d\n", numOptions);
297 |     printf("Num of Runs: %d\n", NUM_RUNS);
298 | 
299 |     // One allocation holds the five per-option input arrays back to back
300 |     sptprice = (fptype *) alloc_or_die((size_t)5 * numOptions, sizeof(fptype));
301 |     strike = sptprice + numOptions;
302 |     rate = strike + numOptions;
303 |     volatility = rate + numOptions;
304 |     otime = volatility + numOptions;
305 | 
306 |     otype = (int *) alloc_or_die(numOptions, sizeof(int));
307 | 
308 |     for (i=0; i<numOptions; i++) {
309 |         otype[i]      = (data[i].OptionType == 'P') ? 1 : 0;
310 |         sptprice[i]   = data[i].s;
311 |         strike[i]     = data[i].strike;
312 |         rate[i]       = data[i].r;
313 |         volatility[i] = data[i].v;
314 |         otime[i]      = data[i].t;
315 |     }
316 | 
317 |     printf("Size of data: %zu\n", numOptions * (sizeof(OptionData) + sizeof(int)));
318 | 
319 |     int *tids = (int *) alloc_or_die(nThreads, sizeof(int));
320 |     pthread_t *threads = (pthread_t *) alloc_or_die(nThreads, sizeof(pthread_t));
321 | 
322 |     double start, stop, end;
323 |     GET_TIME(start);
324 | 
325 |     for(i=0; i<nThreads; i++) {
326 |         tids[i]=i;
327 |         rv = pthread_create(&(threads[i]), NULL, bs_thread, &(tids[i]));
328 |         if(rv != 0) {
329 |           printf("ERROR: Unable to create thread %d.\n", i);
330 |           exit(1);
331 |         }
332 |     }
333 |     for(i=0; i<nThreads; i++) {
334 |         pthread_join(threads[i], NULL);
335 |     }
336 |     GET_TIME(stop); // stop the clock before the bookkeeping frees
337 |     free(tids);
338 |     free(threads);
339 |     end = stop - start;
340 |     printf("Time: %f seconds\n", end);
341 | 
342 |     //Write prices to output file
343 |     file = fopen(outputFile, "w");
344 |     if(file == NULL) {
345 |       printf("ERROR: Unable to open file `%s'.\n", outputFile);
346 |       exit(1);
347 |     }
348 |     rv = fprintf(file, "%i\n", numOptions);
349 |     if(rv < 0) {
350 |       printf("ERROR: Unable to write to file `%s'.\n", outputFile);
351 |       fclose(file);
352 |       exit(1);
353 |     }
354 |     for(i=0; i<numOptions; i++) {
355 |       rv = fprintf(file, "%.18f\n", prices[i]);
356 |       if(rv < 0) {
357 |         printf("ERROR: Unable to write to file `%s'.\n", outputFile);
358 |         fclose(file);
359 |         exit(1);
360 |       }
361 |     }
362 |     rv = fclose(file);
363 |     if(rv != 0) {
364 |       printf("ERROR: Unable to close file `%s'.\n", outputFile);
365 |       exit(1);
366 |     }
367 | 
368 | #ifdef ERR_CHK
369 |     printf("Num Errors: %d\n", numError);
370 | #endif
371 |     free(data);
372 |     free(prices);
373 |     free(sptprice); // also releases strike, rate, volatility and otime
374 |     free(otype);
375 | 
376 |     return 0;
377 | }
378 | 


--------------------------------------------------------------------------------
/lec15/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds the lec15 OpenMP exercises and their solutions.
 2 | CC          = gcc
 3 | LIBS        =
 4 | CFLAGS      = -Wall -pedantic -g
 5 | OMPFLAGS    = -fopenmp
 6 | 
 7 | RM=rm -f
 8 | 
 9 | EXES=pi pi_solution_crit_rand pi_solution_crit pi_solution_ato pi_solution_red matmul matmul_solution histogram histogram_solution_trivial_crit histogram_solution_trivial_ato histogram_solution_ato_local histogram_solution_red histogram_solution_ato_local_better rand_vs_rand_r
10 | 
11 | # all and clean are commands, not files: keep them working even if a file with that name exists.
12 | .PHONY: all clean
13 | 
14 | all: $(EXES)
15 | 
16 | # rand_vs_rand_r is the only program built without $(OMPFLAGS).
17 | rand_vs_rand_r: rand_vs_rand_r.c
18 | 	$(CC) $(CFLAGS) -o $@ $< $(LIBS)
19 | 
20 | # Every other program is built from the .c file of the same name, with OpenMP enabled.
21 | %: %.c
22 | 	$(CC) $(CFLAGS) $(OMPFLAGS) -o $@ $< $(LIBS)
23 | 
24 | clean:
25 | 	$(RM) $(EXES)


--------------------------------------------------------------------------------
/lec15/README.md:
--------------------------------------------------------------------------------
 1 | Today we are going to see the following exercises using OpenMP.
 2 | 
 3 | # Matrix multiplication
 4 | This exercise is adapted from the SC08 OpenMP tutorial by Mattson and Meadows (https://www.openmp.org/wp-content/uploads/omp-hands-on-SC08.pdf)
 5 | Starting from the serial version of the code (in the *matmul.c* file), parallelize it using OpenMP. The serial code computes the product of two matrices.
 6 | We provide one parallel solution:
 7 | - *matmul_solution.c*: uses a **parallel for** directive to parallelize the outer loop of the matrix multiplication.
 8 | 
 9 | # Pi calculation
10 | This exercise is adapted from the SC08 OpenMP tutorial by Mattson and Meadows (https://www.openmp.org/wp-content/uploads/omp-hands-on-SC08.pdf)
11 | Starting from the serial version of the code (in the *pi.c* file), parallelize it using OpenMP. The serial code computes pi using the Monte Carlo method we
12 | have also seen in the MPI and Pthreads examples.
13 | We provide several different parallel solutions:
14 | - *pi_solution_crit_rand.c*: uses a **critical section** to update the global variable that stores the number of points inside the circle.
15 | - *pi_solution_crit.c*: uses a **critical section** to update the global variable that stores the number of points inside the circle, but uses **rand_r** instead of **rand**
16 |   to generate random numbers. This provides a significant speedup, since rand uses a mutex to protect its internal state. You can check that rand_r is usually
17 |   faster than rand by running the rand_vs_rand_r.c example.
18 | - *pi_solution_ato.c*: uses **atomic operations** to update the global variable that stores the number of points inside the circle.
19 | - *pi_solution_red.c*: uses a **reduction** to update the global variable that stores the number of points inside the circle.
20 | 
21 | # Histogram calculation
22 | Starting from the serial version of the code (in the *histogram.c* file), parallelize it using OpenMP. The serial code computes the histogram of the values in an array.
23 | The code takes one command line argument, representing the maximum value that the numbers in the array can have.
24 | We provide several different parallel solutions:
25 | - *histogram_solution_trivial_crit.c*: uses a **critical section** to update the histogram.
26 | - *histogram_solution_trivial_ato.c*: uses **atomic operations** to update the histogram.
27 | - *histogram_solution_ato_local.c*: avoids (part of) the false sharing by having each thread accumulate into its own local array, and then
28 |     updating the global histogram array using **atomic operations**.
29 | - *histogram_solution_ato_local_better.c*: avoids all the false sharing by having each thread accumulate into its own local array, and then
30 |     updating the global histogram array using **atomic operations**. The local arrays are padded (made bigger) so that they never share a cache line.
31 | - *histogram_solution_red.c*: uses a **reduction** to update the histogram.


--------------------------------------------------------------------------------
/lec15/histogram.c:
--------------------------------------------------------------------------------
 1 | // Implements counting sort
 2 | // First argument from command line is the number of distinct values that the elements in the array can have.
 3 | // e.g., if the argument is 100, then each element of the array can contain a value between 0 and 99.
 4 | // You can assume that the size of the array is much larger than the maximum value.
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <omp.h>
 8 | 
 9 | #define ARRAY_SIZE 100000000
10 | 
11 | // Builds a histogram of ARRAY_SIZE random values in [0, max) and prints it,
12 | // together with the time spent zeroing and filling the histogram.
13 | // argv[1] is max, the number of histogram bins; it must be a positive integer.
14 | // Returns 0 on success, 1 on a bad argument or a failed allocation.
15 | int main(int argc, char** argv){
16 |     int max = (argc > 1) ? atoi(argv[1]) : 0;
17 |     if(max <= 0){ // also keeps rand() % max from dividing by zero
18 |         fprintf(stderr, "Usage: %s <max>   (max must be a positive integer)\n", argv[0]);
19 |         return 1;
20 |     }
21 |     int* array = (int*)malloc(ARRAY_SIZE * sizeof(int));
22 |     int* counts = (int*)malloc(max * sizeof(int));
23 |     if(array == NULL || counts == NULL){
24 |         fprintf(stderr, "Error: out of memory\n");
25 |         free(counts);
26 |         free(array);
27 |         return 1;
28 |     }
29 |     // Generate random array
30 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
31 |         array[i] = rand() % max;
32 |     }
33 | 
34 |     double start = omp_get_wtime();
35 |     for(int i = 0; i < max; i++){
36 |         counts[i] = 0;
37 |     }
38 | 
39 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
40 |         counts[array[i]]++;
41 |     }
42 |     double stop = omp_get_wtime();
43 | 
44 |     for(int i = 0; i < max; i++){
45 |         printf("%d elements with value %d\n", counts[i], i);
46 |     }
47 |     printf("Total runtime: %f secs\n", stop - start);
48 | 
49 |     free(counts);
50 |     free(array);
51 |     return 0;
52 | }


--------------------------------------------------------------------------------
/lec15/histogram_solution_ato_local.c:
--------------------------------------------------------------------------------
 1 | // Implements counting sort
 2 | // First argument from command line is the number of distinct values that the elements in the array can have.
 3 | // e.g., if the argument is 100, then each element of the array can contain a value between 0 and 99.
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <omp.h>
 7 | 
 8 | #define ARRAY_SIZE 100000000
 9 | 
10 | // Builds the histogram of ARRAY_SIZE random values in [0, max) in parallel: each
11 | // thread counts into its own local array, then the local arrays are merged into the
12 | // global histogram with atomic updates and checked against a sequential reference.
13 | // argv[1] is max (positive bin count). Returns 0 on success, 1 on any error or mismatch.
14 | int main(int argc, char** argv){
15 |     int max = (argc > 1) ? atoi(argv[1]) : 0;
16 |     if(max <= 0){ // also keeps rand() % max from dividing by zero
17 |         fprintf(stderr, "Usage: %s <max>   (max must be a positive integer)\n", argv[0]);
18 |         return 1;
19 |     }
20 |     // ATTENTION: We size the local counts for omp_get_max_threads (we do not know how many threads will be used)
21 |     // Alternatively, we could check the value of the OMP_NUM_THREADS env variable
22 |     int max_threads = omp_get_max_threads();
23 |     int* array = (int*)malloc(ARRAY_SIZE * sizeof(int));
24 |     int* counts = (int*)malloc(max * sizeof(int));
25 |     int* counts_reference = (int*)malloc(max * sizeof(int)); // Compute sequentially and use for error checks
26 |     int** counts_local = (int**)malloc(max_threads * sizeof(int*));
27 |     if(array == NULL || counts == NULL || counts_reference == NULL || counts_local == NULL){
28 |         fprintf(stderr, "Error: out of memory\n");
29 |         return 1;
30 |     }
31 |     // Generate random array
32 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
33 |         array[i] = rand() % max;
34 |     }
35 | 
36 |     // Compute reference counts for error check
37 |     for(int i = 0; i < max; i++){
38 |         counts_reference[i] = 0;
39 |     }
40 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
41 |         counts_reference[array[i]]++;
42 |     }
43 | 
44 |     // Create one zeroed local histogram per thread
45 |     for(int t = 0; t < max_threads; t++){
46 |         counts_local[t] = (int*)malloc(max * sizeof(int));
47 |         if(counts_local[t] == NULL){
48 |             fprintf(stderr, "Error: out of memory\n");
49 |             return 1;
50 |         }
51 |         for(int j = 0; j < max; j++){
52 |             counts_local[t][j] = 0;
53 |         }
54 |     }
55 | 
56 |     double start = omp_get_wtime();
57 |     for(int i = 0; i < max; i++){
58 |         counts[i] = 0;
59 |     }
60 | 
61 |     #pragma omp parallel
62 |     {
63 |         int tid = omp_get_thread_num();
64 |         int num_threads = omp_get_num_threads();
65 |         #pragma omp for
66 |         for(unsigned long i = 0; i < ARRAY_SIZE; i++){
67 |             counts_local[tid][array[i]]++; // ATTENTION: Still some false sharing might happen here
68 |         }
69 | 
70 |         // Merge: the local arrays are split among the threads, so several threads
71 |         // may update the same counts[i] concurrently -- hence the atomic.
72 |         #pragma omp for
73 |         for(int t = 0; t < num_threads; t++){
74 |             for(int i = 0; i < max; i++){
75 |                 #pragma omp atomic
76 |                 counts[i] += counts_local[t][i];
77 |             }
78 |         }
79 |     }
80 | 
81 |     double stop = omp_get_wtime();
82 |     for(int i = 0; i < max; i++){
83 |         if(counts[i] != counts_reference[i]){
84 |             fprintf(stderr, "Error: counts[%d] = %d, counts_reference[%d] = %d\n", i, counts[i], i, counts_reference[i]);
85 |             return 1;
86 |         }
87 |         printf("%d elements with value %d\n", counts[i], i);
88 |     }
89 |     printf("Total runtime: %f secs\n", stop - start);
90 | 
91 |     for(int t = 0; t < max_threads; t++){
92 |         free(counts_local[t]);
93 |     }
94 |     free(counts_local);
95 |     free(counts_reference);
96 |     free(counts);
97 |     free(array);
98 |     return 0;
99 | }


--------------------------------------------------------------------------------
/lec15/histogram_solution_ato_local_better.c:
--------------------------------------------------------------------------------
 1 | // Implements counting sort
 2 | // First argument from command line is the number of distinct values that the elements in the array can have.
 3 | // e.g., if the argument is 100, then each element of the array can contain a value between 0 and 99.
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <omp.h>
 7 | 
 8 | #define ARRAY_SIZE 100000000
 9 | 
10 | // Builds the histogram of ARRAY_SIZE random values in [0, max) in parallel: each
11 | // thread counts into its own padded local array, then the local arrays are merged into
12 | // the global histogram with atomic updates and checked against a sequential reference.
13 | // argv[1] is max (positive bin count). Returns 0 on success, 1 on any error or mismatch.
14 | int main(int argc, char** argv){
15 |     int max = (argc > 1) ? atoi(argv[1]) : 0;
16 |     if(max <= 0){ // also keeps rand() % max from dividing by zero
17 |         fprintf(stderr, "Usage: %s <max>   (max must be a positive integer)\n", argv[0]);
18 |         return 1;
19 |     }
20 |     // ATTENTION: We size the local counts for omp_get_max_threads (we do not know how many threads will be used)
21 |     // Alternatively, we could check the value of the OMP_NUM_THREADS env variable
22 |     int max_threads = omp_get_max_threads();
23 |     int* array = (int*)malloc(ARRAY_SIZE * sizeof(int));
24 |     int* counts = (int*)malloc(max * sizeof(int));
25 |     int* counts_reference = (int*)malloc(max * sizeof(int)); // Compute sequentially and use for error checks
26 |     int** counts_local = (int**)malloc(max_threads * sizeof(int*));
27 |     if(array == NULL || counts == NULL || counts_reference == NULL || counts_local == NULL){
28 |         fprintf(stderr, "Error: out of memory\n");
29 |         return 1;
30 |     }
31 |     // Generate random array
32 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
33 |         array[i] = rand() % max;
34 |     }
35 | 
36 |     // Compute reference counts for error check
37 |     for(int i = 0; i < max; i++){
38 |         counts_reference[i] = 0;
39 |     }
40 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
41 |         counts_reference[array[i]]++;
42 |     }
43 | 
44 |     // Create one zeroed local histogram per thread. Each is padded by one cache line
45 |     // (assuming 64 bytes) so that the used parts of two arrays never share a line.
46 |     for(int t = 0; t < max_threads; t++){
47 |         counts_local[t] = (int*)malloc(max * sizeof(int) + 64);
48 |         if(counts_local[t] == NULL){
49 |             fprintf(stderr, "Error: out of memory\n");
50 |             return 1;
51 |         }
52 |         for(int j = 0; j < max; j++){
53 |             counts_local[t][j] = 0;
54 |         }
55 |     }
56 | 
57 |     double start = omp_get_wtime();
58 |     for(int i = 0; i < max; i++){
59 |         counts[i] = 0;
60 |     }
61 | 
62 |     #pragma omp parallel
63 |     {
64 |         int tid = omp_get_thread_num();
65 |         int num_threads = omp_get_num_threads();
66 |         #pragma omp for
67 |         for(unsigned long i = 0; i < ARRAY_SIZE; i++){
68 |             counts_local[tid][array[i]]++;
69 |         }
70 | 
71 |         // Merge: several threads may update the same counts[i] concurrently -- hence the atomic.
72 |         #pragma omp for
73 |         for(int t = 0; t < num_threads; t++){
74 |             for(int i = 0; i < max; i++){
75 |                 #pragma omp atomic
76 |                 counts[i] += counts_local[t][i];
77 |             }
78 |         }
79 |     }
80 | 
81 |     double stop = omp_get_wtime();
82 |     for(int i = 0; i < max; i++){
83 |         if(counts[i] != counts_reference[i]){
84 |             fprintf(stderr, "Error: counts[%d] = %d, counts_reference[%d] = %d\n", i, counts[i], i, counts_reference[i]);
85 |             return 1;
86 |         }
87 |         printf("%d elements with value %d\n", counts[i], i);
88 |     }
89 |     printf("Total runtime: %f secs\n", stop - start);
90 | 
91 |     for(int t = 0; t < max_threads; t++){
92 |         free(counts_local[t]);
93 |     }
94 |     free(counts_local);
95 |     free(counts_reference);
96 |     free(counts);
97 |     free(array);
98 |     return 0;
99 | }


--------------------------------------------------------------------------------
/lec15/histogram_solution_red.c:
--------------------------------------------------------------------------------
 1 | // Implements counting sort
 2 | // First argument from command line is the number of distinct values that the elements in the array can have.
 3 | // e.g., if the argument is 100, then each element of the array can contain a value between 0 and 99.
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <omp.h>
 7 | 
 8 | #define ARRAY_SIZE 100000000
 9 | 
10 | // Builds the histogram of ARRAY_SIZE random values in [0, max) in parallel using
11 | // an OpenMP array reduction on counts, and checks it against a sequential reference.
12 | // argv[1] is max, the number of histogram bins; it must be a positive integer.
13 | // Returns 0 on success, 1 on a bad argument, a failed allocation or a mismatch.
14 | int main(int argc, char** argv){
15 |     int max = (argc > 1) ? atoi(argv[1]) : 0;
16 |     if(max <= 0){ // also keeps rand() % max from dividing by zero
17 |         fprintf(stderr, "Usage: %s <max>   (max must be a positive integer)\n", argv[0]);
18 |         return 1;
19 |     }
20 |     int* array = (int*)malloc(ARRAY_SIZE * sizeof(int));
21 |     int* counts = (int*)malloc(max * sizeof(int));
22 |     int* counts_reference = (int*)malloc(max * sizeof(int)); // Compute sequentially and use for error checks
23 |     if(array == NULL || counts == NULL || counts_reference == NULL){
24 |         fprintf(stderr, "Error: out of memory\n");
25 |         return 1;
26 |     }
27 |     // Generate random array
28 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
29 |         array[i] = rand() % max;
30 |     }
31 | 
32 |     // Compute reference counts for error check
33 |     for(int i = 0; i < max; i++){
34 |         counts_reference[i] = 0;
35 |     }
36 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
37 |         counts_reference[array[i]]++;
38 |     }
39 | 
40 |     double start = omp_get_wtime();
41 |     for(int i = 0; i < max; i++){
42 |         counts[i] = 0;
43 |     }
44 | 
45 |     // The array section counts[0..max) is reduced with +, so no explicit synchronization is needed
46 |     #pragma omp parallel for reduction(+:counts[:max])
47 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
48 |         counts[array[i]]++;
49 |     }
50 |     double stop = omp_get_wtime();
51 | 
52 |     for(int i = 0; i < max; i++){
53 |         if(counts[i] != counts_reference[i]){
54 |             fprintf(stderr, "Error: counts[%d] = %d, counts_reference[%d] = %d\n", i, counts[i], i, counts_reference[i]);
55 |             return 1;
56 |         }
57 |         printf("%d elements with value %d\n", counts[i], i);
58 |     }
59 |     printf("Total runtime: %f secs\n", stop - start);
60 | 
61 |     free(counts_reference);
62 |     free(counts);
63 |     free(array);
64 |     return 0;
65 | }


--------------------------------------------------------------------------------
/lec15/histogram_solution_trivial_ato.c:
--------------------------------------------------------------------------------
 1 | // Implements counting sort
 2 | // First argument from command line is the number of distinct values that the elements in the array can have.
 3 | // e.g., if the argument is 100, then each element of the array can contain a value between 0 and 99.
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <omp.h>
 7 | 
 8 | #define ARRAY_SIZE 100000000
 9 | 
10 | // Builds the histogram of ARRAY_SIZE random values in [0, max) in parallel, protecting
11 | // every increment with an atomic update, and checks it against a sequential reference.
12 | // argv[1] is max, the number of histogram bins; it must be a positive integer.
13 | // Returns 0 on success, 1 on a bad argument, a failed allocation or a mismatch.
14 | int main(int argc, char** argv){
15 |     int max = (argc > 1) ? atoi(argv[1]) : 0;
16 |     if(max <= 0){ // also keeps rand() % max from dividing by zero
17 |         fprintf(stderr, "Usage: %s <max>   (max must be a positive integer)\n", argv[0]);
18 |         return 1;
19 |     }
20 |     int* array = (int*)malloc(ARRAY_SIZE * sizeof(int));
21 |     int* counts = (int*)malloc(max * sizeof(int));
22 |     int* counts_reference = (int*)malloc(max * sizeof(int)); // Compute sequentially and use for error checks
23 |     if(array == NULL || counts == NULL || counts_reference == NULL){
24 |         fprintf(stderr, "Error: out of memory\n");
25 |         return 1;
26 |     }
27 |     // Generate random array
28 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
29 |         array[i] = rand() % max;
30 |     }
31 | 
32 |     // Compute reference counts for error check
33 |     for(int i = 0; i < max; i++){
34 |         counts_reference[i] = 0;
35 |     }
36 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
37 |         counts_reference[array[i]]++;
38 |     }
39 | 
40 |     double start = omp_get_wtime();
41 |     for(int i = 0; i < max; i++){
42 |         counts[i] = 0;
43 |     }
44 | 
45 |     // Different threads may hit the same bin at the same time: each increment is atomic
46 |     #pragma omp parallel for
47 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
48 |         #pragma omp atomic
49 |         counts[array[i]]++;
50 |     }
51 |     double stop = omp_get_wtime();
52 | 
53 |     for(int i = 0; i < max; i++){
54 |         if(counts[i] != counts_reference[i]){
55 |             fprintf(stderr, "Error: counts[%d] = %d, counts_reference[%d] = %d\n", i, counts[i], i, counts_reference[i]);
56 |             return 1;
57 |         }
58 |         printf("%d elements with value %d\n", counts[i], i);
59 |     }
60 |     printf("Total runtime: %f secs\n", stop - start);
61 | 
62 |     free(counts_reference);
63 |     free(counts);
64 |     free(array);
65 |     return 0;
66 | }


--------------------------------------------------------------------------------
/lec15/histogram_solution_trivial_crit.c:
--------------------------------------------------------------------------------
 1 | // Implements counting sort
 2 | // First argument from command line is the number of distinct values that the elements in the array can have.
 3 | // e.g., if the argument is 100, then each element of the array can contain a value between 0 and 99.
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <omp.h>
 7 | 
 8 | #define ARRAY_SIZE 100000000
 9 | 
10 | // Builds the histogram of ARRAY_SIZE random values in [0, max) in parallel, protecting
11 | // every increment with a critical section, and checks it against a sequential reference.
12 | // argv[1] is max, the number of histogram bins; it must be a positive integer.
13 | // Returns 0 on success, 1 on a bad argument, a failed allocation or a mismatch.
14 | int main(int argc, char** argv){
15 |     int max = (argc > 1) ? atoi(argv[1]) : 0;
16 |     if(max <= 0){ // also keeps rand() % max from dividing by zero
17 |         fprintf(stderr, "Usage: %s <max>   (max must be a positive integer)\n", argv[0]);
18 |         return 1;
19 |     }
20 |     int* array = (int*)malloc(ARRAY_SIZE * sizeof(int));
21 |     int* counts = (int*)malloc(max * sizeof(int));
22 |     int* counts_reference = (int*)malloc(max * sizeof(int)); // Compute sequentially and use for error checks
23 |     if(array == NULL || counts == NULL || counts_reference == NULL){
24 |         fprintf(stderr, "Error: out of memory\n");
25 |         return 1;
26 |     }
27 |     // Generate random array
28 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
29 |         array[i] = rand() % max;
30 |     }
31 | 
32 |     // Compute reference counts for error check
33 |     for(int i = 0; i < max; i++){
34 |         counts_reference[i] = 0;
35 |     }
36 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
37 |         counts_reference[array[i]]++;
38 |     }
39 | 
40 |     double start = omp_get_wtime();
41 |     for(int i = 0; i < max; i++){
42 |         counts[i] = 0;
43 |     }
44 | 
45 |     // Only one thread at a time may execute the increment, whatever bin it targets
46 |     #pragma omp parallel for
47 |     for(unsigned long i = 0; i < ARRAY_SIZE; i++){
48 |         #pragma omp critical
49 |         counts[array[i]]++;
50 |     }
51 |     double stop = omp_get_wtime();
52 | 
53 |     for(int i = 0; i < max; i++){
54 |         if(counts[i] != counts_reference[i]){
55 |             fprintf(stderr, "Error: counts[%d] = %d, counts_reference[%d] = %d\n", i, counts[i], i, counts_reference[i]);
56 |             return 1;
57 |         }
58 |         printf("%d elements with value %d\n", counts[i], i);
59 |     }
60 |     printf("Total runtime: %f secs\n", stop - start);
61 | 
62 |     free(counts_reference);
63 |     free(counts);
64 |     free(array);
65 |     return 0;
66 | }


--------------------------------------------------------------------------------
/lec15/matmul.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | **  PROGRAM: Matrix Multiply
 3 | **
 4 | **  PURPOSE: This is a simple matrix multiply program. 
 5 | **           It will compute the product
 6 | **
 7 | **                C  = A * B
 8 | **
 9 | **           A and B are set to constant matrices so we
10 | **           can make a quick test of the multiplication.
11 | **
12 | **  USAGE:   Right now, I hardwire the matrix dimensions. 
13 | **           later, I'll take them from the command line.
14 | **
15 | **  HISTORY: Written by Tim Mattson, Nov 1999.
16 | */
17 | #include <malloc.h>
18 | #include <stdio.h>
19 | #include <omp.h>
20 | 
21 | #define ORDER 1000
22 | #define AVAL 3.0
23 | #define BVAL 5.0
24 | #define TOL  0.001
25 | 
26 | /* Multiplies the constant matrices A (Ndim x Pdim) and B (Pdim x Mdim) into C
27 | ** (Ndim x Mdim), all stored row-major, times the product and checks the result.
28 | ** Returns 0, or 1 if the matrices cannot be allocated. */
29 | int main(int argc, char **argv)
30 | {
31 | 	int Ndim = ORDER, Pdim = ORDER, Mdim = ORDER;   /* A[N][P], B[P][M], C[N][M] */
32 | 	int i,j,k;
33 | 	double *A, *B, *C, cval, err, errsq;
34 | 	double dN, mflops, start_time, run_time;
35 | 
36 | 	A = (double *)malloc((size_t)Ndim*Pdim*sizeof(double));
37 | 	B = (double *)malloc((size_t)Pdim*Mdim*sizeof(double));
38 | 	C = (double *)malloc((size_t)Ndim*Mdim*sizeof(double));
39 | 	if (A == NULL || B == NULL || C == NULL) {
40 | 		printf("\n Could not allocate the matrices\n");
41 | 		free(A); free(B); free(C);
42 | 		return 1;
43 | 	}
44 | 
45 | 	/* Initialize matrices (the row stride is the number of columns) */
46 | 	for (i=0; i<Ndim; i++)
47 | 		for (j=0; j<Pdim; j++)
48 | 			A[i*Pdim+j] = AVAL;
49 | 
50 | 	for (i=0; i<Pdim; i++)
51 | 		for (j=0; j<Mdim; j++)
52 | 			B[i*Mdim+j] = BVAL;
53 | 
54 | 	for (i=0; i<Ndim; i++)
55 | 		for (j=0; j<Mdim; j++)
56 | 			C[i*Mdim+j] = 0.0;
57 | 
58 | 	/* Do the matrix product */
59 | 	start_time = omp_get_wtime();
60 | 	for (i=0; i<Ndim; i++){
61 | 		for (j=0; j<Mdim; j++){
62 | 			for(k=0;k<Pdim;k++){
63 | 				/* C(i,j) = sum(over k) A(i,k) * B(k,j) */
64 | 				C[i*Mdim+j] += A[i*Pdim+k] * B[k*Mdim+j];
65 | 			}
66 | 		}
67 | 	}
68 | 	run_time = omp_get_wtime() - start_time;
69 | 
70 | 	printf(" Order %d multiplication in %f seconds \n", ORDER, run_time);
71 | 
72 | 	dN = (double)ORDER;
73 | 	mflops = 2.0 * dN * dN * dN/(1000000.0* run_time);
74 | 	printf(" Order %d multiplication at %f mflops\n", ORDER, mflops);
75 | 
76 | 	/* Check the answer: every element of C must equal Pdim*AVAL*BVAL */
77 | 	cval = Pdim * AVAL * BVAL;
78 | 	errsq = 0.0;
79 | 	for (i=0; i<Ndim; i++){
80 | 		for (j=0; j<Mdim; j++){
81 | 			err = C[i*Mdim+j] - cval;
82 | 			errsq += err * err;
83 | 		}
84 | 	}
85 | 
86 | 	if (errsq > TOL)
87 | 		printf("\n Errors in multiplication: %f",errsq);
88 | 	else
89 | 		printf("\n Hey, it worked");
90 | 
91 | 	printf("\n all done \n");
92 | 	free(A); free(B); free(C);
93 | 	return 0;
94 | }
95 | 


--------------------------------------------------------------------------------
/lec15/matmul_solution.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | **  PROGRAM: Parallel Matrix Multiply (using OpenMP)
 3 | **
 4 | **  PURPOSE: This is a simple matrix multiply program. 
 5 | **           It will compute the product
 6 | **
 7 | **                C  = A * B
 8 | **
 9 | **           A and B are set to constant matrices so we
10 | **           can make a quick test of the multiplication.
11 | **
12 | **  USAGE:   Right now, I hardwire the matrix dimensions. 
13 | **           later, I'll take them from the command line.
14 | **  
15 | **  HISTORY: Written by Tim Mattson, Nov 1999.
16 | */
17 | #include <malloc.h>
18 | #include <stdio.h>
19 | #include <omp.h>
20 | 
21 | #define ORDER 1000
22 | #define AVAL 3.0
23 | #define BVAL 5.0
24 | #define TOL  0.001
25 | 
26 | /* Multiplies the constant matrices A (Ndim x Pdim) and B (Pdim x Mdim) into C
27 | ** (Ndim x Mdim), all stored row-major, with the outer loop split among the OpenMP
28 | ** threads; times the product and checks it. Returns 0, or 1 if allocation fails. */
29 | int main(int argc, char *argv[])
30 | {
31 | 	int Ndim = ORDER, Pdim = ORDER, Mdim = ORDER;   /* A[N][P], B[P][M], C[N][M] */
32 | 	int i,j,k;
33 | 	double *A, *B, *C, cval, err, errsq;
34 | 	double dN, mflops, start_time, run_time;
35 | 
36 | 	A = (double *)malloc((size_t)Ndim*Pdim*sizeof(double));
37 | 	B = (double *)malloc((size_t)Pdim*Mdim*sizeof(double));
38 | 	C = (double *)malloc((size_t)Ndim*Mdim*sizeof(double));
39 | 	if (A == NULL || B == NULL || C == NULL) {
40 | 		printf("\n Could not allocate the matrices\n");
41 | 		free(A); free(B); free(C);
42 | 		return 1;
43 | 	}
44 | 
45 | 	/* Initialize matrices (the row stride is the number of columns) */
46 | 	for (i=0; i<Ndim; i++)
47 | 		for (j=0; j<Pdim; j++)
48 | 			A[i*Pdim+j] = AVAL;
49 | 
50 | 	for (i=0; i<Pdim; i++)
51 | 		for (j=0; j<Mdim; j++)
52 | 			B[i*Mdim+j] = BVAL;
53 | 
54 | 	for (i=0; i<Ndim; i++)
55 | 		for (j=0; j<Mdim; j++)
56 | 			C[i*Mdim+j] = 0.0;
57 | 
58 | 	/* Do the matrix product. Each iteration of the outer loop writes a different
59 | 	** row of C, so the rows can be computed by different threads. */
60 | 	start_time = omp_get_wtime();
61 | #pragma omp parallel for private(i, j, k)
62 | 	for (i=0; i<Ndim; i++){
63 | 		for (j=0; j<Mdim; j++){
64 | 			for(k=0;k<Pdim;k++){
65 | 				/* C(i,j) = sum(over k) A(i,k) * B(k,j) */
66 | 				C[i*Mdim+j] += A[i*Pdim+k] * B[k*Mdim+j];
67 | 			}
68 | 		}
69 | 	}
70 | 	run_time = omp_get_wtime() - start_time;
71 | 
72 | 	printf(" Order %d multiplication in %f seconds \n", ORDER, run_time);
73 | 	printf(" %d threads\n",omp_get_max_threads());
74 | 
75 | 	dN = (double)ORDER;
76 | 	mflops = 2.0 * dN * dN * dN/(1000000.0* run_time);
77 | 	printf(" Order %d multiplication at %f mflops\n", ORDER, mflops);
78 | 
79 | 	/* Check the answer: every element of C must equal Pdim*AVAL*BVAL */
80 | 	cval = Pdim * AVAL * BVAL;
81 | 	errsq = 0.0;
82 | 	for (i=0; i<Ndim; i++){
83 | 		for (j=0; j<Mdim; j++){
84 | 			err = C[i*Mdim+j] - cval;
85 | 			errsq += err * err;
86 | 		}
87 | 	}
88 | 
89 | 	if (errsq > TOL)
90 | 		printf("\n Errors in multiplication: %f",errsq);
91 | 	else
92 | 		printf("\n Hey, it worked");
93 | 
94 | 	printf("\n all done \n");
95 | 	free(A); free(B); free(C);
96 | 	return 0;
97 | }
98 | 


--------------------------------------------------------------------------------
/lec15/pi.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | 
  3 | NAME:
  4 |    Pi_mc:  PI Monte Carlo
  5 | 
  6 | Purpose:
  7 |    This program uses a Monte Carlo algorithm to compute PI as an
  8 |    example of how random number generators are used to solve problems.
  9 |    Note that if your goal is to find digits of pi, there are much 
 10 |    better algorithms you could use.
 11 | 
 12 | Usage:
 13 |    To keep the program as simple as possible, you must edit the file
 14 |    and change the value of num_trials to change the number of samples
 15 |    used.  Then compile and run the program.
 16 | 
 17 | Algorithm:
 18 |    The basic idea behind the algorithm is easy to visualize.  Draw a 
 19 |    square on a wall.  Inside the square, draw a circle.  Now randomly throw 
 20 |    darts at the wall.  Some darts will land inside the square.  Of those, 
 21 |    some will fall inside the circle.   The probability of landing inside
 22 |    the circle or the square is proportional to their areas.
 23 | 
 24 |    We can use a random number generator to "throw the darts" and count
 25 |    how many "darts" fall inside the square and how many inside the 
 26 |    circle.  Dividing these two numbers gives us the ratio of their areas
 27 |    and from that we can compute pi.
 28 | 
 29 | Algorithm details:
 30 |    To turn this into code, I need a bit more detail.  Assume the circle
 31 |    is centered inside the square.  The circle will have a radius of r and 
 32 |    each side of the square will be of length 2*r (i.e. the diameter of the
 33 |    circle).  
 34 | 
 35 |        A(circle) = pi * r^2
 36 |        A(square) = (2*r)*(2*r) = 4*r^2
 37 | 
 38 |        ratio = A(circle)/A(square) = pi/4
 39 | 
 40 |    Since the probability (P) of a dart falling inside a figure (i.e. the square 
 41 |    or the circle) is proportional to the area, we have
 42 | 
 43 |        ratio = P(circle)/P(square) = pi/4
 44 | 
 45 |    If I throw N darts as computed by random numbers evenly distributed 
 46 |    over the area of the square
 47 | 
 48 |       P(square) = N/N    .... i.e. every dart lands in the square
 49 |       P(circle) = N(circle)/N
 50 | 
 51 |       ratio = (N(circle)/N)/(N/N)  = N(circle)/N
 52 | 
 53 |    Hence, to find the area, I compute N random "darts" and count how many fall
 54 |    inside the circle.  The equation for a circle is
 55 | 
 56 |       x^2 + y^2 = r^2 
 57 | 
 58 |    So I randomly compute "x" and "y" evenly distributed from -r to r and 
 59 |    count the "dart" as falling inside the circle if
 60 | 
 61 |       x^2 + y^2 <= r^2
 62 | 
 63 | Results:  
 64 |    Remember, our goal is to demonstrate a simple monte carlo algorithm, 
 65 |    not compute pi.  But just for the record, here are some results (Intel compiler
 66 |    version 10.0, Windows XP, core duo laptop)
 67 | 
 68 |        100        3.160000
 69 |        1000       3.148000
 70 |        10000      3.154000
 71 |        100000     3.139920
 72 |        1000000    3.141456
 73 |        10000000   3.141590
 74 |        100000000  3.141581
 75 | 
 76 |    As a point of reference, the first 7 digits of the true value of pi 
 77 |    are 3.141592 
 78 | 
 79 | 
 80 | History: 
 81 |    Written by Tim Mattson, 9/2007.
 82 | 
 83 | */
 84 | #include <stdio.h>
 85 | #include <stdlib.h>
 86 | #include <omp.h>
 87 | #include <time.h>
 88 | 
 89 | // 
 90 | // The monte carlo pi program
 91 | //
 92 | 
 93 | static long num_trials = 100000000;
 94 | /* Uniform pseudo-random double in [-1, 1], taken from the global rand() stream. */
 95 | double get_rand_minus_one_one(){
 96 |     return ((double)rand() / RAND_MAX) * 2.0 - 1.0;
 97 | }
 98 | 
 99 | int main ()
100 | {
101 |    // Sequential Monte Carlo estimate of pi: the fraction of random darts
102 |    // landing inside the circle approximates pi/4.
103 |    const double radius = 1.0;   // radius of circle. Side of square is 2*radius
104 |    long hits = 0;               // darts that fell inside the circle
105 |    srand(time(NULL));
106 | 
107 |    const double t_start = omp_get_wtime();
108 |    for (long trial = 0; trial < num_trials; trial++)
109 |    {
110 |       const double dart_x = get_rand_minus_one_one();
111 |       const double dart_y = get_rand_minus_one_one();
112 |       const double dist2  = dart_x*dart_x + dart_y*dart_y;
113 | 
114 |       if (dist2 <= radius*radius) {
115 |          hits++;
116 |       }
117 |    }
118 | 
119 |    const double pi = 4.0 * ((double)hits/(double)num_trials);
120 |    printf("\n %ld trials, pi is %f ",num_trials, pi);
121 |    printf(" in %f seconds\n",omp_get_wtime()-t_start);
122 |    return 0;
123 | }
124 | 	  
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/lec15/pi_solution_ato.c:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <omp.h>
 5 | #include <time.h>
 6 | 
 7 | static long num_trials = 100000000;
 8 | /* Uniform pseudo-random double in [-1, 1]; advances the caller-owned rand_r() state *seed. */
 9 | double get_rand_minus_one_one(unsigned int* seed){
10 |     return ((double)rand_r(seed) / RAND_MAX) * 2.0 - 1.0;
11 | }
12 | 
13 | int main ()
14 | {
15 |    // Parallel Monte Carlo estimate of pi. Every hit updates the shared
16 |    // counter through an atomic increment.
17 |    const double radius = 1.0;   // radius of circle. Side of square is 2*radius
18 |    long hits = 0;               // shared by all threads
19 |    srand(time(NULL));
20 | 
21 |    const double t_start = omp_get_wtime();
22 |    #pragma omp parallel
23 |    {
24 |       // Each thread owns its rand_r() state, seeded with its thread number
25 |       unsigned int seed = omp_get_thread_num();
26 |       #pragma omp single
27 |           printf(" %d threads ",omp_get_num_threads());
28 | 
29 |       // Variables declared inside the loop body are private to each thread
30 |       #pragma omp for
31 |       for (long trial = 0; trial < num_trials; trial++)
32 |       {
33 |          const double dart_x = get_rand_minus_one_one(&seed);
34 |          const double dart_y = get_rand_minus_one_one(&seed);
35 |          const double dist2  = dart_x*dart_x + dart_y*dart_y;
36 | 
37 |          if (dist2 <= radius*radius){
38 |             #pragma omp atomic
39 |             hits++;
40 |          }
41 |       }
42 |    }
43 | 
44 |    const double pi = 4.0 * ((double)hits/(double)num_trials);
45 |    printf("\n %ld trials, pi is %f ",num_trials, pi);
46 |    printf(" in %f seconds\n",omp_get_wtime()-t_start);
47 |    return 0;
48 | }


--------------------------------------------------------------------------------
/lec15/pi_solution_crit.c:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <omp.h>
 5 | #include <time.h>
 6 | 
 7 | static long num_trials = 100000000;
 8 | /* Uniform pseudo-random double in [-1, 1]; advances the caller-owned rand_r() state *seed. */
 9 | double get_rand_minus_one_one(unsigned int* seed){
10 |     return ((double)rand_r(seed) / RAND_MAX) * 2.0 - 1.0;
11 | }
12 | 
13 | int main ()
14 | {
15 |    // Parallel Monte Carlo estimate of pi. Every hit updates the shared
16 |    // counter inside a critical section.
17 |    const double radius = 1.0;   // radius of circle. Side of square is 2*radius
18 |    long hits = 0;               // shared by all threads
19 |    srand(time(NULL));
20 | 
21 |    const double t_start = omp_get_wtime();
22 |    #pragma omp parallel
23 |    {
24 |       // Each thread owns its rand_r() state, seeded with its thread number
25 |       unsigned int seed = omp_get_thread_num();
26 |       #pragma omp single
27 |           printf(" %d threads ",omp_get_num_threads());
28 | 
29 |       // Variables declared inside the loop body are private to each thread
30 |       #pragma omp for
31 |       for (long trial = 0; trial < num_trials; trial++)
32 |       {
33 |          const double dart_x = get_rand_minus_one_one(&seed);
34 |          const double dart_y = get_rand_minus_one_one(&seed);
35 |          const double dist2  = dart_x*dart_x + dart_y*dart_y;
36 | 
37 |          if (dist2 <= radius*radius){
38 |             #pragma omp critical
39 |             hits++;
40 |          }
41 |       }
42 |    }
43 | 
44 |    const double pi = 4.0 * ((double)hits/(double)num_trials);
45 |    printf("\n %ld trials, pi is %f ",num_trials, pi);
46 |    printf(" in %f seconds\n",omp_get_wtime()-t_start);
47 |    return 0;
48 | }


--------------------------------------------------------------------------------
/lec15/pi_solution_crit_rand.c:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <omp.h>
 5 | #include <time.h>
 6 | 
 7 | static long num_trials = 100000000;
 8 | /* Uniform pseudo-random double in [-1, 1], taken from the global rand() stream. */
 9 | double get_rand_minus_one_one(){
10 |     return ((double)rand() / RAND_MAX) * 2.0 - 1.0;
11 | }
12 | 
13 | int main ()
14 | {
15 |    // Parallel Monte Carlo estimate of pi. Darts come from the global rand()
16 |    // stream and every hit updates the shared counter inside a critical section.
17 |    const double radius = 1.0;   // radius of circle. Side of square is 2*radius
18 |    long hits = 0;               // shared by all threads
19 |    srand(time(NULL));
20 | 
21 |    const double t_start = omp_get_wtime();
22 |    #pragma omp parallel
23 |    {
24 |       #pragma omp single
25 |           printf(" %d threads ",omp_get_num_threads());
26 | 
27 |       // Variables declared inside the loop body are private to each thread
28 |       #pragma omp for
29 |       for (long trial = 0; trial < num_trials; trial++)
30 |       {
31 |          const double dart_x = get_rand_minus_one_one();
32 |          const double dart_y = get_rand_minus_one_one();
33 |          const double dist2  = dart_x*dart_x + dart_y*dart_y;
34 | 
35 |          if (dist2 <= radius*radius){
36 |             #pragma omp critical
37 |             hits++;
38 |          }
39 |       }
40 |    }
41 | 
42 |    const double pi = 4.0 * ((double)hits/(double)num_trials);
43 | 
44 |    printf("\n %ld trials, pi is %f ",num_trials, pi);
45 |    printf(" in %f seconds\n",omp_get_wtime()-t_start);
46 | 
47 |    return 0;
48 | }


--------------------------------------------------------------------------------
/lec15/pi_solution_red.c:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <omp.h>
 5 | #include <time.h>
 6 | 
 7 | static long num_trials = 100000000;
 8 | /* Uniform pseudo-random double in [-1, 1]; advances the caller-owned rand_r() state *seed. */
 9 | double get_rand_minus_one_one(unsigned int* seed){
10 |     return ((double)rand_r(seed) / RAND_MAX) * 2.0 - 1.0;
11 | }
12 | 
13 | int main ()
14 | {
15 |    // Parallel Monte Carlo estimate of pi. Each thread counts its own hits and
16 |    // the reduction clause adds the per-thread counts together.
17 |    const double radius = 1.0;   // radius of circle. Side of square is 2*radius
18 |    long hits = 0;               // reduction variable: total darts inside the circle
19 |    srand(time(NULL));
20 | 
21 |    const double t_start = omp_get_wtime();
22 |    #pragma omp parallel
23 |    {
24 |       // Each thread owns its rand_r() state, seeded with its thread number
25 |       unsigned int seed = omp_get_thread_num();
26 |       #pragma omp single
27 |           printf(" %d threads ",omp_get_num_threads());
28 | 
29 |       #pragma omp for reduction(+:hits)
30 |       for (long trial = 0; trial < num_trials; trial++)
31 |       {
32 |          const double dart_x = get_rand_minus_one_one(&seed);
33 |          const double dart_y = get_rand_minus_one_one(&seed);
34 |          const double dist2  = dart_x*dart_x + dart_y*dart_y;
35 | 
36 |          if (dist2 <= radius*radius) hits++;
37 |       }
38 |    }
39 | 
40 |    const double pi = 4.0 * ((double)hits/(double)num_trials);
41 | 
42 |    printf("\n %ld trials, pi is %f ",num_trials, pi);
43 |    printf(" in %f seconds\n",omp_get_wtime()-t_start);
44 |    return 0;
45 | }


--------------------------------------------------------------------------------
/lec15/rand_vs_rand_r.c:
--------------------------------------------------------------------------------
 1 | // Check the performance of rand() and rand_r() functions
 2 | #include "../lec12/my_timer.h"
 3 | #include <stdio.h>
 4 | #include <stdlib.h>
 5 | 
 6 | #define ITER 100000000
 7 | 
 8 | int main(int argc, char** argv){
 9 |     // Times ITER calls of rand(), then ITER calls of rand_r(), and prints both durations.
10 |     double t_begin, t_end;
11 |     double sink = 0;           // accumulates every generated value
12 |     unsigned int state = 0;    // caller-owned state for rand_r()
13 |     int remaining;
14 | 
15 |     GET_TIME(t_begin);
16 |     for(remaining = ITER; remaining > 0; remaining--) sink += rand();
17 |     GET_TIME(t_end);
18 |     printf("rand() time: %lf sec\n", t_end - t_begin);
19 | 
20 |     GET_TIME(t_begin);
21 |     for(remaining = ITER; remaining > 0; remaining--) sink += rand_r(&state);
22 |     GET_TIME(t_end);
23 |     printf("rand_r() time: %lf sec\n", t_end - t_begin);
24 |     return 0;
25 | }


--------------------------------------------------------------------------------
/lec19/Makefile:
--------------------------------------------------------------------------------
 1 | CC          = nvcc
 2 | LIBS        =
 3 | CFLAGS      = -arch=compute_80 -DBLUR_SIZE=1 -DBLOCK_SIZE=16
 4 | 
 5 | RM=rm -f
 6 | # One executable per .cu source in this directory
 7 | EXES=vector_add test_cuda image_blur image_blur_shared jacobi jacobi_solution
 8 | .PHONY: all clean
 9 | all: $(EXES)
10 | 
11 | %: %.cu 
12 | 	$(CC) $(CFLAGS) -o $@ $< $(LIBS)
13 | 
14 | clean:
15 | 	$(RM) $(EXES) 
16 | 


--------------------------------------------------------------------------------
/lec19/README.md:
--------------------------------------------------------------------------------
1 | - vector_add.cu: Example of vector addition
2 | - image_blur.cu: Example on image blurring with data in global memory
3 | - image_blur_shared.cu: Example on image blurring with (part of the) data in shared memory
4 | - jacobi.cu: Example on Jacobi solver (taken from https://github.com/csc-training/CUDA/tree/master/exercises/jacobi).
5 |              It computes it both on CPU and GPU, comparing the runtime and the result.
6 | 	     This file only contains the CPU implementation, you are supposed to implement the GPU part.
7 | - jacobi_solution.cu: Solution of jacobi.cu


--------------------------------------------------------------------------------
/lec19/cuda_job.sub:
--------------------------------------------------------------------------------
 1 | universe = vanilla
 2 | 
 3 | log = cuda_job.log
 4 | output = cuda_job.out
 5 | error = cuda_job.err
 6 | 
 7 | # Request GPU resources
 8 | request_gpus = 1
 9 | 
10 | # Specify any environment setup if needed
11 | getenv = True
12 | 
13 | queue
14 | 
15 | 


--------------------------------------------------------------------------------
/lec19/error_checks.h:
--------------------------------------------------------------------------------
 1 | // This header provides two helper macros for error checking
 2 | // See the exercise skeletons and answers for usage examples.
 3 | 
 4 | #ifndef COURSE_UTIL_H_
 5 | #define COURSE_UTIL_H_
 6 | 
 7 | #include <cstdio>
 8 | #include <cstdlib>
 9 | 
10 | #define CUDA_CHECK(errarg)   __checkErrorFunc(errarg, __FILE__, __LINE__)
11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__)
12 | // Abort with a diagnostic (including the CUDA error string) if a runtime call failed.
13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 
14 | 			     const int line)
15 | {
16 |     if(errarg != cudaSuccess) {
17 | 	fprintf(stderr, "Error at %s(%i): %s\n", file, line, cudaGetErrorString(errarg));
18 | 	exit(EXIT_FAILURE);
19 |     }
20 | }
21 | 
22 | // Abort, prefixing the message with errstr, if cudaGetLastError() reports an error.
23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 
24 | 			      const int line)
25 | {
26 |     const cudaError_t status = cudaGetLastError();
27 |     if(status == cudaSuccess) return;   // nothing to report: carry on
28 | 
29 |     fprintf(stderr, "Error: %s at %s(%i): %s\n", 
30 | 	    errstr, file, line, cudaGetErrorString(status));
31 |     exit(EXIT_FAILURE);
32 | }
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/lec19/image_blur.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include <assert.h>
  3 | #include <stdio.h>
  4 | #include "../lec13/my_timer.h"
  5 | 
  6 | #define HEIGHT 8192
  7 | #define WIDTH 8192
  8 | #define NUM_PIXELS (HEIGHT*WIDTH)
  9 | #define NUM_CHANNELS 1
 10 | // Box blur: each thread averages the in-image neighbours of one pixel.
 11 | __global__ void blurKernel(unsigned char* in, unsigned char* out, int w, int h){
 12 | 	const int col = blockIdx.x*blockDim.x + threadIdx.x;
 13 | 	const int row = blockIdx.y*blockDim.y + threadIdx.y;
 14 | 
 15 | 	// Threads that fall outside the image have no pixel to produce
 16 | 	if(col >= w || row >= h) return;
 17 | 
 18 | 	int sum = 0;     // running sum of the neighbourhood values
 19 | 	int count = 0;   // number of neighbours that lie inside the image
 20 | 	for(int dy = -BLUR_SIZE; dy <= BLUR_SIZE; ++dy){
 21 | 		const int y = row + dy;
 22 | 		if(y < 0 || y >= h) continue;
 23 | 		for(int dx = -BLUR_SIZE; dx <= BLUR_SIZE; ++dx){
 24 | 			const int x = col + dx;
 25 | 			if(x < 0 || x >= w) continue;
 26 | 			sum += in[y*w + x];
 27 | 			count++;
 28 | 		}
 29 | 	}
 30 | 	out[row*w + col] = (unsigned char) (sum / count);
 31 | }
 32 | 
 33 | int main(int argc, char** argv){
 34 | 	// We do not actually load an image
 35 | 	// In principle, we should load an image from a file into a host buffer
 36 | 	// and then copy it to a device buffer.
 37 | 	// Instead, we create an 'image' made of random bytes.
 38 | 	size_t numBytes = NUM_PIXELS*NUM_CHANNELS*sizeof(unsigned char);
 39 | 
 40 | 	// Allocate host input/output vectors, giving up early if the host is out of memory
 41 | 	unsigned char *h_input, *h_output, *h_output_ref;
 42 | 	h_input = (unsigned char*) malloc(numBytes);
 43 | 	h_output = (unsigned char*) malloc(numBytes);
 44 | 	h_output_ref = (unsigned char*) malloc(numBytes);
 45 | 	if(h_input == NULL || h_output == NULL || h_output_ref == NULL){
 46 | 		fprintf(stderr, "Host allocation failed\n"); fflush(stderr);
 47 | 		exit(-1);
 48 | 	}
 49 | 
 50 | 	printf("Initializing input.\n"); fflush(stdout);
 51 | 	// Initialize the input with random stuff
 52 | 	for(size_t i = 0; i < NUM_PIXELS*NUM_CHANNELS; i++){
 53 | 		h_input[i] = rand() % 256;
 54 | 	}
 55 | 	printf("Input initialized.\n"); fflush(stdout);
 56 | 
 57 | 	// Allocate device input/output vectors and copy the input data to the device
 58 | 	unsigned char *d_input, *d_output;
 59 | 	cudaMalloc((void**) &d_input, numBytes);
 60 | 	cudaMalloc((void**) &d_output, numBytes);
 61 | 	cudaMemcpy(d_input, h_input, numBytes, cudaMemcpyHostToDevice);
 62 | 	printf("Data copied to device.\n"); fflush(stdout);
 63 | 
 64 | 	dim3 dimGrid(ceil(WIDTH/(float) BLOCK_SIZE), ceil(HEIGHT/ (float) BLOCK_SIZE));
 65 | 	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
 66 | 	double start, stop;
 67 | 	// In principle we should also include in the timing the time to copy data to/from the device,
 68 | 	// we do not do it here mostly to show the difference in time between the version with/without shared memory
 69 | 	GET_TIME(start);
 70 | 	blurKernel<<<dimGrid, dimBlock>>>(d_input, d_output, WIDTH, HEIGHT);
 71 | 	cudaDeviceSynchronize();
 72 | 	GET_TIME(stop);
 73 | 	printf("Runtime: %lf seconds\n", stop - start); fflush(stdout);
 74 | 
 75 | 	// Copy the output data from the device to the host
 76 | 	cudaMemcpy(h_output, d_output, numBytes, cudaMemcpyDeviceToHost);
 77 | 	printf("Output retrieved.\n"); fflush(stdout);
 78 | 
 79 | 	// Now we check that the result computed by the device is correct
 80 | 	// Do the same blurring on the host
 81 | 	for(int row = 0; row < HEIGHT; row++){
 82 | 		for(int col = 0; col < WIDTH; col++){
 83 | 			int pixVal = 0;
 84 | 			int pixels = 0;
 85 | 			for(int blurRow = -BLUR_SIZE; blurRow <= BLUR_SIZE; ++blurRow){
 86 | 				for(int blurCol = -BLUR_SIZE; blurCol <= BLUR_SIZE; ++blurCol){
 87 | 					int curRow = row + blurRow;
 88 | 					int curCol = col + blurCol;
 89 | 					if(curRow >= 0 && curRow < HEIGHT && curCol >= 0 && curCol < WIDTH){
 90 | 						pixVal += h_input[curRow*WIDTH + curCol];
 91 | 						pixels++;
 92 | 					}
 93 | 				}
 94 | 			}
 95 | 			h_output_ref[row*WIDTH + col] = (unsigned char) (pixVal / pixels);
 96 | 		}
 97 | 	}
 98 | 	printf("Reference result computed\n"); fflush(stdout);
 99 | 	// Now check that the content of h_output is equal to h_output_ref
100 | 	for(size_t i = 0; i < NUM_PIXELS*NUM_CHANNELS; i++){
101 | 		if(h_output_ref[i] != h_output[i]){
102 | 			fprintf(stderr, "Outputs differ at index %zu (%d vs. %d)\n", i, h_output_ref[i], h_output[i]); fflush(stderr); // %zu: i is a size_t
103 | 			exit(-1);
104 | 		}
105 | 	}
106 | 
107 | 	printf("Everything is fine\n"); fflush(stdout);
108 | 
109 | 	cudaFree(d_input);
110 | 	cudaFree(d_output);
111 | 	free(h_input); free(h_output); free(h_output_ref);   // release the host buffers as well
112 | 	return 0;
113 | }
114 | 


--------------------------------------------------------------------------------
/lec19/image_blur_shared.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda.h>
  2 | #include <assert.h>
  3 | #include <stdio.h>
  4 | #include "../lec13/my_timer.h"
  5 | 
  6 | #define HEIGHT 8192
  7 | #define WIDTH 8192
  8 | #define NUM_PIXELS (HEIGHT*WIDTH)
  9 | #define NUM_CHANNELS 1
 10 | 
 11 | __global__ void blurKernel(unsigned char* in, unsigned char* out, int w, int h){
 12 | 	// Box blur in which each block first stages its own pixels in shared memory.
 13 | 	// ATTENTION: the tile is a static 2D array, so 2D indexing can be used; this
 14 | 	// relies on the kernel being launched with BLOCK_SIZE x BLOCK_SIZE blocks.
 15 | 	__shared__ unsigned char in_shared[BLOCK_SIZE][BLOCK_SIZE];
 16 | 	int col = blockIdx.x*blockDim.x + threadIdx.x;
 17 | 	int row = blockIdx.y*blockDim.y + threadIdx.y;
 18 | 	bool inImage = (col < w && row < h);
 19 | 
 20 | 	// Each in-image thread copies its element from global to shared memory
 21 | 	if(inImage){
 22 | 		in_shared[threadIdx.y][threadIdx.x] = in[row*w + col];
 23 | 	}
 24 | 	// The barrier has to be executed by every thread of the block, so it must not
 25 | 	// sit inside the bounds check: in a block that straddles the image edge only
 26 | 	// part of the threads would reach it.
 27 | 	__syncthreads();
 28 | 	if(!inImage) return;
 29 | 
 30 | 	int pixVal = 0;
 31 | 	int pixels = 0;
 32 | 	for(int blurRow = -BLUR_SIZE; blurRow <= BLUR_SIZE; ++blurRow){
 33 | 		for(int blurCol = -BLUR_SIZE; blurCol <= BLUR_SIZE; ++blurCol){
 34 | 			int curRow = row + blurRow;
 35 | 			int curCol = col + blurCol;
 36 | 			if(curRow < 0 || curRow >= h || curCol < 0 || curCol >= w) continue;
 37 | 			// Dividing by BLOCK_SIZE gives the block a pixel belongs to. A neighbour
 38 | 			// owned by a different block is read from global memory; one owned by
 39 | 			// this block is already in shared memory.
 40 | 			if(curRow / BLOCK_SIZE != row / BLOCK_SIZE || curCol / BLOCK_SIZE != col / BLOCK_SIZE){
 41 | 				pixVal += in[curRow*w + curCol];
 42 | 			}else{
 43 | 				pixVal += in_shared[threadIdx.y + blurRow][threadIdx.x + blurCol];
 44 | 			}
 45 | 			pixels++;
 46 | 		}
 47 | 	}
 48 | 
 49 | 	out[row*w + col] = (unsigned char) (pixVal / pixels);
 50 | }
 51 | 
 52 | int main(int argc, char** argv){
 53 | 	// We do not actually load an image
 54 | 	// In principle, we should load an image from a file into a host buffer
 55 | 	// and then copy it to a device buffer.
 56 | 	// Instead, we create an 'image' made of random bytes.
 57 | 	size_t numBytes = NUM_PIXELS*NUM_CHANNELS*sizeof(unsigned char);
 58 | 
 59 | 	// Allocate host input/output vectors, giving up early if the host is out of memory
 60 | 	unsigned char *h_input, *h_output, *h_output_ref;
 61 | 	h_input = (unsigned char*) malloc(numBytes);
 62 | 	h_output = (unsigned char*) malloc(numBytes);
 63 | 	h_output_ref = (unsigned char*) malloc(numBytes);
 64 | 	if(h_input == NULL || h_output == NULL || h_output_ref == NULL){
 65 | 		fprintf(stderr, "Host allocation failed\n"); fflush(stderr);
 66 | 		exit(-1);
 67 | 	}
 68 | 
 69 | 	printf("Initializing input.\n"); fflush(stdout);
 70 | 	// Initialize the input with random stuff
 71 | 	for(size_t i = 0; i < NUM_PIXELS*NUM_CHANNELS; i++){
 72 | 		h_input[i] = rand() % 256;
 73 | 	}
 74 | 	printf("Input initialized.\n"); fflush(stdout);
 75 | 
 76 | 	// Allocate device input/output vectors and copy the input data to the device
 77 | 	unsigned char *d_input, *d_output;
 78 | 	cudaMalloc((void**) &d_input, numBytes);
 79 | 	cudaMalloc((void**) &d_output, numBytes);
 80 | 	cudaMemcpy(d_input, h_input, numBytes, cudaMemcpyHostToDevice);
 81 | 	printf("Data copied to device.\n"); fflush(stdout);
 82 | 
 83 | 	dim3 dimGrid(ceil(WIDTH/(float) BLOCK_SIZE), ceil(HEIGHT/(float) BLOCK_SIZE));
 84 | 	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
 85 | 	double start, stop;
 86 | 	// In principle we should also include in the timing the time to copy data to/from the device,
 87 | 	// we do not do it here mostly to show the difference in time between the version with/without shared memory
 88 | 	GET_TIME(start);
 89 | 	blurKernel<<<dimGrid, dimBlock>>>(d_input, d_output, WIDTH, HEIGHT);
 90 | 	cudaDeviceSynchronize();
 91 | 	GET_TIME(stop);
 92 | 	printf("Runtime: %lf seconds\n", stop - start); fflush(stdout);
 93 | 
 94 | 	// Copy the output data from the device to the host
 95 | 	cudaMemcpy(h_output, d_output, numBytes, cudaMemcpyDeviceToHost);
 96 | 	printf("Output retrieved.\n"); fflush(stdout);
 97 | 
 98 | 	// Now we check that the result computed by the device is correct
 99 | 	// Do the same blurring on the host
100 | 	for(int row = 0; row < HEIGHT; row++){
101 | 		for(int col = 0; col < WIDTH; col++){
102 | 			int pixVal = 0;
103 | 			int pixels = 0;
104 | 			for(int blurRow = -BLUR_SIZE; blurRow <= BLUR_SIZE; ++blurRow){
105 | 				for(int blurCol = -BLUR_SIZE; blurCol <= BLUR_SIZE; ++blurCol){
106 | 					int curRow = row + blurRow;
107 | 					int curCol = col + blurCol;
108 | 					if(curRow >= 0 && curRow < HEIGHT && curCol >= 0 && curCol < WIDTH){
109 | 						pixVal += h_input[curRow*WIDTH + curCol];
110 | 						pixels++;
111 | 					}
112 | 				}
113 | 			}
114 | 			h_output_ref[row*WIDTH + col] = (unsigned char) (pixVal / pixels);
115 | 		}
116 | 	}
117 | 	printf("Reference result computed\n"); fflush(stdout);
118 | 	// Now check that the content of h_output is equal to h_output_ref
119 | 	for(size_t i = 0; i < NUM_PIXELS*NUM_CHANNELS; i++){
120 | 		if(h_output_ref[i] != h_output[i]){
121 | 			fprintf(stderr, "Outputs differ at index %zu (%d vs. %d)\n", i, h_output_ref[i], h_output[i]); fflush(stderr); // %zu: i is a size_t
122 | 			exit(-1);
123 | 		}
124 | 	}
125 | 
126 | 	printf("Everything is fine\n"); fflush(stdout);
127 | 
128 | 	cudaFree(d_input);
129 | 	cudaFree(d_output);
130 | 	free(h_input); free(h_output); free(h_output_ref);   // release the host buffers as well
131 | 	return 0;
132 | }
133 | 


--------------------------------------------------------------------------------
/lec19/jacobi.cu:
--------------------------------------------------------------------------------
  1 | #include <sys/time.h>
  2 | #include <cstdio>
  3 | #include "error_checks.h"
  4 | 
  5 | // Change this to 0 if CPU reference result is not needed
  6 | #define COMPUTE_CPU_REFERENCE 1
  7 | #define MAX_ITERATIONS 3000
  8 | #define GPU_VERSION 0 // TODO: Change this to 1 to enable the GPU version
  9 | 
 10 | // CPU kernel: one Jacobi sweep over the interior points of the N x N grid.
 11 | // phi receives the update computed from phiPrev and source; the outermost
 12 | // rows and columns of phi are left untouched.
 13 | void sweepCPU(double* phi, const double *phiPrev, const double *source, 
 14 |               double h2, int N)
 15 | { 
 16 |     for (int row = 1; row < N-1; row++) {
 17 |         const int base = row * N;                // index of the first cell of this row
 18 |         for (int col = 1; col < N-1; col++) {
 19 |             const int    centre  = base + col;
 20 |             const double left    = phiPrev[centre - 1];
 21 |             const double right   = phiPrev[centre + 1];
 22 |             const double prevRow = phiPrev[centre - N];
 23 |             const double nextRow = phiPrev[centre + N];
 24 | 
 25 |             // Neighbours are summed in the order left, right, previous row, next row
 26 |             phi[centre] = 0.25 * (left + right + prevRow + nextRow - h2 * source[centre]);
 27 |         } 
 28 |     }
 29 | } 
 30 | 
 31 | // GPU kernel (exercise stub: the body is intentionally left for the student to implement)
 32 | __global__ 
 33 | void sweepGPU(double *phi, const double *phiPrev, const double *source, 
 34 |               double h2, int N)
 35 | {
 36 |     // TODO: Add here the GPU implementation
 37 | }
 38 | 
 39 | // Mean absolute difference between two arrays of N*N doubles.
 40 | double compareArrays(const double *a, const double *b, int N)
 41 | {
 42 |     const int count = N*N;     // number of elements in each array
 43 |     double total = 0.0;
 44 | 
 45 |     for (int k = 0; k < count; k++)
 46 |         total += fabs(a[k] - b[k]);
 47 |     return total/count;
 48 | }
 49 | 
 50 | // Relative change between two iterates: sqrt(sum((phi-phiPrev)^2) / sum(phi^2)).
 51 | double diffCPU(const double *phi, const double *phiPrev, int N)
 52 | {
 53 |     const int count = N*N;    // number of grid points
 54 |     double normSq = 0;        // running sum of phi[k]^2
 55 |     double changeSq = 0;      // running sum of (phi[k] - phiPrev[k])^2
 56 | 
 57 |     for (int k = 0; k < count; k++) {
 58 |         const double delta = phi[k] - phiPrev[k];
 59 |         changeSq += delta * delta;
 60 |         normSq += phi[k] * phi[k];
 61 |     }
 62 |     return sqrt(changeSq/normSq);
 63 | }
 64 | 
 65 | // Runs and times the Jacobi sweeps on the CPU and, when GPU_VERSION is enabled, on the GPU.
 66 | int main() 
 67 | { 
 68 |     timeval t1, t2; // Structs for timing
 69 |     const int N = 512;
 70 |     double h = 1.0 / (N - 1);
 71 |     int iterations;
 72 |     const double tolerance = 5e-4; // Stopping condition
 73 |     int i, j, index;
 74 | 
 75 |     const int blocksize = 16;
 76 |   
 77 |     double *phi      = new double[N*N]; 
 78 |     double *phiPrev  = new double[N*N]; 
 79 |     double *source   = new double[N*N]; 
 80 |     double *phi_cuda = new double[N*N]; 
 81 | 
 82 |     double *phi_d, *phiPrev_d, *source_d; 
 83 |     // Size of the arrays in bytes
 84 |     const int size = N*N*sizeof(double); 
 85 |     double diff;
 86 |   
 87 |     // Source initialization
 88 |     for (i = 0; i < N; i++) {
 89 |         for (j = 0; j < N; j++) {      
 90 |             double x, y;
 91 |             x = (i - N / 2) * h;
 92 |             y = (j - N / 2) * h;
 93 |             index = j + i * N;
 94 |             if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1)
 95 |                 source[index] = 1e10*h*h;
 96 |             else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1)
 97 |                 source[index] = -1e10*h*h;
 98 |             else
 99 |                 source[index] = 0.0;
100 |         }            
101 |     }
102 | 
103 |     CUDA_CHECK( cudaMalloc( (void**)&source_d, size) ); 
104 |     CUDA_CHECK( cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice) ); 
105 | 
106 |     // Reset values to zero
107 |     for (i = 0; i < N; i++) {
108 |         for (j = 0; j < N; j++) {      
109 |             index = j + i * N;
110 |             phi[index] = 0.0; 
111 |             phiPrev[index] = 0.0; 
112 |         }            
113 |     }
114 | 
115 |     CUDA_CHECK( cudaMalloc( (void**)&phi_d, size) ); 
116 |     CUDA_CHECK( cudaMalloc( (void**)&phiPrev_d, size) ); 
117 |     CUDA_CHECK( cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice) );
118 |     CUDA_CHECK( cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice) );
119 | 
120 |     // CPU version 
121 |     if(COMPUTE_CPU_REFERENCE) { 
122 |         gettimeofday(&t1, NULL);
123 | 
124 |         // Do sweeps until difference is under the tolerance
125 |         diff = tolerance * 2;
126 |         iterations = 0;
127 |         while (diff > tolerance && iterations < MAX_ITERATIONS) {
128 |             sweepCPU(phiPrev, phi, source, h * h, N);
129 |             sweepCPU(phi, phiPrev, source, h * h, N);
130 |             
131 |             iterations += 2;
132 |             if (iterations % 100 == 0) {
133 |                 diff = diffCPU(phi, phiPrev, N);
134 |                 printf("%d %g\n", iterations, diff);
135 |             }
136 |         }
137 |         gettimeofday(&t2, NULL);
138 |         printf("CPU Jacobi: %g seconds, %d iterations\n", 
139 |                t2.tv_sec - t1.tv_sec + 
140 |                (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations);
141 |     }
142 | 
143 | 
144 | #if GPU_VERSION
145 |     // GPU version
146 | 
147 |     dim3 dimBlock(blocksize, blocksize); 
148 |     dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize); 
149 |     
150 |     //do sweeps until diff under tolerance
151 |     diff = tolerance * 2;
152 |     iterations = 0;
153 | 
154 |     gettimeofday(&t1, NULL);
155 | 
156 |     while (diff > tolerance && iterations < MAX_ITERATIONS) {
157 |         // See above how the CPU update kernel is called
158 |         // and implement similar calling sequence for the GPU code
159 | 
160 |         //// Add routines here
161 |         // TODO: Add GPU kernel calls here (see CPU version above)
162 | 
163 |         iterations += 2;
164 |         
165 |         if (iterations % 100 == 0) {
166 |             // TODO: Add GPU kernel calls here (see CPU version above)
167 |             CHECK_ERROR_MSG("Difference computation");
168 |             printf("%d %g\n", iterations, diff);
169 |         }
170 |     }
171 |     
172 |     //// Add here the routine to copy back the results
173 |     //TODO: Copy back the results
174 | 
175 |     gettimeofday(&t2, NULL);
176 |     printf("GPU Jacobi: %g seconds, %d iterations\n", 
177 |            t2.tv_sec - t1.tv_sec + 
178 |            (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations);
179 | 
180 |     //// Add here the clean up code for all allocated CUDA resources
181 |     // TODO: Add here the clean up code
182 | #endif
183 |     // phi_cuda is only written by the GPU branch; without it the buffer is uninitialized
184 |     if (COMPUTE_CPU_REFERENCE && GPU_VERSION) {
185 |         printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N));
186 |     }
187 |     
188 |     delete[] phi; 
189 |     delete[] phi_cuda;
190 |     delete[] phiPrev; 
191 |     delete[] source; 
192 |     
193 |     return EXIT_SUCCESS; 
194 | } 


--------------------------------------------------------------------------------
/lec19/jacobi_solution.cu:
--------------------------------------------------------------------------------
  1 | #include <sys/time.h>
  2 | #include <cstdio>
  3 | #include "error_checks.h"
  4 | 
  5 | // Change this to 0 if CPU reference result is not needed
  6 | #define COMPUTE_CPU_REFERENCE 1
  7 | #define MAX_ITERATIONS 3000
  8 | 
  9 | // CPU kernel: one Jacobi sweep over the interior points of the N x N grid.
 10 | // phi receives the update computed from phiPrev and source; the outermost
 11 | // rows and columns of phi are left untouched.
 12 | void sweepCPU(double* phi, const double *phiPrev, const double *source, 
 13 |               double h2, int N)
 14 | { 
 15 |     for (int row = 1; row < N-1; row++) {
 16 |         const int base = row * N;                // index of the first cell of this row
 17 |         for (int col = 1; col < N-1; col++) {
 18 |             const int    centre  = base + col;
 19 |             const double left    = phiPrev[centre - 1];
 20 |             const double right   = phiPrev[centre + 1];
 21 |             const double prevRow = phiPrev[centre - N];
 22 |             const double nextRow = phiPrev[centre + N];
 23 | 
 24 |             // Neighbours are summed in the order left, right, previous row, next row
 25 |             phi[centre] = 0.25 * (left + right + prevRow + nextRow - h2 * source[centre]);
 26 |         } 
 27 |     }
 28 | } 
 29 | 
 30 | // GPU kernel: each thread updates at most one interior point of the N x N grid,
 31 | // using the same formula as sweepCPU.
 32 | __global__ 
 33 | void sweepGPU(double *phi, const double *phiPrev, const double *source, 
 34 |               double h2, int N)
 35 | {
 36 |     const int col = blockIdx.x * blockDim.x + threadIdx.x;
 37 |     const int row = blockIdx.y * blockDim.y + threadIdx.y;
 38 | 
 39 |     // Boundary points and threads outside the grid have nothing to update
 40 |     if (col <= 0 || row <= 0 || col >= N-1 || row >= N-1)
 41 |         return;
 42 | 
 43 |     const int centre = col + row*N;
 44 |     const double left    = phiPrev[centre - 1];
 45 |     const double right   = phiPrev[centre + 1];
 46 |     const double prevRow = phiPrev[centre - N];
 47 |     const double nextRow = phiPrev[centre + N];
 48 |     phi[centre] = 0.25 * (left + right + prevRow + nextRow - h2 * source[centre]);
 49 | }
 50 | 
 51 | // Mean absolute difference between two arrays of N*N doubles.
 52 | double compareArrays(const double *a, const double *b, int N)
 53 | {
 54 |     const int count = N*N;     // number of elements in each array
 55 |     double total = 0.0;
 56 | 
 57 |     for (int k = 0; k < count; k++)
 58 |         total += fabs(a[k] - b[k]);
 59 |     return total/count;
 60 | }
 61 | 
 62 | // Relative change between two iterates: sqrt(sum((phi-phiPrev)^2) / sum(phi^2)).
 63 | double diffCPU(const double *phi, const double *phiPrev, int N)
 64 | {
 65 |     const int count = N*N;    // number of grid points
 66 |     double normSq = 0;        // running sum of phi[k]^2
 67 |     double changeSq = 0;      // running sum of (phi[k] - phiPrev[k])^2
 68 | 
 69 |     for (int k = 0; k < count; k++) {
 70 |         const double delta = phi[k] - phiPrev[k];
 71 |         changeSq += delta * delta;
 72 |         normSq += phi[k] * phi[k];
 73 |     }
 74 |     return sqrt(changeSq/normSq);
 75 | }
 76 | 
 77 | __global__
 78 | void diffGPU(const double *phi, const double *phiPrev, int N, double* sum, double* diffsum)
 79 | {
 80 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
 81 |     int j = blockIdx.y * blockDim.y + threadIdx.y;
 82 |     int index = i + j*N;
 83 |     atomicAdd(diffsum, (phi[index] - phiPrev[index]) * (phi[index] - phiPrev[index]));
 84 |     atomicAdd(sum, phi[index] * phi[index]);
 85 | }
 86 | 
 87 | 
// Driver: builds the source term on an N x N grid, optionally runs the
// sweeps on the CPU as a reference, runs the same sweeps on the GPU, prints
// timings for both, and (when the CPU reference was computed) prints the
// average difference between the CPU and GPU results.
int main() 
{ 
    timeval t1, t2; // Structs for timing
    const int N = 512;
    double h = 1.0 / (N - 1);
    int iterations;
    const double tolerance = 5e-4; // Stopping condition
    int i, j, index;

    // Threads per block edge; the kernels are launched with blocksize x blocksize blocks
    const int blocksize = 16;
  
    // Host arrays; phi_cuda receives the GPU result copied back from the device
    double *phi      = new double[N*N]; 
    double *phiPrev  = new double[N*N]; 
    double *source   = new double[N*N]; 
    double *phi_cuda = new double[N*N]; 

    // Device counterparts of phi, phiPrev and source
    double *phi_d, *phiPrev_d, *source_d; 
    // Size of the arrays in bytes
    const int size = N*N*sizeof(double); 
    double diff;
  
    // Source initialization: two discs of radius 0.1 centred at
    // (x, y) = (+0.25, 0) and (-0.25, 0) with opposite signs, zero elsewhere
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {      
            double x, y;
            x = (i - N / 2) * h;
            y = (j - N / 2) * h;
            index = j + i * N;
            if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1)
                source[index] = 1e10*h*h;
            else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1)
                source[index] = -1e10*h*h;
            else
                source[index] = 0.0;
        }            
    }

    CUDA_CHECK( cudaMalloc( (void**)&source_d, size) ); 
    CUDA_CHECK( cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice) );
    // Single-value host/device pairs used as accumulators for the GPU
    // convergence check
    double sum_h, diffsum_h, *sum_d, *diffsum_d;
    CUDA_CHECK( cudaMalloc( (void**)&sum_d, sizeof(double)) );
    CUDA_CHECK( cudaMalloc( (void**)&diffsum_d, sizeof(double)) );

    // Reset values to zero
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {      
            index = j + i * N;
            phi[index] = 0.0; 
            phiPrev[index] = 0.0; 
        }            
    }

    // The device copies are taken here, before the CPU run modifies
    // phi/phiPrev, so the GPU run also starts from all zeros
    CUDA_CHECK( cudaMalloc( (void**)&phi_d, size) ); 
    CUDA_CHECK( cudaMalloc( (void**)&phiPrev_d, size) ); 
    CUDA_CHECK( cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice) );
    CUDA_CHECK( cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice) );

    // CPU version 
    if(COMPUTE_CPU_REFERENCE) { 
        gettimeofday(&t1, NULL);

        // Do sweeps until difference is under the tolerance
        diff = tolerance * 2;
        iterations = 0;
        while (diff > tolerance && iterations < MAX_ITERATIONS) {
            // Two sweeps per pass with the roles of the arrays swapped,
            // so the newest values end up in phi
            sweepCPU(phiPrev, phi, source, h * h, N);
            sweepCPU(phi, phiPrev, source, h * h, N);
            
            iterations += 2;
            // The convergence measure is only evaluated every 100 iterations
            if (iterations % 100 == 0) {
                diff = diffCPU(phi, phiPrev, N);
                printf("%d %g\n", iterations, diff);
            }
        }
        gettimeofday(&t2, NULL);
        printf("CPU Jacobi: %g seconds, %d iterations\n", 
               t2.tv_sec - t1.tv_sec + 
               (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations);
    }

    // GPU version

    // Grid size is rounded up so that the blocks cover all N x N points
    dim3 dimBlock(blocksize, blocksize); 
    dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize); 
    
    //do sweeps until diff under tolerance
    diff = tolerance * 2;
    iterations = 0;

    gettimeofday(&t1, NULL);

    while (diff > tolerance && iterations < MAX_ITERATIONS) {
        // See above how the CPU update kernel is called
        // and implement similar calling sequence for the GPU code

        //// Add routines here
        sweepGPU<<<dimGrid, dimBlock>>>(phiPrev_d, phi_d, source_d, h*h, N); 
        sweepGPU<<<dimGrid, dimBlock>>>(phi_d, phiPrev_d, source_d, h*h, N); 
        CHECK_ERROR_MSG("Jacobi kernels");
        iterations += 2;
        
        if (iterations % 100 == 0) {
	    // Reinitialize sum_d and diffsum_d to 0
	        sum_h = 0; diffsum_h = 0;
	        CUDA_CHECK( cudaMemcpy(sum_d, &sum_h, sizeof(double), cudaMemcpyHostToDevice) );
	        CUDA_CHECK( cudaMemcpy(diffsum_d, &diffsum_h, sizeof(double), cudaMemcpyHostToDevice) );
	    
            // NOTE(review): the arrays are passed as (phiPrev_d, phi_d), the
            // reverse of the CPU call diffCPU(phi, phiPrev, N), so the
            // normalising sum is taken over phiPrev_d here - confirm intended
            diffGPU<<<dimGrid, dimBlock>>>(phiPrev_d, phi_d, N, sum_d, diffsum_d);
            CUDA_CHECK( cudaMemcpy(&sum_h, sum_d, sizeof(double), cudaMemcpyDeviceToHost) );
	        CUDA_CHECK( cudaMemcpy(&diffsum_h, diffsum_d, sizeof(double), cudaMemcpyDeviceToHost) );
	        diff = sqrt(diffsum_h/sum_h);
            CHECK_ERROR_MSG("Difference computation");
            printf("%d %g\n", iterations, diff);
        }
    }
    
    // The copy back to the host happens before the second timestamp, so it
    // is included in the reported GPU time
    CUDA_CHECK( cudaMemcpy(phi_cuda, phi_d, size, cudaMemcpyDeviceToHost) ); 

    gettimeofday(&t2, NULL);
    printf("GPU Jacobi: %g seconds, %d iterations\n", 
           t2.tv_sec - t1.tv_sec + 
           (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations);

    //// Add here the clean up code for all allocated CUDA resources
    CUDA_CHECK( cudaFree(phi_d) ); 
    CUDA_CHECK( cudaFree(phiPrev_d) );
    CUDA_CHECK( cudaFree(source_d) );
    CUDA_CHECK( cudaFree(sum_d) );
    CUDA_CHECK( cudaFree(diffsum_d) );

    // phi only holds a CPU result when the reference run was enabled
    if (COMPUTE_CPU_REFERENCE) {
        printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N));
    }
    
    delete[] phi; 
    delete[] phi_cuda;
    delete[] phiPrev; 
    delete[] source; 
    
    return EXIT_SUCCESS; 
} 


--------------------------------------------------------------------------------
/lec19/test_cuda.cu:
--------------------------------------------------------------------------------
 1 | #include <cuda_runtime.h>
 2 | #include <stdio.h>
 3 | 
 4 | int main() {
 5 |     int count;
 6 |     cudaError_t err = cudaGetDeviceCount(&count);
 7 | 
 8 |     if (err != cudaSuccess) {
 9 |         printf("CUDA Error: %s\n", cudaGetErrorString(err));
10 |         return -1;
11 |     }
12 | 
13 |     printf("Number of CUDA devices: %d\n", count);
14 |     return 0;
15 | }
16 | 
17 | 


--------------------------------------------------------------------------------
/lec19/vector_add.cu:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | /**
 29 |  * Vector addition: C = A + B.
 30 |  *
 31 |  * This sample is a very basic sample that implements element by element
 32 |  * vector addition. It is the same as the sample illustrating Chapter 2
 33 |  * of the programming guide with some additions like error checking.
 34 |  */
 35 | 
 36 | #include <stdio.h>
 37 | #include <cuda.h>
 38 | 
 39 | /**
 40 |  * CUDA Kernel Device code
 41 |  *
 42 |  * Computes the vector addition of A and B into C. The 3 vectors have the same
 43 |  * number of elements numElements.
 44 |  */
 45 | __global__ void vectorAdd(const float *A, const float *B, float *C,
 46 |                           int numElements) {
 47 |   int i = blockDim.x * blockIdx.x + threadIdx.x;
 48 | 
 49 |   if (i < numElements) {
 50 |     C[i] = A[i] + B[i] + 0.0f;
 51 |   }
 52 | }
 53 | 
 54 | /**
 55 |  * Host main routine
 56 |  */
 57 | int main(void) {
 58 |   // Error code to check return values for CUDA calls
 59 |   cudaError_t err = cudaSuccess;
 60 | 
 61 |   // Print the vector length to be used, and compute its size
 62 |   int numElements = 50000;
 63 |   size_t size = numElements * sizeof(float);
 64 |   printf("[Vector addition of %d elements]\n", numElements);
 65 | 
 66 |   // Allocate the host input vector A
 67 |   float *h_A = (float *)malloc(size);
 68 | 
 69 |   // Allocate the host input vector B
 70 |   float *h_B = (float *)malloc(size);
 71 | 
 72 |   // Allocate the host output vector C
 73 |   float *h_C = (float *)malloc(size);
 74 | 
 75 |   // Verify that allocations succeeded
 76 |   if (h_A == NULL || h_B == NULL || h_C == NULL) {
 77 |     fprintf(stderr, "Failed to allocate host vectors!\n");
 78 |     exit(EXIT_FAILURE);
 79 |   }
 80 | 
 81 |   // Initialize the host input vectors
 82 |   for (int i = 0; i < numElements; ++i) {
 83 |     h_A[i] = rand() / (float)RAND_MAX;
 84 |     h_B[i] = rand() / (float)RAND_MAX;
 85 |   }
 86 | 
 87 |   // Allocate the device input vector A
 88 |   float *d_A = NULL;
 89 |   err = cudaMalloc((void **)&d_A, size);
 90 | 
 91 |   if (err != cudaSuccess) {
 92 |     fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
 93 |             cudaGetErrorString(err));
 94 |     exit(EXIT_FAILURE);
 95 |   }
 96 | 
 97 |   // Allocate the device input vector B
 98 |   float *d_B = NULL;
 99 |   err = cudaMalloc((void **)&d_B, size);
100 | 
101 |   if (err != cudaSuccess) {
102 |     fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
103 |             cudaGetErrorString(err));
104 |     exit(EXIT_FAILURE);
105 |   }
106 | 
107 |   // Allocate the device output vector C
108 |   float *d_C = NULL;
109 |   err = cudaMalloc((void **)&d_C, size);
110 | 
111 |   if (err != cudaSuccess) {
112 |     fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
113 |             cudaGetErrorString(err));
114 |     exit(EXIT_FAILURE);
115 |   }
116 | 
117 |   // Copy the host input vectors A and B in host memory to the device input
118 |   // vectors in
119 |   // device memory
120 |   printf("Copy input data from the host memory to the CUDA device\n");
121 |   err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
122 | 
123 |   if (err != cudaSuccess) {
124 |     fprintf(stderr,
125 |             "Failed to copy vector A from host to device (error code %s)!\n",
126 |             cudaGetErrorString(err));
127 |     exit(EXIT_FAILURE);
128 |   }
129 | 
130 |   err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
131 | 
132 |   if (err != cudaSuccess) {
133 |     fprintf(stderr,
134 |             "Failed to copy vector B from host to device (error code %s)!\n",
135 |             cudaGetErrorString(err));
136 |     exit(EXIT_FAILURE);
137 |   }
138 | 
139 |   // Launch the Vector Add CUDA Kernel
140 |   int threadsPerBlock = 256;
141 |   int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
142 |   printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
143 |          threadsPerBlock);
144 |   vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
145 |   err = cudaGetLastError();
146 | 
147 |   if (err != cudaSuccess) {
148 |     fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
149 |             cudaGetErrorString(err));
150 |     exit(EXIT_FAILURE);
151 |   }
152 | 
153 |   // Copy the device result vector in device memory to the host result vector
154 |   // in host memory.
155 |   printf("Copy output data from the CUDA device to the host memory\n");
156 |   err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
157 | 
158 |   if (err != cudaSuccess) {
159 |     fprintf(stderr,
160 |             "Failed to copy vector C from device to host (error code %s)!\n",
161 |             cudaGetErrorString(err));
162 |     exit(EXIT_FAILURE);
163 |   }
164 | 
165 |   // Verify that the result vector is correct
166 |   for (int i = 0; i < numElements; ++i) {
167 |     if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
168 |       fprintf(stderr, "Result verification failed at element %d!\n", i);
169 |       exit(EXIT_FAILURE);
170 |     }
171 |   }
172 | 
173 |   printf("Test PASSED\n");
174 | 
175 |   // Free device global memory
176 |   err = cudaFree(d_A);
177 | 
178 |   if (err != cudaSuccess) {
179 |     fprintf(stderr, "Failed to free device vector A (error code %s)!\n",
180 |             cudaGetErrorString(err));
181 |     exit(EXIT_FAILURE);
182 |   }
183 | 
184 |   err = cudaFree(d_B);
185 | 
186 |   if (err != cudaSuccess) {
187 |     fprintf(stderr, "Failed to free device vector B (error code %s)!\n",
188 |             cudaGetErrorString(err));
189 |     exit(EXIT_FAILURE);
190 |   }
191 | 
192 |   err = cudaFree(d_C);
193 | 
194 |   if (err != cudaSuccess) {
195 |     fprintf(stderr, "Failed to free device vector C (error code %s)!\n",
196 |             cudaGetErrorString(err));
197 |     exit(EXIT_FAILURE);
198 |   }
199 | 
200 |   // Free host memory
201 |   free(h_A);
202 |   free(h_B);
203 |   free(h_C);
204 | 
205 |   printf("Done\n");
206 |   return 0;
207 | }
208 | 


--------------------------------------------------------------------------------
/projects/README.md:
--------------------------------------------------------------------------------
1 | The projects proposed in this folder have been taken, respectively, from:
2 | 
3 | - https://trasgo.infor.uva.es/sdm_downloads/k-means/
4 | - https://trasgo.infor.uva.es/sdm_downloads/dna-sequence-alignment/
5 | - https://trasgo.infor.uva.es/sdm_downloads/wind-tunnel-peachy-assignment/
6 | 
7 | 
8 | You can find instructions/rules for the exam in the PMC20 slides deck.
9 | If you have any doubts, send me an email.


--------------------------------------------------------------------------------
/projects/kmeans/KMEANS.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * k-Means clustering algorithm
  3 |  *
  4 |  * Reference sequential version (Do not modify this code)
  5 |  *
  6 |  * Parallel computing (Degree in Computer Engineering)
  7 |  * 2022/2023
  8 |  *
  9 |  * Version: 1.0
 10 |  *
 11 |  * (c) 2022 Diego García-Álvarez, Arturo Gonzalez-Escribano
 12 |  * Grupo Trasgo, Universidad de Valladolid (Spain)
 13 |  *
 14 |  * This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
 15 |  * https://creativecommons.org/licenses/by-sa/4.0/
 16 |  */
 17 | #include <stdio.h>
 18 | #include <stdlib.h>
 19 | #include <ctype.h>
 20 | #include <math.h>
 21 | #include <time.h>
 22 | #include <string.h>
 23 | #include <float.h>
 24 | 
 25 | #define MAXLINE 2000
 26 | #define MAXCAD 200
 27 | 
// Macros: minimum and maximum of two values.
// Each argument appears twice in the expansion, so an argument with side
// effects (for example i++) can be evaluated more than once.
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
 31 | 
 32 | /* 
 33 | Function showFileError: It displays the corresponding error during file reading.
 34 | */
 35 | void showFileError(int error, char* filename)
 36 | {
 37 | 	printf("Error\n");
 38 | 	switch (error)
 39 | 	{
 40 | 		case -1:
 41 | 			fprintf(stderr,"\tFile %s has too many columns.\n", filename);
 42 | 			fprintf(stderr,"\tThe maximum number of columns has been exceeded. MAXLINE: %d.\n", MAXLINE);
 43 | 			break;
 44 | 		case -2:
 45 | 			fprintf(stderr,"Error reading file: %s.\n", filename);
 46 | 			break;
 47 | 		case -3:
 48 | 			fprintf(stderr,"Error writing file: %s.\n", filename);
 49 | 			break;
 50 | 	}
 51 | 	fflush(stderr);	
 52 | }
 53 | 
 54 | /* 
 55 | Function readInput: It reads the file to determine the number of rows and columns.
 56 | */
 57 | int readInput(char* filename, int *lines, int *samples)
 58 | {
 59 |     FILE *fp;
 60 |     char line[MAXLINE] = "";
 61 |     char *ptr;
 62 |     const char *delim = "\t";
 63 |     int contlines, contsamples = 0;
 64 |     
 65 |     contlines = 0;
 66 | 
 67 |     if ((fp=fopen(filename,"r"))!=NULL)
 68 |     {
 69 |         while(fgets(line, MAXLINE, fp)!= NULL) 
 70 | 		{
 71 | 			if (strchr(line, '\n') == NULL)
 72 | 			{
 73 | 				return -1;
 74 | 			}
 75 |             contlines++;       
 76 |             ptr = strtok(line, delim);
 77 |             contsamples = 0;
 78 |             while(ptr != NULL)
 79 |             {
 80 |             	contsamples++;
 81 | 				ptr = strtok(NULL, delim);
 82 | 	    	}	    
 83 |         }
 84 |         fclose(fp);
 85 |         *lines = contlines;
 86 |         *samples = contsamples;  
 87 |         return 0;
 88 |     }
 89 |     else
 90 | 	{
 91 |     	return -2;
 92 | 	}
 93 | }
 94 | 
 95 | /* 
 96 | Function readInput2: It loads data from file.
 97 | */
 98 | int readInput2(char* filename, float* data)
 99 | {
100 |     FILE *fp;
101 |     char line[MAXLINE] = "";
102 |     char *ptr;
103 |     const char *delim = "\t";
104 |     int i = 0;
105 |     
106 |     if ((fp=fopen(filename,"rt"))!=NULL)
107 |     {
108 |         while(fgets(line, MAXLINE, fp)!= NULL)
109 |         {         
110 |             ptr = strtok(line, delim);
111 |             while(ptr != NULL)
112 |             {
113 |             	data[i] = atof(ptr);
114 |             	i++;
115 | 				ptr = strtok(NULL, delim);
116 | 	   		}
117 | 	    }
118 |         fclose(fp);
119 |         return 0;
120 |     }
121 |     else
122 | 	{
123 |     	return -2; //No file found
124 | 	}
125 | }
126 | 
/* 
Function writeResult: It writes in the output file the cluster of each sample (point).

One value of classMap per line, for the first `lines` entries.
Returns 0 on success, -3 if the output file cannot be opened.
*/
int writeResult(int *classMap, int lines, const char* filename)
{	
    FILE *out = fopen(filename,"wt");
    int row = 0;

    if (out == NULL)
    {
        return -3; //No file found
    }

    while (row < lines)
    {
        fprintf(out,"%d\n",classMap[row]);
        row++;
    }
    fclose(out);

    return 0;
}
149 | 
/*
Function initCentroids: Fills the centroid array from the input data.
For each of the K centroids, centroidPos gives the index of the data point
to copy; its `samples` values are copied into the matching row of centroids.
*/
void initCentroids(const float *data, float* centroids, int* centroidPos, int samples, int K)
{
	int c = 0;
	while (c < K)
	{
		const float *from = &data[centroidPos[c]*samples];
		float *to = &centroids[c*samples];
		memcpy(to, from, (samples*sizeof(float)));
		c++;
	}
}
165 | 
166 | /*
167 | Function euclideanDistance: Euclidean distance
168 | This function could be modified
169 | */
170 | float euclideanDistance(float *point, float *center, int samples)
171 | {
172 | 	float dist=0.0;
173 | 	for(int i=0; i<samples; i++) 
174 | 	{
175 | 		dist+= (point[i]-center[i])*(point[i]-center[i]);
176 | 	}
177 | 	dist = sqrt(dist);
178 | 	return(dist);
179 | }
180 | 
/*
Function zeroFloatMatriz: Set matrix elements to 0
The matrix is stored row by row in a flat array of rows*columns floats.
This function could be modified
*/
void zeroFloatMatriz(float *matrix, int rows, int columns)
{
	int r = 0;
	while (r < rows)
	{
		int c = 0;
		while (c < columns)
		{
			matrix[r*columns+c] = 0.0;
			c++;
		}
		r++;
	}
}
192 | 
/*
Function zeroIntArray: Set array elements to 0
Clears the first `size` entries of array.
This function could be modified
*/
void zeroIntArray(int *array, int size)
{
	int k = 0;
	while (k < size)
	{
		array[k] = 0;
		k++;
	}
}
203 | 
204 | 
205 | 
/*
Program entry point: reads the input points, picks K initial centroids,
iterates the assignment / centroid-update steps until one of the three
termination conditions is met, prints a per-iteration log and timings, and
writes the class assigned to each point to the output file.
*/
int main(int argc, char* argv[])
{

	//START CLOCK***************************************
	clock_t start, end;
	start = clock();
	//**************************************************
	/*
	* PARAMETERS
	*
	* argv[1]: Input data file
	* argv[2]: Number of clusters
	* argv[3]: Maximum number of iterations of the method. Algorithm termination condition.
	* argv[4]: Minimum percentage of class changes. Algorithm termination condition.
	*          If between one iteration and the next, the percentage of class changes is less than
	*          this percentage, the algorithm stops.
	* argv[5]: Precision in the centroid distance after the update.
	*          It is an algorithm termination condition. If between one iteration of the algorithm 
	*          and the next, the maximum distance between centroids is less than this precision, the
	*          algorithm stops.
	* argv[6]: Output file. Class assigned to each point of the input file.
	* */
	if(argc !=  7)
	{
		fprintf(stderr,"EXECUTION ERROR K-MEANS: Parameters are not correct.\n");
		fprintf(stderr,"./KMEANS [Input Filename] [Number of clusters] [Number of iterations] [Number of changes] [Threshold] [Output data file]\n");
		fflush(stderr);
		exit(-1);
	}

	// Reading the input data
	// lines = number of points; samples = number of dimensions per point
	int lines = 0, samples= 0;  
	
	// First pass over the file: only counts rows and columns
	int error = readInput(argv[1], &lines, &samples);
	if(error != 0)
	{
		showFileError(error,argv[1]);
		exit(error);
	}
	
	float *data = (float*)calloc(lines*samples,sizeof(float));
	if (data == NULL)
	{
		fprintf(stderr,"Memory allocation error.\n");
		exit(-4);
	}
	// Second pass over the file: loads the values into data
	error = readInput2(argv[1], data);
	if(error != 0)
	{
		showFileError(error,argv[1]);
		exit(error);
	}

	// Parameters
	int K=atoi(argv[2]); 
	int maxIterations=atoi(argv[3]);
	// The percentage given in argv[4] is converted to an absolute number of points
	int minChanges= (int)(lines*atof(argv[4])/100.0);
	float maxThreshold=atof(argv[5]);

	// classMap is zero-initialised by calloc; assigned classes start at 1
	// (see the loop below), so every point counts as a change in the first iteration
	int *centroidPos = (int*)calloc(K,sizeof(int));
	float *centroids = (float*)calloc(K*samples,sizeof(float));
	int *classMap = (int*)calloc(lines,sizeof(int));

    if (centroidPos == NULL || centroids == NULL || classMap == NULL)
	{
		fprintf(stderr,"Memory allocation error.\n");
		exit(-4);
	}

	// Initial centroids: K point indices drawn with a fixed seed
	// NOTE(review): rand()%lines divides by zero if the input file has no lines
	srand(0);
	int i;
	for(i=0; i<K; i++) 
		centroidPos[i]=rand()%lines;
	
	// Loading the array of initial centroids with the data from the array data
	// The centroids are points stored in the data array.
	initCentroids(data, centroids, centroidPos, samples, K);


	printf("\n\tData file: %s \n\tPoints: %d\n\tDimensions: %d\n", argv[1], lines, samples);
	printf("\tNumber of clusters: %d\n", K);
	printf("\tMaximum number of iterations: %d\n", maxIterations);
	printf("\tMinimum number of changes: %d [%g%% of %d points]\n", minChanges, atof(argv[4]), lines);
	printf("\tMaximum centroid precision: %f\n", maxThreshold);
	
	//END CLOCK*****************************************
	end = clock();
	printf("\nMemory allocation: %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
	fflush(stdout);
	//**************************************************
	//START CLOCK***************************************
	start = clock();
	//**************************************************
	// outputMsg accumulates one log line per iteration and is printed after the loop
	// NOTE(review): outputMsg is not checked for NULL and is not freed below
	char *outputMsg = (char *)calloc(10000,sizeof(char));
	char line[100];

	int j;
	int class;
	float dist, minDist;
	int it=0;
	int changes = 0;
	float maxDist;

	//pointPerClass: number of points classified in each class
	//auxCentroids: mean of the points in each class
	//distCentroids: distance each centroid moved in the current iteration
	int *pointsPerClass = (int *)malloc(K*sizeof(int));
	float *auxCentroids = (float*)malloc(K*samples*sizeof(float));
	float *distCentroids = (float*)malloc(K*sizeof(float)); 
	if (pointsPerClass == NULL || auxCentroids == NULL || distCentroids == NULL)
	{
		fprintf(stderr,"Memory allocation error.\n");
		exit(-4);
	}

/*
 *
 * START HERE: DO NOT CHANGE THE CODE ABOVE THIS POINT
 *
 */

	do{
		it++;
	
		//1. Calculate the distance from each point to the centroid
		//Assign each point to the nearest centroid.
		changes = 0;
		for(i=0; i<lines; i++)
		{
			// Classes are numbered from 1; on equal distances the lowest-numbered
			// centroid wins because the comparison below is strict
			class=1;
			minDist=FLT_MAX;
			for(j=0; j<K; j++)
			{
				dist=euclideanDistance(&data[i*samples], &centroids[j*samples], samples);

				if(dist < minDist)
				{
					minDist=dist;
					class=j+1;
				}
			}
			if(classMap[i]!=class)
			{
				changes++;
			}
			classMap[i]=class;
		}

		// 2. Recalculates the centroids: calculates the mean within each cluster
		zeroIntArray(pointsPerClass,K);
		zeroFloatMatriz(auxCentroids,K,samples);

		// Accumulate per-class point counts and coordinate sums
		for(i=0; i<lines; i++) 
		{
			class=classMap[i];
			pointsPerClass[class-1] = pointsPerClass[class-1] +1;
			for(j=0; j<samples; j++){
				auxCentroids[(class-1)*samples+j] += data[i*samples+j];
			}
		}

		// Turn the sums into means
		// NOTE(review): a class with no points divides by pointsPerClass[i] == 0
		for(i=0; i<K; i++) 
		{
			for(j=0; j<samples; j++){
				auxCentroids[i*samples+j] /= pointsPerClass[i];
			}
		}
		
		// Largest distance between an old centroid and its updated position.
		// FLT_MIN is the smallest positive normalised float, not a negative value.
		maxDist=FLT_MIN;
		for(i=0; i<K; i++){
			distCentroids[i]=euclideanDistance(&centroids[i*samples], &auxCentroids[i*samples], samples);
			if(distCentroids[i]>maxDist) {
				maxDist=distCentroids[i];
			}
		}
		memcpy(centroids, auxCentroids, (K*samples*sizeof(float)));
		
		// NOTE(review): each iteration appends to the 10000-byte outputMsg with
		// no bounds check, so a large number of iterations can overrun it
		sprintf(line,"\n[%d] Cluster changes: %d\tMax. centroid distance: %f", it, changes, maxDist);
		outputMsg = strcat(outputMsg,line);

	// The loop continues only while all three conditions hold
	} while((changes>minChanges) && (it<maxIterations) && (maxDist>maxThreshold));

/*
 *
 * STOP HERE: DO NOT CHANGE THE CODE BELOW THIS POINT
 *
 */
	// Output and termination conditions
	printf("%s",outputMsg);	

	//END CLOCK*****************************************
	end = clock();
	printf("\nComputation: %f seconds", (double)(end - start) / CLOCKS_PER_SEC);
	fflush(stdout);
	//**************************************************
	//START CLOCK***************************************
	start = clock();
	//**************************************************

	

	// Report the first termination condition that holds, checked in this order:
	// changes, iterations, centroid precision
	if (changes <= minChanges) {
		printf("\n\nTermination condition:\nMinimum number of changes reached: %d [%d]", changes, minChanges);
	}
	else if (it >= maxIterations) {
		printf("\n\nTermination condition:\nMaximum number of iterations reached: %d [%d]", it, maxIterations);
	}
	else {
		printf("\n\nTermination condition:\nCentroid update precision reached: %g [%g]", maxDist, maxThreshold);
	}	

	// Writing the classification of each point to the output file.
	error = writeResult(classMap, lines, argv[6]);
	if(error != 0)
	{
		showFileError(error, argv[6]);
		exit(error);
	}

	//Free memory
	free(data);
	free(classMap);
	free(centroidPos);
	free(centroids);
	free(distCentroids);
	free(pointsPerClass);
	free(auxCentroids);

	//END CLOCK*****************************************
	end = clock();
	printf("\n\nMemory deallocation: %f seconds\n", (double)(end - start) / CLOCKS_PER_SEC);
	fflush(stdout);
	//***************************************************/
	return 0;
}
442 | 


--------------------------------------------------------------------------------
/projects/kmeans/KMEANS_cuda.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * k-Means clustering algorithm
  3 |  *
  4 |  * CUDA version
  5 |  *
  6 |  * Parallel computing (Degree in Computer Engineering)
  7 |  * 2022/2023
  8 |  *
  9 |  * Version: 1.0
 10 |  *
 11 |  * (c) 2022 Diego García-Álvarez, Arturo Gonzalez-Escribano
 12 |  * Grupo Trasgo, Universidad de Valladolid (Spain)
 13 |  *
 14 |  * This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
 15 |  * https://creativecommons.org/licenses/by-sa/4.0/
 16 |  */
 17 | #include <stdio.h>
 18 | #include <stdlib.h>
 19 | #include <ctype.h>
 20 | #include <math.h>
 21 | #include <time.h>
 22 | #include <string.h>
 23 | #include <float.h>
 24 | #include <cuda.h>
 25 | 
 26 | 
 27 | #define MAXLINE 2000
 28 | #define MAXCAD 200
 29 | 
// Macros: minimum and maximum of two values.
// Each argument appears twice in the expansion, so an argument with side
// effects (for example i++) can be evaluated more than once.
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
 33 | 
/*
 * Macros to show errors when calling a CUDA library function,
 * or after launching a kernel.
 *
 * CHECK_CUDA_CALL( a ): evaluates the call `a` once and, if it does not
 * return cudaSuccess, prints the source line number and the CUDA error
 * string to stderr.
 * CHECK_CUDA_LAST(): does the same for the value of cudaGetLastError().
 *
 * Both macros only report the error; execution continues afterwards.
 * Each expands to a brace-enclosed block, not a single statement.
 */
#define CHECK_CUDA_CALL( a )	{ \
	cudaError_t ok = a; \
	if ( ok != cudaSuccess ) \
		fprintf(stderr, "-- Error CUDA call in line %d: %s\n", __LINE__, cudaGetErrorString( ok ) ); \
	}
#define CHECK_CUDA_LAST()	{ \
	cudaError_t ok = cudaGetLastError(); \
	if ( ok != cudaSuccess ) \
		fprintf(stderr, "-- Error CUDA last in line %d: %s\n", __LINE__, cudaGetErrorString( ok ) ); \
	}
 48 | 
 49 | /* 
 50 | Function showFileError: It displays the corresponding error during file reading.
 51 | */
 52 | void showFileError(int error, char* filename)
 53 | {
 54 | 	printf("Error\n");
 55 | 	switch (error)
 56 | 	{
 57 | 		case -1:
 58 | 			fprintf(stderr,"\tFile %s has too many columns.\n", filename);
 59 | 			fprintf(stderr,"\tThe maximum number of columns has been exceeded. MAXLINE: %d.\n", MAXLINE);
 60 | 			break;
 61 | 		case -2:
 62 | 			fprintf(stderr,"Error reading file: %s.\n", filename);
 63 | 			break;
 64 | 		case -3:
 65 | 			fprintf(stderr,"Error writing file: %s.\n", filename);
 66 | 			break;
 67 | 	}
 68 | 	fflush(stderr);	
 69 | }
 70 | 
 71 | /* 
 72 | Function readInput: It reads the file to determine the number of rows and columns.
 73 | */
 74 | int readInput(char* filename, int *lines, int *samples)
 75 | {
 76 |     FILE *fp;
 77 |     char line[MAXLINE] = "";
 78 |     char *ptr;
 79 |     const char *delim = "\t";
 80 |     int contlines, contsamples = 0;
 81 |     
 82 |     contlines = 0;
 83 | 
 84 |     if ((fp=fopen(filename,"r"))!=NULL)
 85 |     {
 86 |         while(fgets(line, MAXLINE, fp)!= NULL) 
 87 | 		{
 88 | 			if (strchr(line, '\n') == NULL)
 89 | 			{
 90 | 				return -1;
 91 | 			}
 92 |             contlines++;       
 93 |             ptr = strtok(line, delim);
 94 |             contsamples = 0;
 95 |             while(ptr != NULL)
 96 |             {
 97 |             	contsamples++;
 98 | 				ptr = strtok(NULL, delim);
 99 | 	    	}	    
100 |         }
101 |         fclose(fp);
102 |         *lines = contlines;
103 |         *samples = contsamples;  
104 |         return 0;
105 |     }
106 |     else
107 | 	{
108 |     	return -2;
109 | 	}
110 | }
111 | 
112 | /* 
113 | Function readInput2: It loads data from file.
114 | */
115 | int readInput2(char* filename, float* data)
116 | {
117 |     FILE *fp;
118 |     char line[MAXLINE] = "";
119 |     char *ptr;
120 |     const char *delim = "\t";
121 |     int i = 0;
122 |     
123 |     if ((fp=fopen(filename,"rt"))!=NULL)
124 |     {
125 |         while(fgets(line, MAXLINE, fp)!= NULL)
126 |         {         
127 |             ptr = strtok(line, delim);
128 |             while(ptr != NULL)
129 |             {
130 |             	data[i] = atof(ptr);
131 |             	i++;
132 | 				ptr = strtok(NULL, delim);
133 | 	   		}
134 | 	    }
135 |         fclose(fp);
136 |         return 0;
137 |     }
138 |     else
139 | 	{
140 |     	return -2; //No file found
141 | 	}
142 | }
143 | 
/*
 * Function writeResult: writes to the output file the cluster of each
 * sample (point), one decimal integer per line.
 *
 * classMap: array with at least `lines` class identifiers.
 * lines:    number of entries of classMap to write.
 * filename: path of the output file; it is created or truncated.
 *
 * Returns 0 when every value was written and the file was closed without
 * error, and -3 when the file cannot be opened or any write fails.
 */
int writeResult(int *classMap, int lines, const char* filename)
{
    FILE *fp = fopen(filename, "wt");
    int failed = 0;

    if (fp == NULL)
    {
        return -3; //File could not be opened for writing
    }

    for (int i = 0; i < lines; i++)
    {
        if (fprintf(fp, "%d\n", classMap[i]) < 0)
        {
            failed = 1;
            break;
        }
    }

    /* fclose flushes the buffered data, so some write errors (for example
     * a full disk) are only reported here. */
    if (fclose(fp) != 0)
    {
        failed = 1;
    }

    return failed ? -3 : 0;
}
166 | 
/*
 * Function initCentroids: copies the values of the initial centroids out of
 * the input data, using centroidPos as the map of which point to take.
 *
 * data:        input points, stored row by row with `samples` floats each.
 * centroids:   destination, K rows of `samples` floats.
 * centroidPos: K row indices into data; row centroidPos[i] becomes centroid i.
 * samples:     number of floats per point.
 * K:           number of centroids to copy.
 */
void initCentroids(const float *data, float* centroids, int* centroidPos, int samples, int K)
{
	const size_t rowBytes = samples*sizeof(float);
	int k = 0;

	while (k < K)
	{
		const float *source = &data[centroidPos[k]*samples];
		float *target = &centroids[k*samples];
		memcpy(target, source, rowBytes);
		k++;
	}
}
182 | 
183 | /*
184 | Function euclideanDistance: Euclidean distance
185 | This function could be modified
186 | */
187 | float euclideanDistance(float *point, float *center, int samples)
188 | {
189 | 	float dist=0.0;
190 | 	for(int i=0; i<samples; i++) 
191 | 	{
192 | 		dist+= (point[i]-center[i])*(point[i]-center[i]);
193 | 	}
194 | 	dist = sqrt(dist);
195 | 	return(dist);
196 | }
197 | 
/*
 * Function zeroFloatMatriz: sets every element of a matrix to 0.
 * This function could be modified
 *
 * matrix:  storage of rows*columns floats, laid out row after row.
 * rows:    number of rows.
 * columns: number of columns.
 *
 * Nothing is written when rows or columns is not positive.
 */
void zeroFloatMatriz(float *matrix, int rows, int columns)
{
	/* Row-major storage is contiguous, so walking a single cursor visits
	 * exactly the elements matrix[r*columns+c] in order. */
	float *cell = matrix;
	int r, c;

	for (r = 0; r < rows; r++)
	{
		for (c = 0; c < columns; c++)
		{
			*cell = 0.0;
			cell++;
		}
	}
}
209 | 
/*
 * Function zeroIntArray: sets the first `size` elements of an array to 0.
 * This function could be modified
 *
 * array: storage of at least `size` ints.
 * size:  number of elements to clear; nothing is written when it is not
 *        positive.
 */
void zeroIntArray(int *array, int size)
{
	int *cursor = array;
	int remaining = size;

	while (remaining > 0)
	{
		*cursor = 0;
		cursor++;
		remaining--;
	}
}
220 | 
221 | 
222 | 
/*
 * Program entry point: sequential reference k-means wrapped in the timing
 * and CUDA device set-up of the CUDA skeleton.
 *
 * It loads the points from argv[1], picks K initial centroids with rand()
 * seeded by srand(0), alternates point assignment and centroid update until
 * one of the three termination conditions holds, prints a per-iteration log
 * and writes the 1-based class of every point to argv[6].
 *
 * Returns 0 on success. On failure the program terminates through exit()
 * with a negative code: -1 for invalid arguments, the code returned by the
 * file helpers (-1, -2, -3) for file errors, -2 for an input file without
 * data and -4 for allocation failures.
 */
int main(int argc, char* argv[])
{

	//START CLOCK***************************************
	double start, end;
	start = omp_get_wtime();
	//**************************************************
	/*
	* PARAMETERS
	*
	* argv[1]: Input data file
	* argv[2]: Number of clusters
	* argv[3]: Maximum number of iterations of the method. Algorithm termination condition.
	* argv[4]: Minimum percentage of class changes. Algorithm termination condition.
	*          If between one iteration and the next, the percentage of class changes is less than
	*          this percentage, the algorithm stops.
	* argv[5]: Precision in the centroid distance after the update.
	*          It is an algorithm termination condition. If between one iteration of the algorithm 
	*          and the next, the maximum distance between centroids is less than this precision, the
	*          algorithm stops.
	* argv[6]: Output file. Class assigned to each point of the input file.
	* */
	if(argc !=  7)
	{
		fprintf(stderr,"EXECUTION ERROR K-MEANS: Parameters are not correct.\n");
		fprintf(stderr,"./KMEANS [Input Filename] [Number of clusters] [Number of iterations] [Number of changes] [Threshold] [Output data file]\n");
		fflush(stderr);
		exit(-1);
	}

	// Reading the input data
	// lines = number of points; samples = number of dimensions per point
	int lines = 0, samples= 0;  
	
	int error = readInput(argv[1], &lines, &samples);
	if(error != 0)
	{
		showFileError(error,argv[1]);
		exit(error);
	}

	// An input without data would reach rand()%lines below with lines == 0.
	if (lines <= 0 || samples <= 0)
	{
		fprintf(stderr,"Error: file %s contains no data.\n", argv[1]);
		exit(-2);
	}
	
	float *data = (float*)calloc(lines*samples,sizeof(float));
	if (data == NULL)
	{
		fprintf(stderr,"Memory allocation error.\n");
		exit(-4);
	}
	error = readInput2(argv[1], data);
	if(error != 0)
	{
		showFileError(error,argv[1]);
		exit(error);
	}

	// Parameters
	int K=atoi(argv[2]); 
	int maxIterations=atoi(argv[3]);
	int minChanges= (int)(lines*atof(argv[4])/100.0);
	float maxThreshold=atof(argv[5]);

	// atoi() yields 0 for non-numeric text, so this also rejects garbage.
	if (K <= 0)
	{
		fprintf(stderr,"EXECUTION ERROR K-MEANS: The number of clusters must be a positive integer.\n");
		exit(-1);
	}

	int *centroidPos = (int*)calloc(K,sizeof(int));
	float *centroids = (float*)calloc(K*samples,sizeof(float));
	int *classMap = (int*)calloc(lines,sizeof(int));

    if (centroidPos == NULL || centroids == NULL || classMap == NULL)
	{
		fprintf(stderr,"Memory allocation error.\n");
		exit(-4);
	}

	// Initial centroids: K point indices drawn with a fixed seed, so every
	// run selects the same points (duplicates are possible).
	srand(0);
	int i;
	for(i=0; i<K; i++) 
		centroidPos[i]=rand()%lines;
	
	// Loading the array of initial centroids with the data from the array data
	// The centroids are points stored in the data array.
	initCentroids(data, centroids, centroidPos, samples, K);


	printf("\n\tData file: %s \n\tPoints: %d\n\tDimensions: %d\n", argv[1], lines, samples);
	printf("\tNumber of clusters: %d\n", K);
	printf("\tMaximum number of iterations: %d\n", maxIterations);
	printf("\tMinimum number of changes: %d [%g%% of %d points]\n", minChanges, atof(argv[4]), lines);
	printf("\tMaximum centroid precision: %f\n", maxThreshold);
	
	//END CLOCK*****************************************
	end = omp_get_wtime();
	printf("\nMemory allocation: %f seconds\n", end - start);
	fflush(stdout);

	CHECK_CUDA_CALL( cudaSetDevice(0) );
	CHECK_CUDA_CALL( cudaDeviceSynchronize() );
	//**************************************************
	//START CLOCK***************************************
	start = omp_get_wtime();
	//**************************************************
	// The per-iteration log is accumulated in memory and printed after the
	// loop. The buffer grows on demand, so long runs cannot overflow it.
	size_t outputCap = 10000;
	size_t outputLen = 0;
	char *outputMsg = (char *)calloc(outputCap,sizeof(char));
	char line[128]; // large enough for the longest possible log line
	if (outputMsg == NULL)
	{
		fprintf(stderr,"Memory allocation error.\n");
		exit(-4);
	}

	int j;
	int cluster; // `class` is a C++ keyword and cannot be used in a .cu file
	float dist, minDist;
	int it=0;
	int changes = 0;
	float maxDist;

	//pointPerClass: number of points classified in each class
	//auxCentroids: mean of the points in each class
	int *pointsPerClass = (int *)malloc(K*sizeof(int));
	float *auxCentroids = (float*)malloc(K*samples*sizeof(float));
	float *distCentroids = (float*)malloc(K*sizeof(float)); 
	if (pointsPerClass == NULL || auxCentroids == NULL || distCentroids == NULL)
	{
		fprintf(stderr,"Memory allocation error.\n");
		exit(-4);
	}

/*
 *
 * START HERE: DO NOT CHANGE THE CODE ABOVE THIS POINT
 *
 */

	do{
		it++;
	
		//1. Calculate the distance from each point to the centroid
		//Assign each point to the nearest centroid.
		//Classes are 1-based; on ties the lowest-numbered centroid wins
		//because the comparison is strict.
		changes = 0;
		for(i=0; i<lines; i++)
		{
			cluster=1;
			minDist=FLT_MAX;
			for(j=0; j<K; j++)
			{
				dist=euclideanDistance(&data[i*samples], &centroids[j*samples], samples);

				if(dist < minDist)
				{
					minDist=dist;
					cluster=j+1;
				}
			}
			if(classMap[i]!=cluster)
			{
				changes++;
			}
			classMap[i]=cluster;
		}

		// 2. Recalculates the centroids: calculates the mean within each cluster
		zeroIntArray(pointsPerClass,K);
		zeroFloatMatriz(auxCentroids,K,samples);

		for(i=0; i<lines; i++) 
		{
			cluster=classMap[i];
			pointsPerClass[cluster-1] = pointsPerClass[cluster-1] +1;
			for(j=0; j<samples; j++){
				auxCentroids[(cluster-1)*samples+j] += data[i*samples+j];
			}
		}

		// NOTE(review): a class without points divides 0 by 0 here, which on
		// IEEE-754 platforms leaves non-finite coordinates in that centroid.
		// Kept as is so results match the reference implementation.
		for(i=0; i<K; i++) 
		{
			for(j=0; j<samples; j++){
				auxCentroids[i*samples+j] /= pointsPerClass[i];
			}
		}
		
		// FLT_MIN (smallest positive normalized float) is the starting lower bound.
		maxDist=FLT_MIN;
		for(i=0; i<K; i++){
			distCentroids[i]=euclideanDistance(&centroids[i*samples], &auxCentroids[i*samples], samples);
			if(distCentroids[i]>maxDist) {
				maxDist=distCentroids[i];
			}
		}
		memcpy(centroids, auxCentroids, (K*samples*sizeof(float)));
		
		// Format the log line with a bounded write, then append it, growing
		// the log buffer when the line does not fit.
		int lineLen = snprintf(line, sizeof(line), "\n[%d] Cluster changes: %d\tMax. centroid distance: %f", it, changes, maxDist);
		if (lineLen < 0)
		{
			line[0] = '\0';
			lineLen = 0;
		}
		else if ((size_t)lineLen >= sizeof(line))
		{
			lineLen = (int)sizeof(line) - 1; // snprintf truncated the text
		}
		if (outputLen + (size_t)lineLen + 1 > outputCap)
		{
			// Doubling always makes room: one line is far smaller than the buffer.
			size_t newCap = outputCap * 2;
			char *grown = (char *)realloc(outputMsg, newCap);
			if (grown == NULL)
			{
				fprintf(stderr,"Memory allocation error.\n");
				exit(-4);
			}
			outputMsg = grown;
			outputCap = newCap;
		}
		memcpy(outputMsg + outputLen, line, (size_t)lineLen + 1);
		outputLen += (size_t)lineLen;

	} while((changes>minChanges) && (it<maxIterations) && (maxDist>maxThreshold));

/*
 *
 * STOP HERE: DO NOT CHANGE THE CODE BELOW THIS POINT
 *
 */
	// Output and termination conditions
	printf("%s",outputMsg);	

	CHECK_CUDA_CALL( cudaDeviceSynchronize() );

	//END CLOCK*****************************************
	end = omp_get_wtime();
	printf("\nComputation: %f seconds", end - start);
	fflush(stdout);
	//**************************************************
	//START CLOCK***************************************
	start = omp_get_wtime();
	//**************************************************

	

	if (changes <= minChanges) {
		printf("\n\nTermination condition:\nMinimum number of changes reached: %d [%d]", changes, minChanges);
	}
	else if (it >= maxIterations) {
		printf("\n\nTermination condition:\nMaximum number of iterations reached: %d [%d]", it, maxIterations);
	}
	else {
		printf("\n\nTermination condition:\nCentroid update precision reached: %g [%g]", maxDist, maxThreshold);
	}	

	// Writing the classification of each point to the output file.
	error = writeResult(classMap, lines, argv[6]);
	if(error != 0)
	{
		showFileError(error, argv[6]);
		exit(error);
	}

	//Free memory
	free(data);
	free(classMap);
	free(centroidPos);
	free(centroids);
	free(distCentroids);
	free(pointsPerClass);
	free(auxCentroids);
	free(outputMsg);

	//END CLOCK*****************************************
	end = omp_get_wtime();
	printf("\n\nMemory deallocation: %f seconds\n", end - start);
	fflush(stdout);
	//***************************************************/
	return 0;
}
464 | 


--------------------------------------------------------------------------------
/projects/kmeans/KMEANS_mpi.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * k-Means clustering algorithm
  3 |  *
  4 |  * MPI version
  5 |  *
  6 |  * Parallel computing (Degree in Computer Engineering)
  7 |  * 2022/2023
  8 |  *
  9 |  * Version: 1.0
 10 |  *
 11 |  * (c) 2022 Diego García-Álvarez, Arturo Gonzalez-Escribano
 12 |  * Grupo Trasgo, Universidad de Valladolid (Spain)
 13 |  *
 14 |  * This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
 15 |  * https://creativecommons.org/licenses/by-sa/4.0/
 16 |  */
 17 | #include <stdio.h>
 18 | #include <stdlib.h>
 19 | #include <ctype.h>
 20 | #include <math.h>
 21 | #include <time.h>
 22 | #include <string.h>
 23 | #include <float.h>
 24 | #include <mpi.h>
 25 | 
 26 | #define MAXLINE 2000
 27 | #define MAXCAD 200
 28 | 
 29 | //Macros
 30 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
 31 | #define MAX(a,b) ((a) > (b) ? (a) : (b))
 32 | 
 33 | /* 
 34 | Function showFileError: It displays the corresponding error during file reading.
 35 | */
 36 | void showFileError(int error, char* filename)
 37 | {
 38 | 	printf("Error\n");
 39 | 	switch (error)
 40 | 	{
 41 | 		case -1:
 42 | 			fprintf(stderr,"\tFile %s has too many columns.\n", filename);
 43 | 			fprintf(stderr,"\tThe maximum number of columns has been exceeded. MAXLINE: %d.\n", MAXLINE);
 44 | 			break;
 45 | 		case -2:
 46 | 			fprintf(stderr,"Error reading file: %s.\n", filename);
 47 | 			break;
 48 | 		case -3:
 49 | 			fprintf(stderr,"Error writing file: %s.\n", filename);
 50 | 			break;
 51 | 	}
 52 | 	fflush(stderr);	
 53 | }
 54 | 
 55 | /* 
 56 | Function readInput: It reads the file to determine the number of rows and columns.
 57 | */
 58 | int readInput(char* filename, int *lines, int *samples)
 59 | {
 60 |     FILE *fp;
 61 |     char line[MAXLINE] = "";
 62 |     char *ptr;
 63 |     const char *delim = "\t";
 64 |     int contlines, contsamples = 0;
 65 |     
 66 |     contlines = 0;
 67 | 
 68 |     if ((fp=fopen(filename,"r"))!=NULL)
 69 |     {
 70 |         while(fgets(line, MAXLINE, fp)!= NULL) 
 71 | 		{
 72 | 			if (strchr(line, '\n') == NULL)
 73 | 			{
 74 | 				return -1;
 75 | 			}
 76 |             contlines++;       
 77 |             ptr = strtok(line, delim);
 78 |             contsamples = 0;
 79 |             while(ptr != NULL)
 80 |             {
 81 |             	contsamples++;
 82 | 				ptr = strtok(NULL, delim);
 83 | 	    	}	    
 84 |         }
 85 |         fclose(fp);
 86 |         *lines = contlines;
 87 |         *samples = contsamples;  
 88 |         return 0;
 89 |     }
 90 |     else
 91 | 	{
 92 |     	return -2;
 93 | 	}
 94 | }
 95 | 
 96 | /* 
 97 | Function readInput2: It loads data from file.
 98 | */
 99 | int readInput2(char* filename, float* data)
100 | {
101 |     FILE *fp;
102 |     char line[MAXLINE] = "";
103 |     char *ptr;
104 |     const char *delim = "\t";
105 |     int i = 0;
106 |     
107 |     if ((fp=fopen(filename,"rt"))!=NULL)
108 |     {
109 |         while(fgets(line, MAXLINE, fp)!= NULL)
110 |         {         
111 |             ptr = strtok(line, delim);
112 |             while(ptr != NULL)
113 |             {
114 |             	data[i] = atof(ptr);
115 |             	i++;
116 | 				ptr = strtok(NULL, delim);
117 | 	   		}
118 | 	    }
119 |         fclose(fp);
120 |         return 0;
121 |     }
122 |     else
123 | 	{
124 |     	return -2; //No file found
125 | 	}
126 | }
127 | 
/*
 * Function writeResult: writes to the output file the cluster of each
 * sample (point), one decimal integer per line.
 *
 * classMap: array with at least `lines` class identifiers.
 * lines:    number of entries of classMap to write.
 * filename: path of the output file; it is created or truncated.
 *
 * Returns 0 when every value was written and the file was closed without
 * error, and -3 when the file cannot be opened or any write fails.
 */
int writeResult(int *classMap, int lines, const char* filename)
{
    FILE *fp = fopen(filename, "wt");
    int failed = 0;

    if (fp == NULL)
    {
        return -3; //File could not be opened for writing
    }

    for (int i = 0; i < lines; i++)
    {
        if (fprintf(fp, "%d\n", classMap[i]) < 0)
        {
            failed = 1;
            break;
        }
    }

    /* fclose flushes the buffered data, so some write errors (for example
     * a full disk) are only reported here. */
    if (fclose(fp) != 0)
    {
        failed = 1;
    }

    return failed ? -3 : 0;
}
150 | 
/*
 * Function initCentroids: copies the values of the initial centroids out of
 * the input data, using centroidPos as the map of which point to take.
 *
 * data:        input points, stored row by row with `samples` floats each.
 * centroids:   destination, K rows of `samples` floats.
 * centroidPos: K row indices into data; row centroidPos[i] becomes centroid i.
 * samples:     number of floats per point.
 * K:           number of centroids to copy.
 */
void initCentroids(const float *data, float* centroids, int* centroidPos, int samples, int K)
{
	const size_t rowBytes = samples*sizeof(float);
	int k = 0;

	while (k < K)
	{
		const float *source = &data[centroidPos[k]*samples];
		float *target = &centroids[k*samples];
		memcpy(target, source, rowBytes);
		k++;
	}
}
166 | 
167 | /*
168 | Function euclideanDistance: Euclidean distance
169 | This function could be modified
170 | */
171 | float euclideanDistance(float *point, float *center, int samples)
172 | {
173 | 	float dist=0.0;
174 | 	for(int i=0; i<samples; i++) 
175 | 	{
176 | 		dist+= (point[i]-center[i])*(point[i]-center[i]);
177 | 	}
178 | 	dist = sqrt(dist);
179 | 	return(dist);
180 | }
181 | 
/*
 * Function zeroFloatMatriz: sets every element of a matrix to 0.
 * This function could be modified
 *
 * matrix:  storage of rows*columns floats, laid out row after row.
 * rows:    number of rows.
 * columns: number of columns.
 *
 * Nothing is written when rows or columns is not positive.
 */
void zeroFloatMatriz(float *matrix, int rows, int columns)
{
	/* Row-major storage is contiguous, so walking a single cursor visits
	 * exactly the elements matrix[r*columns+c] in order. */
	float *cell = matrix;
	int r, c;

	for (r = 0; r < rows; r++)
	{
		for (c = 0; c < columns; c++)
		{
			*cell = 0.0;
			cell++;
		}
	}
}
193 | 
/*
 * Function zeroIntArray: sets the first `size` elements of an array to 0.
 * This function could be modified
 *
 * array: storage of at least `size` ints.
 * size:  number of elements to clear; nothing is written when it is not
 *        positive.
 */
void zeroIntArray(int *array, int size)
{
	int *cursor = array;
	int remaining = size;

	while (remaining > 0)
	{
		*cursor = 0;
		cursor++;
		remaining--;
	}
}
204 | 
205 | int main(int argc, char* argv[])
206 | {
207 | 	/* 0. Initialize MPI */
208 | 	MPI_Init( &argc, &argv );
209 | 	int rank;
210 | 	MPI_Comm_rank( MPI_COMM_WORLD, &rank );
211 | 	MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
212 | 
213 | 	//START CLOCK***************************************
214 | 	double start, end;
215 | 	start = MPI_Wtime();
216 | 	//**************************************************
217 | 	/*
218 | 	* PARAMETERS
219 | 	*
220 | 	* argv[1]: Input data file
221 | 	* argv[2]: Number of clusters
222 | 	* argv[3]: Maximum number of iterations of the method. Algorithm termination condition.
223 | 	* argv[4]: Minimum percentage of class changes. Algorithm termination condition.
224 | 	*          If between one iteration and the next, the percentage of class changes is less than
225 | 	*          this percentage, the algorithm stops.
226 | 	* argv[5]: Precision in the centroid distance after the update.
227 | 	*          It is an algorithm termination condition. If between one iteration of the algorithm 
228 | 	*          and the next, the maximum distance between centroids is less than this precision, the
229 | 	*          algorithm stops.
230 | 	* argv[6]: Output file. Class assigned to each point of the input file.
231 | 	* */
232 | 	if(argc !=  7)
233 | 	{
234 | 		fprintf(stderr,"EXECUTION ERROR K-MEANS: Parameters are not correct.\n");
235 | 		fprintf(stderr,"./KMEANS [Input Filename] [Number of clusters] [Number of iterations] [Number of changes] [Threshold] [Output data file]\n");
236 | 		fflush(stderr);
237 | 		MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
238 | 	}
239 | 
240 | 	// Reading the input data
241 | 	// lines = number of points; samples = number of dimensions per point
242 | 	int lines = 0, samples= 0;  
243 | 	
244 | 	int error = readInput(argv[1], &lines, &samples);
245 | 	if(error != 0)
246 | 	{
247 | 		showFileError(error,argv[1]);
248 | 		MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
249 | 	}
250 | 	
251 | 	float *data = (float*)calloc(lines*samples,sizeof(float));
252 | 	if (data == NULL)
253 | 	{
254 | 		fprintf(stderr,"Memory allocation error.\n");
255 | 		MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
256 | 	}
257 | 	error = readInput2(argv[1], data);
258 | 	if(error != 0)
259 | 	{
260 | 		showFileError(error,argv[1]);
261 | 		MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
262 | 	}
263 | 
264 | 	// Parameters
265 | 	int K=atoi(argv[2]); 
266 | 	int maxIterations=atoi(argv[3]);
267 | 	int minChanges= (int)(lines*atof(argv[4])/100.0);
268 | 	float maxThreshold=atof(argv[5]);
269 | 
270 | 	int *centroidPos = (int*)calloc(K,sizeof(int));
271 | 	float *centroids = (float*)calloc(K*samples,sizeof(float));
272 | 	int *classMap = (int*)calloc(lines,sizeof(int));
273 | 
274 |     if (centroidPos == NULL || centroids == NULL || classMap == NULL)
275 | 	{
276 | 		fprintf(stderr,"Memory allocation error.\n");
277 | 		MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
278 | 	}
279 | 
280 | 	// Initial centrodis
281 | 	srand(0);
282 | 	int i;
283 | 	for(i=0; i<K; i++) 
284 | 		centroidPos[i]=rand()%lines;
285 | 	
286 | 	// Loading the array of initial centroids with the data from the array data
287 | 	// The centroids are points stored in the data array.
288 | 	initCentroids(data, centroids, centroidPos, samples, K);
289 | 
290 | 
291 | 	printf("\n\tData file: %s \n\tPoints: %d\n\tDimensions: %d\n", argv[1], lines, samples);
292 | 	printf("\tNumber of clusters: %d\n", K);
293 | 	printf("\tMaximum number of iterations: %d\n", maxIterations);
294 | 	printf("\tMinimum number of changes: %d [%g%% of %d points]\n", minChanges, atof(argv[4]), lines);
295 | 	printf("\tMaximum centroid precision: %f\n", maxThreshold);
296 | 	
297 | 	//END CLOCK*****************************************
298 | 	end = MPI_Wtime();;
299 | 	printf("\nMemory allocation: %f seconds\n", end - start);
300 | 	fflush(stdout);
301 | 	//**************************************************
302 | 	//START CLOCK***************************************
303 | 	start = MPI_Wtime();
304 | 	//**************************************************
305 | 	char *outputMsg = (char *)calloc(10000,sizeof(char));
306 | 	char line[100];
307 | 
308 | 	int j;
309 | 	int class;
310 | 	float dist, minDist;
311 | 	int it=0;
312 | 	int changes = 0;
313 | 	float maxDist;
314 | 
315 | 	//pointPerClass: number of points classified in each class
316 | 	//auxCentroids: mean of the points in each class
317 | 	int *pointsPerClass = (int *)malloc(K*sizeof(int));
318 | 	float *auxCentroids = (float*)malloc(K*samples*sizeof(float));
319 | 	float *distCentroids = (float*)malloc(K*sizeof(float)); 
320 | 	if (pointsPerClass == NULL || auxCentroids == NULL || distCentroids == NULL)
321 | 	{
322 | 		fprintf(stderr,"Memory allocation error.\n");
323 | 		MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
324 | 	}
325 | 
326 | /*
327 |  *
328 |  * START HERE: DO NOT CHANGE THE CODE ABOVE THIS POINT
329 |  *
330 |  */
331 | 
332 | 	do{
333 | 		it++;
334 | 	
335 | 		//1. Calculate the distance from each point to the centroid
336 | 		//Assign each point to the nearest centroid.
337 | 		changes = 0;
338 | 		for(i=0; i<lines; i++)
339 | 		{
340 | 			class=1;
341 | 			minDist=FLT_MAX;
342 | 			for(j=0; j<K; j++)
343 | 			{
344 | 				dist=euclideanDistance(&data[i*samples], &centroids[j*samples], samples);
345 | 
346 | 				if(dist < minDist)
347 | 				{
348 | 					minDist=dist;
349 | 					class=j+1;
350 | 				}
351 | 			}
352 | 			if(classMap[i]!=class)
353 | 			{
354 | 				changes++;
355 | 			}
356 | 			classMap[i]=class;
357 | 		}
358 | 
359 | 		// 2. Recalculates the centroids: calculates the mean within each cluster
360 | 		zeroIntArray(pointsPerClass,K);
361 | 		zeroFloatMatriz(auxCentroids,K,samples);
362 | 
363 | 		for(i=0; i<lines; i++) 
364 | 		{
365 | 			class=classMap[i];
366 | 			pointsPerClass[class-1] = pointsPerClass[class-1] +1;
367 | 			for(j=0; j<samples; j++){
368 | 				auxCentroids[(class-1)*samples+j] += data[i*samples+j];
369 | 			}
370 | 		}
371 | 
372 | 		for(i=0; i<K; i++) 
373 | 		{
374 | 			for(j=0; j<samples; j++){
375 | 				auxCentroids[i*samples+j] /= pointsPerClass[i];
376 | 			}
377 | 		}
378 | 		
379 | 		maxDist=FLT_MIN;
380 | 		for(i=0; i<K; i++){
381 | 			distCentroids[i]=euclideanDistance(&centroids[i*samples], &auxCentroids[i*samples], samples);
382 | 			if(distCentroids[i]>maxDist) {
383 | 				maxDist=distCentroids[i];
384 | 			}
385 | 		}
386 | 		memcpy(centroids, auxCentroids, (K*samples*sizeof(float)));
387 | 		
388 | 		sprintf(line,"\n[%d] Cluster changes: %d\tMax. centroid distance: %f", it, changes, maxDist);
389 | 		outputMsg = strcat(outputMsg,line);
390 | 
391 | 	} while((changes>minChanges) && (it<maxIterations) && (maxDist>maxThreshold));
392 | 
393 | /*
394 |  *
395 |  * STOP HERE: DO NOT CHANGE THE CODE BELOW THIS POINT
396 |  *
397 |  */
398 | 	// Output and termination conditions
399 | 	printf("%s",outputMsg);	
400 | 
401 | 	//END CLOCK*****************************************
402 | 	end = MPI_Wtime();
403 | 	printf("\nComputation: %f seconds", end - start);
404 | 	fflush(stdout);
405 | 	//**************************************************
406 | 	//START CLOCK***************************************
407 | 	start = MPI_Wtime();
408 | 	//**************************************************
409 | 
410 | 	
411 | 
412 | 	if (changes <= minChanges) {
413 | 		printf("\n\nTermination condition:\nMinimum number of changes reached: %d [%d]", changes, minChanges);
414 | 	}
415 | 	else if (it >= maxIterations) {
416 | 		printf("\n\nTermination condition:\nMaximum number of iterations reached: %d [%d]", it, maxIterations);
417 | 	}
418 | 	else {
419 | 		printf("\n\nTermination condition:\nCentroid update precision reached: %g [%g]", maxDist, maxThreshold);
420 | 	}	
421 | 
422 | 	// Writing the classification of each point to the output file.
423 | 	error = writeResult(classMap, lines, argv[6]);
424 | 	if(error != 0)
425 | 	{
426 | 		showFileError(error, argv[6]);
427 | 		MPI_Abort( MPI_COMM_WORLD, EXIT_FAILURE );
428 | 	}
429 | 
430 | 	//Free memory
431 | 	free(data);
432 | 	free(classMap);
433 | 	free(centroidPos);
434 | 	free(centroids);
435 | 	free(distCentroids);
436 | 	free(pointsPerClass);
437 | 	free(auxCentroids);
438 | 
439 | 	//END CLOCK*****************************************
440 | 	end = MPI_Wtime();
441 | 	printf("\n\nMemory deallocation: %f seconds\n", end - start);
442 | 	fflush(stdout);
443 | 	//***************************************************/
444 | 	MPI_Finalize();
445 | 	return 0;
446 | }
447 | 


--------------------------------------------------------------------------------
/projects/kmeans/KMEANS_omp.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * k-Means clustering algorithm
  3 |  *
  4 |  * OpenMP version
  5 |  *
  6 |  * Parallel computing (Degree in Computer Engineering)
  7 |  * 2022/2023
  8 |  *
  9 |  * Version: 1.0
 10 |  *
 11 |  * (c) 2022 Diego García-Álvarez, Arturo Gonzalez-Escribano
 12 |  * Grupo Trasgo, Universidad de Valladolid (Spain)
 13 |  *
 14 |  * This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
 15 |  * https://creativecommons.org/licenses/by-sa/4.0/
 16 |  */
 17 | #include <stdio.h>
 18 | #include <stdlib.h>
 19 | #include <ctype.h>
 20 | #include <math.h>
 21 | #include <time.h>
 22 | #include <string.h>
 23 | #include <float.h>
 24 | #include <omp.h>
 25 | 
 26 | #define MAXLINE 2000
 27 | #define MAXCAD 200
 28 | 
 29 | //Macros
 30 | #define MIN(a,b) ((a) < (b) ? (a) : (b))
 31 | #define MAX(a,b) ((a) > (b) ? (a) : (b))
 32 | 
 33 | /* 
 34 | Function showFileError: It displays the corresponding error during file reading.
 35 | */
 36 | void showFileError(int error, char* filename)
 37 | {
 38 | 	printf("Error\n");
 39 | 	switch (error)
 40 | 	{
 41 | 		case -1:
 42 | 			fprintf(stderr,"\tFile %s has too many columns.\n", filename);
 43 | 			fprintf(stderr,"\tThe maximum number of columns has been exceeded. MAXLINE: %d.\n", MAXLINE);
 44 | 			break;
 45 | 		case -2:
 46 | 			fprintf(stderr,"Error reading file: %s.\n", filename);
 47 | 			break;
 48 | 		case -3:
 49 | 			fprintf(stderr,"Error writing file: %s.\n", filename);
 50 | 			break;
 51 | 	}
 52 | 	fflush(stderr);	
 53 | }
 54 | 
 55 | /* 
 56 | Function readInput: It reads the file to determine the number of rows and columns.
 57 | */
 58 | int readInput(char* filename, int *lines, int *samples)
 59 | {
 60 |     FILE *fp;
 61 |     char line[MAXLINE] = "";
 62 |     char *ptr;
 63 |     const char *delim = "\t";
 64 |     int contlines, contsamples = 0;
 65 |     
 66 |     contlines = 0;
 67 | 
 68 |     if ((fp=fopen(filename,"r"))!=NULL)
 69 |     {
 70 |         while(fgets(line, MAXLINE, fp)!= NULL) 
 71 | 		{
 72 | 			if (strchr(line, '\n') == NULL)
 73 | 			{
 74 | 				return -1;
 75 | 			}
 76 |             contlines++;       
 77 |             ptr = strtok(line, delim);
 78 |             contsamples = 0;
 79 |             while(ptr != NULL)
 80 |             {
 81 |             	contsamples++;
 82 | 				ptr = strtok(NULL, delim);
 83 | 	    	}	    
 84 |         }
 85 |         fclose(fp);
 86 |         *lines = contlines;
 87 |         *samples = contsamples;  
 88 |         return 0;
 89 |     }
 90 |     else
 91 | 	{
 92 |     	return -2;
 93 | 	}
 94 | }
 95 | 
 96 | /* 
 97 | Function readInput2: It loads data from file.
 98 | */
 99 | int readInput2(char* filename, float* data)
100 | {
101 |     FILE *fp;
102 |     char line[MAXLINE] = "";
103 |     char *ptr;
104 |     const char *delim = "\t";
105 |     int i = 0;
106 |     
107 |     if ((fp=fopen(filename,"rt"))!=NULL)
108 |     {
109 |         while(fgets(line, MAXLINE, fp)!= NULL)
110 |         {         
111 |             ptr = strtok(line, delim);
112 |             while(ptr != NULL)
113 |             {
114 |             	data[i] = atof(ptr);
115 |             	i++;
116 | 				ptr = strtok(NULL, delim);
117 | 	   		}
118 | 	    }
119 |         fclose(fp);
120 |         return 0;
121 |     }
122 |     else
123 | 	{
124 |     	return -2; //No file found
125 | 	}
126 | }
127 | 
/*
 * Function writeResult: writes to the output file the cluster of each
 * sample (point), one decimal integer per line.
 *
 * classMap: array with at least `lines` class identifiers.
 * lines:    number of entries of classMap to write.
 * filename: path of the output file; it is created or truncated.
 *
 * Returns 0 when every value was written and the file was closed without
 * error, and -3 when the file cannot be opened or any write fails.
 */
int writeResult(int *classMap, int lines, const char* filename)
{
    FILE *fp = fopen(filename, "wt");
    int failed = 0;

    if (fp == NULL)
    {
        return -3; //File could not be opened for writing
    }

    for (int i = 0; i < lines; i++)
    {
        if (fprintf(fp, "%d\n", classMap[i]) < 0)
        {
            failed = 1;
            break;
        }
    }

    /* fclose flushes the buffered data, so some write errors (for example
     * a full disk) are only reported here. */
    if (fclose(fp) != 0)
    {
        failed = 1;
    }

    return failed ? -3 : 0;
}
150 | 
/*
 * Function initCentroids: copies the values of the initial centroids out of
 * the input data, using centroidPos as the map of which point to take.
 *
 * data:        input points, stored row by row with `samples` floats each.
 * centroids:   destination, K rows of `samples` floats.
 * centroidPos: K row indices into data; row centroidPos[i] becomes centroid i.
 * samples:     number of floats per point.
 * K:           number of centroids to copy.
 */
void initCentroids(const float *data, float* centroids, int* centroidPos, int samples, int K)
{
	const size_t rowBytes = samples*sizeof(float);
	int k = 0;

	while (k < K)
	{
		const float *source = &data[centroidPos[k]*samples];
		float *target = &centroids[k*samples];
		memcpy(target, source, rowBytes);
		k++;
	}
}
166 | 
167 | /*
168 | Function euclideanDistance: Euclidean distance
169 | This function could be modified
170 | */
171 | float euclideanDistance(float *point, float *center, int samples)
172 | {
173 | 	float dist=0.0;
174 | 	for(int i=0; i<samples; i++) 
175 | 	{
176 | 		dist+= (point[i]-center[i])*(point[i]-center[i]);
177 | 	}
178 | 	dist = sqrt(dist);
179 | 	return(dist);
180 | }
181 | 
/*
 * Function zeroFloatMatriz: sets every element of a matrix to 0.
 * This function could be modified
 *
 * matrix:  storage of rows*columns floats, laid out row after row.
 * rows:    number of rows.
 * columns: number of columns.
 *
 * Nothing is written when rows or columns is not positive.
 */
void zeroFloatMatriz(float *matrix, int rows, int columns)
{
	/* Row-major storage is contiguous, so walking a single cursor visits
	 * exactly the elements matrix[r*columns+c] in order. */
	float *cell = matrix;
	int r, c;

	for (r = 0; r < rows; r++)
	{
		for (c = 0; c < columns; c++)
		{
			*cell = 0.0;
			cell++;
		}
	}
}
193 | 
/*
 * Function zeroIntArray: sets the first `size` elements of an array to 0.
 * This function could be modified
 *
 * array: storage of at least `size` ints.
 * size:  number of elements to clear; nothing is written when it is not
 *        positive.
 */
void zeroIntArray(int *array, int size)
{
	int *cursor = array;
	int remaining = size;

	while (remaining > 0)
	{
		*cursor = 0;
		cursor++;
		remaining--;
	}
}
204 | 
205 | 
206 | 
/*
 * Program entry point: reads the input points, runs the k-means iteration and
 * writes the class assigned to each point to the output file.
 *
 * Exit status: 0 on success; -1 on wrong command-line parameters; the error code
 * returned by readInput/readInput2/writeResult on file errors; -4 when a memory
 * allocation fails.
 */
int main(int argc, char* argv[])
{

	//START CLOCK***************************************
	double start, end;
	start = omp_get_wtime();
	//**************************************************
	/*
	* PARAMETERS
	*
	* argv[1]: Input data file
	* argv[2]: Number of clusters
	* argv[3]: Maximum number of iterations of the method. Algorithm termination condition.
	* argv[4]: Minimum percentage of class changes. Algorithm termination condition.
	*          If between one iteration and the next, the percentage of class changes is less than
	*          this percentage, the algorithm stops.
	* argv[5]: Precision in the centroid distance after the update.
	*          It is an algorithm termination condition. If between one iteration of the algorithm 
	*          and the next, the maximum distance between centroids is less than this precision, the
	*          algorithm stops.
	* argv[6]: Output file. Class assigned to each point of the input file.
	* */
	if(argc !=  7)
	{
		fprintf(stderr,"EXECUTION ERROR K-MEANS: Parameters are not correct.\n");
		fprintf(stderr,"./KMEANS [Input Filename] [Number of clusters] [Number of iterations] [Number of changes] [Threshold] [Output data file]\n");
		fflush(stderr);
		exit(-1);
	}

	// Reading the input data
	// lines = number of points; samples = number of dimensions per point
	int lines = 0, samples= 0;  
	
	int error = readInput(argv[1], &lines, &samples);
	if(error != 0)
	{
		showFileError(error,argv[1]);
		exit(error);
	}
	
	float *data = (float*)calloc(lines*samples,sizeof(float));
	if (data == NULL)
	{
		fprintf(stderr,"Memory allocation error.\n");
		exit(-4);
	}
	error = readInput2(argv[1], data);
	if(error != 0)
	{
		showFileError(error,argv[1]);
		exit(error);
	}

	// Parameters
	int K=atoi(argv[2]); 
	int maxIterations=atoi(argv[3]);
	int minChanges= (int)(lines*atof(argv[4])/100.0);
	float maxThreshold=atof(argv[5]);

	// With K <= 0 the loop below would still assign class 1 to every point and
	// write pointsPerClass[0] / auxCentroids[0..] outside their (empty) allocations.
	if (K <= 0)
	{
		fprintf(stderr,"EXECUTION ERROR K-MEANS: The number of clusters must be greater than 0.\n");
		fflush(stderr);
		exit(-1);
	}

	int *centroidPos = (int*)calloc(K,sizeof(int));
	float *centroids = (float*)calloc(K*samples,sizeof(float));
	int *classMap = (int*)calloc(lines,sizeof(int));

    if (centroidPos == NULL || centroids == NULL || classMap == NULL)
	{
		fprintf(stderr,"Memory allocation error.\n");
		exit(-4);
	}

	// Initial centroids: K input points chosen with a fixed seed (repetitions are possible)
	srand(0);
	int i;
	for(i=0; i<K; i++) 
		centroidPos[i]=rand()%lines;
	
	// Loading the array of initial centroids with the data from the array data
	// The centroids are points stored in the data array.
	initCentroids(data, centroids, centroidPos, samples, K);


	printf("\n\tData file: %s \n\tPoints: %d\n\tDimensions: %d\n", argv[1], lines, samples);
	printf("\tNumber of clusters: %d\n", K);
	printf("\tMaximum number of iterations: %d\n", maxIterations);
	printf("\tMinimum number of changes: %d [%g%% of %d points]\n", minChanges, atof(argv[4]), lines);
	printf("\tMaximum centroid precision: %f\n", maxThreshold);
	
	//END CLOCK*****************************************
	end = omp_get_wtime();
	printf("\nMemory allocation: %f seconds\n", end - start);
	fflush(stdout);
	//**************************************************
	//START CLOCK***************************************
	start = omp_get_wtime();
	//**************************************************
	// outputMsg accumulates one report line per iteration and is printed after the loop.
	// outputCap is its allocated size and outputLen the number of characters stored so far.
	size_t outputCap = 10000;
	size_t outputLen = 0;
	char *outputMsg = (char *)calloc(outputCap,sizeof(char));
	// Large enough for the longest possible report line (two ints and a float printed with %f)
	char line[128];

	int j;
	int class;
	float dist, minDist;
	int it=0;
	int changes = 0;
	float maxDist;

	//pointPerClass: number of points classified in each class
	//auxCentroids: mean of the points in each class
	int *pointsPerClass = (int *)malloc(K*sizeof(int));
	float *auxCentroids = (float*)malloc(K*samples*sizeof(float));
	float *distCentroids = (float*)malloc(K*sizeof(float)); 
	if (outputMsg == NULL || pointsPerClass == NULL || auxCentroids == NULL || distCentroids == NULL)
	{
		fprintf(stderr,"Memory allocation error.\n");
		exit(-4);
	}

/*
 *
 * START HERE: DO NOT CHANGE THE CODE ABOVE THIS POINT
 *
 */

	do{
		it++;
	
		//1. Calculate the distance from each point to the centroid
		//Assign each point to the nearest centroid.
		//Classes are numbered from 1; classMap starts zeroed, so every point counts
		//as a change in the first iteration.
		changes = 0;
		for(i=0; i<lines; i++)
		{
			class=1;
			minDist=FLT_MAX;
			for(j=0; j<K; j++)
			{
				dist=euclideanDistance(&data[i*samples], &centroids[j*samples], samples);

				if(dist < minDist)
				{
					minDist=dist;
					class=j+1;
				}
			}
			if(classMap[i]!=class)
			{
				changes++;
			}
			classMap[i]=class;
		}

		// 2. Recalculates the centroids: calculates the mean within each cluster
		zeroIntArray(pointsPerClass,K);
		zeroFloatMatriz(auxCentroids,K,samples);

		for(i=0; i<lines; i++) 
		{
			class=classMap[i];
			pointsPerClass[class-1] = pointsPerClass[class-1] +1;
			for(j=0; j<samples; j++){
				auxCentroids[(class-1)*samples+j] += data[i*samples+j];
			}
		}

		// NOTE(review): a class that received no points has pointsPerClass[i] == 0 and a
		// zero sum, so this divides 0.0 by 0 (NaN under IEEE 754 arithmetic). Left as is
		// so that results stay identical; confirm whether empty clusters need handling.
		for(i=0; i<K; i++) 
		{
			for(j=0; j<samples; j++){
				auxCentroids[i*samples+j] /= pointsPerClass[i];
			}
		}
		
		// 3. Largest displacement of any centroid in this iteration
		maxDist=FLT_MIN;
		for(i=0; i<K; i++){
			distCentroids[i]=euclideanDistance(&centroids[i*samples], &auxCentroids[i*samples], samples);
			if(distCentroids[i]>maxDist) {
				maxDist=distCentroids[i];
			}
		}
		memcpy(centroids, auxCentroids, (K*samples*sizeof(float)));
		
		// 4. Append this iteration's report line to outputMsg.
		// The line is formatted with a bounded snprintf and the buffer grows on demand,
		// so a long run can no longer write past the end of outputMsg.
		int written = snprintf(line, sizeof line, "\n[%d] Cluster changes: %d\tMax. centroid distance: %f", it, changes, maxDist);
		if (written > 0)
		{
			// snprintf reports the untruncated length; clamp to what is really in line
			size_t lineLen = ((size_t)written < sizeof line) ? (size_t)written : sizeof line - 1;
			if (outputLen + lineLen + 1 > outputCap)
			{
				// Doubling always makes room because lineLen < sizeof line <= outputCap
				size_t newCap = outputCap * 2;
				char *grown = (char *)realloc(outputMsg, newCap);
				if (grown == NULL)
				{
					fprintf(stderr,"Memory allocation error.\n");
					exit(-4);
				}
				outputMsg = grown;
				outputCap = newCap;
			}
			// Copy the terminating '\0' as well so outputMsg is always a valid string
			memcpy(outputMsg + outputLen, line, lineLen + 1);
			outputLen += lineLen;
		}

	} while((changes>minChanges) && (it<maxIterations) && (maxDist>maxThreshold));

/*
 *
 * STOP HERE: DO NOT CHANGE THE CODE BELOW THIS POINT
 *
 */
	// Output and termination conditions
	printf("%s",outputMsg);	

	//END CLOCK*****************************************
	end = omp_get_wtime();
	printf("\nComputation: %f seconds", end - start);
	fflush(stdout);
	//**************************************************
	//START CLOCK***************************************
	start = omp_get_wtime();
	//**************************************************

	

	if (changes <= minChanges) {
		printf("\n\nTermination condition:\nMinimum number of changes reached: %d [%d]", changes, minChanges);
	}
	else if (it >= maxIterations) {
		printf("\n\nTermination condition:\nMaximum number of iterations reached: %d [%d]", it, maxIterations);
	}
	else {
		printf("\n\nTermination condition:\nCentroid update precision reached: %g [%g]", maxDist, maxThreshold);
	}	

	// Writing the classification of each point to the output file.
	error = writeResult(classMap, lines, argv[6]);
	if(error != 0)
	{
		showFileError(error, argv[6]);
		exit(error);
	}

	//Free memory
	free(data);
	free(classMap);
	free(centroidPos);
	free(centroids);
	free(distCentroids);
	free(pointsPerClass);
	free(auxCentroids);
	free(outputMsg);

	//END CLOCK*****************************************
	end = omp_get_wtime();
	printf("\n\nMemory deallocation: %f seconds\n", end - start);
	fflush(stdout);
	//***************************************************/
	return 0;
}
443 | 


--------------------------------------------------------------------------------
/projects/kmeans/LICENSE:
--------------------------------------------------------------------------------
 1 | 
 2 | k-Means clustering algorithm
 3 | 
 4 | Parallel computing (Degree in Computer Engineering)
 5 | 2022/2023
 6 | 
 7 | EduHPC 2023: Peachy assignment
 8 | 
 9 | (c) 2022-2023 Diego García-Álvarez, Arturo Gonzalez-Escribano
10 | Group Trasgo, Universidad de Valladolid (Spain)
11 | 
12 | This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
13 | https://creativecommons.org/licenses/by-sa/4.0/
14 | 
15 | 


--------------------------------------------------------------------------------
/projects/kmeans/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # K-means 
 3 | #
 4 | # Parallel computing (Degree in Computer Engineering)
 5 | # 2022/2023
 6 | #
 7 | # (c) 2023 Diego Garcia-Alvarez and Arturo Gonzalez-Escribano
 8 | # Grupo Trasgo, Universidad de Valladolid (Spain)
 9 | #
10 | 
11 | # Compilers
12 | CC=gcc
13 | OMPFLAG=-fopenmp
14 | MPICC=mpicc
15 | CUDACC=nvcc
16 | 
17 | # Flags for optimization and libs
18 | FLAGS=-O3 -Wall
19 | LIBS=-lm
20 | 
21 | # Targets to build
22 | OBJS=KMEANS_seq KMEANS_omp KMEANS_mpi KMEANS_cuda
23 | 
# Rules. By default show help
# These targets are commands, not files: always run them even if a file with that name exists
.PHONY: help all clean debug

help:
	@echo
	@echo "K-means clustering method"
	@echo
	@echo "Group Trasgo, Universidad de Valladolid (Spain)"
	@echo
	@echo "make KMEANS_seq	Build only the sequential version"
	@echo "make KMEANS_omp	Build only the OpenMP version"
	@echo "make KMEANS_mpi	Build only the MPI version"
	@echo "make KMEANS_cuda	Build only the CUDA version"
	@echo
	@echo "make all	Build all versions (Sequential, OpenMP, MPI, CUDA)"
	@echo "make debug	Build all versions in debug mode (-DDEBUG -g, no optimization flags)"
	@echo "make clean	Remove targets"
	@echo

all: $(OBJS)

KMEANS_seq: KMEANS.c
	$(CC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@

KMEANS_omp: KMEANS_omp.c
	$(CC) $(FLAGS) $(DEBUG) $(OMPFLAG) $< $(LIBS) -o $@

KMEANS_mpi: KMEANS_mpi.c
	$(MPICC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@

KMEANS_cuda: KMEANS_cuda.cu
	$(CUDACC) $(DEBUG) $< $(LIBS) -o $@


# Remove the target files
clean:
	rm -rf $(OBJS)

# Compile in debug mode
# $(MAKE) (instead of a literal "make") re-runs the same make program and forwards its options
debug:
	$(MAKE) DEBUG="-DDEBUG -g" FLAGS= all
63 | 
64 | 


--------------------------------------------------------------------------------
/projects/kmeans/README:
--------------------------------------------------------------------------------
 1 | 
 2 | k-Means clustering algorithm
 3 | 
 4 | EduHPC 2023: Peachy assignment
 5 | 
 6 | (c) 2022 Diego García-Álvarez, Arturo Gonzalez-Escribano
 7 | Group Trasgo, Universidad de Valladolid (Spain)
 8 | 
 9 | --------------------------------------------------------------
10 | 
11 | Read the handout and use the sequential code as reference to study.
12 | Use the other source files to parallelize with the proper programming model.
13 | 
14 | Edit the first lines in the Makefile to set your preferred compilers and flags
15 | for both the sequential code and for each parallel programming model: 
16 | OpenMP, MPI, and CUDA.
17 | 
18 | To see a description of the Makefile options execute:
19 | $ make help 
20 | 
21 | Use the input files in the test_files directory for your first tests.
22 | Students are encouraged to manually write or automatically generate
23 | their own input files for more complete tests. See a description of
24 | the input files format in the handout.
25 | 
26 | 


--------------------------------------------------------------------------------
/projects/kmeans/handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanieleDeSensi/multicore-programming/d8d141d0d4e11d67f6474db3fde2ed0cf4c588ca/projects/kmeans/handout.pdf


--------------------------------------------------------------------------------
/projects/kmeans/test_files/input2D2.inp:
--------------------------------------------------------------------------------
 1 | -81	0
 2 | 47	84
 3 | 55	-3
 4 | -33	29
 5 | 5	-93
 6 | -47	72
 7 | 34	-15
 8 | 43	0
 9 | 98	-73
10 | -9	-18
11 | -44	67
12 | 86	-94
13 | -77	-59
14 | 82	-90
15 | 60	-21
16 | 61	29
17 | 80	-43
18 | -38	-16
19 | 54	30
20 | 63	-42
21 | 


--------------------------------------------------------------------------------
/projects/sequence/LICENSE:
--------------------------------------------------------------------------------
 1 | 
 2 | Exact DNA sequence alignment for multiple patterns
 3 | 
 4 | Parallel computing (Degree in Computer Engineering)
 5 | 2023/2024
 6 | 
 7 | EduHPC 2024: Peachy assignment
 8 | 
 9 | (c) 2023-2024 Arturo Gonzalez-Escribano, Diego García-Álvarez, Jesús Cámara 
10 | Group Trasgo, Grupo GAMUVa, Universidad de Valladolid (Spain)
11 | 
12 | This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
13 | https://creativecommons.org/licenses/by-sa/4.0/
14 | 
15 | 


--------------------------------------------------------------------------------
/projects/sequence/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Exact genetic sequence alignment
 3 | #
 4 | # Parallel computing (Degree in Computer Engineering)
 5 | # 2023/2024
 6 | #
 7 | # (c) 2024 Arturo Gonzalez-Escribano
 8 | # Grupo Trasgo, Universidad de Valladolid (Spain)
 9 | #
10 | 
11 | # Compilers
12 | CC=gcc
13 | OMPFLAG=-fopenmp
14 | MPICC=mpicc
15 | CUDACC=nvcc
16 | 
17 | # Flags for optimization and external libs
18 | LIBS=-lm
19 | FLAGS=-O3 -Wall
20 | CUDAFLAGS=-O3 -Xcompiler -Wall
21 | 
22 | # Targets to build
23 | OBJS=align_seq align_omp align_mpi align_cuda
24 | 
# Rules. By default show help
# These targets are commands, not files: always run them even if a file with that name exists
.PHONY: help all clean debug

help:
	@echo
	@echo "Exact genetic sequence alignment"
	@echo
	@echo "Group Trasgo, Universidad de Valladolid (Spain)"
	@echo
	@echo "make align_seq	Build only the sequential version"
	@echo "make align_omp	Build only the OpenMP version"
	@echo "make align_mpi	Build only the MPI version"
	@echo "make align_cuda	Build only the CUDA version"
	@echo
	@echo "make all	Build all versions (Sequential, OpenMP, MPI, CUDA)"
	@echo "make debug	Build all version with demo output for small sequences"
	@echo "make clean	Remove targets"
	@echo

all: $(OBJS)

# rng.c is listed as a prerequisite only to trigger rebuilds: $< passes just the
# first prerequisite to the compiler
align_seq: align.c rng.c
	$(CC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@

align_omp: align_omp.c rng.c
	$(CC) $(FLAGS) $(DEBUG) $(OMPFLAG) $< $(LIBS) -o $@

align_mpi: align_mpi.c rng.c
	$(MPICC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@

align_cuda: align_cuda.cu rng.c
	$(CUDACC) $(CUDAFLAGS) $(DEBUG) $< $(LIBS) -o $@


# Remove the target files
clean:
	rm -rf $(OBJS)

# Compile in debug mode
# $(MAKE) (instead of a literal "make") re-runs the same make program and forwards its options
debug:
	$(MAKE) DEBUG="-DDEBUG -g" all
64 | 


--------------------------------------------------------------------------------
/projects/sequence/README:
--------------------------------------------------------------------------------
 1 | 
 2 | EduHPC 2024: Peachy assignment
 3 | 
 4 | (c) 2023-2024 Arturo Gonzalez-Escribano, Diego García-Álvarez, Jesús Cámara 
 5 | Group Trasgo, Grupo GAMUVa, Universidad de Valladolid (Spain)
 6 | 
 7 | --------------------------------------------------------------
 8 | 
 9 | Read the handout and use the sequential code as reference to study.
10 | Use the other source files to parallelize with the proper programming model.
11 | 
12 | Edit the first lines in the Makefile to set your preferred compilers and flags
13 | for both the sequential code and for each parallel programming model: 
14 | OpenMP, MPI, and CUDA.
15 | 
16 | To see a description of the Makefile options execute:
17 | $ make help 
18 | 
19 | Use the following program arguments for your first tests.
20 | Students are encouraged to generate their own program arguments for more 
21 | complete tests. See a description of the program arguments in the handout.
22 | 
23 | 
24 | Example tests
25 | ==============
26 | 
27 | 1) Basic test:
28 | --------------
29 | 300 0.1 0.3 0.35 100 5 5 300 150 50 150 80 M 609823
30 | 
31 | 
32 | 2) Simple tests for race conditions:
33 | ------------------------------------
34 | 1000 0.35 0.2 0.25 0 0 0 20000 10 0 500 0 M 4353435
35 | 
36 | 10000 0.35 0.2 0.25 0 0 0 10000 9000 9000 50 100 M 4353435
37 | 
38 | 
39 | 3) Check that the program works for sequences longer than INT_MAX:
40 | ------------------------------------------------------------------
41 | 4294967300 0.35 0.2 0.25 0 0 0 1 1 0 4294967298 0 M 683224
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/projects/sequence/align.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Exact genetic sequence alignment
  3 |  * (Using brute force)
  4 |  *
  5 |  * Reference sequential version (Do not modify this code)
  6 |  *
  7 |  * Computacion Paralela, Grado en Informatica (Universidad de Valladolid)
  8 |  * 2023/2024
  9 |  *
 10 |  * v1.2
 11 |  *
 12 |  * (c) 2024, Arturo Gonzalez-Escribano
 13 |  */
 14 | #include<stdio.h>
 15 | #include<stdlib.h>
 16 | #include<string.h>
 17 | #include<limits.h>
 18 | #include<sys/time.h>
 19 | 
 20 | 
 21 | /* Arbitrary value to indicate that no matches are found */
 22 | #define	NOT_FOUND	-1
 23 | 
 24 | /* Arbitrary value to restrict the checksums period */
 25 | #define CHECKSUM_MAX	65535
 26 | 
 27 | 
 28 | /* 
 29 |  * Utils: Function to get wall time
 30 |  */
 31 | double cp_Wtime(){
 32 | 	struct timeval tv;
 33 | 	gettimeofday(&tv, NULL);
 34 | 	return tv.tv_sec + 1.0e-6 * tv.tv_usec;
 35 | }
 36 | 
 37 | /*
 38 |  * Utils: Random generator
 39 |  */
 40 | #include "rng.c"
 41 | 
 42 | 
 43 | /*
 44 |  *
 45 |  * START HERE: DO NOT CHANGE THE CODE ABOVE THIS POINT
 46 |  *
 47 |  */
 48 | 
 49 | /*
 50 |  * Function: Increment the number of pattern matches on the sequence positions
 51 |  * 	This function can be changed and/or optimized by the students
 52 |  */
 53 | void increment_matches( int pat, unsigned long *pat_found, unsigned long *pat_length, int *seq_matches ) {
 54 | 	unsigned long ind;	
 55 | 	for( ind=0; ind<pat_length[pat]; ind++) {
 56 | 		if ( seq_matches[ pat_found[pat] + ind ] == NOT_FOUND )
 57 | 			seq_matches[ pat_found[pat] + ind ] = 0;
 58 | 		else
 59 | 			seq_matches[ pat_found[pat] + ind ] ++;
 60 | 	}
 61 | }
 62 | 
 63 | /*
 64 |  * Function: Fill random sequence or pattern
 65 |  */
 66 | void generate_rng_sequence( rng_t *random, float prob_G, float prob_C, float prob_A, char *seq, unsigned long length) {
 67 | 	unsigned long ind; 
 68 | 	for( ind=0; ind<length; ind++ ) {
 69 | 		double prob = rng_next( random );
 70 | 		if( prob < prob_G ) seq[ind] = 'G';
 71 | 		else if( prob < prob_C ) seq[ind] = 'C';
 72 | 		else if( prob < prob_A ) seq[ind] = 'A';
 73 | 		else seq[ind] = 'T';
 74 | 	}
 75 | }
 76 | 
 77 | /*
 78 |  * Function: Copy a sample of the sequence
 79 |  */
 80 | void copy_sample_sequence( rng_t *random, char *sequence, unsigned long seq_length, unsigned long pat_samp_loc_mean, unsigned long pat_samp_loc_dev, char *pattern, unsigned long length) {
 81 | 	/* Choose location */
 82 | 	unsigned long  location = (unsigned long)rng_next_normal( random, (double)pat_samp_loc_mean, (double)pat_samp_loc_dev );
 83 | 	if ( location > seq_length - length ) location = seq_length - length;
 84 | 	if ( location <= 0 ) location = 0;
 85 | 
 86 | 	/* Copy sample */
 87 | 	unsigned long ind; 
 88 | 	for( ind=0; ind<length; ind++ )
 89 | 		pattern[ind] = sequence[ind+location];
 90 | }
 91 | 
 92 | /*
 93 |  *
 94 |  * STOP HERE: DO NOT CHANGE THE CODE BELOW THIS POINT
 95 |  *
 96 |  */
 97 | 
 98 | /*
 99 |  * Function: Allocate new patttern
100 |  */
101 | char *pattern_allocate( rng_t *random, unsigned long pat_rng_length_mean, unsigned long pat_rng_length_dev, unsigned long seq_length, unsigned long *new_length ) {
102 | 
103 | 	/* Random length */
104 | 	unsigned long length = (unsigned long)rng_next_normal( random, (double)pat_rng_length_mean, (double)pat_rng_length_dev );
105 | 	if ( length > seq_length ) length = seq_length;
106 | 	if ( length <= 0 ) length = 1;
107 | 
108 | 	/* Allocate pattern */
109 | 	char *pattern = (char *)malloc( sizeof(char) * length );
110 | 	if ( pattern == NULL ) {
111 | 		fprintf(stderr,"\n-- Error allocating a pattern of size: %lu\n", length );
112 | 		exit( EXIT_FAILURE );
113 | 	}
114 | 
115 | 	/* Return results */
116 | 	*new_length = length;
117 | 	return pattern;
118 | }
119 | 
120 | 
121 | /*
122 |  * Function: Regenerate a sample of the sequence
123 |  */
124 | void generate_sample_sequence( rng_t *random, rng_t random_seq, float prob_G, float prob_C, float prob_A, unsigned long seq_length, unsigned long pat_samp_loc_mean, unsigned long pat_samp_loc_dev, char *pattern, unsigned long length ) {
125 | 	/* Choose location */
126 | 	unsigned long  location = (unsigned long)rng_next_normal( random, (double)pat_samp_loc_mean, (double)pat_samp_loc_dev );
127 | 	if ( location > seq_length - length ) location = seq_length - length;
128 | 	if ( location <= 0 ) location = 0;
129 | 
130 | 	/* Regenerate sample */
131 | 	rng_t local_random = random_seq;
132 | 	rng_skip( &local_random, location );
133 | 	generate_rng_sequence( &local_random, prob_G, prob_C, prob_A, pattern, length);
134 | }
135 | 
136 | 
/*
 * Function: Print usage line in stderr
 * 	Writes "Usage: <program_name> " followed by the list of expected arguments
 * 	and an empty line.
 */
void show_usage( char *program_name ) {
	static const char arguments[] = "<seq_length> <prob_G> <prob_C> <prob_A> <pat_rng_num> <pat_rng_length_mean> <pat_rng_length_dev> <pat_samples_num> <pat_samp_length_mean> <pat_samp_length_dev> <pat_samp_loc_mean> <pat_samp_loc_dev> <pat_samp_mix:B[efore]|A[fter]|M[ixed]> <long_seed>\n";

	fprintf(stderr,"Usage: %s ", program_name );
	fputs( arguments, stderr );
	fputc( '\n', stderr );
}
145 | 
146 | 
147 | 
148 | /*
149 |  * MAIN PROGRAM
150 |  */
151 | int main(int argc, char *argv[]) {
152 | 	/* 0. Default output and error without buffering, forces to write immediately */
153 | 	setbuf(stdout, NULL);
154 | 	setbuf(stderr, NULL);
155 | 
156 | 	/* 1. Read scenary arguments */
157 | 	/* 1.1. Check minimum number of arguments */
158 | 	if (argc < 15) {
159 | 		fprintf(stderr, "\n-- Error: Not enough arguments when reading configuration from the command line\n\n");
160 | 		show_usage( argv[0] );
161 | 		exit( EXIT_FAILURE );
162 | 	}
163 | 
164 | 	/* 1.2. Read argument values */
165 | 	unsigned long seq_length = atol( argv[1] );
166 | 	float prob_G = atof( argv[2] );
167 | 	float prob_C = atof( argv[3] );
168 | 	float prob_A = atof( argv[4] );
169 | 	if ( prob_G + prob_C + prob_A > 1 ) {
170 | 		fprintf(stderr, "\n-- Error: The sum of G,C,A,T nucleotid probabilities cannot be higher than 1\n\n");
171 | 		show_usage( argv[0] );
172 | 		exit( EXIT_FAILURE );
173 | 	}
174 | 	prob_C += prob_G;
175 | 	prob_A += prob_C;
176 | 
177 | 	int pat_rng_num = atoi( argv[5] );
178 | 	unsigned long pat_rng_length_mean = atol( argv[6] );
179 | 	unsigned long pat_rng_length_dev = atol( argv[7] );
180 | 	
181 | 	int pat_samp_num = atoi( argv[8] );
182 | 	unsigned long pat_samp_length_mean = atol( argv[9] );
183 | 	unsigned long pat_samp_length_dev = atol( argv[10] );
184 | 	unsigned long pat_samp_loc_mean = atol( argv[11] );
185 | 	unsigned long pat_samp_loc_dev = atol( argv[12] );
186 | 
187 | 	char pat_samp_mix = argv[13][0];
188 | 	if ( pat_samp_mix != 'B' && pat_samp_mix != 'A' && pat_samp_mix != 'M' ) {
189 | 		fprintf(stderr, "\n-- Error: Incorrect first character of pat_samp_mix: %c\n\n", pat_samp_mix);
190 | 		show_usage( argv[0] );
191 | 		exit( EXIT_FAILURE );
192 | 	}
193 | 
194 | 	unsigned long seed = atol( argv[14] );
195 | 
196 | #ifdef DEBUG
197 | 	/* DEBUG: Print arguments */
198 | 	printf("\nArguments: seq_length=%lu\n", seq_length );
199 | 	printf("Arguments: Accumulated probabilitiy G=%f, C=%f, A=%f, T=1\n", prob_G, prob_C, prob_A );
200 | 	printf("Arguments: Random patterns number=%d, length_mean=%lu, length_dev=%lu\n", pat_rng_num, pat_rng_length_mean, pat_rng_length_dev );
201 | 	printf("Arguments: Sample patterns number=%d, length_mean=%lu, length_dev=%lu, loc_mean=%lu, loc_dev=%lu\n", pat_samp_num, pat_samp_length_mean, pat_samp_length_dev, pat_samp_loc_mean, pat_samp_loc_dev );
202 | 	printf("Arguments: Type of mix: %c, Random seed: %lu\n", pat_samp_mix, seed );
203 | 	printf("\n");
204 | #endif // DEBUG
205 | 
206 | 	/* 2. Initialize data structures */
207 | 	/* 2.1. Allocate and fill sequence */
208 | 	char *sequence = (char *)malloc( sizeof(char) * seq_length );
209 | 	if ( sequence == NULL ) {
210 | 		fprintf(stderr,"\n-- Error allocating the sequence for size: %lu\n", seq_length );
211 | 		exit( EXIT_FAILURE );
212 | 	}
213 | 	rng_t random = rng_new( seed );
214 | 	generate_rng_sequence( &random, prob_G, prob_C, prob_A, sequence, seq_length);
215 | 
216 | 	/* 2.2. Allocate and fill patterns */
217 | 	/* 2.2.1 Allocate main structures */
218 | 	int pat_number = pat_rng_num + pat_samp_num;
219 | 	unsigned long *pat_length = (unsigned long *)malloc( sizeof(unsigned long) * pat_number );
220 | 	char **pattern = (char **)malloc( sizeof(char*) * pat_number );
221 | 	if ( pattern == NULL || pat_length == NULL ) {
222 | 		fprintf(stderr,"\n-- Error allocating the basic patterns structures for size: %d\n", pat_number );
223 | 		exit( EXIT_FAILURE );
224 | 	}
225 | 
226 | 	/* 2.2.2 Allocate and initialize ancillary structure for pattern types */
227 | 	int ind;
228 | 	unsigned long lind;
229 | 	#define PAT_TYPE_NONE	0
230 | 	#define PAT_TYPE_RNG	1
231 | 	#define PAT_TYPE_SAMP	2
232 | 	char *pat_type = (char *)malloc( sizeof(char) * pat_number );
233 | 	if ( pat_type == NULL ) {
234 | 		fprintf(stderr,"\n-- Error allocating ancillary structure for pattern of size: %d\n", pat_number );
235 | 		exit( EXIT_FAILURE );
236 | 	}
237 | 	for( ind=0; ind<pat_number; ind++ ) pat_type[ind] = PAT_TYPE_NONE;
238 | 
239 | 	/* 2.2.3 Fill up pattern types using the chosen mode */
240 | 	switch( pat_samp_mix ) {
241 | 	case 'A':
242 | 		for( ind=0; ind<pat_rng_num; ind++ ) pat_type[ind] = PAT_TYPE_RNG;
243 | 		for( ; ind<pat_number; ind++ ) pat_type[ind] = PAT_TYPE_SAMP;
244 | 		break;
245 | 	case 'B':
246 | 		for( ind=0; ind<pat_samp_num; ind++ ) pat_type[ind] = PAT_TYPE_SAMP;
247 | 		for( ; ind<pat_number; ind++ ) pat_type[ind] = PAT_TYPE_RNG;
248 | 		break;
249 | 	default:
250 | 		if ( pat_rng_num == 0 ) {
251 | 			for( ind=0; ind<pat_number; ind++ ) pat_type[ind] = PAT_TYPE_SAMP;
252 | 		}
253 | 		else if ( pat_samp_num == 0 ) {
254 | 			for( ind=0; ind<pat_number; ind++ ) pat_type[ind] = PAT_TYPE_RNG;
255 | 		}
256 | 		else if ( pat_rng_num < pat_samp_num ) {
257 | 			int interval = pat_number / pat_rng_num;
258 | 			for( ind=0; ind<pat_number; ind++ ) 
259 | 				if ( (ind+1) % interval == 0 ) pat_type[ind] = PAT_TYPE_RNG;
260 | 				else pat_type[ind] = PAT_TYPE_SAMP;
261 | 		}
262 | 		else {
263 | 			int interval = pat_number / pat_samp_num;
264 | 			for( ind=0; ind<pat_number; ind++ ) 
265 | 				if ( (ind+1) % interval == 0 ) pat_type[ind] = PAT_TYPE_SAMP;
266 | 				else pat_type[ind] = PAT_TYPE_RNG;
267 | 		}
268 | 	}
269 | 
270 | 	/* 2.2.4 Generate the patterns */
271 | 	for( ind=0; ind<pat_number; ind++ ) {
272 | 		if ( pat_type[ind] == PAT_TYPE_RNG ) {
273 | 			pattern[ind] = pattern_allocate( &random, pat_rng_length_mean, pat_rng_length_dev, seq_length, &pat_length[ind] );
274 | 			generate_rng_sequence( &random, prob_G, prob_C, prob_A, pattern[ind], pat_length[ind] );
275 | 		}
276 | 		else if ( pat_type[ind] == PAT_TYPE_SAMP ) {
277 | 			pattern[ind] = pattern_allocate( &random, pat_samp_length_mean, pat_samp_length_dev, seq_length, &pat_length[ind] );
278 | #ifdef REGENERATE_SAMPLE_PATTERNS
279 | 			rng_t random_seq_orig = rng_new( seed );
280 | 			generate_sample_sequence( &random, random_seq_orig, prob_G, prob_C, prob_A, seq_length, pat_samp_loc_mean, pat_samp_loc_dev, pattern[ind], pat_length[ind] );
281 | #else
282 | 			copy_sample_sequence( &random, sequence, seq_length, pat_samp_loc_mean, pat_samp_loc_dev, pattern[ind], pat_length[ind] );
283 | #endif
284 | 		}
285 | 		else {
286 | 			fprintf(stderr,"\n-- Error internal: Paranoic check! A pattern without type at position %d\n", ind );
287 | 			exit( EXIT_FAILURE );
288 | 		}
289 | 	}
290 | 	free( pat_type );
291 | 
292 | #ifdef DEBUG
293 | 	/* DEBUG: Print sequence and patterns */
294 | 	printf("-----------------\n");
295 | 	printf("Sequence: ");
296 | 	for( ind=0; ind<seq_length; ind++ ) 
297 | 		printf( "%c", sequence[ind] );
298 | 	printf("\n-----------------\n");
299 | 	printf("Patterns: %d ( rng: %d, samples: %d )\n", pat_number, pat_rng_num, pat_samp_num );
300 | 	int debug_pat;
301 | 	for( debug_pat=0; debug_pat<pat_number; debug_pat++ ) {
302 | 		printf( "Pat[%d]: ", debug_pat );
303 | 		for( ind=0; ind<pat_length[debug_pat]; ind++ ) 
304 | 			printf( "%c", pattern[debug_pat][ind] );
305 | 		printf("\n");
306 | 	}
307 | 	printf("-----------------\n\n");
308 | #endif // DEBUG
309 | 
310 | 	/* Avoid the usage of arguments to take strategic decisions
311 | 	 * In a real case the user only has the patterns and sequence data to analize
312 | 	 */
313 | 	argc = 0;
314 | 	argv = NULL;
315 | 	pat_rng_num = 0;
316 | 	pat_rng_length_mean = 0;
317 | 	pat_rng_length_dev = 0;
318 | 	pat_samp_num = 0;
319 | 	pat_samp_length_mean = 0;
320 | 	pat_samp_length_dev = 0;
321 | 	pat_samp_loc_mean = 0;
322 | 	pat_samp_loc_dev = 0;
323 | 	pat_samp_mix = '0';
324 | 
325 | 	/* 2.3. Other result data and structures */
326 | 	int pat_matches = 0;
327 | 
328 | 	/* 2.3.1. Other results related to patterns */
329 | 	unsigned long *pat_found;
330 | 	pat_found = (unsigned long *)malloc( sizeof(unsigned long) * pat_number );
331 | 	if ( pat_found == NULL ) {
332 | 		fprintf(stderr,"\n-- Error allocating aux pattern structure for size: %d\n", pat_number );
333 | 		exit( EXIT_FAILURE );
334 | 	}
335 | 	/* 2.3.2. Other results related to the main sequence */
336 | 	int *seq_matches;
337 | 	seq_matches = (int *)malloc( sizeof(int) * seq_length );
338 | 	if ( seq_matches == NULL ) {
339 | 		fprintf(stderr,"\n-- Error allocating aux sequence structures for size: %lu\n", seq_length );
340 | 		exit( EXIT_FAILURE );
341 | 	}
342 | 
343 | 	
344 | 	/* 3. Start global timer */
345 | 	double ttotal = cp_Wtime();
346 | 
347 | /*
348 |  *
349 |  * START HERE: DO NOT CHANGE THE CODE ABOVE THIS POINT
350 |  *
351 |  */
352 | 
353 | 	/* 4. Initialize ancillary structures */
354 | 	for( ind=0; ind<pat_number; ind++) {
355 | 		pat_found[ind] = (unsigned long)NOT_FOUND;
356 | 	}
357 | 	for( lind=0; lind<seq_length; lind++) {
358 | 		seq_matches[lind] = NOT_FOUND;
359 | 	}
360 | 
361 | 	/* 5. Search for each pattern */
362 | 	unsigned long start;
363 | 	int pat;
364 | 	for( pat=0; pat < pat_number; pat++ ) {
365 | 
366 | 		/* 5.1. For each posible starting position */
367 | 		for( start=0; start <= seq_length - pat_length[pat]; start++) {
368 | 
369 | 			/* 5.1.1. For each pattern element */
370 | 			for( lind=0; lind<pat_length[pat]; lind++) {
371 | 				/* Stop this test when different nucleotids are found */
372 | 				if ( sequence[start + lind] != pattern[pat][lind] ) break;
373 | 			}
374 | 			/* 5.1.2. Check if the loop ended with a match */
375 | 			if ( lind == pat_length[pat] ) {
376 | 				pat_matches++;
377 | 				pat_found[pat] = start;
378 | 				break;
379 | 			}
380 | 		}
381 | 
382 | 		/* 5.2. Pattern found */
383 | 		if ( pat_found[pat] != (unsigned long)NOT_FOUND ) {
384 | 			/* 4.2.1. Increment the number of pattern matches on the sequence positions */
385 | 			increment_matches( pat, pat_found, pat_length, seq_matches );
386 | 		}
387 | 	}
388 | 
389 | 	/* 7. Check sums */
390 | 	unsigned long checksum_matches = 0;
391 | 	unsigned long checksum_found = 0;
392 | 	for( ind=0; ind < pat_number; ind++) {
393 | 		if ( pat_found[ind] != (unsigned long)NOT_FOUND )
394 | 			checksum_found = ( checksum_found + pat_found[ind] ) % CHECKSUM_MAX;
395 | 	}
396 | 	for( lind=0; lind < seq_length; lind++) {
397 | 		if ( seq_matches[lind] != NOT_FOUND )
398 | 			checksum_matches = ( checksum_matches + seq_matches[lind] ) % CHECKSUM_MAX;
399 | 	}
400 | 
401 | #ifdef DEBUG
402 | 	/* DEBUG: Write results */
403 | 	printf("-----------------\n");
404 | 	printf("Found start:");
405 | 	for( debug_pat=0; debug_pat<pat_number; debug_pat++ ) {
406 | 		printf( " %lu", pat_found[debug_pat] );
407 | 	}
408 | 	printf("\n");
409 | 	printf("-----------------\n");
410 | 	printf("Matches:");
411 | 	for( lind=0; lind<seq_length; lind++ ) 
412 | 		printf( " %d", seq_matches[lind] );
413 | 	printf("\n");
414 | 	printf("-----------------\n");
415 | #endif // DEBUG
416 | 
417 | 	/* Free local resources */	
418 | 	free( sequence );
419 | 	free( seq_matches );
420 | 
421 | /*
422 |  *
423 |  * STOP HERE: DO NOT CHANGE THE CODE BELOW THIS POINT
424 |  *
425 |  */
426 | 
427 | 	/* 8. Stop global timer */
428 | 	ttotal = cp_Wtime() - ttotal;
429 | 
430 | 	/* 9. Output for leaderboard */
431 | 	printf("\n");
432 | 	/* 9.1. Total computation time */
433 | 	printf("Time: %lf\n", ttotal );
434 | 
435 | 	/* 9.2. Results: Statistics */
436 | 	printf("Result: %d, %lu, %lu\n\n", 
437 | 			pat_matches,
438 | 			checksum_found,
439 | 			checksum_matches );
440 | 		
441 | 	/* 10. Free resources */	
442 | 	int i;
443 | 	for( i=0; i<pat_number; i++ ) free( pattern[i] );
444 | 	free( pattern );
445 | 	free( pat_length );
446 | 	free( pat_found );
447 | 
448 | 	/* 11. End */
449 | 	return 0;
450 | }
451 | 


--------------------------------------------------------------------------------
/projects/sequence/handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanieleDeSensi/multicore-programming/d8d141d0d4e11d67f6474db3fde2ed0cf4c588ca/projects/sequence/handout.pdf


--------------------------------------------------------------------------------
/projects/sequence/rng.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Simple random generator
 3 |  * LCG (Linear Congruential Generator)
 4 |  *
 5 |  * Computacion Paralela, Grado en Informatica (Universidad de Valladolid)
 6 |  * 2023/2024
 7 |  *
 8 |  * v1.2
 9 |  *
10 |  * (c) 2024, Arturo Gonzalez-Escribano
11 |  */
12 | #include<stdint.h>
13 | #include<math.h>
14 | 
15 | /*
16 |  * Constants
17 |  */
18 | #define RNG_MULTIPLIER 6364136223846793005ULL
19 | #define RNG_INCREMENT  1442695040888963407ULL
20 | 
21 | /*
22 |  * Type for random sequences state
23 |  */
24 | typedef uint64_t	rng_t;
25 | 
/*
 * rng_new: build the initial generator state from a seed.
 *
 * The seed goes through three xor-shift steps (shifts of 30, 27 and 31 bits);
 * the first two are each followed by a multiplication with a fixed 64-bit
 * constant. All arithmetic is unsigned and wraps modulo 2^64.
 * A seed of 0 yields the state 0.
 *
 * seed:    any 64-bit value.
 * returns: the state to hand to rng_next() / rng_next_normal() / rng_skip().
 */
#ifdef __CUDACC__
__host__ __device__ 
#endif
rng_t rng_new(uint64_t seed) {
    uint64_t mixed = seed;

    mixed ^= mixed >> 30;
    mixed *= 0xbf58476d1ce4e5b9ULL;

    mixed ^= mixed >> 27;
    mixed *= 0x94d049bb133111ebULL;

    mixed ^= mixed >> 31;

    return mixed; // initial state
}
39 | 
/*
 * rng_next: advance the state one step and return a number in [0,1].
 * Adapted from the implementation on PCG (https://www.pcg-random.org/)
 *
 * The state is replaced by state * RNG_MULTIPLIER + RNG_INCREMENT (unsigned
 * arithmetic, wrapping modulo 2^64) and the new state is scaled by 2^-64.
 *
 * NOTE(review): the scaling goes through ldexpf(), so the 64-bit state is
 * first converted to single precision. The result therefore has float
 * resolution, and states close to 2^64 round up to exactly 1.0. Left as is
 * because any change here alters every generated number; confirm with the
 * reference results before touching it.
 *
 * seq:     generator state, updated in place.
 * returns: the scaled new state, widened to double.
 */
#ifdef __CUDACC__
__host__ __device__ 
#endif
double rng_next(rng_t *seq) {
    const rng_t advanced = RNG_MULTIPLIER * (*seq) + RNG_INCREMENT;
    *seq = advanced;

    const float scaled = ldexpf( (float)advanced, -64 );
    return (double)scaled;
}
51 | 
52 | /*
53 |  * Next Normal: Advance state and return a double number distributed with a normal(mu,sigma)
54 |  */
55 | #ifdef __CUDACC__
56 | __host__ __device__ 
57 | #endif
58 | double rng_next_normal( rng_t *seq, double mu, double sigma) {
59 |     double u1 = rng_next(seq);
60 |     double u2 = rng_next(seq);
61 | 
62 |     double z0 = sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
63 |     // double z1 = sqrt(-2.0 * log(u1)) * sin(2.0 * M_PI * u2);
64 |     
65 |     return mu + sigma * z0;
66 | }
67 | 
/*
 * rng_skip: move the state forward by `steps` generator steps in one call.
 * Adapted from the implementation on PCG (https://www.pcg-random.org/)
 *
 * One step is the map  x -> x * RNG_MULTIPLIER + RNG_INCREMENT.
 * The loop walks the bits of `steps` from least to most significant. While
 * looking at bit k, (step_mult, step_plus) describes the map for 2^k steps;
 * it is folded into (total_mult, total_plus) when that bit is set, and then
 * squared for the next bit. The loop therefore runs once per significant bit
 * of `steps`. All arithmetic is unsigned and wraps modulo 2^64.
 *
 * seq:   generator state, updated in place (unchanged when steps == 0).
 * steps: number of single steps to jump over.
 */
#ifdef __CUDACC__
__host__ __device__ 
#endif
void rng_skip( rng_t *seq, uint64_t steps ) {
    uint64_t step_mult  = RNG_MULTIPLIER;
    uint64_t step_plus  = RNG_INCREMENT;
    uint64_t total_mult = 1u;
    uint64_t total_plus = 0u;

    for ( uint64_t remaining = steps; remaining != 0; remaining >>= 1 ) {
        if ( (remaining & 1u) != 0 ) {
            /* Apply the 2^k-step map on top of what has been accumulated */
            total_plus = total_plus * step_mult + step_plus;
            total_mult = total_mult * step_mult;
        }
        /* Compose the current map with itself: twice as many steps */
        step_plus *= step_mult + 1;
        step_mult *= step_mult;
    }

    *seq = total_mult * (*seq) + total_plus;
}
92 | 
93 | 


--------------------------------------------------------------------------------
/projects/wind/LICENSE:
--------------------------------------------------------------------------------
 1 | 
 2 | Simulation of a Wind Tunnel
 3 | 
 4 | Parallel computing (Degree in Computer Engineering)
 5 | 2020/2021
 6 | 
 7 | EduHPC 2021: Peachy assignment
 8 | 
 9 | (c) 2021 Arturo Gonzalez-Escribano, Yuri Torres
10 | Group Trasgo, Universidad de Valladolid (Spain)
11 | 
12 | This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
13 | https://creativecommons.org/licenses/by-sa/4.0/
14 | 
15 | 


--------------------------------------------------------------------------------
/projects/wind/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Wind-tunnel
 3 | #
 4 | # Parallel computing (Degree in Computer Engineering)
 5 | # 2020/2021
 6 | #
 7 | # (c) 2021 Arturo Gonzalez-Escribano
 8 | # Grupo Trasgo, Universidad de Valladolid (Spain)
 9 | #
10 | 
# Compilers
CC=gcc
OMPFLAG=-fopenmp
MPICC=mpicc
CUDACC=nvcc

# Flags for optimization and libs
FLAGS=-O3 -Wall
LIBS=-lm

# Targets to build
OBJS=wind_seq wind_omp wind_mpi wind_cuda

# These rule names are commands, not files: always run them even if a file
# with the same name exists in the directory
.PHONY: help all clean debug

# Rules. By default show help
help:
	@echo
	@echo "Wind tunnel"
	@echo
	@echo "Group Trasgo, Universidad de Valladolid (Spain)"
	@echo
	@echo "make wind_seq	Build only the reference sequential version"
	@echo "make wind_omp	Build only the OpenMP version"
	@echo "make wind_mpi	Build only the MPI version"
	@echo "make wind_cuda	Build only the CUDA version"
	@echo
	@echo "make all	Build all versions (Sequential, OpenMP, MPI, CUDA)"
	@echo "make debug	Build all versions with demo output for small surfaces"
	@echo "make clean	Remove targets"
	@echo

all: $(OBJS)

wind_seq: wind.c
	$(CC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@

wind_omp: wind_omp.c
	$(CC) $(FLAGS) $(DEBUG) $(OMPFLAG) $< $(LIBS) -o $@

wind_mpi: wind_mpi.c
	$(MPICC) $(FLAGS) $(DEBUG) $< $(LIBS) -o $@

# FLAGS is not passed to the CUDA compiler, only DEBUG and LIBS
wind_cuda: wind_cuda.cu
	$(CUDACC) $(DEBUG) $< $(LIBS) -o $@

# Remove the target files
clean:
	rm -rf $(OBJS)

# Compile in debug mode: defines DEBUG, adds -g and clears the optimization flags.
# $(MAKE) keeps the command-line options and the job server of the parent make.
debug:
	$(MAKE) DEBUG="-DDEBUG -g" FLAGS= all
62 | 
63 | 


--------------------------------------------------------------------------------
/projects/wind/README:
--------------------------------------------------------------------------------
 1 | 
 2 | Simulation of a Wind Tunnel
 3 | 
 4 | EduHPC 2021: Peachy assignment
 5 | 
 6 | (c) 2021 Arturo Gonzalez-Escribano, Yuri Torres
 7 | Group Trasgo, Universidad de Valladolid (Spain)
 8 | 
 9 | --------------------------------------------------------------
10 | 
11 | Read the handout and use the sequential code as reference to study.
12 | Use the other source files to parallelize with the proper programming model.
13 | 
14 | Edit the first lines in the Makefile to set your preferred compilers and flags
15 | for both the sequential code and for each parallel programming model: 
16 | OpenMP, MPI, and CUDA.
17 | 
18 | To see a description of the Makefile options execute:
19 | $ make help 
20 | 
21 | Examples:
22 | 
23 | Use the following combinations of arguments for your first tests.
24 | You will discover that they represent different classes of scenarios
25 | and problems when the code is parallelized.
26 | Students are encouraged to design and use their own scenarios for more 
27 | complete tests. See a description of the arguments in the handout.
28 | 
29 | Examples:
30 | 
31 | Only propagation:
32 | ./wind_seq 538 60 1397 0.5 30 29 0 0 0 0 0 0 3431 9012 6432
33 | 
34 | ./wind_seq 456 812 1004 2.2 21 745 0 0 0 0 0 0 684 384 1292
35 | 
36 | ./wind_seq 38000 32 31000 0.5 3 24 0 0 0 0 0 0 583 1943 2345
37 | 
38 | ./wind_seq 32 2100000 118 0.1 0 2100000 0 0 0 0 0 0 673 3902 43
39 | 
40 | Fixed particles with chosen positions:
41 | ./wind_seq 102 80 352 0.1 10 50 0 0 0 0 0 0 3431 9012 12432 20 12 0.712 20 13 0.713 20 14 0.714 20 15 0.715 20 16 0.716 20 17 0.717 20 18 0.718 20 19 0.719 20 20 0.720 30 16 0.516 30 18 0.518 30 20 0.520 30 22 0.522 40 20 0.420 40 30 0.430 40 40 0.440 40 50 0.450 40 60 0.460 40 70 0.470
42 | 
43 | Fixed particles with random and chosen positions:
44 | ./wind_seq 102 80 352 0.1 10 50 15 16 0.1 0 0 0 3431 9012 12432 20 12 0.712 20 13 0.713 20 14 0.714 20 15 0.715 20 16 0.716 20 17 0.717 20 18 0.718 20 19 0.719 20 20 0.720 30 16 0.516 30 18 0.518 30 20 0.520 30 22 0.522 40 20 0.420 40 30 0.430 40 40 0.440 40 50 0.450 40 60 0.460 40 70 0.470
45 | 
46 | Fixed and moving particles with initial random positions:
47 | ./wind_seq 2100 457 6300 0.4 1 452 20 2000 0.001 16 50 0.2 583 223 712
48 | 
49 | 


--------------------------------------------------------------------------------
/projects/wind/handout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanieleDeSensi/multicore-programming/d8d141d0d4e11d67f6474db3fde2ed0cf4c588ca/projects/wind/handout.pdf


--------------------------------------------------------------------------------
/utils/create_users_pmc.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | create_user_no_prompt () {
 4 | 	echo ========$1
 5 | 	echo ========$2
 6 | 
 7 | 	local USERNAME="$1"
 8 | 	local PASSWORD="$2"
 9 | 	local GROUP="$3"
10 | 	local FULLNAME="$4"
11 | 	local EMAIL="$5"
12 | 
13 | 	local GECOS=""
14 | 	echo "Creating user ${USERNAME} ${FULLNAME} ${EMAIL}..."
15 | 	adduser --ingroup ${GROUP} --disabled-login --gecos "" ${USERNAME}
16 | 	echo "Setting password..."
17 | 	echo "${USERNAME}:${PASSWORD}" | chpasswd
18 | 	echo "Adding user to docker group..."
19 | 	usermod -aG docker ${USERNAME}
20 | 	echo "Running make..."
21 | 	( cd /var/yp/; make)
22 | }
23 | 
# Remember who launched the script: the sudo caller when run through sudo,
# the current user otherwise
if [ -n "${SUDO_USER:-}" ]; then
    user=$SUDO_USER
else
    user=$(whoami)
fi

GROUP="studenti_psmc"

# users_list.csv holds one account per line with tab-separated fields:
#   e-mail <TAB> password <TAB> name <TAB> surname
# The `|| [ -n "${EMAIL}" ]` part keeps a last line that has no trailing newline.
while IFS=$'\t' read -r EMAIL PASSWORD NAME SURNAME || [ -n "${EMAIL}" ]; do
    # Skip empty lines instead of trying to create a user with no name
    [ -n "${EMAIL}" ] || continue

    # Login name: the part of the address before '@', with dots turned into underscores
    USERNAME=$(printf '%s\n' "${EMAIL}" | cut -d '@' -f 1 | tr '.' '_')
    FULLNAME="${NAME} ${SURNAME}"
    create_user_no_prompt "${USERNAME}" "${PASSWORD}" "${GROUP}" "${FULLNAME}" "${EMAIL}"
done < users_list.csv
40 | 


--------------------------------------------------------------------------------
/utils/openmpiscript.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | ##**************************************************************
  4 | ##
  5 | ## Copyright (C) 1990-2018, Condor Team, Computer Sciences Department,
  6 | ## University of Wisconsin-Madison, WI.
  7 | ##
  8 | ## Licensed under the Apache License, Version 2.0 (the "License"); you
  9 | ## may not use this file except in compliance with the License.  You may
 10 | ## obtain a copy of the License at
 11 | ##
 12 | ##    http://www.apache.org/licenses/LICENSE-2.0
 13 | ##
 14 | ## Unless required by applicable law or agreed to in writing, software
 15 | ## distributed under the License is distributed on an "AS IS" BASIS,
 16 | ## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 17 | ## See the License for the specific language governing permissions and
 18 | ## limitations under the License.
 19 | ##
 20 | ##**************************************************************
 21 | 
 22 | # This is a script to run OpenMPI jobs under the HTCondor parallel universe.
 23 | # OpenMPI assumes that a full install is available on all execute nodes.
 24 | 
 25 | ## sample submit script
 26 | #universe = parallel
 27 | #executable = openmpiscript
 28 | #arguments = actual_mpi_job arg1 arg2 arg3
 29 | #getenv = true
 30 | #
 31 | #should_transfer_files = yes
 32 | #transfer_input_files = actual_mpi_job
 33 | #when_to_transfer_output = on_exit_or_evict
 34 | #
 35 | #output = out.$(NODE)
 36 | #error  = err.$(NODE)
 37 | #log    = log
 38 | #
 39 | #machine_count = 8
 40 | #queue
 41 | ##
 42 | 
 43 | ## configuration options
 44 | # $USE_OPENMP should be set to true if using OpenMP with your OpenMPI executable (not typical).
 45 | USE_OPENMP=false
 46 | 
 47 | # Set the paths to the helper scripts
 48 | # Get them from the HTCondor libexec directory
 49 | ORTED_LAUNCHER=$(condor_config_val libexec)/orted_launcher.sh
 50 | GET_ORTED_CMD=$(condor_config_val libexec)/get_orted_cmd.sh
 51 | # Or set a custom path (e.g. the local directory if transferring the scripts)
 52 | #ORTED_LAUNCHER=./orted_launcher.sh
 53 | #GET_ORTED_CMD=./get_orted_cmd.sh
 54 | 
 55 | # $MPDIR points to the location of the OpenMPI install
 56 | # The pool admin may set it via OPENMPI_INSTALL_PATH in the condor_config (recommended)
 57 | 
 58 | 
 59 | MPDIR=/usr
 60 | #MPDIR=$(condor_config_val OPENMPI_INSTALL_PATH)
 61 | 
 62 | 
 63 | 
 64 | 
 65 | 
 66 | # Or set it manually
 67 | #MPDIR=/usr/lib64/openmpi
 68 | 
 69 | # $EXINT is a comma-delimited list of excluded network interfaces.
 70 | # If your mpi jobs are hanging, OpenMPI may be trying to use too many
 71 | # network interfaces to communicate between nodes.
 72 | # The pool admin may set it via OPENMPI_EXCLUDE_NETWORK_INTERFACES in the condor_config (recommended)
 73 | EXINT=$(condor_config_val OPENMPI_EXCLUDE_NETWORK_INTERFACES)
 74 | # Or set it manually
 75 | #EXINT="docker0,virbr0"
 76 | ##
 77 | 
 78 | ## configuration check
 79 | # We recommend that your pool admin use MOUNT_UNDER_SCRATCH = /tmp
 80 | # so that OpenMPI caches all data under the user's scratch directory.
 81 | # Not having /tmp mounted under scratch may hang mpi jobs.
 82 | _USE_SCRATCH=$(condor_config_val MOUNT_UNDER_SCRATCH)
 83 | if [ -z $_USE_SCRATCH ]; then
 84 |     >&2 echo "WARNING: MOUNT_UNDER_SCRATCH not set in condor_config"
 85 | elif test "${_USE_SCRATCH#*/tmp}" == "$_USE_SCRATCH"; then
 86 |     >&2 echo "WARNING: /tmp not included in MOUNT_UNDER_SCRATCH"
 87 | fi
 88 | 
 89 | # If MPDIR is not set, then use a default value
 90 | if [ -z $MPDIR ]; then
 91 |     >&2 echo "WARNING: Using default value for \$MPDIR in openmpiscript"
 92 |     MPDIR=/usr/lib64/openmpi
 93 | fi
 94 | PATH=$MPDIR/bin:.:$PATH
 95 | export PATH
 96 | 
 97 | # If EXINT is not set, then use some default values
 98 | if [ -z $EXINT ]; then
 99 |     >&2 echo "WARNING: Using default values for \$EXINT in openmpiscript"
100 |     EXINT="docker0,virbr0"
101 | fi
102 | ##
103 | 
## cleanup function
_orted_launcher_pid=0
_mpirun_pid=0
CONDOR_CHIRP=$(condor_config_val libexec)/condor_chirp

# SIGTERM handler.
# Forwards the signal to the orted launcher if one was started. On node 0,
# once mpirun has been started, it also removes the hostfile, sends SIGTERM to
# mpirun, waits up to 30 seconds for it to exit and then falls back to SIGKILL.
# Always ends the script with exit status 1.
force_cleanup() {
    # Forward SIGTERM to the orted launcher
    if [ "$_orted_launcher_pid" -ne 0 ]; then
        kill -s SIGTERM "$_orted_launcher_pid"
    fi

    # Cleanup mpirun
    if [ "$_CONDOR_PROCNO" -eq 0 ] && [ "$_mpirun_pid" -ne 0 ]; then
        "$CONDOR_CHIRP" ulog "Node $_CONDOR_PROCNO caught SIGTERM, cleaning up mpirun"
        rm -f "$HOSTFILE"

        # Send SIGTERM to mpirun (the orted launcher was signalled above)
        kill -s SIGTERM "$_mpirun_pid"

        # Give mpirun 30 seconds to terminate nicely
        _mpirun_killed=0
        for _ in {1..30}; do
            kill -0 "$_mpirun_pid" 2> /dev/null # returns 0 if running
            _mpirun_killed=$?
            if [ "$_mpirun_killed" -ne 0 ]; then
                break
            fi
            sleep 1
        done

        # If mpirun is still running, send SIGKILL
        if [ "$_mpirun_killed" -eq 0 ]; then
            "$CONDOR_CHIRP" ulog "mpirun hung on Node ${_CONDOR_PROCNO}, sending SIGKILL!"
            kill -s SIGKILL "$_mpirun_pid"
        fi

    fi
    exit 1
}
trap force_cleanup SIGTERM
##
143 | 
## execute node setup
export PATH=$MPDIR/bin:$PATH

# Run the orted launcher (gets orted command from condor_chirp)
"$ORTED_LAUNCHER" &
_orted_launcher_pid=$!
if [ "$_CONDOR_PROCNO" -ne 0 ]; then
    # If not on node 0, wait for orted
    wait $_orted_launcher_pid
    exit $?
fi
##

## head node (node 0) setup
# Build the hostfile
# Pick a name that does not exist yet in the scratch directory by prefixing x's
HOSTFILE=hosts
while [ -f "$_CONDOR_SCRATCH_DIR/$HOSTFILE" ]; do
    HOSTFILE=x$HOSTFILE
done
HOSTFILE=$_CONDOR_SCRATCH_DIR/$HOSTFILE
REQUEST_CPUS=$(condor_q -jobads "$_CONDOR_JOB_AD" -af RequestCpus)

# One hostfile line per node number 0 .. _CONDOR_NPROCS-1
for node in $(seq 0 $(( _CONDOR_NPROCS - 1 ))); do
    if $USE_OPENMP; then
        # OpenMP will do the threading on the execute node
        echo "$node slots=1" >> "$HOSTFILE"
    else
        # OpenMPI will do the threading on the execute node
        echo "$node slots=$REQUEST_CPUS" >> "$HOSTFILE"
    fi
done

# Make sure the executable is executable
# The first argument is the MPI program, the remaining ones are its arguments
EXECUTABLE=$1
shift
chmod +x "$EXECUTABLE"
##

## run mpirun
# Set MCA values for running on HTCondor
export OMPI_MCA_plm_rsh_agent=$GET_ORTED_CMD     # use the helper script instead of ssh
export OMPI_MCA_plm_rsh_no_tree_spawn=1          # disable ssh tree spawn
export OMPI_MCA_orte_hetero_nodes=1              # do not assume same hardware on each node
export OMPI_MCA_orte_startup_timeout=120         # allow two minutes before failing
export OMPI_MCA_hwloc_base_binding_policy="none" # do not bind to cpu cores
export OMPI_MCA_btl_tcp_if_exclude="lo,$EXINT"   # exclude unused tcp network interfaces

# Optional MCA values to set for firewalled setups
#export OMPI_MCA_btl_tcp_port_min_v4=1024    # lowest port number that can be used
#export OMPI_MCA_btl_tcp_port_range_v4=64511 # range of ports above lowest that can be used

# Optionally set MCA values for increasing mpirun verbosity per component
# (see ompi_info for more components)
#export OMPI_MCA_plm_base_verbose=30
#export OMPI_MCA_orte_base_verbose=30
#export OMPI_MCA_hwloc_base_verbose=30
#export OMPI_MCA_btl_base_verbose=30

# Run mpirun in the background and wait for it to exit
# "$@" hands every job argument over unchanged, including ones with whitespace
mpirun -v --prefix "$MPDIR" -hostfile "$HOSTFILE" "$EXECUTABLE" "$@" &
_mpirun_pid=$!
wait $_mpirun_pid
_mpirun_exit=$?

## clean up
# Wait for orted to finish
wait $_orted_launcher_pid
rm "$HOSTFILE"
exit $_mpirun_exit
213 | 


--------------------------------------------------------------------------------