├── ASSIGNMENTS
├── exercise1
│ ├── algs.png
│ ├── exercise1.md
│ └── naive_model.png
└── exercise2
│ ├── exercise2.md
│ ├── exercise2.v1.1.pdf
│ ├── exercise2.v1.pdf
│ ├── quicksort.c
│ └── read_write_pgm_image.c
├── CODE_OPTIMIZATION
├── 00--optimization--preliminaries_and_compiler_usage.pdf
├── 01--Modern_architecture.pdf
├── 02--optimization--cache.pdf
├── 03--optimization--branches.pdf
├── 05--optimization--loops-and-prefetching.pdf
├── Readme.md
├── examples_on_branching
│ ├── if_forest_inside_loop
│ │ └── loop.c
│ ├── sort_2_arrays
│ │ ├── branchpred2.c
│ │ ├── compile
│ │ └── mypapi.h
│ └── unpredictable_datastream
│ │ ├── amonra.gen10
│ │ ├── branchpred.besmart.s
│ │ ├── branchpred.besmart2.c
│ │ ├── branchpred.besmart2.s
│ │ ├── branchpred.s
│ │ ├── branchpred.stat
│ │ ├── out.2
│ │ └── out.v
│ │ ├── branchpred
│ │ ├── branchpred.c
│ │ ├── branchpred.c~
│ │ ├── branchpred.smart
│ │ └── branchpred.smart2
├── examples_on_cache
│ ├── hot_and_cold_fields
│ │ ├── hotcold_a.v0.c
│ │ ├── hotcold_a.v1.c
│ │ ├── hotcold_b.v0.c
│ │ ├── hotcold_b.v1.c
│ │ ├── hotcold_c.v0.c
│ │ └── hotcold_c.v1.c
│ ├── matrix_transpose
│ │ ├── transpose
│ │ │ ├── matrix_transpose.c
│ │ │ ├── matrix_transpose_swapped.c
│ │ │ ├── matrix_transpose_swapped_unroll.c
│ │ │ └── matrix_transpose_unroll.c
│ │ └── transpose_by_blocks
│ │ │ ├── matrix_transpose_blocks.v0.c
│ │ │ ├── matrix_transpose_blocks.v1.c
│ │ │ ├── matrix_transpose_blocks.v2.c
│ │ │ ├── matrix_transpose_blocks.v3.c
│ │ │ └── mypapi.h
│ └── memory_mountain
│ │ ├── Makefile
│ │ ├── README
│ │ ├── clock.c
│ │ ├── clock.h
│ │ ├── fcyc2.c
│ │ ├── fcyc2.h
│ │ ├── mountain.c
│ │ ├── mountain.gcc
│ │ ├── plotmountain.gp
│ │ └── v2
│ │ ├── Makefile
│ │ ├── fcyc2.c
│ │ ├── fcyc2.h
│ │ ├── mountain.c
│ │ └── mountain.gcc
└── examples_on_pipelines
│ ├── combine_2_arrays
│ ├── compile
│ ├── mypapi.h
│ ├── pipeline.c
│ ├── run
│ ├── v0.c
│ ├── v1.c
│ ├── v1b.c
│ ├── v2.c
│ ├── v3.c
│ ├── v3b.c
│ ├── v4.c
│ └── vector.c
│ ├── matrix_multiplication
│ ├── matmul.c
│ ├── matmul_simple.c
│ ├── mypapi.h
│ ├── plot.gp
│ └── run
│ ├── polynomial_evaluation
│ ├── Makefile
│ ├── benchmark.c
│ ├── poly.c
│ ├── poly.h
│ ├── readme.md
│ ├── statistics
│ │ ├── cpe.c
│ │ ├── cpe.h
│ │ ├── fcyc.c
│ │ ├── fcyc.h
│ │ ├── lsquare.c
│ │ └── lsquare.h
│ └── timing
│ │ ├── clock.c
│ │ └── clock.h
│ └── reduction
│ ├── mypapi.h
│ ├── plot.gp
│ ├── reduction.c
│ └── reduction.h
├── HPC_TOOLS_and_STORAGE
└── Readme.md
├── Materials
├── A_note_on_Endiansim.pdf
├── Readme.md
├── What_every_computer_scientist_should_know_about_floating-point.pdf
├── arguments.c
└── topics.pdf
├── PARALLELISM
├── Readme.md
├── codes
│ ├── memory.c
│ └── pi.c
├── lecture01-intro-toHPC.pdf
├── lecture02-HPC-hardware.pdf
├── lecture03-HPCsoftware-stack.pdf
├── lecture04-on-parallel-programming.pdf
└── slurm
│ ├── README.md
│ ├── slurm01.job
│ ├── slurm02_A.job
│ ├── slurm02_B.job
│ ├── slurm02_C.job
│ ├── slurm03_A.job
│ ├── slurm03_B.job
│ ├── slurm03_C.job
│ ├── slurm04.job
│ └── slurm05.job
├── PARALLEL_PROGRAMMING
├── MPI
│ ├── Readme.md
│ ├── basic-mpi-codes
│ │ ├── Brecv.c
│ │ ├── CBlockSends.c
│ │ ├── clean.sh
│ │ ├── compile_openMPI_gnu.sh
│ │ ├── compile_openMPI_intel.sh
│ │ ├── deadlock.c
│ │ ├── linear-array.c
│ │ ├── mpi_env_call.c
│ │ ├── mpi_hello_world.F90
│ │ ├── mpi_hello_world.c
│ │ ├── mpi_hello_world_sync.c
│ │ ├── mpi_pi.c
│ │ ├── mpi_pi.job
│ │ ├── send_message.F90
│ │ ├── send_message.c
│ │ └── sendrecv_message.c
│ ├── collective-mpi
│ │ ├── all2allv3d.c
│ │ ├── allgather.job
│ │ ├── allgather.py
│ │ ├── allgatherv.c
│ │ ├── b_cast.c
│ │ ├── b_cast.f
│ │ ├── clean.sh
│ │ ├── compile.sh
│ │ ├── gather.c
│ │ ├── gather.f
│ │ ├── mpi_bcastcompare.c
│ │ ├── reduce.c
│ │ ├── reduce.f
│ │ ├── scatter.c
│ │ └── scatter.f
│ ├── compiling-and-running-mpi-programs.md
│ ├── lecture05-MPI-Programming-part-A.pdf
│ ├── lecture05-MPI-Programming-part-B.pdf
│ ├── lecture06-Network-basics-for-MPI-application.pptx
│ └── pi_scalability
│ │ └── scalability.job
└── OpenMP
│ ├── 00--Memory_model.pdf
│ ├── 01--Intro_to_OpenMP.pdf
│ ├── 02--parallel_regions.pdf
│ ├── 03--loops.pdf
│ ├── 04--threads_affinity.pdf
│ ├── examples
│ ├── .#for.c
│ ├── parallel_loops
│ │ ├── 00_array_sum_with_race.c
│ │ ├── 01a_array_sum.c
│ │ ├── 01b_array_sum.c
│ │ ├── 01c_array_sum.c
│ │ ├── 01d_array_sum.c
│ │ ├── 02_falsesharing.c
│ │ ├── 03_falsesharing_fixed.c
│ │ ├── 04_scheduling.c
│ │ ├── 05_first_and_last_private.c
│ │ ├── loop_without_for.c
│ │ ├── pi_openmp.c
│ │ └── pi_openmp.fix.c
│ ├── parallel_regions
│ │ ├── 00_scope_of_variables.c
│ │ ├── 00_stack_and_scope.c
│ │ ├── 01_simple_pr_wrong.c
│ │ ├── 02_simple_pr.c
│ │ ├── 03a_num_of_threads.c
│ │ ├── 03b_num_of_threads.c
│ │ ├── 04_order_of_threads_wrong.c
│ │ ├── 05a_order_of_threads.c
│ │ ├── 05b_order_of_threads.c
│ │ ├── 05c_order_of_threads.c
│ │ ├── 09_clauses__copyin.c
│ │ ├── 09_clauses__copyin__clarify.c
│ │ ├── 09_clauses__copyprivate.c
│ │ ├── 09_clauses__firstprivate.c
│ │ ├── 09_clauses__lastprivate.c
│ │ └── 09_clauses__threadprivate.c
│ └── threads_affinity
│ │ ├── 00_where_I_am.c
│ │ ├── 01_where_I_am_omp.c
│ │ ├── 02_where_I_am_omp.c
│ │ ├── 03_where_I_am_nested.c
│ │ ├── 04_touch_by_one.c
│ │ ├── 05_touch_by_all.c
│ │ └── 06_touch_by_all_threadprivate.c
│ ├── examples_on_stack
│ ├── 00_explore_how_bytes_are_stored.c
│ ├── 01a_understanding_the_stack.c
│ └── 01b_understanding_the_stack.c
│ └── exercises
│ ├── .#lab_exercise.2.c
│ ├── exercises.pdf
│ ├── lab_exercise.2.c
│ ├── lab_exercise.2.v2.c
│ ├── lab_exercise.c
│ ├── my_lab_exercise.2.c
│ ├── my_lab_exercise.2.v2.c
│ ├── prefix_sum.serial.c
│ ├── prefix_sum.serial.h
│ └── write_pgm_image.c
├── README.md
├── intro_to_course.pdf
└── lecture01-intro-toHPC.pdf
/ASSIGNMENTS/exercise1/algs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise1/algs.png
--------------------------------------------------------------------------------
/ASSIGNMENTS/exercise1/naive_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise1/naive_model.png
--------------------------------------------------------------------------------
/ASSIGNMENTS/exercise2/exercise2.v1.1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise2/exercise2.v1.1.pdf
--------------------------------------------------------------------------------
/ASSIGNMENTS/exercise2/exercise2.v1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise2/exercise2.v1.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/00--optimization--preliminaries_and_compiler_usage.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/00--optimization--preliminaries_and_compiler_usage.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/01--Modern_architecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/01--Modern_architecture.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/02--optimization--cache.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/02--optimization--cache.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/03--optimization--branches.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/03--optimization--branches.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/05--optimization--loops-and-prefetching.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/05--optimization--loops-and-prefetching.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/Readme.md:
--------------------------------------------------------------------------------
1 | # Materials on serial code optimization
2 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/sort_2_arrays/compile:
--------------------------------------------------------------------------------
1 | gcc -march=native -I/scratch/Software/include -DUSE_PAPI -o branchpred2 branchpred2.c -lm -L/scratch/Software/lib -lpapi
2 | gcc -march=native -DBESMART -I/scratch/Software/include -DUSE_PAPI -o branchpred2_smart branchpred2.c -lm -L/scratch/Software/lib -lpapi
3 | gcc -march=native -DBESMART2 -I/scratch/Software/include -DUSE_PAPI -o branchpred2_smart2 branchpred2.c -lm -L/scratch/Software/lib -lpapi
4 | gcc -march=native -DBESMART3 -I/scratch/Software/include -DUSE_PAPI -o branchpred2_smart3 branchpred2.c -lm -L/scratch/Software/lib -lpapi
5 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/amonra.gen10/branchpred.besmart2.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of the exercises for the Lectures on
3 | * "Foundations of High Performance Computing"
4 | * given at
5 | * Master in HPC and
6 | * Master in Data Science and Scientific Computing
7 | * @ SISSA, ICTP and University of Trieste
8 | *
9 | * contact: luca.tornatore@inaf.it
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 |
26 |
27 | #include <stdio.h>
28 | #include <stdlib.h>
29 | #include <string.h>
30 | #include <time.h>   /* NOTE(review): header names reconstructed from calls used (printf, calloc, atoi, rand, clock_gettime) — the originals were stripped with their angle brackets during extraction; confirm against upstream repo */
31 |
32 |
33 | #define SIZE_DEFAULT 1000000
34 | #define TOP (2 << 20)
35 | #define PIVOT (TOP >> 2)
36 |
37 |
38 | #define TCPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \
39 | (double)ts.tv_nsec * 1e-9)
40 |
41 |
42 |
43 | int main(int argc, char **argv)
44 | {
45 | int SIZE;
46 | int *data;
47 | int cc, ii;
48 |
49 | long long sum = 0;
50 |
51 | struct timespec ts;
52 | double tstart, tstop;
53 |
54 | if(argc > 1)
55 | SIZE = atoi( *(argv+1) );
56 | else
57 | SIZE = SIZE_DEFAULT;
58 |
59 | // Generate data
60 | data = (int*)calloc(SIZE, sizeof(int));
61 | srand((int)(SIZE));
62 |
63 | for (cc = 0; cc < SIZE; cc++)
64 | data[cc] = rand() % TOP;
65 |
66 |
67 | tstart = TCPU_TIME;
68 |
69 | for (cc = 0; cc < 1000; cc++)
70 | {
71 | sum = 0;
72 | long long _sum_[4] = {0};
73 | for (ii = 0; ii < SIZE; ii+=4)
74 | {
75 | _sum_[0] += (data[ii]>PIVOT? data[ii] : 0);
76 | _sum_[1] += (data[ii+1]>PIVOT? data[ii+1] : 0);
77 | _sum_[2] += (data[ii+2]>PIVOT? data[ii+2] : 0);
78 | _sum_[3] += (data[ii+3]>PIVOT? data[ii+3] : 0);
79 | }
80 | sum += (_sum_[0] + _sum_[1]) + (_sum_[2] + _sum_[3]);
81 | }
82 |
83 | tstop = TCPU_TIME;
84 |
85 | #ifdef WOW
86 | tot_tstop = TCPU_TIME;
87 | #endif
88 |
89 | free(data);
90 |
91 | #if !defined(WOW)
92 | printf("\nsum is %llu, elapsed seconds: %g\n", sum, tstop - tstart);
93 |
94 | #else
95 | double tot_time = tot_tstop - tot_tstart;
96 | double loop_time = tstop - tstart;
97 | printf("\nsum is %llu, elapsed seconds: %g, %g in loop and %g in qsort\n",
98 | sum, tot_time, loop_time, tot_time - loop_time);
99 | #endif
100 |
101 | printf("\n");
102 | return 0;
103 | }
104 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/amonra.gen10/out.2:
--------------------------------------------------------------------------------
1 |
2 | 3.4191
3 |
4 |
5 | 3.57167
6 |
7 |
8 | 3.44099
9 |
10 |
11 | 4.17072
12 |
13 |
14 | 4.20686
15 |
16 |
17 | 3.64886
18 |
19 |
20 | 3.39921
21 |
22 |
23 | 4.78118
24 |
25 |
26 | 3.54926
27 |
28 |
29 | 3.52104
30 |
31 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/amonra.gen10/out.v:
--------------------------------------------------------------------------------
1 |
2 | sum is 9831544284110, elapsed seconds: 3.45677
3 |
4 |
5 | sum is 9831544284110, elapsed seconds: 3.80376
6 |
7 |
8 | sum is 9831544284110, elapsed seconds: 4.81135
9 |
10 |
11 | sum is 9831544284110, elapsed seconds: 3.60161
12 |
13 |
14 | sum is 9831544284110, elapsed seconds: 3.65025
15 |
16 |
17 | sum is 9831544284110, elapsed seconds: 3.68967
18 |
19 |
20 | sum is 9831544284110, elapsed seconds: 3.63842
21 |
22 |
23 | sum is 9831544284110, elapsed seconds: 3.63771
24 |
25 |
26 | sum is 9831544284110, elapsed seconds: 3.6503
27 |
28 |
29 | sum is 9831544284110, elapsed seconds: 3.64676
30 |
31 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of the exercises for the Lectures on
3 | * "Foundations of High Performance Computing"
4 | * given at
5 | * Master in HPC and
6 | * Master in Data Science and Scientific Computing
7 | * @ SISSA, ICTP and University of Trieste
8 | *
9 | * contact: luca.tornatore@inaf.it
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 |
26 |
27 | #include <stdio.h>
28 | #include <stdlib.h>
29 | #include <string.h>
30 | #include <time.h>   /* NOTE(review): header names reconstructed from calls used (printf, calloc, atoi, rand, qsort, clock_gettime) — the originals were stripped with their angle brackets during extraction; confirm against upstream repo */
31 |
32 |
33 | #define SIZE_DEFAULT 1000000
34 | #define TOP (2 << 20)
35 | #define PIVOT (TOP >> 2)
36 |
37 |
38 | #define TCPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \
39 | (double)ts.tv_nsec * 1e-9)
40 |
41 |
42 | #ifdef WOW
43 | int compare(const void *A, const void *B)
44 | {
45 | return *(int*)A - *(int*)B;
46 | }
47 | #endif
48 |
49 | int main(int argc, char **argv)
50 | {
51 | int SIZE;
52 | int *data;
53 | int cc, ii;
54 |
55 | #ifdef WOW
56 | double tot_tstart, tot_tstop;
57 | #endif
58 |
59 | long long sum = 0;
60 |
61 | struct timespec ts;
62 | double tstart, tstop;
63 |
64 | if(argc > 1)
65 | SIZE = atoi( *(argv+1) );
66 | else
67 | SIZE = SIZE_DEFAULT;
68 |
69 | // Generate data
70 | data = (int*)calloc(SIZE, sizeof(int));
71 | srand((int)(SIZE));
72 |
73 | for (cc = 0; cc < SIZE; cc++)
74 | data[cc] = rand() % TOP;
75 |
76 |
77 |
78 | #ifdef WOW
79 | tot_tstart = TCPU_TIME;
80 | // !!! With this, the next loop runs faster
81 | qsort(data, SIZE, sizeof(int), compare);
82 | #endif
83 |
84 |
85 | tstart = TCPU_TIME;
86 |
87 | for (cc = 0; cc < 1000; cc++)
88 | {
89 | sum = 0;
90 |
91 | for (ii = 0; ii < SIZE; ii++)
92 | {
93 | #if !defined( BESMART ) && !defined( BESMART2 )
94 | if (data[ii] > PIVOT)
95 | sum += data[ii];
96 |
97 | #elif defined( BESMART )
98 | unsigned int t = (data[ii] - PIVOT - 1) >> 31; // the additional -1 is for the case data[ii]==PIVOT
99 | sum += ~t & data[ii];
100 |
101 | #elif defined( BESMART2 )
102 | //sum += (data[ii]>PIVOT)*data[ii];
103 | sum += (data[ii]>PIVOT? data[ii] : 0);
104 | #endif
105 | }
106 | }
107 |
108 | tstop = TCPU_TIME;
109 |
110 | #ifdef WOW
111 | tot_tstop = TCPU_TIME;
112 | #endif
113 |
114 | free(data);
115 |
116 | #if !defined(WOW)
117 | printf("\nsum is %llu, elapsed seconds: %g\n", sum, tstop - tstart);
118 |
119 | #else
120 | double tot_time = tot_tstop - tot_tstart;
121 | double loop_time = tstop - tstart;
122 | printf("\nsum is %llu, elapsed seconds: %g, %g in loop and %g in qsort\n",
123 | sum, tot_time, loop_time, tot_time - loop_time);
124 | #endif
125 |
126 | printf("\n");
127 | return 0;
128 | }
129 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.c~:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of the exercises for the Lectures on
3 | * "Foundations of High Performance Computing"
4 | * given at
5 | * Master in HPC and
6 | * Master in Data Science and Scientific Computing
7 | * @ SISSA, ICTP and University of Trieste
8 | *
9 | * contact: luca.tornatore@inaf.it
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 |
26 |
27 | #include <stdio.h>
28 | #include <stdlib.h>
29 | #include <string.h>
30 | #include <time.h>   /* NOTE(review): header names reconstructed from calls used — the originals were stripped with their angle brackets during extraction; confirm against upstream repo */
31 |
32 |
33 | #define SIZE_DEFAULT 1000000
34 | #define TOP (2 << 20)
35 | #define PIVOT (TOP >> 2)
36 |
37 |
38 | #define TCPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \
39 | (double)ts.tv_nsec * 1e-9)
40 |
41 |
42 | #ifdef WOW
43 | int compare(const void *A, const void *B)
44 | {
45 | return *(int*)A - *(int*)B;
46 | }
47 | #endif
48 |
49 | int main(int argc, char **argv)
50 | {
51 | int SIZE;
52 | int *data;
53 | int cc, ii;
54 |
55 | #ifdef WOW
56 | double tot_tstart, tot_tstop;
57 | #endif
58 |
59 | long long sum = 0;
60 |
61 | struct timespec ts;
62 | double tstart, tstop;
63 |
64 | if(argc > 1)
65 | SIZE = atoi( *(argv+1) );
66 | else
67 | SIZE = SIZE_DEFAULT;
68 |
69 | // Generate data
70 | data = (int*)calloc(SIZE, sizeof(int));
71 | srand((int)(SIZE));
72 |
73 | for (cc = 0; cc < SIZE; cc++)
74 | data[cc] = rand() % TOP;
75 |
76 |
77 |
78 | #ifdef WOW
79 | tot_tstart = TCPU_TIME;
80 | // !!! With this, the next loop runs faster
81 | qsort(data, SIZE, sizeof(int), compare);
82 | #endif
83 |
84 |
85 | tstart = TCPU_TIME;
86 |
87 | for (cc = 0; cc < 1000; cc++)
88 | {
89 | sum = 0;
90 |
91 | for (ii = 0; ii < SIZE; ii++)
92 | {
93 | #if !defined( BESMART ) && !defined( BESMART2 )
94 | if (data[ii] > PIVOT)
95 | sum += data[ii];
96 |
97 | #elif defined( BESMART )
98 | unsigned int t = (data[ii] - PIVOT - 1) >> 31; // the additional -1 is for the case data[ii]==PIVOT
99 | sum += ~t & data[ii];
100 |
101 | #elif defined( BESMART2 )
102 | sum += (data[ii]>PIVOT)*data[ii];
103 | #endif
104 | }
105 | }
106 |
107 | tstop = TCPU_TIME;
108 |
109 | #ifdef WOW
110 | tot_tstop = TCPU_TIME;
111 | #endif
112 |
113 | free(data);
114 |
115 | #if !defined(WOW)
116 | printf("\nsum is %llu, elapsed seconds: %g\n", sum, tstop - tstart);
117 |
118 | #else
119 | double tot_time = tot_tstop - tot_tstart;
120 | double loop_time = tstop - tstart;
121 | printf("\nsum is %llu, elapsed seconds: %g, %g in loop and %g in qsort\n",
122 | sum, tot_time, loop_time, tot_time - loop_time);
123 | #endif
124 |
125 | printf("\n");
126 | return 0;
127 | }
128 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart2
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_a.v0.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // POSIX.1-2008/XSI feature-test macro: exposes srand48/drand48 and clock_gettime (it does not select the C11 standard)
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <time.h>
29 | #include <math.h>   /* NOTE(review): header names reconstructed (printf, calloc, srand48/drand48, memset, clock_gettime) — originals stripped with their angle brackets during extraction; the fifth header is uncertain, confirm against upstream repo */
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 | typedef struct node_t {
40 | double key;
41 | char data[DATASIZE];
42 | struct node_t *next;
43 | } node;
44 |
45 |
46 |
47 |
48 | #define N_default 10000
49 |
50 | int main( int argc, char **argv )
51 | {
52 | struct timespec ts;
53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID;
54 |
55 | // -------------------------------------
56 | // startup
57 |
58 | int N = N_default;
59 |
60 | if ( argc > 1 )
61 | N = atoi( *(argv+1) );
62 |
63 |
64 | // -------------------------------------
65 | // setup
66 |
67 | double *keys = (double*)calloc( N, sizeof(double));
68 | node *last = NULL;
69 | node *first = NULL;
70 |
71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
72 | srand48( time(NULL) );
73 |
74 | for( int nn = 0; nn < N; nn++ )
75 | {
76 | node *new = (node*)calloc( 1, sizeof(node) );
77 | if ( last != NULL )
78 | last->next = new;
79 | else
80 | first = new;
81 | new ->key = drand48();
82 | keys[nn] = new->key;
83 | new ->next = NULL;
84 | memset( new->data, 0, sizeof(char)*DATASIZE);
85 | last = new;
86 | }
87 |
88 |
89 | printf("now let's search for all of them\n"); fflush(stdout);
90 |
91 | int NSHOTS = N;
92 | double sum = 0;
93 |
94 | double tstart = CPU_TIME;
95 |
96 | for( int ii = 0; ii < NSHOTS; ii++ )
97 | {
98 | double key = keys[(int)(drand48() * N)];
99 | node *target = first;
100 |
101 | // this implementation is less efficient than
102 | // that in v1
103 | for ( int nn = 0; nn < N; nn++ )
104 | if ( target->key == key )
105 | sum += target->key;
106 | else
107 | target = target->next;
108 | }
109 |
110 | double et = CPU_TIME - tstart;
111 |
112 | printf("timing for %d shots: %g\n", NSHOTS, et );
113 |
114 | node *target = first;
115 | while( target->next != NULL )
116 | {
117 | node *tmp = target->next;
118 | free(target);
119 | target = tmp;
120 | }
121 |
122 | return 0;
123 | }
124 |
125 |
126 |
127 |
128 |
129 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_a.v1.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // POSIX.1-2008/XSI feature-test macro: exposes srand48/drand48 and clock_gettime (it does not select the C11 standard)
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <time.h>
29 | #include <math.h>   /* NOTE(review): header names reconstructed (printf, calloc, srand48/drand48, memset, clock_gettime) — originals stripped with their angle brackets during extraction; the fifth header is uncertain, confirm against upstream repo */
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 | typedef struct node_t {
40 | double key;
41 | char data[DATASIZE];
42 | struct node_t *next;
43 | } node;
44 |
45 |
46 |
47 |
48 | #define N_default 10000
49 |
50 | int main( int argc, char **argv )
51 | {
52 | struct timespec ts;
53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID;
54 |
55 | // -------------------------------------
56 | // startup
57 |
58 | int N = N_default;
59 |
60 | if ( argc > 1 )
61 | N = atoi( *(argv+1) );
62 |
63 |
64 | // -------------------------------------
65 | // setup
66 |
67 | double *keys = (double*)calloc( N, sizeof(double));
68 | node *last = NULL;
69 | node *first = NULL;
70 |
71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
72 | srand48( time(NULL) );
73 |
74 | for( int nn = 0; nn < N; nn++ )
75 | {
76 | node *new = (node*)calloc( 1, sizeof(node) );
77 | if ( last != NULL )
78 | last->next = new;
79 | else
80 | first = new;
81 | new ->key = drand48();
82 | keys[nn] = new->key;
83 | new ->next = NULL;
84 | memset( new->data, 0, sizeof(char)*DATASIZE);
85 | last = new;
86 | }
87 |
88 |
89 | printf("now let's search for all of them\n"); fflush(stdout);
90 |
91 | int NSHOTS = N;
92 | double sum = 0;
93 |
94 | double tstart = CPU_TIME;
95 |
96 | for( int ii = 0; ii < NSHOTS; ii++ )
97 | {
98 | double key = keys[(int)(drand48() * N)];
99 | node *target = first;
100 |
101 | while ( target->key != key )
102 | target = target->next;
103 | sum += target->key;
104 | }
105 |
106 | double et = CPU_TIME - tstart;
107 |
108 | printf("timing for %d shots: %g\n", NSHOTS, et );
109 |
110 | node *target = first;
111 | while( target->next != NULL )
112 | {
113 | node *tmp = target->next;
114 | free(target);
115 | target = tmp;
116 | }
117 |
118 | return 0;
119 | }
120 |
121 |
122 |
123 |
124 |
125 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_b.v0.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // POSIX.1-2008/XSI feature-test macro: exposes srand48/drand48 and clock_gettime (it does not select the C11 standard)
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <time.h>
29 | #include <math.h>   /* NOTE(review): header names reconstructed (printf, calloc, srand48/drand48, memset, clock_gettime) — originals stripped with their angle brackets during extraction; the fifth header is uncertain, confirm against upstream repo */
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 | typedef struct node_t {
40 | double key;
41 | struct node_t *next;
42 | char data[DATASIZE];
43 | } node;
44 |
45 |
46 |
47 |
48 | #define N_default 10000
49 |
50 | int main( int argc, char **argv )
51 | {
52 | struct timespec ts;
53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; // per-process CPU-time clock, read by CPU_TIME
54 |
55 | // -------------------------------------
56 | // startup
57 |
58 | int N = N_default;
59 |
60 | if ( argc > 1 )
61 | N = atoi( *(argv+1) ); // NOTE(review): atoi gives 0 on junk input; N <= 0 is not rejected
62 |
63 |
64 | // -------------------------------------
65 | // setup : build a singly-linked list of N nodes with random keys
66 |
67 | double *keys = (double*)calloc( N, sizeof(double));
68 | node *last = NULL;
69 | node *first = NULL;
70 |
71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
72 | srand48( time(NULL) );
73 |
74 | for( int nn = 0; nn < N; nn++ )
75 | {
76 | node *new = (node*)calloc( 1, sizeof(node) );
77 | if ( last != NULL )
78 | last->next = new;
79 | else
80 | first = new;
81 | new ->key = drand48();
82 | keys[nn] = new->key; // remember the key so the search below always succeeds
83 | new ->next = NULL;
84 | memset( new->data, 0, sizeof(char)*DATASIZE); // touch the cold payload once
85 | last = new;
86 | }
87 |
88 |
89 | printf("now let's search for all of them\n"); fflush(stdout);
90 |
91 | int NSHOTS = N;
92 | double sum = 0;
93 |
94 | double tstart = CPU_TIME;
95 |
96 | for( int ii = 0; ii < NSHOTS; ii++ )
97 | {
98 | double key = keys[(int)(drand48() * N)]; // pick a key that is guaranteed to exist
99 | node *target = first;
100 |
101 | // this implementation is less efficient than
102 | // that in v1: it always walks all N nodes
103 | for ( int nn = 0; nn < N; nn++ )
104 | if ( target->key == key )
105 | sum += target->key;
106 | else
107 | target = target->next;
108 | }
109 |
110 | double et = CPU_TIME - tstart;
111 |
112 | printf("timing for %d shots: %g\n", NSHOTS, et );
113 |
114 | node *target = first; // teardown: free every node (the old loop leaked the last one and crashed on an empty list)
115 | while( target != NULL )
116 | {
117 | node *tmp = target->next;
118 | free(target);
119 | target = tmp;
120 | }
121 | free(keys); // was leaked before
122 | return 0;
123 | }
124 |
125 |
126 |
127 |
128 |
129 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_b.v1.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <math.h>
29 | #include <time.h>
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 | typedef struct node_t {
40 | double key; // hot field: read on every search step
41 | struct node_t *next; // hot field: traversal pointer
42 | char data[DATASIZE]; // cold payload embedded in the node (DATASIZE=200 by default, so each node spans several cache lines)
43 | } node;
44 |
45 |
46 |
47 |
48 | #define N_default 10000
49 |
50 | int main( int argc, char **argv )
51 | {
52 | struct timespec ts;
53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; // per-process CPU-time clock, read by CPU_TIME
54 |
55 | // -------------------------------------
56 | // startup
57 |
58 | int N = N_default;
59 |
60 | if ( argc > 1 )
61 | N = atoi( *(argv+1) ); // NOTE(review): atoi gives 0 on junk input; N <= 0 is not rejected
62 |
63 |
64 | // -------------------------------------
65 | // setup : build a singly-linked list of N nodes with random keys
66 |
67 | double *keys = (double*)calloc( N, sizeof(double));
68 | node *last = NULL;
69 | node *first = NULL;
70 |
71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
72 | srand48( time(NULL) );
73 |
74 | for( int nn = 0; nn < N; nn++ )
75 | {
76 | node *new = (node*)calloc( 1, sizeof(node) );
77 | if ( last != NULL )
78 | last->next = new;
79 | else
80 | first = new;
81 | new ->key = drand48();
82 | keys[nn] = new->key; // remember the key so the search below always succeeds
83 | new ->next = NULL;
84 | memset( new->data, 0, sizeof(char)*DATASIZE); // touch the cold payload once
85 | last = new;
86 | }
87 |
88 |
89 | printf("now let's search for all of them\n"); fflush(stdout);
90 |
91 | int NSHOTS = N;
92 | double sum = 0;
93 |
94 | double tstart = CPU_TIME;
95 |
96 | for( int ii = 0; ii < NSHOTS; ii++ )
97 | {
98 | double key = keys[(int)(drand48() * N)]; // pick a key that is guaranteed to exist
99 | node *target = first;
100 |
101 | while ( target->key != key ) // stops at the first match; v0 always walks all N nodes
102 | target = target->next;
103 | sum += target->key;
104 | }
105 |
106 | double et = CPU_TIME - tstart;
107 |
108 | printf("timing for %d shots: %g\n", NSHOTS, et );
109 |
110 | node *target = first; // teardown: free every node (the old loop leaked the last one and crashed on an empty list)
111 | while( target != NULL )
112 | {
113 | node *tmp = target->next;
114 | free(target);
115 | target = tmp;
116 | }
117 | free(keys); // was leaked before
118 | return 0;
119 | }
120 |
121 |
122 |
123 |
124 |
125 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_c.v0.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <math.h>
29 | #include <time.h>
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 |
40 | typedef struct node_t {
41 | double key; // hot field: read on every search step
42 | struct node_t *next; // hot field: traversal pointer
43 | void *data; // cold payload moved out-of-line (points into a shared arena); node stays small
44 | } node;
45 |
46 |
47 |
48 |
49 | #define N_default 10000
50 |
51 | int main( int argc, char **argv )
52 | {
53 | struct timespec ts;
54 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; // per-process CPU-time clock, read by CPU_TIME
55 |
56 | // -------------------------------------
57 | // startup
58 |
59 | int N = N_default;
60 |
61 | if ( argc > 1 )
62 | N = atoi( *(argv+1) ); // NOTE(review): atoi gives 0 on junk input; N <= 0 is not rejected
63 |
64 |
65 | // -------------------------------------
66 | // setup : nodes keep only hot fields; payloads live in one separate arena
67 |
68 | double *keys = (double*)calloc( N, sizeof(double));
69 | char *alldata = (char*)calloc( DATASIZE*N, sizeof(char)); // arena for the cold payloads
70 | node *last = NULL;
71 | node *first = NULL;
72 |
73 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
74 | srand48( time(NULL) );
75 |
76 | for( int nn = 0; nn < N; nn++ )
77 | {
78 | node *new = (node*)calloc( 1, sizeof(node) );
79 | if ( last != NULL )
80 | last->next = new;
81 | else
82 | first = new;
83 | new ->key = drand48();
84 | keys[nn] = new->key; // remember the key so the search below always succeeds
85 | new ->next = NULL;
86 | new ->data = alldata + DATASIZE*nn; // node points into the arena instead of embedding data
87 | memset( new->data, 0, sizeof(char)*DATASIZE);
88 | last = new;
89 | }
90 |
91 |
92 | printf("now let's search for all of them\n"); fflush(stdout);
93 |
94 | int NSHOTS = N;
95 | double sum = 0;
96 |
97 | double tstart = CPU_TIME;
98 |
99 | for( int ii = 0; ii < NSHOTS; ii++ )
100 | {
101 | double key = keys[(int)(drand48() * N)]; // pick a key that is guaranteed to exist
102 | node *target = first;
103 |
104 | // this implementation is less efficient than
105 | // that in v1
106 | for ( int nn = 0; nn < N; nn++ )
107 | if ( target->key == key )
108 | sum += target->key;
109 | else
110 | target = target->next;
111 | }
112 |
113 | double et = CPU_TIME - tstart;
114 |
115 | printf("timing for %d shots: %g\n", NSHOTS, et );
116 |
117 | node *target = first; // teardown: free every node (the old loop leaked the last one and crashed on an empty list)
118 | while( target != NULL )
119 | {
120 | node *tmp = target->next;
121 | free(target);
122 | target = tmp;
123 | }
124 | free(keys); free(alldata); // both were leaked before
125 | return 0;
126 | }
127 |
128 |
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_c.v1.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <math.h>
29 | #include <time.h>
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 |
40 | typedef struct node_t {
41 | double key; // hot field: read on every search step
42 | struct node_t *next; // hot field: traversal pointer
43 | void *data; // cold payload moved out-of-line (points into a shared arena); node stays small
44 | } node;
45 |
46 |
47 |
48 |
49 | #define N_default 10000
50 |
51 | int main( int argc, char **argv )
52 | {
53 | struct timespec ts;
54 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; // per-process CPU-time clock, read by CPU_TIME
55 |
56 | // -------------------------------------
57 | // startup
58 |
59 | int N = N_default;
60 |
61 | if ( argc > 1 )
62 | N = atoi( *(argv+1) ); // NOTE(review): atoi gives 0 on junk input; N <= 0 is not rejected
63 |
64 |
65 | // -------------------------------------
66 | // setup : nodes keep only hot fields; payloads live in one separate arena
67 |
68 | double *keys = (double*)calloc( N, sizeof(double));
69 | char *alldata = (char*)calloc( DATASIZE*N, sizeof(char)); // arena for the cold payloads
70 | node *last = NULL;
71 | node *first = NULL;
72 |
73 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
74 | srand48( time(NULL) );
75 |
76 | for( int nn = 0; nn < N; nn++ )
77 | {
78 | node *new = (node*)calloc( 1, sizeof(node) );
79 | if ( last != NULL )
80 | last->next = new;
81 | else
82 | first = new;
83 | new ->key = drand48();
84 | keys[nn] = new->key; // remember the key so the search below always succeeds
85 | new ->next = NULL;
86 | new ->data = alldata + DATASIZE*nn; // node points into the arena instead of embedding data
87 | memset( new->data, 0, sizeof(char)*DATASIZE);
88 | last = new;
89 | }
90 |
91 |
92 | printf("now let's search for all of them\n"); fflush(stdout);
93 |
94 | int NSHOTS = N;
95 | double sum = 0;
96 |
97 | double tstart = CPU_TIME;
98 |
99 | for( int ii = 0; ii < NSHOTS; ii++ )
100 | {
101 | double key = keys[(int)(drand48() * N)]; // pick a key that is guaranteed to exist
102 | node *target = first;
103 |
104 | while ( target->key != key ) // stops at the first match; v0 always walks all N nodes
105 | target = target->next;
106 | sum += target->key;
107 | }
108 |
109 | double et = CPU_TIME - tstart;
110 |
111 | printf("timing for %d shots: %g\n", NSHOTS, et );
112 |
113 | node *target = first; // teardown: free every node (the old loop leaked the last one and crashed on an empty list)
114 | while( target != NULL )
115 | {
116 | node *tmp = target->next;
117 | free(target);
118 | target = tmp;
119 | }
120 | free(keys); free(alldata); // both were leaked before
121 | return 0;
122 | }
123 |
124 |
125 |
126 |
127 |
128 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/matrix_transpose/transpose_by_blocks/matrix_transpose_blocks.v3.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_cache/matrix_transpose/transpose_by_blocks/matrix_transpose_blocks.v3.c
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/matrix_transpose/transpose_by_blocks/mypapi.h:
--------------------------------------------------------------------------------
1 |
2 |
3 | #if defined(USE_PAPI) // -----------------------------------------------------------
4 | #include <papi.h>
5 |
6 | typedef unsigned long long int uLint;
7 |
8 | #define PAPI_EVENTS_NUM 4
9 | int papi_events[PAPI_EVENTS_NUM] = {PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_DCM, PAPI_L2_DCM };
10 | int papi_EventSet = PAPI_NULL; // the handle for the events' set
11 | uLint papi_buffer[PAPI_EVENTS_NUM] = {0}; // storage for the counters' values
12 | uLint papi_values[PAPI_EVENTS_NUM] = {0}; // accumulate the counters' values
13 |
14 | // check that PAPI is OK, exit if not
15 | #define PAPI_CHECK( R ) { \
16 | if ( (R) != PAPI_OK ) { \
17 | printf("a problem with PAPI (code %d) arise at line %d\n", \
18 | (R), __LINE__);fflush(stdout); return (R); }}
19 |
20 |
21 | // check that PAPI is OK,
22 | // issue a warning if not with a
23 | // provided message
24 | #define PAPI_WARN( R, S ) { \
25 | if ( (R) != PAPI_OK ) { \
26 | printf("a problem with PAPI (code %d) arise at line %d: %s\n", \
27 | (R), __LINE__, (S)); fflush(stdout); }}
28 |
29 | // check that PAPI is OK about an event
30 | // issue a warning if not with a
31 | // provided message
32 | #define PAPI_WARN_EVENT( R, E, S1, n ) { \
33 | if ( (R) != PAPI_OK ) { \
34 | printf("a problem with PAPI (code %d) : event %d arise at line %d: %s (%d)\n", \
35 | (R), (E), __LINE__, (S1), (n)); fflush(stdout); }}
36 |
37 |
38 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \
39 | retval = PAPI_query_event(papi_events[i]); \
40 | if ( retval == PAPI_OK ) { \
41 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \
42 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \
43 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } }
44 |
45 | #define PAPI_INIT { \
46 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \
47 | if (retval != PAPI_VER_CURRENT) \
48 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \
49 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \
50 | PAPI_ADD_EVENTS_to_SET; }
51 |
52 | // to use HIGH-LEVEL API
53 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
54 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
55 |
56 | // to use NORMAL API
57 | #define PAPI_START_CNTR { \
58 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); }
59 |
60 | #define PAPI_STOP_CNTR { \
61 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \
62 | if( retval == PAPI_OK ) { \
63 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
64 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); }
65 |
66 | #define PAPI_GET_CNTR( i ) ( papi_values[(i)] )
67 |
68 | #define PAPI_FLUSH_BUFFER { \
69 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
70 | papi_buffer[ jj] = 0; }
71 |
72 | #define PAPI_FLUSH { \
73 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
74 | papi_values[jj] = papi_buffer[ jj] = 0; }
75 |
76 |
77 | #else // -----------------------------------------------------------
78 |
79 | #define PAPI_EVENTS_NUM 0
80 | #define PAPI_INIT
81 | #define PAPI_START_CNTR
82 | #define PAPI_STOP_CNTR
83 | #define PAPI_FLUSH
84 | #define PAPI_GET_CNTR( i ) 0
85 |
86 | #endif // -----------------------------------------------------------
87 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/Makefile:
--------------------------------------------------------------------------------
1 |
2 | COMPILER=gcc
3 |
4 | ifeq ($(COMPILER),gcc)
5 | CC = gcc
6 | CFLAGS = -Wall -O3 -march=native -ftree-vectorize -lm -D__i686__
7 | SUFFIX = .gcc
8 | LIBM = -lm
9 | endif
10 |
11 | ifeq ($(COMPILER),icc)
12 | CC = icc
13 | CFLAGS = -Wall -O3 -fast -axSSE4.2 -xHost -ipo
14 | SUFFIX = .icc
15 | LIBM =
16 | endif
17 |
18 | ifeq ($(COMPILER),pgcc)
19 | CC = pgcc
20 | CFLAGS = -Wall -O4 -fast -Munroll -Mvect=simd,fuse,tile -Mipa -lm
21 | SUFFIX = .pgcc
22 | LIBM = -lm
23 | endif
24 |
25 | mountain: mountain.c fcyc2.c clock.c
26 | $(CC) $(CFLAGS) -o mountain$(SUFFIX) mountain.c fcyc2.c clock.c $(LIBM)
27 |
28 | clean:
29 | rm -f mountain$(SUFFIX) *.o *~
30 |
31 |
32 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/README:
--------------------------------------------------------------------------------
1 | This directory contains code for generating a memory mountain, as
2 | described in Computer Systems: A Programmer's Perspective
3 |
4 | clock.{c,h} - routines for using x86 and Alpha cycle timers
5 | fcyc2.{c,h} - routines that estimate the number of cycles required
6 | by a function f that takes two arguments.
7 | Makefile - memory mountain makefile
8 | mountain.c - program that generates the memory mountain.
9 |
10 | (1) set the compiler at the top of Makefile
11 | (2) invoke make
12 | (3) execute the mountain.$COMPILER
13 | (4) copy the output in a file named mountain.dat
14 | (5) use plotmountain.gp to plot the data using gnuplot
15 | type 'load "plotmountain.gp"' from inside gnuplot
16 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/clock.h:
--------------------------------------------------------------------------------
1 | /* Routines for using cycle counter */
2 |
3 | /* Start the counter */
4 | void start_counter();
5 |
6 | /* Get # cycles since counter started */
7 | double get_counter();
8 |
9 |
10 | /* Measure overhead for counter */
11 | double ovhd();
12 |
13 | /* Determine clock rate of processor */
14 | double mhz(int verbose);
15 |
16 | /* Determine clock rate of processor, having more control over accuracy */
17 | double mhz_full(int verbose, int sleeptime);
18 |
19 | /** Special counters that compensate for timer interrupt overhead */
20 |
21 | void start_comp_counter();
22 |
23 | double get_comp_counter();
24 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/fcyc2.h:
--------------------------------------------------------------------------------
1 | /* Find number of cycles used by function that takes 2 arguments */
2 |
3 | /* Function to be tested takes two integer arguments */
4 | typedef int (*test_funct)(int, int);
5 |
6 | /* Compute time used by function f */
7 | double fcyc2(test_funct f, int param1, int param2, int clear_cache);
8 |
9 | /********* These routines are used to help with the analysis *********/
10 |
11 | /*
12 | Parameters:
13 | k: How many samples must be within epsilon for convergence
14 | epsilon: What is tolerance
15 | maxsamples: How many samples until give up?
16 | */
17 |
18 | /* Full version of fcyc with control over parameters */
19 | double fcyc2_full(test_funct f, int param1, int param2, int clear_cache,
20 | int k, double epsilon, int maxsamples, int compensate);
21 |
22 | /* Get current minimum */
23 | double get_min();
24 |
25 | /* What is convergence status for k minimum measurements within epsilon
26 | Returns 0 if not converged, #samples if converged, and -1 if can't
27 | reach convergence
28 | */
29 |
30 | int has_converged(int k, double epsilon, int maxsamples);
31 |
32 | /* What is error of current measurement */
33 | double err(int k);
34 |
35 | /************* Try other clocking methods *****************/
36 |
37 | /* Full version that uses the time of day clock */
38 | double fcyc2_full_tod(test_funct f, int param1, int param2, int clear_cache,
39 | int k, double epsilon, int maxsamples, int compensate);
40 |
41 | double fcyc2_tod(test_funct f, int param1, int param2, int clear_cache);
42 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/mountain.c:
--------------------------------------------------------------------------------
1 | /* mountain.c - Generate the memory mountain. */
2 | /* $begin mountainmain */
3 | #include <stdio.h>
4 | #include <stdlib.h>
5 | #include <math.h>
6 | #include "fcyc2.h" /* measurement routines */
7 | #include "clock.h" /* routines to access the cycle counter */
8 |
9 | #define MINBYTES (1 << 14) /* First working set size */
10 | #define MAXBYTES (1 << 27) /* Last working set size */
11 | #define MAXSTRIDE 15 /* Stride x8 bytes */
12 | #define MAXELEMS MAXBYTES/sizeof(long)
13 |
14 |
15 | long data[MAXELEMS]; /* The global array we'll be traversing */
16 |
17 |
18 | void init_data(long *data, int n);
19 | int test(int elems, int stride);
20 | double run(int size, int stride, double Mhz);
21 |
22 | /* $begin mountainmain */
23 | int main()
24 | {
25 | int size; /* Working set size (in bytes) */
26 | int stride; /* Stride (in array elements) */
27 | double Mhz; /* Clock frequency */
28 |
29 | init_data(data, MAXELEMS); /* Initialize each element in data */
30 | Mhz = mhz(0); /* Estimate the clock frequency */
31 |
32 |
33 | printf("# Clock frequency is approx. %.1f MHz\n", Mhz);
34 | printf("# Memory mountain (MB/sec)\n");
35 |
36 |
37 | printf("%d\t", MAXSTRIDE); /* header row: column count, then each stride value */
38 | for (stride = 1; stride <= MAXSTRIDE; stride++)
39 | printf("%d\t", stride);
40 |
41 | printf("\n");
42 |
43 | /* begin mountainmain */
44 | for (size = MAXBYTES; size >= MINBYTES; size >>= 1) /* halve the working set each row */
45 | {
46 | int log2size_kb = (int)(log2((double)size / 1024.0)); /* row label: log2 of size in KB */
47 | printf("%d\t", log2size_kb);
48 |
49 | for (stride = 1; stride <= MAXSTRIDE; stride++)
50 | printf("%.0f\t", run(size, stride, Mhz)); /* one throughput sample per (size, stride) */
51 |
52 | printf("\n");
53 | }
54 | exit(0);
55 | }
56 |
57 |
58 | /* init_data - initializes the array */
59 | void init_data(long *data, int n)
60 | {
61 | int i;
62 |
63 | for (i = 0; i < n; i++)
64 | data[i] = i; /* values are irrelevant for the benchmark; only the reads matter */
65 | }
66 |
67 | /* $begin mountainfuns */
68 | /* test - Iterate over first "elems" elements of array "data" with
69 | * stride of "stride", using 4x4 loop unrolling.
70 | */
71 | int test(int elems, int stride)
72 | {
73 | long i, sx2 = stride*2, sx3 = stride*3, sx4 = stride*4;
74 | long acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0; /* 4 accumulators break the add dependency chain */
75 | long length = elems;
76 | long limit = length - sx4; /* last start index at which a full 4-way step fits */
77 |
78 | /* Combine 4 elements at a time */
79 | for (i = 0; i < limit; i += sx4) {
80 | acc0 = acc0 + data[i];
81 | acc1 = acc1 + data[i+stride];
82 | acc2 = acc2 + data[i+sx2];
83 | acc3 = acc3 + data[i+sx3];
84 | }
85 |
86 | /* Finish any remaining elements */
87 | for (; i < length; i++) {
88 | acc0 = acc0 + data[i];
89 | }
90 | return ((acc0 + acc1) + (acc2 + acc3));
91 | }
92 |
93 | /* run - Run test(elems, stride) and return read throughput (MB/s).
94 | * "size" is in bytes, "stride" is in array elements, and Mhz is
95 | * CPU clock frequency in Mhz.
96 | */
97 | double run(int size, int stride, double Mhz)
98 | {
99 | double cycles;
100 | int elems = size / sizeof(long); /* data[] holds long; was sizeof(double) (same size on LP64, but the wrong type) */
101 |
102 | test(elems, stride); /* Warm up the cache */ //line:mem:warmup
103 | cycles = fcyc2(test, elems, stride, 0); /* Call test(elems,stride) */ //line:mem:fcyc
104 | return (size / stride) / (cycles / Mhz); /* Convert cycles to MB/s */ //line:mem:bwcompute
105 | }
106 | /* $end mountainfuns */
107 |
108 |
109 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/mountain.gcc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/mountain.gcc
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/plotmountain.gp:
--------------------------------------------------------------------------------
1 | set samples 100
2 | set isosamples 100
3 | set xyplane 0
4 |
5 | set xlabel "STRIDES" font ", 16"
6 | set ylabel "SIZE (KB, log_2)" font ", 16"
7 | set zlabel "MBs/sec" offset -3, 0 font ",16" rotate parallel
8 |
9 | set tics font ", 12"
10 |
11 | set pm3d
12 | splot [:17][4:17] "mountain.dat" u 1:2:3 matrix nonuniform with lines lc 0 notitle
13 |
14 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/Makefile:
--------------------------------------------------------------------------------
1 |
2 | COMPILER=gcc
3 |
4 | ifeq ($(COMPILER),gcc)
5 | CC = gcc
6 | CFLAGS = -Wall -O3 -march=native -ftree-vectorize -lm
7 | SUFFIX = .gcc
8 | LIBM = -lm
9 | endif
10 |
11 | ifeq ($(COMPILER),icc)
12 | CC = icc
13 | CFLAGS = -Wall -O3 -fast -axSSE4.2 -xHost -ipo
14 | SUFFIX = .icc
15 | LIBM =
16 | endif
17 |
18 | ifeq ($(COMPILER),pgcc)
19 | CC = pgcc
20 | CFLAGS = -Wall -O4 -fast -Munroll -Mvect=simd,fuse,tile -Mipa -lm
21 | SUFFIX = .pgcc
22 | LIBM = -lm
23 | endif
24 |
25 | mountain: mountain.c fcyc2.c
26 | $(CC) $(CFLAGS) -o mountain$(SUFFIX) mountain.c fcyc2.c $(LIBM)
27 |
28 | clean:
29 | rm -f mountain$(SUFFIX) *.o *~
30 |
31 |
32 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/fcyc2.c:
--------------------------------------------------------------------------------
1 | /* Compute time used by a function f that takes two integer args */
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | #include <time.h>
6 |
7 | #include "fcyc2.h"
8 |
9 | #define CPU_TIME ({struct timespec ts; clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), \
10 | (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;})
11 |
12 | static double *values = NULL;
13 | int samplecount = 0;
14 |
15 | #define KEEP_VALS 1
16 | #define KEEP_SAMPLES 1
17 |
18 | #if KEEP_SAMPLES
19 | double *samples = NULL;
20 | #endif
21 |
22 |
23 | static void init_sampler(int k, int maxsamples) /* reset the k-smallest tracker (and the sample log) */
24 | {
25 | if (values)
26 | free(values);
27 | values = calloc(k, sizeof(double)); /* the k smallest timings, kept sorted ascending */
28 | #if KEEP_SAMPLES
29 | if (samples)
30 | free(samples);
31 | /* Allocate extra for wraparound analysis */
32 | samples = calloc(maxsamples+k, sizeof(double));
33 | #endif
34 | samplecount = 0;
35 | }
36 |
37 |
38 | /* Add new sample: keep the k smallest timings sorted ascending in values[]. */
39 | void add_sample(double val, int k)
40 | {
41 | int pos = 0;
42 | if (samplecount < k) {
43 | pos = samplecount; /* still filling: append */
44 | values[pos] = val;
45 | } else if (val < values[k-1]) {
46 | pos = k-1; /* beats the current k-th smallest: replace it */
47 | values[pos] = val;
48 | }
49 | #if KEEP_SAMPLES
50 | samples[samplecount] = val; /* full history, kept for later analysis */
51 | #endif
52 | samplecount++;
53 | /* Insertion sort */
54 | while (pos > 0 && values[pos-1] > values[pos]) {
55 | double temp = values[pos-1];
56 | values[pos-1] = values[pos];
57 | values[pos] = temp;
58 | pos--;
59 | }
60 | }
61 |
62 | /* Get current minimum */
63 | double get_min()
64 | {
65 | return values[0]; /* values[] is kept sorted ascending by add_sample() */
66 | }
67 |
68 | /* What is relative error for kth smallest sample */
69 | double err(int k)
70 | {
71 | if (samplecount < k)
72 | return 1000.0; /* not enough samples yet: report a huge error */
73 | return (values[k-1] - values[0])/values[0]; /* spread of the k smallest, relative to the minimum */
74 | }
75 |
76 | /* Have k minimum measurements converged within epsilon? */
77 | int has_converged(int k_arg, double epsilon_arg, int maxsamples)
78 | {
79 | if ((samplecount >= k_arg) &&
80 | ((1 + epsilon_arg)*values[0] >= values[k_arg-1])) /* k smallest all within epsilon of the min */
81 | return samplecount; /* converged: report how many samples it took */
82 | if ((samplecount >= maxsamples))
83 | return -1; /* gave up */
84 | return 0; /* keep sampling */
85 | }
86 |
87 | /* Code to clear cache */
88 | #define ASIZE (1 << 20)
89 | #define STRIDE 8
90 | static int stuff[ASIZE];
91 | static int sink;
92 |
93 | static void clear()
94 | {
95 | int x = sink; /* read and write the global 'sink' so the compiler cannot drop the loop */
96 | int i;
97 | for (i = 0; i < ASIZE; i += STRIDE)
98 | x += stuff[i]; /* walk a 4 MiB static array to evict cached benchmark data */
99 | sink = x;
100 | }
101 |
102 | double fcyc2_full(test_funct f, int param1, int param2, int clear_cache,
103 | int k, double epsilon, int maxsamples, int compensate)
104 | {
105 | /* Time f(param1,param2) repeatedly until the k smallest timings agree
106 | within epsilon (or maxsamples is hit); return the smallest timing.
107 | NOTE(review): the original had two byte-identical branches on
108 | 'compensate'; the flag had (and still has) no effect, so the
109 | duplicated branch is folded into a single loop. */
110 | double result;
111 | init_sampler(k, maxsamples);
112 | do {
113 | if (clear_cache)
114 | clear(); /* evict cached data so each sample starts cold */
115 | f(param1, param2); /* warm cache */
116 | double tstart = CPU_TIME;
117 | f(param1, param2);
118 | tstart = CPU_TIME - tstart; /* elapsed seconds for one timed call */
119 | add_sample(tstart, k);
120 | } while (!has_converged(k, epsilon, maxsamples) && samplecount < maxsamples);
121 | #ifdef DEBUG
122 | {
123 | int i;
124 | printf(" %d smallest values: [", k);
125 | for (i = 0; i < k; i++)
126 | printf("%.0f%s", values[i], i==k-1 ? "]\n" : ", ");
127 | }
128 | #endif
129 | result = values[0];
130 | #if !KEEP_VALS
131 | free(values);
132 | values = NULL;
133 | #endif
134 | return result;
135 | }
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 | double fcyc2(test_funct f, int param1, int param2, int clear_cache)
146 | {
147 | return fcyc2_full(f, param1, param2, clear_cache, 3, 0.01, 500, 0); /* k=3, epsilon=1%, up to 500 samples, no compensation */
148 | }
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/fcyc2.h:
--------------------------------------------------------------------------------
1 | /* Find number of cycles used by function that takes 2 arguments */
2 |
3 | /* Function to be tested takes two integer arguments */
4 | typedef int (*test_funct)(int, int);
5 |
6 | /* Compute time used by function f */
7 | double fcyc2(test_funct f, int param1, int param2, int clear_cache);
8 |
9 | /********* These routines are used to help with the analysis *********/
10 |
11 | /*
12 | Parameters:
13 | k: How many samples must be within epsilon for convergence
14 | epsilon: What is tolerance
15 | maxsamples: How many samples until give up?
16 | */
17 |
18 | /* Full version of fcyc with control over parameters */
19 | double fcyc2_full(test_funct f, int param1, int param2, int clear_cache,
20 | int k, double epsilon, int maxsamples, int compensate);
21 |
22 | /* Get current minimum */
23 | double get_min();
24 |
25 | /* What is convergence status for k minimum measurements within epsilon
26 | Returns 0 if not converged, #samples if converged, and -1 if can't
27 | reach convergence
28 | */
29 |
30 | int has_converged(int k, double epsilon, int maxsamples);
31 |
32 | /* What is error of current measurement */
33 | double err(int k);
34 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/mountain.c:
--------------------------------------------------------------------------------
1 | /* mountain.c - Generate the memory mountain. */
2 | /* $begin mountainmain */
3 | #include <stdio.h>
4 | #include <stdlib.h>
5 | #include <math.h>
6 | #include "fcyc2.h"
7 |
8 |
9 | #define MINBYTES (1 << 14) /* First working set size */
10 | #define MAXBYTES (1 << 27) /* Last working set size */
11 | #define MAXSTRIDE 15 /* Stride x8 bytes */
12 | #define MAXELEMS MAXBYTES/sizeof(long)
13 |
14 |
15 | long data[MAXELEMS]; /* The global array we'll be traversing */
16 |
17 | void init_data(long *data, int n);
18 | int test(int elems, int stride);
19 | double run(int size, int stride);
20 |
21 |
22 | int main()
23 | {
24 | int size; /* Working set size (in bytes) */
25 | int stride; /* Stride (in array elements) */
26 |
27 | init_data(data, MAXELEMS); /* Initialize each element in data */
28 |
29 | printf("# Memory mountain (MB/sec)\n");
30 |
31 |
32 | printf("%d\t", MAXSTRIDE);
33 | for (stride = 1; stride <= MAXSTRIDE; stride++)
34 | printf("%d\t", stride);
35 |
36 | printf("\n");
37 |
38 | /* begin mountainmain */
39 | for (size = MAXBYTES; size >= MINBYTES; size >>= 1)
40 | {
41 | int log2size_kb = (int)(log2((double)size / 1024.0));
42 | printf("%d\t", log2size_kb);
43 |
44 | for (stride = 1; stride <= MAXSTRIDE; stride++)
45 | printf("%.0f\t", run(size, stride));
46 |
47 | printf("\n");
48 | }
49 | exit(0);
50 | }
51 |
52 |
53 | /* init_data - initializes the array */
54 | void init_data(long *data, int n)
55 | {
56 | int i;
57 |
58 | for (i = 0; i < n; i++)
59 | data[i] = i;
60 | }
61 |
62 | /* $begin mountainfuns */
63 | /* test - Iterate over first "elems" elements of array "data" with
64 | * stride of "stride", using 4x4 loop unrolling.
65 | */
66 | int test(int elems, int stride)
67 | {
68 | long i, sx2 = stride*2, sx3 = stride*3, sx4 = stride*4;
69 | long acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
70 | long length = elems;
71 | long limit = length - sx4;
72 |
73 | /* Combine 4 elements at a time */
74 | for (i = 0; i < limit; i += sx4) {
75 | acc0 = acc0 + data[i];
76 | acc1 = acc1 + data[i+stride];
77 | acc2 = acc2 + data[i+sx2];
78 | acc3 = acc3 + data[i+sx3];
79 | }
80 |
81 | /* Finish any remaining elements */
82 | for (; i < length; i++) {
83 | acc0 = acc0 + data[i];
84 | }
85 | return ((acc0 + acc1) + (acc2 + acc3));
86 | }
87 |
/* run - Run test(elems, stride) and return read throughput (MB/s).
 * "size" is in bytes and "stride" is in array elements.
 * (The old comment mentioned an "Mhz" parameter that this version
 * no longer takes: fcyc2 reports time directly.)
 */
double run(int size, int stride)
{
    double timing;
    /* data[] holds longs, so the element count must use sizeof(long).
       The previous sizeof(double) only worked because both types are
       8 bytes on LP64 platforms. */
    int elems = size / sizeof(long);

    test(elems, stride);                     /* Warm up the cache */
    timing = fcyc2(test, elems, stride, 0);  /* Call test(elems,stride) */
    return (size / stride) / timing;         /* Convert to MB/s */
}
101 | /* $end mountainfuns */
102 |
103 |
104 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/mountain.gcc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/mountain.gcc
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/combine_2_arrays/compile:
--------------------------------------------------------------------------------
#!/bin/bash

# Build every pipeline variant v?.c twice: once unoptimized, once with -O3.
for f in v?.c;
do
    version=$( echo $f | cut -d'.' -f1 | cut -d'v' -f2)
    echo "compiling "$version" -> v"$version
    gcc -std=c11 -DUSE_PAPI -DPIPELINE=$version -o v$version pipeline.c -lm -lpapi
    gcc -std=c11 -DUSE_PAPI -DPIPELINE=$version -o v$version.O3n pipeline.c -lm -lpapi -O3 -march=native -mavx2
done

echo "compiling vector"
# fix: the output name was misspelled "vetor"; the run script expects an
# executable literally named "vector"
gcc -std=c11 -DUSE_PAPI -march=native -o vector vector.c -lm -lpapi
gcc -std=c11 -DUSE_PAPI -O3 -march=native -mavx2 -o vector.O3n vector.c -lm -lpapi
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/combine_2_arrays/mypapi.h:
--------------------------------------------------------------------------------
1 |
2 |
3 | #if defined(USE_PAPI) // -----------------------------------------------------------
#include <papi.h>
5 |
6 | #define PAPI_EVENTS_NUM 2
7 | int papi_events[PAPI_EVENTS_NUM] = {PAPI_TOT_INS, PAPI_TOT_CYC };
8 | int papi_EventSet = PAPI_NULL; // the handle for the events' set
9 | long long papi_buffer[PAPI_EVENTS_NUM] = {0}; // storage for the counters' values
10 | long long papi_values[PAPI_EVENTS_NUM] = {0}; // accumulate the counters' values
11 |
12 | // check that PAPI is OK, exit if not
13 | #define PAPI_CHECK( R ) { \
14 | if ( (R) != PAPI_OK ) { \
15 | printf("a problem with PAPI (code %d) arise at line %d\n", \
16 | (R), __LINE__);fflush(stdout); return (R); }}
17 |
18 |
19 | // check that PAPI is OK,
20 | // issue a warning if not with a
21 | // provided message
22 | #define PAPI_WARN( R, S ) { \
23 | if ( (R) != PAPI_OK ) { \
24 | printf("a problem with PAPI (code %d) arise at line %d: %s\n", \
25 | (R), __LINE__, (S)); fflush(stdout); }}
26 |
27 | // check that PAPI is OK about an event
28 | // issue a warning if not with a
29 | // provided message
30 | #define PAPI_WARN_EVENT( R, E, S1, n ) { \
31 | if ( (R) != PAPI_OK ) { \
32 | printf("a problem with PAPI (code %d) : event %d arise at line %d: %s (%d)\n", \
33 | (R), (E), __LINE__, (S1), (n)); fflush(stdout); }}
34 |
35 |
36 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \
37 | retval = PAPI_query_event(papi_events[i]); \
38 | if ( retval == PAPI_OK ) { \
39 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \
40 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \
41 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } }
42 |
43 | #define PAPI_INIT { \
44 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \
45 | if (retval != PAPI_VER_CURRENT) \
46 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \
47 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \
48 | PAPI_ADD_EVENTS_to_SET; }
49 |
50 | // to use HIGH-LEVEL API
51 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
52 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
53 |
54 | // to use NORMAL API
55 | #define PAPI_START_CNTR { \
56 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); }
57 |
58 | #define PAPI_STOP_CNTR { \
59 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \
60 | if( retval == PAPI_OK ) { \
61 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
62 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); }
63 |
64 |
65 |
66 | #else // -----------------------------------------------------------
67 |
68 | #define PAPI_INIT
69 | #define PAPI_START_CNTR
70 | #define PAPI_STOP_CNTR
71 |
72 | #endif // -----------------------------------------------------------
73 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/combine_2_arrays/run:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #./compile_all
4 |
5 | export LC_NUMERIC="en_US.UTF-8" #that is to avoid problems with locales when using printf
6 |
7 | declare -a opts=("" ".O3n")
8 | declare -a opt_names=("no-opt" "opt")
9 |
10 | ntypes=${#types[@]}
11 | nopts=${#opts[@]}
12 | ncompilers=${#compilers[@]}
13 |
14 | # --------------------------------------------------
15 | # get results
16 | timings=()
17 | IPC=()
18 |
19 | execs=(v?)
20 | execs+=(vector)
21 |
22 | for f in ${execs[@]};
23 | do
24 | version=$(echo $f | cut -f2 -d'v')
25 | printf "\trunning version v%s\n" $version
26 |
27 | for (( o=0; o out
31 |
32 | IPC+=($(cat out | grep IPC | cut -d':' -f2 ))
33 | timings+=($(cat out | grep cycles-per-element | cut -d':' -f3 | cut -d']' -f1 ))
34 | done
35 | rm -f out
36 | done
37 |
38 |
39 | # --------------------------------------------------
40 | # write results on the stdout
41 |
42 | # ............................
43 | # headers
44 | echo
45 | printf "%s\t" ""
46 | for (( o=0; o
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
11 | double cclock()
12 | /* Returns elepsed seconds past from the last call to timer rest */
13 | {
14 |
15 | struct timeval tmp;
16 | double sec;
17 | gettimeofday( &tmp, (struct timezone *)0 );
18 | sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
19 | return sec;
20 | }
21 |
/* Fill an n x m row-major matrix with a[i][j] = i*m + j + stride. */
void setup_matrix(double* a, int n, int m, int stride)
{
    int row, col;
    for (row = 0; row < n; row++)
        for (col = 0; col < m; col++)
        {
            int idx = row * m + col;
            a[idx] = idx + stride;
        }
}
30 |
/* Zero every entry of an n x m row-major matrix. */
void clear_matrix(double* a, int n, int m)
{
    int idx;
    for (idx = 0; idx < n * m; idx++)
        a[idx] = 0.0;
}
39 |
40 |
/* Naive triple loop (i, j, o order): c(n,o) += a(n,m) * b(m,o), all
   row-major. The caller must zero c before the first call. */
void mat_mult(double* a, double* b, double* c, int n, int m, int o)
{
    int i, j, k;
    for (i = 0; i < n; i++)
        for (j = 0; j < o; j++)
        {
            /* accumulate the dot product in a scalar, then store */
            double acc = c[i*o + j];
            for (k = 0; k < m; k++)
                acc += a[i*m + k] * b[k*o + j];
            c[i*o + j] = acc;
        }
}
50 |
51 |
/* Loop-interchanged version (i, k, j order): the innermost loop walks
   both c and b with unit stride, which is far more cache friendly than
   the naive (i, j, k) order. Same arithmetic, same results. */
void mat_mult_opt(double* a, double* b, double* c, int n, int m, int o)
{
    int i, j, k;
    for (i = 0; i < n; i++)
        for (k = 0; k < m; k++)
        {
            const double aik = a[i*m + k];   /* invariant in the j loop */
            for (j = 0; j < o; j++)
                c[i*o + j] += aik * b[k*o + j];
        }
}
63 |
64 |
/* Driver: c(n,o) = a(n,m) * b(m,o), timing either the naive (case 0)
   or the loop-interchanged (any other case) multiplication. */
int main(int argc, char** argv)
{
    double *a, *b, *c;
    int w, m, n, o;
    double begin, end;

    if (argc < 5)
    {
        printf(" Calculates c(n,o)=a(n,m)*b(m,o) \n");
        printf(" Usage: %s case n m o ", argv[0]);
        return 1;
    }

    w = atoi(argv[1]);
    n = atoi(argv[2]);
    m = atoi(argv[3]);
    o = atoi(argv[4]);

    /* size_t arithmetic: n*m as int can overflow for large sizes */
    a = malloc((size_t)n * m * sizeof(double));
    b = malloc((size_t)m * o * sizeof(double));
    c = malloc((size_t)n * o * sizeof(double));
    /* the original code dereferenced these without checking */
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "memory allocation failed\n");
        free(a); free(b); free(c);
        return 1;
    }

    setup_matrix(a, n, m, 0);
    setup_matrix(b, m, o, m*n);
    clear_matrix(c, n, o);

    if( w == 0 )
    {
        begin = cclock();
        mat_mult(a, b, c, n, m, o);
        end = cclock();
        printf ("NON-optimized elapsed time %9.4f s \n\n", end - begin );
    }
    else
    {
        begin = cclock();
        mat_mult_opt(a, b, c, n, m, o);
        end = cclock();
        printf (" Optimized Elapsed time %9.4f s \n\n", end - begin );
    }

    // printf("%f\n", c[0]);
    free(a);
    free(b);
    free(c);

    return 0;
}
115 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/matrix_multiplication/mypapi.h:
--------------------------------------------------------------------------------
1 |
2 |
3 | #if defined(USE_PAPI) // -----------------------------------------------------------
#include <papi.h>
5 |
6 | typedef unsigned long long int uLint;
7 |
8 | #define PAPI_EVENTS_NUM 3
9 | int papi_events[PAPI_EVENTS_NUM] = {PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_DCM };
10 | int papi_EventSet = PAPI_NULL; // the handle for the events' set
11 | uLint papi_buffer[PAPI_EVENTS_NUM] = {0}; // storage for the counters' values
12 | uLint papi_values[PAPI_EVENTS_NUM] = {0}; // accumulate the counters' values
13 |
14 | // check that PAPI is OK, exit if not
15 | #define PAPI_CHECK( R ) { \
16 | if ( (R) != PAPI_OK ) { \
17 | printf("a problem with PAPI (code %d) arise at line %d\n", \
18 | (R), __LINE__);fflush(stdout); return (R); }}
19 |
20 |
21 | // check that PAPI is OK,
22 | // issue a warning if not with a
23 | // provided message
24 | #define PAPI_WARN( R, S ) { \
25 | if ( (R) != PAPI_OK ) { \
26 | printf("a problem with PAPI (code %d) arise at line %d: %s\n", \
27 | (R), __LINE__, (S)); fflush(stdout); }}
28 |
29 | // check that PAPI is OK about an event
30 | // issue a warning if not with a
31 | // provided message
32 | #define PAPI_WARN_EVENT( R, E, S1, n ) { \
33 | if ( (R) != PAPI_OK ) { \
34 | printf("a problem with PAPI (code %d) : event %d arise at line %d: %s (%d)\n", \
35 | (R), (E), __LINE__, (S1), (n)); fflush(stdout); }}
36 |
37 |
38 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \
39 | retval = PAPI_query_event(papi_events[i]); \
40 | if ( retval == PAPI_OK ) { \
41 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \
42 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \
43 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } }
44 |
45 | #define PAPI_INIT { \
46 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \
47 | if (retval != PAPI_VER_CURRENT) \
48 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \
49 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \
50 | PAPI_ADD_EVENTS_to_SET; }
51 |
52 | // to use HIGH-LEVEL API
53 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
54 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
55 |
56 | // to use NORMAL API
57 | #define PAPI_START_CNTR { \
58 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); }
59 |
60 | #define PAPI_STOP_CNTR { \
61 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \
62 | if( retval == PAPI_OK ) { \
63 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
64 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); }
65 |
66 | #define PAPI_FLUSH_BUFFER { \
67 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
68 | papi_buffer[ jj] = 0; }
69 |
70 | #define PAPI_FLUSH { \
71 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
72 | papi_values[jj] = papi_buffer[ jj] = 0; }
73 |
74 |
75 | #else // -----------------------------------------------------------
76 |
77 | #define PAPI_INIT
78 | #define PAPI_START_CNTR
79 | #define PAPI_STOP_CNTR
80 |
81 | #endif // -----------------------------------------------------------
82 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/matrix_multiplication/plot.gp:
--------------------------------------------------------------------------------
1 | reset
2 | set terminal pngcairo size 1600,1000 dashlength 2 truecolor font "Garamond, 28"
3 | #set terminal qt enhanced size 1200,1000
4 |
5 | set key inside top left font ",22"
6 | set tics font ",22"
7 | set lmargin screen 0.08
8 | set rmargin screen 0.95
9 | set bmargin screen 0.12
10 |
11 | set xlabel "N" font ",22" offset 0,0.5
12 |
13 | unset yrange
14 | unset xrange
15 |
16 | array OPT[2]
17 | OPT[1] = "O0 "
18 | OPT[2] = "O3 "
19 |
20 | array W[2]
21 | W[1] = 3
22 | W[2] = 1.5
23 |
24 | array DT[2]
25 | DT[1] = "-- __"
26 | DT[2] = 1
27 |
28 | array TYPE[3]
29 | TYPE[1] = "naive"
30 | TYPE[2] = "optimized"
31 | TYPE[3] = "tailed"
32 |
33 |
34 | # ---------------------------------------------
35 |
36 | set output "timings.png"
37 | set ylabel "timing (sec)" font ",22" offset 2
38 |
39 |
40 | plot for[L = 1:2] for [i = 2:4] "timings" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1],\
41 | "" u 1:(1.5e-8*$1**3) w l lc 0 lw 2 dt '..' notitle,\
42 | "" u 1:(3e-9*$1**3) w l lc 0 lw 2 dt '..' notitle
43 |
44 |
45 | # ---------------------------------------------
46 |
47 | set output "timings_ratio.png"
48 | set ylabel "timings / timings_{naive}" font ",22" offset 2
49 |
50 | ref = 2
51 | clr = 2
52 | plot for[L = 1:2] for [i = 3:4] "timings" u 1:(column(i+(L-1)*3)/column(ref)) w lp ps 2 lw W[L] dt DT[L] lc ((L-1)*3+(i-1)) title OPT[L].TYPE[i-1]
53 |
54 | # ---------------------------------------------
55 |
56 | set output "timings_per_element.png"
57 | set ylabel "timing per element (nsec)" font ",22" offset 2
58 |
59 | plot for[L = 1:2] for [i = 2:4] "timings" u 1:(column(i+(L-1)*3)/($1**3)*1e9) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
60 |
61 |
62 | # ---------------------------------------------
63 |
64 | set output "CPE.png"
65 | set ylabel "CPE" font ",22" offset 2
66 |
67 | plot for[L = 1:2] for [i = 2:4] "CPEs" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
68 |
69 |
70 | # ---------------------------------------------
71 |
72 | set output "L1M.png"
73 | set ylabel "Level 1 misses per element" font ",22" offset 2
74 |
75 | plot for[L = 1:2] for [i = 2:4] "L1Ms" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
76 |
77 |
78 | # ---------------------------------------------
79 |
80 | set output "IPC.png"
81 | set key inside bottom left
82 | set ylabel "IPC" font ",22" offset 2
83 | set yrange [:4]
84 |
85 | plot for[L = 1:2] for [i = 2:4] "IPCs" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
86 |
87 |
88 |
89 |
90 | set output
91 | reset
92 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/matrix_multiplication/run:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export LC_NUMERIC="en_US.UTF-8"
4 | export LC_LOCALE="en_US.UTF-8"
5 |
6 | exec=matmul
7 |
8 | declare -a outputs=("timings" "IPCs" "CPEs" "L1Ms")
9 | ndata=${#outputs[@]}
10 |
11 | declare -a optimizations=("Non-opt" "Opt")
12 | noptimizations=${#optimizations[@]}
13 |
14 | declare -a versions=("naive " "lpswap" "tailed")
15 | nversions=${#versions[@]}
16 |
17 | # --------------------------------------------
18 | # SAVE THE OLD TABLES, IF PRESENT
19 | # --------------------------------------------
20 |
21 | for (( o=0 ; o < $ndata; o++ ));
22 | do
23 | mv -f ${outputs[$o]} ${outputs[$o]}.back
24 | echo -n "# ">> ${outputs[$o]}
25 |
26 | for (( p=0; p < $noptimizations; p++ )); do
27 | echo -e -n ${optimizations[$p]}"\t\t" >> ${outputs[$o]} ;
28 | done
29 | echo >> ${outputs[$o]}
30 |
31 | echo -n "#N ">> ${outputs[$o]}
32 | for (( p=0; p < $noptimizations; p++ )); do
33 | for (( v=0; v < $nversions; v++ )); do
34 | echo -n ${versions[$v]}" " >> ${outputs[$o]}; done;
35 | done
36 | echo >> ${outputs[$o]}
37 | done
38 |
39 | # --------------------------------------------
40 | # PREPARE OUTPUT FOLDER
41 | # --------------------------------------------
42 |
43 | output_dir=./output_saved
44 | if [ ! -d $output_dir ]; then mkdir $output_dir; fi
45 |
46 | # --------------------------------------------
47 |
48 |
49 | start=100
50 | stop=3000
51 | inc=100
52 |
53 |
54 |
55 |
56 | echo -n "running.. "
57 | for (( N=$start; N<=$stop; N+=$inc ));
58 | do
59 | echo -n "N="$N".. "
60 | for (( V=0; V<$nversions; V++ ));
61 | do
62 | taskset -c 2 ./$exec $V $N $N $N > ${output_dir}/output.${V}.${N}
63 | results+=($(cat ${output_dir}/output.${V}.${N} | gawk '{ if($1=="elapsed") time=$3; else if($1=="IPC:") IPC=$2; else if($1=="cycles-per-element:") CPE=$2; else if($1=="L1miss-per-element:") L1M=$2} END {print time, IPC, CPE,L1M}'))
64 |
65 | taskset -c 2 ./${exec}.On $V $N $N $N > ${output_dir}/output.O.${V}.${N}
66 | resultsO+=($(cat ${output_dir}/output.O.${V}.${N} | gawk '{ if($1=="elapsed") time=$3; else if($1=="IPC:") IPC=$2; else if($1=="cycles-per-element:") CPE=$2; else if($1=="L1miss-per-element:") L1M=$2} END {print time, IPC, CPE,L1M}'))
67 |
68 | done
69 |
70 | for (( o=0 ; o < ${#outputs[@]}; o++ ));
71 | do
72 | echo -n $N" " >> ${outputs[$o]}
73 | for (( c=0; c<$nversions; c++ )); do echo -n ${results[$(($c*$ndata+$o))]}" " >> ${outputs[$o]}; done
74 | for (( c=0; c<$nversions; c++ )); do echo -n ${resultsO[$(($c*$ndata+$o))]}" " >> ${outputs[$o]}; done
75 | echo >> ${outputs[$o]}
76 | done
77 |
78 | results=()
79 | resultsO=()
80 | done
81 | echo
82 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/Makefile:
--------------------------------------------------------------------------------
1 | CC=gcc
2 | #CFLAGS=-Wall -O1 -msse3
3 | CFLAGS= -O3 -march=native
4 | OBJ=poly.o benchmark.o timing/clock.o statistics/cpe.o statistics/fcyc.o statistics/lsquare.o
5 | LDFLAGS=-lm
6 | # phony targets will always be remade, so a file named "clean"
7 | # won't prevent the clean target from running
8 | .PHONY: all clean run
9 | EXE=driver
10 |
11 | all: $(EXE)
12 |
13 | $(EXE): $(OBJ)
14 | $(CC) $(CFLAGS) -o $(EXE) $(OBJ) $(LDFLAGS)
15 |
16 | run: $(EXE)
17 | ./$(EXE)
18 |
19 | clean:
20 | rm -f $(EXE) $(OBJ)
21 |
22 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/benchmark.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>  /* NOTE(review): header names were lost in extraction; reconstructed from usage */
4 | #include "statistics/cpe.h"
5 | #include "poly.h"
6 |
7 | #define SHORT 0
8 | #if SHORT
9 | #define ASIZE 31
10 | #else
11 | #define ASIZE 973
12 | #endif
13 | #define EPS (1e-8)
14 |
15 | /* Keep track of a number of different programs */
16 | #define MAX_BENCHMARKS 100
17 |
18 | static struct {
19 | poly_t cfunct;
20 | char *description;
21 | double cpe;
22 | } benchmarks[MAX_BENCHMARKS];
23 |
24 | static int benchmark_count = 0;
25 | static int current_benchmark = 0;
26 |
27 | static double* data = NULL;
28 | static double x;
29 | static double result;
30 | static poly_t check_func = NULL;
31 |
32 | static void setup()
33 | {
34 | int i;
35 | if (!data)
36 | data = (double*) malloc(sizeof(double) * ASIZE);
37 | if (!data) {
38 | fprintf(stderr, "Memory allocation error!\n");
39 | exit(EXIT_FAILURE);
40 | }
41 | /* Initialize array */
42 | for (i = 0; i < ASIZE; i++)
43 | data[i] = (drand48() * 2) - 1;
44 | x = (drand48() * 2) - 1;
45 | }
46 |
/* run - callback handed to find_cpe_full: evaluates the currently
   selected polynomial on the first cnt coefficients; the result is
   stored in a global so the call cannot be optimized away. */
void run(int cnt) {
    result = benchmarks[current_benchmark].cfunct(data, x, cnt);
}
50 |
51 | static void run_test(int bench_index) {
52 | double cpe;
53 | char *description = benchmarks[bench_index].description;
54 | double good_result;
55 | current_benchmark = bench_index;
56 | printf("starting benchmark %d\n", bench_index);
57 | setup();
58 | cpe = find_cpe_full(run, ASIZE, 200000, stdout, RAN_SAMPLE, 0.3, 0);
59 | if (check_func) {
60 | result = benchmarks[bench_index].cfunct(data, x, ASIZE);
61 | good_result = check_func(data, x, ASIZE);
62 | if (result - good_result > EPS) {
63 | printf("Function %s, Should be %f, Got %f\n",
64 | description, good_result, result);
65 | }
66 | }
67 | benchmarks[current_benchmark].cpe = cpe;
68 | /* print results */
69 | printf("%s: ", description);
70 | printf("%.2f cycles/element\n\n", cpe);
71 | }
72 |
73 | void add_function(poly_t f, char *description) {
74 | benchmarks[benchmark_count].cfunct = f;
75 | benchmarks[benchmark_count].description = description;
76 | benchmark_count++;
77 | }
78 |
/* set_check_function - install f as the reference implementation that
   run_test uses to validate each benchmark's result. */
void set_check_function(poly_t f) {
    check_func = f;
}
82 |
83 | int main()
84 | {
85 | int i;
86 | register_functions();
87 | printf("\n");
88 | for (i = 0; i < benchmark_count; i++) {
89 | run_test(i);
90 | }
91 | free(data);
92 | return EXIT_SUCCESS;
93 | }
94 |
95 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/poly.c:
--------------------------------------------------------------------------------
1 | #include "poly.h"
2 |
/* Evaluate a[0] + a[1]*x + ... + a[degree]*x^degree directly,
   carrying the current power of x along in xk. */
double poly(double a[], double x, int degree)
{
    long int k;
    double acc = a[0];
    double xk = x;   /* equals x^k at the top of each iteration */
    for (k = 1; k <= degree; k++) {
        acc += a[k] * xk;
        xk = x * xk;
    }
    return acc;
}
14 |
/* Horner's rule: (((a[d]*x + a[d-1])*x + ...)*x + a[0]). */
double polyh(double a[], double x, int degree)
{
    long int k;
    double acc = a[degree];
    for (k = degree - 1; k >= 0; k--)
        acc = a[k] + x * acc;
    return acc;
}
23 |
/* mypoly1 - direct evaluation unrolled x2.
 * BUG FIX: the unrolled loop consumes coefficients in pairs (a[i],
 * a[i+1]) while i < degree, so when degree is ODD the last coefficient
 * a[degree] was silently dropped. A tail loop (as in mypoly2) now
 * finishes the remaining term.
 */
double mypoly1(double a[], double x, int degree)
{
    long int i;
    double x2 = x*x;
    double res = a[0];
    double xpwr = x;          /* x^i at the top of each iteration */

    for ( i = 1; i < degree; i += 2 )
    {
        res += a[i] * xpwr;
        res += a[i+1] * xpwr * x;
        xpwr *= x2;
    }
    /* at most one leftover coefficient (odd degree) */
    for ( ; i <= degree; i++ )
    {
        res += a[i] * xpwr;
        xpwr *= x;
    }
    return res;
}
39 |
/* Unroll x2 with a tail loop: two coefficients are folded per
   iteration as (a[k] + a[k+1]*x) * x^k, then any single leftover
   coefficient (odd/even mismatch) is handled separately. */
double mypoly2(double a[], double x, int degree)
{
    long int k;
    double xsq = x * x;
    double acc = a[0];
    double xk = x;            /* x^k */

    for ( k = 1; k < degree; k += 2 )
    {
        acc += (a[k] + a[k+1]*x) * xk;
        xk *= xsq;
    }
    /* at most one coefficient remains */
    for ( ; k <= degree; k++ )
    {
        acc += a[k] * xk;
        xk *= x;
    }
    return acc;
}
60 |
/* mypoly3 - unroll x4 with two independent accumulators.
 * BUG FIXES vs the original:
 *  1. res2 was seeded with a[2], adding a spurious a[2] to the result;
 *     it must start at 0.
 *  2. each iteration consumes FOUR coefficients, so the running powers
 *     must advance by x^4 per iteration, not x^2 — the original was
 *     wrong for any polynomial needing more than one loop iteration.
 *  3. the loop bound now guarantees a[i+3] stays in range.
 */
double mypoly3(double a[], double x, int degree)
{
    long int i;
    double x2 = x*x;
    double x4 = x2*x2;
    double res1 = a[0];
    double res2 = 0;
    double xpwr  = x;       /* x^i     */
    double xpwr3 = x2*x;    /* x^(i+2) */

    for ( i = 1; i + 3 <= degree; i += 4 )
    {
        res1 += (a[i]   + a[i+1]*x) * xpwr;
        res2 += (a[i+2] + a[i+3]*x) * xpwr3;
        xpwr  *= x4;
        xpwr3 *= x4;
    }
    /* finish the remaining (at most 3) coefficients */
    for ( ; i <= degree; i++ )
    {
        res1 += a[i] * xpwr;
        xpwr *= x;
    }

    return res1 + res2;
}
85 |
86 |
/* mypoly4 - split even/odd accumulation.
 * BUG FIX: the loop ran with condition i <= degree while reading
 * a[i+1], so for ODD degree it read a[degree+1] out of bounds (and
 * folded garbage into the result). The loop now stops at i < degree
 * and a tail loop handles the single leftover odd coefficient.
 */
double mypoly4(double a[], double x, int degree)
{
    long int i;
    double x2 = x*x;
    double res_even = a[0];
    double res_odd = 0;
    double xpwr_even = x2;   /* x^(i+1) for the even-index stream */
    double xpwr_odd = x;     /* x^i     for the odd-index stream  */

    for ( i = 1; i < degree; i += 2 )
    {
        res_odd += a[i] * xpwr_odd;
        xpwr_odd *= x2;
        res_even += a[i+1] * xpwr_even;
        xpwr_even *= x2;
    }
    /* odd degree leaves exactly one coefficient; xpwr_odd == x^i here */
    for ( ; i <= degree; i++ )
    {
        res_odd += a[i] * xpwr_odd;
        xpwr_odd *= x;
    }
    return res_even + res_odd;
}
110 |
111 | void register_functions(void)
112 | {
113 | set_check_function(&poly); /* used as reference implementation */
114 |
115 | add_function(&poly, "Polynomial: Naive implementation");
116 | add_function(&polyh, "Polynomial: Horner's method");
117 | add_function(&mypoly1, "Polynomial: my poly1, unroll x 2");
118 | add_function(&mypoly2, "Polynomial: my poly2, 2 separate loops");
119 | add_function(&mypoly3, "Polynomial: my poly3, unroll x 2 and separate accumulation");
120 |
121 | return;
122 | }
123 |
124 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/poly.h:
--------------------------------------------------------------------------------
1 | #if __INTEL_COMPILER
2 | /* inline function definitions */
3 | #pragma warning ( disable : 1418 )
4 | #endif
5 |
6 | typedef double (*poly_t)(double*, double, int);
7 | /* Add routine to list of programs to measure */
8 | void add_function(poly_t f, char *description);
9 | /* Set routine to check results against */
10 | void set_check_function(poly_t f);
11 | /* called by main to register the set of routines to benchmark */
12 | void register_functions(void);
13 |
14 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/readme.md:
--------------------------------------------------------------------------------
pipelining at work in the evaluation of polynomials
2 |
3 | just typing "make" you should get an executable named "driver".
4 | by default, -O3 -march=native is enabled.
5 |
6 | That will evaluate a polynomial using different functions that are defined in poly.c
7 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/cpe.h:
--------------------------------------------------------------------------------
1 | /* Compute CPE for function */
2 |
3 | /* Compute for function that is linear in some parameter cnt */
4 | typedef void (*elem_fun_t)(int);
5 |
6 | /* Different ways of finding samples
7 | UNI_SAMPLE: samples uniformly spaced between bias*maxcnt and maxcnt
8 | RAN_SAMPLE: samples randomly selected between bias*maxcnt and maxcnt
9 | */
10 |
11 | typedef enum {UNI_SAMPLE, RAN_SAMPLE}
12 | sample_t;
13 |
14 | /* Find cpe for function f, which allows cnt up to maxcnt.
15 | Uses default parameters
16 | */
17 | double find_cpe(elem_fun_t f, int maxcnt);
18 |
19 | /* Find cpe for function f, which allows cnt up to maxcnt, using
20 | specified number of sample points.
21 | If data_file, then print data so that can plot points with Excel
22 | smethod determines method for generating samples
23 | */
24 | double find_cpe_full(elem_fun_t f, int maxcnt, int samples, FILE *data_file,
25 | sample_t smethod, double bias, int verbose);
26 |
27 | /* Find number of cycles taken by function.
28 | Do this by running number of trials until best two within TOL (2%) of
29 | each other
30 | */
31 | double measure_function(elem_fun_t f, int cnt);
32 |
33 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/fcyc.h:
--------------------------------------------------------------------------------
1 |
2 | /* Fcyc measures the speed of any "test function." Such a function
3 | is passed a list of integer parameters, which it may interpret
4 | in any way it chooses.
5 | */
6 |
7 | typedef void (*test_funct)(int *);
8 |
9 | /* Compute number of cycles used by function f on given set of parameters */
10 | double fcyc(test_funct f, int* params);
11 |
12 | /***********************************************************/
13 | /* Set the various parameters used by measurement routines */
14 |
15 |
16 | /* When set, will run code to clear cache before each measurement
17 | Default = 0
18 | */
19 | void set_fcyc_clear_cache(int clear);
20 |
21 | /* Set size of cache to use when clearing cache
22 | Default = 1<<19 (512KB)
23 | */
24 | void set_fcyc_cache_size(int bytes);
25 |
26 | /* Set size of cache block
27 | Default = 32
28 | */
29 | void set_fcyc_cache_block(int bytes);
30 |
31 | /* When set, will attempt to compensate for timer interrupt overhead
32 | Default = 0
33 | */
34 | void set_fcyc_compensate(int compensate);
35 |
36 | /* Value of K in K-best
37 | Default = 3
38 | */
39 | void set_fcyc_k(int k);
40 |
41 | /* Maximum number of samples attempting to find K-best within some tolerance.
42 | When exceeded, just return best sample found.
43 | Default = 20
44 | */
45 | void set_fcyc_maxsamples(int maxsamples);
46 |
47 | /* Tolerance required for K-best
48 | Default = 0.01
49 | */
50 | void set_fcyc_epsilon(double epsilon);
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/lsquare.c:
--------------------------------------------------------------------------------
1 | /* Compute least squares fit of set of data points */
#include <stdio.h>
#include <stdlib.h>
4 | #include "lsquare.h"
5 |
/* Running sums shared by the least-squares formulas. */
typedef struct {
    double sum_x;    /* sum of x            */
    double sum_y;    /* sum of y            */
    double sum_xx;   /* sum of x*x          */
    double sum_xy;   /* sum of x*y          */
} ls_stat_t;

/* Accumulate the four sums over the cnt data points into *statp. */
static void ls_stats(double *xval, double *yval, int cnt, ls_stat_t *statp)
{
    int i;
    double sx = 0.0, sy = 0.0, sxx = 0.0, sxy = 0.0;

    for (i = 0; i < cnt; i++) {
        double xi = xval[i];
        double yi = yval[i];
        sx  += xi;
        sy  += yi;
        sxx += xi * xi;
        sxy += xi * yi;
    }
    statp->sum_x  = sx;
    statp->sum_y  = sy;
    statp->sum_xx = sxx;
    statp->sum_xy = sxy;
}
30 |
31 | double ls_slope(double *xval, double *yval, int cnt)
32 | {
33 | double slope;
34 | ls_stat_t stat;
35 | ls_stats(xval, yval, cnt, &stat);
36 | slope = (cnt * stat.sum_xy - stat.sum_x * stat.sum_y)/
37 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x);
38 | return slope;
39 | }
40 |
41 | double ls_intercept(double *xval, double *yval, int cnt)
42 | {
43 | double intercept;
44 | ls_stat_t stat;
45 | ls_stats(xval, yval, cnt, &stat);
46 | intercept = (stat.sum_xx * stat.sum_y - stat.sum_xy * stat.sum_x)/
47 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x);
48 | return intercept;
49 | }
50 |
/* Deviation of point (x, y) from the line y = slope*x + intercept,
   relative to x; falls back to the absolute deviation when x == 0. */
static double rel_err(double x, double y, double slope, double intercept)
{
    double dev = y - (slope * x + intercept);
    if (dev < 0)
        dev = -dev;
    return (x == 0) ? dev : dev / x;
}
65 |
66 | double ls_error(double *xval, double *yval, int cnt, ls_err_t etype)
67 | {
68 | double slope;
69 | double intercept;
70 | ls_stat_t stat;
71 | int i;
72 | double num, denom;
73 | ls_stats(xval, yval, cnt, &stat);
74 | slope = (cnt * stat.sum_xy - stat.sum_x * stat.sum_y)/
75 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x);
76 | intercept = (stat.sum_xx * stat.sum_y - stat.sum_xy * stat.sum_x)/
77 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x);
78 | num = denom = 0;
79 | for (i = 0; i < cnt; i++) {
80 | double e = rel_err(xval[i], yval[i], slope, intercept);
81 | switch (etype) {
82 | case LS_AVG:
83 | num += e;
84 | denom++;
85 | break;
86 | case LS_MAX:
87 | if (num < e)
88 | num = e;
89 | denom = 1;
90 | break;
91 | default:
92 | fprintf(stderr, "Invalid error type: %d\n", etype);
93 | exit(1);
94 | break;
95 | }
96 | }
97 | return num/denom;
98 | }
99 |
100 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/lsquare.h:
--------------------------------------------------------------------------------
1 | /* Compute least squares fit of set of data points */
2 |
3 | /* Fit is of form y = mx + b. m is slope, b is intercept */
4 | double ls_slope(double *xval, double *yval, int cnt);
5 | double ls_intercept(double *xval, double *yval, int cnt);
6 |
7 | typedef enum {LS_AVG, LS_MAX} ls_err_t;
8 |
9 | /* Determine error (either absolute or average) of least squares fit */
10 | double ls_error(double *xval, double *yval, int cnt, ls_err_t etype);
11 |
12 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/timing/clock.h:
--------------------------------------------------------------------------------
1 | #if __INTEL_COMPILER
2 | /* inline function definitions */
3 | #pragma warning ( disable : 1418 )
4 | #endif
5 |
6 | /* Routines for using cycle counter */
7 |
8 | /* Start the counter */
9 | void start_counter();
10 | void start_counter_copy();
11 |
12 | /* Get # cycles since counter started */
13 | double get_counter();
14 | double get_counter_copy();
15 |
16 |
17 | /* Measure overhead for counter */
18 | double ovhd();
19 |
20 | /* Determine clock rate of processor */
21 | double mhz(int verbose);
22 |
23 | /* Determine clock rate of processor, having more control over accuracy */
24 | double mhz_full(int verbose, int sleeptime);
25 |
26 | /** Special counters that compensate for timer interrupt overhead */
27 |
28 | void start_comp_counter();
29 |
30 | double get_comp_counter();
31 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/reduction/mypapi.h:
--------------------------------------------------------------------------------
1 |
2 |
3 | #if defined(USE_PAPI) // -----------------------------------------------------------
4 | #include <papi.h>
5 |
6 | typedef unsigned long long int uLint;
7 |
8 | #define PAPI_EVENTS_NUM 3
9 | int papi_events[PAPI_EVENTS_NUM] = {PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_DCM };
10 | int papi_EventSet = PAPI_NULL; // the handle for the events' set
11 | uLint papi_buffer[PAPI_EVENTS_NUM] = {0}; // storage for the counters' values
12 | uLint papi_values[PAPI_EVENTS_NUM] = {0}; // accumulate the counters' values
13 |
14 | // check that PAPI is OK, exit if not
15 | #define PAPI_CHECK( R ) { \
16 | if ( (R) != PAPI_OK ) { \
17 | printf("a problem with PAPI (code %d) arise at line %d\n", \
18 | (R), __LINE__);fflush(stdout); return (R); }}
19 |
20 |
21 | // check that PAPI is OK,
22 | // issue a warning if not with a
23 | // provided message
24 | #define PAPI_WARN( R, S ) { \
25 | if ( (R) != PAPI_OK ) { \
26 | printf("a problem with PAPI (code %d) arise at line %d: %s\n", \
27 | (R), __LINE__, (S)); fflush(stdout); }}
28 |
29 | // check that PAPI is OK about an event
30 | // issue a warning if not with a
31 | // provided message
32 | #define PAPI_WARN_EVENT( R, E, S1, n ) { \
33 | if ( (R) != PAPI_OK ) { \
34 | printf("a problem with PAPI (code %d) : event %d arise at line %d: %s (%d)\n", \
35 | (R), (E), __LINE__, (S1), (n)); fflush(stdout); }}
36 |
37 |
38 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \
39 | retval = PAPI_query_event(papi_events[i]); \
40 | if ( retval == PAPI_OK ) { \
41 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \
42 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \
43 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } }
44 |
45 | #define PAPI_INIT { \
46 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \
47 | if (retval != PAPI_VER_CURRENT) \
48 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \
49 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \
50 | PAPI_ADD_EVENTS_to_SET; }
51 |
52 | // to use HIGH-LEVEL API
53 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
54 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
55 |
56 | // to use NORMAL API
57 | #define PAPI_START_CNTR { \
58 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); }
59 |
60 | #define PAPI_STOP_CNTR { \
61 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \
62 | if( retval == PAPI_OK ) { \
63 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
64 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); }
65 |
66 | #define PAPI_FLUSH_BUFFER { \
67 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
68 | papi_buffer[ jj] = 0; }
69 |
70 | #define PAPI_FLUSH { \
71 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
72 | papi_values[jj] = papi_buffer[ jj] = 0; }
73 |
74 |
75 | #else // -----------------------------------------------------------
76 |
77 | #define PAPI_INIT
78 | #define PAPI_START_CNTR
79 | #define PAPI_STOP_CNTR
80 |
81 | #endif // -----------------------------------------------------------
82 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/reduction/plot.gp:
--------------------------------------------------------------------------------
1 | reset
2 | set terminal pngcairo size 1600,1000 dashlength 2 truecolor font "Garamond, 28"
3 | #set terminal qt enhanced size 1200,1000
4 |
5 |
6 | set tics font ",22"
7 | set rmargin screen 0.95
8 | set bmargin screen 0.12
9 |
10 | set xlabel "N" font ",22" offset 0,0.5
11 |
12 | unset yrange
13 | unset xrange
14 |
15 | array OPT[2]
16 | OPT[1] = "O0 "
17 | OPT[2] = "O3 "
18 |
19 | array W[2]
20 | W[1] = 3
21 | W[2] = 1.5
22 |
23 | array DT[2]
24 | DT[1] = "-- __"
25 | DT[2] = 1
26 |
27 | NTYPE = 7
28 | array TYPE[NTYPE]
29 | TYPE[1] = "naive"
30 | TYPE[2] = "UR2x1"
31 | TYPE[3] = "UR2x1g"
32 | TYPE[4] = "UR2x2"
33 | TYPE[5] = "UR4x2g"
34 | TYPE[6] = "UR4x4"
35 | TYPE[7] = "vUR4x4"
36 |
37 |
38 | # ---------------------------------------------
39 | set key inside top left font ",22"
40 | set lmargin screen 0.08
41 | # ---------------------------------------------
42 |
43 | set output "timings.png"
44 | set ylabel "timing (sec)" font ",22" offset 2
45 |
46 |
47 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "timings" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
48 |
49 |
50 | # ---------------------------------------------
51 | set key outside left
52 | set lmargin screen 0.22
53 | # ---------------------------------------------
54 |
55 | set output "timings_per_element.png"
56 | set ylabel "timing per element (nsec)" font ",22" offset 2, -6
57 |
58 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "timings" u 1:(column(i+(L-1)*NTYPE)/$1*1e9) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
59 |
60 | # ---------------------------------------------
61 |
62 | set output "timings_ratio.png"
63 | set ylabel "timings / timings_{naive}" font ",22" offset 2
64 |
65 | ref = 2
66 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "timings" u 1:(column(i+(L-1)*NTYPE)/column(ref)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
67 |
68 | # ---------------------------------------------
69 | set output "CPE.png"
70 | set ylabel "CPE" font ",22" offset 2
71 |
72 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "CPEs" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
73 |
74 |
75 | # ---------------------------------------------
76 |
77 | set output "L1M.png"
78 | set ylabel "Level 1 misses per element" font ",22" offset 2,-5
79 |
80 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "L1Ms" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
81 |
82 |
83 | # ---------------------------------------------
84 |
85 | set output "IPC.png"
86 | set ylabel "IPC" font ",22" offset 2
87 |
88 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "IPCs" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
89 |
90 |
91 |
92 |
93 | set output
94 | reset
95 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/reduction/reduction.h:
--------------------------------------------------------------------------------
1 |
2 | #if defined(_GNU_SOURCE)
3 | #include <sched.h>  /* NOTE(review): bracket-stripped include; <sched.h> is the usual _GNU_SOURCE header in these examples -- confirm against upstream */
4 | #endif
5 |
6 | // ─────────────────────────────────────────────────────────────────
7 | // define the datatype
8 | //
9 | #if !defined(ITYPE)
10 | #warning "compiling with double type"
11 | #define DTYPE double // type of data
12 | #define DATYPE double // type for accumulator
13 | #else
14 | #warning "compiling with int type"
15 | #define DTYPE unsigned int // type of data
16 | #define DATYPE long long unsigned int // type for accumulator
17 | #endif
18 |
19 |
20 |
21 | typedef unsigned long long int uLint;
22 |
23 | //
24 | // ------------------------------------------------------------------
25 |
26 |
27 | #define CONCAT(x,y) x ## y
28 |
29 | // ─────────────────────────────────────────────────────────────────
30 | // define the timing routines
31 | //
32 |
33 | #define CPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), \
34 | (double)ts.tv_sec + \
35 | (double)ts.tv_nsec * 1e-9)
36 |
37 | //
38 | // ------------------------------------------------------------------
39 |
40 |
41 | // ─────────────────────────────────────────────────────────────────
42 | // define the vector generator
43 | //
44 |
45 | #define DEFINE_VECT( T, N, NAME ) typedef T v##NAME __attribute__((vector_size( sizeof(T) * N))); typedef union { v##NAME v; T s[N]; } u##NAME;
46 |
47 |
48 |
49 | // ─────────────────────────────────────────────────────────────────
50 | // define the vector generator
51 | //
52 |
53 | #if defined(__GNUC__) && !defined(__ICC) && !defined(__INTEL_COMPILER)
54 | #define PRAGMA_VECT_LOOP _Pragma("GCC ivdep")
55 | #elif defined(__INTEL_COMPILER) | defined(__ICC)
56 | #define PRAGMA_VECT_LOOP _Pragma("parallel")
57 | #elif defined(__clang__)
58 | #define PRAGMA_VECT_LOOP _Pragma("ivdep")
59 | #else
60 | #define PRAGMA_VECT_LOOP
61 | #endif
62 |
63 | //
64 | // ------------------------------------------------------------------
65 |
66 | // ─────────────────────────────────────────────────────────────────
67 | //
68 | //
69 |
70 |
71 |
72 | // ─────────────────────────────────────────────────────────────────
73 | // define the debug printing routine
74 | //
75 |
76 | #ifdef DEBUG
77 | #define PRINTF(...) printf(__VA_ARGS__)
78 | #define DEBUG_IO 2
79 | #else
80 | #define PRINTF(...)
81 | #endif
82 |
83 |
84 |
85 | DEFINE_VECT( DTYPE, 4, 4d );
86 | DEFINE_VECT( long int, 4, 4i );
87 |
--------------------------------------------------------------------------------
/HPC_TOOLS_and_STORAGE/Readme.md:
--------------------------------------------------------------------------------
1 | # Materials on HPC libraries, tools, storage
2 |
--------------------------------------------------------------------------------
/Materials/A_note_on_Endiansim.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/Materials/A_note_on_Endiansim.pdf
--------------------------------------------------------------------------------
/Materials/Readme.md:
--------------------------------------------------------------------------------
1 | # Sparse materials on various topics
2 |
3 | In this folder we will upload materials of interest
4 |
5 | 1) topics.pdf :: a continuously updated pdf with various topics discussed in the class
6 | 2) What every Computer Scientist should know about floating point :: a good introduction to the IEEE floating point representation
7 |
8 |
--------------------------------------------------------------------------------
/Materials/What_every_computer_scientist_should_know_about_floating-point.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/Materials/What_every_computer_scientist_should_know_about_floating-point.pdf
--------------------------------------------------------------------------------
/Materials/arguments.c:
--------------------------------------------------------------------------------
1 |
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 |
6 |
/* Print the address of the argv array itself, then the address and
   text of every command-line argument it points to. */
int main (int argc, char **argv )
{

    printf("argv is located at address %p and points to %p\n", &argv, argv );

    /* walk the argument vector one entry at a time */
    for (int i = 0; i < argc; i++)
        printf("arguments %d is located at address %p and reads as %s\n", i, argv + i, *(argv + i));

    return 0;
}
21 |
--------------------------------------------------------------------------------
/Materials/topics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/Materials/topics.pdf
--------------------------------------------------------------------------------
/PARALLELISM/Readme.md:
--------------------------------------------------------------------------------
1 | # Section one: INTRODUCTION TO HPC and PARALLEL CONCEPTS
2 |
3 | ## Day 1: introduction to HPC
4 | date: Tuesday 28/09/2023
5 |
6 | ### lectures
7 |
8 | - Stefano Cozzini : [introduction to HPC](lecture01-intro-toHPC.pdf)
9 |
10 |
11 | The lecture above introduces HPC concepts and basic definitions.
12 |
13 | There is plenty of materials on the topic on the web.
14 | Here a few links to start with:
15 |
16 | - [FLOPS definition from wikipedia](https://en.wikipedia.org/wiki/FLOPS)
17 | - [ HPC short introduction from European perspective](https://ec.europa.eu/digital-single-market/en/high-performance-computing)
18 | - [ a must read paper: Reinventing High Performance Computing: Challenges and Opportunities](https://arxiv.org/abs/2203.02544)
19 | - [what can we do with an exascale machine](https://www.hpe.com/us/en/insights/articles/whats-with-the-18-zeros-2009.html)
20 | - [the www.top500.org: it deserves a visit to check a few things](https://www.top500.org)
21 |
22 | Application ( not discussed in lecture)
23 | - [Folding@home project: take a look](https://foldingathome.org/?lng=en)
24 | - [AlphaFold web page](https://alphafold.com/)
25 |
26 | ### Materials for Linux beginners:
27 |
28 | - [one simple tutorial to start using ssh](https://www.ssh.com/ssh/command/)
29 | - [linux/unix shell short tutorial for novice users](http://swcarpentry.github.io/shell-novice/)
30 |
--------------------------------------------------------------------------------
/PARALLELISM/codes/memory.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <assert.h>
5 |
/* Allocate and zero-fill N doubles, storing the new buffer in *vec.
   Aborts via assert() if the allocation fails. */
void callocator(double ** vec, size_t N)
{
    double *buf = calloc(N, sizeof(*buf));
    assert(buf != NULL);
    *vec = buf;
}
11 |
12 | int main(int argc, char **argv)
13 | {
14 | double * v ;
15 | size_t i, j, m;
16 | for (i = 1e3 ; i < 1e8 ; i*=10 ) {
17 | m = sizeof(double) * i ;
18 | callocator(&v, i);
19 | for (j=0; j
16 | #include <stdio.h>
17 | #include <stdlib.h>
18 | #include <time.h>
18 | // if you don't have drand48 uncomment the following two lines
19 | // #define drand48 1.0/RANDMAXrand
20 | // #define srand48 srand
21 | #define seed 68111 // seed for number generator
22 |
23 | int main (int argc, char ** argv) {
24 |
25 | if (argc<2)
26 | {
27 | printf(" Usage: %s number \n",argv[0]);
28 | return 1;
29 | }
30 | long long int N = atoll(argv[1]);
31 | long long int M = 0 ;
32 | double pi = 0;
33 | // point coordinates
34 | double x , y;
35 | clock_t start_time, end_time;
36 | double total_time;
37 | start_time = clock();
38 |
39 | srand48 ( seed ) ; // seed the number generator
40 |
41 | long long int i;
42 | for (i = 0 ; i < N ; i++)
43 | {
44 | // take a point P(x,y) inside the unit square
45 | x = drand48();
46 | y = drand48();
47 |
48 | // check if the point P(x,y) is inside the circle
49 | if ((x*x + y*y)<1)
50 | M++;
51 | }
52 | pi = 4.0*M/N ; // calculate area
53 | end_time=clock();
54 | printf ( "\n # of trials = %llu , estimate of pi is %1.9f \n", N, pi ) ;
55 | total_time= ( (double) (end_time - start_time) )/CLOCKS_PER_SEC ;
56 | printf ( "\n # walltime : %10.8f \n", total_time );
57 | return 0;
58 | }
59 |
60 |
--------------------------------------------------------------------------------
/PARALLELISM/lecture01-intro-toHPC.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture01-intro-toHPC.pdf
--------------------------------------------------------------------------------
/PARALLELISM/lecture02-HPC-hardware.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture02-HPC-hardware.pdf
--------------------------------------------------------------------------------
/PARALLELISM/lecture03-HPCsoftware-stack.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture03-HPCsoftware-stack.pdf
--------------------------------------------------------------------------------
/PARALLELISM/lecture04-on-parallel-programming.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture04-on-parallel-programming.pdf
--------------------------------------------------------------------------------
/PARALLELISM/slurm/README.md:
--------------------------------------------------------------------------------
1 | This folder contains the following files:
2 |
3 | - slurm01.job
4 | A simple example of a batch script for Slurm jobs
5 |
6 | - slurm02_#.job
7 | Three jobs showing how to run job steps within a Slurm job and the differences between allocating tasks and nodes
8 |
9 | - slurm03_#.job
10 | Three jobs showing the importance of specifying walltime and memory requirements
11 |
12 | - slurm04.job
13 | A simple job showing how to: load modules, compile and run an application within a Slurm job
14 |
15 | - slurm05.job
16 | A simple job showing what happens when we load a module
17 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm01.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Name of the job
4 | #SBATCH --job-name=my_first_job
5 |
6 | # Define the number of nodes you need.
7 | #SBATCH --nodes=1
8 |
9 | # Define the number of tasks you need. Use with distributed parallelism
10 | #SBATCH --ntasks=16
11 |
12 | # Eventually, you can further specify the number of tasks per node
13 | #SBATCH --ntasks-per-node=16
14 |
15 | # Define the number of CPUs allocated to each task. Use with shared memory parallelism
16 | #SBATCH --cpus-per-task=2
17 |
18 | # Define how long the job will run in real time. Format is d-hh:mm:ss
19 | # For a 30 seconds job
20 | #SBATCH --time=0-00:00:30
21 |
22 | ## Define the account name, e.g. for the Laboratory of Data Engineering
23 | ##SBATCH -A lade
24 |
25 | # Define the partition on which the job shall run, e.g. EPYC, THIN, GPU, DGX
26 | #SBATCH -p EPYC
27 |
28 | # Define how much memory you need. Choose one between the following
29 | # --mem will define memory per node
30 | # --mem-per-cpu will define memory per CPU/core
31 | #SBATCH --mem-per-cpu=1500MB
32 | ##SBATCH --mem=5GB # this one is not in effect, due to the double hash
33 |
34 | # Specify the output and error files
35 | #SBATCH --output=%x.%j.out
36 | #SBATCH --error=%x.%j.err
37 |
38 | # Eventually, you can turn on mail notification.
39 | # Among the possibilities we can list: NONE, BEGIN, END, FAIL, ALL
40 | ##SBATCH --mail-type=BEGIN,END
41 | ##SBATCH --mail-user=fifo@lifo.com
42 |
43 | # Pick nodes with feature 'foo'. Different clusters have different features available.
44 | # Most of the time you don't need this
45 | ##SBATCH -C foo
46 |
47 | # Restrict the job to run on the node(s) named
48 | ##SBATCH -w epyc008
49 |
50 | #Start the program
51 |
52 | >&2 echo "DIR is ${SLURM_SUBMIT_DIR}"
53 |
54 | srun /bin/hostname
55 | srun sleep 60
56 |
57 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm02_A.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=my_second_job_A
4 | #SBATCH --time=0-00:10:00
5 | #SBATCH -p EPYC
6 | #SBATCH -n3 # 3 tasks
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 | echo Starting job $SLURM_JOB_ID
10 | echo SLURM assigned me these nodes
11 | srun -l hostname
12 |
13 | echo "1)" $(date)
14 | srun -l --exclusive -n2 sleep 60 & # start 2 copies of program 1
15 | echo "2)" $(date)
16 | srun -l --exclusive -n1 sleep 60 & # start 1 copy of program 2
17 | echo "3)" $(date)
18 | wait # wait for all to finish
19 |
20 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm02_B.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=my_second_job_B
4 | #SBATCH --time=0-00:10:00
5 | #SBATCH -p EPYC
6 | #SBATCH -n3 # 3 tasks
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 | echo Starting job $SLURM_JOB_ID
10 | echo SLURM assigned me these nodes
11 | srun -l hostname
12 |
13 | echo "1)" $(date)
14 | srun -l --exclusive -n2 sleep 60 # start 2 copies of program 1
15 | echo "2)" $(date)
16 | srun -l --exclusive -n1 sleep 60 # start 1 copy of program 2
17 | echo "3)" $(date)
18 |
19 |
20 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm02_C.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=my_second_job_C
3 | #SBATCH --time=0-00:10:00
4 | #SBATCH -p EPYC
5 | #SBATCH -n3 # 3 tasks
6 | #SBATCH --output=%x.%j.out
7 | #SBATCH --error=%x.%j.err
8 | #SBATCH -N3 # 3 NODES
9 |
10 | echo Starting job $SLURM_JOB_ID
11 | echo SLURM assigned me these nodes
12 | srun -l hostname
13 | echo "1)" $(date)
14 | srun -l --exclusive -n2 -N2 sleep 60 & # start 2 copies of program 1
15 | echo "2)" $(date)
16 | srun -l --exclusive -n1 -N1 sleep 60 & # start 1 copy of program 2
17 | echo "3)" $(date)
18 | wait # wait for all to finish
19 |
20 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm03_A.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=memory_A # Job name
3 | #SBATCH --ntasks=1 # Run a single task
4 | #SBATCH --mem=70M # Job Memory
5 | #SBATCH --time=00:15:00 # Time limit hrs:min:sec
6 | #SBATCH -p THIN
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 |
10 | pwd; hostname; date
11 | cd ../codes
12 |
13 | gcc memory.c -o memory.x
14 | ./memory.x
15 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm03_B.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=memory_B # Job name
3 | #SBATCH --ntasks=1 # Run a single task
4 | #SBATCH --mem=90M # Job Memory
5 | #SBATCH --time=00:05:00 # Time limit hrs:min:sec
6 | #SBATCH -p THIN
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 |
10 | pwd; hostname; date
11 | cd ../codes
12 |
13 | gcc memory.c -o memory.x
14 | ./memory.x
15 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm03_C.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=memory_C # Job name
3 | #SBATCH --ntasks=1 # Run a single task
4 | #SBATCH --mem=100M # Job Memory
5 | #SBATCH --time=00:00:01 # Time limit hrs:min:sec
6 | #SBATCH -p THIN
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 |
10 | pwd; hostname; date
11 | cd ../codes
12 |
13 | gcc memory.c -o memory.x
14 | ./memory.x
15 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm04.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=compile_and_run_pi
4 | #SBATCH --time=0-00:10:00
5 | #SBATCH -p EPYC
6 | #SBATCH -n1 # 1 tasks
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 | #SBATCH -N1 # 1 NODES
10 | echo Starting job $SLURM_JOB_ID
11 | echo Current dir is ${SLURM_SUBMIT_DIR}
12 |
13 | module purge
14 | module load compiler # For Intel compiler instead of GNU compiler
15 | cd ../codes
16 | echo "Now, I am in $(pwd)"
17 | icx pi.c -O3 -o pi.x
18 | ./pi.x 100000000
19 |
20 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm05.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=modules # Job name
3 | #SBATCH --ntasks=1 # Run a single task
4 | #SBATCH --time=00:05:00 # Time limit hrs:min:sec
5 | #SBATCH -p EPYC
6 | #SBATCH --output=%x.%j.out
7 | #SBATCH --error=%x.%j.err
8 |
9 | module purge
10 | echo "a) "$LD_LIBRARY_PATH
11 | module load openMPI/4.1.5/gnu
12 | echo "b) "$LD_LIBRARY_PATH
13 | module purge
14 | echo "c) "$LD_LIBRARY_PATH
15 | module load openMPI/4.1.5/icx
16 | echo "d) "$LD_LIBRARY_PATH
17 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/Readme.md:
--------------------------------------------------------------------------------
1 | # This folder collects materials on MPI and OpenMP
2 |
3 | ## MPI section
4 |
5 | A collection of materials/references for the MPI lectures
6 |
7 |
8 | ### lectures (all by S.Cozzini)
9 |
10 | - lecture 5a: [MPI programming partA ](lecture05-MPI-Programming-A.pdf)
11 | - lecture 5b: [MPI programming partB ](lecture05-MPI-Programming-B.pdf)
12 |
13 |
14 | ### Main references for MPI lectures:
15 |
16 | - chapter 9 of reference 4 is a nice and detailed introduction to MPI.
17 | - exercises and tutorials on MPI are present all over the web. Here a couple of examples:
18 | - [Here a very good starting point](https://www.mcs.anl.gov/research/projects/mpi/tutorial/index.html)
19 | - [Another simple tutorial](https://mpitutorial.com/tutorials/)
20 | - [A virtual course where I took a lot of materials, including some exercises](https://cvw.cac.cornell.edu/MPIP2P/)
21 |
22 | ### tutorials (contributed by Niccolo Tosato and Marco Celoria)
23 | - tutorial 1: [compiling and running MPI program on ORFEO (prepared by N.Tosato)](compiling-and-running-mpi-programs.md)
24 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/Brecv.c:
--------------------------------------------------------------------------------
1 | // taken from https://cvw.cac.cornell.edu/MPIP2P/brecv
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <time.h>
5 | #include "mpi.h"
6 | #define TAG 100
7 |
8 | void print_time(double tbegin, double tend);
9 | int new_sleep(int);
10 | int SLEEP(clock_t);
11 |
/* -------------------------------------------------------------------
 * helper to calculate elapsed time and print results
 * -------------------------------------------------------------------
 */
void print_time(double tbegin, double tend)
{
    /* convert the wall-clock interval (seconds) to whole microseconds */
    int usec = (int)((tend - tbegin) * 1000000.0);
    printf(" Elapsed time for send = %8d uSec\n", usec);
}
22 |
/* -----------------------------------------------------------
 * helpers to sleep program
 * -----------------------------------------------------------
 */
/* Busy-wait until clock() has advanced by wait*1000 ticks.
   (With CLOCKS_PER_SEC == 1000000 that is ~wait milliseconds of
   CPU time -- TODO confirm on the target platform.) */
int SLEEP(clock_t wait)
{
    clock_t deadline = clock() + wait * 1000;
    while (clock() < deadline)
        ;
    return 0;
}
36 |
/* Thin wrapper over the busy-wait helper; always reports success. */
int new_sleep(int amount)
{
    SLEEP(amount);
    return 0;
}
42 |
43 | /* -----------------------------------------------------------
44 | * Main Program
45 | * -----------------------------------------------------------
46 | */
47 | int main(int argc, char **argv)
48 | {
49 | float *message; /* message buffer */
50 | int rank, /* rank of task in communicator */
51 | size, i;
52 | int mlen; /* dimension of the message */
53 | MPI_Status status; /* status of communication */
54 | double tbegin, tend; /* used to measure elapsed time */
55 |
56 | if (argc != 2) {
57 | printf(" Usage: blocksends \n");
58 | return -1;
59 | }
60 |
61 | /* -------------------------------------------------------------------
62 | * do initial housekeeping: allocate memory for messages,
63 | * initialize program with MPI, define message tags
64 | * ------------------------------------------------------------------
65 | */
66 |
67 | mlen = atoi(argv[1]);
68 | message = (float *)malloc(mlen * sizeof(float));
69 |
70 |
71 | MPI_Init(&argc, &argv);
72 | MPI_Comm_size(MPI_COMM_WORLD, &size);
73 | MPI_Comm_rank( MPI_COMM_WORLD, &rank );
74 | if(size != 2) {
75 | printf("This application is meant to be run with 2 processes.\n");
76 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
77 | }
78 | printf(" Process %d initialized\n", rank);
79 | printf(" Message size = %6d floats\n", mlen);
80 | printf(" Total size = %6lu bytes\n", (mlen* sizeof(float)));
81 |
82 | /* -----------------------------------------------------------------
83 | * task 0 will report the elapsed time for a blocking send
84 | * -----------------------------------------------------------------
85 | */
86 | if (rank == 0) {
87 | for (i = 0; i < mlen; i++) message[i] = 100;
88 | printf(" Task %d sending message\n", rank);
89 | MPI_Barrier(MPI_COMM_WORLD);
90 | tbegin = MPI_Wtime();
91 | MPI_Send(message, mlen, MPI_FLOAT, 1, TAG, MPI_COMM_WORLD);
92 | tend = MPI_Wtime();
93 | print_time(tbegin, tend);
94 | }
95 |
96 | /* -----------------------------------------------------------------
97 | * task 1 sleeps for 1 second, and then calls a blocking receive.
98 | * the sleep is intended to simulate time spent in useful computation
99 | * -----------------------------------------------------------------
100 | */
101 | else if (rank == 1) {
102 | for (i = 0; i < mlen; i++) message[i] = -100;
103 | MPI_Barrier(MPI_COMM_WORLD);
104 | new_sleep(1);
105 | MPI_Recv(message, mlen, MPI_FLOAT, 0, TAG, MPI_COMM_WORLD, &status );
106 | printf(" Task %d received message\n", rank);
107 | }
108 | MPI_Finalize();
109 | return 0;
110 | }
111 |
112 |
113 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/clean.sh:
--------------------------------------------------------------------------------
1 | rm *.x
2 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/compile_openMPI_gnu.sh:
--------------------------------------------------------------------------------
# Build all basic MPI examples with the GNU OpenMPI toolchain.
# Purge first so stale modules cannot shadow the requested toolchain
# (the Intel twin script already does this — kept consistent here).
module purge
module load openMPI/4.1.5/gnu

mpicc Brecv.c -g3 -o Brecv.x
mpicc CBlockSends.c -g3 -o CBlockSends.x
mpicc deadlock.c -g3 -o deadlock.x
mpicc linear-array.c -g3 -o linear-array.x
mpicc mpi_env_call.c -g3 -o mpi_env_call.x
mpicc mpi_hello_world.c -g3 -o mpi_hello_world.x
mpicc mpi_hello_world_sync.c -g3 -o mpi_hello_world_sync.x
mpif90 mpi_hello_world.F90 -g3 -o mpi_hello_world_F.x
mpicc mpi_pi.c -O3 -g3 -o mpi_pi.x
mpif90 send_message.F90 -g3 -o send_message_F.x
mpicc send_message.c -g3 -o send_message.x
mpicc sendrecv_message.c -g3 -o sendrecv_message.x
15 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/compile_openMPI_intel.sh:
--------------------------------------------------------------------------------
# Build all basic MPI examples with the Intel (icx) OpenMPI toolchain.
module purge
module load openMPI/4.1.5/icx

# Plain C examples, all compiled with the same debug flags.
for src in Brecv CBlockSends deadlock linear-array mpi_env_call \
           mpi_hello_world mpi_hello_world_sync ; do
    mpicc ${src}.c -g3 -o ${src}.x
done

mpif90 mpi_hello_world.F90 -g3 -o mpi_hello_world_F.x
mpicc mpi_pi.c -O3 -g3 -o mpi_pi.x
mpif90 send_message.F90 -g3 -o send_message_F.x
mpicc send_message.c -g3 -o send_message.x
mpicc sendrecv_message.c -g3 -o sendrecv_message.x
16 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/deadlock.c:
--------------------------------------------------------------------------------
1 | // A simple program with a deadloack inside.
2 | // Taken and adapted from somewhere on the net
3 | #include <stdio.h>
4 | #include "mpi.h"
5 | #include <stdlib.h>
6 |
7 | int main(int argc, char *argv[])
8 | {
9 | #define MSGLEN 1024
10 | int ITAG_A = 100, ITAG_B = 200;
11 | int irank, i, isize, idest, isrc, istag, iretag;
12 | float rmsg1[MSGLEN];
13 | float rmsg2[MSGLEN];
14 | MPI_Status recv_status;
15 |
16 | MPI_Init(&argc, &argv);
17 | MPI_Comm_rank(MPI_COMM_WORLD, &irank);
18 | MPI_Comm_size(MPI_COMM_WORLD, &isize);
19 |
20 | if(isize != 2) {
21 | printf("This application is meant to be run with 2 processes.\n");
22 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
23 | }
24 |
25 | printf("I am rank %d of %d \n", irank, isize );
26 | // load an array of float numbers as message
27 | for (i = 1; i <= MSGLEN; i++) {
28 | rmsg1[i] = 100;
29 | rmsg2[i] = -100;
30 | }
31 | if (irank == 0) {
32 | idest = 1;
33 | isrc = 1;
34 | istag = ITAG_A;
35 | iretag = ITAG_B;
36 | }
37 | else if (irank == 1) {
38 | idest = 0;
39 | isrc = 0;
40 | istag = ITAG_B;
41 | iretag = ITAG_A;
42 | }
43 |
44 | printf("Task %d sends the message with tag %d of length %lu \n",
45 | irank, istag, MSGLEN * sizeof(float));
46 |
47 | printf("Task %d receives message with tag %d of length %lu \n",
48 | irank, iretag, MSGLEN * sizeof(float));
49 |
50 | MPI_Barrier(MPI_COMM_WORLD);
51 |
52 | MPI_Send(&rmsg1, MSGLEN, MPI_FLOAT, idest, istag, MPI_COMM_WORLD);
53 | MPI_Recv(&rmsg2, MSGLEN, MPI_FLOAT, isrc, iretag, MPI_COMM_WORLD, &recv_status);
54 | printf("Task %d has received the message\n", irank);
55 | MPI_Finalize();
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/linear-array.c:
--------------------------------------------------------------------------------
1 | // A simple 1-D example:
2 | // each element receive the rank of the previous one, add its rank and send forward.
3 | // taken somewhere on net and adapted.
4 | // Final SUM is the sum of n-1 integers.
5 |
6 | #include <stdio.h>
7 | #include "mpi.h"
8 |
9 | int main(int argc,char *argv[])
10 | {
11 | int MyRank, Numprocs;
12 | int value, sum = 0;
13 | int Source, Source_tag;
14 | int Destination, Destination_tag;
15 | int Root = 0;
16 | MPI_Status status;
17 |
18 | MPI_Init(&argc,&argv);
19 | MPI_Comm_size(MPI_COMM_WORLD, &Numprocs);
20 | MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
21 |
22 | if (MyRank == Root) {
23 | Destination = MyRank + 1;
24 | Destination_tag = 0;
25 | MPI_Send(&MyRank, 1, MPI_INT, Destination, Destination_tag, MPI_COMM_WORLD);
26 | }
27 | else {
28 | if (MyRank
5 |
6 | int main(int argc, char *argv[]) {
7 | int numtasks, rank, len, rc;
8 | char hostname[MPI_MAX_PROCESSOR_NAME];
9 |
10 | // initialize MPI
11 | MPI_Init(&argc,&argv);
12 |
13 | // get number of tasks
14 | MPI_Comm_size(MPI_COMM_WORLD,&numtasks);
15 |
16 | // get my rank
17 | MPI_Comm_rank(MPI_COMM_WORLD,&rank);
18 |
19 | // this one is obvious
20 | MPI_Get_processor_name(hostname, &len);
21 | printf ("Number of tasks= %d. My rank= %d. Running on %s\n", numtasks, rank, hostname);
22 |
23 | // done with MPI
24 | MPI_Finalize();
25 | }
26 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_hello_world.F90:
--------------------------------------------------------------------------------
! Every MPI rank prints a hello-world line with the node name, its rank
! and the communicator size.
PROGRAM hello
INCLUDE 'mpif.h'
INTEGER err, rank, size, name_len
CHARACTER(MPI_MAX_PROCESSOR_NAME) processor_name
CALL MPI_INIT(err)
CALL MPI_COMM_RANK(MPI_COMM_WORLD,rank,err)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD,size,err)
CALL MPI_GET_PROCESSOR_NAME(processor_name,name_len,err)
print *, 'Hello world from processor ', processor_name, ' rank ', rank, ' out of ', size, ' processors'
CALL MPI_FINALIZE(err)
END
12 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_hello_world.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 |
4 | int main(int argc, char** argv) {
5 | // Initialize the MPI environment
6 | MPI_Init(NULL, NULL);
7 |
8 | // Get the number of processes
9 | int world_size;
10 | MPI_Comm_size(MPI_COMM_WORLD, &world_size);
11 |
12 | // Get the rank of the process
13 | int world_rank;
14 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
15 |
16 | // Get the name of the processor
17 | char processor_name[MPI_MAX_PROCESSOR_NAME];
18 | int name_len;
19 | MPI_Get_processor_name(processor_name, &name_len);
20 |
21 | fprintf(stdout, "Hello world from processor %s, rank %d out of %d processors\n",
22 | processor_name, world_rank, world_size);
23 | // Finalize the MPI environment.
24 | MPI_Finalize();
25 | }
26 |
27 |
28 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_hello_world_sync.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 |
4 | int main(int argc, char** argv) {
5 | // Initialize the MPI environment
6 | MPI_Init(NULL, NULL);
7 |
8 | // Get the number of processes
9 | int world_size;
10 | MPI_Comm_size(MPI_COMM_WORLD, &world_size);
11 |
12 | // Get the rank of the process
13 | int world_rank;
14 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
15 |
16 | // Get the name of the processor
17 | char processor_name[MPI_MAX_PROCESSOR_NAME];
18 | int name_len;
19 | MPI_Get_processor_name(processor_name, &name_len);
20 |
21 | // Print off a hello world message
22 | for (int i=0; i
11 | #include <stdio.h>
12 | #include <stdlib.h>
13 | #include <math.h>
14 | #include <time.h>
15 | #include <mpi.h>
16 | #define USE MPI
17 | #define SEED 35791246
18 |
19 | int main (int argc , char *argv[])
20 | {
21 | // coordinates
22 | double x, y;
23 |
24 | // number of points inside the circle
25 | long long int M, local_M;
26 | double pi;
27 |
28 | // times
29 | double start_time, comp_time, end_time, wall_time, avg_walltime, max_walltime;
30 | int myid, numprocs, proc;
31 | MPI_Status status;
32 | MPI_Request request;
33 | // master process
34 | int master = 0;
35 | int tag = 123;
36 |
37 | MPI_Init(&argc, &argv);
38 | MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
39 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
40 | fprintf (stdout, "I am %d\n", myid);
41 | if (argc <=1 ) {
42 | fprintf(stderr, "Usage : mpi -np n %s number_of_iterations \n", argv[0]);
43 | MPI_Finalize();
44 | exit(-1);
45 | }
46 |
47 | long long int N = atoll(argv[1])/numprocs;
48 | // take time of processors after initial I/O operation
49 | start_time = MPI_Wtime();
50 |
51 | // initialize random numbers
52 | srand48(SEED * (myid + 1)); // seed the number generator
53 | local_M = 0;
54 | long long int i;
55 | for (i = 0; i < N ; i++) {
56 | // take a point P(x,y) inside the unit square
57 | x = drand48();
58 | y = drand48();
59 | // check if the point P(x,y) is inside the circle
60 | if ( (x*x + y*y) < 1)
61 | local_M++;
62 | }
63 | // take time of processors after initial I/O operation
64 | MPI_Barrier(MPI_COMM_WORLD);
65 | comp_time=MPI_Wtime();
66 |
67 | if (myid == 0) { //if I am the master process gather results from others
68 | M = local_M;
69 | for (proc = 1; proc < numprocs; proc++) {
70 | MPI_Recv(&local_M, 1, MPI_LONG_LONG, proc, tag, MPI_COMM_WORLD, &status);
71 | M += local_M;
72 | }
73 | pi = 4.0 * M / (N * numprocs);
74 | end_time = MPI_Wtime();
75 | }
76 | else { // for all the slave processes send results to the master /
77 | MPI_Ssend(&local_M, 1,MPI_LONG_LONG, master, tag, MPI_COMM_WORLD);
78 | end_time=MPI_Wtime();
79 | }
80 |
81 | wall_time = end_time - start_time;
82 | MPI_Reduce(&wall_time, &avg_walltime, 1, MPI_DOUBLE, MPI_SUM, master, MPI_COMM_WORLD);
83 | avg_walltime = avg_walltime / numprocs;
84 | MPI_Reduce(&wall_time, &max_walltime, 1, MPI_DOUBLE, MPI_MAX, master, MPI_COMM_WORLD);
85 |
86 | fprintf(stdout, "\n# walltime on processor %i : %10.8f\n", myid, wall_time);
87 | fprintf(stdout, "\n# walltime after computation on processor %i : %10.8f\n", myid, comp_time - start_time);
88 | fprintf(stdout, "\n# walltime for communication on processor %i : %10.8f\n", myid, end_time - comp_time);
89 | fflush(stdout);
90 | if (myid ==0) {
91 | printf ( "\n# of trials = %llu , estimate of pi is %1.9f\n", N * numprocs, pi);
92 | fprintf(stdout, "\n[*] Average Walltime: %10.8f\n", avg_walltime);
93 | fprintf(stdout, "\n(*) Max Walltime: %10.8f\n", max_walltime);
94 | fflush(stdout);
95 | }
96 | MPI_Finalize() ; // let MPI finish up /
97 |
98 | }
99 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_pi.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=pi_epyc
4 | #SBATCH --time=0-00:10:00
5 | #SBATCH -p EPYC
6 | #SBATCH -n128
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 | #SBATCH -N1 # 1 NODES
10 | echo Starting job $SLURM_JOB_ID
11 | echo Current dir is ${SLURM_SUBMIT_DIR}
12 |
13 | module purge
14 | module load compiler
15 | module load intelMPI/2021.7.1
16 | mpiicc -cc=icx mpi_hello_world.c -g3 -o mpi_hello_world.x
17 | mpiicc -cc=icx mpi_hello_world_sync.c -g3 -o mpi_hello_world_sync.x
18 | mpiifort mpi_hello_world.F90 -g3 -o mpi_hello_world_F.x
19 | mpiicc -cc=icx mpi_pi.c -O3 -g3 -o mpi_pi.x
20 |
21 | mpirun -np 12 ./mpi_pi.x 10000
22 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/send_message.F90:
--------------------------------------------------------------------------------
1 |
Program MPI
! a simple implementation of send/receive message:
! rank 1 sends the integer 33 (tag 10) to rank 0, which checks it.
Implicit None
!
Include 'mpif.h'
!
Integer :: rank
Integer :: buffer
Integer, Dimension( 1:MPI_status_size ) :: status
Integer :: error
!
Call MPI_init( error )
Call MPI_comm_rank( MPI_comm_world, rank, error )
!
! Rank 0 blocks until rank 1's message arrives, then verifies it.
If( rank == 0 ) Then
Call MPI_recv( buffer, 1, MPI_integer, 1, 10, &
MPI_comm_world, status, error )
Print*, 'Rank ', rank, ' buffer=', buffer
If( buffer /= 33 ) Print*, 'fail'
End If
!
! Rank 1 sends the test value.
If( rank == 1 ) Then
buffer = 33
Call MPI_send( buffer, 1, MPI_integer, 0, 10, &
MPI_comm_world, error )
End If
!
Call MPI_finalize( error )
End Program MPI
31 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/send_message.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | int main(int argc, char** argv) {
6 | MPI_Init(&argc, &argv);
7 | int rank, size;
8 | int buffer;
9 | MPI_Status status;
10 | MPI_Comm_size(MPI_COMM_WORLD, &size);
11 | MPI_Comm_rank(MPI_COMM_WORLD, &rank);
12 | if(size != 2) {
13 | printf("This application is meant to be run with 2 processes.\n");
14 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
15 | }
16 | if (rank == 0){
17 | // int MPI_Recv(void* buffer, int count, MPI_Datatype datatype,
18 | // int sender, int tag, MPI_Comm communicator, MPI_Status* status);
19 | MPI_Recv(&buffer, 1, MPI_INT, 1, 9, MPI_COMM_WORLD, &status);
20 | fprintf(stdout, "Rank %d: buffer = %d \n", rank, buffer);
21 | if (buffer != 33) fprintf(stderr, "Fail\n");
22 | }
23 | if (rank == 1) {
24 | buffer = 33;
25 | // int MPI_Send(const void* buffer, int count, MPI_Datatype datatype,
26 | // int recipient, int tag, MPI_Comm communicator);
27 | MPI_Send(&buffer, 1, MPI_INT, 0, 9, MPI_COMM_WORLD);
28 | }
29 | MPI_Finalize();
30 | return EXIT_SUCCESS;
31 | }
32 |
33 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/sendrecv_message.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | int main(int argc, char** argv) {
6 | MPI_Init(&argc, &argv);
7 | int rank, size;
8 | int buffer;
9 | char message[2][16];
10 | MPI_Status status;
11 | MPI_Comm_size(MPI_COMM_WORLD, &size);
12 | MPI_Comm_rank(MPI_COMM_WORLD, &rank);
13 | if(size != 2) {
14 | printf("This application is meant to be run with 2 processes.\n");
15 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
16 | }
17 | if (rank == 0){
18 | strcpy(message[0], "skew");
19 | strcpy(message[1], "squeue");
20 | // int MPI_Sendrecv(const void* buffer_send, int count_send,
21 | // MPI_Datatype datatype_send, int recipient, int tag_send,
22 | // void* buffer_recv, int count_recv,
23 | // MPI_Datatype datatype_recv, int sender, int tag_recv,
24 | // MPI_Comm communicator, MPI_Status* status);
25 | MPI_Sendrecv(message, 32, MPI_CHAR, 1, 10,
26 | &buffer, 1, MPI_INT, 1, 9,
27 | MPI_COMM_WORLD, &status);
28 | fprintf(stdout, "Rank %d: buffer = %d \n", rank, buffer);
29 | if (buffer != 33) fprintf(stderr, "Fail\n");
30 | }
31 | if (rank == 1) {
32 | buffer = 33;
33 | MPI_Sendrecv(&buffer, 1, MPI_INT, 0, 9,
34 | message, 32, MPI_CHAR, 0, 10,
35 | MPI_COMM_WORLD, &status);
36 |
37 | fprintf(stdout, "Rank %d: message[0] = %s, message[1] = %s \n",
38 | rank, message[0], message[1]);
39 | }
40 | MPI_Finalize();
41 | return EXIT_SUCCESS;
42 | }
43 |
44 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/allgather.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | #SBATCH --job-name=allgather-example
4 | #SBATCH -p GPU
5 | #SBATCH --nodes=2
6 | #SBATCH --ntasks-per-node=1
7 | ##SBATCH --gres=gpu:2
8 | #SBATCH --time=0:10:00
9 | #SBATCH -o allgather.%A.out
10 | #SBATCH -e allgather.%A.error
11 | ##SBATCH -A lade
12 | #SBATCH --wait-all-nodes=1
13 | #SBATCH --cpus-per-task=16
14 | #SBATCH --mem=10G
15 | ##SBATCH -w dgx002
16 | CURRENT_DIR=${SLURM_SUBMIT_DIR}
17 | head_node=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
18 | head_node_ip=$( srun --nodes=1 --ntasks=1 -w "$head_node" --exclusive hostname --ip-address)
19 | echo "head_node=" ${head_node} " - head_node_ip=" $head_node_ip
20 | #export LOGLEVEL=INFO
21 | #export NCCL_DEBUG=INFO
22 | export OMP_NUM_THREADS=16
23 | cd ../..
24 | source myenv_v100/bin/activate
25 | cd -
26 | echo $(pwd)
27 | echo ${CUDA_VISIBLE_DEVICES}
28 |
29 | srun -l torchrun \
30 | --nnodes 2 \
31 | --nproc_per_node 2 \
32 | --rdzv_id $RANDOM \
33 | --rdzv_backend c10d \
34 | --rdzv_endpoint $head_node_ip:29500 \
35 | allgather.py
36 |
37 |
38 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/allgather.py:
--------------------------------------------------------------------------------
1 | import torch.distributed as dist
2 | import torch.multiprocessing as mp
3 | import torch
4 | import os
5 |
6 | # salloc -N1 -n1 -c64 -A lade -p DGX --gpus-per-node=4 --time=1:59:00
7 | # srun python tmp.py
def ddp_setup():
    """Join the NCCL process group and pin this process to its local GPU."""
    dist.init_process_group(backend="nccl")
    device_index = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(device_index)
11 |
12 | #def ddp_setup(rank: int, world_size: int):
13 | # """
14 | # Args:
15 | # rank: Unique identifier of each process
16 | # world_size: Total number of processes
17 | # """
18 | # os.environ["MASTER_ADDR"] = "localhost"
19 | # os.environ["MASTER_PORT"] = "12355"
20 | # dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
21 | # torch.cuda.set_device(rank)
22 |
def run():
    """Variable-length all-gather demo.

    Each rank builds an int tensor of random length filled with its own
    rank id, then every rank reconstructs the concatenation of all ranks'
    tensors via two all_gather rounds: lengths first, then the payloads.
    """
    local_rank = int(os.environ["LOCAL_RANK"])
    global_rank = int(os.environ["RANK"])
    world_size = dist.get_world_size()
    # Seed by rank so each process draws a different (but reproducible) length.
    torch.manual_seed(global_rank)
    # NOTE(review): high=10 allows n == 0, i.e. an empty contribution — confirm intended.
    n = torch.randint(high=10, size=(1,), dtype=int).to(local_rank)
    a = torch.tensor([global_rank] * n, dtype=int).to(local_rank)
    # Rank-ordered printing of the local tensor.
    for p in range(world_size):
        if global_rank==p:
            print(f"A) {global_rank}: {a}", flush=True)
        dist.barrier()
    # Round 1: every rank learns every other rank's element count.
    nelements_list = [torch.zeros_like(n).to(local_rank) for _ in range(world_size)]
    dist.all_gather(tensor = n, tensor_list = nelements_list)
    # Round 2: receive buffers sized per-rank from the counts gathered above.
    gather_list = [torch.zeros(int(nelements_list[i]), dtype=int).to(local_rank) for i in range(world_size)]
    dist.all_gather(tensor = a, tensor_list = gather_list)
    res = torch.cat((gather_list))
    # Rank-ordered printing of the reconstructed global tensor.
    for p in range(world_size):
        if global_rank==p:
            print(f"B) {global_rank}: {res}", flush=True)
        dist.barrier()
43 |
def main():
    # Initialise the process group, run the demo, then shut down cleanly.
    ddp_setup()
    run()
    dist.destroy_process_group()
48 |
if __name__ == "__main__":
    # world_size = torch.cuda.device_count()
    # mp.spawn(main, args=(world_size,), nprocs=world_size)
    # Under torchrun the RANK/LOCAL_RANK environment variables are already
    # set, so main() is called directly instead of mp.spawn.
    main()
53 |
54 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/allgatherv.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | #include <math.h>
6 | #include <time.h>
7 | #include <assert.h>
8 | #define SEED 35791246
9 |
10 | int main(int argc, char** argv) {
11 |
12 | int myid, nproc;
13 | MPI_Init(NULL, NULL);
14 | MPI_Comm_size(MPI_COMM_WORLD, &nproc);
15 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
16 | srand(SEED*(myid+1)) ; // seed the number generator
17 | int numel = 1 + (rand() % 9);
18 | int totel;
19 |
20 | int counts_recv[nproc];
21 | int displacements[nproc];
22 | MPI_Allgather(&numel, 1, MPI_INT, counts_recv, 1, MPI_INT, MPI_COMM_WORLD);
23 | displacements[0] = 0 ;
24 | for (int i = 1; i < nproc ; i++){
25 | displacements[i] = displacements[i-1] + counts_recv[i-1];
26 | }
27 |
28 | double * a = (double*)malloc(sizeof(double) * numel);
29 | assert(a != NULL);
30 |
31 | for (int i=0; i < numel; i++) {
32 | a[i]=myid;
33 | }
34 |
35 | for (int i = 0; i < nproc; i++) {
36 | if (i == myid) {
37 | fprintf(stdout, "BEFORE\tmyid = %d\n", myid );
38 | for (int n = 0 ; n < numel; n++)
39 | fprintf(stdout, "\ta[%d]=%.1f\n", n, a[n]);
40 | fprintf(stdout, "\n");
41 | fflush(stdout);
42 | }
43 | MPI_Barrier(MPI_COMM_WORLD);
44 | }
45 |
46 | MPI_Allreduce(&numel, &totel, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
47 | double * b = (double*)malloc(sizeof(double) * totel);
48 | assert(b != NULL);
49 |
50 | MPI_Allgatherv(a, numel, MPI_DOUBLE, b, counts_recv, displacements, MPI_DOUBLE, MPI_COMM_WORLD);
51 | for (int i = 0; i < nproc; i++) {
52 | if (i == myid) {
53 | fprintf(stdout, "AFTER\tmyid = %d\n", myid );
54 | for (int n = 0 ; n < totel; n++)
55 | fprintf(stdout, "\tb[%d]=%.1f\n", n, b[n]);
56 | fprintf(stdout, "\n");
57 | fflush(stdout);
58 | }
59 | MPI_Barrier(MPI_COMM_WORLD);
60 | }
61 | free(a);
62 | free(b);
63 | MPI_Finalize();
64 | }
65 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/b_cast.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <assert.h>
5 |
6 | int main(int argc, char** argv) {
7 |
8 | int num_elements = 2;
9 | int myid, nproc, root;
10 | MPI_Init(NULL, NULL);
11 | MPI_Comm_size(MPI_COMM_WORLD, &nproc);
12 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
13 | double * a = (double*)malloc(sizeof(double) * num_elements);
14 | assert(a != NULL);
15 | for (int i=0; i < num_elements; i++) {
16 | a[i]=0.;
17 | }
18 | root = 0;
19 | if (myid == root) {
20 | for (int i = 0 ; i < num_elements; i++)
21 | a[i] = 2. * (i + 1.);
22 | }
23 | for (int i = 0; i < nproc; i++) {
24 | if (i == myid) {
25 | fprintf(stdout, "%d\tbefore:", myid );
26 | for (int n = 0 ; n < num_elements; n++)
27 | fprintf(stdout, "\ta[%d]=%.2f ", n, a[n]);
28 | fprintf(stdout, "\n");
29 | fflush(stdout);
30 | }
31 | MPI_Barrier(MPI_COMM_WORLD);
32 | }
33 | // int MPI_Bcast(void* buffer, int count, MPI_Datatype datatype, int emitter_rank, MPI_Comm communicator);
34 | MPI_Bcast(a, num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);
35 | for (int i = 0; i < nproc; i++) {
36 | if (i == myid) {
37 | fprintf(stdout, "%d\tafter:", myid );
38 | for (int n = 0 ; n < num_elements; n++)
39 | fprintf(stdout, "\ta[%d]=%.2f ", n, a[n]);
40 | fprintf(stdout, "\n");
41 | fflush(stdout);
42 | }
43 | MPI_Barrier(MPI_COMM_WORLD);
44 | }
45 | free(a);
46 | MPI_Finalize();
47 | }
48 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/b_cast.f:
--------------------------------------------------------------------------------
1 | PROGRAM broad_cast
2 | INCLUDE 'mpif.h'
3 | INTEGER ierr, myid, nproc, root
4 | INTEGER status(MPI_STATUS_SIZE)
5 | REAL A(2)
6 | CALL MPI_INIT(ierr)
7 | CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
8 | CALL MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
9 | root = 0
10 | a(1)=0.0
11 | A(2)=0.0
12 | IF( myid .EQ. 0 ) THEN
13 | a(1) = 2.0
14 | a(2) = 4.0
15 | END IF
16 | WRITE(6,*) myid, ' before: a(1)=', a(1), 'a(2)=', a(2)
17 | CALL MPI_BARRIER()
18 | CALL MPI_BCAST(a, 2, MPI_REAL, 0, MPI_COMM_WORLD, ierr)
19 | WRITE(6,*) myid, ' after : a(1)=', a(1), 'a(2)=', a(2)
20 | CALL MPI_FINALIZE(ierr)
21 | END
22 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/clean.sh:
--------------------------------------------------------------------------------
# Remove the compiled executables.
# -f keeps the script silent and exiting 0 when no *.x files exist
# (the original errored out on an already-clean directory).
rm -f *.x
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/compile.sh:
--------------------------------------------------------------------------------
# Build the collective-MPI examples (C and Fortran) with the GNU toolchain.
module purge
module load openMPI/4.1.5/gnu

# C examples
mpicc scatter.c -o scatter_c.x
mpicc gather.c -o gather_c.x
mpicc b_cast.c -o b_cast_c.x
mpicc reduce.c -o reduce_c.x
mpicc mpi_bcastcompare.c -o mpi_bcastcompare.x
mpicc allgatherv.c -o allgatherv.x
mpicc all2allv3d.c -o all2allv3d.x

# Fortran examples (removed the accidentally duplicated scatter.f line)
mpifort scatter.f -o scatter_f.x
mpifort gather.f -o gather_f.x
mpifort b_cast.f -o b_cast_f.x
mpifort reduce.f -o reduce_f.x
16 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/gather.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <assert.h>
5 |
6 | int main(int argc, char** argv) {
7 |
8 | int myid, nproc, root;
9 | int num_elements = 8;
10 | int nsnd = 2;
11 | double *a;
12 | double *b;
13 | a = (double*)malloc(sizeof(double) * num_elements);
14 | b = (double*)malloc(sizeof(double) * nsnd);
15 | assert(a != NULL);
16 | assert(b != NULL);
17 | MPI_Init(NULL, NULL);
18 | MPI_Comm_size(MPI_COMM_WORLD, &nproc);
19 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
20 | int gat_elements = nsnd * nproc;
21 | root=0;
22 | if(num_elements < gat_elements && myid == root) {
23 | printf("This application is meant to be run with no more than %d MPI processes.\n", num_elements/nsnd);
24 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
25 | }
26 | for (int i = 0; i < nsnd; i++)
27 | b[i] = myid;
28 | // int MPI_Gather(const void* buffer_send, int count_send, MPI_Datatype datatype_send,
29 | // void* buffer_recv, int count_recv, MPI_Datatype datatype_recv,
30 | // int root, MPI_Comm communicator);
31 | MPI_Gather(b, nsnd, MPI_DOUBLE, a, nsnd, MPI_DOUBLE, root, MPI_COMM_WORLD);
32 | if (myid==root) {
33 | fprintf(stdout, "myid=%d:\n", myid);
34 | for (int i = 0; i < gat_elements; i++)
35 | fprintf(stdout, "\ta[%d]=%.2f\n", i, a[i]);
36 | fprintf(stdout, "\n");
37 | for (int i = gat_elements; i < num_elements; i++)
38 | fprintf(stdout, "\t\ta[%d]=%.2f\n", i, a[i]);
39 | }
40 | free(a);
41 | free(b);
42 | MPI_Finalize();
43 | }
44 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/gather.f:
--------------------------------------------------------------------------------
! Gathers 2 reals from every rank into the root's array A, in rank order.
PROGRAM gather
INCLUDE 'mpif.h'
INTEGER ierr, myid, nproc, nsnd, I, root
INTEGER status(MPI_STATUS_SIZE)
REAL A(16), B(2)
CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
root = 0
! Each rank contributes nsnd copies of its own rank id.
b(1) = REAL( myid )
b(2) = REAL( myid )
nsnd = 2
CALL MPI_GATHER(b, nsnd, MPI_REAL, a, nsnd,
& MPI_REAL, root, MPI_COMM_WORLD, ierr)
! Only the root holds the gathered data.
IF( myid .eq. root ) THEN
DO i = 1, (nsnd*nproc)
WRITE(6,*) myid, ': a(i)=', a(i)
END DO
END IF
CALL MPI_FINALIZE(ierr)
END
22 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/mpi_bcastcompare.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <assert.h>
5 |
6 | void my_bcast(void* data, int count, MPI_Datatype datatype, int root,
7 | MPI_Comm communicator) {
8 | int world_rank;
9 | MPI_Comm_rank(communicator, &world_rank);
10 | int world_size;
11 | MPI_Comm_size(communicator, &world_size);
12 |
13 | if (world_rank == root) {
14 | // If we are the root process, send our data to everyone
15 | int i;
16 | for (i = 0; i < world_size; i++) {
17 | if (i != world_rank) {
18 | MPI_Send(data, count, datatype, i, 0, communicator);
19 | }
20 | }
21 | } else {
22 | // If we are a receiver process, receive the data from the root
23 | MPI_Recv(data, count, datatype, root, 0, communicator, MPI_STATUS_IGNORE);
24 | }
25 | }
26 |
27 | int main(int argc, char** argv) {
28 |
29 | int num_elements = 1000;
30 | int num_trials = 10;
31 |
32 | MPI_Init(NULL, NULL);
33 |
34 | int world_rank;
35 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
36 |
37 | double total_my_bcast_time = 0.0;
38 | double total_mpi_bcast_time = 0.0;
39 | int i;
40 | int* data = (int*)malloc(sizeof(int) * num_elements);
41 | assert(data != NULL);
42 |
43 | for (i = 0; i < num_trials; i++) {
44 | // Time my_bcast
45 | // Synchronize before starting timing
46 | MPI_Barrier(MPI_COMM_WORLD);
47 | total_my_bcast_time -= MPI_Wtime();
48 | my_bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD);
49 | // Synchronize again before obtaining final time
50 | MPI_Barrier(MPI_COMM_WORLD);
51 | total_my_bcast_time += MPI_Wtime();
52 |
53 | // Time MPI_Bcast
54 | MPI_Barrier(MPI_COMM_WORLD);
55 | total_mpi_bcast_time -= MPI_Wtime();
56 | MPI_Bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD);
57 | MPI_Barrier(MPI_COMM_WORLD);
58 | total_mpi_bcast_time += MPI_Wtime();
59 | }
60 |
61 | // Print off timing information
62 | if (world_rank == 0) {
63 | printf("Data size = %d, Trials = %d\n", num_elements * (int)sizeof(int), num_trials);
64 | printf("Avg my_bcast time = %lf\n", total_my_bcast_time / num_trials);
65 | printf("Avg MPI_Bcast time = %lf\n", total_mpi_bcast_time / num_trials);
66 | }
67 |
68 | free(data);
69 | MPI_Finalize();
70 | }
71 |
72 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/reduce.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 |
4 | int main(int argc, char** argv) {
5 |
6 | int num_elements = 2;
7 | int myid, nproc, root;
8 | double a[num_elements], b[num_elements];
9 | for (int i = 0; i < num_elements; i++)
10 | a[i] = 2.0 * (1+i);
11 | root=0;
12 | MPI_Init(NULL, NULL);
13 | int world_rank;
14 | MPI_Comm_size(MPI_COMM_WORLD, &nproc);
15 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
16 | //int MPI_Reduce(const void* send_buffer, void* receive_buffer, int count,
17 | // MPI_Datatype datatype, MPI_Op operation, int root, MPI_Comm communicator);
18 | MPI_Reduce(a, b, num_elements, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
19 | if (myid == 0) {
20 | fprintf(stdout,"myid=%d:\n", myid);
21 | for (int i = 0; i < num_elements; i++)
22 | fprintf(stdout,"\tb[%d]=%.2f\n", i, b[i]);
23 | fprintf(stdout,"\n");
24 | }
25 | MPI_Finalize();
26 | }
27 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/reduce.f:
--------------------------------------------------------------------------------
! Sums a 2-element real array across all ranks onto the root rank.
PROGRAM reduce
INCLUDE 'mpif.h'
INTEGER ierr, myid, nproc, root
INTEGER status(MPI_STATUS_SIZE)
REAL A(2), res(2)
CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
root = 0
! Every rank contributes the same values, so res = nproc * a on the root.
a(1) = 2.0
a(2) = 4.0
CALL MPI_REDUCE(a, res, 2, MPI_REAL, MPI_SUM, root,
& MPI_COMM_WORLD, ierr)
! Only the root holds the reduced result.
IF( myid .EQ. 0 ) THEN
WRITE(6,*) myid, ': res(1)=', res(1), 'res(2)=', res(2)
END IF
CALL MPI_FINALIZE(ierr)
END
19 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/scatter.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <assert.h>
5 |
6 |
7 |
8 |
9 | int main(int argc, char** argv) {
10 |
11 | int myid, nproc, root;
12 | int num_elements = 8;
13 | int nsnd = 2;
14 | double a[num_elements];
15 | double *b;
16 | b = (double*)malloc(sizeof(double) * nsnd);
17 | assert(b != NULL);
18 | MPI_Init(NULL, NULL);
19 | MPI_Comm_size(MPI_COMM_WORLD, &nproc);
20 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
21 | root=0;
22 | if(nproc * nsnd != num_elements && myid == root) {
23 | printf("This application is meant to be run with %d MPI processes.\n", num_elements/nsnd);
24 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
25 | }
26 | if (myid == root) {
27 | for (int i = 0; i < num_elements; i++)
28 | a[i] = i+1;
29 | }
30 | // int MPI_Scatter(const void* buffer_send, int count_send, MPI_Datatype datatype_send,
31 | // void* buffer_recv, int count_recv, MPI_Datatype datatype_recv,
32 | // int root, MPI_Comm communicator);
33 |
34 | MPI_Scatter(a, nsnd, MPI_DOUBLE, b, nsnd, MPI_DOUBLE, root, MPI_COMM_WORLD);
35 | fprintf(stdout, "myid=%d:\tb[0]=%.2f,\tb[1]=%.2f\n",myid, b[0], b[1] );
36 | free(b);
37 | MPI_Finalize();
38 | }
39 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/scatter.f:
--------------------------------------------------------------------------------
! Scatters the root's 16-element real array in chunks of 2 to every rank.
PROGRAM scatter
INCLUDE 'mpif.h'
INTEGER ierr, myid, nproc, nsnd, I, root
INTEGER status(MPI_STATUS_SIZE)
REAL A(16), B(2)
CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
root = 0
! Only the root's send buffer needs real data.
IF( myid .eq. root ) THEN
DO i = 1, 16
a(i) = REAL(i)
END DO
END IF
nsnd = 2
! Rank r receives elements 2r+1 and 2r+2 of the root's array.
CALL MPI_SCATTER(a, nsnd, MPI_REAL, b, nsnd,
& MPI_REAL, root, MPI_COMM_WORLD, ierr)
WRITE(6,*) myid, ': b(1)=', b(1), 'b(2)=', b(2)
CALL MPI_FINALIZE(ierr)
END
21 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/lecture05-MPI-Programming-part-A.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/MPI/lecture05-MPI-Programming-part-A.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/lecture05-MPI-Programming-part-B.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/MPI/lecture05-MPI-Programming-part-B.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/lecture06-Network-basics-for-MPI-application.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/MPI/lecture06-Network-basics-for-MPI-application.pptx
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/pi_scalability/scalability.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=scaling # Job name
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks=128 # Run a single task
5 | #SBATCH --time=01:20:00 # Time limit hrs:min:sec
6 | #SBATCH -p EPYC
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 | #SBATCH --exclusive
10 | module purge
11 | module load openMPI/4.1.5/gnu
12 | PI="../basic-mpi-codes/mpi_pi"
13 | mpicc -O3 ${PI}.c -o mpi_pi.x
14 | element="socket"
15 | N=1000000000
16 | echo "tasks, N, avg_walltime" > pi_strong.csv
17 | for i in $(eval echo {0..$SLURM_NTASKS..8});
18 | do
19 | if [ "$i" -eq "0" ]
20 | then
21 | echo -n "1, $N," >> pi_strong.csv
22 | mpirun --map-by ${element} -np 1 ./mpi_pi.x $N | grep "\[*\]" | awk 'BEGIN {FS=":"}; {print $2}' >> pi_strong.csv
23 | else
24 | echo -n "$i, $N," >> pi_strong.csv
25 | mpirun --map-by ${element} -np $i ./mpi_pi.x $N | grep "\[*\]" | awk 'BEGIN {FS=":"}; {print $2}' >> pi_strong.csv
26 | fi
27 | done
28 |
29 | echo "tasks, N, avg_walltime" > pi_weak.csv
30 | for i in $(eval echo {0..$SLURM_NTASKS..8});
31 | do
32 | if [ "$i" -eq "0" ]
33 | then
34 | M=$N
35 | echo -n "1, $M," >> pi_weak.csv
36 | mpirun --map-by ${element} -np 1 ./mpi_pi.x $M | grep "\[*\]" | awk 'BEGIN {FS=":"}; {print $2}' >> pi_weak.csv
37 | else
38 | M=$((${N}*${i}))
39 | echo -n "$i, $M," >> pi_weak.csv
40 | mpirun --map-by ${element} -np $i ./mpi_pi.x $M | grep "\[*\]" | awk 'BEGIN {FS=":"}; {print $2}' >> pi_weak.csv
41 | fi
42 | done
43 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/00--Memory_model.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/00--Memory_model.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/01--Intro_to_OpenMP.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/01--Intro_to_OpenMP.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/02--parallel_regions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/02--parallel_regions.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/03--loops.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/03--loops.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/04--threads_affinity.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/04--threads_affinity.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/.#for.c:
--------------------------------------------------------------------------------
1 | luca@ggg.2121:1698304345
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_loops/loop_without_for.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <omp.h>
37 |
38 |
39 | #define N_default 1000 // how long is the main array
40 |
41 | int main( int argc, char **argv )
42 | {
43 |
44 | int N = N_default;
45 | int nthreads = 1;
46 |
47 | // check whether some arg has been passed on
48 | if ( argc > 1 )
49 | {
50 | N = atoi( *(argv+1) );
51 | if ( argc > 2 )
52 | nthreads = atoi( *(argv+2) );
53 | }
54 |
55 | if( nthreads > 1 )
56 | omp_set_num_threads(nthreads);
57 | #pragma omp parallel
58 | {
59 | int me = omp_get_thread_num();
60 | int nthreads = omp_get_num_threads();
61 |
62 | int chunk = N / nthreads;
63 | int mod = N % nthreads;
64 | int my_first = chunk*me + ((me < mod)?me:mod);
65 | int my_chunk = chunk + (mod > 0)*(me < mod);
66 |
67 | #pragma omp single
68 | printf("nthreads: %d, N: %d --- chunk is %d, reminder is %d\n", nthreads, N, chunk, mod);
69 |
70 | printf("thread %d : from %d to %d\n", me, my_first, my_first+my_chunk);
71 |
72 | /*
73 | * here you could then insert a for loop
74 | * int my_stop = my_first + my_chunk;
75 | * for( int i = myfirst; i < my_stop; i++ )
76 | * ...
77 | */
78 | }
79 |
80 |
81 | return 0;
82 | }
83 |
84 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_loops/pi_openmp.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <math.h>
5 | #include <time.h>
6 | #include <sys/time.h>
7 | #include <omp.h>
8 |
9 | #define DEFAULT 1000000
10 | #define SEED 918273
11 |
12 | int main ( int argc, char **argv)
13 | {
14 |
15 | long long int M=0;
16 | int nthreads;
17 | double pi;
18 |
19 |
20 |
21 | #pragma omp parallel
22 | #pragma omp master
23 | nthreads = omp_get_num_threads();
24 |
25 | long long int N = (argc > 1 ? atoll(argv[1]) : DEFAULT ) ;
26 | printf("omp calculation with %d threads\nN=%Ld\n",
27 | nthreads ,N);
28 |
29 | double timing = omp_get_wtime();
30 | #pragma omp parallel
31 | {
32 | int myid = omp_get_thread_num();
33 | double x, y ;
34 | srand48(SEED*(myid+1));
35 |
36 | #pragma omp for reduction(+:M)
37 | for( long long unsigned i = 0; i < N; i++)
38 | {
39 | x = drand48();
40 | y = drand48();
41 | M += ((x*x + y*y) < 1.0);
42 | }
43 | }
44 |
45 | timing = omp_get_wtime() - timing;
46 | printf("Estimation of pi: %1.9f\n Walltime:%g\n",
47 | (4.0*(double)M)/N, timing );
48 |
49 | return 0;
50 | }
51 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_loops/pi_openmp.fix.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <math.h>
5 | #include <time.h>
6 | #include <sys/time.h>
7 | #include <omp.h>
8 |
9 | #define DEFAULT 1000000
10 | #define SEED 918273
11 |
12 | int main(int argc,char* argv[])
13 | {
14 |
15 | long long unsigned int M = 0;
16 | int nthreads;
17 |
18 | #pragma omp parallel
19 | #pragma omp master
20 | nthreads = omp_get_num_threads();
21 |
22 | long long int N = (argc > 1 ? atoll(argv[1]) : DEFAULT ) ;
23 | printf("omp calculation with %d threads\nN=%Ld\n", nthreads ,N);
24 |
25 | double timing = omp_get_wtime();
26 | #pragma omp parallel
27 | {
28 | int myid = omp_get_thread_num();
29 | int unsigned short myseeds[3] = {SEED+(myid),SEED+(myid*3+1), SEED+(myid*4+2)};
30 |
31 | seed48( myseeds );
32 |
33 | #pragma omp for reduction(+:M)
34 | for( long long unsigned int i = 0; i < N; i++)
35 | {
36 | double x = erand48( myseeds );
37 | double y = erand48( myseeds );
38 |
39 | M += ( (x*x + y*y) < 1.0 );
40 | }
41 | }
42 |
43 | timing = omp_get_wtime() - timing;
44 |
45 | printf("Estimation of pi: %1.9f\n Walltime:%g\n",
46 | (4.0*(double)M)/N, timing );
47 | return 0;
48 | }
49 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/00_scope_of_variables.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #define _GNU_SOURCE
34 | #include <stdlib.h>
35 | #include <stdio.h>
36 | #include <unistd.h>
37 | #include <sys/types.h>
38 | #include <sys/syscall.h>
39 | #include <omp.h>
40 |
41 |
42 | int main( int argc, char **argv )
43 | {
44 | int i;
45 |
46 | printf( "\nmain thread (pid: %d, tid: %ld) data:\n"
47 | "&i is @ address : %p\n\n",
48 | (int)getpid(), syscall(SYS_gettid), &i);
49 |
50 | // just try who is the private i for each thread
51 | #pragma omp parallel private(i)
52 | {
53 | int me = omp_get_thread_num();
54 |
55 | printf( "\tthread nr %d ( tid %ld, from pid %d ) :\n"
56 | "\t\tmy i address is %p\n",
57 | me, syscall(SYS_gettid), (int)getpid(), &i );
58 | }
59 |
60 | printf( "\n" );
61 | return 0;
62 | }
63 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/01_simple_pr_wrong.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 | #if defined(__STDC__)
28 | # if (__STDC_VERSION__ >= 199901L)
29 | # define _XOPEN_SOURCE 700
30 | # endif
31 | #endif
32 | #include <stdlib.h>
33 | #include <stdio.h>
34 | #include <string.h>
35 | #include <unistd.h>
36 | #include <omp.h>
37 |
38 |
39 | int main( int argc, char **argv )
40 | {
41 |
42 | int nthreads;
43 | int my_thread_id;
44 |
45 | #if defined(_OPENMP)
46 |
47 | #pragma omp parallel // this creates a parallel region
48 | // that is encompassed by the
49 | // opening and closing { }
50 | //
51 | // you can modify the number of
52 | // spawned threads through the
53 | // OMP_THREAD_NUM
54 | // environmental variable
55 |
56 | {
57 |
58 | my_thread_id = omp_get_thread_num(); // note: this assignment is not thread-safe
59 | sleep(0.05);
60 | #pragma omp master
61 | nthreads = omp_get_num_threads();
62 |
63 | // the order in which different threads will
64 | // arrive at this print is undefined;
65 | // if you run this code several times, you will
66 | // obtain different results
67 |
68 | printf( "\tgreetings from thread num %d\n", my_thread_id);
69 | }
70 | #else
71 |
72 | nthreads = 1;
73 | #endif
74 |
75 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
76 |
77 | return 0;
78 | }
79 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/02_simple_pr.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 | #if defined(__STDC__)
28 | # if (__STDC_VERSION__ >= 199901L)
29 | # define _XOPEN_SOURCE 700
30 | # endif
31 | #endif
32 | #include <stdlib.h>
33 | #include <stdio.h>
34 | #include <string.h>
35 | #include <omp.h>
36 |
37 |
38 | int main( int argc, char **argv )
39 | {
40 |
41 | int nthreads;
42 |
43 | #if defined(_OPENMP)
44 |
45 | #pragma omp parallel // this creates a parallel region
46 | // that is encompassed by the
47 | // opening and closing { }
48 | //
49 | // you can modify the number of
50 | // spawned threads through the
51 | // OMP_THREAD_NUM
52 | // environmental variable
53 |
54 | {
55 |
56 | int my_thread_id = omp_get_thread_num(); // note: this assignment is now
57 | // thread-safe because the lvalue
58 | // is a private variable
59 | #pragma omp master
60 | nthreads = omp_get_num_threads();
61 |
62 | // the order in which different threads will
63 | // arrive at this print is undefined;
64 | // if you run this code several times, you will
65 | // obtain different results
66 |
67 | printf( "\tgreetings from thread num %d\n", my_thread_id);
68 | }
69 | #else
70 |
71 | nthreads = 1;
72 | printf( "\tgreetings from thread num 0\n");
73 | #endif
74 |
75 | printf(" %d thread%s greeted you from the %sparallel region\n",
76 | nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
77 |
78 | return 0;
79 | }
80 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/03a_num_of_threads.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <omp.h>
37 |
38 |
39 | int main( int argc, char **argv )
40 | {
41 |
42 | int nthreads;
43 |
44 | #if defined(_OPENMP)
45 |
46 | int threads_num = 1;
47 |
48 | if ( argc > 1 )
49 | {
50 | // read the argument given
51 | threads_num = atoi(*(argv+1));
52 | omp_set_num_threads( threads_num );
53 | }
54 |
55 | #pragma omp parallel // this creates a parallel region
56 | // that is encompassed by the
57 | // opening and closing { }
58 | //
59 | // you can modify the number of
60 | // spawned threads in different
61 | // ways:
62 | // 1) through the OMP_THREAD_NUM
63 | // environmental variable
64 | // 2) using the omp_set_num_threads()
65 | //
66 | // you can also declare the desired
67 | // number at the creation of the
68 | // parallel region:
69 |
70 | //#pragma omp parallel num_threads( threads_num )
71 |
72 | {
73 |
74 | int my_thread_id = omp_get_thread_num();
75 | #pragma omp master
76 | nthreads = omp_get_num_threads();
77 |
78 | // the order in which different threads will
79 | // arrive at this print is undefined;
80 | // if you run this code several times, you will
81 | // obtain different results
82 |
83 | printf( "\tgreetings from thread num %d\n", my_thread_id );
84 | }
85 |
86 | #else
87 |
88 | nthreads = 1;
89 |
90 | #endif
91 |
92 | printf(" %d thread%s greeted you from the %sparallel region\n",
93 | nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
94 |
95 | return 0;
96 | }
97 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/04_order_of_threads_wrong.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 | #if defined(__STDC__)
28 | # if (__STDC_VERSION__ >= 199901L)
29 | # define _XOPEN_SOURCE 700
30 | # endif
31 | #endif
32 | #include <stdlib.h>
33 | #include <stdio.h>
34 | #include <string.h>
35 | #include <omp.h>
36 |
37 |
38 | int main( int argc, char **argv )
39 | {
40 |
41 | int nthreads;
42 |
43 | #if defined(_OPENMP)
44 |
45 | int order = 0;
46 |
47 | #pragma omp parallel // this creates a parallel region
48 | // that is encompassed by the
49 | // opening and closing { }
50 | //
51 | // you can modify the number of
52 | // spawned threads through the
53 | // OMP_THREAD_NUM
54 | // environmental variable
55 |
56 | {
57 |
58 | int my_thread_id = omp_get_thread_num();
59 | #pragma omp master
60 | nthreads = omp_get_num_threads();
61 |
62 | // now we impose an ordered output
63 | // although not ina very efficient way
64 |
65 | // the "critical" directive identifies a
66 | // section that must be executed by a
67 | // single thread at a time.
68 | // Here, un unspecified number of threads
69 | // will print the message.
70 | // That is just due to this particular
71 | // case: in fact, ALL the threads will
72 | // execute the if test. However, which are
73 | // those that succeed, print and modify the
74 | // "order" value depends on which have been
75 | // the previous ones, and on the relative delay.
76 | #pragma omp critical
77 | if ( order == my_thread_id )
78 | {
79 | printf( "\tgreetings from thread num %d\n", my_thread_id );
80 | order++;
81 | }
82 | }
83 | #else
84 |
85 | nthreads = 1;
86 | #endif
87 |
88 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
89 |
90 | return 0;
91 | }
92 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/05b_order_of_threads.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <omp.h>
37 |
38 |
39 | int main( int argc, char **argv )
40 | {
41 |
42 | int nthreads;
43 |
44 | #if defined(_OPENMP)
45 |
46 | #pragma omp parallel
47 | {
48 |
49 | int my_thread_id = omp_get_thread_num();
50 | #pragma omp master
51 | nthreads = omp_get_num_threads();
52 | #pragma omp barrier // let all the threads to read
53 | // the correct value of nthreads
54 |
55 | #pragma omp for ordered // declare a for within which there
56 | for ( int i = 0; i < nthreads; i++) // are ordered regions
57 | #pragma omp ordered // declare the ordered region
58 | printf( "\tgreetings from thread num %d\n", my_thread_id );
59 |
60 | }
61 | #else
62 |
63 | nthreads = 1;
64 | #endif
65 |
66 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
67 |
68 | return 0;
69 | }
70 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/05c_order_of_threads.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <omp.h>
37 |
38 | void do_something( int who_am_I )
39 | {
40 | #pragma omp ordered
41 | printf( "\tgreetings from thread num %d\n", who_am_I );
42 | }
43 |
44 |
45 | int main( int argc, char **argv )
46 | {
47 |
48 | int nthreads;
49 |
50 | #if defined(_OPENMP)
51 |
52 | #pragma omp parallel
53 | {
54 |
55 | int my_thread_id = omp_get_thread_num();
56 | #pragma omp master
57 | nthreads = omp_get_num_threads();
58 | #pragma omp barrier // let all the threads to read
59 | // the correct value of nthreads
60 |
61 | #pragma omp for ordered // declare a for within which there
62 | for ( int i = 0; i < nthreads; i++) // are ordered regions
63 | do_something( my_thread_id );
64 |
65 |
66 | }
67 | #else
68 |
69 | nthreads = 1;
70 | #endif
71 |
72 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
73 |
74 | return 0;
75 | }
76 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/09_clauses__copyin__clarify.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <time.h>
37 | #include <omp.h>
38 |
39 |
40 | double golden_value = 0;
41 | #pragma omp threadprivate( golden_value )
42 |
43 |
44 | int main( int argc, char **argv )
45 | {
46 | srand48(time(NULL));
47 | int N = 10;
48 |
49 | #pragma omp parallel copyin(golden_value)
50 | // the copying of thread 0's golden_value
51 | // happens here, at the entering of the
52 | // parallel region;
53 | //
54 | {
55 |
56 | #pragma omp master
57 | golden_value = 1.618033988; // we do not expect
58 | // this value to be
59 | // broadcasted
60 |
61 | #pragma omp barrier
62 |
63 | printf("[PR 1] thread %d has a golden value %g\n",
64 | omp_get_thread_num(), golden_value );
65 | }
66 |
67 |
68 | #pragma omp parallel copyin(golden_value)
69 | // here the master's value is copied again;
70 | // since it was modified in the previous
71 | // PR, we do expect that now everybody
72 | // will have the new value
73 | //
74 | printf("[PR 2] thread %d has a golden value %g\n",
75 | omp_get_thread_num(), golden_value );
76 |
77 | return 0;
78 | }
79 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/09_clauses__firstprivate.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │    <http://www.gnu.org/licenses/>                                          │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <omp.h>
37 |
38 | #define DEFAULT 10
39 |
40 | int main( int argc, char **argv )
41 | { // demonstrates the firstprivate() clause: private copies initialized from the serial values
42 |
43 | int i = (argc > 1 ? atoi(*(argv+1)) : DEFAULT );
44 | int nthreads;
45 | int *array;
46 |
47 | #pragma omp parallel
48 | #pragma omp master
49 | nthreads = omp_get_num_threads(); // only the master writes; the implicit barrier at the
50 | // end of the region publishes the value to the serial code
51 | array = (int*)calloc( nthreads, sizeof(int) );
52 |
53 | #pragma omp parallel firstprivate( i, array )
54 | {
55 | int me = omp_get_thread_num();
56 |
57 | // Here we can refer to both i and array.
58 | // Although they are *different* memory regions
59 | // from the ones that are hosted in the
60 | // serial region, their value at the entry
61 | // of the parallel region is initialized
62 | // to the value that the corresponding variables
63 | // have in the serial region.
64 |
65 |
66 | array[me] = i + me; // a perfectly valid reference
67 |
68 | array = NULL; // we screw up.. but only in
69 | // this scope because this
70 | // array is _not_ the same
71 | // as the one outside the p-region
72 | }
73 |
74 | for( int j = 0; j < nthreads; j++ )
75 | printf("entry %3d is %3d (expected was %3d)\n",
76 | j, array[j], i + j );
77 |
78 | free(array);
79 | return 0;
80 | }
81 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/09_clauses__threadprivate.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │    <http://www.gnu.org/licenses/>                                          │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <time.h>
37 | #include <omp.h>
38 |
39 |
40 | int me, myN;
41 | int *array;
42 |
43 | #pragma omp threadprivate( me, myN, array )
44 |
45 |
46 | #define DEFAULT 100000
47 |
48 | int main( int argc, char **argv )
49 | { // demonstrates threadprivate globals: each thread keeps its own me, myN, array across parallel regions
50 | int N = ( argc > 1 ? atoi(*(argv+1)) : DEFAULT);
51 |
52 | #pragma omp parallel
53 | {
54 | me = omp_get_thread_num();
55 |
56 | int nthreads = omp_get_num_threads();
57 |
58 | // note that we declared none of
59 | // myN, array, or me in this scope: they are threadprivate globals
60 | //
61 | myN = (N / nthreads) + (me < N%nthreads); // spread the N elements as evenly as possible
62 | array = (int*)calloc( myN, sizeof(int) );
63 |
64 | printf("+ thread %d has got %d elements; local array "
65 | "(address stored in %p) starts at %p\n",
66 | me, myN, &array, array );
67 |
68 | // write something in the array
69 | //
70 |
71 | int max = ( myN > 3 ? 3 : myN );
72 | for( int j = 0; j < max; j++ )
73 | array[j] = me*1000 + j;
74 | }
75 |
76 |
77 | printf("\nnow we are again in a serial region\n\n");
78 |
79 |
80 | #pragma omp parallel // NOTE(review): assumes the same thread team size as the first region
81 | {
82 | char buffer[200];
83 | sprintf( buffer, "* thread %d :: ", me );
84 |
85 | int max = ( myN > 3 ? 3 : myN );
86 | for( int j = 0; j < max; j++ )
87 | sprintf( &buffer[strlen(buffer)], "[%d] = %4d , ", j, array[j] );
88 |
89 | printf("%s\n", buffer );
90 |
91 | // we must free array from within a parallel region;
92 | // if we did this in a serial region, only the memory
93 | // associated with the master thread would be freed
94 | //
95 | free(array);
96 | }
97 |
98 |
99 | return 0;
100 | }
101 |
102 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples_on_stack/00_explore_how_bytes_are_stored.c:
--------------------------------------------------------------------------------
1 | #include <stdlib.h>
2 | #include <stdio.h>
3 | #include <string.h>
4 |
5 | int main( void )
6 | { // shows how the bytes of an int are laid out in memory (endianness)
7 |
8 | unsigned int i = 128; // a single bit set (bit 7, still inside the first byte)
9 | int size = sizeof(i);
10 |
11 | /*
12 | * i is an integer variable, and as such it requires 4 bytes
13 | * let's explore how these 4 bytes are placed in memory
14 | */
15 |
16 | for ( int j = 0; j < size; j++ )
17 | //
18 | // we loop over the bytes that make up the variable i
19 | // note: to be general, we asked size to be the value
20 | // returned by sizeof()
21 | //
22 | {
23 | // let's print the value of the entire bitfield
24 | // when we interpret it as an integer
25 | printf("i is: %d\n", i ); // (%u would be more precise, since i is unsigned)
26 |
27 | // now we access each byte of i
28 | //
29 | char *byte = (char*)&i;
30 | for( int k = 0; k < size; k++ )
31 | printf("\t%p : %d\n", byte+k, *(byte+k) ); // note: char is often signed, so a byte with the top bit set prints negative
32 |
33 | // convince yourself that the previous for loop could have been
34 | // written as follows:
35 | // ( un-comment the next 2 lines to test it
36 |
37 | /* for( int k = 0; k < size; k++ ) */
38 | /* printf("\t%p : %d\n", (char*)&i+k, *(((char*)&i)+k)); */
39 |
40 | // why is it so ?
41 | // -- &i is the address of i; more precisely
42 | // it is the address of the begin of i, i.e.
43 | // the address of the first of the bytes that
44 | // form i.
45 | // -- (char*)&i means that we interpret the
46 | // address &i as an address to a char
47 | // -- *(char*)&i reads as "the value of the byte
48 | // at the address &i"
49 | // -- (char*)&i+k is k-byte after the byte at
50 | // address &i
51 |
52 |
53 | printf("\n");
54 |
55 | // now we multiply i by 256.
56 | // the operators << and >> read as "shift the argument's bit on the left [or right]
57 | // by the specified amount of bits "
58 | // In this case the amount of bits is 8, i.e. it is the same as multiplying by 256
59 | //
60 | i <<= 8;
61 |
62 | // we are doing this because we want that only a single bit is set per each byte
63 | // among the i's bytes.
64 | // we started from a value of 128, i.e. only the highest bit of the first byte of i
65 | // was set; multiplying by 256 (i.e. bit-shifting by 8 positions) we move that bit
66 | // to the next byte (until it is eventually shifted out and i becomes 0).
67 | }
68 |
69 | return 0;
70 | }
71 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/exercises/.#lab_exercise.2.c:
--------------------------------------------------------------------------------
1 | luca@ggg.26667:1698393520
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/exercises/exercises.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/exercises/exercises.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/exercises/lab_exercise.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │    <http://www.gnu.org/licenses/>                                          │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <time.h>
37 | #include <math.h>
38 | #include <omp.h>
39 |
40 |
41 | #define N_DFLT 1000
42 |
43 |
44 | int main ( int argc, char **argv )
45 | { // exercise skeleton: fill an array with squares in parallel, then verify it
46 | // usage: ./a.out [N] [num_threads]
47 | int N = ( (argc > 1) ? atoi(*(argv+1)) : N_DFLT);
48 | int Nth = ( (argc > 2) ? atoi(*(argv+2)) : 0);
49 |
50 | unsigned int *array = (unsigned int*)malloc( sizeof(unsigned int) * N ); // cast fixed: was (int*)
51 |
52 | if ( Nth > 0 )
53 | omp_set_num_threads( Nth ); // bug fix: was "omp_set_num_threads = Nth", an assignment to a function, which does not compile
54 |
55 | #pragma omp parallel
56 | {
57 | int myid = omp_get_thread_num(); // kept for the exercise: use these two
58 | int nthreads = omp_get_num_threads(); // to distribute the loop iterations
59 |
60 | for ( unsigned int i = 0; i < N; i++ ) // note: every thread currently writes the whole array;
61 | array[i] = i*i; // distributing this loop is the point of the exercise
62 |
63 | }
64 |
65 | //
66 | // check the results
67 | // can you parallelize this as well ?
68 | //
69 |
70 | unsigned int faults = 0;
71 | for ( unsigned int i = 0; i < N; i++ )
72 | faults += ( array[i] != i*i );
73 |
74 | if ( faults > 0 )
75 | printf("wow, you've been able to get %u faults\n",
76 | faults );
77 | free(array); // release the buffer before exiting
78 | return 0;
79 | }
80 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/exercises/prefix_sum.serial.c:
--------------------------------------------------------------------------------
1 |
2 |
3 | #if defined(__STDC__)
4 | # if (__STDC_VERSION__ >= 199901L)
5 | # define _XOPEN_SOURCE 700
6 | # endif
7 | #endif
8 | #include <stdlib.h>
9 | #include <stdio.h>
10 | #include <string.h>
11 | #include <time.h>
12 | #include <math.h>
13 | #include "prefix_sum.serial.h"
14 |
15 |
16 |
17 | inline double scan( const uint N, DTYPE * restrict array )
18 | { // in-place inclusive prefix sum: array[ii] becomes the sum of entries 0..ii;
19 | // returns the grand total. NOTE(review): assumes N >= 1 (array[0] read unconditionally)
20 | DTYPE avg = array[0];
21 | // ("avg" is really the running sum, not an average)
22 | for ( uint ii = 1; ii < N; ii++ )
23 | {
24 | avg += array[ii];
25 | array[ii] = avg;
26 | }
27 | // NOTE(review): C99 "inline" with external linkage needs an extern declaration
28 | return avg; // in some translation unit, or the symbol may be missing at link time
29 | }
30 |
31 |
32 | inline DTYPE scan_efficient( const uint N, DTYPE * restrict array )
33 | { // in-place inclusive prefix sum, 4-way unrolled to shorten the dependency chain;
34 | // returns the grand total (array[N-1] after the scan)
35 | uint N_4 = (N/4)*4; // largest multiple of 4 not exceeding N
36 | // head: scan the first 4 elements by hand
37 | { // NOTE(review): reads array[0..3] unconditionally -- the caller over-allocates
38 | DTYPE temp = array[2]; // to a multiple of 4, but entries beyond N are uninitialized if N < 4
39 | array[1] += array[0]; // a1 = a0+a1
40 | array[3] += temp; // a3 = a2+a3 (uses the OLD a2, saved in temp)
41 | array[2] += array[1]; // a2 = a0+a1+a2
42 | array[3] += array[1]; // a3 = a0+a1+a2+a3
43 | }
44 | // body: each group of 4 builds on array[ii-1], the prefix total so far;
45 | PRAGMA_VECT_LOOP
46 | for ( uint ii = 4; ii < N_4; ii+=4 )
47 | { // same pattern as the head, shifted by the running prefix
48 | DTYPE register temp = array[ii+2]; // save the OLD array[ii+2] before it is overwritten
49 | array[ii] += array[ii-1];
50 | array[ii+1] += array[ii];
51 | array[ii+3] += temp;
52 | array[ii+2] += array[ii+1];
53 | array[ii+3] += array[ii+1];
54 | }
55 | // tail: the remaining N - N_4 (< 4) elements, scanned one by one
56 | for ( uint ii = N_4; ii < N; ii++ )
57 | array[ii] += array[ii-1];
58 |
59 | return array[N-1];
60 | }
61 |
62 |
63 | #define N_default 1000
64 | #define _scan 0
65 | #define _scan_e 1
66 |
67 | int main ( int argc, char **argv )
68 | { // driver: initialize an array, run the selected scan, report timings
69 | // usage: ./a.out [scan_type] [N]
70 | // scan_type: 0 = simple scan, 1 = 4-way unrolled scan
71 | // N : number of elements (default 1000)
72 | // (removed: unused locals ts, Nth_level1, Nth_level2)
73 |
74 | // -------------------------------------------------------------
75 | // variables' initialization to default values
76 | //
77 |
78 | uint N = N_default;
79 | int scan_type = _scan;
80 |
81 |
82 | if ( argc > 1 )
83 | {
84 | scan_type = atoi( *(argv+1) );
85 | if ( argc > 2 )
86 | N = (unsigned)atoi( *(argv+2) );
87 | }
88 |
89 | printf( "scan type: %d\n", scan_type );
90 |
91 |
92 | // -------------------------------------------------------------
93 | // data init.
94 |
95 | double timing_start;
96 | double timing_scan;
97 | double timing_prepare;
98 | double total_weight = 0; // bug fix: stays 0 (not indeterminate) if scan_type is unknown
99 |
100 | uint N_alloc = ((N/4)+1)*4; // round up so the unrolled scan can touch 4-element groups safely
101 | // DTYPE *array = (DTYPE*)aligned_alloc( 32, N_alloc * sizeof(DTYPE) );
102 | DTYPE *array = (DTYPE*)malloc( N_alloc * sizeof(DTYPE) );
103 | if ( array == NULL ) { printf("allocation of %u elements failed\n", N_alloc); return 1; }
104 | timing_start = CPU_TIME;
105 |
106 | // initialize with pseudo-random numbers
107 |
108 | /* srand48(time(0)); */
109 | /* for ( int ii = 0; ii < N; ii++ ) */
110 | /* topnodes[ii] = base + drand48()*range; */
111 |
112 | // initialize with the first N integer
113 | // (that makes the results easy to check)
114 | // //
115 |
116 | for ( uint ii = 0; ii < N; ii++ )
117 | array[ii] = (double)ii;
118 |
119 | timing_prepare = CPU_TIME - timing_start;
120 |
121 | // ................................................
122 | // SCAN
123 | // ................................................
124 | timing_start = CPU_TIME; // bug fix: restart the clock so timing_scan measures the scan alone
125 | if ( scan_type == _scan )
126 | total_weight = scan( N, array );
127 |
128 | else if (scan_type == _scan_e)
129 | total_weight = scan_efficient( N, array );
130 |
131 | /* else if (scan_type == _scan_b) */
132 | /* total_weight = scan_b( N, array ); */
133 | else printf( "unknown scan type %d\n", scan_type );
134 | timing_scan = CPU_TIME - timing_start;
135 | free(array); // release the data before reporting
136 | printf("timing for scan is %g, timing for prepare is %g [total weight: %g]\n",
137 | timing_scan, timing_prepare, total_weight);
138 | return 0;
139 | }
140 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/exercises/prefix_sum.serial.h:
--------------------------------------------------------------------------------
1 |
2 | // ─────────────────────────────────────────────────────────────────
3 | // define the datatype
4 | //
5 | #if !defined(DTYPE)
6 | #define DTYPE double
7 | #endif
8 |
9 | typedef unsigned int uint;
10 |
11 |
12 | // ─────────────────────────────────────────────────────────────────
13 | // define the timing routines
14 | //
15 |
16 | #define CPU_TIME ({struct timespec ts; \
17 | clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), \
18 | (double)ts.tv_sec + \
19 | (double)ts.tv_nsec * 1e-9;})
20 |
21 |
22 | // ─────────────────────────────────────────────────────────────────
23 | // define the vector generator
24 | //
25 |
26 | #if defined(__clang__) /* bug fix: test clang FIRST -- clang also defines __GNUC__, */
27 | #define PRAGMA_VECT_LOOP _Pragma("ivdep") /* so the old ordering made this branch unreachable */
28 | #elif defined(__INTEL_COMPILER) || defined(__ICC) /* bug fix: logical ||, not bitwise | */
29 | #define PRAGMA_VECT_LOOP _Pragma("parallel")
30 | #elif defined(__GNUC__)
31 | #define PRAGMA_VECT_LOOP _Pragma("GCC ivdep")
32 | #else
33 | #define PRAGMA_VECT_LOOP /* unknown compiler: expand to nothing */
34 | #endif
35 |
36 |
37 | // ─────────────────────────────────────────────────────────────────
38 | // prototypes
39 | //
40 |
41 | double scan ( const uint, DTYPE * restrict );
42 | double scan_efficient ( const uint, DTYPE * restrict );
43 |
44 |
--------------------------------------------------------------------------------
/intro_to_course.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/intro_to_course.pdf
--------------------------------------------------------------------------------
/lecture01-intro-toHPC.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/lecture01-intro-toHPC.pdf
--------------------------------------------------------------------------------