├── ASSIGNMENTS
├── exercise1
│ ├── algs.png
│ ├── exercise1.md
│ └── naive_model.png
└── exercise2
│ ├── exercise2.md
│ ├── exercise2.v1.1.pdf
│ ├── exercise2.v1.pdf
│ ├── quicksort.c
│ └── read_write_pgm_image.c
├── CODE_OPTIMIZATION
├── 00--optimization--preliminaries_and_compiler_usage.pdf
├── 01--Modern_architecture.pdf
├── 02--optimization--cache.pdf
├── 03--optimization--branches.pdf
├── 05--optimization--loops-and-prefetching.pdf
├── Readme.md
├── examples_on_branching
│ ├── if_forest_inside_loop
│ │ └── loop.c
│ ├── sort_2_arrays
│ │ ├── branchpred2.c
│ │ ├── compile
│ │ └── mypapi.h
│ └── unpredictable_datastream
│ │ ├── amonra.gen10
│ │ ├── branchpred.besmart.s
│ │ ├── branchpred.besmart2.c
│ │ ├── branchpred.besmart2.s
│ │ ├── branchpred.s
│ │ ├── branchpred.stat
│ │ ├── out.2
│ │ └── out.v
│ │ ├── branchpred
│ │ ├── branchpred.c
│ │ ├── branchpred.c~
│ │ ├── branchpred.smart
│ │ └── branchpred.smart2
├── examples_on_cache
│ ├── hot_and_cold_fields
│ │ ├── hotcold_a.v0.c
│ │ ├── hotcold_a.v1.c
│ │ ├── hotcold_b.v0.c
│ │ ├── hotcold_b.v1.c
│ │ ├── hotcold_c.v0.c
│ │ └── hotcold_c.v1.c
│ ├── matrix_transpose
│ │ ├── transpose
│ │ │ ├── matrix_transpose.c
│ │ │ ├── matrix_transpose_swapped.c
│ │ │ ├── matrix_transpose_swapped_unroll.c
│ │ │ └── matrix_transpose_unroll.c
│ │ └── transpose_by_blocks
│ │ │ ├── matrix_transpose_blocks.v0.c
│ │ │ ├── matrix_transpose_blocks.v1.c
│ │ │ ├── matrix_transpose_blocks.v2.c
│ │ │ ├── matrix_transpose_blocks.v3.c
│ │ │ └── mypapi.h
│ └── memory_mountain
│ │ ├── Makefile
│ │ ├── README
│ │ ├── clock.c
│ │ ├── clock.h
│ │ ├── fcyc2.c
│ │ ├── fcyc2.h
│ │ ├── mountain.c
│ │ ├── mountain.gcc
│ │ ├── plotmountain.gp
│ │ └── v2
│ │ ├── Makefile
│ │ ├── fcyc2.c
│ │ ├── fcyc2.h
│ │ ├── mountain.c
│ │ └── mountain.gcc
└── examples_on_pipelines
│ ├── combine_2_arrays
│ ├── compile
│ ├── mypapi.h
│ ├── pipeline.c
│ ├── run
│ ├── v0.c
│ ├── v1.c
│ ├── v1b.c
│ ├── v2.c
│ ├── v3.c
│ ├── v3b.c
│ ├── v4.c
│ └── vector.c
│ ├── matrix_multiplication
│ ├── matmul.c
│ ├── matmul_simple.c
│ ├── mypapi.h
│ ├── plot.gp
│ └── run
│ ├── polynomial_evaluation
│ ├── Makefile
│ ├── benchmark.c
│ ├── poly.c
│ ├── poly.h
│ ├── readme.md
│ ├── statistics
│ │ ├── cpe.c
│ │ ├── cpe.h
│ │ ├── fcyc.c
│ │ ├── fcyc.h
│ │ ├── lsquare.c
│ │ └── lsquare.h
│ └── timing
│ │ ├── clock.c
│ │ └── clock.h
│ └── reduction
│ ├── mypapi.h
│ ├── plot.gp
│ ├── reduction.c
│ └── reduction.h
├── HPC_TOOLS_and_STORAGE
└── Readme.md
├── Materials
├── A_note_on_Endiansim.pdf
├── Readme.md
├── What_every_computer_scientist_should_know_about_floating-point.pdf
├── arguments.c
└── topics.pdf
├── PARALLELISM
├── Readme.md
├── codes
│ ├── memory.c
│ └── pi.c
├── lecture01-intro-toHPC.pdf
├── lecture02-HPC-hardware.pdf
├── lecture03-HPCsoftware-stack.pdf
├── lecture04-on-parallel-programming.pdf
└── slurm
│ ├── README.md
│ ├── slurm01.job
│ ├── slurm02_A.job
│ ├── slurm02_B.job
│ ├── slurm02_C.job
│ ├── slurm03_A.job
│ ├── slurm03_B.job
│ ├── slurm03_C.job
│ ├── slurm04.job
│ └── slurm05.job
├── PARALLEL_PROGRAMMING
├── MPI
│ ├── Readme.md
│ ├── basic-mpi-codes
│ │ ├── Brecv.c
│ │ ├── CBlockSends.c
│ │ ├── clean.sh
│ │ ├── compile_openMPI_gnu.sh
│ │ ├── compile_openMPI_intel.sh
│ │ ├── deadlock.c
│ │ ├── linear-array.c
│ │ ├── mpi_env_call.c
│ │ ├── mpi_hello_world.F90
│ │ ├── mpi_hello_world.c
│ │ ├── mpi_hello_world_sync.c
│ │ ├── mpi_pi.c
│ │ ├── mpi_pi.job
│ │ ├── send_message.F90
│ │ ├── send_message.c
│ │ └── sendrecv_message.c
│ ├── collective-mpi
│ │ ├── all2allv3d.c
│ │ ├── allgather.job
│ │ ├── allgather.py
│ │ ├── allgatherv.c
│ │ ├── b_cast.c
│ │ ├── b_cast.f
│ │ ├── clean.sh
│ │ ├── compile.sh
│ │ ├── gather.c
│ │ ├── gather.f
│ │ ├── mpi_bcastcompare.c
│ │ ├── reduce.c
│ │ ├── reduce.f
│ │ ├── scatter.c
│ │ └── scatter.f
│ ├── compiling-and-running-mpi-programs.md
│ ├── lecture05-MPI-Programming-part-A.pdf
│ ├── lecture05-MPI-Programming-part-B.pdf
│ ├── lecture06-Network-basics-for-MPI-application.pptx
│ └── pi_scalability
│ │ └── scalability.job
└── OpenMP
│ ├── 00--Memory_model.pdf
│ ├── 01--Intro_to_OpenMP.pdf
│ ├── 02--parallel_regions.pdf
│ ├── 03--loops.pdf
│ ├── 04--threads_affinity.pdf
│ ├── examples
│ ├── .#for.c
│ ├── parallel_loops
│ │ ├── 00_array_sum_with_race.c
│ │ ├── 01a_array_sum.c
│ │ ├── 01b_array_sum.c
│ │ ├── 01c_array_sum.c
│ │ ├── 01d_array_sum.c
│ │ ├── 02_falsesharing.c
│ │ ├── 03_falsesharing_fixed.c
│ │ ├── 04_scheduling.c
│ │ ├── 05_first_and_last_private.c
│ │ ├── loop_without_for.c
│ │ ├── pi_openmp.c
│ │ └── pi_openmp.fix.c
│ ├── parallel_regions
│ │ ├── 00_scope_of_variables.c
│ │ ├── 00_stack_and_scope.c
│ │ ├── 01_simple_pr_wrong.c
│ │ ├── 02_simple_pr.c
│ │ ├── 03a_num_of_threads.c
│ │ ├── 03b_num_of_threads.c
│ │ ├── 04_order_of_threads_wrong.c
│ │ ├── 05a_order_of_threads.c
│ │ ├── 05b_order_of_threads.c
│ │ ├── 05c_order_of_threads.c
│ │ ├── 09_clauses__copyin.c
│ │ ├── 09_clauses__copyin__clarify.c
│ │ ├── 09_clauses__copyprivate.c
│ │ ├── 09_clauses__firstprivate.c
│ │ ├── 09_clauses__lastprivate.c
│ │ └── 09_clauses__threadprivate.c
│ └── threads_affinity
│ │ ├── 00_where_I_am.c
│ │ ├── 01_where_I_am_omp.c
│ │ ├── 02_where_I_am_omp.c
│ │ ├── 03_where_I_am_nested.c
│ │ ├── 04_touch_by_one.c
│ │ ├── 05_touch_by_all.c
│ │ └── 06_touch_by_all_threadprivate.c
│ ├── examples_on_stack
│ ├── 00_explore_how_bytes_are_stored.c
│ ├── 01a_understanding_the_stack.c
│ └── 01b_understanding_the_stack.c
│ └── exercises
│ ├── .#lab_exercise.2.c
│ ├── exercises.pdf
│ ├── lab_exercise.2.c
│ ├── lab_exercise.2.v2.c
│ ├── lab_exercise.c
│ ├── my_lab_exercise.2.c
│ ├── my_lab_exercise.2.v2.c
│ ├── prefix_sum.serial.c
│ ├── prefix_sum.serial.h
│ └── write_pgm_image.c
├── README.md
├── intro_to_course.pdf
└── lecture01-intro-toHPC.pdf
/ASSIGNMENTS/exercise1/algs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise1/algs.png
--------------------------------------------------------------------------------
/ASSIGNMENTS/exercise1/naive_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise1/naive_model.png
--------------------------------------------------------------------------------
/ASSIGNMENTS/exercise2/exercise2.v1.1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise2/exercise2.v1.1.pdf
--------------------------------------------------------------------------------
/ASSIGNMENTS/exercise2/exercise2.v1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise2/exercise2.v1.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/00--optimization--preliminaries_and_compiler_usage.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/00--optimization--preliminaries_and_compiler_usage.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/01--Modern_architecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/01--Modern_architecture.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/02--optimization--cache.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/02--optimization--cache.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/03--optimization--branches.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/03--optimization--branches.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/05--optimization--loops-and-prefetching.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/05--optimization--loops-and-prefetching.pdf
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/Readme.md:
--------------------------------------------------------------------------------
1 | # Materials on serial code optimization
2 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/sort_2_arrays/compile:
--------------------------------------------------------------------------------
1 | gcc -march=native -I/scratch/Software/include -DUSE_PAPI -o branchpred2 branchpred2.c -lm -L/scratch/Software/lib -lpapi
2 | gcc -march=native -DBESMART -I/scratch/Software/include -DUSE_PAPI -o branchpred2_smart branchpred2.c -lm -L/scratch/Software/lib -lpapi
3 | gcc -march=native -DBESMART2 -I/scratch/Software/include -DUSE_PAPI -o branchpred2_smart2 branchpred2.c -lm -L/scratch/Software/lib -lpapi
4 | gcc -march=native -DBESMART3 -I/scratch/Software/include -DUSE_PAPI -o branchpred2_smart3 branchpred2.c -lm -L/scratch/Software/lib -lpapi
5 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/amonra.gen10/branchpred.besmart2.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of the exercises for the Lectures on
3 | * "Foundations of High Performance Computing"
4 | * given at
5 | * Master in HPC and
6 | * Master in Data Science and Scientific Computing
7 | * @ SISSA, ICTP and University of Trieste
8 | *
9 | * contact: luca.tornatore@inaf.it
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 |
26 |
27 | #include <stdio.h>
28 | #include <stdlib.h>
29 | #include <string.h>
30 | #include <time.h>   /* NOTE(review): header names reconstructed from calls used (printf, calloc, atoi, rand, clock_gettime) — the originals were stripped with their angle brackets during extraction; confirm against upstream repo */
31 |
32 |
33 | #define SIZE_DEFAULT 1000000
34 | #define TOP (2 << 20)
35 | #define PIVOT (TOP >> 2)
36 |
37 |
38 | #define TCPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \
39 | (double)ts.tv_nsec * 1e-9)
40 |
41 |
42 |
43 | int main(int argc, char **argv)
44 | {
45 | int SIZE;
46 | int *data;
47 | int cc, ii;
48 |
49 | long long sum = 0;
50 |
51 | struct timespec ts;
52 | double tstart, tstop;
53 |
54 | if(argc > 1)
55 | SIZE = atoi( *(argv+1) );
56 | else
57 | SIZE = SIZE_DEFAULT;
58 |
59 | // Generate data
60 | data = (int*)calloc(SIZE, sizeof(int));
61 | srand((int)(SIZE));
62 |
63 | for (cc = 0; cc < SIZE; cc++)
64 | data[cc] = rand() % TOP;
65 |
66 |
67 | tstart = TCPU_TIME;
68 |
69 | for (cc = 0; cc < 1000; cc++)
70 | {
71 | sum = 0;
72 | long long _sum_[4] = {0};
73 | for (ii = 0; ii < SIZE; ii+=4)
74 | {
75 | _sum_[0] += (data[ii]>PIVOT? data[ii] : 0);
76 | _sum_[1] += (data[ii+1]>PIVOT? data[ii+1] : 0);
77 | _sum_[2] += (data[ii+2]>PIVOT? data[ii+2] : 0);
78 | _sum_[3] += (data[ii+3]>PIVOT? data[ii+3] : 0);
79 | }
80 | sum += (_sum_[0] + _sum_[1]) + (_sum_[2] + _sum_[3]);
81 | }
82 |
83 | tstop = TCPU_TIME;
84 |
85 | #ifdef WOW
86 | tot_tstop = TCPU_TIME;
87 | #endif
88 |
89 | free(data);
90 |
91 | #if !defined(WOW)
92 | printf("\nsum is %llu, elapsed seconds: %g\n", sum, tstop - tstart);
93 |
94 | #else
95 | double tot_time = tot_tstop - tot_tstart;
96 | double loop_time = tstop - tstart;
97 | printf("\nsum is %llu, elapsed seconds: %g, %g in loop and %g in qsort\n",
98 | sum, tot_time, loop_time, tot_time - loop_time);
99 | #endif
100 |
101 | printf("\n");
102 | return 0;
103 | }
104 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/amonra.gen10/out.2:
--------------------------------------------------------------------------------
1 |
2 | 3.4191
3 |
4 |
5 | 3.57167
6 |
7 |
8 | 3.44099
9 |
10 |
11 | 4.17072
12 |
13 |
14 | 4.20686
15 |
16 |
17 | 3.64886
18 |
19 |
20 | 3.39921
21 |
22 |
23 | 4.78118
24 |
25 |
26 | 3.54926
27 |
28 |
29 | 3.52104
30 |
31 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/amonra.gen10/out.v:
--------------------------------------------------------------------------------
1 |
2 | sum is 9831544284110, elapsed seconds: 3.45677
3 |
4 |
5 | sum is 9831544284110, elapsed seconds: 3.80376
6 |
7 |
8 | sum is 9831544284110, elapsed seconds: 4.81135
9 |
10 |
11 | sum is 9831544284110, elapsed seconds: 3.60161
12 |
13 |
14 | sum is 9831544284110, elapsed seconds: 3.65025
15 |
16 |
17 | sum is 9831544284110, elapsed seconds: 3.68967
18 |
19 |
20 | sum is 9831544284110, elapsed seconds: 3.63842
21 |
22 |
23 | sum is 9831544284110, elapsed seconds: 3.63771
24 |
25 |
26 | sum is 9831544284110, elapsed seconds: 3.6503
27 |
28 |
29 | sum is 9831544284110, elapsed seconds: 3.64676
30 |
31 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.c:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of the exercises for the Lectures on
3 | * "Foundations of High Performance Computing"
4 | * given at
5 | * Master in HPC and
6 | * Master in Data Science and Scientific Computing
7 | * @ SISSA, ICTP and University of Trieste
8 | *
9 | * contact: luca.tornatore@inaf.it
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 |
26 |
27 | #include <stdio.h>
28 | #include <stdlib.h>
29 | #include <string.h>
30 | #include <time.h>   /* NOTE(review): header names reconstructed from calls used (printf, calloc, atoi, rand, qsort, clock_gettime) — the originals were stripped with their angle brackets during extraction; confirm against upstream repo */
31 |
32 |
33 | #define SIZE_DEFAULT 1000000
34 | #define TOP (2 << 20)
35 | #define PIVOT (TOP >> 2)
36 |
37 |
38 | #define TCPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \
39 | (double)ts.tv_nsec * 1e-9)
40 |
41 |
42 | #ifdef WOW
43 | int compare(const void *A, const void *B)
44 | {
45 | return *(int*)A - *(int*)B;
46 | }
47 | #endif
48 |
49 | int main(int argc, char **argv)
50 | {
51 | int SIZE;
52 | int *data;
53 | int cc, ii;
54 |
55 | #ifdef WOW
56 | double tot_tstart, tot_tstop;
57 | #endif
58 |
59 | long long sum = 0;
60 |
61 | struct timespec ts;
62 | double tstart, tstop;
63 |
64 | if(argc > 1)
65 | SIZE = atoi( *(argv+1) );
66 | else
67 | SIZE = SIZE_DEFAULT;
68 |
69 | // Generate data
70 | data = (int*)calloc(SIZE, sizeof(int));
71 | srand((int)(SIZE));
72 |
73 | for (cc = 0; cc < SIZE; cc++)
74 | data[cc] = rand() % TOP;
75 |
76 |
77 |
78 | #ifdef WOW
79 | tot_tstart = TCPU_TIME;
80 | // !!! With this, the next loop runs faster
81 | qsort(data, SIZE, sizeof(int), compare);
82 | #endif
83 |
84 |
85 | tstart = TCPU_TIME;
86 |
87 | for (cc = 0; cc < 1000; cc++)
88 | {
89 | sum = 0;
90 |
91 | for (ii = 0; ii < SIZE; ii++)
92 | {
93 | #if !defined( BESMART ) && !defined( BESMART2 )
94 | if (data[ii] > PIVOT)
95 | sum += data[ii];
96 |
97 | #elif defined( BESMART )
98 | unsigned int t = (data[ii] - PIVOT - 1) >> 31; // the additional -1 is for the case data[ii]==PIVOT
99 | sum += ~t & data[ii];
100 |
101 | #elif defined( BESMART2 )
102 | //sum += (data[ii]>PIVOT)*data[ii];
103 | sum += (data[ii]>PIVOT? data[ii] : 0);
104 | #endif
105 | }
106 | }
107 |
108 | tstop = TCPU_TIME;
109 |
110 | #ifdef WOW
111 | tot_tstop = TCPU_TIME;
112 | #endif
113 |
114 | free(data);
115 |
116 | #if !defined(WOW)
117 | printf("\nsum is %llu, elapsed seconds: %g\n", sum, tstop - tstart);
118 |
119 | #else
120 | double tot_time = tot_tstop - tot_tstart;
121 | double loop_time = tstop - tstart;
122 | printf("\nsum is %llu, elapsed seconds: %g, %g in loop and %g in qsort\n",
123 | sum, tot_time, loop_time, tot_time - loop_time);
124 | #endif
125 |
126 | printf("\n");
127 | return 0;
128 | }
129 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.c~:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is part of the exercises for the Lectures on
3 | * "Foundations of High Performance Computing"
4 | * given at
5 | * Master in HPC and
6 | * Master in Data Science and Scientific Computing
7 | * @ SISSA, ICTP and University of Trieste
8 | *
9 | * contact: luca.tornatore@inaf.it
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 |
25 |
26 |
27 | #include <stdio.h>
28 | #include <stdlib.h>
29 | #include <string.h>
30 | #include <time.h>   /* NOTE(review): header names reconstructed from calls used — the originals were stripped with their angle brackets during extraction; confirm against upstream repo */
31 |
32 |
33 | #define SIZE_DEFAULT 1000000
34 | #define TOP (2 << 20)
35 | #define PIVOT (TOP >> 2)
36 |
37 |
38 | #define TCPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \
39 | (double)ts.tv_nsec * 1e-9)
40 |
41 |
42 | #ifdef WOW
43 | int compare(const void *A, const void *B)
44 | {
45 | return *(int*)A - *(int*)B;
46 | }
47 | #endif
48 |
49 | int main(int argc, char **argv)
50 | {
51 | int SIZE;
52 | int *data;
53 | int cc, ii;
54 |
55 | #ifdef WOW
56 | double tot_tstart, tot_tstop;
57 | #endif
58 |
59 | long long sum = 0;
60 |
61 | struct timespec ts;
62 | double tstart, tstop;
63 |
64 | if(argc > 1)
65 | SIZE = atoi( *(argv+1) );
66 | else
67 | SIZE = SIZE_DEFAULT;
68 |
69 | // Generate data
70 | data = (int*)calloc(SIZE, sizeof(int));
71 | srand((int)(SIZE));
72 |
73 | for (cc = 0; cc < SIZE; cc++)
74 | data[cc] = rand() % TOP;
75 |
76 |
77 |
78 | #ifdef WOW
79 | tot_tstart = TCPU_TIME;
80 | // !!! With this, the next loop runs faster
81 | qsort(data, SIZE, sizeof(int), compare);
82 | #endif
83 |
84 |
85 | tstart = TCPU_TIME;
86 |
87 | for (cc = 0; cc < 1000; cc++)
88 | {
89 | sum = 0;
90 |
91 | for (ii = 0; ii < SIZE; ii++)
92 | {
93 | #if !defined( BESMART ) && !defined( BESMART2 )
94 | if (data[ii] > PIVOT)
95 | sum += data[ii];
96 |
97 | #elif defined( BESMART )
98 | unsigned int t = (data[ii] - PIVOT - 1) >> 31; // the additional -1 is for the case data[ii]==PIVOT
99 | sum += ~t & data[ii];
100 |
101 | #elif defined( BESMART2 )
102 | sum += (data[ii]>PIVOT)*data[ii];
103 | #endif
104 | }
105 | }
106 |
107 | tstop = TCPU_TIME;
108 |
109 | #ifdef WOW
110 | tot_tstop = TCPU_TIME;
111 | #endif
112 |
113 | free(data);
114 |
115 | #if !defined(WOW)
116 | printf("\nsum is %llu, elapsed seconds: %g\n", sum, tstop - tstart);
117 |
118 | #else
119 | double tot_time = tot_tstop - tot_tstart;
120 | double loop_time = tstop - tstart;
121 | printf("\nsum is %llu, elapsed seconds: %g, %g in loop and %g in qsort\n",
122 | sum, tot_time, loop_time, tot_time - loop_time);
123 | #endif
124 |
125 | printf("\n");
126 | return 0;
127 | }
128 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart2
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_a.v0.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // POSIX.1-2008/XSI feature-test macro: exposes srand48/drand48 and clock_gettime (it does not select the C11 standard)
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <time.h>
29 | #include <math.h>   /* NOTE(review): header names reconstructed (printf, calloc, srand48/drand48, memset, clock_gettime) — originals stripped with their angle brackets during extraction; the fifth header is uncertain, confirm against upstream repo */
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 | typedef struct node_t {
40 | double key;
41 | char data[DATASIZE];
42 | struct node_t *next;
43 | } node;
44 |
45 |
46 |
47 |
48 | #define N_default 10000
49 |
50 | int main( int argc, char **argv )
51 | {
52 | struct timespec ts;
53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID;
54 |
55 | // -------------------------------------
56 | // startup
57 |
58 | int N = N_default;
59 |
60 | if ( argc > 1 )
61 | N = atoi( *(argv+1) );
62 |
63 |
64 | // -------------------------------------
65 | // setup
66 |
67 | double *keys = (double*)calloc( N, sizeof(double));
68 | node *last = NULL;
69 | node *first = NULL;
70 |
71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
72 | srand48( time(NULL) );
73 |
74 | for( int nn = 0; nn < N; nn++ )
75 | {
76 | node *new = (node*)calloc( 1, sizeof(node) );
77 | if ( last != NULL )
78 | last->next = new;
79 | else
80 | first = new;
81 | new ->key = drand48();
82 | keys[nn] = new->key;
83 | new ->next = NULL;
84 | memset( new->data, 0, sizeof(char)*DATASIZE);
85 | last = new;
86 | }
87 |
88 |
89 | printf("now let's search for all of them\n"); fflush(stdout);
90 |
91 | int NSHOTS = N;
92 | double sum = 0;
93 |
94 | double tstart = CPU_TIME;
95 |
96 | for( int ii = 0; ii < NSHOTS; ii++ )
97 | {
98 | double key = keys[(int)(drand48() * N)];
99 | node *target = first;
100 |
101 | // this implementation is less efficient than
102 | // that in v1
103 | for ( int nn = 0; nn < N; nn++ )
104 | if ( target->key == key )
105 | sum += target->key;
106 | else
107 | target = target->next;
108 | }
109 |
110 | double et = CPU_TIME - tstart;
111 |
112 | printf("timing for %d shots: %g\n", NSHOTS, et );
113 |
114 | node *target = first;
115 | while( target->next != NULL )
116 | {
117 | node *tmp = target->next;
118 | free(target);
119 | target = tmp;
120 | }
121 |
122 | return 0;
123 | }
124 |
125 |
126 |
127 |
128 |
129 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_a.v1.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // POSIX.1-2008/XSI feature-test macro: exposes srand48/drand48 and clock_gettime (it does not select the C11 standard)
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <time.h>
29 | #include <math.h>   /* NOTE(review): header names reconstructed (printf, calloc, srand48/drand48, memset, clock_gettime) — originals stripped with their angle brackets during extraction; the fifth header is uncertain, confirm against upstream repo */
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 | typedef struct node_t {
40 | double key;
41 | char data[DATASIZE];
42 | struct node_t *next;
43 | } node;
44 |
45 |
46 |
47 |
48 | #define N_default 10000
49 |
50 | int main( int argc, char **argv )
51 | {
52 | struct timespec ts;
53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID;
54 |
55 | // -------------------------------------
56 | // startup
57 |
58 | int N = N_default;
59 |
60 | if ( argc > 1 )
61 | N = atoi( *(argv+1) );
62 |
63 |
64 | // -------------------------------------
65 | // setup
66 |
67 | double *keys = (double*)calloc( N, sizeof(double));
68 | node *last = NULL;
69 | node *first = NULL;
70 |
71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
72 | srand48( time(NULL) );
73 |
74 | for( int nn = 0; nn < N; nn++ )
75 | {
76 | node *new = (node*)calloc( 1, sizeof(node) );
77 | if ( last != NULL )
78 | last->next = new;
79 | else
80 | first = new;
81 | new ->key = drand48();
82 | keys[nn] = new->key;
83 | new ->next = NULL;
84 | memset( new->data, 0, sizeof(char)*DATASIZE);
85 | last = new;
86 | }
87 |
88 |
89 | printf("now let's search for all of them\n"); fflush(stdout);
90 |
91 | int NSHOTS = N;
92 | double sum = 0;
93 |
94 | double tstart = CPU_TIME;
95 |
96 | for( int ii = 0; ii < NSHOTS; ii++ )
97 | {
98 | double key = keys[(int)(drand48() * N)];
99 | node *target = first;
100 |
101 | while ( target->key != key )
102 | target = target->next;
103 | sum += target->key;
104 | }
105 |
106 | double et = CPU_TIME - tstart;
107 |
108 | printf("timing for %d shots: %g\n", NSHOTS, et );
109 |
110 | node *target = first;
111 | while( target->next != NULL )
112 | {
113 | node *tmp = target->next;
114 | free(target);
115 | target = tmp;
116 | }
117 |
118 | return 0;
119 | }
120 |
121 |
122 |
123 |
124 |
125 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_b.v0.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // POSIX.1-2008/XSI feature-test macro: exposes srand48/drand48 and clock_gettime (it does not select the C11 standard)
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <time.h>
29 | #include <math.h>   /* NOTE(review): header names reconstructed (printf, calloc, srand48/drand48, memset, clock_gettime) — originals stripped with their angle brackets during extraction; the fifth header is uncertain, confirm against upstream repo */
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 | typedef struct node_t {
40 | double key;
41 | struct node_t *next;
42 | char data[DATASIZE];
43 | } node;
44 |
45 |
46 |
47 |
48 | #define N_default 10000
49 |
50 | int main( int argc, char **argv )
51 | {
52 | struct timespec ts;
53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; // per-process CPU-time clock, read by CPU_TIME
54 |
55 | // -------------------------------------
56 | // startup
57 |
58 | int N = N_default;
59 |
60 | if ( argc > 1 )
61 | N = atoi( *(argv+1) ); // NOTE(review): atoi gives 0 on junk input; N <= 0 is not rejected
62 |
63 |
64 | // -------------------------------------
65 | // setup : build a singly-linked list of N nodes with random keys
66 |
67 | double *keys = (double*)calloc( N, sizeof(double));
68 | node *last = NULL;
69 | node *first = NULL;
70 |
71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
72 | srand48( time(NULL) );
73 |
74 | for( int nn = 0; nn < N; nn++ )
75 | {
76 | node *new = (node*)calloc( 1, sizeof(node) );
77 | if ( last != NULL )
78 | last->next = new;
79 | else
80 | first = new;
81 | new ->key = drand48();
82 | keys[nn] = new->key; // remember the key so the search below always succeeds
83 | new ->next = NULL;
84 | memset( new->data, 0, sizeof(char)*DATASIZE); // touch the cold payload once
85 | last = new;
86 | }
87 |
88 |
89 | printf("now let's search for all of them\n"); fflush(stdout);
90 |
91 | int NSHOTS = N;
92 | double sum = 0;
93 |
94 | double tstart = CPU_TIME;
95 |
96 | for( int ii = 0; ii < NSHOTS; ii++ )
97 | {
98 | double key = keys[(int)(drand48() * N)]; // pick a key that is guaranteed to exist
99 | node *target = first;
100 |
101 | // this implementation is less efficient than
102 | // that in v1: it always walks all N nodes
103 | for ( int nn = 0; nn < N; nn++ )
104 | if ( target->key == key )
105 | sum += target->key;
106 | else
107 | target = target->next;
108 | }
109 |
110 | double et = CPU_TIME - tstart;
111 |
112 | printf("timing for %d shots: %g\n", NSHOTS, et );
113 |
114 | node *target = first; // teardown: free every node (the old loop leaked the last one and crashed on an empty list)
115 | while( target != NULL )
116 | {
117 | node *tmp = target->next;
118 | free(target);
119 | target = tmp;
120 | }
121 | free(keys); // was leaked before
122 | return 0;
123 | }
124 |
125 |
126 |
127 |
128 |
129 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_b.v1.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <math.h>
29 | #include <time.h>
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 | typedef struct node_t {
40 | double key; // hot field: read on every search step
41 | struct node_t *next; // hot field: traversal pointer
42 | char data[DATASIZE]; // cold payload embedded in the node (DATASIZE=200 by default, so each node spans several cache lines)
43 | } node;
44 |
45 |
46 |
47 |
48 | #define N_default 10000
49 |
50 | int main( int argc, char **argv )
51 | {
52 | struct timespec ts;
53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; // per-process CPU-time clock, read by CPU_TIME
54 |
55 | // -------------------------------------
56 | // startup
57 |
58 | int N = N_default;
59 |
60 | if ( argc > 1 )
61 | N = atoi( *(argv+1) ); // NOTE(review): atoi gives 0 on junk input; N <= 0 is not rejected
62 |
63 |
64 | // -------------------------------------
65 | // setup : build a singly-linked list of N nodes with random keys
66 |
67 | double *keys = (double*)calloc( N, sizeof(double));
68 | node *last = NULL;
69 | node *first = NULL;
70 |
71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
72 | srand48( time(NULL) );
73 |
74 | for( int nn = 0; nn < N; nn++ )
75 | {
76 | node *new = (node*)calloc( 1, sizeof(node) );
77 | if ( last != NULL )
78 | last->next = new;
79 | else
80 | first = new;
81 | new ->key = drand48();
82 | keys[nn] = new->key; // remember the key so the search below always succeeds
83 | new ->next = NULL;
84 | memset( new->data, 0, sizeof(char)*DATASIZE); // touch the cold payload once
85 | last = new;
86 | }
87 |
88 |
89 | printf("now let's search for all of them\n"); fflush(stdout);
90 |
91 | int NSHOTS = N;
92 | double sum = 0;
93 |
94 | double tstart = CPU_TIME;
95 |
96 | for( int ii = 0; ii < NSHOTS; ii++ )
97 | {
98 | double key = keys[(int)(drand48() * N)]; // pick a key that is guaranteed to exist
99 | node *target = first;
100 |
101 | while ( target->key != key ) // stops at the first match; v0 always walks all N nodes
102 | target = target->next;
103 | sum += target->key;
104 | }
105 |
106 | double et = CPU_TIME - tstart;
107 |
108 | printf("timing for %d shots: %g\n", NSHOTS, et );
109 |
110 | node *target = first; // teardown: free every node (the old loop leaked the last one and crashed on an empty list)
111 | while( target != NULL )
112 | {
113 | node *tmp = target->next;
114 | free(target);
115 | target = tmp;
116 | }
117 | free(keys); // was leaked before
118 | return 0;
119 | }
120 |
121 |
122 |
123 |
124 |
125 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_c.v0.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <math.h>
29 | #include <time.h>
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 |
40 | typedef struct node_t {
41 | double key; // hot field: read on every search step
42 | struct node_t *next; // hot field: traversal pointer
43 | void *data; // cold payload moved out-of-line (points into a shared arena); node stays small
44 | } node;
45 |
46 |
47 |
48 |
49 | #define N_default 10000
50 |
51 | int main( int argc, char **argv )
52 | {
53 | struct timespec ts;
54 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; // per-process CPU-time clock, read by CPU_TIME
55 |
56 | // -------------------------------------
57 | // startup
58 |
59 | int N = N_default;
60 |
61 | if ( argc > 1 )
62 | N = atoi( *(argv+1) ); // NOTE(review): atoi gives 0 on junk input; N <= 0 is not rejected
63 |
64 |
65 | // -------------------------------------
66 | // setup : nodes keep only hot fields; payloads live in one separate arena
67 |
68 | double *keys = (double*)calloc( N, sizeof(double));
69 | char *alldata = (char*)calloc( DATASIZE*N, sizeof(char)); // arena for the cold payloads
70 | node *last = NULL;
71 | node *first = NULL;
72 |
73 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
74 | srand48( time(NULL) );
75 |
76 | for( int nn = 0; nn < N; nn++ )
77 | {
78 | node *new = (node*)calloc( 1, sizeof(node) );
79 | if ( last != NULL )
80 | last->next = new;
81 | else
82 | first = new;
83 | new ->key = drand48();
84 | keys[nn] = new->key; // remember the key so the search below always succeeds
85 | new ->next = NULL;
86 | new ->data = alldata + DATASIZE*nn; // node points into the arena instead of embedding data
87 | memset( new->data, 0, sizeof(char)*DATASIZE);
88 | last = new;
89 | }
90 |
91 |
92 | printf("now let's search for all of them\n"); fflush(stdout);
93 |
94 | int NSHOTS = N;
95 | double sum = 0;
96 |
97 | double tstart = CPU_TIME;
98 |
99 | for( int ii = 0; ii < NSHOTS; ii++ )
100 | {
101 | double key = keys[(int)(drand48() * N)]; // pick a key that is guaranteed to exist
102 | node *target = first;
103 |
104 | // this implementation is less efficient than
105 | // that in v1
106 | for ( int nn = 0; nn < N; nn++ )
107 | if ( target->key == key )
108 | sum += target->key;
109 | else
110 | target = target->next;
111 | }
112 |
113 | double et = CPU_TIME - tstart;
114 |
115 | printf("timing for %d shots: %g\n", NSHOTS, et );
116 |
117 | node *target = first; // teardown: free every node (the old loop leaked the last one and crashed on an empty list)
118 | while( target != NULL )
119 | {
120 | node *tmp = target->next;
121 | free(target);
122 | target = tmp;
123 | }
124 | free(keys); free(alldata); // both were leaked before
125 | return 0;
126 | }
127 |
128 |
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_c.v1.c:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * This file is part of the exercises for the Lectures on
4 | * "Foundations of High Performance Computing"
5 | * given at
6 | * Master in HPC and
7 | * Master in Data Science and Scientific Computing
8 | * @ SISSA, ICTP and University of Trieste
9 | * 2019
10 | *
11 | * This is free software; you can redistribute it and/or modify
12 | * it under the terms of the GNU General Public License as published by
13 | * the Free Software Foundation; either version 3 of the License, or
14 | * (at your option) any later version.
15 | * This code is distributed in the hope that it will be useful,
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | * GNU General Public License for more details.
19 | *
20 | * You should have received a copy of the GNU General Public License
21 | * along with this program. If not, see
22 | */
23 |
24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard
25 | #include <stdio.h>
26 | #include <stdlib.h>
27 | #include <string.h>
28 | #include <math.h>
29 | #include <time.h>
30 |
31 |
32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \
33 | (double)ts.tv_nsec * 1e-9)
34 |
35 | #ifndef DATASIZE
36 | #define DATASIZE 200
37 | #endif
38 |
39 |
40 | typedef struct node_t {
41 | double key; // hot field: read on every search step
42 | struct node_t *next; // hot field: traversal pointer
43 | void *data; // cold payload moved out-of-line (points into a shared arena); node stays small
44 | } node;
45 |
46 |
47 |
48 |
49 | #define N_default 10000
50 |
51 | int main( int argc, char **argv )
52 | {
53 | struct timespec ts;
54 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; // per-process CPU-time clock, read by CPU_TIME
55 |
56 | // -------------------------------------
57 | // startup
58 |
59 | int N = N_default;
60 |
61 | if ( argc > 1 )
62 | N = atoi( *(argv+1) ); // NOTE(review): atoi gives 0 on junk input; N <= 0 is not rejected
63 |
64 |
65 | // -------------------------------------
66 | // setup : nodes keep only hot fields; payloads live in one separate arena
67 |
68 | double *keys = (double*)calloc( N, sizeof(double));
69 | char *alldata = (char*)calloc( DATASIZE*N, sizeof(char)); // arena for the cold payloads
70 | node *last = NULL;
71 | node *first = NULL;
72 |
73 | printf("creating and initializing %d nodes\n", N ); fflush(stdout);
74 | srand48( time(NULL) );
75 |
76 | for( int nn = 0; nn < N; nn++ )
77 | {
78 | node *new = (node*)calloc( 1, sizeof(node) );
79 | if ( last != NULL )
80 | last->next = new;
81 | else
82 | first = new;
83 | new ->key = drand48();
84 | keys[nn] = new->key; // remember the key so the search below always succeeds
85 | new ->next = NULL;
86 | new ->data = alldata + DATASIZE*nn; // node points into the arena instead of embedding data
87 | memset( new->data, 0, sizeof(char)*DATASIZE);
88 | last = new;
89 | }
90 |
91 |
92 | printf("now let's search for all of them\n"); fflush(stdout);
93 |
94 | int NSHOTS = N;
95 | double sum = 0;
96 |
97 | double tstart = CPU_TIME;
98 |
99 | for( int ii = 0; ii < NSHOTS; ii++ )
100 | {
101 | double key = keys[(int)(drand48() * N)]; // pick a key that is guaranteed to exist
102 | node *target = first;
103 |
104 | while ( target->key != key ) // stops at the first match; v0 always walks all N nodes
105 | target = target->next;
106 | sum += target->key;
107 | }
108 |
109 | double et = CPU_TIME - tstart;
110 |
111 | printf("timing for %d shots: %g\n", NSHOTS, et );
112 |
113 | node *target = first; // teardown: free every node (the old loop leaked the last one and crashed on an empty list)
114 | while( target != NULL )
115 | {
116 | node *tmp = target->next;
117 | free(target);
118 | target = tmp;
119 | }
120 | free(keys); free(alldata); // both were leaked before
121 | return 0;
122 | }
123 |
124 |
125 |
126 |
127 |
128 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/matrix_transpose/transpose_by_blocks/matrix_transpose_blocks.v3.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_cache/matrix_transpose/transpose_by_blocks/matrix_transpose_blocks.v3.c
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/matrix_transpose/transpose_by_blocks/mypapi.h:
--------------------------------------------------------------------------------
1 |
2 |
3 | #if defined(USE_PAPI) // -----------------------------------------------------------
4 | #include <papi.h>
5 |
6 | typedef unsigned long long int uLint;
7 |
8 | #define PAPI_EVENTS_NUM 4
9 | int papi_events[PAPI_EVENTS_NUM] = {PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_DCM, PAPI_L2_DCM };
10 | int papi_EventSet = PAPI_NULL; // the handle for the events' set
11 | uLint papi_buffer[PAPI_EVENTS_NUM] = {0}; // storage for the counters' values
12 | uLint papi_values[PAPI_EVENTS_NUM] = {0}; // accumulate the counters' values
13 |
14 | // check that PAPI is OK, exit if not
15 | #define PAPI_CHECK( R ) { \
16 | if ( (R) != PAPI_OK ) { \
17 | printf("a problem with PAPI (code %d) arise at line %d\n", \
18 | (R), __LINE__);fflush(stdout); return (R); }}
19 |
20 |
21 | // check that PAPI is OK,
22 | // issue a warning if not with a
23 | // provided message
24 | #define PAPI_WARN( R, S ) { \
25 | if ( (R) != PAPI_OK ) { \
26 | printf("a problem with PAPI (code %d) arise at line %d: %s\n", \
27 | (R), __LINE__, (S)); fflush(stdout); }}
28 |
29 | // check that PAPI is OK about an event
30 | // issue a warning if not with a
31 | // provided message
32 | #define PAPI_WARN_EVENT( R, E, S1, n ) { \
33 | if ( (R) != PAPI_OK ) { \
34 | printf("a problem with PAPI (code %d) : event %d arise at line %d: %s (%d)\n", \
35 | (R), (E), __LINE__, (S1), (n)); fflush(stdout); }}
36 |
37 |
38 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \
39 | retval = PAPI_query_event(papi_events[i]); \
40 | if ( retval == PAPI_OK ) { \
41 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \
42 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \
43 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } }
44 |
45 | #define PAPI_INIT { \
46 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \
47 | if (retval != PAPI_VER_CURRENT) \
48 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \
49 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \
50 | PAPI_ADD_EVENTS_to_SET; }
51 |
52 | // to use HIGH-LEVEL API
53 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
54 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
55 |
56 | // to use NORMAL API
57 | #define PAPI_START_CNTR { \
58 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); }
59 |
60 | #define PAPI_STOP_CNTR { \
61 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \
62 | if( retval == PAPI_OK ) { \
63 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
64 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); }
65 |
66 | #define PAPI_GET_CNTR( i ) ( papi_values[(i)] )
67 |
68 | #define PAPI_FLUSH_BUFFER { \
69 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
70 | papi_buffer[ jj] = 0; }
71 |
72 | #define PAPI_FLUSH { \
73 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
74 | papi_values[jj] = papi_buffer[ jj] = 0; }
75 |
76 |
77 | #else // -----------------------------------------------------------
78 |
79 | #define PAPI_EVENTS_NUM 0
80 | #define PAPI_INIT
81 | #define PAPI_START_CNTR
82 | #define PAPI_STOP_CNTR
83 | #define PAPI_FLUSH
84 | #define PAPI_GET_CNTR( i ) 0
85 |
86 | #endif // -----------------------------------------------------------
87 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/Makefile:
--------------------------------------------------------------------------------
1 |
2 | COMPILER=gcc
3 |
4 | ifeq ($(COMPILER),gcc)
5 | CC = gcc
6 | CFLAGS = -Wall -O3 -march=native -ftree-vectorize -lm -D__i686__
7 | SUFFIX = .gcc
8 | LIBM = -lm
9 | endif
10 |
11 | ifeq ($(COMPILER),icc)
12 | CC = icc
13 | CFLAGS = -Wall -O3 -fast -axSSE4.2 -xHost -ipo
14 | SUFFIX = .icc
15 | LIBM =
16 | endif
17 |
18 | ifeq ($(COMPILER),pgcc)
19 | CC = pgcc
20 | CFLAGS = -Wall -O4 -fast -Munroll -Mvect=simd,fuse,tile -Mipa -lm
21 | SUFFIX = .pgcc
22 | LIBM = -lm
23 | endif
24 |
25 | mountain: mountain.c fcyc2.c clock.c
26 | $(CC) $(CFLAGS) -o mountain$(SUFFIX) mountain.c fcyc2.c clock.c $(LIBM)
27 |
28 | clean:
29 | rm -f mountain$(SUFFIX) *.o *~
30 |
31 |
32 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/README:
--------------------------------------------------------------------------------
1 | This directory contains code for generating a memory mountain, as
2 | described in Computer Systems: A Programmer's Perspective
3 |
4 | clock.{c,h} - routines for using x86 and Alpha cycle timers
5 | fcyc2.{c,h} - routines that estimate the number of cycles required
6 | by a function f that takes two arguments.
7 | Makefile - memory mountain makefile
8 | mountain.c - program that generates the memory mountain.
9 |
10 | (1) set the compiler at the top of Makefile
11 | (2) invoke make
12 | (3) execute the mountain.$COMPILER
13 | (4) copy the output in a file named mountain.dat
14 | (5) use plotmountain.gp to plot the data using gnuplot
15 | type 'load "plotmountain.gp"' from inside gnuplot
16 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/clock.h:
--------------------------------------------------------------------------------
1 | /* Routines for using cycle counter */
2 |
3 | /* Start the counter */
4 | void start_counter();
5 |
6 | /* Get # cycles since counter started */
7 | double get_counter();
8 |
9 |
10 | /* Measure overhead for counter */
11 | double ovhd();
12 |
13 | /* Determine clock rate of processor */
14 | double mhz(int verbose);
15 |
16 | /* Determine clock rate of processor, having more control over accuracy */
17 | double mhz_full(int verbose, int sleeptime);
18 |
19 | /** Special counters that compensate for timer interrupt overhead */
20 |
21 | void start_comp_counter();
22 |
23 | double get_comp_counter();
24 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/fcyc2.h:
--------------------------------------------------------------------------------
1 | /* Find number of cycles used by function that takes 2 arguments */
2 |
3 | /* Function to be tested takes two integer arguments */
4 | typedef int (*test_funct)(int, int);
5 |
6 | /* Compute time used by function f */
7 | double fcyc2(test_funct f, int param1, int param2, int clear_cache);
8 |
9 | /********* These routines are used to help with the analysis *********/
10 |
11 | /*
12 | Parameters:
13 | k: How many samples must be within epsilon for convergence
14 | epsilon: What is tolerance
15 | maxsamples: How many samples until give up?
16 | */
17 |
18 | /* Full version of fcyc with control over parameters */
19 | double fcyc2_full(test_funct f, int param1, int param2, int clear_cache,
20 | int k, double epsilon, int maxsamples, int compensate);
21 |
22 | /* Get current minimum */
23 | double get_min();
24 |
25 | /* What is convergence status for k minimum measurements within epsilon
26 | Returns 0 if not converged, #samples if converged, and -1 if can't
27 | reach convergence
28 | */
29 |
30 | int has_converged(int k, double epsilon, int maxsamples);
31 |
32 | /* What is error of current measurement */
33 | double err(int k);
34 |
35 | /************* Try other clocking methods *****************/
36 |
37 | /* Full version that uses the time of day clock */
38 | double fcyc2_full_tod(test_funct f, int param1, int param2, int clear_cache,
39 | int k, double epsilon, int maxsamples, int compensate);
40 |
41 | double fcyc2_tod(test_funct f, int param1, int param2, int clear_cache);
42 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/mountain.c:
--------------------------------------------------------------------------------
1 | /* mountain.c - Generate the memory mountain. */
2 | /* $begin mountainmain */
3 | #include <stdio.h>
4 | #include <stdlib.h>
5 | #include <math.h>
6 | #include "fcyc2.h" /* measurement routines */
7 | #include "clock.h" /* routines to access the cycle counter */
8 |
9 | #define MINBYTES (1 << 14) /* First working set size */
10 | #define MAXBYTES (1 << 27) /* Last working set size */
11 | #define MAXSTRIDE 15 /* Stride x8 bytes */
12 | #define MAXELEMS MAXBYTES/sizeof(long)
13 |
14 |
15 | long data[MAXELEMS]; /* The global array we'll be traversing */
16 |
17 |
18 | void init_data(long *data, int n);
19 | int test(int elems, int stride);
20 | double run(int size, int stride, double Mhz);
21 |
22 | /* $begin mountainmain */
23 | int main()
24 | {
25 | int size; /* Working set size (in bytes) */
26 | int stride; /* Stride (in array elements) */
27 | double Mhz; /* Clock frequency */
28 |
29 | init_data(data, MAXELEMS); /* Initialize each element in data */
30 | Mhz = mhz(0); /* Estimate the clock frequency */
31 |
32 |
33 | printf("# Clock frequency is approx. %.1f MHz\n", Mhz);
34 | printf("# Memory mountain (MB/sec)\n");
35 |
36 |
37 | printf("%d\t", MAXSTRIDE); /* header row: column count, then each stride value */
38 | for (stride = 1; stride <= MAXSTRIDE; stride++)
39 | printf("%d\t", stride);
40 |
41 | printf("\n");
42 |
43 | /* begin mountainmain */
44 | for (size = MAXBYTES; size >= MINBYTES; size >>= 1) /* halve the working set each row */
45 | {
46 | int log2size_kb = (int)(log2((double)size / 1024.0)); /* row label: log2 of size in KB */
47 | printf("%d\t", log2size_kb);
48 |
49 | for (stride = 1; stride <= MAXSTRIDE; stride++)
50 | printf("%.0f\t", run(size, stride, Mhz)); /* one throughput sample per (size, stride) */
51 |
52 | printf("\n");
53 | }
54 | exit(0);
55 | }
56 |
57 |
58 | /* init_data - initializes the array */
59 | void init_data(long *data, int n)
60 | {
61 | int i;
62 |
63 | for (i = 0; i < n; i++)
64 | data[i] = i; /* values are irrelevant for the benchmark; only the reads matter */
65 | }
66 |
67 | /* $begin mountainfuns */
68 | /* test - Iterate over first "elems" elements of array "data" with
69 | * stride of "stride", using 4x4 loop unrolling.
70 | */
71 | int test(int elems, int stride)
72 | {
73 | long i, sx2 = stride*2, sx3 = stride*3, sx4 = stride*4;
74 | long acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0; /* 4 accumulators break the add dependency chain */
75 | long length = elems;
76 | long limit = length - sx4; /* last start index at which a full 4-way step fits */
77 |
78 | /* Combine 4 elements at a time */
79 | for (i = 0; i < limit; i += sx4) {
80 | acc0 = acc0 + data[i];
81 | acc1 = acc1 + data[i+stride];
82 | acc2 = acc2 + data[i+sx2];
83 | acc3 = acc3 + data[i+sx3];
84 | }
85 |
86 | /* Finish any remaining elements */
87 | for (; i < length; i++) {
88 | acc0 = acc0 + data[i];
89 | }
90 | return ((acc0 + acc1) + (acc2 + acc3));
91 | }
92 |
93 | /* run - Run test(elems, stride) and return read throughput (MB/s).
94 | * "size" is in bytes, "stride" is in array elements, and Mhz is
95 | * CPU clock frequency in Mhz.
96 | */
97 | double run(int size, int stride, double Mhz)
98 | {
99 | double cycles;
100 | int elems = size / sizeof(long); /* data[] holds long; was sizeof(double) (same size on LP64, but the wrong type) */
101 |
102 | test(elems, stride); /* Warm up the cache */ //line:mem:warmup
103 | cycles = fcyc2(test, elems, stride, 0); /* Call test(elems,stride) */ //line:mem:fcyc
104 | return (size / stride) / (cycles / Mhz); /* Convert cycles to MB/s */ //line:mem:bwcompute
105 | }
106 | /* $end mountainfuns */
107 |
108 |
109 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/mountain.gcc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/mountain.gcc
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/plotmountain.gp:
--------------------------------------------------------------------------------
1 | set samples 100
2 | set isosamples 100
3 | set xyplane 0
4 |
5 | set xlabel "STRIDES" font ", 16"
6 | set ylabel "SIZE (KB, log_2)" font ", 16"
7 | set zlabel "MBs/sec" offset -3, 0 font ",16" rotate parallel
8 |
9 | set tics font ", 12"
10 |
11 | set pm3d
12 | splot [:17][4:17] "mountain.dat" u 1:2:3 matrix nonuniform with lines lc 0 notitle
13 |
14 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/Makefile:
--------------------------------------------------------------------------------
1 |
2 | COMPILER=gcc
3 |
4 | ifeq ($(COMPILER),gcc)
5 | CC = gcc
6 | CFLAGS = -Wall -O3 -march=native -ftree-vectorize -lm
7 | SUFFIX = .gcc
8 | LIBM = -lm
9 | endif
10 |
11 | ifeq ($(COMPILER),icc)
12 | CC = icc
13 | CFLAGS = -Wall -O3 -fast -axSSE4.2 -xHost -ipo
14 | SUFFIX = .icc
15 | LIBM =
16 | endif
17 |
18 | ifeq ($(COMPILER),pgcc)
19 | CC = pgcc
20 | CFLAGS = -Wall -O4 -fast -Munroll -Mvect=simd,fuse,tile -Mipa -lm
21 | SUFFIX = .pgcc
22 | LIBM = -lm
23 | endif
24 |
25 | mountain: mountain.c fcyc2.c
26 | $(CC) $(CFLAGS) -o mountain$(SUFFIX) mountain.c fcyc2.c $(LIBM)
27 |
28 | clean:
29 | rm -f mountain$(SUFFIX) *.o *~
30 |
31 |
32 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/fcyc2.c:
--------------------------------------------------------------------------------
1 | /* Compute time used by a function f that takes two integer args */
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | #include <time.h>
6 |
7 | #include "fcyc2.h"
8 |
9 | #define CPU_TIME ({struct timespec ts; clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), \
10 | (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;})
11 |
12 | static double *values = NULL;
13 | int samplecount = 0;
14 |
15 | #define KEEP_VALS 1
16 | #define KEEP_SAMPLES 1
17 |
18 | #if KEEP_SAMPLES
19 | double *samples = NULL;
20 | #endif
21 |
22 |
23 | static void init_sampler(int k, int maxsamples) /* reset the k-smallest tracker (and the sample log) */
24 | {
25 | if (values)
26 | free(values);
27 | values = calloc(k, sizeof(double)); /* the k smallest timings, kept sorted ascending */
28 | #if KEEP_SAMPLES
29 | if (samples)
30 | free(samples);
31 | /* Allocate extra for wraparound analysis */
32 | samples = calloc(maxsamples+k, sizeof(double));
33 | #endif
34 | samplecount = 0;
35 | }
36 |
37 |
38 | /* Add new sample: keep the k smallest timings sorted ascending in values[]. */
39 | void add_sample(double val, int k)
40 | {
41 | int pos = 0;
42 | if (samplecount < k) {
43 | pos = samplecount; /* still filling: append */
44 | values[pos] = val;
45 | } else if (val < values[k-1]) {
46 | pos = k-1; /* beats the current k-th smallest: replace it */
47 | values[pos] = val;
48 | }
49 | #if KEEP_SAMPLES
50 | samples[samplecount] = val; /* full history, kept for later analysis */
51 | #endif
52 | samplecount++;
53 | /* Insertion sort */
54 | while (pos > 0 && values[pos-1] > values[pos]) {
55 | double temp = values[pos-1];
56 | values[pos-1] = values[pos];
57 | values[pos] = temp;
58 | pos--;
59 | }
60 | }
61 |
62 | /* Get current minimum */
63 | double get_min()
64 | {
65 | return values[0]; /* values[] is kept sorted ascending by add_sample() */
66 | }
67 |
68 | /* What is relative error for kth smallest sample */
69 | double err(int k)
70 | {
71 | if (samplecount < k)
72 | return 1000.0; /* not enough samples yet: report a huge error */
73 | return (values[k-1] - values[0])/values[0]; /* spread of the k smallest, relative to the minimum */
74 | }
75 |
76 | /* Have k minimum measurements converged within epsilon? */
77 | int has_converged(int k_arg, double epsilon_arg, int maxsamples)
78 | {
79 | if ((samplecount >= k_arg) &&
80 | ((1 + epsilon_arg)*values[0] >= values[k_arg-1])) /* k smallest all within epsilon of the min */
81 | return samplecount; /* converged: report how many samples it took */
82 | if ((samplecount >= maxsamples))
83 | return -1; /* gave up */
84 | return 0; /* keep sampling */
85 | }
86 |
87 | /* Code to clear cache */
88 | #define ASIZE (1 << 20)
89 | #define STRIDE 8
90 | static int stuff[ASIZE];
91 | static int sink;
92 |
93 | static void clear()
94 | {
95 | int x = sink; /* read and write the global 'sink' so the compiler cannot drop the loop */
96 | int i;
97 | for (i = 0; i < ASIZE; i += STRIDE)
98 | x += stuff[i]; /* walk a 4 MiB static array to evict cached benchmark data */
99 | sink = x;
100 | }
101 |
102 | double fcyc2_full(test_funct f, int param1, int param2, int clear_cache,
103 | int k, double epsilon, int maxsamples, int compensate)
104 | {
105 | /* Time f(param1,param2) repeatedly until the k smallest timings agree
106 | within epsilon (or maxsamples is hit); return the smallest timing.
107 | NOTE(review): the original had two byte-identical branches on
108 | 'compensate'; the flag had (and still has) no effect, so the
109 | duplicated branch is folded into a single loop. */
110 | double result;
111 | init_sampler(k, maxsamples);
112 | do {
113 | if (clear_cache)
114 | clear(); /* evict cached data so each sample starts cold */
115 | f(param1, param2); /* warm cache */
116 | double tstart = CPU_TIME;
117 | f(param1, param2);
118 | tstart = CPU_TIME - tstart; /* elapsed seconds for one timed call */
119 | add_sample(tstart, k);
120 | } while (!has_converged(k, epsilon, maxsamples) && samplecount < maxsamples);
121 | #ifdef DEBUG
122 | {
123 | int i;
124 | printf(" %d smallest values: [", k);
125 | for (i = 0; i < k; i++)
126 | printf("%.0f%s", values[i], i==k-1 ? "]\n" : ", ");
127 | }
128 | #endif
129 | result = values[0];
130 | #if !KEEP_VALS
131 | free(values);
132 | values = NULL;
133 | #endif
134 | return result;
135 | }
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 | double fcyc2(test_funct f, int param1, int param2, int clear_cache)
146 | {
147 | return fcyc2_full(f, param1, param2, clear_cache, 3, 0.01, 500, 0); /* k=3, epsilon=1%, up to 500 samples, no compensation */
148 | }
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/fcyc2.h:
--------------------------------------------------------------------------------
1 | /* Find number of cycles used by function that takes 2 arguments */
2 |
3 | /* Function to be tested takes two integer arguments */
4 | typedef int (*test_funct)(int, int);
5 |
6 | /* Compute time used by function f */
7 | double fcyc2(test_funct f, int param1, int param2, int clear_cache);
8 |
9 | /********* These routines are used to help with the analysis *********/
10 |
11 | /*
12 | Parameters:
13 | k: How many samples must be within epsilon for convergence
14 | epsilon: What is tolerance
15 | maxsamples: How many samples until give up?
16 | */
17 |
18 | /* Full version of fcyc with control over parameters */
19 | double fcyc2_full(test_funct f, int param1, int param2, int clear_cache,
20 | int k, double epsilon, int maxsamples, int compensate);
21 |
22 | /* Get current minimum */
23 | double get_min();
24 |
25 | /* What is convergence status for k minimum measurements within epsilon
26 | Returns 0 if not converged, #samples if converged, and -1 if can't
27 | reach convergence
28 | */
29 |
30 | int has_converged(int k, double epsilon, int maxsamples);
31 |
32 | /* What is error of current measurement */
33 | double err(int k);
34 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/mountain.c:
--------------------------------------------------------------------------------
1 | /* mountain.c - Generate the memory mountain. */
2 | /* $begin mountainmain */
3 | #include <stdio.h>
4 | #include <stdlib.h>
5 | #include <math.h>
6 | #include "fcyc2.h"
7 |
8 |
9 | #define MINBYTES (1 << 14) /* First working set size */
10 | #define MAXBYTES (1 << 27) /* Last working set size */
11 | #define MAXSTRIDE 15 /* Stride x8 bytes */
12 | #define MAXELEMS MAXBYTES/sizeof(long)
13 |
14 |
15 | long data[MAXELEMS]; /* The global array we'll be traversing */
16 |
17 | void init_data(long *data, int n);
18 | int test(int elems, int stride);
19 | double run(int size, int stride);
20 |
21 |
22 | int main()
23 | {
24 | int size; /* Working set size (in bytes) */
25 | int stride; /* Stride (in array elements) */
26 |
27 | init_data(data, MAXELEMS); /* Initialize each element in data */
28 |
29 | printf("# Memory mountain (MB/sec)\n");
30 |
31 |
32 | printf("%d\t", MAXSTRIDE);
33 | for (stride = 1; stride <= MAXSTRIDE; stride++)
34 | printf("%d\t", stride);
35 |
36 | printf("\n");
37 |
38 | /* begin mountainmain */
39 | for (size = MAXBYTES; size >= MINBYTES; size >>= 1)
40 | {
41 | int log2size_kb = (int)(log2((double)size / 1024.0));
42 | printf("%d\t", log2size_kb);
43 |
44 | for (stride = 1; stride <= MAXSTRIDE; stride++)
45 | printf("%.0f\t", run(size, stride));
46 |
47 | printf("\n");
48 | }
49 | exit(0);
50 | }
51 |
52 |
53 | /* init_data - initializes the array */
54 | void init_data(long *data, int n)
55 | {
56 | int i;
57 |
58 | for (i = 0; i < n; i++)
59 | data[i] = i;
60 | }
61 |
62 | /* $begin mountainfuns */
63 | /* test - Iterate over first "elems" elements of array "data" with
64 | * stride of "stride", using 4x4 loop unrolling.
65 | */
66 | int test(int elems, int stride)
67 | {
68 | long i, sx2 = stride*2, sx3 = stride*3, sx4 = stride*4;
69 | long acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
70 | long length = elems;
71 | long limit = length - sx4;
72 |
73 | /* Combine 4 elements at a time */
74 | for (i = 0; i < limit; i += sx4) {
75 | acc0 = acc0 + data[i];
76 | acc1 = acc1 + data[i+stride];
77 | acc2 = acc2 + data[i+sx2];
78 | acc3 = acc3 + data[i+sx3];
79 | }
80 |
81 | /* Finish any remaining elements */
82 | for (; i < length; i++) {
83 | acc0 = acc0 + data[i];
84 | }
85 | return ((acc0 + acc1) + (acc2 + acc3));
86 | }
87 |
/* run - Run test(elems, stride) and return read throughput (MB/s).
 * "size" is in bytes and "stride" is in array elements.
 * (The old comment mentioned an "Mhz" parameter that this version
 * no longer takes: fcyc2 reports time directly.)
 */
double run(int size, int stride)
{
    double timing;
    /* data[] holds longs, so the element count must use sizeof(long).
       The previous sizeof(double) only worked because both types are
       8 bytes on LP64 platforms. */
    int elems = size / sizeof(long);

    test(elems, stride);                     /* Warm up the cache */
    timing = fcyc2(test, elems, stride, 0);  /* Call test(elems,stride) */
    return (size / stride) / timing;         /* Convert to MB/s */
}
101 | /* $end mountainfuns */
102 |
103 |
104 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/mountain.gcc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/mountain.gcc
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/combine_2_arrays/compile:
--------------------------------------------------------------------------------
#!/bin/bash

# Build every pipeline variant v?.c twice: once unoptimized, once with -O3.
for f in v?.c;
do
    version=$( echo $f | cut -d'.' -f1 | cut -d'v' -f2)
    echo "compiling "$version" -> v"$version
    gcc -std=c11 -DUSE_PAPI -DPIPELINE=$version -o v$version pipeline.c -lm -lpapi
    gcc -std=c11 -DUSE_PAPI -DPIPELINE=$version -o v$version.O3n pipeline.c -lm -lpapi -O3 -march=native -mavx2
done

echo "compiling vector"
# fix: the output name was misspelled "vetor"; the run script expects an
# executable literally named "vector"
gcc -std=c11 -DUSE_PAPI -march=native -o vector vector.c -lm -lpapi
gcc -std=c11 -DUSE_PAPI -O3 -march=native -mavx2 -o vector.O3n vector.c -lm -lpapi
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/combine_2_arrays/mypapi.h:
--------------------------------------------------------------------------------
1 |
2 |
3 | #if defined(USE_PAPI) // -----------------------------------------------------------
#include <papi.h>
5 |
6 | #define PAPI_EVENTS_NUM 2
7 | int papi_events[PAPI_EVENTS_NUM] = {PAPI_TOT_INS, PAPI_TOT_CYC };
8 | int papi_EventSet = PAPI_NULL; // the handle for the events' set
9 | long long papi_buffer[PAPI_EVENTS_NUM] = {0}; // storage for the counters' values
10 | long long papi_values[PAPI_EVENTS_NUM] = {0}; // accumulate the counters' values
11 |
12 | // check that PAPI is OK, exit if not
13 | #define PAPI_CHECK( R ) { \
14 | if ( (R) != PAPI_OK ) { \
15 | printf("a problem with PAPI (code %d) arise at line %d\n", \
16 | (R), __LINE__);fflush(stdout); return (R); }}
17 |
18 |
19 | // check that PAPI is OK,
20 | // issue a warning if not with a
21 | // provided message
22 | #define PAPI_WARN( R, S ) { \
23 | if ( (R) != PAPI_OK ) { \
24 | printf("a problem with PAPI (code %d) arise at line %d: %s\n", \
25 | (R), __LINE__, (S)); fflush(stdout); }}
26 |
27 | // check that PAPI is OK about an event
28 | // issue a warning if not with a
29 | // provided message
30 | #define PAPI_WARN_EVENT( R, E, S1, n ) { \
31 | if ( (R) != PAPI_OK ) { \
32 | printf("a problem with PAPI (code %d) : event %d arise at line %d: %s (%d)\n", \
33 | (R), (E), __LINE__, (S1), (n)); fflush(stdout); }}
34 |
35 |
36 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \
37 | retval = PAPI_query_event(papi_events[i]); \
38 | if ( retval == PAPI_OK ) { \
39 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \
40 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \
41 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } }
42 |
43 | #define PAPI_INIT { \
44 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \
45 | if (retval != PAPI_VER_CURRENT) \
46 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \
47 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \
48 | PAPI_ADD_EVENTS_to_SET; }
49 |
50 | // to use HIGH-LEVEL API
51 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
52 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
53 |
54 | // to use NORMAL API
55 | #define PAPI_START_CNTR { \
56 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); }
57 |
58 | #define PAPI_STOP_CNTR { \
59 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \
60 | if( retval == PAPI_OK ) { \
61 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
62 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); }
63 |
64 |
65 |
66 | #else // -----------------------------------------------------------
67 |
68 | #define PAPI_INIT
69 | #define PAPI_START_CNTR
70 | #define PAPI_STOP_CNTR
71 |
72 | #endif // -----------------------------------------------------------
73 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/combine_2_arrays/run:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #./compile_all
4 |
5 | export LC_NUMERIC="en_US.UTF-8" #that is to avoid problems with locales when using printf
6 |
7 | declare -a opts=("" ".O3n")
8 | declare -a opt_names=("no-opt" "opt")
9 |
10 | ntypes=${#types[@]}
11 | nopts=${#opts[@]}
12 | ncompilers=${#compilers[@]}
13 |
14 | # --------------------------------------------------
15 | # get results
16 | timings=()
17 | IPC=()
18 |
19 | execs=(v?)
20 | execs+=(vector)
21 |
22 | for f in ${execs[@]};
23 | do
24 | version=$(echo $f | cut -f2 -d'v')
25 | printf "\trunning version v%s\n" $version
26 |
27 | for (( o=0; o out
31 |
32 | IPC+=($(cat out | grep IPC | cut -d':' -f2 ))
33 | timings+=($(cat out | grep cycles-per-element | cut -d':' -f3 | cut -d']' -f1 ))
34 | done
35 | rm -f out
36 | done
37 |
38 |
39 | # --------------------------------------------------
40 | # write results on the stdout
41 |
42 | # ............................
43 | # headers
44 | echo
45 | printf "%s\t" ""
46 | for (( o=0; o
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
11 | double cclock()
12 | /* Returns elepsed seconds past from the last call to timer rest */
13 | {
14 |
15 | struct timeval tmp;
16 | double sec;
17 | gettimeofday( &tmp, (struct timezone *)0 );
18 | sec = tmp.tv_sec + ((double)tmp.tv_usec)/1000000.0;
19 | return sec;
20 | }
21 |
/* Fill an n x m row-major matrix with a[i][j] = i*m + j + stride. */
void setup_matrix(double* a, int n, int m, int stride)
{
    int row, col;
    for (row = 0; row < n; row++)
        for (col = 0; col < m; col++)
        {
            int idx = row * m + col;
            a[idx] = idx + stride;
        }
}
30 |
/* Zero every entry of an n x m row-major matrix. */
void clear_matrix(double* a, int n, int m)
{
    int idx;
    for (idx = 0; idx < n * m; idx++)
        a[idx] = 0.0;
}
39 |
40 |
/* Naive triple loop (i, j, o order): c(n,o) += a(n,m) * b(m,o), all
   row-major. The caller must zero c before the first call. */
void mat_mult(double* a, double* b, double* c, int n, int m, int o)
{
    int i, j, k;
    for (i = 0; i < n; i++)
        for (j = 0; j < o; j++)
        {
            /* accumulate the dot product in a scalar, then store */
            double acc = c[i*o + j];
            for (k = 0; k < m; k++)
                acc += a[i*m + k] * b[k*o + j];
            c[i*o + j] = acc;
        }
}
50 |
51 |
/* Loop-interchanged version (i, k, j order): the innermost loop walks
   both c and b with unit stride, which is far more cache friendly than
   the naive (i, j, k) order. Same arithmetic, same results. */
void mat_mult_opt(double* a, double* b, double* c, int n, int m, int o)
{
    int i, j, k;
    for (i = 0; i < n; i++)
        for (k = 0; k < m; k++)
        {
            const double aik = a[i*m + k];   /* invariant in the j loop */
            for (j = 0; j < o; j++)
                c[i*o + j] += aik * b[k*o + j];
        }
}
63 |
64 |
/* Driver: c(n,o) = a(n,m) * b(m,o), timing either the naive (case 0)
   or the loop-interchanged (any other case) multiplication. */
int main(int argc, char** argv)
{
    double *a, *b, *c;
    int w, m, n, o;
    double begin, end;

    if (argc < 5)
    {
        printf(" Calculates c(n,o)=a(n,m)*b(m,o) \n");
        printf(" Usage: %s case n m o ", argv[0]);
        return 1;
    }

    w = atoi(argv[1]);
    n = atoi(argv[2]);
    m = atoi(argv[3]);
    o = atoi(argv[4]);

    /* size_t arithmetic: n*m as int can overflow for large sizes */
    a = malloc((size_t)n * m * sizeof(double));
    b = malloc((size_t)m * o * sizeof(double));
    c = malloc((size_t)n * o * sizeof(double));
    /* the original code dereferenced these without checking */
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "memory allocation failed\n");
        free(a); free(b); free(c);
        return 1;
    }

    setup_matrix(a, n, m, 0);
    setup_matrix(b, m, o, m*n);
    clear_matrix(c, n, o);

    if( w == 0 )
    {
        begin = cclock();
        mat_mult(a, b, c, n, m, o);
        end = cclock();
        printf ("NON-optimized elapsed time %9.4f s \n\n", end - begin );
    }
    else
    {
        begin = cclock();
        mat_mult_opt(a, b, c, n, m, o);
        end = cclock();
        printf (" Optimized Elapsed time %9.4f s \n\n", end - begin );
    }

    // printf("%f\n", c[0]);
    free(a);
    free(b);
    free(c);

    return 0;
}
115 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/matrix_multiplication/mypapi.h:
--------------------------------------------------------------------------------
1 |
2 |
3 | #if defined(USE_PAPI) // -----------------------------------------------------------
#include <papi.h>
5 |
6 | typedef unsigned long long int uLint;
7 |
8 | #define PAPI_EVENTS_NUM 3
9 | int papi_events[PAPI_EVENTS_NUM] = {PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_DCM };
10 | int papi_EventSet = PAPI_NULL; // the handle for the events' set
11 | uLint papi_buffer[PAPI_EVENTS_NUM] = {0}; // storage for the counters' values
12 | uLint papi_values[PAPI_EVENTS_NUM] = {0}; // accumulate the counters' values
13 |
14 | // check that PAPI is OK, exit if not
15 | #define PAPI_CHECK( R ) { \
16 | if ( (R) != PAPI_OK ) { \
17 | printf("a problem with PAPI (code %d) arise at line %d\n", \
18 | (R), __LINE__);fflush(stdout); return (R); }}
19 |
20 |
21 | // check that PAPI is OK,
22 | // issue a warning if not with a
23 | // provided message
24 | #define PAPI_WARN( R, S ) { \
25 | if ( (R) != PAPI_OK ) { \
26 | printf("a problem with PAPI (code %d) arise at line %d: %s\n", \
27 | (R), __LINE__, (S)); fflush(stdout); }}
28 |
29 | // check that PAPI is OK about an event
30 | // issue a warning if not with a
31 | // provided message
32 | #define PAPI_WARN_EVENT( R, E, S1, n ) { \
33 | if ( (R) != PAPI_OK ) { \
34 | printf("a problem with PAPI (code %d) : event %d arise at line %d: %s (%d)\n", \
35 | (R), (E), __LINE__, (S1), (n)); fflush(stdout); }}
36 |
37 |
38 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \
39 | retval = PAPI_query_event(papi_events[i]); \
40 | if ( retval == PAPI_OK ) { \
41 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \
42 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \
43 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } }
44 |
45 | #define PAPI_INIT { \
46 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \
47 | if (retval != PAPI_VER_CURRENT) \
48 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \
49 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \
50 | PAPI_ADD_EVENTS_to_SET; }
51 |
52 | // to use HIGH-LEVEL API
53 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
54 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
55 |
56 | // to use NORMAL API
57 | #define PAPI_START_CNTR { \
58 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); }
59 |
60 | #define PAPI_STOP_CNTR { \
61 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \
62 | if( retval == PAPI_OK ) { \
63 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
64 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); }
65 |
66 | #define PAPI_FLUSH_BUFFER { \
67 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
68 | papi_buffer[ jj] = 0; }
69 |
70 | #define PAPI_FLUSH { \
71 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
72 | papi_values[jj] = papi_buffer[ jj] = 0; }
73 |
74 |
75 | #else // -----------------------------------------------------------
76 |
77 | #define PAPI_INIT
78 | #define PAPI_START_CNTR
79 | #define PAPI_STOP_CNTR
80 |
81 | #endif // -----------------------------------------------------------
82 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/matrix_multiplication/plot.gp:
--------------------------------------------------------------------------------
1 | reset
2 | set terminal pngcairo size 1600,1000 dashlength 2 truecolor font "Garamond, 28"
3 | #set terminal qt enhanced size 1200,1000
4 |
5 | set key inside top left font ",22"
6 | set tics font ",22"
7 | set lmargin screen 0.08
8 | set rmargin screen 0.95
9 | set bmargin screen 0.12
10 |
11 | set xlabel "N" font ",22" offset 0,0.5
12 |
13 | unset yrange
14 | unset xrange
15 |
16 | array OPT[2]
17 | OPT[1] = "O0 "
18 | OPT[2] = "O3 "
19 |
20 | array W[2]
21 | W[1] = 3
22 | W[2] = 1.5
23 |
24 | array DT[2]
25 | DT[1] = "-- __"
26 | DT[2] = 1
27 |
28 | array TYPE[3]
29 | TYPE[1] = "naive"
30 | TYPE[2] = "optimized"
31 | TYPE[3] = "tailed"
32 |
33 |
34 | # ---------------------------------------------
35 |
36 | set output "timings.png"
37 | set ylabel "timing (sec)" font ",22" offset 2
38 |
39 |
40 | plot for[L = 1:2] for [i = 2:4] "timings" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1],\
41 | "" u 1:(1.5e-8*$1**3) w l lc 0 lw 2 dt '..' notitle,\
42 | "" u 1:(3e-9*$1**3) w l lc 0 lw 2 dt '..' notitle
43 |
44 |
45 | # ---------------------------------------------
46 |
47 | set output "timings_ratio.png"
48 | set ylabel "timings / timings_{naive}" font ",22" offset 2
49 |
50 | ref = 2
51 | clr = 2
52 | plot for[L = 1:2] for [i = 3:4] "timings" u 1:(column(i+(L-1)*3)/column(ref)) w lp ps 2 lw W[L] dt DT[L] lc ((L-1)*3+(i-1)) title OPT[L].TYPE[i-1]
53 |
54 | # ---------------------------------------------
55 |
56 | set output "timings_per_element.png"
57 | set ylabel "timing per element (nsec)" font ",22" offset 2
58 |
59 | plot for[L = 1:2] for [i = 2:4] "timings" u 1:(column(i+(L-1)*3)/($1**3)*1e9) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
60 |
61 |
62 | # ---------------------------------------------
63 |
64 | set output "CPE.png"
65 | set ylabel "CPE" font ",22" offset 2
66 |
67 | plot for[L = 1:2] for [i = 2:4] "CPEs" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
68 |
69 |
70 | # ---------------------------------------------
71 |
72 | set output "L1M.png"
73 | set ylabel "Level 1 misses per element" font ",22" offset 2
74 |
75 | plot for[L = 1:2] for [i = 2:4] "L1Ms" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
76 |
77 |
78 | # ---------------------------------------------
79 |
80 | set output "IPC.png"
81 | set key inside bottom left
82 | set ylabel "IPC" font ",22" offset 2
83 | set yrange [:4]
84 |
85 | plot for[L = 1:2] for [i = 2:4] "IPCs" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
86 |
87 |
88 |
89 |
90 | set output
91 | reset
92 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/matrix_multiplication/run:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export LC_NUMERIC="en_US.UTF-8"
4 | export LC_LOCALE="en_US.UTF-8"
5 |
6 | exec=matmul
7 |
8 | declare -a outputs=("timings" "IPCs" "CPEs" "L1Ms")
9 | ndata=${#outputs[@]}
10 |
11 | declare -a optimizations=("Non-opt" "Opt")
12 | noptimizations=${#optimizations[@]}
13 |
14 | declare -a versions=("naive " "lpswap" "tailed")
15 | nversions=${#versions[@]}
16 |
17 | # --------------------------------------------
18 | # SAVE THE OLD TABLES, IF PRESENT
19 | # --------------------------------------------
20 |
21 | for (( o=0 ; o < $ndata; o++ ));
22 | do
23 | mv -f ${outputs[$o]} ${outputs[$o]}.back
24 | echo -n "# ">> ${outputs[$o]}
25 |
26 | for (( p=0; p < $noptimizations; p++ )); do
27 | echo -e -n ${optimizations[$p]}"\t\t" >> ${outputs[$o]} ;
28 | done
29 | echo >> ${outputs[$o]}
30 |
31 | echo -n "#N ">> ${outputs[$o]}
32 | for (( p=0; p < $noptimizations; p++ )); do
33 | for (( v=0; v < $nversions; v++ )); do
34 | echo -n ${versions[$v]}" " >> ${outputs[$o]}; done;
35 | done
36 | echo >> ${outputs[$o]}
37 | done
38 |
39 | # --------------------------------------------
40 | # PREPARE OUTPUT FOLDER
41 | # --------------------------------------------
42 |
43 | output_dir=./output_saved
44 | if [ ! -d $output_dir ]; then mkdir $output_dir; fi
45 |
46 | # --------------------------------------------
47 |
48 |
49 | start=100
50 | stop=3000
51 | inc=100
52 |
53 |
54 |
55 |
56 | echo -n "running.. "
57 | for (( N=$start; N<=$stop; N+=$inc ));
58 | do
59 | echo -n "N="$N".. "
60 | for (( V=0; V<$nversions; V++ ));
61 | do
62 | taskset -c 2 ./$exec $V $N $N $N > ${output_dir}/output.${V}.${N}
63 | results+=($(cat ${output_dir}/output.${V}.${N} | gawk '{ if($1=="elapsed") time=$3; else if($1=="IPC:") IPC=$2; else if($1=="cycles-per-element:") CPE=$2; else if($1=="L1miss-per-element:") L1M=$2} END {print time, IPC, CPE,L1M}'))
64 |
65 | taskset -c 2 ./${exec}.On $V $N $N $N > ${output_dir}/output.O.${V}.${N}
66 | resultsO+=($(cat ${output_dir}/output.O.${V}.${N} | gawk '{ if($1=="elapsed") time=$3; else if($1=="IPC:") IPC=$2; else if($1=="cycles-per-element:") CPE=$2; else if($1=="L1miss-per-element:") L1M=$2} END {print time, IPC, CPE,L1M}'))
67 |
68 | done
69 |
70 | for (( o=0 ; o < ${#outputs[@]}; o++ ));
71 | do
72 | echo -n $N" " >> ${outputs[$o]}
73 | for (( c=0; c<$nversions; c++ )); do echo -n ${results[$(($c*$ndata+$o))]}" " >> ${outputs[$o]}; done
74 | for (( c=0; c<$nversions; c++ )); do echo -n ${resultsO[$(($c*$ndata+$o))]}" " >> ${outputs[$o]}; done
75 | echo >> ${outputs[$o]}
76 | done
77 |
78 | results=()
79 | resultsO=()
80 | done
81 | echo
82 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/Makefile:
--------------------------------------------------------------------------------
1 | CC=gcc
2 | #CFLAGS=-Wall -O1 -msse3
3 | CFLAGS= -O3 -march=native
4 | OBJ=poly.o benchmark.o timing/clock.o statistics/cpe.o statistics/fcyc.o statistics/lsquare.o
5 | LDFLAGS=-lm
6 | # phony targets will always be remade, so a file named "clean"
7 | # won't prevent the clean target from running
8 | .PHONY: all clean run
9 | EXE=driver
10 |
11 | all: $(EXE)
12 |
13 | $(EXE): $(OBJ)
14 | $(CC) $(CFLAGS) -o $(EXE) $(OBJ) $(LDFLAGS)
15 |
16 | run: $(EXE)
17 | ./$(EXE)
18 |
19 | clean:
20 | rm -f $(EXE) $(OBJ)
21 |
22 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/benchmark.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>  /* NOTE(review): header names were lost in extraction; reconstructed from usage */
4 | #include "statistics/cpe.h"
5 | #include "poly.h"
6 |
7 | #define SHORT 0
8 | #if SHORT
9 | #define ASIZE 31
10 | #else
11 | #define ASIZE 973
12 | #endif
13 | #define EPS (1e-8)
14 |
15 | /* Keep track of a number of different programs */
16 | #define MAX_BENCHMARKS 100
17 |
18 | static struct {
19 | poly_t cfunct;
20 | char *description;
21 | double cpe;
22 | } benchmarks[MAX_BENCHMARKS];
23 |
24 | static int benchmark_count = 0;
25 | static int current_benchmark = 0;
26 |
27 | static double* data = NULL;
28 | static double x;
29 | static double result;
30 | static poly_t check_func = NULL;
31 |
32 | static void setup()
33 | {
34 | int i;
35 | if (!data)
36 | data = (double*) malloc(sizeof(double) * ASIZE);
37 | if (!data) {
38 | fprintf(stderr, "Memory allocation error!\n");
39 | exit(EXIT_FAILURE);
40 | }
41 | /* Initialize array */
42 | for (i = 0; i < ASIZE; i++)
43 | data[i] = (drand48() * 2) - 1;
44 | x = (drand48() * 2) - 1;
45 | }
46 |
/* run - callback handed to find_cpe_full: evaluates the currently
   selected polynomial on the first cnt coefficients; the result is
   stored in a global so the call cannot be optimized away. */
void run(int cnt) {
    result = benchmarks[current_benchmark].cfunct(data, x, cnt);
}
50 |
51 | static void run_test(int bench_index) {
52 | double cpe;
53 | char *description = benchmarks[bench_index].description;
54 | double good_result;
55 | current_benchmark = bench_index;
56 | printf("starting benchmark %d\n", bench_index);
57 | setup();
58 | cpe = find_cpe_full(run, ASIZE, 200000, stdout, RAN_SAMPLE, 0.3, 0);
59 | if (check_func) {
60 | result = benchmarks[bench_index].cfunct(data, x, ASIZE);
61 | good_result = check_func(data, x, ASIZE);
62 | if (result - good_result > EPS) {
63 | printf("Function %s, Should be %f, Got %f\n",
64 | description, good_result, result);
65 | }
66 | }
67 | benchmarks[current_benchmark].cpe = cpe;
68 | /* print results */
69 | printf("%s: ", description);
70 | printf("%.2f cycles/element\n\n", cpe);
71 | }
72 |
73 | void add_function(poly_t f, char *description) {
74 | benchmarks[benchmark_count].cfunct = f;
75 | benchmarks[benchmark_count].description = description;
76 | benchmark_count++;
77 | }
78 |
/* set_check_function - install f as the reference implementation that
   run_test uses to validate each benchmark's result. */
void set_check_function(poly_t f) {
    check_func = f;
}
82 |
83 | int main()
84 | {
85 | int i;
86 | register_functions();
87 | printf("\n");
88 | for (i = 0; i < benchmark_count; i++) {
89 | run_test(i);
90 | }
91 | free(data);
92 | return EXIT_SUCCESS;
93 | }
94 |
95 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/poly.c:
--------------------------------------------------------------------------------
1 | #include "poly.h"
2 |
/* Evaluate a[0] + a[1]*x + ... + a[degree]*x^degree directly,
   carrying the current power of x along in xk. */
double poly(double a[], double x, int degree)
{
    long int k;
    double acc = a[0];
    double xk = x;   /* equals x^k at the top of each iteration */
    for (k = 1; k <= degree; k++) {
        acc += a[k] * xk;
        xk = x * xk;
    }
    return acc;
}
14 |
/* Horner's rule: (((a[d]*x + a[d-1])*x + ...)*x + a[0]). */
double polyh(double a[], double x, int degree)
{
    long int k;
    double acc = a[degree];
    for (k = degree - 1; k >= 0; k--)
        acc = a[k] + x * acc;
    return acc;
}
23 |
/* mypoly1 - direct evaluation unrolled x2.
 * BUG FIX: the unrolled loop consumes coefficients in pairs (a[i],
 * a[i+1]) while i < degree, so when degree is ODD the last coefficient
 * a[degree] was silently dropped. A tail loop (as in mypoly2) now
 * finishes the remaining term.
 */
double mypoly1(double a[], double x, int degree)
{
    long int i;
    double x2 = x*x;
    double res = a[0];
    double xpwr = x;          /* x^i at the top of each iteration */

    for ( i = 1; i < degree; i += 2 )
    {
        res += a[i] * xpwr;
        res += a[i+1] * xpwr * x;
        xpwr *= x2;
    }
    /* at most one leftover coefficient (odd degree) */
    for ( ; i <= degree; i++ )
    {
        res += a[i] * xpwr;
        xpwr *= x;
    }
    return res;
}
39 |
/* Unroll x2 with a tail loop: two coefficients are folded per
   iteration as (a[k] + a[k+1]*x) * x^k, then any single leftover
   coefficient (odd/even mismatch) is handled separately. */
double mypoly2(double a[], double x, int degree)
{
    long int k;
    double xsq = x * x;
    double acc = a[0];
    double xk = x;            /* x^k */

    for ( k = 1; k < degree; k += 2 )
    {
        acc += (a[k] + a[k+1]*x) * xk;
        xk *= xsq;
    }
    /* at most one coefficient remains */
    for ( ; k <= degree; k++ )
    {
        acc += a[k] * xk;
        xk *= x;
    }
    return acc;
}
60 |
/* mypoly3 - unroll x4 with two independent accumulators.
 * BUG FIXES vs the original:
 *  1. res2 was seeded with a[2], adding a spurious a[2] to the result;
 *     it must start at 0.
 *  2. each iteration consumes FOUR coefficients, so the running powers
 *     must advance by x^4 per iteration, not x^2 — the original was
 *     wrong for any polynomial needing more than one loop iteration.
 *  3. the loop bound now guarantees a[i+3] stays in range.
 */
double mypoly3(double a[], double x, int degree)
{
    long int i;
    double x2 = x*x;
    double x4 = x2*x2;
    double res1 = a[0];
    double res2 = 0;
    double xpwr  = x;       /* x^i     */
    double xpwr3 = x2*x;    /* x^(i+2) */

    for ( i = 1; i + 3 <= degree; i += 4 )
    {
        res1 += (a[i]   + a[i+1]*x) * xpwr;
        res2 += (a[i+2] + a[i+3]*x) * xpwr3;
        xpwr  *= x4;
        xpwr3 *= x4;
    }
    /* finish the remaining (at most 3) coefficients */
    for ( ; i <= degree; i++ )
    {
        res1 += a[i] * xpwr;
        xpwr *= x;
    }

    return res1 + res2;
}
85 |
86 |
/* mypoly4 - split even/odd accumulation.
 * BUG FIX: the loop ran with condition i <= degree while reading
 * a[i+1], so for ODD degree it read a[degree+1] out of bounds (and
 * folded garbage into the result). The loop now stops at i < degree
 * and a tail loop handles the single leftover odd coefficient.
 */
double mypoly4(double a[], double x, int degree)
{
    long int i;
    double x2 = x*x;
    double res_even = a[0];
    double res_odd = 0;
    double xpwr_even = x2;   /* x^(i+1) for the even-index stream */
    double xpwr_odd = x;     /* x^i     for the odd-index stream  */

    for ( i = 1; i < degree; i += 2 )
    {
        res_odd += a[i] * xpwr_odd;
        xpwr_odd *= x2;
        res_even += a[i+1] * xpwr_even;
        xpwr_even *= x2;
    }
    /* odd degree leaves exactly one coefficient; xpwr_odd == x^i here */
    for ( ; i <= degree; i++ )
    {
        res_odd += a[i] * xpwr_odd;
        xpwr_odd *= x;
    }
    return res_even + res_odd;
}
110 |
111 | void register_functions(void)
112 | {
113 | set_check_function(&poly); /* used as reference implementation */
114 |
115 | add_function(&poly, "Polynomial: Naive implementation");
116 | add_function(&polyh, "Polynomial: Horner's method");
117 | add_function(&mypoly1, "Polynomial: my poly1, unroll x 2");
118 | add_function(&mypoly2, "Polynomial: my poly2, 2 separate loops");
119 | add_function(&mypoly3, "Polynomial: my poly3, unroll x 2 and separate accumulation");
120 |
121 | return;
122 | }
123 |
124 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/poly.h:
--------------------------------------------------------------------------------
1 | #if __INTEL_COMPILER
2 | /* inline function definitions */
3 | #pragma warning ( disable : 1418 )
4 | #endif
5 |
6 | typedef double (*poly_t)(double*, double, int);
7 | /* Add routine to list of programs to measure */
8 | void add_function(poly_t f, char *description);
9 | /* Set routine to check results against */
10 | void set_check_function(poly_t f);
11 | /* called by main to register the set of routines to benchmark */
12 | void register_functions(void);
13 |
14 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/readme.md:
--------------------------------------------------------------------------------
pipelining at work in the evaluation of polynomials
2 |
3 | just typing "make" you should get an executable named "driver".
4 | by default, -O3 -march=native is enabled.
5 |
6 | That will evaluate a polynomial using different functions that are defined in poly.c
7 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/cpe.h:
--------------------------------------------------------------------------------
1 | /* Compute CPE for function */
2 |
3 | /* Compute for function that is linear in some parameter cnt */
4 | typedef void (*elem_fun_t)(int);
5 |
6 | /* Different ways of finding samples
7 | UNI_SAMPLE: samples uniformly spaced between bias*maxcnt and maxcnt
8 | RAN_SAMPLE: samples randomly selected between bias*maxcnt and maxcnt
9 | */
10 |
11 | typedef enum {UNI_SAMPLE, RAN_SAMPLE}
12 | sample_t;
13 |
14 | /* Find cpe for function f, which allows cnt up to maxcnt.
15 | Uses default parameters
16 | */
17 | double find_cpe(elem_fun_t f, int maxcnt);
18 |
19 | /* Find cpe for function f, which allows cnt up to maxcnt, using
20 | specified number of sample points.
21 | If data_file, then print data so that can plot points with Excel
22 | smethod determines method for generating samples
23 | */
24 | double find_cpe_full(elem_fun_t f, int maxcnt, int samples, FILE *data_file,
25 | sample_t smethod, double bias, int verbose);
26 |
27 | /* Find number of cycles taken by function.
28 | Do this by running number of trials until best two within TOL (2%) of
29 | each other
30 | */
31 | double measure_function(elem_fun_t f, int cnt);
32 |
33 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/fcyc.h:
--------------------------------------------------------------------------------
1 |
2 | /* Fcyc measures the speed of any "test function." Such a function
3 | is passed a list of integer parameters, which it may interpret
4 | in any way it chooses.
5 | */
6 |
7 | typedef void (*test_funct)(int *);
8 |
9 | /* Compute number of cycles used by function f on given set of parameters */
10 | double fcyc(test_funct f, int* params);
11 |
12 | /***********************************************************/
13 | /* Set the various parameters used by measurement routines */
14 |
15 |
16 | /* When set, will run code to clear cache before each measurement
17 | Default = 0
18 | */
19 | void set_fcyc_clear_cache(int clear);
20 |
21 | /* Set size of cache to use when clearing cache
22 | Default = 1<<19 (512KB)
23 | */
24 | void set_fcyc_cache_size(int bytes);
25 |
26 | /* Set size of cache block
27 | Default = 32
28 | */
29 | void set_fcyc_cache_block(int bytes);
30 |
31 | /* When set, will attempt to compensate for timer interrupt overhead
32 | Default = 0
33 | */
34 | void set_fcyc_compensate(int compensate);
35 |
36 | /* Value of K in K-best
37 | Default = 3
38 | */
39 | void set_fcyc_k(int k);
40 |
41 | /* Maximum number of samples attempting to find K-best within some tolerance.
42 | When exceeded, just return best sample found.
43 | Default = 20
44 | */
45 | void set_fcyc_maxsamples(int maxsamples);
46 |
47 | /* Tolerance required for K-best
48 | Default = 0.01
49 | */
50 | void set_fcyc_epsilon(double epsilon);
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/lsquare.c:
--------------------------------------------------------------------------------
1 | /* Compute least squares fit of set of data points */
#include <stdio.h>
#include <stdlib.h>
4 | #include "lsquare.h"
5 |
/* Running sums shared by the least-squares formulas. */
typedef struct {
    double sum_x;    /* sum of x            */
    double sum_y;    /* sum of y            */
    double sum_xx;   /* sum of x*x          */
    double sum_xy;   /* sum of x*y          */
} ls_stat_t;

/* Accumulate the four sums over the cnt data points into *statp. */
static void ls_stats(double *xval, double *yval, int cnt, ls_stat_t *statp)
{
    int i;
    double sx = 0.0, sy = 0.0, sxx = 0.0, sxy = 0.0;

    for (i = 0; i < cnt; i++) {
        double xi = xval[i];
        double yi = yval[i];
        sx  += xi;
        sy  += yi;
        sxx += xi * xi;
        sxy += xi * yi;
    }
    statp->sum_x  = sx;
    statp->sum_y  = sy;
    statp->sum_xx = sxx;
    statp->sum_xy = sxy;
}
30 |
31 | double ls_slope(double *xval, double *yval, int cnt)
32 | {
33 | double slope;
34 | ls_stat_t stat;
35 | ls_stats(xval, yval, cnt, &stat);
36 | slope = (cnt * stat.sum_xy - stat.sum_x * stat.sum_y)/
37 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x);
38 | return slope;
39 | }
40 |
41 | double ls_intercept(double *xval, double *yval, int cnt)
42 | {
43 | double intercept;
44 | ls_stat_t stat;
45 | ls_stats(xval, yval, cnt, &stat);
46 | intercept = (stat.sum_xx * stat.sum_y - stat.sum_xy * stat.sum_x)/
47 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x);
48 | return intercept;
49 | }
50 |
/* Deviation of point (x, y) from the line y = slope*x + intercept,
   relative to x; falls back to the absolute deviation when x == 0. */
static double rel_err(double x, double y, double slope, double intercept)
{
    double dev = y - (slope * x + intercept);
    if (dev < 0)
        dev = -dev;
    return (x == 0) ? dev : dev / x;
}
65 |
66 | double ls_error(double *xval, double *yval, int cnt, ls_err_t etype)
67 | {
68 | double slope;
69 | double intercept;
70 | ls_stat_t stat;
71 | int i;
72 | double num, denom;
73 | ls_stats(xval, yval, cnt, &stat);
74 | slope = (cnt * stat.sum_xy - stat.sum_x * stat.sum_y)/
75 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x);
76 | intercept = (stat.sum_xx * stat.sum_y - stat.sum_xy * stat.sum_x)/
77 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x);
78 | num = denom = 0;
79 | for (i = 0; i < cnt; i++) {
80 | double e = rel_err(xval[i], yval[i], slope, intercept);
81 | switch (etype) {
82 | case LS_AVG:
83 | num += e;
84 | denom++;
85 | break;
86 | case LS_MAX:
87 | if (num < e)
88 | num = e;
89 | denom = 1;
90 | break;
91 | default:
92 | fprintf(stderr, "Invalid error type: %d\n", etype);
93 | exit(1);
94 | break;
95 | }
96 | }
97 | return num/denom;
98 | }
99 |
100 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/lsquare.h:
--------------------------------------------------------------------------------
1 | /* Compute least squares fit of set of data points */
2 |
3 | /* Fit is of form y = mx + b. m is slope, b is intercept */
4 | double ls_slope(double *xval, double *yval, int cnt);
5 | double ls_intercept(double *xval, double *yval, int cnt);
6 |
7 | typedef enum {LS_AVG, LS_MAX} ls_err_t;
8 |
9 | /* Determine error (either absolute or average) of least squares fit */
10 | double ls_error(double *xval, double *yval, int cnt, ls_err_t etype);
11 |
12 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/timing/clock.h:
--------------------------------------------------------------------------------
1 | #if __INTEL_COMPILER
2 | /* inline function definitions */
3 | #pragma warning ( disable : 1418 )
4 | #endif
5 |
6 | /* Routines for using cycle counter */
7 |
8 | /* Start the counter */
9 | void start_counter();
10 | void start_counter_copy();
11 |
12 | /* Get # cycles since counter started */
13 | double get_counter();
14 | double get_counter_copy();
15 |
16 |
17 | /* Measure overhead for counter */
18 | double ovhd();
19 |
20 | /* Determine clock rate of processor */
21 | double mhz(int verbose);
22 |
23 | /* Determine clock rate of processor, having more control over accuracy */
24 | double mhz_full(int verbose, int sleeptime);
25 |
26 | /** Special counters that compensate for timer interrupt overhead */
27 |
28 | void start_comp_counter();
29 |
30 | double get_comp_counter();
31 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/reduction/mypapi.h:
--------------------------------------------------------------------------------
1 |
2 |
3 | #if defined(USE_PAPI) // -----------------------------------------------------------
4 | #include <papi.h>
5 |
6 | typedef unsigned long long int uLint;
7 |
8 | #define PAPI_EVENTS_NUM 3
9 | int papi_events[PAPI_EVENTS_NUM] = {PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_DCM };
10 | int papi_EventSet = PAPI_NULL; // the handle for the events' set
11 | uLint papi_buffer[PAPI_EVENTS_NUM] = {0}; // storage for the counters' values
12 | uLint papi_values[PAPI_EVENTS_NUM] = {0}; // accumulate the counters' values
13 |
14 | // check that PAPI is OK, exit if not
15 | #define PAPI_CHECK( R ) { \
16 | if ( (R) != PAPI_OK ) { \
17 | printf("a problem with PAPI (code %d) arise at line %d\n", \
18 | (R), __LINE__);fflush(stdout); return (R); }}
19 |
20 |
21 | // check that PAPI is OK,
22 | // issue a warning if not with a
23 | // provided message
24 | #define PAPI_WARN( R, S ) { \
25 | if ( (R) != PAPI_OK ) { \
26 | printf("a problem with PAPI (code %d) arise at line %d: %s\n", \
27 | (R), __LINE__, (S)); fflush(stdout); }}
28 |
29 | // check that PAPI is OK about an event
30 | // issue a warning if not with a
31 | // provided message
32 | #define PAPI_WARN_EVENT( R, E, S1, n ) { \
33 | if ( (R) != PAPI_OK ) { \
34 | printf("a problem with PAPI (code %d) : event %d arise at line %d: %s (%d)\n", \
35 | (R), (E), __LINE__, (S1), (n)); fflush(stdout); }}
36 |
37 |
38 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \
39 | retval = PAPI_query_event(papi_events[i]); \
40 | if ( retval == PAPI_OK ) { \
41 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \
42 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \
43 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } }
44 |
45 | #define PAPI_INIT { \
46 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \
47 | if (retval != PAPI_VER_CURRENT) \
48 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \
49 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \
50 | PAPI_ADD_EVENTS_to_SET; }
51 |
52 | // to use HIGH-LEVEL API
53 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
54 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); }
55 |
56 | // to use NORMAL API
57 | #define PAPI_START_CNTR { \
58 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); }
59 |
60 | #define PAPI_STOP_CNTR { \
61 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \
62 | if( retval == PAPI_OK ) { \
63 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
64 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); }
65 |
66 | #define PAPI_FLUSH_BUFFER { \
67 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
68 | papi_buffer[ jj] = 0; }
69 |
70 | #define PAPI_FLUSH { \
71 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \
72 | papi_values[jj] = papi_buffer[ jj] = 0; }
73 |
74 |
75 | #else // -----------------------------------------------------------
76 |
77 | #define PAPI_INIT
78 | #define PAPI_START_CNTR
79 | #define PAPI_STOP_CNTR
80 |
81 | #endif // -----------------------------------------------------------
82 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/reduction/plot.gp:
--------------------------------------------------------------------------------
1 | reset
2 | set terminal pngcairo size 1600,1000 dashlength 2 truecolor font "Garamond, 28"
3 | #set terminal qt enhanced size 1200,1000
4 |
5 |
6 | set tics font ",22"
7 | set rmargin screen 0.95
8 | set bmargin screen 0.12
9 |
10 | set xlabel "N" font ",22" offset 0,0.5
11 |
12 | unset yrange
13 | unset xrange
14 |
15 | array OPT[2]
16 | OPT[1] = "O0 "
17 | OPT[2] = "O3 "
18 |
19 | array W[2]
20 | W[1] = 3
21 | W[2] = 1.5
22 |
23 | array DT[2]
24 | DT[1] = "-- __"
25 | DT[2] = 1
26 |
27 | NTYPE = 7
28 | array TYPE[NTYPE]
29 | TYPE[1] = "naive"
30 | TYPE[2] = "UR2x1"
31 | TYPE[3] = "UR2x1g"
32 | TYPE[4] = "UR2x2"
33 | TYPE[5] = "UR4x2g"
34 | TYPE[6] = "UR4x4"
35 | TYPE[7] = "vUR4x4"
36 |
37 |
38 | # ---------------------------------------------
39 | set key inside top left font ",22"
40 | set lmargin screen 0.08
41 | # ---------------------------------------------
42 |
43 | set output "timings.png"
44 | set ylabel "timing (sec)" font ",22" offset 2
45 |
46 |
47 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "timings" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
48 |
49 |
50 | # ---------------------------------------------
51 | set key outside left
52 | set lmargin screen 0.22
53 | # ---------------------------------------------
54 |
55 | set output "timings_per_element.png"
56 | set ylabel "timing per element (nsec)" font ",22" offset 2, -6
57 |
58 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "timings" u 1:(column(i+(L-1)*NTYPE)/$1*1e9) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
59 |
60 | # ---------------------------------------------
61 |
62 | set output "timings_ratio.png"
63 | set ylabel "timings / timings_{naive}" font ",22" offset 2
64 |
65 | ref = 2
66 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "timings" u 1:(column(i+(L-1)*NTYPE)/column(ref)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
67 |
68 | # ---------------------------------------------
69 | set output "CPE.png"
70 | set ylabel "CPE" font ",22" offset 2
71 |
72 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "CPEs" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
73 |
74 |
75 | # ---------------------------------------------
76 |
77 | set output "L1M.png"
78 | set ylabel "Level 1 misses per element" font ",22" offset 2,-5
79 |
80 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "L1Ms" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
81 |
82 |
83 | # ---------------------------------------------
84 |
85 | set output "IPC.png"
86 | set ylabel "IPC" font ",22" offset 2
87 |
88 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "IPCs" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1]
89 |
90 |
91 |
92 |
93 | set output
94 | reset
95 |
--------------------------------------------------------------------------------
/CODE_OPTIMIZATION/examples_on_pipelines/reduction/reduction.h:
--------------------------------------------------------------------------------
1 |
2 | #if defined(_GNU_SOURCE)
3 | #include <sched.h>  /* NOTE(review): bracket-stripped include; <sched.h> is the usual _GNU_SOURCE header in these examples -- confirm against upstream */
4 | #endif
5 |
6 | // ─────────────────────────────────────────────────────────────────
7 | // define the datatype
8 | //
9 | #if !defined(ITYPE)
10 | #warning "compiling with double type"
11 | #define DTYPE double // type of data
12 | #define DATYPE double // type for accumulator
13 | #else
14 | #warning "compiling with int type"
15 | #define DTYPE unsigned int // type of data
16 | #define DATYPE long long unsigned int // type for accumulator
17 | #endif
18 |
19 |
20 |
21 | typedef unsigned long long int uLint;
22 |
23 | //
24 | // ------------------------------------------------------------------
25 |
26 |
27 | #define CONCAT(x,y) x ## y
28 |
29 | // ─────────────────────────────────────────────────────────────────
30 | // define the timing routines
31 | //
32 |
33 | #define CPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), \
34 | (double)ts.tv_sec + \
35 | (double)ts.tv_nsec * 1e-9)
36 |
37 | //
38 | // ------------------------------------------------------------------
39 |
40 |
41 | // ─────────────────────────────────────────────────────────────────
42 | // define the vector generator
43 | //
44 |
45 | #define DEFINE_VECT( T, N, NAME ) typedef T v##NAME __attribute__((vector_size( sizeof(T) * N))); typedef union { v##NAME v; T s[N]; } u##NAME;
46 |
47 |
48 |
49 | // ─────────────────────────────────────────────────────────────────
50 | // define the vector generator
51 | //
52 |
53 | #if defined(__GNUC__) && !defined(__ICC) && !defined(__INTEL_COMPILER)
54 | #define PRAGMA_VECT_LOOP _Pragma("GCC ivdep")
55 | #elif defined(__INTEL_COMPILER) | defined(__ICC)
56 | #define PRAGMA_VECT_LOOP _Pragma("parallel")
57 | #elif defined(__clang__)
58 | #define PRAGMA_VECT_LOOP _Pragma("ivdep")
59 | #else
60 | #define PRAGMA_VECT_LOOP
61 | #endif
62 |
63 | //
64 | // ------------------------------------------------------------------
65 |
66 | // ─────────────────────────────────────────────────────────────────
67 | //
68 | //
69 |
70 |
71 |
72 | // ─────────────────────────────────────────────────────────────────
73 | // define the debug printing routine
74 | //
75 |
76 | #ifdef DEBUG
77 | #define PRINTF(...) printf(__VA_ARGS__)
78 | #define DEBUG_IO 2
79 | #else
80 | #define PRINTF(...)
81 | #endif
82 |
83 |
84 |
85 | DEFINE_VECT( DTYPE, 4, 4d );
86 | DEFINE_VECT( long int, 4, 4i );
87 |
--------------------------------------------------------------------------------
/HPC_TOOLS_and_STORAGE/Readme.md:
--------------------------------------------------------------------------------
1 | # Materials on HPC libraries, tools, storage
2 |
--------------------------------------------------------------------------------
/Materials/A_note_on_Endiansim.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/Materials/A_note_on_Endiansim.pdf
--------------------------------------------------------------------------------
/Materials/Readme.md:
--------------------------------------------------------------------------------
1 | # Sparse materials on various topics
2 |
3 | In this folder we will upload materials of interest
4 |
5 | 1) topics.pdf :: a continuously updated pdf with various topics discussed in the class
6 | 2) What every Computer Scientist should know about floating point :: a good introduction to the IEEE floating point representation
7 |
8 |
--------------------------------------------------------------------------------
/Materials/What_every_computer_scientist_should_know_about_floating-point.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/Materials/What_every_computer_scientist_should_know_about_floating-point.pdf
--------------------------------------------------------------------------------
/Materials/arguments.c:
--------------------------------------------------------------------------------
1 |
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 |
6 |
/* Print the address of the argv array itself, then the address and
   text of every command-line argument it points to. */
int main (int argc, char **argv )
{

    printf("argv is located at address %p and points to %p\n", &argv, argv );

    /* walk the argument vector one entry at a time */
    for (int i = 0; i < argc; i++)
        printf("arguments %d is located at address %p and reads as %s\n", i, argv + i, *(argv + i));

    return 0;
}
21 |
--------------------------------------------------------------------------------
/Materials/topics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/Materials/topics.pdf
--------------------------------------------------------------------------------
/PARALLELISM/Readme.md:
--------------------------------------------------------------------------------
1 | # Section one: INTRODUCTION TO HPC and PARALLEL CONCEPTS
2 |
3 | ## Day 1: introduction to HPC
4 | date: Tuesday 28/09/2023
5 |
6 | ### lectures
7 |
8 | - Stefano Cozzini : [introduction to HPC](lecture01-intro-toHPC.pdf)
9 |
10 |
11 | The lecture above introduces HPC concepts and basic definitions.
12 |
13 | There is plenty of materials on the topic on the web.
14 | Here a few links to start with:
15 |
16 | - [FLOPS definition from wikipedia](https://en.wikipedia.org/wiki/FLOPS)
17 | - [ HPC short introduction from European perspective](https://ec.europa.eu/digital-single-market/en/high-performance-computing)
18 | - [ a must read paper: Reinventing High Performance Computing: Challenges and Opportunities](https://arxiv.org/abs/2203.02544)
19 | - [what can we do with an exascale machine](https://www.hpe.com/us/en/insights/articles/whats-with-the-18-zeros-2009.html)
20 | - [the www.top500.org: it deserves a visit to check a few things](https://www.top500.org)
21 |
22 | Application ( not discussed in lecture)
23 | - [Folding@home project: take a look](https://foldingathome.org/?lng=en)
24 | - [AlphaFold web page](https://alphafold.com/)
25 |
26 | ### Materials for Linux beginners:
27 |
28 | - [one simple tutorial to start using ssh](https://www.ssh.com/ssh/command/)
29 | - [linux/unix shell short tutorial for novice users](http://swcarpentry.github.io/shell-novice/)
30 |
--------------------------------------------------------------------------------
/PARALLELISM/codes/memory.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <assert.h>
5 |
/* Allocate and zero-fill N doubles, storing the new buffer in *vec.
   Aborts via assert() if the allocation fails. */
void callocator(double ** vec, size_t N)
{
    double *buf = calloc(N, sizeof(*buf));
    assert(buf != NULL);
    *vec = buf;
}
11 |
12 | int main(int argc, char **argv)
13 | {
14 | double * v ;
15 | size_t i, j, m;
16 | for (i = 1e3 ; i < 1e8 ; i*=10 ) {
17 | m = sizeof(double) * i ;
18 | callocator(&v, i);
19 | for (j=0; j
16 | #include <stdio.h>
17 | #include <stdlib.h>
18 | #include <time.h>
18 | // if you don't have drand48 uncomment the following two lines
19 | // #define drand48 1.0/RANDMAXrand
20 | // #define srand48 srand
21 | #define seed 68111 // seed for number generator
22 |
23 | int main (int argc, char ** argv) {
24 |
25 | if (argc<2)
26 | {
27 | printf(" Usage: %s number \n",argv[0]);
28 | return 1;
29 | }
30 | long long int N = atoll(argv[1]);
31 | long long int M = 0 ;
32 | double pi = 0;
33 | // point coordinates
34 | double x , y;
35 | clock_t start_time, end_time;
36 | double total_time;
37 | start_time = clock();
38 |
39 | srand48 ( seed ) ; // seed the number generator
40 |
41 | long long int i;
42 | for (i = 0 ; i < N ; i++)
43 | {
44 | // take a point P(x,y) inside the unit square
45 | x = drand48();
46 | y = drand48();
47 |
48 | // check if the point P(x,y) is inside the circle
49 | if ((x*x + y*y)<1)
50 | M++;
51 | }
52 | pi = 4.0*M/N ; // calculate area
53 | end_time=clock();
54 | printf ( "\n # of trials = %llu , estimate of pi is %1.9f \n", N, pi ) ;
55 | total_time= ( (double) (end_time - start_time) )/CLOCKS_PER_SEC ;
56 | printf ( "\n # walltime : %10.8f \n", total_time );
57 | return 0;
58 | }
59 |
60 |
--------------------------------------------------------------------------------
/PARALLELISM/lecture01-intro-toHPC.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture01-intro-toHPC.pdf
--------------------------------------------------------------------------------
/PARALLELISM/lecture02-HPC-hardware.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture02-HPC-hardware.pdf
--------------------------------------------------------------------------------
/PARALLELISM/lecture03-HPCsoftware-stack.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture03-HPCsoftware-stack.pdf
--------------------------------------------------------------------------------
/PARALLELISM/lecture04-on-parallel-programming.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture04-on-parallel-programming.pdf
--------------------------------------------------------------------------------
/PARALLELISM/slurm/README.md:
--------------------------------------------------------------------------------
1 | This folder contains the following files:
2 |
3 | - slurm01.job
4 | A simple example of a batch script for Slurm jobs
5 |
6 | - slurm02_#.job
7 | Three jobs showing how to run job steps within a Slurm job and the differences between allocating tasks and nodes
8 |
9 | - slurm03_#.job
10 | Three jobs showing the importance of specifying walltime and memory requirements
11 |
12 | - slurm04.job
13 | A simple job showing how to: load modules, compile and run an application within a Slurm job
14 |
15 | - slurm05.job
16 | A simple job showing what happens when we load a module
17 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm01.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Name of the job
4 | #SBATCH --job-name=my_first_job
5 |
6 | # Define the number of nodes you need.
7 | #SBATCH --nodes=1
8 |
9 | # Define the number of tasks you need. Use with distributed parallelism
10 | #SBATCH --ntasks=16
11 |
12 | # Eventually, you can further specify the number of tasks per node
13 | #SBATCH --ntasks-per-node=16
14 |
15 | # Define the number of CPUs allocated to each task. Use with shared memory parallelism
16 | #SBATCH --cpus-per-task=2
17 |
18 | # Define how long the job will run in real time. Format is d-hh:mm:ss
19 | # For a 30 seconds job
20 | #SBATCH --time=0-00:00:30
21 |
22 | ## Define the account name, e.g. for the Laboratory of Data Engineering
23 | ##SBATCH -A lade
24 |
25 | # Define the partition on which the job shall run, e.g. EPYC, THIN, GPU, DGX
26 | #SBATCH -p EPYC
27 |
28 | # Define how much memory you need. Choose one between the following
29 | # --mem will define memory per node
30 | # --mem-per-cpu will define memory per CPU/core
31 | #SBATCH --mem-per-cpu=1500MB
32 | ##SBATCH --mem=5GB # this one is not in effect, due to the double hash
33 |
34 | # Specify the output and error files
35 | #SBATCH --output=%x.%j.out
36 | #SBATCH --error=%x.%j.err
37 |
38 | # Eventually, you can turn on mail notification.
39 | # Among the possibilities we can list: NONE, BEGIN, END, FAIL, ALL
40 | ##SBATCH --mail-type=BEGIN,END
41 | ##SBATCH --mail-user=fifo@lifo.com
42 |
43 | # Pick nodes with feature 'foo'. Different clusters have different features available.
44 | # Most of the time you don't need this
45 | ##SBATCH -C foo
46 |
47 | # Restrict the job to run on the node(s) named
48 | ##SBATCH -w epyc008
49 |
50 | #Start the program
51 |
52 | >&2 echo "DIR is ${SLURM_SUBMIT_DIR}"
53 |
54 | srun /bin/hostname
55 | srun sleep 60
56 |
57 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm02_A.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=my_second_job_A
4 | #SBATCH --time=0-00:10:00
5 | #SBATCH -p EPYC
6 | #SBATCH -n3 # 3 tasks
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 | echo Starting job $SLURM_JOB_ID
10 | echo SLURM assigned me these nodes
11 | srun -l hostname
12 |
13 | echo "1)" $(date)
14 | srun -l --exclusive -n2 sleep 60 & # start 2 copies of program 1
15 | echo "2)" $(date)
16 | srun -l --exclusive -n1 sleep 60 & # start 1 copy of program 2
17 | echo "3)" $(date)
18 | wait # wait for all to finish
19 |
20 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm02_B.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=my_second_job_B
4 | #SBATCH --time=0-00:10:00
5 | #SBATCH -p EPYC
6 | #SBATCH -n3 # 3 tasks
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 | echo Starting job $SLURM_JOB_ID
10 | echo SLURM assigned me these nodes
11 | srun -l hostname
12 |
13 | echo "1)" $(date)
14 | srun -l --exclusive -n2 sleep 60 # start 2 copies of program 1
15 | echo "2)" $(date)
16 | srun -l --exclusive -n1 sleep 60 # start 1 copy of program 2
17 | echo "3)" $(date)
18 |
19 |
20 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm02_C.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=my_second_job_C
3 | #SBATCH --time=0-00:10:00
4 | #SBATCH -p EPYC
5 | #SBATCH -n3 # 3 tasks
6 | #SBATCH --output=%x.%j.out
7 | #SBATCH --error=%x.%j.err
8 | #SBATCH -N3 # 3 NODES
9 |
10 | echo Starting job $SLURM_JOB_ID
11 | echo SLURM assigned me these nodes
12 | srun -l hostname
13 | echo "1)" $(date)
14 | srun -l --exclusive -n2 -N2 sleep 60 & # start 2 copies of program 1
15 | echo "2)" $(date)
16 | srun -l --exclusive -n1 -N1 sleep 60 & # start 1 copy of program 2
17 | echo "3)" $(date)
18 | wait # wait for all to finish
19 |
20 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm03_A.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=memory_A # Job name
3 | #SBATCH --ntasks=1 # Run a single task
4 | #SBATCH --mem=70M # Job Memory
5 | #SBATCH --time=00:15:00 # Time limit hrs:min:sec
6 | #SBATCH -p THIN
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 |
10 | pwd; hostname; date
11 | cd ../codes
12 |
13 | gcc memory.c -o memory.x
14 | ./memory.x
15 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm03_B.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=memory_B # Job name
3 | #SBATCH --ntasks=1 # Run a single task
4 | #SBATCH --mem=90M # Job Memory
5 | #SBATCH --time=00:05:00 # Time limit hrs:min:sec
6 | #SBATCH -p THIN
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 |
10 | pwd; hostname; date
11 | cd ../codes
12 |
13 | gcc memory.c -o memory.x
14 | ./memory.x
15 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm03_C.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=memory_C # Job name
3 | #SBATCH --ntasks=1 # Run a single task
4 | #SBATCH --mem=100M # Job Memory
5 | #SBATCH --time=00:00:01 # Time limit hrs:min:sec
6 | #SBATCH -p THIN
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 |
10 | pwd; hostname; date
11 | cd ../codes
12 |
13 | gcc memory.c -o memory.x
14 | ./memory.x
15 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm04.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=compile_and_run_pi
4 | #SBATCH --time=0-00:10:00
5 | #SBATCH -p EPYC
6 | #SBATCH -n1 # 1 tasks
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 | #SBATCH -N1 # 1 NODES
10 | echo Starting job $SLURM_JOB_ID
11 | echo Current dir is ${SLURM_SUBMIT_DIR}
12 |
13 | module purge
14 | module load compiler # For Intel compiler instead of GNU compiler
15 | cd ../codes
16 | echo "Now, I am in $(pwd)"
17 | icx pi.c -O3 -o pi.x
18 | ./pi.x 100000000
19 |
20 |
--------------------------------------------------------------------------------
/PARALLELISM/slurm/slurm05.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=modules # Job name
3 | #SBATCH --ntasks=1 # Run a single task
4 | #SBATCH --time=00:05:00 # Time limit hrs:min:sec
5 | #SBATCH -p EPYC
6 | #SBATCH --output=%x.%j.out
7 | #SBATCH --error=%x.%j.err
8 |
9 | module purge
10 | echo "a) "$LD_LIBRARY_PATH
11 | module load openMPI/4.1.5/gnu
12 | echo "b) "$LD_LIBRARY_PATH
13 | module purge
14 | echo "c) "$LD_LIBRARY_PATH
15 | module load openMPI/4.1.5/icx
16 | echo "d) "$LD_LIBRARY_PATH
17 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/Readme.md:
--------------------------------------------------------------------------------
1 | # This folder collects materials on MPI and OpenMP
2 |
3 | ## MPI section
4 |
5 | A collection of materials/references for the MPI lectures
6 |
7 |
8 | ### lectures (all by S.Cozzini)
9 |
10 | - lecture 5a: [MPI programming partA ](lecture05-MPI-Programming-A.pdf)
11 | - lecture 5b: [MPI programming partB ](lecture05-MPI-Programming-B.pdf)
12 |
13 |
14 | ### Main references for MPI lectures:
15 |
16 | - chapter 9 of reference 4 is a nice and detailed introduction to MPI.
17 | - exercises and tutorials on MPI are present all over the web. Here a couple of examples:
18 | - [Here a very good starting point](https://www.mcs.anl.gov/research/projects/mpi/tutorial/index.html)
19 | - [Another simple tutorial](https://mpitutorial.com/tutorials/)
20 | - [A virtual course where I took a lot of materials, including some exercises](https://cvw.cac.cornell.edu/MPIP2P/)
21 |
22 | ### tutorials (contributed by Niccolo Tosato and Marco Celoria)
23 | - tutorial 1: [compiling and running MPI program on ORFEO (prepared by N.Tosato)](compiling-and-running-mpi-programs.md)
24 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/Brecv.c:
--------------------------------------------------------------------------------
1 | // taken from https://cvw.cac.cornell.edu/MPIP2P/brecv
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <time.h>
5 | #include "mpi.h"
6 | #define TAG 100
7 |
8 | void print_time(double tbegin, double tend);
9 | int new_sleep(int);
10 | int SLEEP(clock_t);
11 |
/* -------------------------------------------------------------------
 * helper to calculate elapsed time and print results
 * -------------------------------------------------------------------
 */
void print_time(double tbegin, double tend)
{
    /* convert the wall-clock interval (seconds) to whole microseconds */
    int usec = (int)((tend - tbegin) * 1000000.0);
    printf(" Elapsed time for send = %8d uSec\n", usec);
}
22 |
/* -----------------------------------------------------------
 * helpers to sleep program
 * -----------------------------------------------------------
 */
/* Busy-wait until clock() has advanced by wait*1000 ticks.
   (With CLOCKS_PER_SEC == 1000000 that is ~wait milliseconds of
   CPU time -- TODO confirm on the target platform.) */
int SLEEP(clock_t wait)
{
    clock_t deadline = clock() + wait * 1000;
    while (clock() < deadline)
        ;
    return 0;
}
36 |
/* Thin wrapper over the busy-wait helper; always reports success. */
int new_sleep(int amount)
{
    SLEEP(amount);
    return 0;
}
42 |
43 | /* -----------------------------------------------------------
44 | * Main Program
45 | * -----------------------------------------------------------
46 | */
47 | int main(int argc, char **argv)
48 | {
49 | float *message; /* message buffer */
50 | int rank, /* rank of task in communicator */
51 | size, i;
52 | int mlen; /* dimension of the message */
53 | MPI_Status status; /* status of communication */
54 | double tbegin, tend; /* used to measure elapsed time */
55 |
56 | if (argc != 2) {
57 | printf(" Usage: blocksends \n");
58 | return -1;
59 | }
60 |
61 | /* -------------------------------------------------------------------
62 | * do initial housekeeping: allocate memory for messages,
63 | * initialize program with MPI, define message tags
64 | * ------------------------------------------------------------------
65 | */
66 |
67 | mlen = atoi(argv[1]);
68 | message = (float *)malloc(mlen * sizeof(float));
69 |
70 |
71 | MPI_Init(&argc, &argv);
72 | MPI_Comm_size(MPI_COMM_WORLD, &size);
73 | MPI_Comm_rank( MPI_COMM_WORLD, &rank );
74 | if(size != 2) {
75 | printf("This application is meant to be run with 2 processes.\n");
76 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
77 | }
78 | printf(" Process %d initialized\n", rank);
79 | printf(" Message size = %6d floats\n", mlen);
80 | printf(" Total size = %6lu bytes\n", (mlen* sizeof(float)));
81 |
82 | /* -----------------------------------------------------------------
83 | * task 0 will report the elapsed time for a blocking send
84 | * -----------------------------------------------------------------
85 | */
86 | if (rank == 0) {
87 | for (i = 0; i < mlen; i++) message[i] = 100;
88 | printf(" Task %d sending message\n", rank);
89 | MPI_Barrier(MPI_COMM_WORLD);
90 | tbegin = MPI_Wtime();
91 | MPI_Send(message, mlen, MPI_FLOAT, 1, TAG, MPI_COMM_WORLD);
92 | tend = MPI_Wtime();
93 | print_time(tbegin, tend);
94 | }
95 |
96 | /* -----------------------------------------------------------------
97 | * task 1 sleeps for 1 second, and then calls a blocking receive.
98 | * the sleep is intended to simulate time spent in useful computation
99 | * -----------------------------------------------------------------
100 | */
101 | else if (rank == 1) {
102 | for (i = 0; i < mlen; i++) message[i] = -100;
103 | MPI_Barrier(MPI_COMM_WORLD);
104 | new_sleep(1);
105 | MPI_Recv(message, mlen, MPI_FLOAT, 0, TAG, MPI_COMM_WORLD, &status );
106 | printf(" Task %d received message\n", rank);
107 | }
108 | MPI_Finalize();
109 | return 0;
110 | }
111 |
112 |
113 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/clean.sh:
--------------------------------------------------------------------------------
1 | rm *.x
2 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/compile_openMPI_gnu.sh:
--------------------------------------------------------------------------------
# Build all basic MPI examples with the GNU OpenMPI toolchain.
# Purge first so stale modules cannot shadow the requested toolchain
# (the Intel twin script already does this — kept consistent here).
module purge
module load openMPI/4.1.5/gnu

mpicc Brecv.c -g3 -o Brecv.x
mpicc CBlockSends.c -g3 -o CBlockSends.x
mpicc deadlock.c -g3 -o deadlock.x
mpicc linear-array.c -g3 -o linear-array.x
mpicc mpi_env_call.c -g3 -o mpi_env_call.x
mpicc mpi_hello_world.c -g3 -o mpi_hello_world.x
mpicc mpi_hello_world_sync.c -g3 -o mpi_hello_world_sync.x
mpif90 mpi_hello_world.F90 -g3 -o mpi_hello_world_F.x
mpicc mpi_pi.c -O3 -g3 -o mpi_pi.x
mpif90 send_message.F90 -g3 -o send_message_F.x
mpicc send_message.c -g3 -o send_message.x
mpicc sendrecv_message.c -g3 -o sendrecv_message.x
15 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/compile_openMPI_intel.sh:
--------------------------------------------------------------------------------
# Build all basic MPI examples with the Intel (icx) OpenMPI toolchain.
module purge
module load openMPI/4.1.5/icx

# Plain C examples, all compiled with the same debug flags.
for src in Brecv CBlockSends deadlock linear-array mpi_env_call \
           mpi_hello_world mpi_hello_world_sync ; do
    mpicc ${src}.c -g3 -o ${src}.x
done

mpif90 mpi_hello_world.F90 -g3 -o mpi_hello_world_F.x
mpicc mpi_pi.c -O3 -g3 -o mpi_pi.x
mpif90 send_message.F90 -g3 -o send_message_F.x
mpicc send_message.c -g3 -o send_message.x
mpicc sendrecv_message.c -g3 -o sendrecv_message.x
16 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/deadlock.c:
--------------------------------------------------------------------------------
1 | // A simple program with a deadloack inside.
2 | // Taken and adapted from somewhere on the net
3 | #include <stdio.h>
4 | #include "mpi.h"
5 | #include <stdlib.h>
6 |
7 | int main(int argc, char *argv[])
8 | {
9 | #define MSGLEN 1024
10 | int ITAG_A = 100, ITAG_B = 200;
11 | int irank, i, isize, idest, isrc, istag, iretag;
12 | float rmsg1[MSGLEN];
13 | float rmsg2[MSGLEN];
14 | MPI_Status recv_status;
15 |
16 | MPI_Init(&argc, &argv);
17 | MPI_Comm_rank(MPI_COMM_WORLD, &irank);
18 | MPI_Comm_size(MPI_COMM_WORLD, &isize);
19 |
20 | if(isize != 2) {
21 | printf("This application is meant to be run with 2 processes.\n");
22 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
23 | }
24 |
25 | printf("I am rank %d of %d \n", irank, isize );
26 | // load an array of float numbers as message
27 | for (i = 1; i <= MSGLEN; i++) {
28 | rmsg1[i] = 100;
29 | rmsg2[i] = -100;
30 | }
31 | if (irank == 0) {
32 | idest = 1;
33 | isrc = 1;
34 | istag = ITAG_A;
35 | iretag = ITAG_B;
36 | }
37 | else if (irank == 1) {
38 | idest = 0;
39 | isrc = 0;
40 | istag = ITAG_B;
41 | iretag = ITAG_A;
42 | }
43 |
44 | printf("Task %d sends the message with tag %d of length %lu \n",
45 | irank, istag, MSGLEN * sizeof(float));
46 |
47 | printf("Task %d receives message with tag %d of length %lu \n",
48 | irank, iretag, MSGLEN * sizeof(float));
49 |
50 | MPI_Barrier(MPI_COMM_WORLD);
51 |
52 | MPI_Send(&rmsg1, MSGLEN, MPI_FLOAT, idest, istag, MPI_COMM_WORLD);
53 | MPI_Recv(&rmsg2, MSGLEN, MPI_FLOAT, isrc, iretag, MPI_COMM_WORLD, &recv_status);
54 | printf("Task %d has received the message\n", irank);
55 | MPI_Finalize();
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/linear-array.c:
--------------------------------------------------------------------------------
1 | // A simple 1-D example:
2 | // each element receive the rank of the previous one, add its rank and send forward.
3 | // taken somewhere on net and adapted.
4 | // Final SUM is the sum of n-1 integers.
5 |
6 | #include <stdio.h>
7 | #include "mpi.h"
8 |
9 | int main(int argc,char *argv[])
10 | {
11 | int MyRank, Numprocs;
12 | int value, sum = 0;
13 | int Source, Source_tag;
14 | int Destination, Destination_tag;
15 | int Root = 0;
16 | MPI_Status status;
17 |
18 | MPI_Init(&argc,&argv);
19 | MPI_Comm_size(MPI_COMM_WORLD, &Numprocs);
20 | MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
21 |
22 | if (MyRank == Root) {
23 | Destination = MyRank + 1;
24 | Destination_tag = 0;
25 | MPI_Send(&MyRank, 1, MPI_INT, Destination, Destination_tag, MPI_COMM_WORLD);
26 | }
27 | else {
28 | if (MyRank
5 |
6 | int main(int argc, char *argv[]) {
7 | int numtasks, rank, len, rc;
8 | char hostname[MPI_MAX_PROCESSOR_NAME];
9 |
10 | // initialize MPI
11 | MPI_Init(&argc,&argv);
12 |
13 | // get number of tasks
14 | MPI_Comm_size(MPI_COMM_WORLD,&numtasks);
15 |
16 | // get my rank
17 | MPI_Comm_rank(MPI_COMM_WORLD,&rank);
18 |
19 | // this one is obvious
20 | MPI_Get_processor_name(hostname, &len);
21 | printf ("Number of tasks= %d. My rank= %d. Running on %s\n", numtasks, rank, hostname);
22 |
23 | // done with MPI
24 | MPI_Finalize();
25 | }
26 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_hello_world.F90:
--------------------------------------------------------------------------------
! Every MPI rank prints a hello-world line with the node name, its rank
! and the communicator size.
PROGRAM hello
INCLUDE 'mpif.h'
INTEGER err, rank, size, name_len
CHARACTER(MPI_MAX_PROCESSOR_NAME) processor_name
CALL MPI_INIT(err)
CALL MPI_COMM_RANK(MPI_COMM_WORLD,rank,err)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD,size,err)
CALL MPI_GET_PROCESSOR_NAME(processor_name,name_len,err)
print *, 'Hello world from processor ', processor_name, ' rank ', rank, ' out of ', size, ' processors'
CALL MPI_FINALIZE(err)
END
12 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_hello_world.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 |
4 | int main(int argc, char** argv) {
5 | // Initialize the MPI environment
6 | MPI_Init(NULL, NULL);
7 |
8 | // Get the number of processes
9 | int world_size;
10 | MPI_Comm_size(MPI_COMM_WORLD, &world_size);
11 |
12 | // Get the rank of the process
13 | int world_rank;
14 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
15 |
16 | // Get the name of the processor
17 | char processor_name[MPI_MAX_PROCESSOR_NAME];
18 | int name_len;
19 | MPI_Get_processor_name(processor_name, &name_len);
20 |
21 | fprintf(stdout, "Hello world from processor %s, rank %d out of %d processors\n",
22 | processor_name, world_rank, world_size);
23 | // Finalize the MPI environment.
24 | MPI_Finalize();
25 | }
26 |
27 |
28 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_hello_world_sync.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 |
4 | int main(int argc, char** argv) {
5 | // Initialize the MPI environment
6 | MPI_Init(NULL, NULL);
7 |
8 | // Get the number of processes
9 | int world_size;
10 | MPI_Comm_size(MPI_COMM_WORLD, &world_size);
11 |
12 | // Get the rank of the process
13 | int world_rank;
14 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
15 |
16 | // Get the name of the processor
17 | char processor_name[MPI_MAX_PROCESSOR_NAME];
18 | int name_len;
19 | MPI_Get_processor_name(processor_name, &name_len);
20 |
21 | // Print off a hello world message
22 | for (int i=0; i
11 | #include <stdio.h>
12 | #include <stdlib.h>
13 | #include <math.h>
14 | #include <time.h>
15 | #include <mpi.h>
16 | #define USE MPI
17 | #define SEED 35791246
18 |
19 | int main (int argc , char *argv[])
20 | {
21 | // coordinates
22 | double x, y;
23 |
24 | // number of points inside the circle
25 | long long int M, local_M;
26 | double pi;
27 |
28 | // times
29 | double start_time, comp_time, end_time, wall_time, avg_walltime, max_walltime;
30 | int myid, numprocs, proc;
31 | MPI_Status status;
32 | MPI_Request request;
33 | // master process
34 | int master = 0;
35 | int tag = 123;
36 |
37 | MPI_Init(&argc, &argv);
38 | MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
39 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
40 | fprintf (stdout, "I am %d\n", myid);
41 | if (argc <=1 ) {
42 | fprintf(stderr, "Usage : mpi -np n %s number_of_iterations \n", argv[0]);
43 | MPI_Finalize();
44 | exit(-1);
45 | }
46 |
47 | long long int N = atoll(argv[1])/numprocs;
48 | // take time of processors after initial I/O operation
49 | start_time = MPI_Wtime();
50 |
51 | // initialize random numbers
52 | srand48(SEED * (myid + 1)); // seed the number generator
53 | local_M = 0;
54 | long long int i;
55 | for (i = 0; i < N ; i++) {
56 | // take a point P(x,y) inside the unit square
57 | x = drand48();
58 | y = drand48();
59 | // check if the point P(x,y) is inside the circle
60 | if ( (x*x + y*y) < 1)
61 | local_M++;
62 | }
63 | // take time of processors after initial I/O operation
64 | MPI_Barrier(MPI_COMM_WORLD);
65 | comp_time=MPI_Wtime();
66 |
67 | if (myid == 0) { //if I am the master process gather results from others
68 | M = local_M;
69 | for (proc = 1; proc < numprocs; proc++) {
70 | MPI_Recv(&local_M, 1, MPI_LONG_LONG, proc, tag, MPI_COMM_WORLD, &status);
71 | M += local_M;
72 | }
73 | pi = 4.0 * M / (N * numprocs);
74 | end_time = MPI_Wtime();
75 | }
76 | else { // for all the slave processes send results to the master /
77 | MPI_Ssend(&local_M, 1,MPI_LONG_LONG, master, tag, MPI_COMM_WORLD);
78 | end_time=MPI_Wtime();
79 | }
80 |
81 | wall_time = end_time - start_time;
82 | MPI_Reduce(&wall_time, &avg_walltime, 1, MPI_DOUBLE, MPI_SUM, master, MPI_COMM_WORLD);
83 | avg_walltime = avg_walltime / numprocs;
84 | MPI_Reduce(&wall_time, &max_walltime, 1, MPI_DOUBLE, MPI_MAX, master, MPI_COMM_WORLD);
85 |
86 | fprintf(stdout, "\n# walltime on processor %i : %10.8f\n", myid, wall_time);
87 | fprintf(stdout, "\n# walltime after computation on processor %i : %10.8f\n", myid, comp_time - start_time);
88 | fprintf(stdout, "\n# walltime for communication on processor %i : %10.8f\n", myid, end_time - comp_time);
89 | fflush(stdout);
90 | if (myid ==0) {
91 | printf ( "\n# of trials = %llu , estimate of pi is %1.9f\n", N * numprocs, pi);
92 | fprintf(stdout, "\n[*] Average Walltime: %10.8f\n", avg_walltime);
93 | fprintf(stdout, "\n(*) Max Walltime: %10.8f\n", max_walltime);
94 | fflush(stdout);
95 | }
96 | MPI_Finalize() ; // let MPI finish up /
97 |
98 | }
99 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_pi.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #SBATCH --job-name=pi_epyc
4 | #SBATCH --time=0-00:10:00
5 | #SBATCH -p EPYC
6 | #SBATCH -n128
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 | #SBATCH -N1 # 1 NODES
10 | echo Starting job $SLURM_JOB_ID
11 | echo Current dir is ${SLURM_SUBMIT_DIR}
12 |
13 | module purge
14 | module load compiler
15 | module load intelMPI/2021.7.1
16 | mpiicc -cc=icx mpi_hello_world.c -g3 -o mpi_hello_world.x
17 | mpiicc -cc=icx mpi_hello_world_sync.c -g3 -o mpi_hello_world_sync.x
18 | mpiifort mpi_hello_world.F90 -g3 -o mpi_hello_world_F.x
19 | mpiicc -cc=icx mpi_pi.c -O3 -g3 -o mpi_pi.x
20 |
21 | mpirun -np 12 ./mpi_pi.x 10000
22 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/send_message.F90:
--------------------------------------------------------------------------------
1 |
Program MPI
! a simple implementation of send/receive message:
! rank 1 sends the integer 33 (tag 10) to rank 0, which checks it.
Implicit None
!
Include 'mpif.h'
!
Integer :: rank
Integer :: buffer
Integer, Dimension( 1:MPI_status_size ) :: status
Integer :: error
!
Call MPI_init( error )
Call MPI_comm_rank( MPI_comm_world, rank, error )
!
! Rank 0 blocks until rank 1's message arrives, then verifies it.
If( rank == 0 ) Then
Call MPI_recv( buffer, 1, MPI_integer, 1, 10, &
MPI_comm_world, status, error )
Print*, 'Rank ', rank, ' buffer=', buffer
If( buffer /= 33 ) Print*, 'fail'
End If
!
! Rank 1 sends the test value.
If( rank == 1 ) Then
buffer = 33
Call MPI_send( buffer, 1, MPI_integer, 0, 10, &
MPI_comm_world, error )
End If
!
Call MPI_finalize( error )
End Program MPI
31 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/send_message.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | int main(int argc, char** argv) {
6 | MPI_Init(&argc, &argv);
7 | int rank, size;
8 | int buffer;
9 | MPI_Status status;
10 | MPI_Comm_size(MPI_COMM_WORLD, &size);
11 | MPI_Comm_rank(MPI_COMM_WORLD, &rank);
12 | if(size != 2) {
13 | printf("This application is meant to be run with 2 processes.\n");
14 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
15 | }
16 | if (rank == 0){
17 | // int MPI_Recv(void* buffer, int count, MPI_Datatype datatype,
18 | // int sender, int tag, MPI_Comm communicator, MPI_Status* status);
19 | MPI_Recv(&buffer, 1, MPI_INT, 1, 9, MPI_COMM_WORLD, &status);
20 | fprintf(stdout, "Rank %d: buffer = %d \n", rank, buffer);
21 | if (buffer != 33) fprintf(stderr, "Fail\n");
22 | }
23 | if (rank == 1) {
24 | buffer = 33;
25 | // int MPI_Send(const void* buffer, int count, MPI_Datatype datatype,
26 | // int recipient, int tag, MPI_Comm communicator);
27 | MPI_Send(&buffer, 1, MPI_INT, 0, 9, MPI_COMM_WORLD);
28 | }
29 | MPI_Finalize();
30 | return EXIT_SUCCESS;
31 | }
32 |
33 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/sendrecv_message.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | int main(int argc, char** argv) {
6 | MPI_Init(&argc, &argv);
7 | int rank, size;
8 | int buffer;
9 | char message[2][16];
10 | MPI_Status status;
11 | MPI_Comm_size(MPI_COMM_WORLD, &size);
12 | MPI_Comm_rank(MPI_COMM_WORLD, &rank);
13 | if(size != 2) {
14 | printf("This application is meant to be run with 2 processes.\n");
15 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
16 | }
17 | if (rank == 0){
18 | strcpy(message[0], "skew");
19 | strcpy(message[1], "squeue");
20 | // int MPI_Sendrecv(const void* buffer_send, int count_send,
21 | // MPI_Datatype datatype_send, int recipient, int tag_send,
22 | // void* buffer_recv, int count_recv,
23 | // MPI_Datatype datatype_recv, int sender, int tag_recv,
24 | // MPI_Comm communicator, MPI_Status* status);
25 | MPI_Sendrecv(message, 32, MPI_CHAR, 1, 10,
26 | &buffer, 1, MPI_INT, 1, 9,
27 | MPI_COMM_WORLD, &status);
28 | fprintf(stdout, "Rank %d: buffer = %d \n", rank, buffer);
29 | if (buffer != 33) fprintf(stderr, "Fail\n");
30 | }
31 | if (rank == 1) {
32 | buffer = 33;
33 | MPI_Sendrecv(&buffer, 1, MPI_INT, 0, 9,
34 | message, 32, MPI_CHAR, 0, 10,
35 | MPI_COMM_WORLD, &status);
36 |
37 | fprintf(stdout, "Rank %d: message[0] = %s, message[1] = %s \n",
38 | rank, message[0], message[1]);
39 | }
40 | MPI_Finalize();
41 | return EXIT_SUCCESS;
42 | }
43 |
44 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/allgather.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | #SBATCH --job-name=allgather-example
4 | #SBATCH -p GPU
5 | #SBATCH --nodes=2
6 | #SBATCH --ntasks-per-node=1
7 | ##SBATCH --gres=gpu:2
8 | #SBATCH --time=0:10:00
9 | #SBATCH -o allgather.%A.out
10 | #SBATCH -e allgather.%A.error
11 | ##SBATCH -A lade
12 | #SBATCH --wait-all-nodes=1
13 | #SBATCH --cpus-per-task=16
14 | #SBATCH --mem=10G
15 | ##SBATCH -w dgx002
16 | CURRENT_DIR=${SLURM_SUBMIT_DIR}
17 | head_node=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
18 | head_node_ip=$( srun --nodes=1 --ntasks=1 -w "$head_node" --exclusive hostname --ip-address)
19 | echo "head_node=" ${head_node} " - head_node_ip=" $head_node_ip
20 | #export LOGLEVEL=INFO
21 | #export NCCL_DEBUG=INFO
22 | export OMP_NUM_THREADS=16
23 | cd ../..
24 | source myenv_v100/bin/activate
25 | cd -
26 | echo $(pwd)
27 | echo ${CUDA_VISIBLE_DEVICES}
28 |
29 | srun -l torchrun \
30 | --nnodes 2 \
31 | --nproc_per_node 2 \
32 | --rdzv_id $RANDOM \
33 | --rdzv_backend c10d \
34 | --rdzv_endpoint $head_node_ip:29500 \
35 | allgather.py
36 |
37 |
38 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/allgather.py:
--------------------------------------------------------------------------------
1 | import torch.distributed as dist
2 | import torch.multiprocessing as mp
3 | import torch
4 | import os
5 |
6 | # salloc -N1 -n1 -c64 -A lade -p DGX --gpus-per-node=4 --time=1:59:00
7 | # srun python tmp.py
def ddp_setup():
    """Join the NCCL process group and pin this process to its local GPU."""
    dist.init_process_group(backend="nccl")
    device_index = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(device_index)
11 |
12 | #def ddp_setup(rank: int, world_size: int):
13 | # """
14 | # Args:
15 | # rank: Unique identifier of each process
16 | # world_size: Total number of processes
17 | # """
18 | # os.environ["MASTER_ADDR"] = "localhost"
19 | # os.environ["MASTER_PORT"] = "12355"
20 | # dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
21 | # torch.cuda.set_device(rank)
22 |
def run():
    """Variable-length all-gather demo.

    Each rank builds an int tensor of random length filled with its own
    rank id, then every rank reconstructs the concatenation of all ranks'
    tensors via two all_gather rounds: lengths first, then the payloads.
    """
    local_rank = int(os.environ["LOCAL_RANK"])
    global_rank = int(os.environ["RANK"])
    world_size = dist.get_world_size()
    # Seed by rank so each process draws a different (but reproducible) length.
    torch.manual_seed(global_rank)
    # NOTE(review): high=10 allows n == 0, i.e. an empty contribution — confirm intended.
    n = torch.randint(high=10, size=(1,), dtype=int).to(local_rank)
    a = torch.tensor([global_rank] * n, dtype=int).to(local_rank)
    # Rank-ordered printing of the local tensor.
    for p in range(world_size):
        if global_rank==p:
            print(f"A) {global_rank}: {a}", flush=True)
        dist.barrier()
    # Round 1: every rank learns every other rank's element count.
    nelements_list = [torch.zeros_like(n).to(local_rank) for _ in range(world_size)]
    dist.all_gather(tensor = n, tensor_list = nelements_list)
    # Round 2: receive buffers sized per-rank from the counts gathered above.
    gather_list = [torch.zeros(int(nelements_list[i]), dtype=int).to(local_rank) for i in range(world_size)]
    dist.all_gather(tensor = a, tensor_list = gather_list)
    res = torch.cat((gather_list))
    # Rank-ordered printing of the reconstructed global tensor.
    for p in range(world_size):
        if global_rank==p:
            print(f"B) {global_rank}: {res}", flush=True)
        dist.barrier()
43 |
def main():
    # Initialise the process group, run the demo, then shut down cleanly.
    ddp_setup()
    run()
    dist.destroy_process_group()
48 |
if __name__ == "__main__":
    # world_size = torch.cuda.device_count()
    # mp.spawn(main, args=(world_size,), nprocs=world_size)
    # Under torchrun the RANK/LOCAL_RANK environment variables are already
    # set, so main() is called directly instead of mp.spawn.
    main()
53 |
54 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/allgatherv.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <string.h>
5 | #include <math.h>
6 | #include <time.h>
7 | #include <assert.h>
8 | #define SEED 35791246
9 |
10 | int main(int argc, char** argv) {
11 |
12 | int myid, nproc;
13 | MPI_Init(NULL, NULL);
14 | MPI_Comm_size(MPI_COMM_WORLD, &nproc);
15 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
16 | srand(SEED*(myid+1)) ; // seed the number generator
17 | int numel = 1 + (rand() % 9);
18 | int totel;
19 |
20 | int counts_recv[nproc];
21 | int displacements[nproc];
22 | MPI_Allgather(&numel, 1, MPI_INT, counts_recv, 1, MPI_INT, MPI_COMM_WORLD);
23 | displacements[0] = 0 ;
24 | for (int i = 1; i < nproc ; i++){
25 | displacements[i] = displacements[i-1] + counts_recv[i-1];
26 | }
27 |
28 | double * a = (double*)malloc(sizeof(double) * numel);
29 | assert(a != NULL);
30 |
31 | for (int i=0; i < numel; i++) {
32 | a[i]=myid;
33 | }
34 |
35 | for (int i = 0; i < nproc; i++) {
36 | if (i == myid) {
37 | fprintf(stdout, "BEFORE\tmyid = %d\n", myid );
38 | for (int n = 0 ; n < numel; n++)
39 | fprintf(stdout, "\ta[%d]=%.1f\n", n, a[n]);
40 | fprintf(stdout, "\n");
41 | fflush(stdout);
42 | }
43 | MPI_Barrier(MPI_COMM_WORLD);
44 | }
45 |
46 | MPI_Allreduce(&numel, &totel, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
47 | double * b = (double*)malloc(sizeof(double) * totel);
48 | assert(b != NULL);
49 |
50 | MPI_Allgatherv(a, numel, MPI_DOUBLE, b, counts_recv, displacements, MPI_DOUBLE, MPI_COMM_WORLD);
51 | for (int i = 0; i < nproc; i++) {
52 | if (i == myid) {
53 | fprintf(stdout, "AFTER\tmyid = %d\n", myid );
54 | for (int n = 0 ; n < totel; n++)
55 | fprintf(stdout, "\tb[%d]=%.1f\n", n, b[n]);
56 | fprintf(stdout, "\n");
57 | fflush(stdout);
58 | }
59 | MPI_Barrier(MPI_COMM_WORLD);
60 | }
61 | free(a);
62 | free(b);
63 | MPI_Finalize();
64 | }
65 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/b_cast.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <assert.h>
5 |
6 | int main(int argc, char** argv) {
7 |
8 | int num_elements = 2;
9 | int myid, nproc, root;
10 | MPI_Init(NULL, NULL);
11 | MPI_Comm_size(MPI_COMM_WORLD, &nproc);
12 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
13 | double * a = (double*)malloc(sizeof(double) * num_elements);
14 | assert(a != NULL);
15 | for (int i=0; i < num_elements; i++) {
16 | a[i]=0.;
17 | }
18 | root = 0;
19 | if (myid == root) {
20 | for (int i = 0 ; i < num_elements; i++)
21 | a[i] = 2. * (i + 1.);
22 | }
23 | for (int i = 0; i < nproc; i++) {
24 | if (i == myid) {
25 | fprintf(stdout, "%d\tbefore:", myid );
26 | for (int n = 0 ; n < num_elements; n++)
27 | fprintf(stdout, "\ta[%d]=%.2f ", n, a[n]);
28 | fprintf(stdout, "\n");
29 | fflush(stdout);
30 | }
31 | MPI_Barrier(MPI_COMM_WORLD);
32 | }
33 | // int MPI_Bcast(void* buffer, int count, MPI_Datatype datatype, int emitter_rank, MPI_Comm communicator);
34 | MPI_Bcast(a, num_elements, MPI_DOUBLE, 0, MPI_COMM_WORLD);
35 | for (int i = 0; i < nproc; i++) {
36 | if (i == myid) {
37 | fprintf(stdout, "%d\tafter:", myid );
38 | for (int n = 0 ; n < num_elements; n++)
39 | fprintf(stdout, "\ta[%d]=%.2f ", n, a[n]);
40 | fprintf(stdout, "\n");
41 | fflush(stdout);
42 | }
43 | MPI_Barrier(MPI_COMM_WORLD);
44 | }
45 | free(a);
46 | MPI_Finalize();
47 | }
48 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/b_cast.f:
--------------------------------------------------------------------------------
1 | PROGRAM broad_cast
2 | INCLUDE 'mpif.h'
3 | INTEGER ierr, myid, nproc, root
4 | INTEGER status(MPI_STATUS_SIZE)
5 | REAL A(2)
6 | CALL MPI_INIT(ierr)
7 | CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
8 | CALL MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
9 | root = 0
10 | a(1)=0.0
11 | A(2)=0.0
12 | IF( myid .EQ. 0 ) THEN
13 | a(1) = 2.0
14 | a(2) = 4.0
15 | END IF
16 | WRITE(6,*) myid, ' before: a(1)=', a(1), 'a(2)=', a(2)
17 | CALL MPI_BARRIER()
18 | CALL MPI_BCAST(a, 2, MPI_REAL, 0, MPI_COMM_WORLD, ierr)
19 | WRITE(6,*) myid, ' after : a(1)=', a(1), 'a(2)=', a(2)
20 | CALL MPI_FINALIZE(ierr)
21 | END
22 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/clean.sh:
--------------------------------------------------------------------------------
# Remove the compiled executables.
# -f keeps the script silent and exiting 0 when no *.x files exist
# (the original errored out on an already-clean directory).
rm -f *.x
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/compile.sh:
--------------------------------------------------------------------------------
# Build the collective-MPI examples (C and Fortran) with the GNU toolchain.
module purge
module load openMPI/4.1.5/gnu

# C examples
mpicc scatter.c -o scatter_c.x
mpicc gather.c -o gather_c.x
mpicc b_cast.c -o b_cast_c.x
mpicc reduce.c -o reduce_c.x
mpicc mpi_bcastcompare.c -o mpi_bcastcompare.x
mpicc allgatherv.c -o allgatherv.x
mpicc all2allv3d.c -o all2allv3d.x

# Fortran examples (removed the accidentally duplicated scatter.f line)
mpifort scatter.f -o scatter_f.x
mpifort gather.f -o gather_f.x
mpifort b_cast.f -o b_cast_f.x
mpifort reduce.f -o reduce_f.x
16 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/gather.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <assert.h>
5 |
6 | int main(int argc, char** argv) {
7 |
8 | int myid, nproc, root;
9 | int num_elements = 8;
10 | int nsnd = 2;
11 | double *a;
12 | double *b;
13 | a = (double*)malloc(sizeof(double) * num_elements);
14 | b = (double*)malloc(sizeof(double) * nsnd);
15 | assert(a != NULL);
16 | assert(b != NULL);
17 | MPI_Init(NULL, NULL);
18 | MPI_Comm_size(MPI_COMM_WORLD, &nproc);
19 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
20 | int gat_elements = nsnd * nproc;
21 | root=0;
22 | if(num_elements < gat_elements && myid == root) {
23 | printf("This application is meant to be run with no more than %d MPI processes.\n", num_elements/nsnd);
24 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
25 | }
26 | for (int i = 0; i < nsnd; i++)
27 | b[i] = myid;
28 | // int MPI_Gather(const void* buffer_send, int count_send, MPI_Datatype datatype_send,
29 | // void* buffer_recv, int count_recv, MPI_Datatype datatype_recv,
30 | // int root, MPI_Comm communicator);
31 | MPI_Gather(b, nsnd, MPI_DOUBLE, a, nsnd, MPI_DOUBLE, root, MPI_COMM_WORLD);
32 | if (myid==root) {
33 | fprintf(stdout, "myid=%d:\n", myid);
34 | for (int i = 0; i < gat_elements; i++)
35 | fprintf(stdout, "\ta[%d]=%.2f\n", i, a[i]);
36 | fprintf(stdout, "\n");
37 | for (int i = gat_elements; i < num_elements; i++)
38 | fprintf(stdout, "\t\ta[%d]=%.2f\n", i, a[i]);
39 | }
40 | free(a);
41 | free(b);
42 | MPI_Finalize();
43 | }
44 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/gather.f:
--------------------------------------------------------------------------------
! Gathers 2 reals from every rank into the root's array A, in rank order.
PROGRAM gather
INCLUDE 'mpif.h'
INTEGER ierr, myid, nproc, nsnd, I, root
INTEGER status(MPI_STATUS_SIZE)
REAL A(16), B(2)
CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
root = 0
! Each rank contributes nsnd copies of its own rank id.
b(1) = REAL( myid )
b(2) = REAL( myid )
nsnd = 2
CALL MPI_GATHER(b, nsnd, MPI_REAL, a, nsnd,
& MPI_REAL, root, MPI_COMM_WORLD, ierr)
! Only the root holds the gathered data.
IF( myid .eq. root ) THEN
DO i = 1, (nsnd*nproc)
WRITE(6,*) myid, ': a(i)=', a(i)
END DO
END IF
CALL MPI_FINALIZE(ierr)
END
22 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/mpi_bcastcompare.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <assert.h>
5 |
6 | void my_bcast(void* data, int count, MPI_Datatype datatype, int root,
7 | MPI_Comm communicator) {
8 | int world_rank;
9 | MPI_Comm_rank(communicator, &world_rank);
10 | int world_size;
11 | MPI_Comm_size(communicator, &world_size);
12 |
13 | if (world_rank == root) {
14 | // If we are the root process, send our data to everyone
15 | int i;
16 | for (i = 0; i < world_size; i++) {
17 | if (i != world_rank) {
18 | MPI_Send(data, count, datatype, i, 0, communicator);
19 | }
20 | }
21 | } else {
22 | // If we are a receiver process, receive the data from the root
23 | MPI_Recv(data, count, datatype, root, 0, communicator, MPI_STATUS_IGNORE);
24 | }
25 | }
26 |
27 | int main(int argc, char** argv) {
28 |
29 | int num_elements = 1000;
30 | int num_trials = 10;
31 |
32 | MPI_Init(NULL, NULL);
33 |
34 | int world_rank;
35 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
36 |
37 | double total_my_bcast_time = 0.0;
38 | double total_mpi_bcast_time = 0.0;
39 | int i;
40 | int* data = (int*)malloc(sizeof(int) * num_elements);
41 | assert(data != NULL);
42 |
43 | for (i = 0; i < num_trials; i++) {
44 | // Time my_bcast
45 | // Synchronize before starting timing
46 | MPI_Barrier(MPI_COMM_WORLD);
47 | total_my_bcast_time -= MPI_Wtime();
48 | my_bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD);
49 | // Synchronize again before obtaining final time
50 | MPI_Barrier(MPI_COMM_WORLD);
51 | total_my_bcast_time += MPI_Wtime();
52 |
53 | // Time MPI_Bcast
54 | MPI_Barrier(MPI_COMM_WORLD);
55 | total_mpi_bcast_time -= MPI_Wtime();
56 | MPI_Bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD);
57 | MPI_Barrier(MPI_COMM_WORLD);
58 | total_mpi_bcast_time += MPI_Wtime();
59 | }
60 |
61 | // Print off timing information
62 | if (world_rank == 0) {
63 | printf("Data size = %d, Trials = %d\n", num_elements * (int)sizeof(int), num_trials);
64 | printf("Avg my_bcast time = %lf\n", total_my_bcast_time / num_trials);
65 | printf("Avg MPI_Bcast time = %lf\n", total_mpi_bcast_time / num_trials);
66 | }
67 |
68 | free(data);
69 | MPI_Finalize();
70 | }
71 |
72 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/reduce.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 |
4 | int main(int argc, char** argv) {
5 |
6 | int num_elements = 2;
7 | int myid, nproc, root;
8 | double a[num_elements], b[num_elements];
9 | for (int i = 0; i < num_elements; i++)
10 | a[i] = 2.0 * (1+i);
11 | root=0;
12 | MPI_Init(NULL, NULL);
13 | int world_rank;
14 | MPI_Comm_size(MPI_COMM_WORLD, &nproc);
15 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
16 | //int MPI_Reduce(const void* send_buffer, void* receive_buffer, int count,
17 | // MPI_Datatype datatype, MPI_Op operation, int root, MPI_Comm communicator);
18 | MPI_Reduce(a, b, num_elements, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
19 | if (myid == 0) {
20 | fprintf(stdout,"myid=%d:\n", myid);
21 | for (int i = 0; i < num_elements; i++)
22 | fprintf(stdout,"\tb[%d]=%.2f\n", i, b[i]);
23 | fprintf(stdout,"\n");
24 | }
25 | MPI_Finalize();
26 | }
27 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/reduce.f:
--------------------------------------------------------------------------------
! Sums a 2-element real array across all ranks onto the root rank.
PROGRAM reduce
INCLUDE 'mpif.h'
INTEGER ierr, myid, nproc, root
INTEGER status(MPI_STATUS_SIZE)
REAL A(2), res(2)
CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
root = 0
! Every rank contributes the same values, so res = nproc * a on the root.
a(1) = 2.0
a(2) = 4.0
CALL MPI_REDUCE(a, res, 2, MPI_REAL, MPI_SUM, root,
& MPI_COMM_WORLD, ierr)
! Only the root holds the reduced result.
IF( myid .EQ. 0 ) THEN
WRITE(6,*) myid, ': res(1)=', res(1), 'res(2)=', res(2)
END IF
CALL MPI_FINALIZE(ierr)
END
19 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/scatter.c:
--------------------------------------------------------------------------------
1 | #include <mpi.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #include <assert.h>
5 |
6 |
7 |
8 |
9 | int main(int argc, char** argv) {
10 |
11 | int myid, nproc, root;
12 | int num_elements = 8;
13 | int nsnd = 2;
14 | double a[num_elements];
15 | double *b;
16 | b = (double*)malloc(sizeof(double) * nsnd);
17 | assert(b != NULL);
18 | MPI_Init(NULL, NULL);
19 | MPI_Comm_size(MPI_COMM_WORLD, &nproc);
20 | MPI_Comm_rank(MPI_COMM_WORLD, &myid);
21 | root=0;
22 | if(nproc * nsnd != num_elements && myid == root) {
23 | printf("This application is meant to be run with %d MPI processes.\n", num_elements/nsnd);
24 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
25 | }
26 | if (myid == root) {
27 | for (int i = 0; i < num_elements; i++)
28 | a[i] = i+1;
29 | }
30 | // int MPI_Scatter(const void* buffer_send, int count_send, MPI_Datatype datatype_send,
31 | // void* buffer_recv, int count_recv, MPI_Datatype datatype_recv,
32 | // int root, MPI_Comm communicator);
33 |
34 | MPI_Scatter(a, nsnd, MPI_DOUBLE, b, nsnd, MPI_DOUBLE, root, MPI_COMM_WORLD);
35 | fprintf(stdout, "myid=%d:\tb[0]=%.2f,\tb[1]=%.2f\n",myid, b[0], b[1] );
36 | free(b);
37 | MPI_Finalize();
38 | }
39 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/collective-mpi/scatter.f:
--------------------------------------------------------------------------------
! Scatters the root's 16-element real array in chunks of 2 to every rank.
PROGRAM scatter
INCLUDE 'mpif.h'
INTEGER ierr, myid, nproc, nsnd, I, root
INTEGER status(MPI_STATUS_SIZE)
REAL A(16), B(2)
CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
root = 0
! Only the root's send buffer needs real data.
IF( myid .eq. root ) THEN
DO i = 1, 16
a(i) = REAL(i)
END DO
END IF
nsnd = 2
! Rank r receives elements 2r+1 and 2r+2 of the root's array.
CALL MPI_SCATTER(a, nsnd, MPI_REAL, b, nsnd,
& MPI_REAL, root, MPI_COMM_WORLD, ierr)
WRITE(6,*) myid, ': b(1)=', b(1), 'b(2)=', b(2)
CALL MPI_FINALIZE(ierr)
END
21 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/lecture05-MPI-Programming-part-A.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/MPI/lecture05-MPI-Programming-part-A.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/lecture05-MPI-Programming-part-B.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/MPI/lecture05-MPI-Programming-part-B.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/lecture06-Network-basics-for-MPI-application.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/MPI/lecture06-Network-basics-for-MPI-application.pptx
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/MPI/pi_scalability/scalability.job:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=scaling # Job name
3 | #SBATCH --nodes=1
4 | #SBATCH --ntasks=128 # Run a single task
5 | #SBATCH --time=01:20:00 # Time limit hrs:min:sec
6 | #SBATCH -p EPYC
7 | #SBATCH --output=%x.%j.out
8 | #SBATCH --error=%x.%j.err
9 | #SBATCH --exclusive
10 | module purge
11 | module load openMPI/4.1.5/gnu
12 | PI="../basic-mpi-codes/mpi_pi"
13 | mpicc -O3 ${PI}.c -o mpi_pi.x
14 | element="socket"
15 | N=1000000000
16 | echo "tasks, N, avg_walltime" > pi_strong.csv
17 | for i in $(eval echo {0..$SLURM_NTASKS..8});
18 | do
19 | if [ "$i" -eq "0" ]
20 | then
21 | echo -n "1, $N," >> pi_strong.csv
22 | mpirun --map-by ${element} -np 1 ./mpi_pi.x $N | grep "\[*\]" | awk 'BEGIN {FS=":"}; {print $2}' >> pi_strong.csv
23 | else
24 | echo -n "$i, $N," >> pi_strong.csv
25 | mpirun --map-by ${element} -np $i ./mpi_pi.x $N | grep "\[*\]" | awk 'BEGIN {FS=":"}; {print $2}' >> pi_strong.csv
26 | fi
27 | done
28 |
29 | echo "tasks, N, avg_walltime" > pi_weak.csv
30 | for i in $(eval echo {0..$SLURM_NTASKS..8});
31 | do
32 | if [ "$i" -eq "0" ]
33 | then
34 | M=$N
35 | echo -n "1, $M," >> pi_weak.csv
36 | mpirun --map-by ${element} -np 1 ./mpi_pi.x $M | grep "\[*\]" | awk 'BEGIN {FS=":"}; {print $2}' >> pi_weak.csv
37 | else
38 | M=$((${N}*${i}))
39 | echo -n "$i, $M," >> pi_weak.csv
40 | mpirun --map-by ${element} -np $i ./mpi_pi.x $M | grep "\[*\]" | awk 'BEGIN {FS=":"}; {print $2}' >> pi_weak.csv
41 | fi
42 | done
43 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/00--Memory_model.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/00--Memory_model.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/01--Intro_to_OpenMP.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/01--Intro_to_OpenMP.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/02--parallel_regions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/02--parallel_regions.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/03--loops.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/03--loops.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/04--threads_affinity.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/04--threads_affinity.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/.#for.c:
--------------------------------------------------------------------------------
1 | luca@ggg.2121:1698304345
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_loops/loop_without_for.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <omp.h>
37 |
38 |
39 | #define N_default 1000 // how long is the main array
40 |
41 | int main( int argc, char **argv )
42 | {
43 |
44 | int N = N_default;
45 | int nthreads = 1;
46 |
47 | // check whether some arg has been passed on
48 | if ( argc > 1 )
49 | {
50 | N = atoi( *(argv+1) );
51 | if ( argc > 2 )
52 | nthreads = atoi( *(argv+2) );
53 | }
54 |
55 | if( nthreads > 1 )
56 | omp_set_num_threads(nthreads);
57 | #pragma omp parallel
58 | {
59 | int me = omp_get_thread_num();
60 | int nthreads = omp_get_num_threads();
61 |
62 | int chunk = N / nthreads;
63 | int mod = N % nthreads;
64 | int my_first = chunk*me + ((me < mod)?me:mod);
65 | int my_chunk = chunk + (mod > 0)*(me < mod);
66 |
67 | #pragma omp single
68 | printf("nthreads: %d, N: %d --- chunk is %d, reminder is %d\n", nthreads, N, chunk, mod);
69 |
70 | printf("thread %d : from %d to %d\n", me, my_first, my_first+my_chunk);
71 |
72 | /*
73 | * here you could then insert a for loop
74 | * int my_stop = my_first + my_chunk;
75 | * for( int i = myfirst; i < my_stop; i++ )
76 | * ...
77 | */
78 | }
79 |
80 |
81 | return 0;
82 | }
83 |
84 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_loops/pi_openmp.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <math.h>
5 | #include <time.h>
6 | #include <sys/time.h>
7 | #include <omp.h>
8 |
9 | #define DEFAULT 1000000
10 | #define SEED 918273
11 |
12 | int main ( int argc, char **argv)
13 | {
14 |
15 | long long int M=0;
16 | int nthreads;
17 | double pi;
18 |
19 |
20 |
21 | #pragma omp parallel
22 | #pragma omp master
23 | nthreads = omp_get_num_threads();
24 |
25 | long long int N = (argc > 1 ? atoll(argv[1]) : DEFAULT ) ;
26 | printf("omp calculation with %d threads\nN=%Ld\n",
27 | nthreads ,N);
28 |
29 | double timing = omp_get_wtime();
30 | #pragma omp parallel
31 | {
32 | int myid = omp_get_thread_num();
33 | double x, y ;
34 | srand48(SEED*(myid+1));
35 |
36 | #pragma omp for reduction(+:M)
37 | for( long long unsigned i = 0; i < N; i++)
38 | {
39 | x = drand48();
40 | y = drand48();
41 | M += ((x*x + y*y) < 1.0);
42 | }
43 | }
44 |
45 | timing = omp_get_wtime() - timing;
46 | printf("Estimation of pi: %1.9f\n Walltime:%g\n",
47 | (4.0*(double)M)/N, timing );
48 |
49 | return 0;
50 | }
51 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_loops/pi_openmp.fix.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <math.h>
5 | #include <time.h>
6 | #include <sys/time.h>
7 | #include <omp.h>
8 |
9 | #define DEFAULT 1000000
10 | #define SEED 918273
11 |
12 | int main(int argc,char* argv[])
13 | {
14 |
15 | long long unsigned int M = 0;
16 | int nthreads;
17 |
18 | #pragma omp parallel
19 | #pragma omp master
20 | nthreads = omp_get_num_threads();
21 |
22 | long long int N = (argc > 1 ? atoll(argv[1]) : DEFAULT ) ;
23 | printf("omp calculation with %d threads\nN=%Ld\n", nthreads ,N);
24 |
25 | double timing = omp_get_wtime();
26 | #pragma omp parallel
27 | {
28 | int myid = omp_get_thread_num();
29 | int unsigned short myseeds[3] = {SEED+(myid),SEED+(myid*3+1), SEED+(myid*4+2)};
30 |
31 | seed48( myseeds );
32 |
33 | #pragma omp for reduction(+:M)
34 | for( long long unsigned int i = 0; i < N; i++)
35 | {
36 | double x = erand48( myseeds );
37 | double y = erand48( myseeds );
38 |
39 | M += ( (x*x + y*y) < 1.0 );
40 | }
41 | }
42 |
43 | timing = omp_get_wtime() - timing;
44 |
45 | printf("Estimation of pi: %1.9f\n Walltime:%g\n",
46 | (4.0*(double)M)/N, timing );
47 | return 0;
48 | }
49 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/00_scope_of_variables.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #define _GNU_SOURCE
34 | #include <stdlib.h>
35 | #include <stdio.h>
36 | #include <unistd.h>
37 | #include <sys/types.h>
38 | #include <sys/syscall.h>
39 | #include <omp.h>
40 |
41 |
42 | int main( int argc, char **argv )
43 | {
44 | int i;
45 |
46 | printf( "\nmain thread (pid: %d, tid: %ld) data:\n"
47 | "&i is @ address : %p\n\n",
48 | (int)getpid(), syscall(SYS_gettid), &i);
49 |
50 | // just try who is the private i for each thread
51 | #pragma omp parallel private(i)
52 | {
53 | int me = omp_get_thread_num();
54 |
55 | printf( "\tthread nr %d ( tid %ld, from pid %d ) :\n"
56 | "\t\tmy i address is %p\n",
57 | me, syscall(SYS_gettid), (int)getpid(), &i );
58 | }
59 |
60 | printf( "\n" );
61 | return 0;
62 | }
63 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/01_simple_pr_wrong.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 | #if defined(__STDC__)
28 | # if (__STDC_VERSION__ >= 199901L)
29 | # define _XOPEN_SOURCE 700
30 | # endif
31 | #endif
32 | #include <stdlib.h>
33 | #include <stdio.h>
34 | #include <string.h>
35 | #include <unistd.h>
36 | #include <omp.h>
37 |
38 |
39 | int main( int argc, char **argv )
40 | {
41 |
42 | int nthreads;
43 | int my_thread_id;
44 |
45 | #if defined(_OPENMP)
46 |
47 | #pragma omp parallel // this creates a parallel region
48 | // that is encompassed by the
49 | // opening and closing { }
50 | //
51 | // you can modify the number of
52 | // spawned threads through the
53 | // OMP_THREAD_NUM
54 | // environmental variable
55 |
56 | {
57 |
58 | my_thread_id = omp_get_thread_num(); // note: this assignment is not thread-safe
59 | sleep(0.05);
60 | #pragma omp master
61 | nthreads = omp_get_num_threads();
62 |
63 | // the order in which different threads will
64 | // arrive at this print is undefined;
65 | // if you run this code several times, you will
66 | // obtain different results
67 |
68 | printf( "\tgreetings from thread num %d\n", my_thread_id);
69 | }
70 | #else
71 |
72 | nthreads = 1;
73 | #endif
74 |
75 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
76 |
77 | return 0;
78 | }
79 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/02_simple_pr.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 | #if defined(__STDC__)
28 | # if (__STDC_VERSION__ >= 199901L)
29 | # define _XOPEN_SOURCE 700
30 | # endif
31 | #endif
32 | #include <stdlib.h>
33 | #include <stdio.h>
34 | #include <string.h>
35 | #include <omp.h>
36 |
37 |
38 | int main( int argc, char **argv )
39 | {
40 |
41 | int nthreads;
42 |
43 | #if defined(_OPENMP)
44 |
45 | #pragma omp parallel // this creates a parallel region
46 | // that is encompassed by the
47 | // opening and closing { }
48 | //
49 | // you can modify the number of
50 | // spawned threads through the
51 | // OMP_THREAD_NUM
52 | // environmental variable
53 |
54 | {
55 |
56 | int my_thread_id = omp_get_thread_num(); // note: this assignment is now
57 | // thread-safe because the lvalue
58 | // is a private variable
59 | #pragma omp master
60 | nthreads = omp_get_num_threads();
61 |
62 | // the order in which different threads will
63 | // arrive at this print is undefined;
64 | // if you run this code several times, you will
65 | // obtain different results
66 |
67 | printf( "\tgreetings from thread num %d\n", my_thread_id);
68 | }
69 | #else
70 |
71 | nthreads = 1;
72 | printf( "\tgreetings from thread num 0\n");
73 | #endif
74 |
75 | printf(" %d thread%s greeted you from the %sparallel region\n",
76 | nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
77 |
78 | return 0;
79 | }
80 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/03a_num_of_threads.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <omp.h>
37 |
38 |
39 | int main( int argc, char **argv )
40 | {
41 |
42 | int nthreads;
43 |
44 | #if defined(_OPENMP)
45 |
46 | int threads_num = 1;
47 |
48 | if ( argc > 1 )
49 | {
50 | // read the argument given
51 | threads_num = atoi(*(argv+1));
52 | omp_set_num_threads( threads_num );
53 | }
54 |
55 | #pragma omp parallel // this creates a parallel region
56 | // that is encompassed by the
57 | // opening and closing { }
58 | //
59 | // you can modify the number of
60 | // spawned threads in different
61 | // ways:
62 | // 1) through the OMP_THREAD_NUM
63 | // environmental variable
64 | // 2) using the omp_set_num_threads()
65 | //
66 | // you can also declare the desired
67 | // number at the creation of the
68 | // parallel region:
69 |
70 | //#pragma omp parallel num_threads( threads_num )
71 |
72 | {
73 |
74 | int my_thread_id = omp_get_thread_num();
75 | #pragma omp master
76 | nthreads = omp_get_num_threads();
77 |
78 | // the order in which different threads will
79 | // arrive at this print is undefined;
80 | // if you run this code several times, you will
81 | // obtain different results
82 |
83 | printf( "\tgreetings from thread num %d\n", my_thread_id );
84 | }
85 |
86 | #else
87 |
88 | nthreads = 1;
89 |
90 | #endif
91 |
92 | printf(" %d thread%s greeted you from the %sparallel region\n",
93 | nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
94 |
95 | return 0;
96 | }
97 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/04_order_of_threads_wrong.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 | #if defined(__STDC__)
28 | # if (__STDC_VERSION__ >= 199901L)
29 | # define _XOPEN_SOURCE 700
30 | # endif
31 | #endif
32 | #include <stdlib.h>
33 | #include <stdio.h>
34 | #include <string.h>
35 | #include <omp.h>
36 |
37 |
38 | int main( int argc, char **argv )
39 | {
40 |
41 | int nthreads;
42 |
43 | #if defined(_OPENMP)
44 |
45 | int order = 0;
46 |
47 | #pragma omp parallel // this creates a parallel region
48 | // that is encompassed by the
49 | // opening and closing { }
50 | //
51 | // you can modify the number of
52 | // spawned threads through the
53 | // OMP_THREAD_NUM
54 | // environmental variable
55 |
56 | {
57 |
58 | int my_thread_id = omp_get_thread_num();
59 | #pragma omp master
60 | nthreads = omp_get_num_threads();
61 |
62 | // now we impose an ordered output
63 | // although not ina very efficient way
64 |
65 | // the "critical" directive identifies a
66 | // section that must be executed by a
67 | // single thread at a time.
68 | // Here, un unspecified number of threads
69 | // will print the message.
70 | // That is just due to this particular
71 | // case: in fact, ALL the threads will
72 | // execute the if test. However, which are
73 | // those that succeed, print and modify the
74 | // "order" value depends on which have been
75 | // the previous ones, and on the relative delay.
76 | #pragma omp critical
77 | if ( order == my_thread_id )
78 | {
79 | printf( "\tgreetings from thread num %d\n", my_thread_id );
80 | order++;
81 | }
82 | }
83 | #else
84 |
85 | nthreads = 1;
86 | #endif
87 |
88 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
89 |
90 | return 0;
91 | }
92 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/05b_order_of_threads.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <omp.h>
37 |
38 |
39 | int main( int argc, char **argv )
40 | {
41 |
42 | int nthreads;
43 |
44 | #if defined(_OPENMP)
45 |
46 | #pragma omp parallel
47 | {
48 |
49 | int my_thread_id = omp_get_thread_num();
50 | #pragma omp master
51 | nthreads = omp_get_num_threads();
52 | #pragma omp barrier // let all the threads to read
53 | // the correct value of nthreads
54 |
55 | #pragma omp for ordered // declare a for within which there
56 | for ( int i = 0; i < nthreads; i++) // are ordered regions
57 | #pragma omp ordered // declare the ordered region
58 | printf( "\tgreetings from thread num %d\n", my_thread_id );
59 |
60 | }
61 | #else
62 |
63 | nthreads = 1;
64 | #endif
65 |
66 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
67 |
68 | return 0;
69 | }
70 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/05c_order_of_threads.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <omp.h>
37 |
38 | void do_something( int who_am_I )
39 | {
40 | #pragma omp ordered
41 | printf( "\tgreetings from thread num %d\n", who_am_I );
42 | }
43 |
44 |
45 | int main( int argc, char **argv )
46 | {
47 |
48 | int nthreads;
49 |
50 | #if defined(_OPENMP)
51 |
52 | #pragma omp parallel
53 | {
54 |
55 | int my_thread_id = omp_get_thread_num();
56 | #pragma omp master
57 | nthreads = omp_get_num_threads();
58 | #pragma omp barrier // let all the threads to read
59 | // the correct value of nthreads
60 |
61 | #pragma omp for ordered // declare a for within which there
62 | for ( int i = 0; i < nthreads; i++) // are ordered regions
63 | do_something( my_thread_id );
64 |
65 |
66 | }
67 | #else
68 |
69 | nthreads = 1;
70 | #endif
71 |
72 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" has":"s have", (nthreads==1)?"(non)":"" );
73 |
74 | return 0;
75 | }
76 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/09_clauses__copyin__clarify.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │ │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <time.h>
37 | #include <omp.h>
38 |
39 |
40 | double golden_value = 0;
41 | #pragma omp threadprivate( golden_value )
42 |
43 |
44 | int main( int argc, char **argv )
45 | {
46 | srand48(time(NULL));
47 | int N = 10;
48 |
49 | #pragma omp parallel copyin(golden_value)
50 | // the copying of thread 0's golden_value
51 | // happens here, at the entering of the
52 | // parallel region;
53 | //
54 | {
55 |
56 | #pragma omp master
57 | golden_value = 1.618033988; // we do not expect
58 | // this value to be
59 | // broadcasted
60 |
61 | #pragma omp barrier
62 |
63 | printf("[PR 1] thread %d has a golden value %g\n",
64 | omp_get_thread_num(), golden_value );
65 | }
66 |
67 |
68 | #pragma omp parallel copyin(golden_value)
69 | // here the master's value is copied again;
70 | // since it was modified in the previous
71 | // PR, we do expect that now everybody
72 | // will have the new value
73 | //
74 | printf("[PR 2] thread %d has a golden value %g\n",
75 | omp_get_thread_num(), golden_value );
76 |
77 | return 0;
78 | }
79 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/09_clauses__firstprivate.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │    <http://www.gnu.org/licenses/>                                          │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <omp.h>
37 |
38 | #define DEFAULT 10
39 |
40 | int main( int argc, char **argv )
41 | { // demonstrates the firstprivate() clause: private copies initialized from the serial values
42 |
43 | int i = (argc > 1 ? atoi(*(argv+1)) : DEFAULT );
44 | int nthreads;
45 | int *array;
46 |
47 | #pragma omp parallel
48 | #pragma omp master
49 | nthreads = omp_get_num_threads(); // only the master writes; the implicit barrier at the
50 | // end of the region publishes the value to the serial code
51 | array = (int*)calloc( nthreads, sizeof(int) );
52 |
53 | #pragma omp parallel firstprivate( i, array )
54 | {
55 | int me = omp_get_thread_num();
56 |
57 | // Here we can refer to both i and array.
58 | // Although they are *different* memory regions
59 | // from the ones that are hosted in the
60 | // serial region, their value at the entry
61 | // of the parallel region is initialized
62 | // to the value that the corresponding variables
63 | // have in the serial region.
64 |
65 |
66 | array[me] = i + me; // a perfectly valid reference
67 |
68 | array = NULL; // we screw up.. but only in
69 | // this scope because this
70 | // array is _not_ the same
71 | // as the one outside the p-region
72 | }
73 |
74 | for( int j = 0; j < nthreads; j++ )
75 | printf("entry %3d is %3d (expected was %3d)\n",
76 | j, array[j], i + j );
77 |
78 | free(array);
79 | return 0;
80 | }
81 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/09_clauses__threadprivate.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │    <http://www.gnu.org/licenses/>                                          │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <time.h>
37 | #include <omp.h>
38 |
39 |
40 | int me, myN;
41 | int *array;
42 |
43 | #pragma omp threadprivate( me, myN, array )
44 |
45 |
46 | #define DEFAULT 100000
47 |
48 | int main( int argc, char **argv )
49 | { // demonstrates threadprivate globals: each thread keeps its own me, myN, array across parallel regions
50 | int N = ( argc > 1 ? atoi(*(argv+1)) : DEFAULT);
51 |
52 | #pragma omp parallel
53 | {
54 | me = omp_get_thread_num();
55 |
56 | int nthreads = omp_get_num_threads();
57 |
58 | // note that we declared none of
59 | // myN, array, or me in this scope: they are threadprivate globals
60 | //
61 | myN = (N / nthreads) + (me < N%nthreads); // spread the N elements as evenly as possible
62 | array = (int*)calloc( myN, sizeof(int) );
63 |
64 | printf("+ thread %d has got %d elements; local array "
65 | "(address stored in %p) starts at %p\n",
66 | me, myN, &array, array );
67 |
68 | // write something in the array
69 | //
70 |
71 | int max = ( myN > 3 ? 3 : myN );
72 | for( int j = 0; j < max; j++ )
73 | array[j] = me*1000 + j;
74 | }
75 |
76 |
77 | printf("\nnow we are again in a serial region\n\n");
78 |
79 |
80 | #pragma omp parallel // NOTE(review): assumes the same thread team size as the first region
81 | {
82 | char buffer[200];
83 | sprintf( buffer, "* thread %d :: ", me );
84 |
85 | int max = ( myN > 3 ? 3 : myN );
86 | for( int j = 0; j < max; j++ )
87 | sprintf( &buffer[strlen(buffer)], "[%d] = %4d , ", j, array[j] );
88 |
89 | printf("%s\n", buffer );
90 |
91 | // we must free array from within a parallel region;
92 | // if we did this in a serial region, only the memory
93 | // associated with the master thread would be freed
94 | //
95 | free(array);
96 | }
97 |
98 |
99 | return 0;
100 | }
101 |
102 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/examples_on_stack/00_explore_how_bytes_are_stored.c:
--------------------------------------------------------------------------------
1 | #include <stdlib.h>
2 | #include <stdio.h>
3 | #include <string.h>
4 |
5 | int main( void )
6 | { // shows how the bytes of an int are laid out in memory (endianness)
7 |
8 | unsigned int i = 128; // a single bit set (bit 7, still inside the first byte)
9 | int size = sizeof(i);
10 |
11 | /*
12 | * i is an integer variable, and as such it requires 4 bytes
13 | * let's explore how these 4 bytes are placed in memory
14 | */
15 |
16 | for ( int j = 0; j < size; j++ )
17 | //
18 | // we loop over the bytes that make up the variable i
19 | // note: to be general, we asked size to be the value
20 | // returned by sizeof()
21 | //
22 | {
23 | // let's print the value of the entire bitfield
24 | // when we interpret it as an integer
25 | printf("i is: %d\n", i ); // (%u would be more precise, since i is unsigned)
26 |
27 | // now we access each byte of i
28 | //
29 | char *byte = (char*)&i;
30 | for( int k = 0; k < size; k++ )
31 | printf("\t%p : %d\n", byte+k, *(byte+k) ); // note: char is often signed, so a byte with the top bit set prints negative
32 |
33 | // convince yourself that the previous for loop could have been
34 | // written as follows:
35 | // ( un-comment the next 2 lines to test it
36 |
37 | /* for( int k = 0; k < size; k++ ) */
38 | /* printf("\t%p : %d\n", (char*)&i+k, *(((char*)&i)+k)); */
39 |
40 | // why is it so ?
41 | // -- &i is the address of i; more precisely
42 | // it is the address of the begin of i, i.e.
43 | // the address of the first of the bytes that
44 | // form i.
45 | // -- (char*)&i means that we interpret the
46 | // address &i as an address to a char
47 | // -- *(char*)&i reads as "the value of the byte
48 | // at the address &i"
49 | // -- (char*)&i+k is k-byte after the byte at
50 | // address &i
51 |
52 |
53 | printf("\n");
54 |
55 | // now we multiply i by 256.
56 | // the operators << and >> read as "shift the argument's bit on the left [or right]
57 | // by the specified amount of bits "
58 | // In this case the amount of bits is 8, i.e. it is the same as multiplying by 256
59 | //
60 | i <<= 8;
61 |
62 | // we are doing this because we want that only a single bit is set per each byte
63 | // among the i's bytes.
64 | // we started from a value of 128, i.e. only the highest bit of the first byte of i
65 | // was set; multiplying by 256 (i.e. bit-shifting by 8 positions) we move that bit
66 | // to the next byte (until it is eventually shifted out and i becomes 0).
67 | }
68 |
69 | return 0;
70 | }
71 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/exercises/.#lab_exercise.2.c:
--------------------------------------------------------------------------------
1 | luca@ggg.26667:1698393520
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/exercises/exercises.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/exercises/exercises.pdf
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/exercises/lab_exercise.c:
--------------------------------------------------------------------------------
1 |
2 | /* ────────────────────────────────────────────────────────────────────────── *
3 | │ │
4 | │ This file is part of the exercises for the Lectures on │
5 | │ "Foundations of High Performance Computing" │
6 | │ given at │
7 | │ Master in HPC and │
8 | │ Master in Data Science and Scientific Computing │
9 | │ @ SISSA, ICTP and University of Trieste │
10 | │ │
11 | │ contact: luca.tornatore@inaf.it │
12 | │ │
13 | │ This is free software; you can redistribute it and/or modify │
14 | │ it under the terms of the GNU General Public License as published by │
15 | │ the Free Software Foundation; either version 3 of the License, or │
16 | │ (at your option) any later version. │
17 | │ This code is distributed in the hope that it will be useful, │
18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │
19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │
20 | │ GNU General Public License for more details. │
21 | │ │
22 | │ You should have received a copy of the GNU General Public License │
23 | │ along with this program. If not, see │
24 | │    <http://www.gnu.org/licenses/>                                          │
25 | * ────────────────────────────────────────────────────────────────────────── */
26 |
27 |
28 | #if defined(__STDC__)
29 | # if (__STDC_VERSION__ >= 199901L)
30 | # define _XOPEN_SOURCE 700
31 | # endif
32 | #endif
33 | #include <stdlib.h>
34 | #include <stdio.h>
35 | #include <string.h>
36 | #include <time.h>
37 | #include <math.h>
38 | #include <omp.h>
39 |
40 |
41 | #define N_DFLT 1000
42 |
43 |
44 | int main ( int argc, char **argv )
45 | { // exercise skeleton: fill an array with squares in parallel, then verify it
46 | // usage: ./a.out [N] [num_threads]
47 | int N = ( (argc > 1) ? atoi(*(argv+1)) : N_DFLT);
48 | int Nth = ( (argc > 2) ? atoi(*(argv+2)) : 0);
49 |
50 | unsigned int *array = (unsigned int*)malloc( sizeof(unsigned int) * N ); // cast fixed: was (int*)
51 |
52 | if ( Nth > 0 )
53 | omp_set_num_threads( Nth ); // bug fix: was "omp_set_num_threads = Nth", an assignment to a function, which does not compile
54 |
55 | #pragma omp parallel
56 | {
57 | int myid = omp_get_thread_num(); // kept for the exercise: use these two
58 | int nthreads = omp_get_num_threads(); // to distribute the loop iterations
59 |
60 | for ( unsigned int i = 0; i < N; i++ ) // note: every thread currently writes the whole array;
61 | array[i] = i*i; // distributing this loop is the point of the exercise
62 |
63 | }
64 |
65 | //
66 | // check the results
67 | // can you parallelize this as well ?
68 | //
69 |
70 | unsigned int faults = 0;
71 | for ( unsigned int i = 0; i < N; i++ )
72 | faults += ( array[i] != i*i );
73 |
74 | if ( faults > 0 )
75 | printf("wow, you've been able to get %u faults\n",
76 | faults );
77 | free(array); // release the buffer before exiting
78 | return 0;
79 | }
80 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/exercises/prefix_sum.serial.c:
--------------------------------------------------------------------------------
1 |
2 |
3 | #if defined(__STDC__)
4 | # if (__STDC_VERSION__ >= 199901L)
5 | # define _XOPEN_SOURCE 700
6 | # endif
7 | #endif
8 | #include <stdlib.h>
9 | #include <stdio.h>
10 | #include <string.h>
11 | #include <time.h>
12 | #include <math.h>
13 | #include "prefix_sum.serial.h"
14 |
15 |
16 |
17 | inline double scan( const uint N, DTYPE * restrict array )
18 | { // in-place inclusive prefix sum: array[ii] becomes the sum of entries 0..ii;
19 | // returns the grand total. NOTE(review): assumes N >= 1 (array[0] read unconditionally)
20 | DTYPE avg = array[0];
21 | // ("avg" is really the running sum, not an average)
22 | for ( uint ii = 1; ii < N; ii++ )
23 | {
24 | avg += array[ii];
25 | array[ii] = avg;
26 | }
27 | // NOTE(review): C99 "inline" with external linkage needs an extern declaration
28 | return avg; // in some translation unit, or the symbol may be missing at link time
29 | }
30 |
31 |
32 | inline DTYPE scan_efficient( const uint N, DTYPE * restrict array )
33 | { // in-place inclusive prefix sum, 4-way unrolled to shorten the dependency chain;
34 | // returns the grand total (array[N-1] after the scan)
35 | uint N_4 = (N/4)*4; // largest multiple of 4 not exceeding N
36 | // head: scan the first 4 elements by hand
37 | { // NOTE(review): reads array[0..3] unconditionally -- the caller over-allocates
38 | DTYPE temp = array[2]; // to a multiple of 4, but entries beyond N are uninitialized if N < 4
39 | array[1] += array[0]; // a1 = a0+a1
40 | array[3] += temp; // a3 = a2+a3 (uses the OLD a2, saved in temp)
41 | array[2] += array[1]; // a2 = a0+a1+a2
42 | array[3] += array[1]; // a3 = a0+a1+a2+a3
43 | }
44 | // body: each group of 4 builds on array[ii-1], the prefix total so far;
45 | PRAGMA_VECT_LOOP
46 | for ( uint ii = 4; ii < N_4; ii+=4 )
47 | { // same pattern as the head, shifted by the running prefix
48 | DTYPE register temp = array[ii+2]; // save the OLD array[ii+2] before it is overwritten
49 | array[ii] += array[ii-1];
50 | array[ii+1] += array[ii];
51 | array[ii+3] += temp;
52 | array[ii+2] += array[ii+1];
53 | array[ii+3] += array[ii+1];
54 | }
55 | // tail: the remaining N - N_4 (< 4) elements, scanned one by one
56 | for ( uint ii = N_4; ii < N; ii++ )
57 | array[ii] += array[ii-1];
58 |
59 | return array[N-1];
60 | }
61 |
62 |
63 | #define N_default 1000
64 | #define _scan 0
65 | #define _scan_e 1
66 |
67 | int main ( int argc, char **argv )
68 | { // driver: initialize an array, run the selected scan, report timings
69 | // usage: ./a.out [scan_type] [N]
70 | // scan_type: 0 = simple scan, 1 = 4-way unrolled scan
71 | // N : number of elements (default 1000)
72 | // (removed: unused locals ts, Nth_level1, Nth_level2)
73 |
74 | // -------------------------------------------------------------
75 | // variables' initialization to default values
76 | //
77 |
78 | uint N = N_default;
79 | int scan_type = _scan;
80 |
81 |
82 | if ( argc > 1 )
83 | {
84 | scan_type = atoi( *(argv+1) );
85 | if ( argc > 2 )
86 | N = (unsigned)atoi( *(argv+2) );
87 | }
88 |
89 | printf( "scan type: %d\n", scan_type );
90 |
91 |
92 | // -------------------------------------------------------------
93 | // data init.
94 |
95 | double timing_start;
96 | double timing_scan;
97 | double timing_prepare;
98 | double total_weight = 0; // bug fix: stays 0 (not indeterminate) if scan_type is unknown
99 |
100 | uint N_alloc = ((N/4)+1)*4; // round up so the unrolled scan can touch 4-element groups safely
101 | // DTYPE *array = (DTYPE*)aligned_alloc( 32, N_alloc * sizeof(DTYPE) );
102 | DTYPE *array = (DTYPE*)malloc( N_alloc * sizeof(DTYPE) );
103 | if ( array == NULL ) { printf("allocation of %u elements failed\n", N_alloc); return 1; }
104 | timing_start = CPU_TIME;
105 |
106 | // initialize with pseudo-random numbers
107 |
108 | /* srand48(time(0)); */
109 | /* for ( int ii = 0; ii < N; ii++ ) */
110 | /* topnodes[ii] = base + drand48()*range; */
111 |
112 | // initialize with the first N integer
113 | // (that makes the results easy to check)
114 | // //
115 |
116 | for ( uint ii = 0; ii < N; ii++ )
117 | array[ii] = (double)ii;
118 |
119 | timing_prepare = CPU_TIME - timing_start;
120 |
121 | // ................................................
122 | // SCAN
123 | // ................................................
124 | timing_start = CPU_TIME; // bug fix: restart the clock so timing_scan measures the scan alone
125 | if ( scan_type == _scan )
126 | total_weight = scan( N, array );
127 |
128 | else if (scan_type == _scan_e)
129 | total_weight = scan_efficient( N, array );
130 |
131 | /* else if (scan_type == _scan_b) */
132 | /* total_weight = scan_b( N, array ); */
133 | else printf( "unknown scan type %d\n", scan_type );
134 | timing_scan = CPU_TIME - timing_start;
135 | free(array); // release the data before reporting
136 | printf("timing for scan is %g, timing for prepare is %g [total weight: %g]\n",
137 | timing_scan, timing_prepare, total_weight);
138 | return 0;
139 | }
140 |
--------------------------------------------------------------------------------
/PARALLEL_PROGRAMMING/OpenMP/exercises/prefix_sum.serial.h:
--------------------------------------------------------------------------------
1 |
2 | // ─────────────────────────────────────────────────────────────────
3 | // define the datatype
4 | //
5 | #if !defined(DTYPE)
6 | #define DTYPE double
7 | #endif
8 |
9 | typedef unsigned int uint;
10 |
11 |
12 | // ─────────────────────────────────────────────────────────────────
13 | // define the timing routines
14 | //
15 |
16 | #define CPU_TIME ({struct timespec ts; \
17 | clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), \
18 | (double)ts.tv_sec + \
19 | (double)ts.tv_nsec * 1e-9;})
20 |
21 |
22 | // ─────────────────────────────────────────────────────────────────
23 | // define the vector generator
24 | //
25 |
26 | #if defined(__clang__) /* bug fix: test clang FIRST -- clang also defines __GNUC__, */
27 | #define PRAGMA_VECT_LOOP _Pragma("ivdep") /* so the old ordering made this branch unreachable */
28 | #elif defined(__INTEL_COMPILER) || defined(__ICC) /* bug fix: logical ||, not bitwise | */
29 | #define PRAGMA_VECT_LOOP _Pragma("parallel")
30 | #elif defined(__GNUC__)
31 | #define PRAGMA_VECT_LOOP _Pragma("GCC ivdep")
32 | #else
33 | #define PRAGMA_VECT_LOOP /* unknown compiler: expand to nothing */
34 | #endif
35 |
36 |
37 | // ─────────────────────────────────────────────────────────────────
38 | // prototypes
39 | //
40 |
41 | double scan ( const uint, DTYPE * restrict );
42 | double scan_efficient ( const uint, DTYPE * restrict );
43 |
44 |
--------------------------------------------------------------------------------
/intro_to_course.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/intro_to_course.pdf
--------------------------------------------------------------------------------
/lecture01-intro-toHPC.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/lecture01-intro-toHPC.pdf
--------------------------------------------------------------------------------