├── ASSIGNMENTS ├── exercise1 │ ├── algs.png │ ├── exercise1.md │ └── naive_model.png └── exercise2 │ ├── exercise2.md │ ├── exercise2.v1.1.pdf │ ├── exercise2.v1.pdf │ ├── quicksort.c │ └── read_write_pgm_image.c ├── CODE_OPTIMIZATION ├── 00--optimization--preliminaries_and_compiler_usage.pdf ├── 01--Modern_architecture.pdf ├── 02--optimization--cache.pdf ├── 03--optimization--branches.pdf ├── 05--optimization--loops-and-prefetching.pdf ├── Readme.md ├── examples_on_branching │ ├── if_forest_inside_loop │ │ └── loop.c │ ├── sort_2_arrays │ │ ├── branchpred2.c │ │ ├── compile │ │ └── mypapi.h │ └── unpredictable_datastream │ │ ├── amonra.gen10 │ │ ├── branchpred.besmart.s │ │ ├── branchpred.besmart2.c │ │ ├── branchpred.besmart2.s │ │ ├── branchpred.s │ │ ├── branchpred.stat │ │ ├── out.2 │ │ └── out.v │ │ ├── branchpred │ │ ├── branchpred.c │ │ ├── branchpred.c~ │ │ ├── branchpred.smart │ │ └── branchpred.smart2 ├── examples_on_cache │ ├── hot_and_cold_fields │ │ ├── hotcold_a.v0.c │ │ ├── hotcold_a.v1.c │ │ ├── hotcold_b.v0.c │ │ ├── hotcold_b.v1.c │ │ ├── hotcold_c.v0.c │ │ └── hotcold_c.v1.c │ ├── matrix_transpose │ │ ├── transpose │ │ │ ├── matrix_transpose.c │ │ │ ├── matrix_transpose_swapped.c │ │ │ ├── matrix_transpose_swapped_unroll.c │ │ │ └── matrix_transpose_unroll.c │ │ └── transpose_by_blocks │ │ │ ├── matrix_transpose_blocks.v0.c │ │ │ ├── matrix_transpose_blocks.v1.c │ │ │ ├── matrix_transpose_blocks.v2.c │ │ │ ├── matrix_transpose_blocks.v3.c │ │ │ └── mypapi.h │ └── memory_mountain │ │ ├── Makefile │ │ ├── README │ │ ├── clock.c │ │ ├── clock.h │ │ ├── fcyc2.c │ │ ├── fcyc2.h │ │ ├── mountain.c │ │ ├── mountain.gcc │ │ ├── plotmountain.gp │ │ └── v2 │ │ ├── Makefile │ │ ├── fcyc2.c │ │ ├── fcyc2.h │ │ ├── mountain.c │ │ └── mountain.gcc └── examples_on_pipelines │ ├── combine_2_arrays │ ├── compile │ ├── mypapi.h │ ├── pipeline.c │ ├── run │ ├── v0.c │ ├── v1.c │ ├── v1b.c │ ├── v2.c │ ├── v3.c │ ├── v3b.c │ ├── v4.c │ └── vector.c │ ├── 
matrix_multiplication │ ├── matmul.c │ ├── matmul_simple.c │ ├── mypapi.h │ ├── plot.gp │ └── run │ ├── polynomial_evaluation │ ├── Makefile │ ├── benchmark.c │ ├── poly.c │ ├── poly.h │ ├── readme.md │ ├── statistics │ │ ├── cpe.c │ │ ├── cpe.h │ │ ├── fcyc.c │ │ ├── fcyc.h │ │ ├── lsquare.c │ │ └── lsquare.h │ └── timing │ │ ├── clock.c │ │ └── clock.h │ └── reduction │ ├── mypapi.h │ ├── plot.gp │ ├── reduction.c │ └── reduction.h ├── HPC_TOOLS_and_STORAGE └── Readme.md ├── Materials ├── A_note_on_Endiansim.pdf ├── Readme.md ├── What_every_computer_scientist_should_know_about_floating-point.pdf ├── arguments.c └── topics.pdf ├── PARALLELISM ├── Readme.md ├── codes │ ├── memory.c │ └── pi.c ├── lecture01-intro-toHPC.pdf ├── lecture02-HPC-hardware.pdf ├── lecture03-HPCsoftware-stack.pdf ├── lecture04-on-parallel-programming.pdf └── slurm │ ├── README.md │ ├── slurm01.job │ ├── slurm02_A.job │ ├── slurm02_B.job │ ├── slurm02_C.job │ ├── slurm03_A.job │ ├── slurm03_B.job │ ├── slurm03_C.job │ ├── slurm04.job │ └── slurm05.job ├── PARALLEL_PROGRAMMING ├── MPI │ ├── Readme.md │ ├── basic-mpi-codes │ │ ├── Brecv.c │ │ ├── CBlockSends.c │ │ ├── clean.sh │ │ ├── compile_openMPI_gnu.sh │ │ ├── compile_openMPI_intel.sh │ │ ├── deadlock.c │ │ ├── linear-array.c │ │ ├── mpi_env_call.c │ │ ├── mpi_hello_world.F90 │ │ ├── mpi_hello_world.c │ │ ├── mpi_hello_world_sync.c │ │ ├── mpi_pi.c │ │ ├── mpi_pi.job │ │ ├── send_message.F90 │ │ ├── send_message.c │ │ └── sendrecv_message.c │ ├── collective-mpi │ │ ├── all2allv3d.c │ │ ├── allgather.job │ │ ├── allgather.py │ │ ├── allgatherv.c │ │ ├── b_cast.c │ │ ├── b_cast.f │ │ ├── clean.sh │ │ ├── compile.sh │ │ ├── gather.c │ │ ├── gather.f │ │ ├── mpi_bcastcompare.c │ │ ├── reduce.c │ │ ├── reduce.f │ │ ├── scatter.c │ │ └── scatter.f │ ├── compiling-and-running-mpi-programs.md │ ├── lecture05-MPI-Programming-part-A.pdf │ ├── lecture05-MPI-Programming-part-B.pdf │ ├── lecture06-Network-basics-for-MPI-application.pptx │ └── 
pi_scalability │ │ └── scalability.job └── OpenMP │ ├── 00--Memory_model.pdf │ ├── 01--Intro_to_OpenMP.pdf │ ├── 02--parallel_regions.pdf │ ├── 03--loops.pdf │ ├── 04--threads_affinity.pdf │ ├── examples │ ├── .#for.c │ ├── parallel_loops │ │ ├── 00_array_sum_with_race.c │ │ ├── 01a_array_sum.c │ │ ├── 01b_array_sum.c │ │ ├── 01c_array_sum.c │ │ ├── 01d_array_sum.c │ │ ├── 02_falsesharing.c │ │ ├── 03_falsesharing_fixed.c │ │ ├── 04_scheduling.c │ │ ├── 05_first_and_last_private.c │ │ ├── loop_without_for.c │ │ ├── pi_openmp.c │ │ └── pi_openmp.fix.c │ ├── parallel_regions │ │ ├── 00_scope_of_variables.c │ │ ├── 00_stack_and_scope.c │ │ ├── 01_simple_pr_wrong.c │ │ ├── 02_simple_pr.c │ │ ├── 03a_num_of_threads.c │ │ ├── 03b_num_of_threads.c │ │ ├── 04_order_of_threads_wrong.c │ │ ├── 05a_order_of_threads.c │ │ ├── 05b_order_of_threads.c │ │ ├── 05c_order_of_threads.c │ │ ├── 09_clauses__copyin.c │ │ ├── 09_clauses__copyin__clarify.c │ │ ├── 09_clauses__copyprivate.c │ │ ├── 09_clauses__firstprivate.c │ │ ├── 09_clauses__lastprivate.c │ │ └── 09_clauses__threadprivate.c │ └── threads_affinity │ │ ├── 00_where_I_am.c │ │ ├── 01_where_I_am_omp.c │ │ ├── 02_where_I_am_omp.c │ │ ├── 03_where_I_am_nested.c │ │ ├── 04_touch_by_one.c │ │ ├── 05_touch_by_all.c │ │ └── 06_touch_by_all_threadprivate.c │ ├── examples_on_stack │ ├── 00_explore_how_bytes_are_stored.c │ ├── 01a_understanding_the_stack.c │ └── 01b_understanding_the_stack.c │ └── exercises │ ├── .#lab_exercise.2.c │ ├── exercises.pdf │ ├── lab_exercise.2.c │ ├── lab_exercise.2.v2.c │ ├── lab_exercise.c │ ├── my_lab_exercise.2.c │ ├── my_lab_exercise.2.v2.c │ ├── prefix_sum.serial.c │ ├── prefix_sum.serial.h │ └── write_pgm_image.c ├── README.md ├── intro_to_course.pdf └── lecture01-intro-toHPC.pdf /ASSIGNMENTS/exercise1/algs.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise1/algs.png -------------------------------------------------------------------------------- /ASSIGNMENTS/exercise1/naive_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise1/naive_model.png -------------------------------------------------------------------------------- /ASSIGNMENTS/exercise2/exercise2.v1.1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise2/exercise2.v1.1.pdf -------------------------------------------------------------------------------- /ASSIGNMENTS/exercise2/exercise2.v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/ASSIGNMENTS/exercise2/exercise2.v1.pdf -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/00--optimization--preliminaries_and_compiler_usage.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/00--optimization--preliminaries_and_compiler_usage.pdf -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/01--Modern_architecture.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/01--Modern_architecture.pdf -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/02--optimization--cache.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/02--optimization--cache.pdf -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/03--optimization--branches.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/03--optimization--branches.pdf -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/05--optimization--loops-and-prefetching.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/05--optimization--loops-and-prefetching.pdf -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/Readme.md: -------------------------------------------------------------------------------- 1 | # Materials on serial code optimization 2 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_branching/sort_2_arrays/compile: -------------------------------------------------------------------------------- 1 | gcc -march=native -I/scratch/Software/include -DUSE_PAPI -o branchpred2 branchpred2.c -lm -L/scratch/Software/lib -lpapi 2 | gcc -march=native -DBESMART 
-I/scratch/Software/include -DUSE_PAPI -o branchpred2_smart branchpred2.c -lm -L/scratch/Software/lib -lpapi 3 | gcc -march=native -DBESMART2 -I/scratch/Software/include -DUSE_PAPI -o branchpred2_smart2 branchpred2.c -lm -L/scratch/Software/lib -lpapi 4 | gcc -march=native -DBESMART3 -I/scratch/Software/include -DUSE_PAPI -o branchpred2_smart3 branchpred2.c -lm -L/scratch/Software/lib -lpapi 5 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/amonra.gen10/branchpred.besmart2.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the exercises for the Lectures on 3 | * "Foundations of High Performance Computing" 4 | * given at 5 | * Master in HPC and 6 | * Master in Data Science and Scientific Computing 7 | * @ SISSA, ICTP and University of Trieste 8 | * 9 | * contact: luca.tornatore@inaf.it 10 | * 11 | * This is free software; you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation; either version 3 of the License, or 14 | * (at your option) any later version. 15 | * This code is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program. 
If not, see 22 | */ 23 | 24 | 25 | 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | 33 | #define SIZE_DEFAULT 1000000 34 | #define TOP (2 << 20) 35 | #define PIVOT (TOP >> 2) 36 | 37 | 38 | #define TCPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \ 39 | (double)ts.tv_nsec * 1e-9) 40 | 41 | 42 | 43 | int main(int argc, char **argv) 44 | { 45 | int SIZE; 46 | int *data; 47 | int cc, ii; 48 | 49 | long long sum = 0; 50 | 51 | struct timespec ts; 52 | double tstart, tstop; 53 | 54 | if(argc > 1) 55 | SIZE = atoi( *(argv+1) ); 56 | else 57 | SIZE = SIZE_DEFAULT; 58 | 59 | // Generate data 60 | data = (int*)calloc(SIZE, sizeof(int)); 61 | srand((int)(SIZE)); 62 | 63 | for (cc = 0; cc < SIZE; cc++) 64 | data[cc] = rand() % TOP; 65 | 66 | 67 | tstart = TCPU_TIME; 68 | 69 | for (cc = 0; cc < 1000; cc++) 70 | { 71 | sum = 0; 72 | long long _sum_[4] = {0}; 73 | for (ii = 0; ii < SIZE; ii+=4) 74 | { 75 | _sum_[0] += (data[ii]>PIVOT? data[ii] : 0); 76 | _sum_[1] += (data[ii+1]>PIVOT? data[ii+1] : 0); 77 | _sum_[2] += (data[ii+2]>PIVOT? data[ii+2] : 0); 78 | _sum_[3] += (data[ii+3]>PIVOT? 
data[ii+3] : 0); 79 | } 80 | sum += (_sum_[0] + _sum_[1]) + (_sum_[2] + _sum_[3]); 81 | } 82 | 83 | tstop = TCPU_TIME; 84 | 85 | #ifdef WOW 86 | tot_tstop = TCPU_TIME; 87 | #endif 88 | 89 | free(data); 90 | 91 | #if !defined(WOW) 92 | printf("\nsum is %llu, elapsed seconds: %g\n", sum, tstop - tstart); 93 | 94 | #else 95 | double tot_time = tot_tstop - tot_tstart; 96 | double loop_time = tstop - tstart; 97 | printf("\nsum is %llu, elapsed seconds: %g, %g in loop and %g in qsort\n", 98 | sum, tot_time, loop_time, tot_time - loop_time); 99 | #endif 100 | 101 | printf("\n"); 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/amonra.gen10/out.2: -------------------------------------------------------------------------------- 1 | 2 | 3.4191 3 | 4 | 5 | 3.57167 6 | 7 | 8 | 3.44099 9 | 10 | 11 | 4.17072 12 | 13 | 14 | 4.20686 15 | 16 | 17 | 3.64886 18 | 19 | 20 | 3.39921 21 | 22 | 23 | 4.78118 24 | 25 | 26 | 3.54926 27 | 28 | 29 | 3.52104 30 | 31 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/amonra.gen10/out.v: -------------------------------------------------------------------------------- 1 | 2 | sum is 9831544284110, elapsed seconds: 3.45677 3 | 4 | 5 | sum is 9831544284110, elapsed seconds: 3.80376 6 | 7 | 8 | sum is 9831544284110, elapsed seconds: 4.81135 9 | 10 | 11 | sum is 9831544284110, elapsed seconds: 3.60161 12 | 13 | 14 | sum is 9831544284110, elapsed seconds: 3.65025 15 | 16 | 17 | sum is 9831544284110, elapsed seconds: 3.68967 18 | 19 | 20 | sum is 9831544284110, elapsed seconds: 3.63842 21 | 22 | 23 | sum is 9831544284110, elapsed seconds: 3.63771 24 | 25 | 26 | sum is 9831544284110, elapsed seconds: 3.6503 27 | 28 | 29 | sum is 9831544284110, elapsed seconds: 3.64676 30 | 31 | 
-------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.c: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the exercises for the Lectures on 3 | * "Foundations of High Performance Computing" 4 | * given at 5 | * Master in HPC and 6 | * Master in Data Science and Scientific Computing 7 | * @ SISSA, ICTP and University of Trieste 8 | * 9 | * contact: luca.tornatore@inaf.it 10 | * 11 | * This is free software; you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation; either version 3 of the License, or 14 | * (at your option) any later version. 15 | * This code is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program. 
If not, see 22 | */ 23 | 24 | 25 | 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | 33 | #define SIZE_DEFAULT 1000000 34 | #define TOP (2 << 20) 35 | #define PIVOT (TOP >> 2) 36 | 37 | 38 | #define TCPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \ 39 | (double)ts.tv_nsec * 1e-9) 40 | 41 | 42 | #ifdef WOW 43 | int compare(const void *A, const void *B) 44 | { 45 | return *(int*)A - *(int*)B; 46 | } 47 | #endif 48 | 49 | int main(int argc, char **argv) 50 | { 51 | int SIZE; 52 | int *data; 53 | int cc, ii; 54 | 55 | #ifdef WOW 56 | double tot_tstart, tot_tstop; 57 | #endif 58 | 59 | long long sum = 0; 60 | 61 | struct timespec ts; 62 | double tstart, tstop; 63 | 64 | if(argc > 1) 65 | SIZE = atoi( *(argv+1) ); 66 | else 67 | SIZE = SIZE_DEFAULT; 68 | 69 | // Generate data 70 | data = (int*)calloc(SIZE, sizeof(int)); 71 | srand((int)(SIZE)); 72 | 73 | for (cc = 0; cc < SIZE; cc++) 74 | data[cc] = rand() % TOP; 75 | 76 | 77 | 78 | #ifdef WOW 79 | tot_tstart = TCPU_TIME; 80 | // !!! With this, the next loop runs faster 81 | qsort(data, SIZE, sizeof(int), compare); 82 | #endif 83 | 84 | 85 | tstart = TCPU_TIME; 86 | 87 | for (cc = 0; cc < 1000; cc++) 88 | { 89 | sum = 0; 90 | 91 | for (ii = 0; ii < SIZE; ii++) 92 | { 93 | #if !defined( BESMART ) && !defined( BESMART2 ) 94 | if (data[ii] > PIVOT) 95 | sum += data[ii]; 96 | 97 | #elif defined( BESMART ) 98 | unsigned int t = (data[ii] - PIVOT - 1) >> 31; // the additional -1 is for the case data[ii]==PIVOT 99 | sum += ~t & data[ii]; 100 | 101 | #elif defined( BESMART2 ) 102 | //sum += (data[ii]>PIVOT)*data[ii]; 103 | sum += (data[ii]>PIVOT? 
data[ii] : 0); 104 | #endif 105 | } 106 | } 107 | 108 | tstop = TCPU_TIME; 109 | 110 | #ifdef WOW 111 | tot_tstop = TCPU_TIME; 112 | #endif 113 | 114 | free(data); 115 | 116 | #if !defined(WOW) 117 | printf("\nsum is %llu, elapsed seconds: %g\n", sum, tstop - tstart); 118 | 119 | #else 120 | double tot_time = tot_tstop - tot_tstart; 121 | double loop_time = tstop - tstart; 122 | printf("\nsum is %llu, elapsed seconds: %g, %g in loop and %g in qsort\n", 123 | sum, tot_time, loop_time, tot_time - loop_time); 124 | #endif 125 | 126 | printf("\n"); 127 | return 0; 128 | } 129 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.c~: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the exercises for the Lectures on 3 | * "Foundations of High Performance Computing" 4 | * given at 5 | * Master in HPC and 6 | * Master in Data Science and Scientific Computing 7 | * @ SISSA, ICTP and University of Trieste 8 | * 9 | * contact: luca.tornatore@inaf.it 10 | * 11 | * This is free software; you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation; either version 3 of the License, or 14 | * (at your option) any later version. 15 | * This code is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program. 
If not, see 22 | */ 23 | 24 | 25 | 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | 33 | #define SIZE_DEFAULT 1000000 34 | #define TOP (2 << 20) 35 | #define PIVOT (TOP >> 2) 36 | 37 | 38 | #define TCPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), (double)ts.tv_sec + \ 39 | (double)ts.tv_nsec * 1e-9) 40 | 41 | 42 | #ifdef WOW 43 | int compare(const void *A, const void *B) 44 | { 45 | return *(int*)A - *(int*)B; 46 | } 47 | #endif 48 | 49 | int main(int argc, char **argv) 50 | { 51 | int SIZE; 52 | int *data; 53 | int cc, ii; 54 | 55 | #ifdef WOW 56 | double tot_tstart, tot_tstop; 57 | #endif 58 | 59 | long long sum = 0; 60 | 61 | struct timespec ts; 62 | double tstart, tstop; 63 | 64 | if(argc > 1) 65 | SIZE = atoi( *(argv+1) ); 66 | else 67 | SIZE = SIZE_DEFAULT; 68 | 69 | // Generate data 70 | data = (int*)calloc(SIZE, sizeof(int)); 71 | srand((int)(SIZE)); 72 | 73 | for (cc = 0; cc < SIZE; cc++) 74 | data[cc] = rand() % TOP; 75 | 76 | 77 | 78 | #ifdef WOW 79 | tot_tstart = TCPU_TIME; 80 | // !!! 
With this, the next loop runs faster 81 | qsort(data, SIZE, sizeof(int), compare); 82 | #endif 83 | 84 | 85 | tstart = TCPU_TIME; 86 | 87 | for (cc = 0; cc < 1000; cc++) 88 | { 89 | sum = 0; 90 | 91 | for (ii = 0; ii < SIZE; ii++) 92 | { 93 | #if !defined( BESMART ) && !defined( BESMART2 ) 94 | if (data[ii] > PIVOT) 95 | sum += data[ii]; 96 | 97 | #elif defined( BESMART ) 98 | unsigned int t = (data[ii] - PIVOT - 1) >> 31; // the additional -1 is for the case data[ii]==PIVOT 99 | sum += ~t & data[ii]; 100 | 101 | #elif defined( BESMART2 ) 102 | sum += (data[ii]>PIVOT)*data[ii]; 103 | #endif 104 | } 105 | } 106 | 107 | tstop = TCPU_TIME; 108 | 109 | #ifdef WOW 110 | tot_tstop = TCPU_TIME; 111 | #endif 112 | 113 | free(data); 114 | 115 | #if !defined(WOW) 116 | printf("\nsum is %llu, elapsed seconds: %g\n", sum, tstop - tstart); 117 | 118 | #else 119 | double tot_time = tot_tstop - tot_tstart; 120 | double loop_time = tstop - tstart; 121 | printf("\nsum is %llu, elapsed seconds: %g, %g in loop and %g in qsort\n", 122 | sum, tot_time, loop_time, tot_time - loop_time); 123 | #endif 124 | 125 | printf("\n"); 126 | return 0; 127 | } 128 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_branching/unpredictable_datastream/branchpred.smart2 -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_a.v0.c: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * This file is part of the exercises for the Lectures on 4 | * "Foundations of High Performance Computing" 5 | * given at 6 | * Master in HPC and 7 | * Master in Data Science and Scientific Computing 8 | * @ SISSA, ICTP and University of Trieste 9 | * 2019 10 | * 11 | * This is free software; you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation; either version 3 of the License, or 14 | * (at your option) any later version. 15 | * This code is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program. 
If not, see 22 | */ 23 | 24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \ 33 | (double)ts.tv_nsec * 1e-9) 34 | 35 | #ifndef DATASIZE 36 | #define DATASIZE 200 37 | #endif 38 | 39 | typedef struct node_t { 40 | double key; 41 | char data[DATASIZE]; 42 | struct node_t *next; 43 | } node; 44 | 45 | 46 | 47 | 48 | #define N_default 10000 49 | 50 | int main( int argc, char **argv ) 51 | { 52 | struct timespec ts; 53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; 54 | 55 | // ------------------------------------- 56 | // startup 57 | 58 | int N = N_default; 59 | 60 | if ( argc > 1 ) 61 | N = atoi( *(argv+1) ); 62 | 63 | 64 | // ------------------------------------- 65 | // setup 66 | 67 | double *keys = (double*)calloc( N, sizeof(double)); 68 | node *last = NULL; 69 | node *first = NULL; 70 | 71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout); 72 | srand48( time(NULL) ); 73 | 74 | for( int nn = 0; nn < N; nn++ ) 75 | { 76 | node *new = (node*)calloc( 1, sizeof(node) ); 77 | if ( last != NULL ) 78 | last->next = new; 79 | else 80 | first = new; 81 | new ->key = drand48(); 82 | keys[nn] = new->key; 83 | new ->next = NULL; 84 | memset( new->data, 0, sizeof(char)*DATASIZE); 85 | last = new; 86 | } 87 | 88 | 89 | printf("now let's search for all of them\n"); fflush(stdout); 90 | 91 | int NSHOTS = N; 92 | double sum = 0; 93 | 94 | double tstart = CPU_TIME; 95 | 96 | for( int ii = 0; ii < NSHOTS; ii++ ) 97 | { 98 | double key = keys[(int)(drand48() * N)]; 99 | node *target = first; 100 | 101 | // this implementation is less efficient than 102 | // that in v1 103 | for ( int nn = 0; nn < N; nn++ ) 104 | if ( target->key == key ) 105 | sum += target->key; 106 | else 107 | target = target->next; 108 | } 109 | 110 | double et = CPU_TIME - tstart; 111 | 112 | printf("timing for %d shots: %g\n", 
NSHOTS, et ); 113 | 114 | node *target = first; 115 | while( target->next != NULL ) 116 | { 117 | node *tmp = target->next; 118 | free(target); 119 | target = tmp; 120 | } 121 | 122 | return 0; 123 | } 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_a.v1.c: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * This file is part of the exercises for the Lectures on 4 | * "Foundations of High Performance Computing" 5 | * given at 6 | * Master in HPC and 7 | * Master in Data Science and Scientific Computing 8 | * @ SISSA, ICTP and University of Trieste 9 | * 2019 10 | * 11 | * This is free software; you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation; either version 3 of the License, or 14 | * (at your option) any later version. 15 | * This code is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program. 
If not, see 22 | */ 23 | 24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \ 33 | (double)ts.tv_nsec * 1e-9) 34 | 35 | #ifndef DATASIZE 36 | #define DATASIZE 200 37 | #endif 38 | 39 | typedef struct node_t { 40 | double key; 41 | char data[DATASIZE]; 42 | struct node_t *next; 43 | } node; 44 | 45 | 46 | 47 | 48 | #define N_default 10000 49 | 50 | int main( int argc, char **argv ) 51 | { 52 | struct timespec ts; 53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; 54 | 55 | // ------------------------------------- 56 | // startup 57 | 58 | int N = N_default; 59 | 60 | if ( argc > 1 ) 61 | N = atoi( *(argv+1) ); 62 | 63 | 64 | // ------------------------------------- 65 | // setup 66 | 67 | double *keys = (double*)calloc( N, sizeof(double)); 68 | node *last = NULL; 69 | node *first = NULL; 70 | 71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout); 72 | srand48( time(NULL) ); 73 | 74 | for( int nn = 0; nn < N; nn++ ) 75 | { 76 | node *new = (node*)calloc( 1, sizeof(node) ); 77 | if ( last != NULL ) 78 | last->next = new; 79 | else 80 | first = new; 81 | new ->key = drand48(); 82 | keys[nn] = new->key; 83 | new ->next = NULL; 84 | memset( new->data, 0, sizeof(char)*DATASIZE); 85 | last = new; 86 | } 87 | 88 | 89 | printf("now let's search for all of them\n"); fflush(stdout); 90 | 91 | int NSHOTS = N; 92 | double sum = 0; 93 | 94 | double tstart = CPU_TIME; 95 | 96 | for( int ii = 0; ii < NSHOTS; ii++ ) 97 | { 98 | double key = keys[(int)(drand48() * N)]; 99 | node *target = first; 100 | 101 | while ( target->key != key ) 102 | target = target->next; 103 | sum += target->key; 104 | } 105 | 106 | double et = CPU_TIME - tstart; 107 | 108 | printf("timing for %d shots: %g\n", NSHOTS, et ); 109 | 110 | node *target = first; 111 | while( target->next != NULL ) 112 | { 113 | node *tmp = target->next; 
114 | free(target); 115 | target = tmp; 116 | } 117 | 118 | return 0; 119 | } 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_b.v0.c: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * This file is part of the exercises for the Lectures on 4 | * "Foundations of High Performance Computing" 5 | * given at 6 | * Master in HPC and 7 | * Master in Data Science and Scientific Computing 8 | * @ SISSA, ICTP and University of Trieste 9 | * 2019 10 | * 11 | * This is free software; you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation; either version 3 of the License, or 14 | * (at your option) any later version. 15 | * This code is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program. 
If not, see 22 | */ 23 | 24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \ 33 | (double)ts.tv_nsec * 1e-9) 34 | 35 | #ifndef DATASIZE 36 | #define DATASIZE 200 37 | #endif 38 | 39 | typedef struct node_t { 40 | double key; 41 | struct node_t *next; 42 | char data[DATASIZE]; 43 | } node; 44 | 45 | 46 | 47 | 48 | #define N_default 10000 49 | 50 | int main( int argc, char **argv ) 51 | { 52 | struct timespec ts; 53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; 54 | 55 | // ------------------------------------- 56 | // startup 57 | 58 | int N = N_default; 59 | 60 | if ( argc > 1 ) 61 | N = atoi( *(argv+1) ); 62 | 63 | 64 | // ------------------------------------- 65 | // setup 66 | 67 | double *keys = (double*)calloc( N, sizeof(double)); 68 | node *last = NULL; 69 | node *first = NULL; 70 | 71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout); 72 | srand48( time(NULL) ); 73 | 74 | for( int nn = 0; nn < N; nn++ ) 75 | { 76 | node *new = (node*)calloc( 1, sizeof(node) ); 77 | if ( last != NULL ) 78 | last->next = new; 79 | else 80 | first = new; 81 | new ->key = drand48(); 82 | keys[nn] = new->key; 83 | new ->next = NULL; 84 | memset( new->data, 0, sizeof(char)*DATASIZE); 85 | last = new; 86 | } 87 | 88 | 89 | printf("now let's search for all of them\n"); fflush(stdout); 90 | 91 | int NSHOTS = N; 92 | double sum = 0; 93 | 94 | double tstart = CPU_TIME; 95 | 96 | for( int ii = 0; ii < NSHOTS; ii++ ) 97 | { 98 | double key = keys[(int)(drand48() * N)]; 99 | node *target = first; 100 | 101 | // this implementation is less efficient than 102 | // that in v1 103 | for ( int nn = 0; nn < N; nn++ ) 104 | if ( target->key == key ) 105 | sum += target->key; 106 | else 107 | target = target->next; 108 | } 109 | 110 | double et = CPU_TIME - tstart; 111 | 112 | printf("timing for %d shots: %g\n", 
NSHOTS, et ); 113 | 114 | node *target = first; 115 | while( target->next != NULL ) 116 | { 117 | node *tmp = target->next; 118 | free(target); 119 | target = tmp; 120 | } 121 | 122 | return 0; 123 | } 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_b.v1.c: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * This file is part of the exercises for the Lectures on 4 | * "Foundations of High Performance Computing" 5 | * given at 6 | * Master in HPC and 7 | * Master in Data Science and Scientific Computing 8 | * @ SISSA, ICTP and University of Trieste 9 | * 2019 10 | * 11 | * This is free software; you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation; either version 3 of the License, or 14 | * (at your option) any later version. 15 | * This code is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program. 
If not, see 22 | */ 23 | 24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \ 33 | (double)ts.tv_nsec * 1e-9) 34 | 35 | #ifndef DATASIZE 36 | #define DATASIZE 200 37 | #endif 38 | 39 | typedef struct node_t { 40 | double key; 41 | struct node_t *next; 42 | char data[DATASIZE]; 43 | } node; 44 | 45 | 46 | 47 | 48 | #define N_default 10000 49 | 50 | int main( int argc, char **argv ) 51 | { 52 | struct timespec ts; 53 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; 54 | 55 | // ------------------------------------- 56 | // startup 57 | 58 | int N = N_default; 59 | 60 | if ( argc > 1 ) 61 | N = atoi( *(argv+1) ); 62 | 63 | 64 | // ------------------------------------- 65 | // setup 66 | 67 | double *keys = (double*)calloc( N, sizeof(double)); 68 | node *last = NULL; 69 | node *first = NULL; 70 | 71 | printf("creating and initializing %d nodes\n", N ); fflush(stdout); 72 | srand48( time(NULL) ); 73 | 74 | for( int nn = 0; nn < N; nn++ ) 75 | { 76 | node *new = (node*)calloc( 1, sizeof(node) ); 77 | if ( last != NULL ) 78 | last->next = new; 79 | else 80 | first = new; 81 | new ->key = drand48(); 82 | keys[nn] = new->key; 83 | new ->next = NULL; 84 | memset( new->data, 0, sizeof(char)*DATASIZE); 85 | last = new; 86 | } 87 | 88 | 89 | printf("now let's search for all of them\n"); fflush(stdout); 90 | 91 | int NSHOTS = N; 92 | double sum = 0; 93 | 94 | double tstart = CPU_TIME; 95 | 96 | for( int ii = 0; ii < NSHOTS; ii++ ) 97 | { 98 | double key = keys[(int)(drand48() * N)]; 99 | node *target = first; 100 | 101 | while ( target->key != key ) 102 | target = target->next; 103 | sum += target->key; 104 | } 105 | 106 | double et = CPU_TIME - tstart; 107 | 108 | printf("timing for %d shots: %g\n", NSHOTS, et ); 109 | 110 | node *target = first; 111 | while( target->next != NULL ) 112 | { 113 | node *tmp = target->next; 
114 | free(target); 115 | target = tmp; 116 | } 117 | 118 | return 0; 119 | } 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_c.v0.c: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * This file is part of the exercises for the Lectures on 4 | * "Foundations of High Performance Computing" 5 | * given at 6 | * Master in HPC and 7 | * Master in Data Science and Scientific Computing 8 | * @ SISSA, ICTP and University of Trieste 9 | * 2019 10 | * 11 | * This is free software; you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation; either version 3 of the License, or 14 | * (at your option) any later version. 15 | * This code is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program. 
If not, see 22 | */ 23 | 24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \ 33 | (double)ts.tv_nsec * 1e-9) 34 | 35 | #ifndef DATASIZE 36 | #define DATASIZE 200 37 | #endif 38 | 39 | 40 | typedef struct node_t { 41 | double key; 42 | struct node_t *next; 43 | void *data; 44 | } node; 45 | 46 | 47 | 48 | 49 | #define N_default 10000 50 | 51 | int main( int argc, char **argv ) 52 | { 53 | struct timespec ts; 54 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; 55 | 56 | // ------------------------------------- 57 | // startup 58 | 59 | int N = N_default; 60 | 61 | if ( argc > 1 ) 62 | N = atoi( *(argv+1) ); 63 | 64 | 65 | // ------------------------------------- 66 | // setup 67 | 68 | double *keys = (double*)calloc( N, sizeof(double)); 69 | char *alldata = (char*)calloc( DATASIZE*N, sizeof(char)); 70 | node *last = NULL; 71 | node *first = NULL; 72 | 73 | printf("creating and initializing %d nodes\n", N ); fflush(stdout); 74 | srand48( time(NULL) ); 75 | 76 | for( int nn = 0; nn < N; nn++ ) 77 | { 78 | node *new = (node*)calloc( 1, sizeof(node) ); 79 | if ( last != NULL ) 80 | last->next = new; 81 | else 82 | first = new; 83 | new ->key = drand48(); 84 | keys[nn] = new->key; 85 | new ->next = NULL; 86 | new ->data = alldata + DATASIZE*nn; 87 | memset( new->data, 0, sizeof(char)*DATASIZE); 88 | last = new; 89 | } 90 | 91 | 92 | printf("now let's search for all of them\n"); fflush(stdout); 93 | 94 | int NSHOTS = N; 95 | double sum = 0; 96 | 97 | double tstart = CPU_TIME; 98 | 99 | for( int ii = 0; ii < NSHOTS; ii++ ) 100 | { 101 | double key = keys[(int)(drand48() * N)]; 102 | node *target = first; 103 | 104 | // this implementation is less efficient than 105 | // that in v1 106 | for ( int nn = 0; nn < N; nn++ ) 107 | if ( target->key == key ) 108 | sum += target->key; 109 | else 110 | target = target->next; 
111 | } 112 | 113 | double et = CPU_TIME - tstart; 114 | 115 | printf("timing for %d shots: %g\n", NSHOTS, et ); 116 | 117 | node *target = first; 118 | while( target->next != NULL ) 119 | { 120 | node *tmp = target->next; 121 | free(target); 122 | target = tmp; 123 | } 124 | 125 | return 0; 126 | } 127 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/hot_and_cold_fields/hotcold_c.v1.c: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * This file is part of the exercises for the Lectures on 4 | * "Foundations of High Performance Computing" 5 | * given at 6 | * Master in HPC and 7 | * Master in Data Science and Scientific Computing 8 | * @ SISSA, ICTP and University of Trieste 9 | * 2019 10 | * 11 | * This is free software; you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License as published by 13 | * the Free Software Foundation; either version 3 of the License, or 14 | * (at your option) any later version. 15 | * This code is distributed in the hope that it will be useful, 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | * GNU General Public License for more details. 19 | * 20 | * You should have received a copy of the GNU General Public License 21 | * along with this program. 
If not, see 22 | */ 23 | 24 | #define _XOPEN_SOURCE 700 // ensures we're using c11 standard 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | #define CPU_TIME (clock_gettime( id, &ts ), (double)ts.tv_sec + \ 33 | (double)ts.tv_nsec * 1e-9) 34 | 35 | #ifndef DATASIZE 36 | #define DATASIZE 200 37 | #endif 38 | 39 | 40 | typedef struct node_t { 41 | double key; 42 | struct node_t *next; 43 | void *data; 44 | } node; 45 | 46 | 47 | 48 | 49 | #define N_default 10000 50 | 51 | int main( int argc, char **argv ) 52 | { 53 | struct timespec ts; 54 | clockid_t id = CLOCK_PROCESS_CPUTIME_ID; 55 | 56 | // ------------------------------------- 57 | // startup 58 | 59 | int N = N_default; 60 | 61 | if ( argc > 1 ) 62 | N = atoi( *(argv+1) ); 63 | 64 | 65 | // ------------------------------------- 66 | // setup 67 | 68 | double *keys = (double*)calloc( N, sizeof(double)); 69 | char *alldata = (char*)calloc( DATASIZE*N, sizeof(char)); 70 | node *last = NULL; 71 | node *first = NULL; 72 | 73 | printf("creating and initializing %d nodes\n", N ); fflush(stdout); 74 | srand48( time(NULL) ); 75 | 76 | for( int nn = 0; nn < N; nn++ ) 77 | { 78 | node *new = (node*)calloc( 1, sizeof(node) ); 79 | if ( last != NULL ) 80 | last->next = new; 81 | else 82 | first = new; 83 | new ->key = drand48(); 84 | keys[nn] = new->key; 85 | new ->next = NULL; 86 | new ->data = alldata + DATASIZE*nn; 87 | memset( new->data, 0, sizeof(char)*DATASIZE); 88 | last = new; 89 | } 90 | 91 | 92 | printf("now let's search for all of them\n"); fflush(stdout); 93 | 94 | int NSHOTS = N; 95 | double sum = 0; 96 | 97 | double tstart = CPU_TIME; 98 | 99 | for( int ii = 0; ii < NSHOTS; ii++ ) 100 | { 101 | double key = keys[(int)(drand48() * N)]; 102 | node *target = first; 103 | 104 | while ( target->key != key ) 105 | target = target->next; 106 | sum += target->key; 107 | } 108 | 109 | double et = CPU_TIME - tstart; 110 | 111 | printf("timing for %d shots: %g\n", NSHOTS, et ); 112 | 
113 | node *target = first; 114 | while( target->next != NULL ) 115 | { 116 | node *tmp = target->next; 117 | free(target); 118 | target = tmp; 119 | } 120 | 121 | return 0; 122 | } 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/matrix_transpose/transpose_by_blocks/matrix_transpose_blocks.v3.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_cache/matrix_transpose/transpose_by_blocks/matrix_transpose_blocks.v3.c -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/matrix_transpose/transpose_by_blocks/mypapi.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #if defined(USE_PAPI) // ----------------------------------------------------------- 4 | #include 5 | 6 | typedef unsigned long long int uLint; 7 | 8 | #define PAPI_EVENTS_NUM 4 9 | int papi_events[PAPI_EVENTS_NUM] = {PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_DCM, PAPI_L2_DCM }; 10 | int papi_EventSet = PAPI_NULL; // the handle for the events' set 11 | uLint papi_buffer[PAPI_EVENTS_NUM] = {0}; // storage for the counters' values 12 | uLint papi_values[PAPI_EVENTS_NUM] = {0}; // accumulate the counters' values 13 | 14 | // check that PAPI is OK, exit if not 15 | #define PAPI_CHECK( R ) { \ 16 | if ( (R) != PAPI_OK ) { \ 17 | printf("a problem with PAPI (code %d) arise at line %d\n", \ 18 | (R), __LINE__);fflush(stdout); return (R); }} 19 | 20 | 21 | // check that PAPI is OK, 22 | // issue a warning if not with a 23 | // provided message 24 | #define PAPI_WARN( R, S ) { \ 25 | if ( (R) != PAPI_OK ) { \ 26 | printf("a problem with PAPI (code %d) arise at line %d: %s\n", \ 27 | (R), __LINE__, (S)); fflush(stdout); }} 28 | 
29 | // check that PAPI is OK about an event 30 | // issue a warning if not with a 31 | // provided message 32 | #define PAPI_WARN_EVENT( R, E, S1, n ) { \ 33 | if ( (R) != PAPI_OK ) { \ 34 | printf("a problem with PAPI (code %d) : event %d arise at line %d: %s (%d)\n", \ 35 | (R), (E), __LINE__, (S1), (n)); fflush(stdout); }} 36 | 37 | 38 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \ 39 | retval = PAPI_query_event(papi_events[i]); \ 40 | if ( retval == PAPI_OK ) { \ 41 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \ 42 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \ 43 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } } 44 | 45 | #define PAPI_INIT { \ 46 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \ 47 | if (retval != PAPI_VER_CURRENT) \ 48 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \ 49 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \ 50 | PAPI_ADD_EVENTS_to_SET; } 51 | 52 | // to use HIGH-LEVEL API 53 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); } 54 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); } 55 | 56 | // to use NORMAL API 57 | #define PAPI_START_CNTR { \ 58 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); } 59 | 60 | #define PAPI_STOP_CNTR { \ 61 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \ 62 | if( retval == PAPI_OK ) { \ 63 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \ 64 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); } 65 | 66 | #define PAPI_GET_CNTR( i ) ( papi_values[(i)] ) 67 | 68 | #define PAPI_FLUSH_BUFFER { \ 69 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \ 70 | papi_buffer[ jj] = 0; } 71 | 72 | #define PAPI_FLUSH { \ 73 | for( int jj 
= 0; jj < PAPI_EVENTS_NUM; jj++) \ 74 | papi_values[jj] = papi_buffer[ jj] = 0; } 75 | 76 | 77 | #else // ----------------------------------------------------------- 78 | 79 | #define PAPI_EVENTS_NUM 0 80 | #define PAPI_INIT 81 | #define PAPI_START_CNTR 82 | #define PAPI_STOP_CNTR 83 | #define PAPI_FLUSH 84 | #define PAPI_GET_CNTR( i ) 0 85 | 86 | #endif // ----------------------------------------------------------- 87 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/Makefile: -------------------------------------------------------------------------------- 1 | 2 | COMPILER=gcc 3 | 4 | ifeq ($(COMPILER),gcc) 5 | CC = gcc 6 | CFLAGS = -Wall -O3 -march=native -ftree-vectorize -lm -D__i686__ 7 | SUFFIX = .gcc 8 | LIBM = -lm 9 | endif 10 | 11 | ifeq ($(COMPILER),icc) 12 | CC = icc 13 | CFLAGS = -Wall -O3 -fast -axSSE4.2 -xHost -ipo 14 | SUFFIX = .icc 15 | LIBM = 16 | endif 17 | 18 | ifeq ($(COMPILER),pgcc) 19 | CC = pgcc 20 | CFLAGS = -Wall -O4 -fast -Munroll -Mvect=simd,fuse,tile -Mipa -lm 21 | SUFFIX = .pgcc 22 | LIBM = -lm 23 | endif 24 | 25 | mountain: mountain.c fcyc2.c clock.c 26 | $(CC) $(CFLAGS) -o mountain$(SUFFIX) mountain.c fcyc2.c clock.c $(LIBM) 27 | 28 | clean: 29 | rm -f mountain$(SUFFIX) *.o *~ 30 | 31 | 32 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/README: -------------------------------------------------------------------------------- 1 | This directory contains code for generating a memory mountain, as 2 | described in Computer Systems: A Programmer's Perspective 3 | 4 | clock.{c,h} - routines for using x86 and Alpha cycle timers 5 | fcyc2.{c,h} - routines that estimate the number of cycles required 6 | by a function f that takes two arguments. 7 | Makefile - memory mountain makefile 8 | mountain.c - program that generates the memory mountain. 
9 | 10 | (1) set the compiler at the top of Makefile 11 | (2) invoke make 12 | (3) execute the mountain.$COMPILER 13 | (4) copy the output in a file named mountain.dat 14 | (5) use plotmountain.gp to plot the data using gnuplot 15 | type 'load "plotmountain.gp"' from inside gnuplot 16 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/clock.h: -------------------------------------------------------------------------------- 1 | /* Routines for using cycle counter */ 2 | 3 | /* Start the counter */ 4 | void start_counter(); 5 | 6 | /* Get # cycles since counter started */ 7 | double get_counter(); 8 | 9 | 10 | /* Measure overhead for counter */ 11 | double ovhd(); 12 | 13 | /* Determine clock rate of processor */ 14 | double mhz(int verbose); 15 | 16 | /* Determine clock rate of processor, having more control over accuracy */ 17 | double mhz_full(int verbose, int sleeptime); 18 | 19 | /** Special counters that compensate for timer interrupt overhead */ 20 | 21 | void start_comp_counter(); 22 | 23 | double get_comp_counter(); 24 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/fcyc2.h: -------------------------------------------------------------------------------- 1 | /* Find number of cycles used by function that takes 2 arguments */ 2 | 3 | /* Function to be tested takes two integer arguments */ 4 | typedef int (*test_funct)(int, int); 5 | 6 | /* Compute time used by function f */ 7 | double fcyc2(test_funct f, int param1, int param2, int clear_cache); 8 | 9 | /********* These routines are used to help with the analysis *********/ 10 | 11 | /* 12 | Parameters: 13 | k: How many samples must be within epsilon for convergence 14 | epsilon: What is tolerance 15 | maxsamples: How many samples until give up? 
16 | */ 17 | 18 | /* Full version of fcyc with control over parameters */ 19 | double fcyc2_full(test_funct f, int param1, int param2, int clear_cache, 20 | int k, double epsilon, int maxsamples, int compensate); 21 | 22 | /* Get current minimum */ 23 | double get_min(); 24 | 25 | /* What is convergence status for k minimum measurements within epsilon 26 | Returns 0 if not converged, #samples if converged, and -1 if can't 27 | reach convergence 28 | */ 29 | 30 | int has_converged(int k, double epsilon, int maxsamples); 31 | 32 | /* What is error of current measurement */ 33 | double err(int k); 34 | 35 | /************* Try other clocking methods *****************/ 36 | 37 | /* Full version that uses the time of day clock */ 38 | double fcyc2_full_tod(test_funct f, int param1, int param2, int clear_cache, 39 | int k, double epsilon, int maxsamples, int compensate); 40 | 41 | double fcyc2_tod(test_funct f, int param1, int param2, int clear_cache); 42 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/mountain.c: -------------------------------------------------------------------------------- 1 | /* mountain.c - Generate the memory mountain. 
*/ 2 | /* $begin mountainmain */ 3 | #include 4 | #include 5 | #include 6 | #include "fcyc2.h" /* measurement routines */ 7 | #include "clock.h" /* routines to access the cycle counter */ 8 | 9 | #define MINBYTES (1 << 14) /* First working set size */ 10 | #define MAXBYTES (1 << 27) /* Last working set size */ 11 | #define MAXSTRIDE 15 /* Stride x8 bytes */ 12 | #define MAXELEMS MAXBYTES/sizeof(long) 13 | 14 | 15 | long data[MAXELEMS]; /* The global array we'll be traversing */ 16 | 17 | 18 | void init_data(long *data, int n); 19 | int test(int elems, int stride); 20 | double run(int size, int stride, double Mhz); 21 | 22 | /* $begin mountainmain */ 23 | int main() 24 | { 25 | int size; /* Working set size (in bytes) */ 26 | int stride; /* Stride (in array elements) */ 27 | double Mhz; /* Clock frequency */ 28 | 29 | init_data(data, MAXELEMS); /* Initialize each element in data */ 30 | Mhz = mhz(0); /* Estimate the clock frequency */ 31 | 32 | 33 | printf("# Clock frequency is approx. %.1f MHz\n", Mhz); 34 | printf("# Memory mountain (MB/sec)\n"); 35 | 36 | 37 | printf("%d\t", MAXSTRIDE); 38 | for (stride = 1; stride <= MAXSTRIDE; stride++) 39 | printf("%d\t", stride); 40 | 41 | printf("\n"); 42 | 43 | /* begin mountainmain */ 44 | for (size = MAXBYTES; size >= MINBYTES; size >>= 1) 45 | { 46 | int log2size_kb = (int)(log2((double)size / 1024.0)); 47 | printf("%d\t", log2size_kb); 48 | 49 | for (stride = 1; stride <= MAXSTRIDE; stride++) 50 | printf("%.0f\t", run(size, stride, Mhz)); 51 | 52 | printf("\n"); 53 | } 54 | exit(0); 55 | } 56 | 57 | 58 | /* init_data - initializes the array */ 59 | void init_data(long *data, int n) 60 | { 61 | int i; 62 | 63 | for (i = 0; i < n; i++) 64 | data[i] = i; 65 | } 66 | 67 | /* $begin mountainfuns */ 68 | /* test - Iterate over first "elems" elements of array "data" with 69 | * stride of "stride", using 4x4 loop unrolling. 
70 | */ 71 | int test(int elems, int stride) 72 | { 73 | long i, sx2 = stride*2, sx3 = stride*3, sx4 = stride*4; 74 | long acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0; 75 | long length = elems; 76 | long limit = length - sx4; 77 | 78 | /* Combine 4 elements at a time */ 79 | for (i = 0; i < limit; i += sx4) { 80 | acc0 = acc0 + data[i]; 81 | acc1 = acc1 + data[i+stride]; 82 | acc2 = acc2 + data[i+sx2]; 83 | acc3 = acc3 + data[i+sx3]; 84 | } 85 | 86 | /* Finish any remaining elements */ 87 | for (; i < length; i++) { 88 | acc0 = acc0 + data[i]; 89 | } 90 | return ((acc0 + acc1) + (acc2 + acc3)); 91 | } 92 | 93 | /* run - Run test(elems, stride) and return read throughput (MB/s). 94 | * "size" is in bytes, "stride" is in array elements, and Mhz is 95 | * CPU clock frequency in Mhz. 96 | */ 97 | double run(int size, int stride, double Mhz) 98 | { 99 | double cycles; 100 | int elems = size / sizeof(double); 101 | 102 | test(elems, stride); /* Warm up the cache */ //line:mem:warmup 103 | cycles = fcyc2(test, elems, stride, 0); /* Call test(elems,stride) */ //line:mem:fcyc 104 | return (size / stride) / (cycles / Mhz); /* Convert cycles to MB/s */ //line:mem:bwcompute 105 | } 106 | /* $end mountainfuns */ 107 | 108 | 109 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/mountain.gcc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/mountain.gcc -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/plotmountain.gp: -------------------------------------------------------------------------------- 1 | set samples 100 2 | set isosamples 100 3 | set xyplane 0 4 | 5 | set xlabel "STRIDES" font ", 16" 6 | set 
ylabel "SIZE (KB, log_2)" font ", 16" 7 | set zlabel "MBs/sec" offset -3, 0 font ",16" rotate parallel 8 | 9 | set tics font ", 12" 10 | 11 | set pm3d 12 | splot [:17][4:17] "mountain.dat" u 1:2:3 matrix nonuniform with lines lc 0 notitle 13 | 14 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/Makefile: -------------------------------------------------------------------------------- 1 | 2 | COMPILER=gcc 3 | 4 | ifeq ($(COMPILER),gcc) 5 | CC = gcc 6 | CFLAGS = -Wall -O3 -march=native -ftree-vectorize -lm 7 | SUFFIX = .gcc 8 | LIBM = -lm 9 | endif 10 | 11 | ifeq ($(COMPILER),icc) 12 | CC = icc 13 | CFLAGS = -Wall -O3 -fast -axSSE4.2 -xHost -ipo 14 | SUFFIX = .icc 15 | LIBM = 16 | endif 17 | 18 | ifeq ($(COMPILER),pgcc) 19 | CC = pgcc 20 | CFLAGS = -Wall -O4 -fast -Munroll -Mvect=simd,fuse,tile -Mipa -lm 21 | SUFFIX = .pgcc 22 | LIBM = -lm 23 | endif 24 | 25 | mountain: mountain.c fcyc2.c 26 | $(CC) $(CFLAGS) -o mountain$(SUFFIX) mountain.c fcyc2.c $(LIBM) 27 | 28 | clean: 29 | rm -f mountain$(SUFFIX) *.o *~ 30 | 31 | 32 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/fcyc2.c: -------------------------------------------------------------------------------- 1 | /* Compute time used by a function f that takes two integer args */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "fcyc2.h" 8 | 9 | #define CPU_TIME ({struct timespec ts; clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), \ 10 | (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9;}) 11 | 12 | static double *values = NULL; 13 | int samplecount = 0; 14 | 15 | #define KEEP_VALS 1 16 | #define KEEP_SAMPLES 1 17 | 18 | #if KEEP_SAMPLES 19 | double *samples = NULL; 20 | #endif 21 | 22 | 23 | static void init_sampler(int k, int maxsamples) 24 | { 25 | if (values) 26 | free(values); 27 | values = calloc(k, sizeof(double)); 28 | 
#if KEEP_SAMPLES 29 | if (samples) 30 | free(samples); 31 | /* Allocate extra for wraparound analysis */ 32 | samples = calloc(maxsamples+k, sizeof(double)); 33 | #endif 34 | samplecount = 0; 35 | } 36 | 37 | 38 | /* Add new sample. */ 39 | void add_sample(double val, int k) 40 | { 41 | int pos = 0; 42 | if (samplecount < k) { 43 | pos = samplecount; 44 | values[pos] = val; 45 | } else if (val < values[k-1]) { 46 | pos = k-1; 47 | values[pos] = val; 48 | } 49 | #if KEEP_SAMPLES 50 | samples[samplecount] = val; 51 | #endif 52 | samplecount++; 53 | /* Insertion sort */ 54 | while (pos > 0 && values[pos-1] > values[pos]) { 55 | double temp = values[pos-1]; 56 | values[pos-1] = values[pos]; 57 | values[pos] = temp; 58 | pos--; 59 | } 60 | } 61 | 62 | /* Get current minimum */ 63 | double get_min() 64 | { 65 | return values[0]; 66 | } 67 | 68 | /* What is relative error for kth smallest sample */ 69 | double err(int k) 70 | { 71 | if (samplecount < k) 72 | return 1000.0; 73 | return (values[k-1] - values[0])/values[0]; 74 | } 75 | 76 | /* Have k minimum measurements converged within epsilon? 
*/ 77 | int has_converged(int k_arg, double epsilon_arg, int maxsamples) 78 | { 79 | if ((samplecount >= k_arg) && 80 | ((1 + epsilon_arg)*values[0] >= values[k_arg-1])) 81 | return samplecount; 82 | if ((samplecount >= maxsamples)) 83 | return -1; 84 | return 0; 85 | } 86 | 87 | /* Code to clear cache */ 88 | #define ASIZE (1 << 20) 89 | #define STRIDE 8 90 | static int stuff[ASIZE]; 91 | static int sink; 92 | 93 | static void clear() 94 | { 95 | int x = sink; 96 | int i; 97 | for (i = 0; i < ASIZE; i += STRIDE) 98 | x += stuff[i]; 99 | sink = x; 100 | } 101 | 102 | double fcyc2_full(test_funct f, int param1, int param2, int clear_cache, 103 | int k, double epsilon, int maxsamples, int compensate) 104 | { 105 | double result; 106 | init_sampler(k, maxsamples); 107 | if (compensate) { 108 | do { 109 | if (clear_cache) 110 | clear(); 111 | f(param1, param2); /* warm cache */ 112 | double tstart = CPU_TIME; 113 | f(param1, param2); 114 | tstart = CPU_TIME - tstart; 115 | add_sample(tstart, k); 116 | } while (!has_converged(k, epsilon, maxsamples) && samplecount < maxsamples); 117 | } else { 118 | do { 119 | if (clear_cache) 120 | clear(); 121 | f(param1, param2); /* warm cache */ 122 | double tstart = CPU_TIME; 123 | f(param1, param2); 124 | tstart = CPU_TIME-tstart; 125 | add_sample(tstart, k); 126 | } while (!has_converged(k, epsilon, maxsamples) && samplecount < maxsamples); 127 | } 128 | #ifdef DEBUG 129 | { 130 | int i; 131 | printf(" %d smallest values: [", k); 132 | for (i = 0; i < k; i++) 133 | printf("%.0f%s", values[i], i==k-1 ? 
"]\n" : ", "); 134 | } 135 | #endif 136 | result = values[0]; 137 | #if !KEEP_VALS 138 | free(values); 139 | values = NULL; 140 | #endif 141 | return result; 142 | } 143 | 144 | 145 | double fcyc2(test_funct f, int param1, int param2, int clear_cache) 146 | { 147 | return fcyc2_full(f, param1, param2, clear_cache, 3, 0.01, 500, 0); 148 | } 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/fcyc2.h: -------------------------------------------------------------------------------- 1 | /* Find number of cycles used by function that takes 2 arguments */ 2 | 3 | /* Function to be tested takes two integer arguments */ 4 | typedef int (*test_funct)(int, int); 5 | 6 | /* Compute time used by function f */ 7 | double fcyc2(test_funct f, int param1, int param2, int clear_cache); 8 | 9 | /********* These routines are used to help with the analysis *********/ 10 | 11 | /* 12 | Parameters: 13 | k: How many samples must be within epsilon for convergence 14 | epsilon: What is tolerance 15 | maxsamples: How many samples until give up? 
16 | */ 17 | 18 | /* Full version of fcyc with control over parameters */ 19 | double fcyc2_full(test_funct f, int param1, int param2, int clear_cache, 20 | int k, double epsilon, int maxsamples, int compensate); 21 | 22 | /* Get current minimum */ 23 | double get_min(); 24 | 25 | /* What is convergence status for k minimum measurements within epsilon 26 | Returns 0 if not converged, #samples if converged, and -1 if can't 27 | reach convergence 28 | */ 29 | 30 | int has_converged(int k, double epsilon, int maxsamples); 31 | 32 | /* What is error of current measurement */ 33 | double err(int k); 34 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/mountain.c: -------------------------------------------------------------------------------- 1 | /* mountain.c - Generate the memory mountain. */ 2 | /* $begin mountainmain */ 3 | #include 4 | #include 5 | #include 6 | #include "fcyc2.h" 7 | 8 | 9 | #define MINBYTES (1 << 14) /* First working set size */ 10 | #define MAXBYTES (1 << 27) /* Last working set size */ 11 | #define MAXSTRIDE 15 /* Stride x8 bytes */ 12 | #define MAXELEMS MAXBYTES/sizeof(long) 13 | 14 | 15 | long data[MAXELEMS]; /* The global array we'll be traversing */ 16 | 17 | void init_data(long *data, int n); 18 | int test(int elems, int stride); 19 | double run(int size, int stride); 20 | 21 | 22 | int main() 23 | { 24 | int size; /* Working set size (in bytes) */ 25 | int stride; /* Stride (in array elements) */ 26 | 27 | init_data(data, MAXELEMS); /* Initialize each element in data */ 28 | 29 | printf("# Memory mountain (MB/sec)\n"); 30 | 31 | 32 | printf("%d\t", MAXSTRIDE); 33 | for (stride = 1; stride <= MAXSTRIDE; stride++) 34 | printf("%d\t", stride); 35 | 36 | printf("\n"); 37 | 38 | /* begin mountainmain */ 39 | for (size = MAXBYTES; size >= MINBYTES; size >>= 1) 40 | { 41 | int log2size_kb = (int)(log2((double)size / 1024.0)); 42 | printf("%d\t", log2size_kb); 
43 | 44 | for (stride = 1; stride <= MAXSTRIDE; stride++) 45 | printf("%.0f\t", run(size, stride)); 46 | 47 | printf("\n"); 48 | } 49 | exit(0); 50 | } 51 | 52 | 53 | /* init_data - initializes the array */ 54 | void init_data(long *data, int n) 55 | { 56 | int i; 57 | 58 | for (i = 0; i < n; i++) 59 | data[i] = i; 60 | } 61 | 62 | /* $begin mountainfuns */ 63 | /* test - Iterate over first "elems" elements of array "data" with 64 | * stride of "stride", using 4x4 loop unrolling. 65 | */ 66 | int test(int elems, int stride) 67 | { 68 | long i, sx2 = stride*2, sx3 = stride*3, sx4 = stride*4; 69 | long acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0; 70 | long length = elems; 71 | long limit = length - sx4; 72 | 73 | /* Combine 4 elements at a time */ 74 | for (i = 0; i < limit; i += sx4) { 75 | acc0 = acc0 + data[i]; 76 | acc1 = acc1 + data[i+stride]; 77 | acc2 = acc2 + data[i+sx2]; 78 | acc3 = acc3 + data[i+sx3]; 79 | } 80 | 81 | /* Finish any remaining elements */ 82 | for (; i < length; i++) { 83 | acc0 = acc0 + data[i]; 84 | } 85 | return ((acc0 + acc1) + (acc2 + acc3)); 86 | } 87 | 88 | /* run - Run test(elems, stride) and return read throughput (MB/s). 89 | * "size" is in bytes, "stride" is in array elements, and Mhz is 90 | * CPU clock frequency in Mhz. 
91 | */ 92 | double run(int size, int stride) 93 | { 94 | double timing; 95 | int elems = size / sizeof(double); 96 | 97 | test(elems, stride); /* Warm up the cache */ //line:mem:warmup 98 | timing = fcyc2(test, elems, stride, 0); /* Call test(elems,stride) */ //line:mem:fcyc 99 | return (size / stride) / timing; /* Convert cycles to MB/s */ //line:mem:bwcompute 100 | } 101 | /* $end mountainfuns */ 102 | 103 | 104 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/mountain.gcc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/CODE_OPTIMIZATION/examples_on_cache/memory_mountain/v2/mountain.gcc -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/combine_2_arrays/compile: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for f in v?.c; 4 | do 5 | version=$( echo $f | cut -d'.' 
#!/bin/bash
# Build every pipeline variant v?.c twice: once unoptimized and once
# with -O3 -march=native -mavx2 (suffix .O3n), then the vector demo.

for f in v?.c;
do
    version=$( echo $f | cut -d'.' -f1 | cut -d'v' -f2)
    echo "compiling "$version" -> v"$version
    gcc -std=c11 -DUSE_PAPI -DPIPELINE=$version -o v$version pipeline.c -lm -lpapi
    gcc -std=c11 -DUSE_PAPI -DPIPELINE=$version -o v$version.O3n pipeline.c -lm -lpapi -O3 -march=native -mavx2
done

echo "compiling vector"
# BUGFIX: the output name was misspelled "vetor"; the companion "run"
# script executes ./vector, so the unoptimized binary was never found.
gcc -std=c11 -DUSE_PAPI -march=native -o vector vector.c -lm -lpapi
gcc -std=c11 -DUSE_PAPI -O3 -march=native -mavx2 -o vector.O3n vector.c -lm -lpapi
}} 34 | 35 | 36 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \ 37 | retval = PAPI_query_event(papi_events[i]); \ 38 | if ( retval == PAPI_OK ) { \ 39 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \ 40 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \ 41 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } } 42 | 43 | #define PAPI_INIT { \ 44 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \ 45 | if (retval != PAPI_VER_CURRENT) \ 46 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \ 47 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \ 48 | PAPI_ADD_EVENTS_to_SET; } 49 | 50 | // to use HIGH-LEVEL API 51 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); } 52 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); } 53 | 54 | // to use NORMAL API 55 | #define PAPI_START_CNTR { \ 56 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); } 57 | 58 | #define PAPI_STOP_CNTR { \ 59 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \ 60 | if( retval == PAPI_OK ) { \ 61 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \ 62 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); } 63 | 64 | 65 | 66 | #else // ----------------------------------------------------------- 67 | 68 | #define PAPI_INIT 69 | #define PAPI_START_CNTR 70 | #define PAPI_STOP_CNTR 71 | 72 | #endif // ----------------------------------------------------------- 73 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/combine_2_arrays/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #./compile_all 4 | 5 | export 
/* cclock - wall-clock time in seconds since the Unix Epoch.
 * (Despite the original comment, it is not relative to a timer reset:
 * it simply wraps gettimeofday with microsecond resolution.)
 */
double cclock()
{
    struct timeval now;
    gettimeofday(&now, (struct timezone *)0);
    return now.tv_sec + ((double)now.tv_usec) / 1000000.0;
}

/* setup_matrix - fill the n x m row-major matrix a so that element (i,j)
 * holds i*m + j + stride; "stride" lets different matrices get distinct
 * contents. */
void setup_matrix(double *a, int n, int m, int stride)
{
    int row, col;
    for (row = 0; row < n; row++) {
        for (col = 0; col < m; col++) {
            a[row * m + col] = row * m + col + stride;
        }
    }
}

/* clear_matrix - zero every element of the n x m matrix a. */
void clear_matrix(double *a, int n, int m)
{
    int row, col;
    for (row = 0; row < n; row++) {
        for (col = 0; col < m; col++) {
            a[row * m + col] = 0;
        }
    }
}

/* mat_mult - naive i-j-k triple loop computing c(n,o) += a(n,m)*b(m,o).
 * The innermost loop walks b down a column (stride o doubles), which is
 * the cache-unfriendly "slow" teaching version. */
void mat_mult(double *a, double *b, double *c, int n, int m, int o)
{
    int i, j, k;
    for (i = 0; i < n; i++) {
        for (j = 0; j < o; j++) {
            for (k = 0; k < m; k++) {
                c[i * o + j] += a[i * m + k] * b[k * o + j];
            }
        }
    }
}

/* mat_mult_opt - loop-interchanged i-k-j version: b and c are both
 * accessed with unit stride in the innermost loop, so cache behaviour
 * improves while the arithmetic result is identical. */
void mat_mult_opt(double *a, double *b, double *c, int n, int m, int o)
{
    int i, j, k;
    for (i = 0; i < n; i++) {
        for (k = 0; k < m; k++) {
            for (j = 0; j < o; j++) {
                c[i * o + j] += a[i * m + k] * b[k * o + j];
            }
        }
    }
}
n; i++) 57 | for (k = 0; k < m; k++) 58 | for (j = 0; j < o; j++) 59 | c[i*o + j] += a[i*m + k] * b[k*o + j]; 60 | 61 | 62 | } 63 | 64 | 65 | int main(int argc, char** argv) 66 | { 67 | 68 | double *a, *b, *c; 69 | int w, m,n,o; 70 | double begin, end; 71 | 72 | if (argc < 5) 73 | { 74 | printf(" Calculates c(n,o)=a(n,m)*b(m,o) \n"); 75 | printf(" Usage: %s case n m o ", argv[0]); 76 | return 1; 77 | } 78 | 79 | w=atoi(argv[1]); 80 | n=atoi(argv[2]); 81 | m=atoi(argv[3]); 82 | o=atoi(argv[4]); 83 | 84 | a = malloc(n * m * sizeof(double)); 85 | b = malloc(m * o * sizeof(double)); 86 | c = malloc(n * o * sizeof(double)); 87 | 88 | setup_matrix(a, n, m, 0); 89 | setup_matrix(b, m, o, m*n); 90 | clear_matrix(c, n, o); 91 | 92 | if( w == 0 ) 93 | { 94 | begin = cclock(); 95 | mat_mult(a, b, c, n, m, o); 96 | end = cclock(); 97 | printf ("NON-optimized elapsed time %9.4f s \n\n", end - begin ); 98 | } 99 | else 100 | { 101 | begin = cclock(); 102 | mat_mult_opt(a, b, c, n, m, o); 103 | end = cclock(); 104 | printf (" Optimized Elapsed time %9.4f s \n\n", end - begin ); 105 | } 106 | 107 | // printf("%f\n", c[0]); 108 | free(a); 109 | free(b); 110 | free(c); 111 | 112 | return 0; 113 | 114 | } 115 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/matrix_multiplication/mypapi.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #if defined(USE_PAPI) // ----------------------------------------------------------- 4 | #include 5 | 6 | typedef unsigned long long int uLint; 7 | 8 | #define PAPI_EVENTS_NUM 3 9 | int papi_events[PAPI_EVENTS_NUM] = {PAPI_TOT_INS, PAPI_TOT_CYC, PAPI_L1_DCM }; 10 | int papi_EventSet = PAPI_NULL; // the handle for the events' set 11 | uLint papi_buffer[PAPI_EVENTS_NUM] = {0}; // storage for the counters' values 12 | uLint papi_values[PAPI_EVENTS_NUM] = {0}; // accumulate the counters' values 13 | 14 | // check that PAPI is OK, exit if 
not 15 | #define PAPI_CHECK( R ) { \ 16 | if ( (R) != PAPI_OK ) { \ 17 | printf("a problem with PAPI (code %d) arise at line %d\n", \ 18 | (R), __LINE__);fflush(stdout); return (R); }} 19 | 20 | 21 | // check that PAPI is OK, 22 | // issue a warning if not with a 23 | // provided message 24 | #define PAPI_WARN( R, S ) { \ 25 | if ( (R) != PAPI_OK ) { \ 26 | printf("a problem with PAPI (code %d) arise at line %d: %s\n", \ 27 | (R), __LINE__, (S)); fflush(stdout); }} 28 | 29 | // check that PAPI is OK about an event 30 | // issue a warning if not with a 31 | // provided message 32 | #define PAPI_WARN_EVENT( R, E, S1, n ) { \ 33 | if ( (R) != PAPI_OK ) { \ 34 | printf("a problem with PAPI (code %d) : event %d arise at line %d: %s (%d)\n", \ 35 | (R), (E), __LINE__, (S1), (n)); fflush(stdout); }} 36 | 37 | 38 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \ 39 | retval = PAPI_query_event(papi_events[i]); \ 40 | if ( retval == PAPI_OK ) { \ 41 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \ 42 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \ 43 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } } 44 | 45 | #define PAPI_INIT { \ 46 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \ 47 | if (retval != PAPI_VER_CURRENT) \ 48 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \ 49 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \ 50 | PAPI_ADD_EVENTS_to_SET; } 51 | 52 | // to use HIGH-LEVEL API 53 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); } 54 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); } 55 | 56 | // to use NORMAL API 57 | #define PAPI_START_CNTR { \ 58 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); } 59 | 60 | #define 
PAPI_STOP_CNTR { \ 61 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \ 62 | if( retval == PAPI_OK ) { \ 63 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \ 64 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); } 65 | 66 | #define PAPI_FLUSH_BUFFER { \ 67 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \ 68 | papi_buffer[ jj] = 0; } 69 | 70 | #define PAPI_FLUSH { \ 71 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \ 72 | papi_values[jj] = papi_buffer[ jj] = 0; } 73 | 74 | 75 | #else // ----------------------------------------------------------- 76 | 77 | #define PAPI_INIT 78 | #define PAPI_START_CNTR 79 | #define PAPI_STOP_CNTR 80 | 81 | #endif // ----------------------------------------------------------- 82 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/matrix_multiplication/plot.gp: -------------------------------------------------------------------------------- 1 | reset 2 | set terminal pngcairo size 1600,1000 dashlength 2 truecolor font "Garamond, 28" 3 | #set terminal qt enhanced size 1200,1000 4 | 5 | set key inside top left font ",22" 6 | set tics font ",22" 7 | set lmargin screen 0.08 8 | set rmargin screen 0.95 9 | set bmargin screen 0.12 10 | 11 | set xlabel "N" font ",22" offset 0,0.5 12 | 13 | unset yrange 14 | unset xrange 15 | 16 | array OPT[2] 17 | OPT[1] = "O0 " 18 | OPT[2] = "O3 " 19 | 20 | array W[2] 21 | W[1] = 3 22 | W[2] = 1.5 23 | 24 | array DT[2] 25 | DT[1] = "-- __" 26 | DT[2] = 1 27 | 28 | array TYPE[3] 29 | TYPE[1] = "naive" 30 | TYPE[2] = "optimized" 31 | TYPE[3] = "tailed" 32 | 33 | 34 | # --------------------------------------------- 35 | 36 | set output "timings.png" 37 | set ylabel "timing (sec)" font ",22" offset 2 38 | 39 | 40 | plot for[L = 1:2] for [i = 2:4] "timings" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1],\ 41 | "" u 1:(1.5e-8*$1**3) w l lc 0 lw 2 dt '..' 
notitle,\ 42 | "" u 1:(3e-9*$1**3) w l lc 0 lw 2 dt '..' notitle 43 | 44 | 45 | # --------------------------------------------- 46 | 47 | set output "timings_ratio.png" 48 | set ylabel "timings / timings_{naive}" font ",22" offset 2 49 | 50 | ref = 2 51 | clr = 2 52 | plot for[L = 1:2] for [i = 3:4] "timings" u 1:(column(i+(L-1)*3)/column(ref)) w lp ps 2 lw W[L] dt DT[L] lc ((L-1)*3+(i-1)) title OPT[L].TYPE[i-1] 53 | 54 | # --------------------------------------------- 55 | 56 | set output "timings_per_element.png" 57 | set ylabel "timing per element (nsec)" font ",22" offset 2 58 | 59 | plot for[L = 1:2] for [i = 2:4] "timings" u 1:(column(i+(L-1)*3)/($1**3)*1e9) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1] 60 | 61 | 62 | # --------------------------------------------- 63 | 64 | set output "CPE.png" 65 | set ylabel "CPE" font ",22" offset 2 66 | 67 | plot for[L = 1:2] for [i = 2:4] "CPEs" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1] 68 | 69 | 70 | # --------------------------------------------- 71 | 72 | set output "L1M.png" 73 | set ylabel "Level 1 misses per element" font ",22" offset 2 74 | 75 | plot for[L = 1:2] for [i = 2:4] "L1Ms" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1] 76 | 77 | 78 | # --------------------------------------------- 79 | 80 | set output "IPC.png" 81 | set key inside bottom left 82 | set ylabel "IPC" font ",22" offset 2 83 | set yrange [:4] 84 | 85 | plot for[L = 1:2] for [i = 2:4] "IPCs" u 1:(column(i+(L-1)*3)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1] 86 | 87 | 88 | 89 | 90 | set output 91 | reset 92 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/matrix_multiplication/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export LC_NUMERIC="en_US.UTF-8" 4 | export LC_LOCALE="en_US.UTF-8" 5 | 6 | exec=matmul 7 | 8 | declare -a 
outputs=("timings" "IPCs" "CPEs" "L1Ms") 9 | ndata=${#outputs[@]} 10 | 11 | declare -a optimizations=("Non-opt" "Opt") 12 | noptimizations=${#optimizations[@]} 13 | 14 | declare -a versions=("naive " "lpswap" "tailed") 15 | nversions=${#versions[@]} 16 | 17 | # -------------------------------------------- 18 | # SAVE THE OLD TABLES, IF PRESENT 19 | # -------------------------------------------- 20 | 21 | for (( o=0 ; o < $ndata; o++ )); 22 | do 23 | mv -f ${outputs[$o]} ${outputs[$o]}.back 24 | echo -n "# ">> ${outputs[$o]} 25 | 26 | for (( p=0; p < $noptimizations; p++ )); do 27 | echo -e -n ${optimizations[$p]}"\t\t" >> ${outputs[$o]} ; 28 | done 29 | echo >> ${outputs[$o]} 30 | 31 | echo -n "#N ">> ${outputs[$o]} 32 | for (( p=0; p < $noptimizations; p++ )); do 33 | for (( v=0; v < $nversions; v++ )); do 34 | echo -n ${versions[$v]}" " >> ${outputs[$o]}; done; 35 | done 36 | echo >> ${outputs[$o]} 37 | done 38 | 39 | # -------------------------------------------- 40 | # PREPARE OUTPUT FOLDER 41 | # -------------------------------------------- 42 | 43 | output_dir=./output_saved 44 | if [ ! -d $output_dir ]; then mkdir $output_dir; fi 45 | 46 | # -------------------------------------------- 47 | 48 | 49 | start=100 50 | stop=3000 51 | inc=100 52 | 53 | 54 | 55 | 56 | echo -n "running.. " 57 | for (( N=$start; N<=$stop; N+=$inc )); 58 | do 59 | echo -n "N="$N".. 
" 60 | for (( V=0; V<$nversions; V++ )); 61 | do 62 | taskset -c 2 ./$exec $V $N $N $N > ${output_dir}/output.${V}.${N} 63 | results+=($(cat ${output_dir}/output.${V}.${N} | gawk '{ if($1=="elapsed") time=$3; else if($1=="IPC:") IPC=$2; else if($1=="cycles-per-element:") CPE=$2; else if($1=="L1miss-per-element:") L1M=$2} END {print time, IPC, CPE,L1M}')) 64 | 65 | taskset -c 2 ./${exec}.On $V $N $N $N > ${output_dir}/output.O.${V}.${N} 66 | resultsO+=($(cat ${output_dir}/output.O.${V}.${N} | gawk '{ if($1=="elapsed") time=$3; else if($1=="IPC:") IPC=$2; else if($1=="cycles-per-element:") CPE=$2; else if($1=="L1miss-per-element:") L1M=$2} END {print time, IPC, CPE,L1M}')) 67 | 68 | done 69 | 70 | for (( o=0 ; o < ${#outputs[@]}; o++ )); 71 | do 72 | echo -n $N" " >> ${outputs[$o]} 73 | for (( c=0; c<$nversions; c++ )); do echo -n ${results[$(($c*$ndata+$o))]}" " >> ${outputs[$o]}; done 74 | for (( c=0; c<$nversions; c++ )); do echo -n ${resultsO[$(($c*$ndata+$o))]}" " >> ${outputs[$o]}; done 75 | echo >> ${outputs[$o]} 76 | done 77 | 78 | results=() 79 | resultsO=() 80 | done 81 | echo 82 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/Makefile: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | #CFLAGS=-Wall -O1 -msse3 3 | CFLAGS= -O3 -march=native 4 | OBJ=poly.o benchmark.o timing/clock.o statistics/cpe.o statistics/fcyc.o statistics/lsquare.o 5 | LDFLAGS=-lm 6 | # phony targets will always be remade, so a file named "clean" 7 | # won't prevent the clean target from running 8 | .PHONY: all clean run 9 | EXE=driver 10 | 11 | all: $(EXE) 12 | 13 | $(EXE): $(OBJ) 14 | $(CC) $(CFLAGS) -o $(EXE) $(OBJ) $(LDFLAGS) 15 | 16 | run: $(EXE) 17 | ./$(EXE) 18 | 19 | clean: 20 | rm -f $(EXE) $(OBJ) 21 | 22 | -------------------------------------------------------------------------------- 
/CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/benchmark.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "statistics/cpe.h" 5 | #include "poly.h" 6 | 7 | #define SHORT 0 8 | #if SHORT 9 | #define ASIZE 31 10 | #else 11 | #define ASIZE 973 12 | #endif 13 | #define EPS (1e-8) 14 | 15 | /* Keep track of a number of different programs */ 16 | #define MAX_BENCHMARKS 100 17 | 18 | static struct { 19 | poly_t cfunct; 20 | char *description; 21 | double cpe; 22 | } benchmarks[MAX_BENCHMARKS]; 23 | 24 | static int benchmark_count = 0; 25 | static int current_benchmark = 0; 26 | 27 | static double* data = NULL; 28 | static double x; 29 | static double result; 30 | static poly_t check_func = NULL; 31 | 32 | static void setup() 33 | { 34 | int i; 35 | if (!data) 36 | data = (double*) malloc(sizeof(double) * ASIZE); 37 | if (!data) { 38 | fprintf(stderr, "Memory allocation error!\n"); 39 | exit(EXIT_FAILURE); 40 | } 41 | /* Initialize array */ 42 | for (i = 0; i < ASIZE; i++) 43 | data[i] = (drand48() * 2) - 1; 44 | x = (drand48() * 2) - 1; 45 | } 46 | 47 | void run(int cnt) { 48 | result = benchmarks[current_benchmark].cfunct(data, x, cnt); 49 | } 50 | 51 | static void run_test(int bench_index) { 52 | double cpe; 53 | char *description = benchmarks[bench_index].description; 54 | double good_result; 55 | current_benchmark = bench_index; 56 | printf("starting benchmark %d\n", bench_index); 57 | setup(); 58 | cpe = find_cpe_full(run, ASIZE, 200000, stdout, RAN_SAMPLE, 0.3, 0); 59 | if (check_func) { 60 | result = benchmarks[bench_index].cfunct(data, x, ASIZE); 61 | good_result = check_func(data, x, ASIZE); 62 | if (result - good_result > EPS) { 63 | printf("Function %s, Should be %f, Got %f\n", 64 | description, good_result, result); 65 | } 66 | } 67 | benchmarks[current_benchmark].cpe = cpe; 68 | /* print results */ 69 | printf("%s: ", description); 70 | printf("%.2f 
cycles/element\n\n", cpe); 71 | } 72 | 73 | void add_function(poly_t f, char *description) { 74 | benchmarks[benchmark_count].cfunct = f; 75 | benchmarks[benchmark_count].description = description; 76 | benchmark_count++; 77 | } 78 | 79 | void set_check_function(poly_t f) { 80 | check_func = f; 81 | } 82 | 83 | int main() 84 | { 85 | int i; 86 | register_functions(); 87 | printf("\n"); 88 | for (i = 0; i < benchmark_count; i++) { 89 | run_test(i); 90 | } 91 | free(data); 92 | return EXIT_SUCCESS; 93 | } 94 | 95 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/poly.c: -------------------------------------------------------------------------------- 1 | #include "poly.h" 2 | 3 | double poly(double a[], double x, int degree) 4 | { 5 | long int i; 6 | double result = a[0]; 7 | double xpwr = x; /* equals x^i at start of loop */ 8 | for (i = 1; i <= degree; i++) { 9 | result += a[i] * xpwr; 10 | xpwr = x * xpwr; 11 | } 12 | return result; 13 | } 14 | 15 | double polyh(double a[], double x, int degree) 16 | { 17 | long int i; 18 | double result = a[degree]; 19 | for (i = degree-1; i >= 0; i--) 20 | result = a[i] + x*result; 21 | return result; 22 | } 23 | 24 | double mypoly1(double a[], double x, int degree) 25 | { 26 | long int i; 27 | double x2 = x*x; 28 | double res = a[0]; 29 | double xpwr = x; 30 | 31 | for ( i = 1; i < degree; i += 2 ) 32 | { 33 | res += a[i] * xpwr; 34 | res += a[i+1] * xpwr * x; 35 | xpwr *= x2; 36 | } 37 | return res; 38 | } 39 | 40 | double mypoly2(double a[], double x, int degree) 41 | { 42 | long int i; 43 | double x2 = x*x; 44 | double res = a[0]; 45 | double xpwr = x; 46 | 47 | for ( i = 1; i < degree; i += 2 ) 48 | { 49 | res += (a[i] + a[i+1]*x)* xpwr; 50 | // res += a[i+1] * xpwr * x; 51 | xpwr *= x2; 52 | } 53 | for ( ; i <= degree; i ++ ) 54 | { 55 | res += a[i] * xpwr; 56 | xpwr *= x; 57 | } 58 | return res; 59 | } 60 | 61 | double 
mypoly3(double a[], double x, int degree) 62 | { 63 | long int i; 64 | double x2 = x*x; 65 | double res1 = a[0]; 66 | double res2 = a[2]; 67 | double xpwr = x; 68 | double xpwr3 = x2*x; 69 | 70 | for ( i = 1; i < degree-4; i += 4 ) 71 | { 72 | res1 += (a[i] + a[i+1]*x)* xpwr; 73 | res2 += (a[i+2] + a[i+3]*x)* xpwr3; 74 | xpwr *= x2; 75 | xpwr3 *= x2; 76 | } 77 | for ( ; i <= degree; i ++ ) 78 | { 79 | res1 += a[i] * xpwr; 80 | xpwr *= x; 81 | } 82 | 83 | return res1+res2; 84 | } 85 | 86 | 87 | double mypoly4(double a[], double x, int degree) 88 | { 89 | long int i; 90 | double x2 = x*x; 91 | double res_even = a[0]; 92 | double res_odd = 0; 93 | double xpwr_even = x2; 94 | double xpwr_odd = x; 95 | 96 | for ( i = 1; i <= degree; i += 2 ) 97 | { 98 | res_odd += a[i] * xpwr_odd; 99 | xpwr_odd *= x2; 100 | res_even += a[i+1] * xpwr_even; 101 | xpwr_even *= x2; 102 | } 103 | //for ( ; i <= degree; i ++ ) 104 | // { 105 | // res_odd += a[i] * xpwr_even; 106 | // xpwr_even *= x; 107 | // } 108 | return res_even + res_odd; 109 | } 110 | 111 | void register_functions(void) 112 | { 113 | set_check_function(&poly); /* used as reference implementation */ 114 | 115 | add_function(&poly, "Polynomial: Naive implementation"); 116 | add_function(&polyh, "Polynomial: Horner's method"); 117 | add_function(&mypoly1, "Polynomial: my poly1, unroll x 2"); 118 | add_function(&mypoly2, "Polynomial: my poly2, 2 separate loops"); 119 | add_function(&mypoly3, "Polynomial: my poly3, unroll x 2 and separate accumulation"); 120 | 121 | return; 122 | } 123 | 124 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/poly.h: -------------------------------------------------------------------------------- 1 | #if __INTEL_COMPILER 2 | /* inline function definitions */ 3 | #pragma warning ( disable : 1418 ) 4 | #endif 5 | 6 | typedef double (*poly_t)(double*, double, int); 7 | /* Add routine to list of programs 
to measure */ 8 | void add_function(poly_t f, char *description); 9 | /* Set routine to check results against */ 10 | void set_check_function(poly_t f); 11 | /* called by main to register the set of routines to benchmark */ 12 | void register_functions(void); 13 | 14 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/readme.md: -------------------------------------------------------------------------------- 1 | pipelining at work in evaulation of polynomials 2 | 3 | just typing "make" you should get an executable named "driver". 4 | by default, -O3 -march=native is enabled. 5 | 6 | That will evaluate a polynomial using different functions that are defined in poly.c 7 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/cpe.h: -------------------------------------------------------------------------------- 1 | /* Compute CPE for function */ 2 | 3 | /* Compute for function that is linear in some parameter cnt */ 4 | typedef void (*elem_fun_t)(int); 5 | 6 | /* Different ways of finding samples 7 | UNI_SAMPLE: samples uniformly spaced between bias*maxcnt and maxcnt 8 | RAN_SAMPLE: samples randomly selected between bias*maxcnt and maxcnt 9 | */ 10 | 11 | typedef enum {UNI_SAMPLE, RAN_SAMPLE} 12 | sample_t; 13 | 14 | /* Find cpe for function f, which allows cnt up to maxcnt. 15 | Uses default parameters 16 | */ 17 | double find_cpe(elem_fun_t f, int maxcnt); 18 | 19 | /* Find cpe for function f, which allows cnt up to maxcnt, using 20 | specified number of sample points. 
21 | If data_file, then print data so that can plot points with Excel 22 | smethod determines method for generating samples 23 | */ 24 | double find_cpe_full(elem_fun_t f, int maxcnt, int samples, FILE *data_file, 25 | sample_t smethod, double bias, int verbose); 26 | 27 | /* Find number of cycles taken by function. 28 | Do this by running number of trials until best two within TOL (2%) of 29 | each other 30 | */ 31 | double measure_function(elem_fun_t f, int cnt); 32 | 33 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/fcyc.h: -------------------------------------------------------------------------------- 1 | 2 | /* Fcyc measures the speed of any "test function." Such a function 3 | is passed a list of integer parameters, which it may interpret 4 | in any way it chooses. 5 | */ 6 | 7 | typedef void (*test_funct)(int *); 8 | 9 | /* Compute number of cycles used by function f on given set of parameters */ 10 | double fcyc(test_funct f, int* params); 11 | 12 | /***********************************************************/ 13 | /* Set the various parameters used by measurement routines */ 14 | 15 | 16 | /* When set, will run code to clear cache before each measurement 17 | Default = 0 18 | */ 19 | void set_fcyc_clear_cache(int clear); 20 | 21 | /* Set size of cache to use when clearing cache 22 | Default = 1<<19 (512KB) 23 | */ 24 | void set_fcyc_cache_size(int bytes); 25 | 26 | /* Set size of cache block 27 | Default = 32 28 | */ 29 | void set_fcyc_cache_block(int bytes); 30 | 31 | /* When set, will attempt to compensate for timer interrupt overhead 32 | Default = 0 33 | */ 34 | void set_fcyc_compensate(int compensate); 35 | 36 | /* Value of K in K-best 37 | Default = 3 38 | */ 39 | void set_fcyc_k(int k); 40 | 41 | /* Maximum number of samples attempting to find K-best within some tolerance. 42 | When exceeded, just return best sample found. 
43 | Default = 20 44 | */ 45 | void set_fcyc_maxsamples(int maxsamples); 46 | 47 | /* Tolerance required for K-best 48 | Default = 0.01 49 | */ 50 | void set_fcyc_epsilon(double epsilon); 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/lsquare.c: -------------------------------------------------------------------------------- 1 | /* Compute least squares fit of set of data points */ 2 | #include 3 | #include 4 | #include "lsquare.h" 5 | 6 | typedef struct { 7 | double sum_x; 8 | double sum_y; 9 | double sum_xx; 10 | double sum_xy; 11 | } ls_stat_t; 12 | 13 | /* Accumulate various sums of the data */ 14 | static void ls_stats(double *xval, double *yval, int cnt, ls_stat_t *statp) 15 | { 16 | int i; 17 | statp->sum_x = 0.0; 18 | statp->sum_y = 0.0; 19 | statp->sum_xx = 0.0; 20 | statp->sum_xy = 0.0; 21 | for (i = 0; i < cnt; i++) { 22 | double x = xval[i]; 23 | double y = yval[i]; 24 | statp->sum_x += x; 25 | statp->sum_y += y; 26 | statp->sum_xx += x * x; 27 | statp->sum_xy += x * y; 28 | } 29 | } 30 | 31 | double ls_slope(double *xval, double *yval, int cnt) 32 | { 33 | double slope; 34 | ls_stat_t stat; 35 | ls_stats(xval, yval, cnt, &stat); 36 | slope = (cnt * stat.sum_xy - stat.sum_x * stat.sum_y)/ 37 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x); 38 | return slope; 39 | } 40 | 41 | double ls_intercept(double *xval, double *yval, int cnt) 42 | { 43 | double intercept; 44 | ls_stat_t stat; 45 | ls_stats(xval, yval, cnt, &stat); 46 | intercept = (stat.sum_xx * stat.sum_y - stat.sum_xy * stat.sum_x)/ 47 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x); 48 | return intercept; 49 | } 50 | 51 | static double rel_err(double x, double y, double slope, double intercept) 52 | { 53 | double offset = y - (slope*x+intercept); 54 | if (offset < 0) 55 | offset = -offset; 56 | if (x == 0) 57 | return offset; 58 | /* 59 | printf("x = %.2f, y = %.2f, a = 
%.2f, b = %.2f\n", 60 | x, y, slope, intercept); 61 | printf("Abs err = %.2f, Rel err = %.2f\n", offset, offset/x); 62 | */ 63 | return offset/x; 64 | } 65 | 66 | double ls_error(double *xval, double *yval, int cnt, ls_err_t etype) 67 | { 68 | double slope; 69 | double intercept; 70 | ls_stat_t stat; 71 | int i; 72 | double num, denom; 73 | ls_stats(xval, yval, cnt, &stat); 74 | slope = (cnt * stat.sum_xy - stat.sum_x * stat.sum_y)/ 75 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x); 76 | intercept = (stat.sum_xx * stat.sum_y - stat.sum_xy * stat.sum_x)/ 77 | (cnt * stat.sum_xx - stat.sum_x*stat.sum_x); 78 | num = denom = 0; 79 | for (i = 0; i < cnt; i++) { 80 | double e = rel_err(xval[i], yval[i], slope, intercept); 81 | switch (etype) { 82 | case LS_AVG: 83 | num += e; 84 | denom++; 85 | break; 86 | case LS_MAX: 87 | if (num < e) 88 | num = e; 89 | denom = 1; 90 | break; 91 | default: 92 | fprintf(stderr, "Invalid error type: %d\n", etype); 93 | exit(1); 94 | break; 95 | } 96 | } 97 | return num/denom; 98 | } 99 | 100 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/polynomial_evaluation/statistics/lsquare.h: -------------------------------------------------------------------------------- 1 | /* Compute least squares fit of set of data points */ 2 | 3 | /* Fit is of form y = mx + b. 
/* clock.h - cycle-counter based timing routines. */
#ifndef CLOCK_H          /* include guard added */
#define CLOCK_H

#if __INTEL_COMPILER
/* inline function definitions */
#pragma warning ( disable : 1418 )
#endif

/* Routines for using cycle counter */

/* Start the counter.  (void) is used throughout: empty parentheses in a
   pre-C23 declaration mean "unspecified arguments", not "no arguments". */
void start_counter(void);
void start_counter_copy(void);

/* Get # cycles since counter started */
double get_counter(void);
double get_counter_copy(void);

/* Measure overhead for counter */
double ovhd(void);

/* Determine clock rate of processor */
double mhz(int verbose);

/* Determine clock rate of processor, having more control over accuracy */
double mhz_full(int verbose, int sleeptime);

/** Special counters that compensate for timer interrupt overhead */

void start_comp_counter(void);

double get_comp_counter(void);

#endif /* CLOCK_H */
papi_buffer[PAPI_EVENTS_NUM] = {0}; // storage for the counters' values 12 | uLint papi_values[PAPI_EVENTS_NUM] = {0}; // accumulate the counters' values 13 | 14 | // check that PAPI is OK, exit if not 15 | #define PAPI_CHECK( R ) { \ 16 | if ( (R) != PAPI_OK ) { \ 17 | printf("a problem with PAPI (code %d) arise at line %d\n", \ 18 | (R), __LINE__);fflush(stdout); return (R); }} 19 | 20 | 21 | // check that PAPI is OK, 22 | // issue a warning if not with a 23 | // provided message 24 | #define PAPI_WARN( R, S ) { \ 25 | if ( (R) != PAPI_OK ) { \ 26 | printf("a problem with PAPI (code %d) arise at line %d: %s\n", \ 27 | (R), __LINE__, (S)); fflush(stdout); }} 28 | 29 | // check that PAPI is OK about an event 30 | // issue a warning if not with a 31 | // provided message 32 | #define PAPI_WARN_EVENT( R, E, S1, n ) { \ 33 | if ( (R) != PAPI_OK ) { \ 34 | printf("a problem with PAPI (code %d) : event %d arise at line %d: %s (%d)\n", \ 35 | (R), (E), __LINE__, (S1), (n)); fflush(stdout); }} 36 | 37 | 38 | #define PAPI_ADD_EVENTS_to_SET { for ( int i = 0; i < PAPI_EVENTS_NUM; i++) { \ 39 | retval = PAPI_query_event(papi_events[i]); \ 40 | if ( retval == PAPI_OK ) { \ 41 | retval = PAPI_add_event(papi_EventSet, papi_events[i]); \ 42 | PAPI_WARN_EVENT(retval, papi_events[i], "adding event", i);} else { \ 43 | PAPI_WARN_EVENT(retval, papi_events[i],"querying event", i)} } } 44 | 45 | #define PAPI_INIT { \ 46 | int retval = PAPI_library_init(PAPI_VER_CURRENT); \ 47 | if (retval != PAPI_VER_CURRENT) \ 48 | printf("wrong PAPI initialization: version %d instead of %d has been found\n", retval, PAPI_VER_CURRENT); \ 49 | retval = PAPI_create_eventset(&papi_EventSet); PAPI_WARN(retval,"creating event set"); \ 50 | PAPI_ADD_EVENTS_to_SET; } 51 | 52 | // to use HIGH-LEVEL API 53 | //#define PAPI_START_CNTR { int res = PAPI_start_counters(papi_events, PAPI_EVENTS_NUM); PAPI_CHECK_RES(res); } 54 | //#define PAPI_STOP_CNTR { int res = PAPI_stop_counters(papi_values, PAPI_EVENTS_NUM); 
PAPI_CHECK_RES(res); } 55 | 56 | // to use NORMAL API 57 | #define PAPI_START_CNTR { \ 58 | int retval = PAPI_start(papi_EventSet); PAPI_WARN(retval, "starting counters"); } 59 | 60 | #define PAPI_STOP_CNTR { \ 61 | int retval = PAPI_stop(papi_EventSet, papi_buffer); \ 62 | if( retval == PAPI_OK ) { \ 63 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \ 64 | papi_values[jj] += papi_buffer[jj]; } else PAPI_WARN(retval, "reading counters"); } 65 | 66 | #define PAPI_FLUSH_BUFFER { \ 67 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \ 68 | papi_buffer[ jj] = 0; } 69 | 70 | #define PAPI_FLUSH { \ 71 | for( int jj = 0; jj < PAPI_EVENTS_NUM; jj++) \ 72 | papi_values[jj] = papi_buffer[ jj] = 0; } 73 | 74 | 75 | #else // ----------------------------------------------------------- 76 | 77 | #define PAPI_INIT 78 | #define PAPI_START_CNTR 79 | #define PAPI_STOP_CNTR 80 | 81 | #endif // ----------------------------------------------------------- 82 | -------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/reduction/plot.gp: -------------------------------------------------------------------------------- 1 | reset 2 | set terminal pngcairo size 1600,1000 dashlength 2 truecolor font "Garamond, 28" 3 | #set terminal qt enhanced size 1200,1000 4 | 5 | 6 | set tics font ",22" 7 | set rmargin screen 0.95 8 | set bmargin screen 0.12 9 | 10 | set xlabel "N" font ",22" offset 0,0.5 11 | 12 | unset yrange 13 | unset xrange 14 | 15 | array OPT[2] 16 | OPT[1] = "O0 " 17 | OPT[2] = "O3 " 18 | 19 | array W[2] 20 | W[1] = 3 21 | W[2] = 1.5 22 | 23 | array DT[2] 24 | DT[1] = "-- __" 25 | DT[2] = 1 26 | 27 | NTYPE = 7 28 | array TYPE[NTYPE] 29 | TYPE[1] = "naive" 30 | TYPE[2] = "UR2x1" 31 | TYPE[3] = "UR2x1g" 32 | TYPE[4] = "UR2x2" 33 | TYPE[5] = "UR4x2g" 34 | TYPE[6] = "UR4x4" 35 | TYPE[7] = "vUR4x4" 36 | 37 | 38 | # --------------------------------------------- 39 | set key inside top left font ",22" 40 | set lmargin screen 0.08 
41 | # --------------------------------------------- 42 | 43 | set output "timings.png" 44 | set ylabel "timing (sec)" font ",22" offset 2 45 | 46 | 47 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "timings" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1] 48 | 49 | 50 | # --------------------------------------------- 51 | set key outside left 52 | set lmargin screen 0.22 53 | # --------------------------------------------- 54 | 55 | set output "timings_per_element.png" 56 | set ylabel "timing per element (nsec)" font ",22" offset 2, -6 57 | 58 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "timings" u 1:(column(i+(L-1)*NTYPE)/$1*1e9) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1] 59 | 60 | # --------------------------------------------- 61 | 62 | set output "timings_ratio.png" 63 | set ylabel "timings / timings_{naive}" font ",22" offset 2 64 | 65 | ref = 2 66 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "timings" u 1:(column(i+(L-1)*NTYPE)/column(ref)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1] 67 | 68 | # --------------------------------------------- 69 | set output "CPE.png" 70 | set ylabel "CPE" font ",22" offset 2 71 | 72 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "CPEs" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1] 73 | 74 | 75 | # --------------------------------------------- 76 | 77 | set output "L1M.png" 78 | set ylabel "Level 1 misses per element" font ",22" offset 2,-5 79 | 80 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "L1Ms" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1] 81 | 82 | 83 | # --------------------------------------------- 84 | 85 | set output "IPC.png" 86 | set ylabel "IPC" font ",22" offset 2 87 | 88 | plot for[L = 1:2] for [i = 2:(NTYPE+1)] "IPCs" u 1:(column(i+(L-1)*NTYPE)) w lp ps 2 lw W[L] dt DT[L] title OPT[L].TYPE[i-1] 89 | 90 | 91 | 92 | 93 | set output 94 | reset 95 | 
-------------------------------------------------------------------------------- /CODE_OPTIMIZATION/examples_on_pipelines/reduction/reduction.h: -------------------------------------------------------------------------------- 1 | 2 | #if defined(_GNU_SOURCE) 3 | #include 4 | #endif 5 | 6 | // ───────────────────────────────────────────────────────────────── 7 | // define the datatype 8 | // 9 | #if !defined(ITYPE) 10 | #warning "compiling with double type" 11 | #define DTYPE double // type of data 12 | #define DATYPE double // type for accumulator 13 | #else 14 | #warning "compiling with int type" 15 | #define DTYPE unsigned int // type of data 16 | #define DATYPE long long unsigned int // type for accumulator 17 | #endif 18 | 19 | 20 | 21 | typedef unsigned long long int uLint; 22 | 23 | // 24 | // ------------------------------------------------------------------ 25 | 26 | 27 | #define CONCAT(x,y) x ## y 28 | 29 | // ───────────────────────────────────────────────────────────────── 30 | // define the timing routines 31 | // 32 | 33 | #define CPU_TIME (clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), \ 34 | (double)ts.tv_sec + \ 35 | (double)ts.tv_nsec * 1e-9) 36 | 37 | // 38 | // ------------------------------------------------------------------ 39 | 40 | 41 | // ───────────────────────────────────────────────────────────────── 42 | // define the vector generator 43 | // 44 | 45 | #define DEFINE_VECT( T, N, NAME ) typedef T v##NAME __attribute__((vector_size( sizeof(T) * N))); typedef union { v##NAME v; T s[N]; } u##NAME; 46 | 47 | 48 | 49 | // ───────────────────────────────────────────────────────────────── 50 | // define the vector generator 51 | // 52 | 53 | #if defined(__GNUC__) && !defined(__ICC) && !defined(__INTEL_COMPILER) 54 | #define PRAGMA_VECT_LOOP _Pragma("GCC ivdep") 55 | #elif defined(__INTEL_COMPILER) | defined(__ICC) 56 | #define PRAGMA_VECT_LOOP _Pragma("parallel") 57 | #elif defined(__clang__) 58 | #define PRAGMA_VECT_LOOP _Pragma("ivdep") 59 | 
#else 60 | #define PRAGMA_VECT_LOOP 61 | #endif 62 | 63 | // 64 | // ------------------------------------------------------------------ 65 | 66 | // ───────────────────────────────────────────────────────────────── 67 | // 68 | // 69 | 70 | 71 | 72 | // ───────────────────────────────────────────────────────────────── 73 | // define the debug printing routine 74 | // 75 | 76 | #ifdef DEBUG 77 | #define PRINTF(...) printf(__VA_ARGS__) 78 | #define DEBUG_IO 2 79 | #else 80 | #define PRINTF(...) 81 | #endif 82 | 83 | 84 | 85 | DEFINE_VECT( DTYPE, 4, 4d ); 86 | DEFINE_VECT( long int, 4, 4i ); 87 | -------------------------------------------------------------------------------- /HPC_TOOLS_and_STORAGE/Readme.md: -------------------------------------------------------------------------------- 1 | # Materials on HPC libraries, tools, storage 2 | -------------------------------------------------------------------------------- /Materials/A_note_on_Endiansim.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/Materials/A_note_on_Endiansim.pdf -------------------------------------------------------------------------------- /Materials/Readme.md: -------------------------------------------------------------------------------- 1 | # Sparse materials on various topics 2 | 3 | In this folder we will upload materials of interest 4 | 5 | 1) topics.pdf :: a continuosly updated pdf with various topics discussed in the class 6 | 2) What every Computer Scientist should know about floating point :: a good introduction to the IEEE floating point representation 7 | 8 | -------------------------------------------------------------------------------- /Materials/What_every_computer_scientist_should_know_about_floating-point.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/Materials/What_every_computer_scientist_should_know_about_floating-point.pdf -------------------------------------------------------------------------------- /Materials/arguments.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | int main (int argc, char **argv ) 8 | { 9 | 10 | printf("argv is located at address %p and points to %p\n", &argv, argv ); 11 | 12 | int i = 0; 13 | while ( i < argc ) 14 | { 15 | printf("arguments %d is located at address %p and reads as %s\n", i, argv + i, *(argv+i)); 16 | i++; 17 | } 18 | 19 | return 0; 20 | } 21 | -------------------------------------------------------------------------------- /Materials/topics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/Materials/topics.pdf -------------------------------------------------------------------------------- /PARALLELISM/Readme.md: -------------------------------------------------------------------------------- 1 | # Section one: INTRODUCTION TO HPC and PARALLEL CONCEPTS 2 | 3 | ## Day 1: introduction to HPC 4 | date: Tuesday 28/09/2023 5 | 6 | ### lectures 7 | 8 | - Stefano Cozzini : [introduction to HPC](lecture01-intro-toHPC.pdf) 9 | 10 | 11 | The lecture above introduces HPC concepts and basic definitions. 12 | 13 | There is plenty of materials on the topic on the web. 
14 | Here a few links to start with: 15 | 16 | - [FLOPS definition from wikipedia](https://en.wikipedia.org/wiki/FLOPS) 17 | - [ HPC short introduction from European perspective](https://ec.europa.eu/digital-single-market/en/high-performance-computing) 18 | - [ a must read paper: Reinventing High Performance Computing: Challenges and Opportunities](https://arxiv.org/abs/2203.02544) 19 | - [what can we do with an exascale machine](https://www.hpe.com/us/en/insights/articles/whats-with-the-18-zeros-2009.html) 20 | - [the www.top500.org: it deserves a visit to check a few things](https://www.top500.org) 21 | 22 | Application ( not discussed in lecture) 23 | - [Folding@home project: take a look](https://foldingathome.org/?lng=en) 24 | - [AlphaFold web page](https://alphafold.com/) 25 | 26 | ### Materials for Linux beginners: 27 | 28 | - [one simple tutorial to start using ssh](https://www.ssh.com/ssh/command/) 29 | - [linux/unix shell short tutorial for novice users](http://swcarpentry.github.io/shell-novice/) 30 | -------------------------------------------------------------------------------- /PARALLELISM/codes/memory.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | void callocator(double ** vec, size_t N) 7 | { 8 | *vec=(double *)calloc(N, sizeof(double)); 9 | assert(*vec != NULL); 10 | } 11 | 12 | int main(int argc, char **argv) 13 | { 14 | double * v ; 15 | size_t i, j, m; 16 | for (i = 1e3 ; i < 1e8 ; i*=10 ) { 17 | m = sizeof(double) * i ; 18 | callocator(&v, i); 19 | for (j=0; j 16 | #include 17 | #include 18 | // if you don ' t have drand48 uncomment the following two lines 10 19 | // #define drand48 1.0/RANDMAXrand 20 | // #define srand48 srand 21 | #define seed 68111 // seed for number generator 22 | 23 | int main (int argc, char ** argv) { 24 | 25 | if (argc<2) 26 | { 27 | printf(" Usage: %s number \n",argv[0]); 28 | return 1; 29 | } 30 | long long int N = 
atoll(argv[1]); 31 | long long int M = 0 ; 32 | double pi = 0; 33 | // point coordinates 34 | double x , y; 35 | clock_t start_time, end_time; 36 | double total_time; 37 | start_time = clock(); 38 | 39 | srand48 ( seed ) ; // seed the number generator 40 | 41 | long long int i; 42 | for (i = 0 ; i < N ; i++) 43 | { 44 | // take a point P(x,y) inside the unit square 45 | x = drand48(); 46 | y = drand48(); 47 | 48 | // check if the point P(x,y) is inside the circle 49 | if ((x*x + y*y)<1) 50 | M++; 51 | } 52 | pi = 4.0*M/N ; // calculate area 53 | end_time=clock(); 54 | printf ( "\n # of trials = %llu , estimate of pi is %1.9f \n", N, pi ) ; 55 | total_time= ( (double) (end_time - start_time) )/CLOCKS_PER_SEC ; 56 | printf ( "\n # walltime : %10.8f \n", total_time ); 57 | return 0; 58 | } 59 | 60 | -------------------------------------------------------------------------------- /PARALLELISM/lecture01-intro-toHPC.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture01-intro-toHPC.pdf -------------------------------------------------------------------------------- /PARALLELISM/lecture02-HPC-hardware.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture02-HPC-hardware.pdf -------------------------------------------------------------------------------- /PARALLELISM/lecture03-HPCsoftware-stack.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture03-HPCsoftware-stack.pdf 
-------------------------------------------------------------------------------- /PARALLELISM/lecture04-on-parallel-programming.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLELISM/lecture04-on-parallel-programming.pdf -------------------------------------------------------------------------------- /PARALLELISM/slurm/README.md: -------------------------------------------------------------------------------- 1 | This folder contains the following files: 2 | 3 | - slurm01.job 4 | A simple example of a batch script for Slurm jobs 5 | 6 | - slurm02_#.job 7 | Three jobs showing how to run job steps within a Slurm job and the differences between allocating tasks and nodes 8 | 9 | - slurm03_#.job 10 | Three jobs showing the importance of specifying walltime and memory requirements 11 | 12 | - slurm04.job 13 | A simple job showing how to: load modules, compile and run an application within a Slurm job 14 | 15 | - slurm05.job 16 | A simple job showing what happens when we load a module 17 | -------------------------------------------------------------------------------- /PARALLELISM/slurm/slurm01.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Name of the job 4 | #SBATCH --job-name=my_first_job 5 | 6 | # Define the number of nodes you need. 7 | #SBATCH --nodes=1 8 | 9 | # Define the number of tasks you need. Use with distributed parallelism 10 | #SBATCH --ntasks=16 11 | 12 | # Eventually, you can further specify the number of tasks per node 13 | #SBATCH --ntasks-per-node=16 14 | 15 | # Define the number of CPUs allocated to each task. Use with shared memory parallelism 16 | #SBATCH --cpus-per-task=2 17 | 18 | # Define how long the job will run in real time. 
Format is d-hh:mm:ss 19 | # For a 30 seconds job 20 | #SBATCH --time=0-00:00:30 21 | 22 | ## Define the account name, e.g. for the Laboratory of Data Engineering 23 | ##SBATCH -A lade 24 | 25 | # Define the partition on which the job shall run, e.g. EPYC, THIN, GPU, DGX 26 | #SBATCH -p EPYC 27 | 28 | # Define how much memory you need. Choose one between the following 29 | # --mem will define memory per node 30 | # --mem-per-cpu will define memory per CPU/core 31 | #SBATCH --mem-per-cpu=1500MB 32 | ##SBATCH --mem=5GB # this one is not in effect, due to the double hash 33 | 34 | # Specify the output and error files 35 | #SBATCH --output=%x.%j.out 36 | #SBATCH --error=%x.%j.err 37 | 38 | # Eventually, you can turn on mail notification. 39 | # Among the possibilities we can list: NONE, BEGIN, END, FAIL, ALL 40 | ##SBATCH --mail-type=BEGIN,END 41 | ##SBATCH --mail-user=fifo@lifo.com 42 | 43 | # Pick nodes with feature 'foo'. Different clusters have different features available. 44 | # Most of the time you don't need this 45 | ##SBATCH -C foo 46 | 47 | # Restrict the job to run on the node(s) named 48 | ##SBATCH -w epyc008 49 | 50 | #Start the program 51 | 52 | >&2 echo "DIR is ${SLURM_SUBMIT_DIR}" 53 | 54 | srun /bin/hostname 55 | srun sleep 60 56 | 57 | -------------------------------------------------------------------------------- /PARALLELISM/slurm/slurm02_A.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=my_second_job_A 4 | #SBATCH --time=0-00:10:00 5 | #SBATCH -p EPYC 6 | #SBATCH -n3 # 3 tasks 7 | #SBATCH --output=%x.%j.out 8 | #SBATCH --error=%x.%j.err 9 | echo Starting job $SLURM_JOB_ID 10 | echo SLURM assigned me these nodes 11 | srun -l hostname 12 | 13 | echo "1)" $(date) 14 | srun -l --exclusive -n2 sleep 60 & # start 2 copies of program 1 15 | echo "2)" $(date) 16 | srun -l --exclusive -n1 sleep 60 & # start 1 copy of program 2 17 | echo "3)" $(date) 18 | wait # wait for all to 
finish 19 | 20 | -------------------------------------------------------------------------------- /PARALLELISM/slurm/slurm02_B.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=my_second_job_B 4 | #SBATCH --time=0-00:10:00 5 | #SBATCH -p EPYC 6 | #SBATCH -n3 # 3 tasks 7 | #SBATCH --output=%x.%j.out 8 | #SBATCH --error=%x.%j.err 9 | echo Starting job $SLURM_JOB_ID 10 | echo SLURM assigned me these nodes 11 | srun -l hostname 12 | 13 | echo "1)" $(date) 14 | srun -l --exclusive -n2 sleep 60 # start 2 copies of program 1 15 | echo "2)" $(date) 16 | srun -l --exclusive -n1 sleep 60 # start 1 copy of program 2 17 | echo "3)" $(date) 18 | 19 | 20 | -------------------------------------------------------------------------------- /PARALLELISM/slurm/slurm02_C.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=my_second_job_C 3 | #SBATCH --time=0-00:10:00 4 | #SBATCH -p EPYC 5 | #SBATCH -n3 # 3 tasks 6 | #SBATCH --output=%x.%j.out 7 | #SBATCH --error=%x.%j.err 8 | #SBATCH -N3 # 3 NODES 9 | 10 | echo Starting job $SLURM_JOB_ID 11 | echo SLURM assigned me these nodes 12 | srun -l hostname 13 | echo "1)" $(date) 14 | srun -l --exclusive -n2 -N2 sleep 60 & # start 2 copies of program 1 15 | echo "2)" $(date) 16 | srun -l --exclusive -n1 -N1 sleep 60 & # start 1 copy of program 2 17 | echo "3)" $(date) 18 | wait # wait for all to finish 19 | 20 | -------------------------------------------------------------------------------- /PARALLELISM/slurm/slurm03_A.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=memory_A # Job name 3 | #SBATCH --ntasks=1 # Run a single task 4 | #SBATCH --mem=70M # Job Memory 5 | #SBATCH --time=00:15:00 # Time limit hrs:min:sec 6 | #SBATCH -p THIN 7 | #SBATCH --output=%x.%j.out 8 | #SBATCH --error=%x.%j.err 9 | 10 | pwd; hostname; 
date 11 | cd ../codes 12 | 13 | gcc memory.c -o memory.x 14 | ./memory.x 15 | -------------------------------------------------------------------------------- /PARALLELISM/slurm/slurm03_B.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=memory_B # Job name 3 | #SBATCH --ntasks=1 # Run a single task 4 | #SBATCH --mem=90M # Job Memory 5 | #SBATCH --time=00:05:00 # Time limit hrs:min:sec 6 | #SBATCH -p THIN 7 | #SBATCH --output=%x.%j.out 8 | #SBATCH --error=%x.%j.err 9 | 10 | pwd; hostname; date 11 | cd ../codes 12 | 13 | gcc memory.c -o memory.x 14 | ./memory.x 15 | -------------------------------------------------------------------------------- /PARALLELISM/slurm/slurm03_C.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=memory_C # Job name 3 | #SBATCH --ntasks=1 # Run a single task 4 | #SBATCH --mem=100M # Job Memory 5 | #SBATCH --time=00:00:01 # Time limit hrs:min:sec 6 | #SBATCH -p THIN 7 | #SBATCH --output=%x.%j.out 8 | #SBATCH --error=%x.%j.err 9 | 10 | pwd; hostname; date 11 | cd ../codes 12 | 13 | gcc memory.c -o memory.x 14 | ./memory.x 15 | -------------------------------------------------------------------------------- /PARALLELISM/slurm/slurm04.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=compile_and_run_pi 4 | #SBATCH --time=0-00:10:00 5 | #SBATCH -p EPYC 6 | #SBATCH -n1 # 1 tasks 7 | #SBATCH --output=%x.%j.out 8 | #SBATCH --error=%x.%j.err 9 | #SBATCH -N1 # 1 NODES 10 | echo Starting job $SLURM_JOB_ID 11 | echo Current dir is ${SLURM_SUBMIT_DIR} 12 | 13 | module purge 14 | module load compiler # For Intel compiler instead of GNU compiler 15 | cd ../codes 16 | echo "Now, I am in $(pwd)" 17 | icx pi.c -O3 -o pi.x 18 | ./pi.x 100000000 19 | 20 | -------------------------------------------------------------------------------- 
/PARALLELISM/slurm/slurm05.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=modules # Job name 3 | #SBATCH --ntasks=1 # Run a single task 4 | #SBATCH --time=00:05:00 # Time limit hrs:min:sec 5 | #SBATCH -p EPYC 6 | #SBATCH --output=%x.%j.out 7 | #SBATCH --error=%x.%j.err 8 | 9 | module purge 10 | echo "a) "$LD_LIBRARY_PATH 11 | module load openMPI/4.1.5/gnu 12 | echo "b) "$LD_LIBRARY_PATH 13 | module purge 14 | echo "c) "$LD_LIBRARY_PATH 15 | module load openMPI/4.1.5/icx 16 | echo "d) "$LD_LIBRARY_PATH 17 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/Readme.md: -------------------------------------------------------------------------------- 1 | # This folder collects materials on MPI and OpenMP 2 | 3 | ## MPI section 4 | 5 | A collection of materials/references for the MPI lectures 6 | 7 | 8 | ### lectures (all by S.Cozzini) 9 | 10 | - lecture 5a: [MPI programming partA ](lecture05-MPI-Programming-part-A.pdf) 11 | - lecture 5b: [MPI programming partB ](lecture05-MPI-Programming-part-B.pdf) 12 | 13 | 14 | ### Main references for MPI lectures: 15 | 16 | - chapter 9 of reference 4 is a nice and detailed introduction to MPI. 17 | - exercises and tutorials on MPI are present all over the web.
Here a couple of examples: 18 | - [Here a very good starting point](https://www.mcs.anl.gov/research/projects/mpi/tutorial/index.html) 19 | - [Another simple tutorial](https://mpitutorial.com/tutorials/) 20 | - [A virtual course where I took a lot of materials, including some exercises](https://cvw.cac.cornell.edu/MPIP2P/) 21 | 22 | ### tutorials (contributed by Niccolo Tosato and Marco Celoria) 23 | - tutorial 1: [compiling and running MPI program on ORFEO (prepared by N.Tosato)](compiling-and-running-mpi-programs.md) 24 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/Brecv.c: -------------------------------------------------------------------------------- 1 | // taken from https://cvw.cac.cornell.edu/MPIP2P/brecv 2 | #include 3 | #include 4 | #include 5 | #include "mpi.h" 6 | #define TAG 100 7 | 8 | void print_time(double tbegin, double tend); 9 | int new_sleep(int); 10 | int SLEEP(clock_t); 11 | 12 | /* ------------------------------------------------------------------- 13 | * helper to calculate elapsed time and print results 14 | * ------------------------------------------------------------------- 15 | */ 16 | void print_time(double tbegin, double tend) 17 | { 18 | int dt; 19 | dt = (int)((tend - tbegin) * 1000000.0); 20 | printf(" Elapsed time for send = %8d uSec\n", dt); 21 | } 22 | 23 | /* ----------------------------------------------------------- 24 | * helpers to sleep program 25 | * ----------------------------------------------------------- 26 | */ 27 | int SLEEP(clock_t wait) 28 | { 29 | clock_t goal; 30 | wait *= 1000; 31 | goal = wait + clock(); 32 | while (goal > clock() ) 33 | ; 34 | return (0); 35 | } 36 | 37 | int new_sleep(int amount) 38 | { 39 | SLEEP(amount); 40 | return (0); 41 | } 42 | 43 | /* ----------------------------------------------------------- 44 | * Main Program 45 | * ----------------------------------------------------------- 46 | */ 47 | int main(int 
argc, char **argv) 48 | { 49 | float *message; /* message buffer */ 50 | int rank, /* rank of task in communicator */ 51 | size, i; 52 | int mlen; /* dimension of the message */ 53 | MPI_Status status; /* status of communication */ 54 | double tbegin, tend; /* used to measure elapsed time */ 55 | 56 | if (argc != 2) { 57 | printf(" Usage: blocksends \n"); 58 | return -1; 59 | } 60 | 61 | /* ------------------------------------------------------------------- 62 | * do initial housekeeping: allocate memory for messages, 63 | * initialize program with MPI, define message tags 64 | * ------------------------------------------------------------------ 65 | */ 66 | 67 | mlen = atoi(argv[1]); 68 | message = (float *)malloc(mlen * sizeof(float)); 69 | 70 | 71 | MPI_Init(&argc, &argv); 72 | MPI_Comm_size(MPI_COMM_WORLD, &size); 73 | MPI_Comm_rank( MPI_COMM_WORLD, &rank ); 74 | if(size != 2) { 75 | printf("This application is meant to be run with 2 processes.\n"); 76 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); 77 | } 78 | printf(" Process %d initialized\n", rank); 79 | printf(" Message size = %6d floats\n", mlen); 80 | printf(" Total size = %6lu bytes\n", (mlen* sizeof(float))); 81 | 82 | /* ----------------------------------------------------------------- 83 | * task 0 will report the elapsed time for a blocking send 84 | * ----------------------------------------------------------------- 85 | */ 86 | if (rank == 0) { 87 | for (i = 0; i < mlen; i++) message[i] = 100; 88 | printf(" Task %d sending message\n", rank); 89 | MPI_Barrier(MPI_COMM_WORLD); 90 | tbegin = MPI_Wtime(); 91 | MPI_Send(message, mlen, MPI_FLOAT, 1, TAG, MPI_COMM_WORLD); 92 | tend = MPI_Wtime(); 93 | print_time(tbegin, tend); 94 | } 95 | 96 | /* ----------------------------------------------------------------- 97 | * task 1 sleeps for 1 second, and then calls a blocking receive. 
98 | * the sleep is intended to simulate time spent in useful computation 99 | * ----------------------------------------------------------------- 100 | */ 101 | else if (rank == 1) { 102 | for (i = 0; i < mlen; i++) message[i] = -100; 103 | MPI_Barrier(MPI_COMM_WORLD); 104 | new_sleep(1); 105 | MPI_Recv(message, mlen, MPI_FLOAT, 0, TAG, MPI_COMM_WORLD, &status ); 106 | printf(" Task %d received message\n", rank); 107 | } 108 | MPI_Finalize(); 109 | return 0; 110 | } 111 | 112 | 113 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/clean.sh: -------------------------------------------------------------------------------- 1 | rm *.x 2 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/compile_openMPI_gnu.sh: -------------------------------------------------------------------------------- 1 | module load openMPI/4.1.5/gnu 2 | 3 | mpicc Brecv.c -g3 -o Brecv.x 4 | mpicc CBlockSends.c -g3 -o CBlockSends.x 5 | mpicc deadlock.c -g3 -o deadlock.x 6 | mpicc linear-array.c -g3 -o linear-array.x 7 | mpicc mpi_env_call.c -g3 -o mpi_env_call.x 8 | mpicc mpi_hello_world.c -g3 -o mpi_hello_world.x 9 | mpicc mpi_hello_world_sync.c -g3 -o mpi_hello_world_sync.x 10 | mpif90 mpi_hello_world.F90 -g3 -o mpi_hello_world_F.x 11 | mpicc mpi_pi.c -O3 -g3 -o mpi_pi.x 12 | mpif90 send_message.F90 -g3 -o send_message_F.x 13 | mpicc send_message.c -g3 -o send_message.x 14 | mpicc sendrecv_message.c -g3 -o sendrecv_message.x 15 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/compile_openMPI_intel.sh: -------------------------------------------------------------------------------- 1 | module purge 2 | module load openMPI/4.1.5/icx 3 | 4 | mpicc Brecv.c -g3 -o Brecv.x 5 | mpicc CBlockSends.c -g3 -o CBlockSends.x 6 | mpicc deadlock.c -g3 -o deadlock.x 7 | mpicc linear-array.c 
-g3 -o linear-array.x 8 | mpicc mpi_env_call.c -g3 -o mpi_env_call.x 9 | mpicc mpi_hello_world.c -g3 -o mpi_hello_world.x 10 | mpicc mpi_hello_world_sync.c -g3 -o mpi_hello_world_sync.x 11 | mpif90 mpi_hello_world.F90 -g3 -o mpi_hello_world_F.x 12 | mpicc mpi_pi.c -O3 -g3 -o mpi_pi.x 13 | mpif90 send_message.F90 -g3 -o send_message_F.x 14 | mpicc send_message.c -g3 -o send_message.x 15 | mpicc sendrecv_message.c -g3 -o sendrecv_message.x 16 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/deadlock.c: -------------------------------------------------------------------------------- 1 | // A simple program with a deadlock inside. 2 | // Taken and adapted from somewhere on the net 3 | #include 4 | #include "mpi.h" 5 | #include 6 | 7 | int main(int argc, char *argv[]) 8 | { 9 | #define MSGLEN 1024 10 | int ITAG_A = 100, ITAG_B = 200; 11 | int irank, i, isize, idest, isrc, istag, iretag; 12 | float rmsg1[MSGLEN]; 13 | float rmsg2[MSGLEN]; 14 | MPI_Status recv_status; 15 | 16 | MPI_Init(&argc, &argv); 17 | MPI_Comm_rank(MPI_COMM_WORLD, &irank); 18 | MPI_Comm_size(MPI_COMM_WORLD, &isize); 19 | 20 | if(isize != 2) { 21 | printf("This application is meant to be run with 2 processes.\n"); 22 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); 23 | } 24 | 25 | printf("I am rank %d of %d \n", irank, isize ); 26 | // load an array of float numbers as message 27 | // fixed: was i = 1; i <= MSGLEN, which wrote rmsg1[MSGLEN]/rmsg2[MSGLEN] out of bounds (UB) and left element 0 uninitialized 28 | for (i = 0; i < MSGLEN; i++) { 29 | rmsg1[i] = 100; 30 | rmsg2[i] = -100; 31 | } 32 | if (irank == 0) { 33 | idest = 1; 34 | isrc = 1; 35 | istag = ITAG_A; 36 | iretag = ITAG_B; 37 | } 38 | else if (irank == 1) { 39 | idest = 0; 40 | isrc = 0; 41 | istag = ITAG_B; 42 | iretag = ITAG_A; 43 | } 44 | 45 | printf("Task %d sends the message with tag %d of length %lu \n", 46 | irank, istag, MSGLEN * sizeof(float)); 47 | 48 | printf("Task %d receives message with tag %d of length %lu \n", 49 | irank, iretag, MSGLEN * sizeof(float)); 50 | 51 |
MPI_Barrier(MPI_COMM_WORLD); 51 | 52 | MPI_Send(&rmsg1, MSGLEN, MPI_FLOAT, idest, istag, MPI_COMM_WORLD); 53 | MPI_Recv(&rmsg2, MSGLEN, MPI_FLOAT, isrc, iretag, MPI_COMM_WORLD, &recv_status); 54 | printf("Task %d has received the message\n", irank); 55 | MPI_Finalize(); 56 | 57 | } 58 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/linear-array.c: -------------------------------------------------------------------------------- 1 | // A simple 1-D example: 2 | // each element receive the rank of the previous one, add its rank and send forward. 3 | // taken somewhere on net and adapted. 4 | // Final SUM is the sum of n-1 integers. 5 | 6 | #include 7 | #include "mpi.h" 8 | 9 | int main(int argc,char *argv[]) 10 | { 11 | int MyRank, Numprocs; 12 | int value, sum = 0; 13 | int Source, Source_tag; 14 | int Destination, Destination_tag; 15 | int Root = 0; 16 | MPI_Status status; 17 | 18 | MPI_Init(&argc,&argv); 19 | MPI_Comm_size(MPI_COMM_WORLD, &Numprocs); 20 | MPI_Comm_rank(MPI_COMM_WORLD, &MyRank); 21 | 22 | if (MyRank == Root) { 23 | Destination = MyRank + 1; 24 | Destination_tag = 0; 25 | MPI_Send(&MyRank, 1, MPI_INT, Destination, Destination_tag, MPI_COMM_WORLD); 26 | } 27 | else { 28 | if (MyRank 5 | 6 | int main(int argc, char *argv[]) { 7 | int numtasks, rank, len, rc; 8 | char hostname[MPI_MAX_PROCESSOR_NAME]; 9 | 10 | // initialize MPI 11 | MPI_Init(&argc,&argv); 12 | 13 | // get number of tasks 14 | MPI_Comm_size(MPI_COMM_WORLD,&numtasks); 15 | 16 | // get my rank 17 | MPI_Comm_rank(MPI_COMM_WORLD,&rank); 18 | 19 | // this one is obvious 20 | MPI_Get_processor_name(hostname, &len); 21 | printf ("Number of tasks= %d. My rank= %d. 
Running on %s\n", numtasks, rank, hostname); 22 | 23 | // done with MPI 24 | MPI_Finalize(); 25 | } 26 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_hello_world.F90: -------------------------------------------------------------------------------- 1 | PROGRAM hello 2 | INCLUDE 'mpif.h' 3 | INTEGER err, rank, size, name_len 4 | CHARACTER(MPI_MAX_PROCESSOR_NAME) processor_name 5 | CALL MPI_INIT(err) 6 | CALL MPI_COMM_RANK(MPI_COMM_WORLD,rank,err) 7 | CALL MPI_COMM_SIZE(MPI_COMM_WORLD,size,err) 8 | CALL MPI_GET_PROCESSOR_NAME(processor_name,name_len,err) 9 | print *, 'Hello world from processor ', processor_name, ' rank ', rank, ' out of ', size, ' processors' 10 | CALL MPI_FINALIZE(err) 11 | END 12 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_hello_world.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(int argc, char** argv) { 5 | // Initialize the MPI environment 6 | MPI_Init(NULL, NULL); 7 | 8 | // Get the number of processes 9 | int world_size; 10 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 11 | 12 | // Get the rank of the process 13 | int world_rank; 14 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 15 | 16 | // Get the name of the processor 17 | char processor_name[MPI_MAX_PROCESSOR_NAME]; 18 | int name_len; 19 | MPI_Get_processor_name(processor_name, &name_len); 20 | 21 | fprintf(stdout, "Hello world from processor %s, rank %d out of %d processors\n", 22 | processor_name, world_rank, world_size); 23 | // Finalize the MPI environment. 
24 | MPI_Finalize(); 25 | } 26 | 27 | 28 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_hello_world_sync.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(int argc, char** argv) { 5 | // Initialize the MPI environment 6 | MPI_Init(NULL, NULL); 7 | 8 | // Get the number of processes 9 | int world_size; 10 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 11 | 12 | // Get the rank of the process 13 | int world_rank; 14 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 15 | 16 | // Get the name of the processor 17 | char processor_name[MPI_MAX_PROCESSOR_NAME]; 18 | int name_len; 19 | MPI_Get_processor_name(processor_name, &name_len); 20 | 21 | // Print off a hello world message 22 | for (int i=0; i 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #define USE MPI 17 | #define SEED 35791246 18 | 19 | int main (int argc , char *argv[]) 20 | { 21 | // coordinates 22 | double x, y; 23 | 24 | // number of points inside the circle 25 | long long int M, local_M; 26 | double pi; 27 | 28 | // times 29 | double start_time, comp_time, end_time, wall_time, avg_walltime, max_walltime; 30 | int myid, numprocs, proc; 31 | MPI_Status status; 32 | MPI_Request request; 33 | // master process 34 | int master = 0; 35 | int tag = 123; 36 | 37 | MPI_Init(&argc, &argv); 38 | MPI_Comm_size(MPI_COMM_WORLD, &numprocs); 39 | MPI_Comm_rank(MPI_COMM_WORLD, &myid); 40 | fprintf (stdout, "I am %d\n", myid); 41 | if (argc <=1 ) { 42 | fprintf(stderr, "Usage : mpi -np n %s number_of_iterations \n", argv[0]); 43 | MPI_Finalize(); 44 | exit(-1); 45 | } 46 | 47 | long long int N = atoll(argv[1])/numprocs; 48 | // take time of processors after initial I/O operation 49 | start_time = MPI_Wtime(); 50 | 51 | // initialize random numbers 52 | srand48(SEED * (myid + 1)); // seed the number generator 53 | local_M = 0; 54 | long long int 
i; 55 | for (i = 0; i < N ; i++) { 56 | // take a point P(x,y) inside the unit square 57 | x = drand48(); 58 | y = drand48(); 59 | // check if the point P(x,y) is inside the circle 60 | if ( (x*x + y*y) < 1) 61 | local_M++; 62 | } 63 | // take time of processors after initial I/O operation 64 | MPI_Barrier(MPI_COMM_WORLD); 65 | comp_time=MPI_Wtime(); 66 | 67 | if (myid == 0) { //if I am the master process gather results from others 68 | M = local_M; 69 | for (proc = 1; proc < numprocs; proc++) { 70 | MPI_Recv(&local_M, 1, MPI_LONG_LONG, proc, tag, MPI_COMM_WORLD, &status); 71 | M += local_M; 72 | } 73 | pi = 4.0 * M / (N * numprocs); 74 | end_time = MPI_Wtime(); 75 | } 76 | else { // for all the slave processes send results to the master / 77 | MPI_Ssend(&local_M, 1,MPI_LONG_LONG, master, tag, MPI_COMM_WORLD); 78 | end_time=MPI_Wtime(); 79 | } 80 | 81 | wall_time = end_time - start_time; 82 | MPI_Reduce(&wall_time, &avg_walltime, 1, MPI_DOUBLE, MPI_SUM, master, MPI_COMM_WORLD); 83 | avg_walltime = avg_walltime / numprocs; 84 | MPI_Reduce(&wall_time, &max_walltime, 1, MPI_DOUBLE, MPI_MAX, master, MPI_COMM_WORLD); 85 | 86 | fprintf(stdout, "\n# walltime on processor %i : %10.8f\n", myid, wall_time); 87 | fprintf(stdout, "\n# walltime after computation on processor %i : %10.8f\n", myid, comp_time - start_time); 88 | fprintf(stdout, "\n# walltime for communication on processor %i : %10.8f\n", myid, end_time - comp_time); 89 | fflush(stdout); 90 | if (myid ==0) { 91 | printf ( "\n# of trials = %llu , estimate of pi is %1.9f\n", N * numprocs, pi); 92 | fprintf(stdout, "\n[*] Average Walltime: %10.8f\n", avg_walltime); 93 | fprintf(stdout, "\n(*) Max Walltime: %10.8f\n", max_walltime); 94 | fflush(stdout); 95 | } 96 | MPI_Finalize() ; // let MPI finish up / 97 | 98 | } 99 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/mpi_pi.job: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=pi_epyc 4 | #SBATCH --time=0-00:10:00 5 | #SBATCH -p EPYC 6 | #SBATCH -n128 7 | #SBATCH --output=%x.%j.out 8 | #SBATCH --error=%x.%j.err 9 | #SBATCH -N1 # 1 NODES 10 | echo Starting job $SLURM_JOB_ID 11 | echo Current dir is ${SLURM_SUBMIT_DIR} 12 | 13 | module purge 14 | module load compiler 15 | module load intelMPI/2021.7.1 16 | mpiicc -cc=icx mpi_hello_world.c -g3 -o mpi_hello_world.x 17 | mpiicc -cc=icx mpi_hello_world_sync.c -g3 -o mpi_hello_world_sync.x 18 | mpiifort mpi_hello_world.F90 -g3 -o mpi_hello_world_F.x 19 | mpiicc -cc=icx mpi_pi.c -O3 -g3 -o mpi_pi.x 20 | 21 | mpirun -np 12 ./mpi_pi.x 10000 22 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/send_message.F90: -------------------------------------------------------------------------------- 1 | 2 | Program MPI 3 | ! a simple implementation of send/receive message 4 | Implicit None 5 | ! 6 | Include 'mpif.h' 7 | ! 8 | Integer :: rank 9 | Integer :: buffer 10 | Integer, Dimension( 1:MPI_status_size ) :: status 11 | Integer :: error 12 | ! 13 | Call MPI_init( error ) 14 | Call MPI_comm_rank( MPI_comm_world, rank, error ) 15 | ! 16 | If( rank == 0 ) Then 17 | Call MPI_recv( buffer, 1, MPI_integer, 1, 10, & 18 | MPI_comm_world, status, error ) 19 | Print*, 'Rank ', rank, ' buffer=', buffer 20 | If( buffer /= 33 ) Print*, 'fail' 21 | End If 22 | ! 23 | If( rank == 1 ) Then 24 | buffer = 33 25 | Call MPI_send( buffer, 1, MPI_integer, 0, 10, & 26 | MPI_comm_world, error ) 27 | End If 28 | ! 
29 | Call MPI_finalize( error ) 30 | End Program MPI 31 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/send_message.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | int main(int argc, char** argv) { 6 | MPI_Init(&argc, &argv); 7 | int rank, size; 8 | int buffer; 9 | MPI_Status status; 10 | MPI_Comm_size(MPI_COMM_WORLD, &size); 11 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 12 | if(size != 2) { 13 | printf("This application is meant to be run with 2 processes.\n"); 14 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); 15 | } 16 | if (rank == 0){ 17 | // int MPI_Recv(void* buffer, int count, MPI_Datatype datatype, 18 | // int sender, int tag, MPI_Comm communicator, MPI_Status* status); 19 | MPI_Recv(&buffer, 1, MPI_INT, 1, 9, MPI_COMM_WORLD, &status); 20 | fprintf(stdout, "Rank %d: buffer = %d \n", rank, buffer); 21 | if (buffer != 33) fprintf(stderr, "Fail\n"); 22 | } 23 | if (rank == 1) { 24 | buffer = 33; 25 | // int MPI_Send(const void* buffer, int count, MPI_Datatype datatype, 26 | // int recipient, int tag, MPI_Comm communicator); 27 | MPI_Send(&buffer, 1, MPI_INT, 0, 9, MPI_COMM_WORLD); 28 | } 29 | MPI_Finalize(); 30 | return EXIT_SUCCESS; 31 | } 32 | 33 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/basic-mpi-codes/sendrecv_message.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | int main(int argc, char** argv) { 6 | MPI_Init(&argc, &argv); 7 | int rank, size; 8 | int buffer; 9 | char message[2][16]; 10 | MPI_Status status; 11 | MPI_Comm_size(MPI_COMM_WORLD, &size); 12 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 13 | if(size != 2) { 14 | printf("This application is meant to be run with 2 processes.\n"); 15 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); 16 | } 17 
| if (rank == 0){ 18 | strcpy(message[0], "skew"); 19 | strcpy(message[1], "squeue"); 20 | // int MPI_Sendrecv(const void* buffer_send, int count_send, 21 | // MPI_Datatype datatype_send, int recipient, int tag_send, 22 | // void* buffer_recv, int count_recv, 23 | // MPI_Datatype datatype_recv, int sender, int tag_recv, 24 | // MPI_Comm communicator, MPI_Status* status); 25 | MPI_Sendrecv(message, 32, MPI_CHAR, 1, 10, 26 | &buffer, 1, MPI_INT, 1, 9, 27 | MPI_COMM_WORLD, &status); 28 | fprintf(stdout, "Rank %d: buffer = %d \n", rank, buffer); 29 | if (buffer != 33) fprintf(stderr, "Fail\n"); 30 | } 31 | if (rank == 1) { 32 | buffer = 33; 33 | MPI_Sendrecv(&buffer, 1, MPI_INT, 0, 9, 34 | message, 32, MPI_CHAR, 0, 10, 35 | MPI_COMM_WORLD, &status); 36 | 37 | fprintf(stdout, "Rank %d: message[0] = %s, message[1] = %s \n", 38 | rank, message[0], message[1]); 39 | } 40 | MPI_Finalize(); 41 | return EXIT_SUCCESS; 42 | } 43 | 44 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/collective-mpi/allgather.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | #SBATCH --job-name=allgather-example 4 | #SBATCH -p GPU 5 | #SBATCH --nodes=2 6 | #SBATCH --ntasks-per-node=1 7 | ##SBATCH --gres=gpu:2 8 | #SBATCH --time=0:10:00 9 | #SBATCH -o allgather.%A.out 10 | #SBATCH -e allgather.%A.error 11 | ##SBATCH -A lade 12 | #SBATCH --wait-all-nodes=1 13 | #SBATCH --cpus-per-task=16 14 | #SBATCH --mem=10G 15 | ##SBATCH -w dgx002 16 | CURRENT_DIR=${SLURM_SUBMIT_DIR} 17 | head_node=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 18 | head_node_ip=$( srun --nodes=1 --ntasks=1 -w "$head_node" --exclusive hostname --ip-address) 19 | echo "head_node=" ${head_node} " - head_node_ip=" $head_node_ip 20 | #export LOGLEVEL=INFO 21 | #export NCCL_DEBUG=INFO 22 | export OMP_NUM_THREADS=16 23 | cd ../.. 
24 | source myenv_v100/bin/activate 25 | cd - 26 | echo $(pwd) 27 | echo ${CUDA_VISIBLE_DEVICES} 28 | 29 | srun -l torchrun \ 30 | --nnodes 2 \ 31 | --nproc_per_node 2 \ 32 | --rdzv_id $RANDOM \ 33 | --rdzv_backend c10d \ 34 | --rdzv_endpoint $head_node_ip:29500 \ 35 | allgather.py 36 | 37 | 38 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/collective-mpi/allgather.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | import torch.multiprocessing as mp 3 | import torch 4 | import os 5 | 6 | # salloc -N1 -n1 -c64 -A lade -p DGX --gpus-per-node=4 --time=1:59:00 7 | # srun python tmp.py 8 | def ddp_setup(): 9 | dist.init_process_group(backend="nccl") 10 | torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) 11 | 12 | #def ddp_setup(rank: int, world_size: int): 13 | # """ 14 | # Args: 15 | # rank: Unique identifier of each process 16 | # world_size: Total number of processes 17 | # """ 18 | # os.environ["MASTER_ADDR"] = "localhost" 19 | # os.environ["MASTER_PORT"] = "12355" 20 | # dist.init_process_group(backend="nccl", rank=rank, world_size=world_size) 21 | # torch.cuda.set_device(rank) 22 | 23 | def run(): 24 | local_rank = int(os.environ["LOCAL_RANK"]) 25 | global_rank = int(os.environ["RANK"]) 26 | world_size = dist.get_world_size() 27 | torch.manual_seed(global_rank) 28 | n = torch.randint(high=10, size=(1,), dtype=int).to(local_rank) 29 | a = torch.tensor([global_rank] * n, dtype=int).to(local_rank) 30 | for p in range(world_size): 31 | if global_rank==p: 32 | print(f"A) {global_rank}: {a}", flush=True) 33 | dist.barrier() 34 | nelements_list = [torch.zeros_like(n).to(local_rank) for _ in range(world_size)] 35 | dist.all_gather(tensor = n, tensor_list = nelements_list) 36 | gather_list = [torch.zeros(int(nelements_list[i]), dtype=int).to(local_rank) for i in range(world_size)] 37 | dist.all_gather(tensor = a, tensor_list = 
gather_list) 38 | res = torch.cat((gather_list)) 39 | for p in range(world_size): 40 | if global_rank==p: 41 | print(f"B) {global_rank}: {res}", flush=True) 42 | dist.barrier() 43 | 44 | def main(): 45 | ddp_setup() 46 | run() 47 | dist.destroy_process_group() 48 | 49 | if __name__ == "__main__": 50 | # world_size = torch.cuda.device_count() 51 | # mp.spawn(main, args=(world_size,), nprocs=world_size) 52 | main() 53 | 54 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/collective-mpi/allgatherv.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #define SEED 35791246 9 | 10 | int main(int argc, char** argv) { 11 | 12 | int myid, nproc; 13 | MPI_Init(NULL, NULL); 14 | MPI_Comm_size(MPI_COMM_WORLD, &nproc); 15 | MPI_Comm_rank(MPI_COMM_WORLD, &myid); 16 | srand(SEED*(myid+1)) ; // seed the number generator 17 | int numel = 1 + (rand() % 9); 18 | int totel; 19 | 20 | int counts_recv[nproc]; 21 | int displacements[nproc]; 22 | MPI_Allgather(&numel, 1, MPI_INT, counts_recv, 1, MPI_INT, MPI_COMM_WORLD); 23 | displacements[0] = 0 ; 24 | for (int i = 1; i < nproc ; i++){ 25 | displacements[i] = displacements[i-1] + counts_recv[i-1]; 26 | } 27 | 28 | double * a = (double*)malloc(sizeof(double) * numel); 29 | assert(a != NULL); 30 | 31 | for (int i=0; i < numel; i++) { 32 | a[i]=myid; 33 | } 34 | 35 | for (int i = 0; i < nproc; i++) { 36 | if (i == myid) { 37 | fprintf(stdout, "BEFORE\tmyid = %d\n", myid ); 38 | for (int n = 0 ; n < numel; n++) 39 | fprintf(stdout, "\ta[%d]=%.1f\n", n, a[n]); 40 | fprintf(stdout, "\n"); 41 | fflush(stdout); 42 | } 43 | MPI_Barrier(MPI_COMM_WORLD); 44 | } 45 | 46 | MPI_Allreduce(&numel, &totel, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); 47 | double * b = (double*)malloc(sizeof(double) * totel); 48 | assert(b != NULL); 49 | 50 | MPI_Allgatherv(a, numel, MPI_DOUBLE, 
b, counts_recv, displacements, MPI_DOUBLE, MPI_COMM_WORLD); 51 | for (int i = 0; i < nproc; i++) { 52 | if (i == myid) { 53 | fprintf(stdout, "AFTER\tmyid = %d\n", myid ); 54 | for (int n = 0 ; n < totel; n++) 55 | fprintf(stdout, "\tb[%d]=%.1f\n", n, b[n]); 56 | fprintf(stdout, "\n"); 57 | fflush(stdout); 58 | } 59 | MPI_Barrier(MPI_COMM_WORLD); 60 | } 61 | free(a); 62 | free(b); 63 | MPI_Finalize(); 64 | } 65 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/collective-mpi/b_cast.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | int main(int argc, char** argv) { 7 | 8 | int num_elements = 2; 9 | int myid, nproc, root; 10 | MPI_Init(NULL, NULL); 11 | MPI_Comm_size(MPI_COMM_WORLD, &nproc); 12 | MPI_Comm_rank(MPI_COMM_WORLD, &myid); 13 | double * a = (double*)malloc(sizeof(double) * num_elements); 14 | assert(a != NULL); 15 | for (int i=0; i < num_elements; i++) { 16 | a[i]=0.; 17 | } 18 | root = 0; 19 | if (myid == root) { 20 | for (int i = 0 ; i < num_elements; i++) 21 | a[i] = 2. 
C     Broadcast demo: the root rank fills a 2-element REAL array and
C     MPI_BCAST replicates it on every rank.
      PROGRAM broad_cast
      INCLUDE 'mpif.h'
      INTEGER ierr, myid, nproc, root
      INTEGER status(MPI_STATUS_SIZE)
      REAL A(2)
      CALL MPI_INIT(ierr)
      CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr)
      CALL MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr)
      root = 0
      a(1) = 0.0
      A(2) = 0.0
      IF( myid .EQ. 0 ) THEN
         a(1) = 2.0
         a(2) = 4.0
      END IF
      WRITE(6,*) myid, ' before: a(1)=', a(1), 'a(2)=', a(2)
C     BUG FIX: the Fortran binding of MPI_BARRIER takes the communicator
C     and an ierror argument; calling it with an empty argument list is
C     invalid and crashes or misbehaves with most MPI implementations.
      CALL MPI_BARRIER(MPI_COMM_WORLD, ierr)
      CALL MPI_BCAST(a, 2, MPI_REAL, 0, MPI_COMM_WORLD, ierr)
      WRITE(6,*) myid, ' after : a(1)=', a(1), 'a(2)=', a(2)
      CALL MPI_FINALIZE(ierr)
      END
if(num_elements < gat_elements && myid == root) { 23 | printf("This application is meant to be run with no more than %d MPI processes.\n", num_elements/nsnd); 24 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); 25 | } 26 | for (int i = 0; i < nsnd; i++) 27 | b[i] = myid; 28 | // int MPI_Gather(const void* buffer_send, int count_send, MPI_Datatype datatype_send, 29 | // void* buffer_recv, int count_recv, MPI_Datatype datatype_recv, 30 | // int root, MPI_Comm communicator); 31 | MPI_Gather(b, nsnd, MPI_DOUBLE, a, nsnd, MPI_DOUBLE, root, MPI_COMM_WORLD); 32 | if (myid==root) { 33 | fprintf(stdout, "myid=%d:\n", myid); 34 | for (int i = 0; i < gat_elements; i++) 35 | fprintf(stdout, "\ta[%d]=%.2f\n", i, a[i]); 36 | fprintf(stdout, "\n"); 37 | for (int i = gat_elements; i < num_elements; i++) 38 | fprintf(stdout, "\t\ta[%d]=%.2f\n", i, a[i]); 39 | } 40 | free(a); 41 | free(b); 42 | MPI_Finalize(); 43 | } 44 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/collective-mpi/gather.f: -------------------------------------------------------------------------------- 1 | PROGRAM gather 2 | INCLUDE 'mpif.h' 3 | INTEGER ierr, myid, nproc, nsnd, I, root 4 | INTEGER status(MPI_STATUS_SIZE) 5 | REAL A(16), B(2) 6 | CALL MPI_INIT(ierr) 7 | CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr) 8 | CALL MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr) 9 | root = 0 10 | b(1) = REAL( myid ) 11 | b(2) = REAL( myid ) 12 | nsnd = 2 13 | CALL MPI_GATHER(b, nsnd, MPI_REAL, a, nsnd, 14 | & MPI_REAL, root, MPI_COMM_WORLD, ierr) 15 | IF( myid .eq. 
root ) THEN 16 | DO i = 1, (nsnd*nproc) 17 | WRITE(6,*) myid, ': a(i)=', a(i) 18 | END DO 19 | END IF 20 | CALL MPI_FINALIZE(ierr) 21 | END 22 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/collective-mpi/mpi_bcastcompare.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | void my_bcast(void* data, int count, MPI_Datatype datatype, int root, 7 | MPI_Comm communicator) { 8 | int world_rank; 9 | MPI_Comm_rank(communicator, &world_rank); 10 | int world_size; 11 | MPI_Comm_size(communicator, &world_size); 12 | 13 | if (world_rank == root) { 14 | // If we are the root process, send our data to everyone 15 | int i; 16 | for (i = 0; i < world_size; i++) { 17 | if (i != world_rank) { 18 | MPI_Send(data, count, datatype, i, 0, communicator); 19 | } 20 | } 21 | } else { 22 | // If we are a receiver process, receive the data from the root 23 | MPI_Recv(data, count, datatype, root, 0, communicator, MPI_STATUS_IGNORE); 24 | } 25 | } 26 | 27 | int main(int argc, char** argv) { 28 | 29 | int num_elements = 1000; 30 | int num_trials = 10; 31 | 32 | MPI_Init(NULL, NULL); 33 | 34 | int world_rank; 35 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 36 | 37 | double total_my_bcast_time = 0.0; 38 | double total_mpi_bcast_time = 0.0; 39 | int i; 40 | int* data = (int*)malloc(sizeof(int) * num_elements); 41 | assert(data != NULL); 42 | 43 | for (i = 0; i < num_trials; i++) { 44 | // Time my_bcast 45 | // Synchronize before starting timing 46 | MPI_Barrier(MPI_COMM_WORLD); 47 | total_my_bcast_time -= MPI_Wtime(); 48 | my_bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD); 49 | // Synchronize again before obtaining final time 50 | MPI_Barrier(MPI_COMM_WORLD); 51 | total_my_bcast_time += MPI_Wtime(); 52 | 53 | // Time MPI_Bcast 54 | MPI_Barrier(MPI_COMM_WORLD); 55 | total_mpi_bcast_time -= MPI_Wtime(); 56 | MPI_Bcast(data, num_elements, 
MPI_INT, 0, MPI_COMM_WORLD); 57 | MPI_Barrier(MPI_COMM_WORLD); 58 | total_mpi_bcast_time += MPI_Wtime(); 59 | } 60 | 61 | // Print off timing information 62 | if (world_rank == 0) { 63 | printf("Data size = %d, Trials = %d\n", num_elements * (int)sizeof(int), num_trials); 64 | printf("Avg my_bcast time = %lf\n", total_my_bcast_time / num_trials); 65 | printf("Avg MPI_Bcast time = %lf\n", total_mpi_bcast_time / num_trials); 66 | } 67 | 68 | free(data); 69 | MPI_Finalize(); 70 | } 71 | 72 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/collective-mpi/reduce.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(int argc, char** argv) { 5 | 6 | int num_elements = 2; 7 | int myid, nproc, root; 8 | double a[num_elements], b[num_elements]; 9 | for (int i = 0; i < num_elements; i++) 10 | a[i] = 2.0 * (1+i); 11 | root=0; 12 | MPI_Init(NULL, NULL); 13 | int world_rank; 14 | MPI_Comm_size(MPI_COMM_WORLD, &nproc); 15 | MPI_Comm_rank(MPI_COMM_WORLD, &myid); 16 | //int MPI_Reduce(const void* send_buffer, void* receive_buffer, int count, 17 | // MPI_Datatype datatype, MPI_Op operation, int root, MPI_Comm communicator); 18 | MPI_Reduce(a, b, num_elements, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD); 19 | if (myid == 0) { 20 | fprintf(stdout,"myid=%d:\n", myid); 21 | for (int i = 0; i < num_elements; i++) 22 | fprintf(stdout,"\tb[%d]=%.2f\n", i, b[i]); 23 | fprintf(stdout,"\n"); 24 | } 25 | MPI_Finalize(); 26 | } 27 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/collective-mpi/reduce.f: -------------------------------------------------------------------------------- 1 | PROGRAM reduce 2 | INCLUDE 'mpif.h' 3 | INTEGER ierr, myid, nproc, root 4 | INTEGER status(MPI_STATUS_SIZE) 5 | REAL A(2), res(2) 6 | CALL MPI_INIT(ierr) 7 | CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierr) 8 | CALL 
MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr) 9 | root = 0 10 | a(1) = 2.0 11 | a(2) = 4.0 12 | CALL MPI_REDUCE(a, res, 2, MPI_REAL, MPI_SUM, root, 13 | & MPI_COMM_WORLD, ierr) 14 | IF( myid .EQ. 0 ) THEN 15 | WRITE(6,*) myid, ': res(1)=', res(1), 'res(2)=', res(2) 16 | END IF 17 | CALL MPI_FINALIZE(ierr) 18 | END 19 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/collective-mpi/scatter.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | 8 | 9 | int main(int argc, char** argv) { 10 | 11 | int myid, nproc, root; 12 | int num_elements = 8; 13 | int nsnd = 2; 14 | double a[num_elements]; 15 | double *b; 16 | b = (double*)malloc(sizeof(double) * nsnd); 17 | assert(b != NULL); 18 | MPI_Init(NULL, NULL); 19 | MPI_Comm_size(MPI_COMM_WORLD, &nproc); 20 | MPI_Comm_rank(MPI_COMM_WORLD, &myid); 21 | root=0; 22 | if(nproc * nsnd != num_elements && myid == root) { 23 | printf("This application is meant to be run with %d MPI processes.\n", num_elements/nsnd); 24 | MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); 25 | } 26 | if (myid == root) { 27 | for (int i = 0; i < num_elements; i++) 28 | a[i] = i+1; 29 | } 30 | // int MPI_Scatter(const void* buffer_send, int count_send, MPI_Datatype datatype_send, 31 | // void* buffer_recv, int count_recv, MPI_Datatype datatype_recv, 32 | // int root, MPI_Comm communicator); 33 | 34 | MPI_Scatter(a, nsnd, MPI_DOUBLE, b, nsnd, MPI_DOUBLE, root, MPI_COMM_WORLD); 35 | fprintf(stdout, "myid=%d:\tb[0]=%.2f,\tb[1]=%.2f\n",myid, b[0], b[1] ); 36 | free(b); 37 | MPI_Finalize(); 38 | } 39 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/MPI/collective-mpi/scatter.f: -------------------------------------------------------------------------------- 1 | PROGRAM scatter 2 | INCLUDE 'mpif.h' 3 | INTEGER ierr, myid, nproc, nsnd, I, root 4 | INTEGER 
#!/bin/bash
# Strong- and weak-scaling study of the Monte Carlo pi estimator.
#SBATCH --job-name=scaling           # Job name
#SBATCH --nodes=1
#SBATCH --ntasks=128                 # Run a single task
#SBATCH --time=01:20:00              # Time limit hrs:min:sec
#SBATCH -p EPYC
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --exclusive

module purge
module load openMPI/4.1.5/gnu

PI="../basic-mpi-codes/mpi_pi"
mpicc -O3 ${PI}.c -o mpi_pi.x

element="socket"
N=1000000000

# Pull the average walltime out of the program's report.
# FIX: the original pattern  grep "\[*\]"  means "zero or more '[' followed
# by ']'" in a basic regex and therefore matches ANY line containing ']';
# it selected the right line only by accident.  -F matches the literal
# string "[*]" printed on the "[*] Average Walltime:" line.
extract_avg() {
    grep -F "[*]" | awk 'BEGIN {FS=":"}; {print $2}'
}

# strong scaling: fixed total work N, growing task count
# (the i=0 step of the eval sequence stands for a single task)
echo "tasks, N, avg_walltime" > pi_strong.csv
for i in $(eval echo {0..$SLURM_NTASKS..8});
do
    np=$i
    [ "$i" -eq "0" ] && np=1
    echo -n "$np, $N," >> pi_strong.csv
    mpirun --map-by ${element} -np $np ./mpi_pi.x $N | extract_avg >> pi_strong.csv
done

# weak scaling: work per task fixed, total work M grows with the task count
echo "tasks, N, avg_walltime" > pi_weak.csv
for i in $(eval echo {0..$SLURM_NTASKS..8});
do
    if [ "$i" -eq "0" ]
    then
        np=1
        M=$N
    else
        np=$i
        M=$((${N}*${i}))
    fi
    echo -n "$np, $M," >> pi_weak.csv
    mpirun --map-by ${element} -np $np ./mpi_pi.x $M | extract_avg >> pi_weak.csv
done
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/01--Intro_to_OpenMP.pdf -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/02--parallel_regions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/02--parallel_regions.pdf -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/03--loops.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/03--loops.pdf -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/04--threads_affinity.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/04--threads_affinity.pdf -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/.#for.c: -------------------------------------------------------------------------------- 1 | luca@ggg.2121:1698304345 -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_loops/loop_without_for.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ 
This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at │ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. │ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. │ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | 28 | #if defined(__STDC__) 29 | # if (__STDC_VERSION__ >= 199901L) 30 | # define _XOPEN_SOURCE 700 31 | # endif 32 | #endif 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | 39 | #define N_default 1000 // how long is the main array 40 | 41 | int main( int argc, char **argv ) 42 | { 43 | 44 | int N = N_default; 45 | int nthreads = 1; 46 | 47 | // check whether some arg has been passed on 48 | if ( argc > 1 ) 49 | { 50 | N = atoi( *(argv+1) ); 51 | if ( argc > 2 ) 52 | nthreads = atoi( *(argv+2) ); 53 | } 54 | 55 | if( nthreads > 1 ) 56 | omp_set_num_threads(nthreads); 57 | #pragma omp parallel 58 | { 59 | int me = omp_get_thread_num(); 60 | int nthreads = omp_get_num_threads(); 61 | 62 | int chunk = N / nthreads; 63 | int mod = N % nthreads; 64 | int my_first = chunk*me + ((me < mod)?me:mod); 65 | int my_chunk = chunk + (mod > 0)*(me < mod); 66 | 67 | #pragma omp single 68 | 
printf("nthreads: %d, N: %d --- chunk is %d, reminder is %d\n", nthreads, N, chunk, mod); 69 | 70 | printf("thread %d : from %d to %d\n", me, my_first, my_first+my_chunk); 71 | 72 | /* 73 | * here you could then insert a for loop 74 | * int my_stop = my_first + my_chunk; 75 | * for( int i = myfirst; i < my_stop; i++ ) 76 | * ... 77 | */ 78 | } 79 | 80 | 81 | return 0; 82 | } 83 | 84 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_loops/pi_openmp.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define DEFAULT 1000000 10 | #define SEED 918273 11 | 12 | int main ( int argc, char **argv) 13 | { 14 | 15 | long long int M=0; 16 | int nthreads; 17 | double pi; 18 | 19 | 20 | 21 | #pragma omp parallel 22 | #pragma omp master 23 | nthreads = omp_get_num_threads(); 24 | 25 | long long int N = (argc > 1 ? 
atoll(argv[1]) : DEFAULT ) ; 26 | printf("omp calculation with %d threads\nN=%Ld\n", 27 | nthreads ,N); 28 | 29 | double timing = omp_get_wtime(); 30 | #pragma omp parallel 31 | { 32 | int myid = omp_get_thread_num(); 33 | double x, y ; 34 | srand48(SEED*(myid+1)); 35 | 36 | #pragma omp for reduction(+:M) 37 | for( long long unsigned i = 0; i < N; i++) 38 | { 39 | x = drand48(); 40 | y = drand48(); 41 | M += ((x*x + y*y) < 1.0); 42 | } 43 | } 44 | 45 | timing = omp_get_wtime() - timing; 46 | printf("Estimation of pi: %1.9f\n Walltime:%g\n", 47 | (4.0*(double)M)/N, timing ); 48 | 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_loops/pi_openmp.fix.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define DEFAULT 1000000 10 | #define SEED 918273 11 | 12 | int main(int argc,char* argv[]) 13 | { 14 | 15 | long long unsigned int M = 0; 16 | int nthreads; 17 | 18 | #pragma omp parallel 19 | #pragma omp master 20 | nthreads = omp_get_num_threads(); 21 | 22 | long long int N = (argc > 1 ? 
atoll(argv[1]) : DEFAULT ) ; 23 | printf("omp calculation with %d threads\nN=%Ld\n", nthreads ,N); 24 | 25 | double timing = omp_get_wtime(); 26 | #pragma omp parallel 27 | { 28 | int myid = omp_get_thread_num(); 29 | int unsigned short myseeds[3] = {SEED+(myid),SEED+(myid*3+1), SEED+(myid*4+2)}; 30 | 31 | seed48( myseeds ); 32 | 33 | #pragma omp for reduction(+:M) 34 | for( long long unsigned int i = 0; i < N; i++) 35 | { 36 | double x = erand48( myseeds ); 37 | double y = erand48( myseeds ); 38 | 39 | M += ( (x*x + y*y) < 1.0 ); 40 | } 41 | } 42 | 43 | timing = omp_get_wtime() - timing; 44 | 45 | printf("Estimation of pi: %1.9f\n Walltime:%g\n", 46 | (4.0*(double)M)/N, timing ); 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/00_scope_of_variables.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at │ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. │ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. 
│ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | 28 | #if defined(__STDC__) 29 | # if (__STDC_VERSION__ >= 199901L) 30 | # define _XOPEN_SOURCE 700 31 | # endif 32 | #endif 33 | #define _GNU_SOURCE 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | 42 | int main( int argc, char **argv ) 43 | { 44 | int i; 45 | 46 | printf( "\nmain thread (pid: %d, tid: %ld) data:\n" 47 | "&i is @ address : %p\n\n", 48 | (int)getpid(), syscall(SYS_gettid), &i); 49 | 50 | // just try who is the private i for each thread 51 | #pragma omp parallel private(i) 52 | { 53 | int me = omp_get_thread_num(); 54 | 55 | printf( "\tthread nr %d ( tid %ld, from pid %d ) :\n" 56 | "\t\tmy i address is %p\n", 57 | me, syscall(SYS_gettid), (int)getpid(), &i ); 58 | } 59 | 60 | printf( "\n" ); 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/01_simple_pr_wrong.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at │ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. 
│ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. │ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | #if defined(__STDC__) 28 | # if (__STDC_VERSION__ >= 199901L) 29 | # define _XOPEN_SOURCE 700 30 | # endif 31 | #endif 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | 39 | int main( int argc, char **argv ) 40 | { 41 | 42 | int nthreads; 43 | int my_thread_id; 44 | 45 | #if defined(_OPENMP) 46 | 47 | #pragma omp parallel // this creates a parallel region 48 | // that is encompassed by the 49 | // opening and closing { } 50 | // 51 | // you can modify the number of 52 | // spawned threads through the 53 | // OMP_THREAD_NUM 54 | // environmental variable 55 | 56 | { 57 | 58 | my_thread_id = omp_get_thread_num(); // note: this assignment is not thread-safe 59 | sleep(0.05); 60 | #pragma omp master 61 | nthreads = omp_get_num_threads(); 62 | 63 | // the order in which different threads will 64 | // arrive at this print is undefined; 65 | // if you run this code several times, you will 66 | // obtain different results 67 | 68 | printf( "\tgreetings from thread num %d\n", my_thread_id); 69 | } 70 | #else 71 | 72 | nthreads = 1; 73 | #endif 74 | 75 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" 
has":"s have", (nthreads==1)?"(non)":"" ); 76 | 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/02_simple_pr.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at │ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. │ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. │ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. 
If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | #if defined(__STDC__) 28 | # if (__STDC_VERSION__ >= 199901L) 29 | # define _XOPEN_SOURCE 700 30 | # endif 31 | #endif 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | 38 | int main( int argc, char **argv ) 39 | { 40 | 41 | int nthreads; 42 | 43 | #if defined(_OPENMP) 44 | 45 | #pragma omp parallel // this creates a parallel region 46 | // that is encompassed by the 47 | // opening and closing { } 48 | // 49 | // you can modify the number of 50 | // spawned threads through the 51 | // OMP_THREAD_NUM 52 | // environmental variable 53 | 54 | { 55 | 56 | int my_thread_id = omp_get_thread_num(); // note: this assignment is now 57 | // thread-safe because the lvalue 58 | // is a private variable 59 | #pragma omp master 60 | nthreads = omp_get_num_threads(); 61 | 62 | // the order in which different threads will 63 | // arrive at this print is undefined; 64 | // if you run this code several times, you will 65 | // obtain different results 66 | 67 | printf( "\tgreetings from thread num %d\n", my_thread_id); 68 | } 69 | #else 70 | 71 | nthreads = 1; 72 | printf( "\tgreetings from thread num 0\n"); 73 | #endif 74 | 75 | printf(" %d thread%s greeted you from the %sparallel region\n", 76 | nthreads, (nthreads==1)?" 
has":"s have", (nthreads==1)?"(non)":"" ); 77 | 78 | return 0; 79 | } 80 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/03a_num_of_threads.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at │ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. │ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. │ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. 
If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | 28 | #if defined(__STDC__) 29 | # if (__STDC_VERSION__ >= 199901L) 30 | # define _XOPEN_SOURCE 700 31 | # endif 32 | #endif 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | 39 | int main( int argc, char **argv ) 40 | { 41 | 42 | int nthreads; 43 | 44 | #if defined(_OPENMP) 45 | 46 | int threads_num = 1; 47 | 48 | if ( argc > 1 ) 49 | { 50 | // read the argument given 51 | threads_num = atoi(*(argv+1)); 52 | omp_set_num_threads( threads_num ); 53 | } 54 | 55 | #pragma omp parallel // this creates a parallel region 56 | // that is encompassed by the 57 | // opening and closing { } 58 | // 59 | // you can modify the number of 60 | // spawned threads in different 61 | // ways: 62 | // 1) through the OMP_THREAD_NUM 63 | // environmental variable 64 | // 2) using the omp_set_num_threads() 65 | // 66 | // you can also declare the desired 67 | // number at the creation of the 68 | // parallel region: 69 | 70 | //#pragma omp parallel num_threads( threads_num ) 71 | 72 | { 73 | 74 | int my_thread_id = omp_get_thread_num(); 75 | #pragma omp master 76 | nthreads = omp_get_num_threads(); 77 | 78 | // the order in which different threads will 79 | // arrive at this print is undefined; 80 | // if you run this code several times, you will 81 | // obtain different results 82 | 83 | printf( "\tgreetings from thread num %d\n", my_thread_id ); 84 | } 85 | 86 | #else 87 | 88 | nthreads = 1; 89 | 90 | #endif 91 | 92 | printf(" %d thread%s greeted you from the %sparallel region\n", 93 | nthreads, (nthreads==1)?" 
has":"s have", (nthreads==1)?"(non)":"" ); 94 | 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/04_order_of_threads_wrong.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at │ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. │ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. │ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. 
If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | #if defined(__STDC__) 28 | # if (__STDC_VERSION__ >= 199901L) 29 | # define _XOPEN_SOURCE 700 30 | # endif 31 | #endif 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | 38 | int main( int argc, char **argv ) 39 | { 40 | 41 | int nthreads; 42 | 43 | #if defined(_OPENMP) 44 | 45 | int order = 0; 46 | 47 | #pragma omp parallel // this creates a parallel region 48 | // that is encompassed by the 49 | // opening and closing { } 50 | // 51 | // you can modify the number of 52 | // spawned threads through the 53 | // OMP_THREAD_NUM 54 | // environmental variable 55 | 56 | { 57 | 58 | int my_thread_id = omp_get_thread_num(); 59 | #pragma omp master 60 | nthreads = omp_get_num_threads(); 61 | 62 | // now we impose an ordered output 63 | // although not ina very efficient way 64 | 65 | // the "critical" directive identifies a 66 | // section that must be executed by a 67 | // single thread at a time. 68 | // Here, un unspecified number of threads 69 | // will print the message. 70 | // That is just due to this particular 71 | // case: in fact, ALL the threads will 72 | // execute the if test. However, which are 73 | // those that succeed, print and modify the 74 | // "order" value depends on which have been 75 | // the previous ones, and on the relative delay. 76 | #pragma omp critical 77 | if ( order == my_thread_id ) 78 | { 79 | printf( "\tgreetings from thread num %d\n", my_thread_id ); 80 | order++; 81 | } 82 | } 83 | #else 84 | 85 | nthreads = 1; 86 | #endif 87 | 88 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" 
has":"s have", (nthreads==1)?"(non)":"" ); 89 | 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/05b_order_of_threads.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at │ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. │ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. │ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. 
If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | 28 | #if defined(__STDC__) 29 | # if (__STDC_VERSION__ >= 199901L) 30 | # define _XOPEN_SOURCE 700 31 | # endif 32 | #endif 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | 39 | int main( int argc, char **argv ) 40 | { 41 | 42 | int nthreads; 43 | 44 | #if defined(_OPENMP) 45 | 46 | #pragma omp parallel 47 | { 48 | 49 | int my_thread_id = omp_get_thread_num(); 50 | #pragma omp master 51 | nthreads = omp_get_num_threads(); 52 | #pragma omp barrier // let all the threads to read 53 | // the correct value of nthreads 54 | 55 | #pragma omp for ordered // declare a for within which there 56 | for ( int i = 0; i < nthreads; i++) // are ordered regions 57 | #pragma omp ordered // declare the ordered region 58 | printf( "\tgreetings from thread num %d\n", my_thread_id ); 59 | 60 | } 61 | #else 62 | 63 | nthreads = 1; 64 | #endif 65 | 66 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" 
has":"s have", (nthreads==1)?"(non)":"" ); 67 | 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/05c_order_of_threads.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at │ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. │ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. │ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. 
If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | 28 | #if defined(__STDC__) 29 | # if (__STDC_VERSION__ >= 199901L) 30 | # define _XOPEN_SOURCE 700 31 | # endif 32 | #endif 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | void do_something( int who_am_I ) 39 | { 40 | #pragma omp ordered 41 | printf( "\tgreetings from thread num %d\n", who_am_I ); 42 | } 43 | 44 | 45 | int main( int argc, char **argv ) 46 | { 47 | 48 | int nthreads; 49 | 50 | #if defined(_OPENMP) 51 | 52 | #pragma omp parallel 53 | { 54 | 55 | int my_thread_id = omp_get_thread_num(); 56 | #pragma omp master 57 | nthreads = omp_get_num_threads(); 58 | #pragma omp barrier // let all the threads to read 59 | // the correct value of nthreads 60 | 61 | #pragma omp for ordered // declare a for within which there 62 | for ( int i = 0; i < nthreads; i++) // are ordered regions 63 | do_something( my_thread_id ); 64 | 65 | 66 | } 67 | #else 68 | 69 | nthreads = 1; 70 | #endif 71 | 72 | printf(" %d thread%s greeted you from the %sparallel region\n", nthreads, (nthreads==1)?" 
has":"s have", (nthreads==1)?"(non)":"" ); 73 | 74 | return 0; 75 | } 76 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/09_clauses__copyin__clarify.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at │ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. │ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. │ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. 
If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | 28 | #if defined(__STDC__) 29 | # if (__STDC_VERSION__ >= 199901L) 30 | # define _XOPEN_SOURCE 700 31 | # endif 32 | #endif 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | 40 | double golden_value = 0; 41 | #pragma omp threadprivate( golden_value ) 42 | 43 | 44 | int main( int argc, char **argv ) 45 | { 46 | srand48(time(NULL)); 47 | int N = 10; 48 | 49 | #pragma omp parallel copyin(golden_value) 50 | // the copying of thread 0's golden_value 51 | // happens here, at the entering of the 52 | // parallel region; 53 | // 54 | { 55 | 56 | #pragma omp master 57 | golden_value = 1.618033988; // we do not expect 58 | // this value to be 59 | // broadcasted 60 | 61 | #pragma omp barrier 62 | 63 | printf("[PR 1] thread %d has a golden value %g\n", 64 | omp_get_thread_num(), golden_value ); 65 | } 66 | 67 | 68 | #pragma omp parallel copyin(golden_value) 69 | // here the master's value is copied again; 70 | // since it was modified in the previous 71 | // PR, we do expect that now everybody 72 | // will have the new value 73 | // 74 | printf("[PR 2] thread %d has a golden value %g\n", 75 | omp_get_thread_num(), golden_value ); 76 | 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/09_clauses__firstprivate.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at │ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free 
software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. │ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. │ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | 28 | #if defined(__STDC__) 29 | # if (__STDC_VERSION__ >= 199901L) 30 | # define _XOPEN_SOURCE 700 31 | # endif 32 | #endif 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #define DEFAULT 10 39 | 40 | int main( int argc, char **argv ) 41 | { 42 | 43 | int i = (argc > 1 ? atoi(*(argv+1)) : DEFAULT ); 44 | int nthreads; 45 | int *array; 46 | 47 | #pragma omp parallel 48 | #pragma omp master 49 | nthreads = omp_get_num_threads(); 50 | 51 | array = (int*)calloc( nthreads, sizeof(int) ); 52 | 53 | #pragma omp parallel firstprivate( i, array ) 54 | { 55 | int me = omp_get_thread_num(); 56 | 57 | // Here we can refer to both i and array. 58 | // Although they are *different* memory region 59 | // than the ones that are hosted in the 60 | // serial region, their value at the entry 61 | // of the parallel region is initialized 62 | // to the value that the corresponding variables 63 | // have in the serial region. 64 | 65 | 66 | array[me] = i + me; // a perfectly valid reference 67 | 68 | array = NULL; // we screw up.. 
but only in 69 | // this scope because this 70 | // array is _not_ the same 71 | // than that outise the p-region 72 | } 73 | 74 | for( int j = 0; j < nthreads; j++ ) 75 | printf("entry %3d is %3d (expected was %3d)\n", 76 | j, array[j], i + j ); 77 | 78 | free(array); 79 | return 0; 80 | } 81 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples/parallel_regions/09_clauses__threadprivate.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at │ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. │ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. │ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. 
If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | 28 | #if defined(__STDC__) 29 | # if (__STDC_VERSION__ >= 199901L) 30 | # define _XOPEN_SOURCE 700 31 | # endif 32 | #endif 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | 40 | int me, myN; 41 | int *array; 42 | 43 | #pragma omp threadprivate( me, myN, array ) 44 | 45 | 46 | #define DEFAULT 100000 47 | 48 | int main( int argc, char **argv ) 49 | { 50 | int N = ( argc > 1 ? atoi(*(argv+1)) : DEFAULT); 51 | 52 | #pragma omp parallel 53 | { 54 | me = omp_get_thread_num(); 55 | 56 | int nthreads = omp_get_num_threads(); 57 | 58 | // note that we did not declare neither 59 | // myN nor array nor me in this scope 60 | // 61 | myN = (N / nthreads) + (me < N%nthreads); 62 | array = (int*)calloc( myN, sizeof(int) ); 63 | 64 | printf("+ thread %d has got %d elements; local array " 65 | "(address stored in %p) starts at %p\n", 66 | me, myN, &array, array ); 67 | 68 | // write something in the array 69 | // 70 | 71 | int max = ( myN > 3 ? 3 : myN ); 72 | for( int j = 0; j < max; j++ ) 73 | array[j] = me*1000 + j; 74 | } 75 | 76 | 77 | printf("\nnow we are again in a serial region\n\n"); 78 | 79 | 80 | #pragma omp parallel 81 | { 82 | char buffer[200]; 83 | sprintf( buffer, "* thread %d :: ", me ); 84 | 85 | int max = ( myN > 3 ? 
3 : myN ); 86 | for( int j = 0; j < max; j++ ) 87 | sprintf( &buffer[strlen(buffer)], "[%d] = %4d , ", j, array[j] ); 88 | 89 | printf("%s\n", buffer ); 90 | 91 | // we must free array from within a parallel region 92 | // is we did this in a serial region, only the memory 93 | // associated to the master thread would be freed 94 | // 95 | free(array); 96 | } 97 | 98 | 99 | return 0; 100 | } 101 | 102 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/examples_on_stack/00_explore_how_bytes_are_stored.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main( void ) 6 | { 7 | 8 | unsigned int i = 128; 9 | int size = sizeof(i); 10 | 11 | /* 12 | * i is and integer variable, and as suche it requires 4 bytes 13 | * let's explore how this 4 bytes are placed in memory 14 | */ 15 | 16 | for ( int j = 0; j < size; j++ ) 17 | // 18 | // we loop over the bytes that make up the variable i 19 | // note: to be general, we asked size to be the value 20 | // returned by sizeof() 21 | // 22 | { 23 | // let's print the value of the entire bitfield 24 | // when we interpret it as an integer 25 | printf("i is: %d\n", i ); 26 | 27 | // now we access each byte of i 28 | // 29 | char *byte = (char*)&i; 30 | for( int k = 0; k < size; k++ ) 31 | printf("\t%p : %d\n", byte+k, *(byte+k) ); 32 | 33 | // convince yourself that the previous for loop could have been 34 | // written as follows: 35 | // ( un-comment the next 2 lines to test it 36 | 37 | /* for( int k = 0; k < size; k++ ) */ 38 | /* printf("\t%p : %d\n", (char*)&i+k, *(((char*)&i)+k)); */ 39 | 40 | // why is it so ? 41 | // -- &i is the address of i; more precisely 42 | // it is the address of the begin of i, i.e. 43 | // the address of the furst of the bytes that 44 | // form i. 
45 | // -- (char*)&i means that we interpret the 46 | // address &i as an address to a char 47 | // -- *(char*)&i reads as "the value of the byte 48 | // at the address &i" 49 | // -- (char*)&i+k is k-byte after the byte at 50 | // address &i 51 | 52 | 53 | printf("\n"); 54 | 55 | // now we multiply i by 256. 56 | // the operators << and >> read as "shift the argument's bit on the left [or right] 57 | // by the specified amount of bits " 58 | // In this case the amount of bits is 8, i.e. is is the same than multiplying by 256 59 | // 60 | i <<= 8; 61 | 62 | // we are doing this because we want that only a single bit is set per each byte 63 | // among the i's bytes. 64 | // we started from a value of 1, i.e. only the first bit of the first byte of i was 65 | // set; multiplying by 256 (i.e. bit-shifting by 8 positions) we move that bit 66 | // to he next byte. 67 | } 68 | 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/exercises/.#lab_exercise.2.c: -------------------------------------------------------------------------------- 1 | luca@ggg.26667:1698393520 -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/exercises/exercises.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/PARALLEL_PROGRAMMING/OpenMP/exercises/exercises.pdf -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/exercises/lab_exercise.c: -------------------------------------------------------------------------------- 1 | 2 | /* ────────────────────────────────────────────────────────────────────────── * 3 | │ │ 4 | │ This file is part of the exercises for the Lectures on │ 5 | │ "Foundations of High Performance Computing" │ 6 | │ given at 
│ 7 | │ Master in HPC and │ 8 | │ Master in Data Science and Scientific Computing │ 9 | │ @ SISSA, ICTP and University of Trieste │ 10 | │ │ 11 | │ contact: luca.tornatore@inaf.it │ 12 | │ │ 13 | │ This is free software; you can redistribute it and/or modify │ 14 | │ it under the terms of the GNU General Public License as published by │ 15 | │ the Free Software Foundation; either version 3 of the License, or │ 16 | │ (at your option) any later version. │ 17 | │ This code is distributed in the hope that it will be useful, │ 18 | │ but WITHOUT ANY WARRANTY; without even the implied warranty of │ 19 | │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the │ 20 | │ GNU General Public License for more details. │ 21 | │ │ 22 | │ You should have received a copy of the GNU General Public License │ 23 | │ along with this program. If not, see │ 24 | │ │ 25 | * ────────────────────────────────────────────────────────────────────────── */ 26 | 27 | 28 | #if defined(__STDC__) 29 | # if (__STDC_VERSION__ >= 199901L) 30 | # define _XOPEN_SOURCE 700 31 | # endif 32 | #endif 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | 41 | #define N_DFLT 1000 42 | 43 | 44 | int main ( int argc, char **argv ) 45 | { 46 | 47 | int N = ( (argc > 1) ? atoi(*(argv+1)) : N_DFLT); 48 | int Nth = ( (argc > 2) ? atoi(*(argv+2)) : 0); 49 | 50 | unsigned int *array = (int*)malloc( sizeof(int) * N ); 51 | 52 | if ( Nth > 0 ) 53 | omp_set_num_threads = Nth; 54 | 55 | #pragma omp parallel 56 | { 57 | int myid = omp_get_thread_num(); 58 | int nthreads = omp_get_num_threads(); 59 | 60 | for ( unsigned int i = 0; i < N; i++ ) 61 | array[i] = i*i; 62 | 63 | } 64 | 65 | // 66 | // check the results 67 | // can you parallelize this as well ? 
68 | // 69 | 70 | unsigned int faults = 0; 71 | for ( unsigned int i = 0; i < N; i++ ) 72 | faults += ( array[i] != i*i ); 73 | 74 | if ( faults > 0 ) 75 | printf("wow, you've been able to get %u faults\n", 76 | faults ); 77 | 78 | return 0; 79 | } 80 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/exercises/prefix_sum.serial.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | #if defined(__STDC__) 4 | # if (__STDC_VERSION__ >= 199901L) 5 | # define _XOPEN_SOURCE 700 6 | # endif 7 | #endif 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "prefix_sum.serial.h" 14 | 15 | 16 | 17 | inline double scan( const uint N, DTYPE * restrict array ) 18 | { 19 | 20 | DTYPE avg = array[0]; 21 | 22 | for ( uint ii = 1; ii < N; ii++ ) 23 | { 24 | avg += array[ii]; 25 | array[ii] = avg; 26 | } 27 | 28 | return avg; 29 | } 30 | 31 | 32 | inline DTYPE scan_efficient( const uint N, DTYPE * restrict array ) 33 | { 34 | 35 | uint N_4 = (N/4)*4; 36 | 37 | { 38 | DTYPE temp = array[2]; 39 | array[1] += array[0]; 40 | array[3] += temp; 41 | array[2] += array[1]; 42 | array[3] += array[1]; 43 | } 44 | 45 | PRAGMA_VECT_LOOP 46 | for ( uint ii = 4; ii < N_4; ii+=4 ) 47 | { 48 | DTYPE register temp = array[ii+2]; 49 | array[ii] += array[ii-1]; 50 | array[ii+1] += array[ii]; 51 | array[ii+3] += temp; 52 | array[ii+2] += array[ii+1]; 53 | array[ii+3] += array[ii+1]; 54 | } 55 | 56 | for ( uint ii = N_4; ii < N; ii++ ) 57 | array[ii] += array[ii-1]; 58 | 59 | return array[N-1]; 60 | } 61 | 62 | 63 | #define N_default 1000 64 | #define _scan 0 65 | #define _scan_e 1 66 | 67 | int main ( int argc, char **argv ) 68 | { 69 | 70 | struct timespec ts; 71 | int Nth_level1 = 1; 72 | int Nth_level2 = 0; 73 | 74 | // ------------------------------------------------------------- 75 | // variables' initialization to default values 76 | // 77 | 78 | uint N = N_default; 79 
| int scan_type = _scan; 80 | 81 | 82 | if ( argc > 1 ) 83 | { 84 | scan_type = atoi( *(argv+1) ); 85 | if ( argc > 2 ) 86 | N = (unsigned)atoi( *(argv+2) ); 87 | } 88 | 89 | printf( "scan type: %d\n", scan_type ); 90 | 91 | 92 | // ------------------------------------------------------------- 93 | // data init. 94 | 95 | double timing_start; 96 | double timing_scan; 97 | double timing_prepare; 98 | double total_weight; 99 | 100 | uint N_alloc = ((N/4)+1)*4; 101 | // DTYPE *array = (DTYPE*)aligned_alloc( 32, N_alloc * sizeof(DTYPE) ); 102 | DTYPE *array = (DTYPE*)malloc( N_alloc * sizeof(DTYPE) ); 103 | 104 | timing_start = CPU_TIME; 105 | 106 | // initialize with pseudo-random numbers 107 | 108 | /* srand48(time(0)); */ 109 | /* for ( int ii = 0; ii < N; ii++ ) */ 110 | /* topnodes[ii] = base + drand48()*range; */ 111 | 112 | // initialize with the first N integer 113 | // (that makes the results easy to check) 114 | // // 115 | 116 | for ( uint ii = 0; ii < N; ii++ ) 117 | array[ii] = (double)ii; 118 | 119 | timing_prepare = CPU_TIME - timing_start; 120 | 121 | // ................................................ 122 | // SCAN 123 | // ................................................ 
124 | 125 | if ( scan_type == _scan ) 126 | total_weight = scan( N, array ); 127 | 128 | else if (scan_type == _scan_e) 129 | total_weight = scan_efficient( N, array ); 130 | 131 | /* else if (scan_type == _scan_b) */ 132 | /* total_weight = scan_b( N, array ); */ 133 | 134 | timing_scan = CPU_TIME - timing_start; 135 | 136 | printf("timing for scan is %g, timing for prepare is %g [total weight: %g]\n", 137 | timing_scan, timing_prepare, total_weight); 138 | return 0; 139 | } 140 | -------------------------------------------------------------------------------- /PARALLEL_PROGRAMMING/OpenMP/exercises/prefix_sum.serial.h: -------------------------------------------------------------------------------- 1 | 2 | // ───────────────────────────────────────────────────────────────── 3 | // define the datatype 4 | // 5 | #if !defined(DTYPE) 6 | #define DTYPE double 7 | #endif 8 | 9 | typedef unsigned int uint; 10 | 11 | 12 | // ───────────────────────────────────────────────────────────────── 13 | // define the timing routines 14 | // 15 | 16 | #define CPU_TIME ({struct timespec ts; \ 17 | clock_gettime( CLOCK_PROCESS_CPUTIME_ID, &ts ), \ 18 | (double)ts.tv_sec + \ 19 | (double)ts.tv_nsec * 1e-9;}) 20 | 21 | 22 | // ───────────────────────────────────────────────────────────────── 23 | // define the vector generator 24 | // 25 | 26 | #if defined(__GNUC__) && !defined(__ICC) && !defined(__INTEL_COMPILER) 27 | #define PRAGMA_VECT_LOOP _Pragma("GCC ivdep") 28 | #elif defined(__INTEL_COMPILER) | defined(__ICC) 29 | #define PRAGMA_VECT_LOOP _Pragma("parallel") 30 | #elif defined(__clang__) 31 | #define PRAGMA_VECT_LOOP _Pragma("ivdep") 32 | #else 33 | #define PRAGMA_VECT_LOOP 34 | #endif 35 | 36 | 37 | // ───────────────────────────────────────────────────────────────── 38 | // prototypes 39 | // 40 | 41 | double scan ( const uint, DTYPE * restrict ); 42 | double scan_efficient ( const uint, DTYPE * restrict ); 43 | 44 | 
-------------------------------------------------------------------------------- /intro_to_course.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/intro_to_course.pdf -------------------------------------------------------------------------------- /lecture01-intro-toHPC.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foundations-of-HPC/High-Performance-Computing-2023/eeadb268737d330e9f7199ac07fd6477d39d007e/lecture01-intro-toHPC.pdf --------------------------------------------------------------------------------