├── .gitignore ├── README.md ├── Linaro-Forge ├── correctness │ ├── core-files │ │ ├── gdb-dump-corefile.cmd │ │ ├── core.makefile │ │ ├── div-by-zero.c │ │ └── common.makefile │ ├── debug │ │ ├── Makefile │ │ ├── deadlock.c │ │ ├── memory_debugging.c │ │ ├── simple.c │ │ └── split.c │ └── gpu-nvidia-mmult │ │ ├── LICENSE │ │ ├── common │ │ ├── helper_functions.h │ │ ├── exception.h │ │ └── helper_string.h │ │ ├── README.md │ │ ├── Makefile │ │ └── matrixMul.cu ├── performance │ ├── mmult_py.makefile │ ├── mmultlib.c │ ├── mmultlib.f90 │ ├── mmult.py │ └── common.makefile ├── README.md └── scripts │ └── submit-job.sh ├── .gitmodules ├── Sanitzers ├── AddressSanitizer │ ├── use-after-free.c │ ├── example_UseAfterFree.cc │ ├── illegalmemoryaccess.cpp │ └── README.md ├── LeakSanitizer │ ├── memory-leak.c │ └── README.md ├── MemorySanitizer │ ├── umr.cc │ ├── umr2.cc │ └── README.md ├── ThreadSanitizer │ ├── buggyreduction_omp.c │ ├── tiny_race.c │ └── README.md ├── Sanitizers4hpc │ ├── GPU │ │ ├── main.cc │ │ ├── README.md │ │ └── memcheck_demo.cu │ └── CPU │ │ ├── buggyreduction_mpiomp.c │ │ └── README.md └── README.md ├── Valgrind ├── memcheck │ ├── uninitialized.c │ ├── doublefree.c │ ├── manuel1.c │ ├── invalidparams.c │ ├── memoryleak.c │ ├── memoryleak_mpi.c │ ├── overlap.c │ ├── leak-cases.c │ ├── memalign.c │ ├── leak.h │ └── memcheck.h ├── massif │ └── example.c └── dhat │ ├── ad-hoc.c │ ├── dhat.out.1688970 │ ├── dhat.out.2245130 │ └── basic.c ├── TotalView ├── README.md ├── programs │ ├── TVcmd1 │ ├── TVcmd2 │ ├── TVcmd4 │ ├── TVcmd3 │ ├── TVcmd5 │ ├── demoMpi_v2.TVD.v4breakpoints │ ├── Makefile │ └── combined.TVD.v4breakpoints └── src │ ├── array.h │ ├── simple.c │ ├── array.c │ ├── ReplayEngine_demo.cxx │ ├── myClassA.hxx │ ├── myClassB.hxx │ ├── myClassA.cxx │ ├── simple_threaded.c │ ├── myClassB.cxx │ ├── TVscript_demo.c │ ├── demoMpi_v2.C │ └── main.cxx ├── gdb4hpc └── README.md ├── CUDA └── CUDA-GDB │ └── README.md └── fortran_memory ├── 
free_twice.f90 ├── heap_overflow_underflow.f90 ├── segfault.f90 ├── memory_leaks.f90 └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.[oa] 2 | *.mod 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # debugging 2 | 3 | Debugging example codes 4 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/core-files/gdb-dump-corefile.cmd: -------------------------------------------------------------------------------- 1 | handle SIGFPE stop 2 | set confirm off 3 | run 4 | gcore div-by-zero.core 5 | quit 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "CUDA/compute-sanitizer-samples"] 2 | path = CUDA/compute-sanitizer-samples 3 | url = https://github.com/NVIDIA/compute-sanitizer-samples.git 4 | -------------------------------------------------------------------------------- /Sanitzers/AddressSanitizer/use-after-free.c: -------------------------------------------------------------------------------- 1 | #include 2 | int main() { 3 | char *x = (char*)malloc(10 * sizeof(char*)); 4 | free(x); 5 | return x[5]; 6 | } 7 | -------------------------------------------------------------------------------- /Sanitzers/LeakSanitizer/memory-leak.c: -------------------------------------------------------------------------------- 1 | #include 2 | void *p; 3 | int main() { 4 | p = malloc(7); 5 | p = 0; // The memory is leaked here. 
6 | return 0; 7 | } 8 | -------------------------------------------------------------------------------- /Sanitzers/AddressSanitizer/example_UseAfterFree.cc: -------------------------------------------------------------------------------- 1 | int main(int argc, char **argv) { 2 | int *array = new int[100]; 3 | delete [] array; 4 | return array[argc]; // BOOM 5 | } 6 | -------------------------------------------------------------------------------- /Valgrind/memcheck/uninitialized.c: -------------------------------------------------------------------------------- 1 | /* Taken from Valgrind memcheck manual */ 2 | 3 | #include 4 | 5 | int main() 6 | { 7 | int x; 8 | printf ("x = %d\n", x); 9 | return 0; 10 | } 11 | -------------------------------------------------------------------------------- /Sanitzers/MemorySanitizer/umr.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main(int argc, char** argv) { 4 | int* a = new int[10]; 5 | a[5] = 0; 6 | if (a[argc]) 7 | printf("xx\n"); 8 | return 0; 9 | } 10 | -------------------------------------------------------------------------------- /Sanitzers/MemorySanitizer/umr2.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main(int argc, char** argv) { 4 | int* a = new int[10]; 5 | a[5] = 0; 6 | volatile int b = a[argc]; 7 | if (b) 8 | printf("xx\n"); 9 | return 0; 10 | } 11 | -------------------------------------------------------------------------------- /Valgrind/memcheck/doublefree.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | int main ( void ) 6 | { 7 | int i; 8 | void* p = malloc(177); 9 | for (i = 0; i < 2; i++) 10 | free(p); 11 | return 0; 12 | } 13 | -------------------------------------------------------------------------------- /Valgrind/memcheck/manuel1.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main () 4 | { 5 | int x; 6 | 7 | if (x==0xCAFEBABE) 8 | { 9 | printf ("x = %d\n", 99); 10 | } 11 | else 12 | { 13 | printf ("x = %d\n", 88); 14 | } 15 | 16 | return 0; 17 | } 18 | -------------------------------------------------------------------------------- /TotalView/README.md: -------------------------------------------------------------------------------- 1 | # TotalView training materials 2 | 3 | See 4 | 5 | - `/global/cfs/cdirs/training/2024/TotalView_May2024` 6 | 7 | ## Build 8 | 9 | The source codes are in the `src` directory. 10 | 11 | ``` 12 | $ cd programs 13 | $ make clean 14 | $ make 15 | ``` 16 | -------------------------------------------------------------------------------- /TotalView/programs/TVcmd1: -------------------------------------------------------------------------------- 1 | echo 'rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 -create_actionpoint "TVscript_demo.c#138=print err_detail" ./TVscript_demo' 2 | rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 -create_actionpoint "TVscript_demo.c#138=print err_detail" ./TVscript_demo 3 | -------------------------------------------------------------------------------- /Sanitzers/ThreadSanitizer/buggyreduction_omp.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main (int argc, char **argv) { 4 | int sum = 0; 5 | #pragma omp parallel for shared(sum) 6 | for (int i=0; i<1000; i++) 7 | sum += i; 8 | 9 | printf("sum = %d\n", sum); 10 | return 0; 11 | } 12 | -------------------------------------------------------------------------------- /TotalView/programs/TVcmd2: -------------------------------------------------------------------------------- 1 | echo 'rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 -create_actionpoint "TVscript_demo.c#129=print {mypi*numprocs}" ./TVscript_demo' 2 | rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 
-create_actionpoint "TVscript_demo.c#129=print {mypi*numprocs}" ./TVscript_demo 3 | -------------------------------------------------------------------------------- /TotalView/programs/TVcmd4: -------------------------------------------------------------------------------- 1 | echo 'rm -f *log ; tvscript -mpi "Open MPI" -tasks 3 -event_action "error=>display_backtrace -show_arguments -show_locals" TVscript_demo' 2 | rm -f *log ; tvscript -mpi "Open MPI" -tasks 3 -event_action "error=>display_backtrace -show_arguments -show_locals" TVscript_demo 3 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/debug/Makefile: -------------------------------------------------------------------------------- 1 | # -*- Mode: Makefile; -*- 2 | # 3 | # See COPYRIGHT in top-level directory. 4 | # 5 | 6 | CC=cc 7 | CFLAGS= -O0 -g -Wall -Wno-stringop-overflow 8 | BINS=simple deadlock split memory_debugging 9 | 10 | all: $(BINS) 11 | 12 | clean: 13 | rm -f $(BINS) 14 | -------------------------------------------------------------------------------- /TotalView/programs/TVcmd3: -------------------------------------------------------------------------------- 1 | echo 'rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 -create_actionpoint "TVscript_demo.c#88=display_backtrace 1 -show_locals" ./TVscript_demo' 2 | rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 -create_actionpoint "TVscript_demo.c#88=display_backtrace 1 -show_locals" ./TVscript_demo 3 | -------------------------------------------------------------------------------- /TotalView/programs/TVcmd5: -------------------------------------------------------------------------------- 1 | echo 'rm -f *log ; tvscript -memory_debugging -mpi "Open MPI" -tasks 4 -event_action "termination_notification=list_allocations" ./TVscript_demo' 2 | rm -f *log ; tvscript -memory_debugging -mpi "Open MPI" -tasks 4 -event_action "termination_notification=list_allocations" ./TVscript_demo 3 | 
-------------------------------------------------------------------------------- /Sanitzers/AddressSanitizer/illegalmemoryaccess.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main(int argc, char **argv) { 4 | int *array = new int[100]; 5 | 6 | for (int i = 0; i < 110; ++i) // Access more than allocated memory. 7 | array[i] = i+1; 8 | 9 | delete [] array; 10 | 11 | return 0; 12 | } 13 | -------------------------------------------------------------------------------- /Sanitzers/ThreadSanitizer/tiny_race.c: -------------------------------------------------------------------------------- 1 | #include 2 | int Global; 3 | void *Thread1(void *x) { 4 | Global = 42; 5 | return x; 6 | } 7 | int main() { 8 | pthread_t t; 9 | pthread_create(&t, NULL, Thread1, NULL); 10 | Global = 43; 11 | pthread_join(t, NULL); 12 | return Global; 13 | } 14 | -------------------------------------------------------------------------------- /Valgrind/memcheck/invalidparams.c: -------------------------------------------------------------------------------- 1 | /* Taken from Valgrind memcheck manual */ 2 | 3 | #include 4 | #include 5 | 6 | int main( void ) 7 | { 8 | char* arr = malloc(10); 9 | int* arr2 = malloc(sizeof(int)); 10 | 11 | write( 1 /* stdout */, arr, 10 ); 12 | 13 | exit(arr2[0]); 14 | } 15 | -------------------------------------------------------------------------------- /Valgrind/memcheck/memoryleak.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void f(void) 4 | { 5 | int* x = malloc(10 * sizeof(int)); 6 | x[10] = 0; // problem 1: heap block overrun 7 | } // problem 2: memory leak -- x not freed 8 | 9 | int main(void) 10 | { 11 | f(); 12 | return 0; 13 | } 14 | -------------------------------------------------------------------------------- /gdb4hpc/README.md: -------------------------------------------------------------------------------- 1 | # gdb4hpc 2 | 3 | 
- `HPE_Oct2024` 4 | Materials taken from 5 | `/global/cfs/cdirs/training/2024/HPE_Oct2024/gdb4hpc_lab`, 6 | prepared by HPE for the hands-on session for gdb4hpc in the HPE 7 | Perlmutter Training on User Environment and Profiling/Debugging, 8 | October 14-15, 2024 9 | -------------------------------------------------------------------------------- /CUDA/CUDA-GDB/README.md: -------------------------------------------------------------------------------- 1 | CUDA-GDB example codes in the CUDA-GDB User Manual: 2 | 3 | - `bitreverse.cu` 4 | ``` 5 | $ nvcc -g -G -o bitreverse bitreverse.cu 6 | 7 | $ cuda-gdb ./bitreverse 8 | ``` 9 | - `autostep.cu` 10 | ``` 11 | $ nvcc -g -G -o autostep autostep.cu 12 | 13 | $ cuda-gdb ./autostep 14 | ``` 15 | -------------------------------------------------------------------------------- /Sanitzers/Sanitizers4hpc/GPU/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void launch_memcheck_demo(); 4 | 5 | int main (int argc, char **argv) 6 | { 7 | int rank, nprocs; 8 | MPI_Init (&argc, &argv); 9 | MPI_Comm_rank (MPI_COMM_WORLD, &rank); 10 | MPI_Comm_size (MPI_COMM_WORLD, &nprocs); 11 | 12 | launch_memcheck_demo(); 13 | MPI_Finalize(); 14 | return 0; 15 | } 16 | -------------------------------------------------------------------------------- /Sanitzers/Sanitizers4hpc/CPU/buggyreduction_mpiomp.c: -------------------------------------------------------------------------------- 1 | #include "mpi.h" 2 | #include 3 | 4 | int main (int argc, char **argv) { 5 | int rank; 6 | 7 | MPI_Init(&argc, &argv); 8 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 9 | 10 | int sum = 0; 11 | #pragma omp parallel for shared(sum) 12 | for (int i=0; i<1000; i++) 13 | sum += i; 14 | 15 | printf("%d: sum = %d\n", rank, sum); 16 | 17 | MPI_Finalize(); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /Valgrind/massif/example.c:
-------------------------------------------------------------------------------- 1 | /* From Massif Valgrind manual */ 2 | 3 | #include 4 | 5 | void g(void) 6 | { 7 | malloc(4000); 8 | } 9 | 10 | void f(void) /* no parameters; the 2000-byte allocation is never freed, then g() is called */ 11 | { 12 | malloc(2000); 13 | g(); 14 | } 15 | 16 | int main(void) 17 | { 18 | 19 | int i; 20 | int* a[10]; 21 | 22 | for (i = 0; i < 10; i++) { 23 | a[i] = malloc(1000); 24 | } 25 | 26 | f(); 27 | 28 | g(); 29 | 30 | for (i = 0; i < 10; i++) { 31 | free(a[i]); 32 | } 33 | 34 | return 0; 35 | } 36 | -------------------------------------------------------------------------------- /Valgrind/memcheck/memoryleak_mpi.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void f(void) 5 | { 6 | int* x = malloc(25000 * sizeof(int)); 7 | x[25000] = 0; // problem 1: heap block overrun 8 | } // problem 2: memory leak -- x not freed 9 | 10 | int main(int argc, char **argv) 11 | { 12 | int nproc, me; 13 | MPI_Init(&argc, &argv); 14 | MPI_Comm_size(MPI_COMM_WORLD, &nproc); 15 | MPI_Comm_rank(MPI_COMM_WORLD, &me); 16 | f(); 17 | MPI_Finalize(); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /fortran_memory/free_twice.f90: -------------------------------------------------------------------------------- 1 | program free_twice 2 | !... A buggy code prepared for a debugger tutorial by NERSC 3 | use mpi 4 | integer, parameter :: n = 1024 5 | real, allocatable :: a(:), b(:) 6 | integer i, ierr 7 | call mpi_init(ierr) 8 | allocate (a(n), b(n)) 9 | call random_number(a) 10 | b = cos(a) 11 | deallocate (a) 12 | print *, sum(b) 13 | deallocate (a,b) !
Oops..., deallocating 'a' again 14 | call mpi_finalize(ierr) 15 | end 16 | -------------------------------------------------------------------------------- /TotalView/src/array.h: -------------------------------------------------------------------------------- 1 | #ifndef ARRAY_H 2 | #define ARRAY_H 3 | #include 4 | 5 | // You can't look up the values of preprocessed macros in the debugger 6 | // So, if you want to look at the value of JMAX and IMAX compile w/ 7 | // -DUSE_GLOBALS 8 | #ifdef USE_GLOBALS 9 | int JMAX = 1000 10 | int IMAX = 1000 11 | #else 12 | //kah #define JMAX 1000 13 | #define JMAX 100 14 | #define IMAX 100 15 | #endif 16 | 17 | /*** some global vars **/ 18 | extern double b[]; 19 | 20 | void array(); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/core-files/core.makefile: -------------------------------------------------------------------------------- 1 | include common.makefile 2 | 3 | CFLAGS = -g 4 | GDB_PATH = gdb 5 | 6 | all: div-by-zero 7 | # Create two different core files so that loading multiple ones 8 | # can be tested. 9 | $(GDB_PATH) -x gdb-dump-corefile.cmd --args ./div-by-zero 1 10 | mv div-by-zero.core div-by-zero-1.core 11 | $(GDB_PATH) -x gdb-dump-corefile.cmd --args ./div-by-zero 2 12 | mv div-by-zero.core div-by-zero-2.core 13 | 14 | div-by-zero : div-by-zero.c 15 | $(CC) $(CFLAGS) $< -o $@ 16 | 17 | clean: 18 | $(RM) div-by-zero div-by-zero-1.core div-by-zero-2.core 19 | -------------------------------------------------------------------------------- /Sanitzers/Sanitizers4hpc/CPU/README.md: -------------------------------------------------------------------------------- 1 | # Sanitizers4hpc with CPU codes 2 | 3 | Example code: 4 | 5 | - `buggyreduction_mpiomp.c`: Sanitizers4hpc with ThreadSanitizer 6 | example code 7 | 8 | ## `buggyreduction_mpiomp.c` 9 | 10 | This is a simple MPI code based on `buggyreduction_omp.c`.
11 | 12 | ``` 13 | $ salloc -C cpu -n 2 -c 2 -q shared -t 20 14 | ... 15 | 16 | $ cc -fsanitize=thread -g -O1 -fopenmp buggyreduction_mpiomp.c -o buggyreduction_mpiomp 17 | 18 | $ export OMP_NUM_THREADS=2 19 | 20 | $ module load sanitizers4hpc 21 | 22 | $ sanitizers4hpc -l "-n 2 -c 2" -- ./buggyreduction_mpiomp 23 | ``` 24 | -------------------------------------------------------------------------------- /Linaro-Forge/performance/mmult_py.makefile: -------------------------------------------------------------------------------- 1 | include common.makefile 2 | 3 | # The MPI compiler commands (typically mpicc and mpif90) are autodetected 4 | # by common.makefile. You can override by uncommenting the following: 5 | #MPICC= 6 | #MPIF90= 7 | 8 | CFLAGS = -O2 9 | 10 | targets = libmmult_c.so libmmult_f 11 | 12 | .PHONY: all 13 | all: $(targets) 14 | 15 | libmmult_c.so: mmultlib.c 16 | $(MPICC) -std=c99 -fPIC -shared $(CFLAGS) $^ -o $@ 17 | 18 | .PHONY: libmmult_f 19 | libmmult_f: mmultlib.f90 20 | f2py --opt="$(CFLAGS)" -c $^ -m $@ 21 | 22 | .PHONY: clean 23 | clean: 24 | $(RM) libmmult_c.so libmmult_f*.so res*.mat 25 | 26 | -------------------------------------------------------------------------------- /Valgrind/dhat/ad-hoc.c: -------------------------------------------------------------------------------- 1 | /* #include "dhat/dhat.h" */ 2 | #include "dhat.h" 3 | #include 4 | void g(void) { 5 | DHAT_AD_HOC_EVENT(30); 6 | } 7 | 8 | void f(void) { 9 | g(); 10 | DHAT_AD_HOC_EVENT(20); 11 | g(); 12 | } 13 | 14 | int main(void) { 15 | f(); 16 | DHAT_AD_HOC_EVENT(10); 17 | f(); 18 | 19 | // At one point malloc was broken with --mode=ad-hoc(!), and Valgrind was 20 | // printing messages like "VG_USERREQ__CLIENT_CALL1: func=0x0" when malloc 21 | // was called. So check that it's basically working... 
22 | char* p = malloc(100); 23 | p = realloc(p, 200); 24 | free(p); 25 | 26 | return 0; 27 | } 28 | 29 | -------------------------------------------------------------------------------- /Linaro-Forge/README.md: -------------------------------------------------------------------------------- 1 | # Linaro Forge training materials 2 | 3 | See 4 | 5 | - `/global/cfs/cdirs/training/2025/linaro-forge-training` 6 | - `/global/cfs/cdirs/training/2024/Forge_Mar2024` 7 | 8 | # Build 9 | 10 | ## Debugging 11 | 12 | ``` 13 | $ cd correctness 14 | 15 | $ cd core-files 16 | $ make -f core.makefile clean 17 | $ make -f core.makefile 18 | 19 | $ cd .. 20 | 21 | $ cd debug 22 | $ make clean 23 | $ make 24 | 25 | $ cd .. 26 | 27 | $ cd gpu-nvidia-mmult 28 | $ make clean 29 | $ make 30 | 31 | $ cd ../.. 32 | ``` 33 | 34 | ## Profiling 35 | 36 | ``` 37 | $ cd performance 38 | $ ml python 39 | $ make -f mmult_py.makefile clean 40 | $ make -f mmult_py.makefile 41 | ``` 42 | -------------------------------------------------------------------------------- /TotalView/src/simple.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "array.h" 3 | 4 | int main(int argc, char **argv) 5 | { 6 | /************* command line args ***/ 7 | { 8 | char command_line_string[80]; 9 | if (argc > 1) 10 | { 11 | strcpy(command_line_string, argv[1]); 12 | printf("arg_2=%s\n", command_line_string); 13 | } 14 | } 15 | 16 | array(); 17 | 18 | array(); 19 | 20 | { 21 | 22 | char input[80]; 23 | scanf( "%s", input ); 24 | printf( "You entered: %s\n", input ); 25 | scanf( "%s", input ); 26 | printf( "Now you entered: %s\n", input ); 27 | 28 | } 29 | 30 | return 0; 31 | } 32 | -------------------------------------------------------------------------------- /fortran_memory/heap_overflow_underflow.f90: -------------------------------------------------------------------------------- 1 | program heap_overflow_underflow 2 | !... 
A buggy code prepared for a debugger tutorial by NERSC 3 | use mpi 4 | integer, parameter :: n = 1024 5 | integer, parameter :: ouf = 8 6 | real, allocatable :: a(:), b(:) 7 | integer i, ierr 8 | call mpi_init(ierr) 9 | allocate (a(n), b(n)) 10 | call random_number(a) 11 | b = cos(a) 12 | b(1) = cos(a(1-ouf)) ! read underflow 13 | b(1-ouf) = cos(a(1)) ! write underflow 14 | b(n) = cos(a(n+ouf)) ! read overflow 15 | b(n+ouf) = cos(a(n)) ! write overflow 16 | print *, sum(b) 17 | deallocate (a, b) 18 | call mpi_finalize(ierr) 19 | end 20 | -------------------------------------------------------------------------------- /Linaro-Forge/scripts/submit-job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | #SBATCH -J linaro-forge-hands-on 4 | #SBATCH -o stdout.%J.out 5 | #SBATCH -e stderr.%J.err 6 | #SBATCH -A ntrain7 7 | #SBATCH -C cpu 8 | #SBATCH --time=00:30:00 9 | #SBATCH --nodes=1 10 | #SBATCH -c 32 11 | #SBATCH --reservation=forge_cpu 12 | 13 | export SLURM_CPU_BIND="cores" 14 | 15 | SIMPLEPATH=$FORGE_TRAINING/correctness/debug/simple 16 | MMULTPATH=$FORGE_TRAINING/performance/mmult.py 17 | 18 | module load forge 19 | 20 | # Debug a simple MPI program 21 | ddt --offline -o offline-debugging.html --break-at=simple.c:32 --break-at=simple.c:41 srun -n 4 $SIMPLEPATH 22 | 23 | # Profile matrix multiplication example 24 | #cd `dirname $MMULTPATH` 25 | #map --profile srun -n 8 python3 $MMULTPATH -s 3072 26 | -------------------------------------------------------------------------------- /TotalView/src/array.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "array.h" 4 | 5 | 6 | double b[IMAX]; 7 | 8 | void dowork() 9 | { 10 | /**** some array operations ***/ 11 | { 12 | int i, j, jmod; 13 | double xi, xj, dx, scale = 100.0; 14 | 15 | for (j = 0; j < JMAX; j++) 16 | { 17 | jmod = (100*j) %JMAX; 18 | xj = (double)jmod/(double)JMAX; 19 | for 
(i = 0; i < IMAX; i++) 20 | { 21 | xi = (double)i/(double)IMAX; 22 | dx = xi-xj; 23 | b[i] = 2.0/(1.0+exp(scale*dx*dx)); 24 | } 25 | printf("counter %d\n", j); 26 | } 27 | } 28 | return; 29 | } 30 | 31 | 32 | void array() 33 | { 34 | dowork(); 35 | return; 36 | } 37 | -------------------------------------------------------------------------------- /fortran_memory/segfault.f90: -------------------------------------------------------------------------------- 1 | program segfault 2 | !... A buggy code prepared for a debugger tutorial by NERSC 3 | use mpi 4 | integer, parameter :: n = 10 5 | real, pointer :: a(:) => null() 6 | real, pointer :: b(:) => null() 7 | real, pointer :: c(:) => null() 8 | integer me, i, ierr 9 | call mpi_init(ierr) 10 | call mpi_comm_rank(mpi_comm_world,me,ierr) 11 | ! allocate (a(n), b(n), c(n)) ! Oops, forgot to allocate... 12 | call sub(a,b,c,n) 13 | print *, sum(c) 14 | deallocate (a, b, c) 15 | call mpi_finalize(ierr) 16 | end 17 | 18 | subroutine sub(a,b,c,n) 19 | integer n 20 | real a(n), b(n), c(n) 21 | call random_number(a) 22 | call random_number(b) 23 | do i=1,n 24 | c(i) = cos(a(i)) * sin(b(i)) 25 | end do 26 | end 27 | -------------------------------------------------------------------------------- /Valgrind/dhat/dhat.out.1688970: -------------------------------------------------------------------------------- 1 | {"dhatFileVersion":2 2 | ,"mode":"heap","verb":"Allocated" 3 | ,"bklt":true,"bkacc":true 4 | ,"tu":"instrs","Mtu":"Minstr" 5 | ,"tuth":500 6 | ,"cmd":"./basic" 7 | ,"pid":1688970 8 | ,"te":341947 9 | ,"tg":336617 10 | ,"pps": 11 | [{"tb":4000,"tbk":2 12 | ,"tl":7111 13 | ,"mb":3000,"mbk":1 14 | ,"gb":3000,"gbk":1 15 | ,"eb":3000,"ebk":1 16 | ,"rb":1008,"wb":1516 17 | ,"fs":[1,2] 18 | } 19 | ,{"tb":3000,"tbk":2 20 | ,"tl":1831 21 | ,"mb":2000,"mbk":1 22 | ,"gb":2000,"gbk":1 23 | ,"eb":0,"ebk":0 24 | ,"rb":2000,"wb":2000 25 | ,"fs":[3,4] 26 | } 27 | ] 28 | ,"ftbl": 29 | ["[root]" 30 | ,"0x4E056A4: malloc (in 
/usr/lib/valgrind/vgpreload_dhat-amd64-linux.so)" 31 | ,"0x40051F: main (basic.c:11)" 32 | ,"0x4E0A571: calloc (in /usr/lib/valgrind/vgpreload_dhat-amd64-linux.so)" 33 | ,"0x400540: main (basic.c:15)" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/core-files/div-by-zero.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | void print_fraction(int numerator, int denominator) 9 | { 10 | printf("%d\n", numerator / denominator); kill (getpid(), SIGFPE); 11 | } 12 | 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | /* Take a value out of the arguments so that loading multiple */ 17 | /* core files with different values works in ddt. */ 18 | int numerator = 1; 19 | if(argc == 2) { 20 | numerator = atoi(argv[1]); 21 | } 22 | 23 | /* ensure a core file is generated */ 24 | struct rlimit limit; 25 | limit.rlim_cur = limit.rlim_max = RLIM_INFINITY; 26 | setrlimit(RLIMIT_CORE, &limit); 27 | /* divide by zero */ 28 | print_fraction(numerator, 0); 29 | 30 | return 0; 31 | } 32 | -------------------------------------------------------------------------------- /Valgrind/dhat/dhat.out.2245130: -------------------------------------------------------------------------------- 1 | {"dhatFileVersion":2 2 | ,"mode":"ad-hoc","verb":"Occurred" 3 | ,"bklt":false,"bkacc":false 4 | ,"bu":"unit","bsu":"units","bksu":"events" 5 | ,"tu":"instrs","Mtu":"Minstr" 6 | ,"cmd":"./ad-hoc" 7 | ,"pid":2245130 8 | ,"te":340933 9 | ,"pps": 10 | [{"tb":30,"tbk":1 11 | ,"fs":[1,2,3] 12 | } 13 | ,{"tb":20,"tbk":1 14 | ,"fs":[4,3] 15 | } 16 | ,{"tb":30,"tbk":1 17 | ,"fs":[5,6,3] 18 | } 19 | ,{"tb":10,"tbk":1 20 | ,"fs":[7] 21 | } 22 | ,{"tb":30,"tbk":1 23 | ,"fs":[1,2,8] 24 | } 25 | ,{"tb":20,"tbk":1 26 | ,"fs":[4,8] 27 | } 28 | ,{"tb":30,"tbk":1 29 | ,"fs":[5,6,8] 30 | } 31 | ] 32 | ,"ftbl": 33 | ["[root]" 34 | 
,"0x4006F6: g (ad-hoc.c:5)" 35 | ,"0x4006F6: f (ad-hoc.c:9)" 36 | ,"0x4004C8: main (ad-hoc.c:15)" 37 | ,"0x400750: f (ad-hoc.c:10)" 38 | ,"0x4007A9: g (ad-hoc.c:5)" 39 | ,"0x4007A9: f (ad-hoc.c:11)" 40 | ,"0x400519: main (ad-hoc.c:22)" 41 | ,"0x40052C: main (ad-hoc.c:17)" 42 | ] 43 | } 44 | -------------------------------------------------------------------------------- /Valgrind/dhat/basic.c: -------------------------------------------------------------------------------- 1 | // Some basic allocations and accesses. 2 | 3 | #include 4 | #include 5 | #include 6 | /* #include "dhat/dhat.h" */ 7 | #include "dhat.h" 8 | 9 | int main(void) 10 | { 11 | int64_t* m = malloc(1000); 12 | m[0] = 1; // write 8 bytes 13 | m[10] = m[1]; // read and write 8 bytes 14 | 15 | char* c = calloc(1, 2000); 16 | for (int i = 0; i < 1000; i++) { 17 | c[i + 1000] = c[i]; // read and write 1000 bytes 18 | } 19 | 20 | char* r = realloc(m, 3000); // read and write 1000 bytes (memcpy) 21 | for (int i = 0; i < 500; i++) { 22 | r[i + 2000] = 99; // write 500 bytes 23 | } 24 | 25 | c = realloc(c, 1000); // read and write 1000 bytes (memcpy) 26 | 27 | free(c); 28 | // totals: 3008 read, 3516 write 29 | 30 | // Should be ignored because we're not in ad hoc mode. 
31 | DHAT_AD_HOC_EVENT(100); 32 | 33 | return 0; 34 | } 35 | -------------------------------------------------------------------------------- /TotalView/src/ReplayEngine_demo.cxx: -------------------------------------------------------------------------------- 1 | // 2 | // ReplayEngine Demo 3 | // 4 | // 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | int funcA(int); 17 | int funcB(int); 18 | int badstuff(); 19 | #define MAXDEPTH 20 20 | int arraylength=MAXDEPTH; 21 | 22 | 23 | int main() 24 | { 25 | funcA(0); 26 | badstuff(); 27 | funcA(0); 28 | exit(1); 29 | } 30 | 31 | 32 | 33 | int funcA(int a){ 34 | int b; 35 | b=a+2; 36 | b=funcB(b); 37 | return b; 38 | } 39 | 40 | 41 | int funcB(int b){ 42 | int c; 43 | int i; 44 | int v[MAXDEPTH]; 45 | int *p; 46 | 47 | c=b+2; 48 | p=&c; 49 | 50 | if( c0; i--){ 54 | v[i]=*p; 55 | } 56 | 57 | return c; 58 | } 59 | 60 | int badstuff(){ 61 | arraylength=5*MAXDEPTH; 62 | return 0; 63 | } 64 | -------------------------------------------------------------------------------- /TotalView/src/myClassA.hxx: -------------------------------------------------------------------------------- 1 | /*********************************************************************** 2 | * Copyright 2000-2006 by Etnus, LLC. ALL RIGHTS RESERVED 3 | * No part of this material may be reproduced, stored in a retrieval 4 | * system, transmitted or used in any form or by any means, electronic, 5 | * mechanical, photocopying, recording, or otherwise, without the prior 6 | * written permission of, or express license from Etnus, LLC. 7 | *********************************************************************** 8 | * This file contains PROPRIETARY INFORMATION of Etnus, LLC. 9 | *********************************************************************** 10 | * Copyright 1999 by Etnus, Inc. 11 | * Copyright 1996-1998 by Dolphin Interconnect Solutions, Inc. 
12 | * Copyright 1989-1996 by BBN Inc. 13 | ***********************************************************************/ 14 | #ifndef __MY_CLASS_A__ 15 | #define __MY_CLASS_A__ 16 | 17 | class myClassA { 18 | 19 | public: 20 | myClassA(); 21 | ~myClassA(); 22 | 23 | private: 24 | float *float_p; 25 | int *int_p; 26 | int size; 27 | }; 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/debug/deadlock.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ 2 | /* 3 | * See COPYRIGHT in top-level directory. 4 | */ 5 | 6 | #include 7 | #include 8 | #include "mpi.h" 9 | 10 | int main(int argc, char **argv) 11 | { 12 | int rank, size; 13 | int i, data; 14 | 15 | MPI_Init(&argc, &argv); 16 | 17 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 18 | MPI_Comm_size(MPI_COMM_WORLD, &size); 19 | 20 | if (rank == 0) { 21 | int *sendbuf; 22 | MPI_Request *sendreqs; 23 | 24 | /* setup send operations */ 25 | sendreqs = malloc(sizeof(MPI_Request) * size); 26 | sendbuf = malloc(sizeof(int) * size); 27 | 28 | for (i = 0; i < size; i++) { 29 | sendbuf[i] = i * 10; 30 | MPI_Ssend(&sendbuf[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD); 31 | //FIX: MPI_Isend(&sendbuf[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD, &sendreqs[i]); 32 | } 33 | 34 | MPI_Recv(&data, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); 35 | 36 | MPI_Waitall(size, sendreqs, MPI_STATUSES_IGNORE); 37 | } 38 | 39 | MPI_Finalize(); 40 | return 0; 41 | } 42 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/debug/memory_debugging.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "mpi.h" 4 | 5 | int *chunk_a, *chunk_b; 6 | 7 | void func_a(int size) { 8 | sleep(size); 9 | chunk_a = malloc(4000 * size); 10 | free(chunk_a); 11 | } 12 | 13 | void 
func_b(int size) { 14 | chunk_b = malloc(2000 * size); 15 | func_a(size); 16 | free(chunk_b); 17 | } 18 | 19 | int main(int argc, char** argv) { 20 | int rank, size, i; 21 | int *a[10], *dynamicArray; 22 | 23 | MPI_Init(&argc, &argv); 24 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 25 | MPI_Comm_size(MPI_COMM_WORLD, &size); 26 | 27 | if (rank == 0) { 28 | for (i = 0; i < size; i++) { 29 | dynamicArray = malloc(sizeof(int)*100000); 30 | } 31 | } 32 | 33 | for (i = 0; i < 10; i++) { 34 | a[i] = malloc(1000); 35 | } 36 | 37 | func_a(size); 38 | func_b(size/2); 39 | 40 | // Address not mapped error 41 | for (i = 0; i < 11; i++) { 42 | free(a[i]); 43 | } 44 | 45 | free(a[1]); // Free previously freed pointer 46 | free(dynamicArray); // Should only be freed on proc 0 47 | 48 | MPI_Finalize(); 49 | 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/debug/simple.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ 2 | /* 3 | * See COPYRIGHT in top-level directory. 
4 | */ 5 | 6 | #include 7 | #include 8 | 9 | int main(int argc, char **argv) 10 | { 11 | int rank, size, target, source; 12 | int sendbuf, recvbuf; 13 | MPI_Request reqs[2]; 14 | 15 | MPI_Init(&argc, &argv); 16 | 17 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 18 | MPI_Comm_size(MPI_COMM_WORLD, &size); 19 | 20 | /* get communication partners */ 21 | target = (rank + 1) % size; 22 | source = (rank - 1); 23 | if (source < 0) 24 | source += size; 25 | 26 | sendbuf = 42; 27 | recvbuf = 0; 28 | 29 | MPI_Irecv(&recvbuf, 3, MPI_INT, source, 0, MPI_COMM_WORLD, &reqs[0]); 30 | 31 | /* stop here to view posted recvs */ 32 | printf("recvs posted, recvbuf = %d\n", recvbuf); 33 | 34 | MPI_Isend(&sendbuf, 3, MPI_INT, target, 0, MPI_COMM_WORLD, &reqs[1]); 35 | 36 | /* stop here to view send ops */ 37 | printf("sends issued\n"); 38 | 39 | MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE); 40 | 41 | printf("communication complete, recvbuf = %d\n", recvbuf); 42 | 43 | MPI_Finalize(); 44 | return 0; 45 | } 46 | -------------------------------------------------------------------------------- /TotalView/src/myClassB.hxx: -------------------------------------------------------------------------------- 1 | /*********************************************************************** 2 | * Copyright 2000-2006 by Etnus, LLC. ALL RIGHTS RESERVED 3 | * No part of this material may be reproduced, stored in a retrieval 4 | * system, transmitted or used in any form or by any means, electronic, 5 | * mechanical, photocopying, recording, or otherwise, without the prior 6 | * written permission of, or express license from Etnus, LLC. 7 | *********************************************************************** 8 | * This file contains PROPRIETARY INFORMATION of Etnus, LLC. 9 | *********************************************************************** 10 | * Copyright 1999 by Etnus, Inc. 11 | * Copyright 1996-1998 by Dolphin Interconnect Solutions, Inc. 12 | * Copyright 1989-1996 by BBN Inc. 
13 | ***********************************************************************/ 14 | 15 | #ifndef __MY_CLASS_B__ 16 | #define __MY_CLASS_B__ 17 | 18 | #include 19 | 20 | class myClassB { 21 | 22 | public: 23 | myClassB(); 24 | ~myClassB(); 25 | void init(void); 26 | void destroy(void); 27 | 28 | private: 29 | 30 | std::vector *vector_char_p; 31 | int **b_pp; 32 | int size; 33 | }; 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /TotalView/src/myClassA.cxx: -------------------------------------------------------------------------------- 1 | /*********************************************************************** 2 | * Copyright 2000-2006 by Etnus, LLC. ALL RIGHTS RESERVED 3 | * No part of this material may be reproduced, stored in a retrieval 4 | * system, transmitted or used in any form or by any means, electronic, 5 | * mechanical, photocopying, recording, or otherwise, without the prior 6 | * written permission of, or express license from Etnus, LLC. 7 | *********************************************************************** 8 | * This file contains PROPRIETARY INFORMATION of Etnus, LLC. 9 | *********************************************************************** 10 | * Copyright 1999 by Etnus, Inc. 11 | * Copyright 1996-1998 by Dolphin Interconnect Solutions, Inc. 12 | * Copyright 1989-1996 by BBN Inc. 
13 | ***********************************************************************/ 14 | #include 15 | #include "myClassA.hxx" 16 | 17 | myClassA::myClassA() : size (128) { 18 | 19 | float_p = new float[size]; 20 | 21 | for(int i=0; i 20 | #include 21 | 22 | 23 | void mmult(int sz, int nslices, double *A, double *B, double *C) 24 | { 25 | for(int i=0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #ifdef ADD_MPI 8 | #include 9 | #endif //ADD_MPI 10 | 11 | void random_vector(std::vector& vec) 12 | { 13 | size_t count = (size_t)rand() % 1000; 14 | if(count < 100) count = 150; 15 | 16 | for(size_t i=0; i vec; 27 | 28 | random_vector(vec); 29 | 30 | std::cout << "A thread has finished" << std::endl; 31 | } 32 | 33 | 34 | 35 | int main(int argc, char** argv) 36 | { 37 | #ifdef ADD_MPI 38 | int rank, nnodes, nthreads; 39 | MPI_Init (&argc, &argv); 40 | MPI_Comm_rank (MPI_COMM_WORLD, &rank); 41 | MPI_Comm_size (MPI_COMM_WORLD, &nnodes); 42 | #endif //ADD_MPI 43 | 44 | time_t tm = time(NULL); 45 | srand(tm); 46 | int numThreads = 25; 47 | if(argc >= 2) 48 | { 49 | numThreads = atoi(argv[1]); 50 | if(numThreads < 0) 51 | numThreads = 25; 52 | } 53 | 54 | std::vector threads; 55 | threads.reserve(numThreads); 56 | 57 | std::cout << "Main is going to create " << numThreads << " threads." 
<< std::endl; 58 | 59 | for(int i=0; i 14 | #include 15 | 16 | myClassB::myClassB() : size(256) { 17 | 18 | vector_char_p = new std::vector(); 19 | 20 | for(int i=0; ipush_back((char *) strdup("This is from calling strdup in myClassB.")); 22 | } 23 | 24 | init(); 25 | } 26 | 27 | 28 | myClassB::~myClassB() { 29 | 30 | std::vector::iterator iter; 31 | 32 | for(int i=0; ibegin(); iter != vector_char_p->end(); iter++) { 37 | free(*iter); 38 | } 39 | 40 | delete vector_char_p; 41 | 42 | } 43 | 44 | void myClassB::init(void) { 45 | 46 | b_pp = (int **) malloc (size * sizeof(int *)); 47 | 48 | for(int i=0; i (libgomp.so.1+0x1dd4d) 31 | 32 | Previous write of size 4 at 0x7ffdf6e678bc by main thread: 33 | #0 main._omp_fn.0 /pscratch/sd/e/elvis/sanitizers/buggyreduction_omp.c:7 (a.out+0x4008aa) 34 | #1 GOMP_parallel (libgomp.so.1+0x14e95) 35 | 36 | Location is stack of main thread. 37 | 38 | Location is global '' at 0x000000000000 ([stack]+0x1e8bc) 39 | 40 | Thread T1 (tid=2240266, running) created by main thread at: 41 | #0 pthread_create (libtsan.so.2+0x61be6) 42 | #1 (libgomp.so.1+0x1e38f) 43 | 44 | SUMMARY: ThreadSanitizer: data race /pscratch/sd/e/elvis/sanitizers/buggyreduction_omp.c:6 in main._omp_fn.0 45 | ================== 46 | sum = 335625 47 | ThreadSanitizer: reported 1 warnings 48 | ``` 49 | 50 | You may have to run a few times to see the error (because of a race 51 | condition!). 
52 | -------------------------------------------------------------------------------- /Sanitzers/Sanitizers4hpc/GPU/README.md: -------------------------------------------------------------------------------- 1 | # Sanitizers4hpc with GPU codes 2 | 3 | Example code: 4 | 5 | - `main.cc` and `memcheck_demo.cu`: Sanitizers4hpc with Compute Sanitizer's Memcheck example code 6 | 7 | ## `main.cc` and `memcheck_demo.cu` 8 | 9 | This is a simple MPI adaptation of an Nvidia Compute Sanitizer 10 | example code, 11 | [`memcheck_demo.cu`](https://github.com/NVIDIA/compute-sanitizer-samples/blob/master/Memcheck/memcheck_demo.cu). 12 | The code is for using Compute Sanitizer's Memcheck tool. 13 | 14 | ``` 15 | $ salloc -A <account> -C gpu -N 1 --gpus-per-node=4 -q debug -t 10 ... 16 | ... 17 | 18 | $ cc -fsanitize=thread -g -O1 -fopenmp buggyreduction_mpiomp.c -o buggyreduction_mpiomp 19 | $ CC -c -g main.cc 20 | $ nvcc -Xcompiler -rdynamic -lineinfo -c memcheck_demo.cu 21 | $ CC -o memcheck_demo main.o memcheck_demo.o 22 | 23 | $ module load sanitizers4hpc 24 | 25 | $ sanitizers4hpc -l "-n 4 -c 32 --cpu-bind=cores --gpus-per-task=1 --gpu-bind=none" -m ${CUDA_HOME}/compute-sanitizer/compute-sanitizer -f -- ./memcheck_demo 26 | RANKS: <2,3> 27 | ... 28 | Saved host backtrace up to driver entry point at error 29 | #0 0x2eae6f in /usr/local/cuda-12.2/compat/libcuda.so.1 30 | #1 0xd8f0 in /home/jenkins/src/gtlt/cuda/gtlt_cuda_query.c:325:gtlt_cuda_pointer_type /opt/cray/pe/lib64/libmpi_gtl_cuda.so.0 31 | ... 32 | RANKS: <0-1> 33 | ... 34 | Saved host backtrace up to driver entry point at error 35 | #0 0x2eae6f in /usr/local/cuda-12.2/compat/libcuda.so.1 36 | #1 0xd8f0 in /home/jenkins/src/gtlt/cuda/gtlt_cuda_query.c:325:gtlt_cuda_pointer_type /opt/cray/pe/lib64/libmpi_gtl_cuda.so.0 37 | ... 38 | ``` 39 | 40 | The `-f` flag is needed to bypass `sanitizers4hpc`'s requirement 41 | that the executable is instrumented for an LLVM Sanitizer.
This 42 | example code is basically a CUDA code that is not instrumented for 43 | a LLVM Sanitizer. 44 | 45 | Aggregation of output will improve in CPE/24.07. 46 | -------------------------------------------------------------------------------- /Sanitzers/LeakSanitizer/README.md: -------------------------------------------------------------------------------- 1 | # LeakSanitizer (LSAN) 2 | 3 | Example code: 4 | 5 | - `memory-leak.c` 6 | 7 | ## LeakSanitizer as a stand-alone sanitizer 8 | 9 | `memory-leak.c` has a memory leak of 7 bytes as the memory block 10 | pointed to by the pointer `p` is not freed before setting it to 11 | `NULL` (0). 12 | 13 | Here we try the `clang` compiler in the `PrgEnv-aocc` environment 14 | to demonstrate that the tool works in the environment but you are 15 | free to use a different compiler that supports LSAN. To build and 16 | run: 17 | 18 | ``` 19 | $ clang -fsanitize=leak -g -O0 -o memory-leak memory-leak.c 20 | 21 | $ ./memory-leak 22 | ================================================================= 23 | ==2335900==ERROR: LeakSanitizer: detected memory leaks 24 | 25 | Direct leak of 7 byte(s) in 1 object(s) allocated from: 26 | #0 0x55966653a842 in malloc /.../nersc/nersc-user-env/prgenv/llvm_src_17.0.6/compiler-rt/lib/lsan/lsan_interceptors.cpp:75:3 27 | #1 0x559666565898 in main /pscratch/sd/e/elvis/addresssanitizer/memory-leak.c:4:7 28 | #2 0x7efe8f83e24c in __libc_start_main (/lib64/libc.so.6+0x3524c) (BuildId: ddc393ac74ed8f90d4fdfff796432fbafd281e1b) 29 | 30 | SUMMARY: LeakSanitizer: 7 byte(s) leaked in 1 allocation(s) 31 | ``` 32 | 33 | ## LeakSanitizer run under AddressSanitizer: 34 | 35 | LeakSanitizer can be combined with AddressSanitizer to get both 36 | memory error and leak detection, too. 
Build with `-fsanitize=address` 37 | but run the executable with the environment variable `ASAN_OPTIONS` 38 | set to `detect_leaks=1`: 39 | 40 | ``` 41 | $ clang -fsanitize=address -g -o memory-leak memory-leak.c 42 | 43 | $ ASAN_OPTIONS=detect_leaks=1 ./memory-leak 44 | ================================================================= 45 | ==2339511==ERROR: LeakSanitizer: detected memory leaks 46 | 47 | Direct leak of 7 byte(s) in 1 object(s) allocated from: 48 | #0 0x56040740afde in malloc /.../nersc/nersc-user-env/prgenv/llvm_src_17.0.6/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3 49 | #1 0x560407447a68 in main /pscratch/sd/e/elvis/addresssanitizer/memory-leak.c:4:7 50 | #2 0x7fdab443e24c in __libc_start_main (/lib64/libc.so.6+0x3524c) (BuildId: ddc393ac74ed8f90d4fdfff796432fbafd281e1b) 51 | 52 | SUMMARY: AddressSanitizer: 7 byte(s) leaked in 1 allocation(s) 53 | ``` 54 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/gpu-nvidia-mmult/common/helper_functions.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | // These are helper functions for the SDK samples (string parsing, 29 | // timers, image helpers, etc) 30 | #ifndef COMMON_HELPER_FUNCTIONS_H_ 31 | #define COMMON_HELPER_FUNCTIONS_H_ 32 | 33 | #ifdef WIN32 34 | #pragma warning(disable : 4996) 35 | #endif 36 | 37 | // includes, project 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | 50 | // includes, timer, string parsing, image helpers 51 | #include // helper functions for image compare, dump, data comparisons 52 | #include // helper functions for string parsing 53 | #include // helper functions for timers 54 | 55 | #ifndef EXIT_WAIVED 56 | #define EXIT_WAIVED 2 57 | #endif 58 | 59 | #endif // COMMON_HELPER_FUNCTIONS_H_ 60 | -------------------------------------------------------------------------------- /Sanitzers/Sanitizers4hpc/GPU/memcheck_demo.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. 
2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | #include 29 | 30 | __device__ int x; 31 | 32 | __global__ void unaligned_kernel(void) 33 | { 34 | *(int*) ((char*)&x + 1) = 42; 35 | } 36 | 37 | __device__ void out_of_bounds_function(void) 38 | { 39 | *(int*) 0x87654320 = 42; 40 | } 41 | 42 | __global__ void out_of_bounds_kernel(void) 43 | { 44 | out_of_bounds_function(); 45 | } 46 | 47 | static void run_unaligned(void) 48 | { 49 | std::cout << "Running unaligned_kernel: "; 50 | unaligned_kernel<<<1,1>>>(); 51 | std::cout << cudaGetErrorString(cudaDeviceSynchronize()) << std::endl; 52 | } 53 | 54 | static void run_out_of_bounds(void) 55 | { 56 | std::cout << "Running out_of_bounds_kernel: "; 57 | out_of_bounds_kernel<<<1,1>>>(); 58 | std::cout << cudaGetErrorString(cudaDeviceSynchronize()) << std::endl; 59 | } 60 | 61 | void launch_memcheck_demo() { 62 | int *devMem = nullptr; 63 | 64 | std::cout << "Mallocing memory" << std::endl; 65 | cudaMalloc((void**)&devMem, 1024); 66 | 67 | run_unaligned(); 68 | run_out_of_bounds(); 69 | 70 | // Omitted to demo leakcheck 71 | // cudaFree(devMem); 72 | } 73 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/debug/split.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */ 2 | /* 3 | * See COPYRIGHT in top-level directory. 
4 | */ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | int main(int argc, char **argv) 12 | { 13 | int rank, size; 14 | int color, split_rank, split_size; 15 | int *sendbuf, *recvbuf; 16 | MPI_Comm split_comm; 17 | 18 | MPI_Init(&argc, &argv); 19 | 20 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 21 | MPI_Comm_size(MPI_COMM_WORLD, &size); 22 | 23 | /* color evens and odds */ 24 | color = (rank % 2 == 0); 25 | MPI_Comm_split(MPI_COMM_WORLD, color, 0, &split_comm); 26 | if (color == 1) 27 | MPI_Comm_set_name(split_comm, "Even Comm"); 28 | else 29 | MPI_Comm_set_name(split_comm, "Odd Comm"); 30 | 31 | MPI_Comm_rank(split_comm, &split_rank); 32 | MPI_Comm_size(split_comm, &split_size); 33 | 34 | /* setup some comm buffers */ 35 | sendbuf = malloc(sizeof(int) * split_size); 36 | recvbuf = malloc(sizeof(int) * split_size); 37 | 38 | /* odd comm */ 39 | if (color == 0) { 40 | int i, curr, num_ops = split_size * 2 + 1; 41 | MPI_Request *reqs = malloc(sizeof(MPI_Request) * num_ops); 42 | 43 | for (i = 0, curr = 0; i < split_size; i++) { 44 | MPI_Irecv(&recvbuf[i], 1, MPI_INT, i, 0, split_comm, &reqs[curr++]); 45 | } 46 | 47 | for (i = 0; i < split_size; i++) { 48 | sendbuf[i] = rand(); 49 | sleep(1); 50 | MPI_Isend(&sendbuf[i], 1, MPI_INT, i, 0, split_comm, &reqs[curr++]); 51 | } 52 | 53 | if (split_rank == 0) { 54 | MPI_Isend(NULL, 0, MPI_INT, 0, 0, MPI_COMM_WORLD, &reqs[curr++]); 55 | MPI_Waitall(num_ops, reqs, MPI_STATUSES_IGNORE); 56 | } else { 57 | MPI_Waitall(num_ops - 1, reqs, MPI_STATUSES_IGNORE); 58 | } 59 | free(reqs); 60 | } 61 | 62 | /* even comm */ 63 | if (color == 1) { 64 | int i, curr, num_ops = split_size * 2 + 1; 65 | MPI_Request *reqs = malloc(sizeof(MPI_Request) * num_ops); 66 | 67 | for (i = 0, curr = 0; i < split_size; i++) { 68 | MPI_Irecv(&recvbuf[i], 1, MPI_INT, i, 0, split_comm, &reqs[curr++]); 69 | } 70 | 71 | for (i = 0; i < split_size; i++) { 72 | sendbuf[i] = rand(); 73 | sleep(2); 74 | MPI_Isend(&sendbuf[i], 1, MPI_INT, i, 0, 
split_comm, &reqs[curr++]); 75 | } 76 | 77 | if (split_rank == 0) { 78 | MPI_Irecv(NULL, 0, MPI_INT, 1, 0, MPI_COMM_WORLD, &reqs[curr++]); 79 | MPI_Waitall(num_ops, reqs, MPI_STATUSES_IGNORE); 80 | } else { 81 | MPI_Waitall(num_ops - 1, reqs, MPI_STATUSES_IGNORE); 82 | } 83 | free(reqs); 84 | } 85 | 86 | free(sendbuf); 87 | free(recvbuf); 88 | 89 | MPI_Finalize(); 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /Valgrind/memcheck/overlap.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | char b[50]; 5 | 6 | void reset_b(void) 7 | { 8 | int i; 9 | 10 | for (i = 0; i < 50; i++) 11 | b[i] = '_'; 12 | b[49] = '\0'; 13 | } 14 | 15 | void reset_b2(void) 16 | { 17 | reset_b(); 18 | strcpy(b, "ABCDEFG"); 19 | } 20 | 21 | int main(void) 22 | { 23 | char x[100]; 24 | char a[] = "abcdefghijklmnopqrstuvwxyz"; 25 | int i; 26 | 27 | /* testing memcpy/strcpy overlap */ 28 | 29 | for (i = 0; i < 50; i++) { 30 | x[i] = i+1; // don't put any zeroes in there 31 | } 32 | for (i = 50; i < 100; i++) { 33 | // because of the errors, the strcpy's will overrun, so put some 34 | // zeroes in the second half to stop them eventually 35 | x[i] = 0; 36 | 37 | } 38 | 39 | memcpy(x+20, x, 20); // ok 40 | memcpy(x+20, x, 21); // overlap 41 | memcpy(x, x+20, 20); // ok 42 | memcpy(x, x+20, 21); // overlap 43 | 44 | strncpy(x+20, x, 20); // ok 45 | strncpy(x+20, x, 21); // overlap 46 | strncpy(x, x+20, 20); // ok 47 | strncpy(x, x+20, 21); // overlap 48 | 49 | x[39] = '\0'; 50 | strcpy(x, x+20); // ok 51 | 52 | x[39] = 39; 53 | x[40] = '\0'; 54 | strcpy(x, x+20); // overlap 55 | 56 | x[19] = '\0'; 57 | strcpy(x+20, x); // ok 58 | 59 | /* 60 | x[19] = 19; 61 | x[20] = '\0'; 62 | strcpy(x+20, x); // overlap, but runs forever (or until it seg faults) 63 | */ 64 | 65 | /* testing strcpy, strncpy() */ 66 | 67 | reset_b(); 68 | printf("`%s'\n", b); 69 | 70 | strcpy(b, a); 71 | 
printf("`%s'\n", b); 72 | 73 | reset_b(); 74 | strncpy(b, a, 25); 75 | printf("`%s'\n", b); 76 | 77 | reset_b(); 78 | strncpy(b, a, 26); 79 | printf("`%s'\n", b); 80 | 81 | reset_b(); 82 | strncpy(b, a, 27); 83 | printf("`%s'\n", b); 84 | 85 | printf("\n"); 86 | 87 | /* testing strncat() */ 88 | 89 | reset_b2(); 90 | printf("`%s'\n", b); 91 | 92 | reset_b2(); 93 | strcat(b, a); 94 | printf("`%s'\n", b); 95 | 96 | reset_b2(); 97 | strncat(b, a, 25); 98 | printf("`%s'\n", b); 99 | 100 | reset_b2(); 101 | strncat(b, a, 26); 102 | printf("`%s'\n", b); 103 | 104 | reset_b2(); 105 | strncat(b, a, 27); 106 | printf("`%s'\n", b); 107 | 108 | /* Nb: can't actually get strcat warning -- if any overlap occurs, it will 109 | always run forever, I think... */ 110 | 111 | for ( i = 0; i < 2; i++) 112 | strncat(a+20, a, 21); // run twice to check 2nd error isn't shown 113 | strncat(a, a+20, 21); 114 | 115 | /* This is ok, but once gave a warning when strncpy() was wrong, 116 | and used 'n' for the length, even when the src was shorter than 'n' */ 117 | { 118 | char dest[64]; 119 | char src [16]; 120 | strcpy( src, "short" ); 121 | strncpy( dest, src, 20 ); 122 | } 123 | 124 | return 0; 125 | } 126 | -------------------------------------------------------------------------------- /Sanitzers/AddressSanitizer/README.md: -------------------------------------------------------------------------------- 1 | # AddressSanitizer (ASAN) 2 | 3 | Example codes: 4 | 5 | - `illegalmemoryaccess.cpp` 6 | - `use-after-free.c`: from [https://github.com/google/sanitizers/wiki/AddressSanitizer](https://github.com/google/sanitizers/wiki/AddressSanitizer) 7 | - `example_UseAfterFree.cc`: from [https://clang.llvm.org/docs/AddressSanitizer.html](https://clang.llvm.org/docs/AddressSanitizer.html) 8 | 9 | ## `illegalmemoryaccess.cpp` 10 | 11 | This code attempts to write outside of the allocated block ("heap 12 | over"). 
In this example, the GNU compiler is used, but any compiler 13 | that supports ASAN can be used. Build with the `-fsanitize=address` 14 | flag: 15 | 16 | ``` 17 | $ g++ -O0 -g -fsanitize=address -o illegalmemoryaccess illegalmemoryaccess.cpp 18 | ``` 19 | 20 | Run: 21 | 22 | ``` 23 | $ ./illegalmemoryaccess 24 | ================================================================= 25 | ==2267569==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x604000000038 at pc 0x0000004009df bp 0x7ffe9e373680 sp 0x7ffe9e373678 26 | WRITE of size 4 at 0x604000000038 thread T0 27 | #0 0x4009de in main /pscratch/sd/e/elvis/addresssanitizer/illegalmemoryaccess.cpp:7 28 | #1 0x7fbf17c3c24c in __libc_start_main (/lib64/libc.so.6+0x3524c) 29 | #2 0x4008b9 in _start ../sysdeps/x86_64/start.S:120 30 | 31 | 0x604000000038 is located 0 bytes to the right of 40-byte region [0x604000000010,0x604000000038) 32 | allocated by thread T0 here: 33 | #0 0x7fbf188bba88 in operator new[](unsigned long) (/usr/lib64/libasan.so.8+0xbba88) 34 | #1 0x40097e in main /pscratch/sd/e/elvis/addresssanitizer/illegalmemoryaccess.cpp:4 35 | #2 0x7fbf17c3c24c in __libc_start_main (/lib64/libc.so.6+0x3524c) 36 | 37 | SUMMARY: AddressSanitizer: heap-buffer-overflow /pscratch/sd/e/elvis/addresssanitizer/illegalmemoryaccess.cpp:7 in main 38 | Shadow bytes around the buggy address: 39 | 0x0c087fff7fb0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 40 | 0x0c087fff7fc0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 41 | 0x0c087fff7fd0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 42 | 0x0c087fff7fe0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 43 | 0x0c087fff7ff0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 44 | =>0x0c087fff8000: fa fa 00 00 00 00 00[fa]fa fa fa fa fa fa fa fa 45 | 0x0c087fff8010: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa 46 | 0x0c087fff8020: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa 47 | 0x0c087fff8030: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa 48 | 0x0c087fff8040: 
fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa 49 | 0x0c087fff8050: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa 50 | Shadow byte legend (one shadow byte represents 8 application bytes): 51 | Addressable: 00 52 | Partially addressable: 01 02 03 04 05 06 07 53 | Heap left redzone: fa 54 | Freed heap region: fd 55 | ... 56 | Right alloca redzone: cb 57 | ==2267569==ABORTING 58 | ``` 59 | 60 | - Heap-buffer-overflow for attempting to write 4 bytes outside 61 | of the allocated memory block at line 7 of `illegalmemoryaccess.cpp` 62 | - Memory block in question: 40 byte region 63 | `[0x604000000010,0x604000000038)`, allocated at line 4 64 | - Shadow bytes 65 | - Mapped to `[0xc087fff8002,0xc087fff8007)` via `Shadow = (Mem >> 3) + 0x7fff8000` 66 | - `00 00 00 00 00`: 5*8 bytes = 40 bytes 67 | - `[fa]`: attempted to write to the heap left redzone (`fa`) 68 | 69 | -------------------------------------------------------------------------------- /Sanitzers/README.md: -------------------------------------------------------------------------------- 1 | # Sanitizers 2 | 3 | See NERSC docs page, [Sanitizers and Sanitizers4hpc](https://docs.nersc.gov/tools/debug/sanitizers/). 4 | 5 | ## Introduction 6 | 7 | LLVM Sanitizers are a group of debugging tools for detecting various 8 | kinds of bugs in C and C++ codes. There are multiple tools, including 9 | AddressSanitizer, LeakSanitizer, ThreadSanitizer, MemorySanitizer, 10 | each with a specific debugging capability. 11 | 12 | A sanitizer consists of a compiler instrumentation module and a 13 | runtime library. To use a sanitizer, you first build an executable 14 | instrumented for the sanitizer, by specifying a compile flag. When 15 | the instrumented executable is run, the runtime intercepts relevant 16 | operations and inspects them. When it detects a problem, it generates 17 | a warning message. 
18 | 19 | Because of the instrumentation and the way the debugging work 20 | is carried out, memory usage can become several times bigger and the 21 | instrumented code can run several times slower. Therefore, it is 22 | important to rebuild your code without instrumentation after debugging 23 | is complete. 24 | 25 | ### Supported Compilers 26 | 27 | These tools can be used with more than just LLVM compilers: they 28 | are compatible with all compilers provided on Perlmutter, except 29 | the Nvidia compiler. 30 | 31 | You don't need to change the way you compile your MPI code in order 32 | to use these tools (i.e., you can still use the Cray compiler 33 | wrappers `cc`/`CC`/`ftn` as normal). For a non-MPI code, the 34 | following C/C++ base compilers can be used, too. 35 | 36 | | GNU | Cray | Intel | AOCC | LLVM | 37 | |:---:|:----:|:-----:|:----:|:----:| 38 | | `gcc`/`g++` | `craycc`/`craycxx` | `icx`/`icpx` | `clang`/`clang++` | `clang`/`clang++` | 39 | 40 | Note that Intel's `icc` and `icpc` do not work for the sanitizer 41 | tools as they are not Clang-based. 42 | 43 | ### Sanitizer Flags 44 | 45 | These compilers accept many LLVM sanitizer compile flags. Use the 46 | ones that suit your needs. For example, you don't have to instrument the 47 | entire code. Instead, you can exclude certain functions or source 48 | files from instrumentation with the `-fsanitize-blacklist=` or 49 | `-fsanitize-ignorelist=` option. 50 | 51 | Runtime behavior of a tool can be controlled by setting the sanitizer 52 | environment variable to certain runtime flags. The variable is 53 | `ASAN_OPTIONS` for AddressSanitizer, `LSAN_OPTIONS` for LeakSanitizer, 54 | `TSAN_OPTIONS` for ThreadSanitizer, `MSAN_OPTIONS` for MemorySanitizer, 55 | etc.
56 | 57 | You can find compile and runtime flags at the following web pages: 58 | 59 | - [AddressSanitizer 60 | Flags](https://github.com/google/sanitizers/wiki/AddressSanitizerFlags) 61 | - [ThreadSanitizer 62 | Flags](https://github.com/google/sanitizers/wiki/ThreadSanitizerFlags) 63 | - [Sanitizer Common 64 | Flags](https://github.com/google/sanitizers/wiki/SanitizerCommonFlags) 65 | 66 | ## Sanitizers4hpc 67 | 68 | HPE's `Sanitizers4hpc` is an aggregation tool to collect and analyze 69 | LLVM Sanitizer output from a distributed-memory parallel (e.g., 70 | MPI) code at scale. It makes sanitizer's result easier to understand, 71 | by presenting output by group of MPI tasks sharing the same pattern. 72 | 73 | Currently it supports 74 | 75 | - AddressSanitizer 76 | - LeakSanitizer 77 | - ThreadSanitizer 78 | 79 | with the Cray and the GNU compilers. It also supports Nvidia Compute 80 | Sanitizer's Memcheck tool for CUDA codes (an example below). 81 | 82 | To run an app with the tool, load the `sanitizers4hpc` module and 83 | then launch as follows: 84 | 85 | ``` 86 | sanitizers4hpc -- ./a.out 87 | ``` 88 | 89 | -------------------------------------------------------------------------------- /Valgrind/memcheck/leak-cases.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "leak.h" 4 | /* #include "../memcheck.h" */ 5 | #include "memcheck.h" 6 | 7 | // Pointer chain AAA Category/output BBB Category/output 8 | // ------------- ------------------- ------------ 9 | // p1 ---> AAA DR / R 10 | // p2 ---> AAA ---> BBB DR / R IR / R 11 | // p3 AAA DL / L 12 | // p4 AAA ---> BBB DL / I IL / L 13 | // p5 -?-> AAA (y)DR, (n)DL / P 14 | // p6 ---> AAA -?-> BBB DR / R (y)IR, (n)DL / P 15 | // p7 -?-> AAA ---> BBB (y)DR, (n)DL / P (y)IR, (n)IL / P 16 | // p8 -?-> AAA -?-> BBB (y)DR, (n)DL / P (y,y)IR, (n,y)IL, (_,n)DL / P 17 | // p9 AAA -?-> BBB DL / L (y)IL, (n)DL / I 18 | // 19 | // Pointer chain legend: 20 
//   - pN:  a root set pointer
//   - AAA, BBB: heap blocks
//   - --->: a start-pointer
//   - -?->: an interior-pointer
//
// Category legend:
//   - DR: Directly reachable
//   - IR: Indirectly reachable
//   - DL: Directly lost
//   - IL: Indirectly lost
//   - (y)XY: it's XY if the interior-pointer is a real pointer
//   - (n)XY: it's XY if the interior-pointer is not a real pointer
//   - (_)XY: it's XY in either case
//
// How we handle the 9 cases:
//   - "directly lost":    case 3
//   - "indirectly lost":  cases 4, 9
//   - "possibly lost":    cases 5..8
//   - "still reachable":  cases 1, 2


// A heap block that can point at one other heap block.
typedef
   struct _Node {
      struct _Node* next;
      // Padding ensures the struct is the same size on 32-bit and 64-bit
      // machines (the per-case byte counts below assume 2 * sizeof(Node)
      // is 16).  With 8-byte pointers the array length evaluates to 0.
      char padding[8 - sizeof(struct _Node*)];
   } Node;

// Allocates a new block whose first node links to `next` (which may be
// NULL) and returns a pointer to the start of that block.  The block is
// never freed by this program: leaking it is the point of the test.
// Aborts if the allocation fails, instead of dereferencing NULL.
Node* mk(Node* next)
{
   // We allocate two nodes, so we can do p+1 and still point within the
   // block.
   Node* x = (Node*)malloc(2 * sizeof(Node));
   if (x == NULL) {
      // Without the block none of the leak cases can be constructed.
      abort();
   }
   x->next = next;
   return x;
}

// These are definite roots.
Node* p1;
Node* p2;
Node* p3;
Node* p4;
Node* p5;
Node* p6;
Node* p7;
Node* p8;
Node* p9;

// Builds the nine pointer chains described in the table at the top of the
// file.  Incrementing a pointer turns it from a start-pointer into an
// interior-pointer; discarding mk()'s result or clearing the root leaves
// the block without any root pointing at it.
void f(void)
{
   p1 = mk(NULL);       // Case 1: 16/1 still reachable

   p2 = mk(mk(NULL));   // Case 2: 16/1 still reachable
                        //         16/1 still reachable
   (void)mk(NULL);      // Case 3: 16/1 definitely lost

   (void)mk(mk(NULL));  // Case 4: 16/1 indirectly lost (counted again below!)
                        //         32(16d,16i)/1 definitely lost (double count!)
   p5 = mk(NULL);       // Case 5: 16/1 possibly lost (ok)
   p5++;

   p6 = mk(mk(NULL));   // Case 6: 16/1 still reachable
   (p6->next)++;        //         16/1 possibly lost

   p7 = mk(mk(NULL));   // Case 7: 16/1 possibly lost
   p7++;                //         16/1 possibly lost

   p8 = mk(mk(NULL));   // Case 8: 16/1 possibly lost
   (p8->next)++;        //         16/1 possibly lost
   p8++;

   p9 = mk(mk(NULL));   // Case 9: 16/1 indirectly lost (counted again below!)
   (p9->next)++;        //         32(16d,16i)/1 definitely lost (double count!)
   p9 = NULL;
}
# Build rules for the TotalView demo programs.  Sources are read from $(SRC)
# and the executables are written to the current directory.
SRC=../src
CXX=g++
OMP_CC=gcc
OMP_F77=gfortran
OMP_OPT=-fopenmp
CC=gcc
F90=gfortran
# MPI compiler drivers (presumably the Cray wrappers `cc`/`CC`; override on
# other systems, e.g. `make MPICC=mpicc MPICXX=mpicxx`).
MPICC=cc
MPICXX=CC
CFLAGS= -g
CCFLAGS= -g
F90FLAGS= -g
MPIFLAGS=-DUSEMPI -DMPICH_IGNORE_CXX_SEEK

# Programs built by the default target and removed by `clean`.
PROGRAMS= simple combined demoMpi_v2 filterapp TVscript_demo ReplayEngine_demo simple_threaded

# No CUDA demo is defined in this file; the variable is given an explicit
# (empty) default so that `cuda-clean` no longer expands an undefined name.
CUDA_PROGRAMS ?=

# These targets are commands, not files.
.PHONY: all clean cuda-clean

all: ${PROGRAMS}

simple: ${SRC}/simple.c ${SRC}/array.c ${SRC}/array.h
	$(CC) $(CFLAGS) $(LDFLAGS) $(SRC)/simple.c $(SRC)/array.c -o $@ -lm

simple_threaded: ${SRC}/simple_threaded.c
	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/simple_threaded.c -o $@ -lpthread

combined: ${SRC}/combined.cxx
	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/combined.cxx -lpthread -o $@

demoMpi: $(SRC)/demoMpi.C
	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) $(SRC)/demoMpi.C -o $@

filterapp-mpi: $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx $(SRC)/myClassB.hxx $(SRC)/myClassA.hxx
	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) -o $@ $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx

memory-mpi: $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx $(SRC)/myClassB.hxx $(SRC)/myClassA.hxx
	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) -o $@ $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx

memory-comp: $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx $(SRC)/myClassB.hxx $(SRC)/myClassA.hxx
	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) -o $@ $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx

memory-redzone: $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx $(SRC)/myClassB.hxx $(SRC)/myClassA.hxx
	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) -o $@ $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx

demoMpi_v2: $(SRC)/demoMpi_v2.C
	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) $(SRC)/demoMpi_v2.C -o $@

MPI_Replay_Engine_demo: $(SRC)/MPI_Replay_Engine_demo.C $(SRC)/merge.h
	$(MPICXX) $(CCFLAGS) -I$(SRC) $(MPIFLAGS) $(LDFLAGS) $(SRC)/MPI_Replay_Engine_demo.C -o $@

filterapp: $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx $(SRC)/myClassB.hxx $(SRC)/myClassA.hxx
	$(CXX) $(CCFLAGS) $(LDFLAGS) -o $@ $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx

f90_demo: $(SRC)/f90_demo.f
	$(F90) $(F90FLAGS) $(LDFLAGS) -o $@ $(SRC)/f90_demo.f

# The prerequisite now names the file the recipe actually compiles.
springs : $(SRC)/springs.c
	${CC} ${CFLAGS} $(LDFLAGS) $(SRC)/springs.c -o springs -lm

tx_omp_c_llnl3 : $(SRC)/tx_omp_c_llnl3.c
	${OMP_CC} ${CFLAGS} ${OMP_OPT} $(LDFLAGS) $(SRC)/tx_omp_c_llnl3.c -o tx_omp_c_llnl3 -lm

omp-springs : $(SRC)/omp-springs.c
	${OMP_CC} ${CFLAGS} ${OMP_OPT} $(LDFLAGS) $(SRC)/omp-springs.c -o omp-springs -lm

omp-springs-fort : $(SRC)/omp-springs.f
	${OMP_F77} ${F90FLAGS} ${OMP_OPT} $(LDFLAGS) $(SRC)/omp-springs.f -o omp-springs-fort -lm

txdining: $(SRC)/txdining.cxx $(SRC)/txdining.hxx
	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/txdining.cxx -o $@ -lpthread -lrt

ReplayEngine_demo: $(SRC)/ReplayEngine_demo.cxx
	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/ReplayEngine_demo.cxx -o $@

RedZone_demo: $(SRC)/RedZone_demo.cxx
	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/RedZone_demo.cxx -o $@

TVscript_demo: $(SRC)/TVscript_demo.c
	$(MPICC) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) $(SRC)/TVscript_demo.c -o $@ -lm

sudoku: ${SRC}/sudoku.c
	$(CC) $(CFLAGS) $(LDFLAGS) $(SRC)/sudoku.c -o $@ -lm

mem_example: ${SRC}/mem_example.cpp
	$(CXX) $(CFLAGS) $(LDFLAGS) $(SRC)/mem_example.cpp -o $@ -lm


cppview_demo: ${SRC}/cppview_demo.cxx ${SRC}/tv_data_display.c ${SRC}/tv_data_display.h
	$(CXX) $(CCFLAGS) $(LDFLAGS) ${SRC}/cppview_demo.cxx ${SRC}/tv_data_display.c -I${SRC} -o $@

# Libraries follow the objects that need them so the link also works with
# linkers that resolve symbols strictly left to right.
threads: ${SRC}/threads.cxx
	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/threads.cxx -o $@ -lpthread



clean:
	rm -f $(PROGRAMS) *.o

cuda-clean:
	rm -f $(CUDA_PROGRAMS) *.o
6 | 7 | ## Key Concepts 8 | 9 | CUDA Runtime API, Linear Algebra 10 | 11 | ## Supported SM Architectures 12 | 13 | [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus) 14 | 15 | ## Supported OSes 16 | 17 | Linux, Windows 18 | 19 | ## Supported CPU Architecture 20 | 21 | x86_64, ppc64le, armv7l, aarch64 22 | 23 | ## CUDA APIs involved 24 | 25 | ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) 26 | cudaStreamCreateWithFlags, cudaProfilerStop, cudaMalloc, cudaFree, cudaMallocHost, cudaProfilerStart, cudaEventSynchronize, cudaEventRecord, cudaFreeHost, cudaStreamSynchronize, cudaEventDestroy, cudaEventElapsedTime, cudaMemcpyAsync, cudaEventCreate 27 | 28 | ## Prerequisites 29 | 30 | Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 31 | 32 | ## Build and Run 33 | 34 | ### Windows 35 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: 36 | ``` 37 | *_vs.sln - for Visual Studio 38 | ``` 39 | Each individual sample has its own set of solution files in its directory: 40 | 41 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 
42 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. 43 | 44 | ### Linux 45 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: 46 | ``` 47 | $ cd <sample_dir> 48 | $ make 49 | ``` 50 | The sample makefiles can take advantage of certain options: 51 | * **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64. 52 | By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
53 | `$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
54 | See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. 55 | * **dbg=1** - build with debug symbols 56 | ``` 57 | $ make dbg=1 58 | ``` 59 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. 60 | ``` 61 | $ make SMS="50 60" 62 | ``` 63 | 64 | * **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. 65 | ``` 66 | $ make HOST_COMPILER=g++ 67 | ``` 68 | 69 | ## References (for more details) 70 | 71 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/gpu-nvidia-mmult/Makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 
################################################################################
#
# Makefile project only supported on Mac OS X and Linux Platforms
#
################################################################################

# Location of the CUDA Toolkit.  Only consulted when nvcc is not on PATH.
CUDA_PATH ?= /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda

# architecture
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
TARGET_SIZE := 64

# operating system
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)

HOST_COMPILER ?= g++

# Prefer the nvcc found on PATH (e.g. from a loaded module); fall back to the
# toolkit under CUDA_PATH so the command line never starts with a bare
# "-ccbin" when nvcc is not on PATH.
NVCC_BIN := $(shell command -v nvcc 2>/dev/null)
ifeq ($(NVCC_BIN),)
NVCC_BIN := $(CUDA_PATH)/bin/nvcc
endif
NVCC := $(NVCC_BIN) -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif

# Always a debug build: host (-g) and device (-G) debug information.
NVCCFLAGS += -g -G
BUILD_TYPE := debug

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

SAMPLE_ENABLED := 1

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -Icommon
LIBRARIES :=

################################################################################

# Gencode arguments
SMS = 50 52 60 61 70 75 80

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

ALL_CCFLAGS += --threads 0 --std=c++11

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# These targets are commands, not files.
.PHONY: all build run testrun clean clobber

# Target rules
all: build

build: matrixMul

matrixMul.o:matrixMul.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

# Links the executable and also copies it into the shared bin/ tree three
# directories up.
matrixMul: matrixMul.o
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)

run: build
	$(EXEC) ./matrixMul

testrun: build

clean:
	rm -f matrixMul matrixMul.o
	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/matrixMul

clobber: clean
-------------------------------------------------------------------------------- 1 | /* 2 | This is a derivative work of the cpi example program from MPICH2, 3 | which includes the following copyright notice: 4 | 5 | COPYRIGHT 6 | 7 | The following is a notice of limited availability of the code, and disclaimer 8 | which must be included in the prologue of the code and in all source listings 9 | of the code. 10 | 11 | Copyright Notice 12 | + 2002 University of Chicago 13 | 14 | Permission is hereby granted to use, reproduce, prepare derivative works, and 15 | to redistribute to others. This software was authored by: 16 | 17 | Argonne National Laboratory Group 18 | W. Gropp: (630) 252-4318; FAX: (630) 252-5986; e-mail: gropp@mcs.anl.gov 19 | E. Lusk: (630) 252-7852; FAX: (630) 252-5986; e-mail: lusk@mcs.anl.gov 20 | Mathematics and Computer Science Division 21 | Argonne National Laboratory, Argonne IL 60439 22 | 23 | 24 | GOVERNMENT LICENSE 25 | 26 | Portions of this material resulted from work developed under a U.S. 27 | Government Contract and are subject to the following license: the Government 28 | is granted for itself and others acting on its behalf a paid-up, nonexclusive, 29 | irrevocable worldwide license in this computer software to reproduce, prepare 30 | derivative works, and perform publicly and display publicly. 31 | 32 | DISCLAIMER 33 | 34 | This computer code material was prepared, in part, as an account of work 35 | sponsored by an agency of the United States Government. Neither the United 36 | States, nor the University of Chicago, nor any of their employees, makes any 37 | warranty express or implied, or assumes any legal liability or responsibility 38 | for the accuracy, completeness, or usefulness of any information, apparatus, 39 | product, or process disclosed, or represents that its use would not infringe 40 | privately owned rights. 
double f( double );
/* Integrand of the pi estimate: returns 4 / (1 + a^2). */
double f( double a ) {
    const double denom = 1.0 + a*a;   /* at least 1 for any real a */
    return 4.0 / denom;
}
// Exercises memalign() with a range of alignments and, after every call,
// asserts on the returned pointer.  Two variants are compiled:
//   - !MUSL_LIBC: every request must succeed, and the result must be aligned
//     to the value in the matching assert (8 for requests up to 8, otherwise
//     the next power of two: 9 -> 16, 33 -> 64, 4097 -> 8192).
//   - MUSL_LIBC:  requests of 3, 5, 7, 9, 31, 33, 4095 and 4097 must return
//     NULL; the remaining requests must succeed as above.
// Apart from the two big-superblock sequences at the start of each variant,
// the returned blocks are not freed: `p` is simply overwritten by the next
// call.
int main ( void )
{
   // Nb: assuming VG_MIN_MALLOC_SZB is 8 or more...
   int* p;
   int* piece;
   // The alignment checks below cast pointers to long, so both types must
   // have the same size.
   assert(sizeof(long int) == sizeof(void*));

#if !defined(MUSL_LIBC)
   // Check behaviour of memalign/free for big alignment.
   // In particular, the below aims at checking that a
   // superblock with a big size is not marked as reclaimable
   // if the superblock is used to provide a big aligned block
   // (see bug 250101, comment #14).
   // Valgrind m_mallocfree.c will allocate a big superblock for the memalign
   // call and will split it in two. This split superblock was
   // wrongly marked as reclaimable, which was then causing
   // assert failures (as reclaimable blocks cannot be split).
   p = memalign(1024 * 1024, 4 * 1024 * 1024 + 1);
   assert(p && (0 == (long)p % (1024 * 1024)));
   // We allocate (and then free) a piece of memory smaller than
   // the hole created in the big superblock.
   // If the superblock is marked as reclaimable, the below free(s) will cause
   // an assert. Note that the test has to be run with a --free-list-vol
   // parameter smaller than the released blocks size to ensure the free is directly
   // executed (otherwise memcheck does not really release the memory and so
   // the bug is not properly tested).
   piece = malloc(1024 * 1000);
   assert (piece);
   free (piece);
   free (p);

   // Same as above but do the free in the reverse order.
   p = memalign(1024 * 1024, 4 * 1024 * 1024 + 1);
   assert(p && (0 == (long)p % (1024 * 1024)));
   piece = malloc(1024 * 100);
   assert (piece);
   free (p);
   free (piece);

   // Small alignments: every request from 0 to 8 must yield an 8-byte
   // aligned block.
   p = memalign(0, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(1, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(2, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(3, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(4, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(5, 100);
   assert(p && (0 == (long)p % 8));

   p = memalign(7, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(8, 100);
   assert(p && (0 == (long)p % 8));
   // Just above a power of two: the result must be aligned to the next one.
   p = memalign(9, 100);
   assert(p && (0 == (long)p % 16));

   p = memalign(31, 100);
   assert(p && (0 == (long)p % 32));
   p = memalign(32, 100);
   assert(p && (0 == (long)p % 32));
   p = memalign(33, 100);
   assert(p && (0 == (long)p % 64));

   p = memalign(4095, 100);
   assert(p && (0 == (long)p % 4096));
   p = memalign(4096, 100);
   assert(p && (0 == (long)p % 4096));
   p = memalign(4097, 100);
   assert(p && (0 == (long)p % 8192));

   p = memalign(4 * 1024 * 1024, 100);
   assert(p && (0 == (long)p % (4 * 1024 * 1024)));
   p = memalign(16 * 1024 * 1024, 100);
   assert(p && (0 == (long)p % (16 * 1024 * 1024)));

   // size 0
   p = memalign(256, 0);
   assert(p && (0 == (long)p % 256));
#else
   // MUSL variant: same two big-superblock sequences as above (free in
   // allocation order, then in reverse order).
   p = memalign(1024 * 1024, 4 * 1024 * 1024 + 1);
   assert(p && (0 == (long)p % (1024 * 1024)));
   piece = malloc(1024 * 1000); assert (piece);
   free (piece);
   free (p);
   p = memalign(1024 * 1024, 4 * 1024 * 1024 + 1);
   assert(p && (0 == (long)p % (1024 * 1024)));
   piece = malloc(1024 * 100);
   assert (piece);
   free (p);
   free (piece);

   // errno is reset before most requests that are expected to fail, but
   // the errno checks themselves are commented out, so only the NULL
   // result is verified.
   errno = 0;
   p = memalign(0, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(1, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(2, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(3, 100);
   assert(!p);
   //assert(errno == EINVAL);
   errno = 0;
   p = memalign(4, 100);
   assert(p && 0 == (long)p % 8);
   p = memalign(5, 100);
   assert(!p);
   //assert(errno == EINVAL);
   errno = 0;
   p = memalign(7, 100);
   assert(!p);
   //assert(errno == EINVAL);
   errno = 0;
   p = memalign(8, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(9, 100);
   assert(!p);
   //assert(errno == EINVAL);
   errno = 0;
   p = memalign(31, 100);
   assert(!p);
   //assert(errno == EINVAL);
   p = memalign(32, 100);
   assert(p && (0 == (long)p % 32));
   errno = 0;
   p = memalign(33, 100);
   assert(!p);
   //assert(errno == EINVAL);
   errno = 0;
   p = memalign(4095, 100);
   assert(!p);
   //assert(errno == EINVAL);
   p = memalign(4096, 100);
   assert(p && (0 == (long)p % 4096));
   errno = 0;
   p = memalign(4097, 100);
   assert(!p);
   //assert(errno == EINVAL);

   p = memalign(4 * 1024 * 1024, 100);
   assert(p && (0 == (long)p % (4 * 1024 * 1024)));
   p = memalign(16 * 1024 * 1024, 100);
   assert(p && (0 == (long)p % (16 * 1024 * 1024)));

   // size 0
   p = memalign(256, 0);
   assert(p && (0 == (long)p % 256));
#endif
}
-------------------------------------------------------------------------------- 1 | /* compile with mpiCC -o demoMpi demoMpi.C -g -lm */ 2 | /* run with mpirun -np 10 -tv demoMpi for Message Queue and 3 | run with mpirun -np 4 -tv demoMpi for Lamination and broken links */ 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | template 12 | T getMax(T* A,int b); 13 | 14 | template 15 | T* get_full_domain(int a); 16 | 17 | int BUFLEN=512, NSTEPS=10; 18 | 19 | char *sendMessage(char *sbuf, char *recbuf, int dest, int source) 20 | { 21 | MPI_Status status; 22 | int my_mpi_comm_world=MPI_COMM_WORLD; 23 | int my_mpi_char=MPI_CHAR; 24 | 25 | MPI_Recv(recbuf, BUFLEN, my_mpi_char, source, 99, my_mpi_comm_world, &status); 26 | MPI_Send(sbuf, strlen(sbuf)+1, my_mpi_char, dest, 99, my_mpi_comm_world); 27 | return recbuf; 28 | } 29 | 30 | char *f2(char *sbuf, char *recbuf, int dest, int source) 31 | { 32 | return sendMessage(sbuf, recbuf, dest, source ); 33 | } 34 | 35 | char *f1(char *sbuf, char *recbuf, int dest, int source) 36 | { 37 | return f2(sbuf, recbuf, dest, source ); 38 | } 39 | 40 | char *g2(char *sbuf, char *recbuf, int dest, int source) 41 | { 42 | return sendMessage(sbuf, recbuf, dest, source ); 43 | } 44 | 45 | char *g1(char *sbuf, char *recbuf, int dest, int source) 46 | { 47 | return g2(sbuf, recbuf, dest, source ); 48 | } 49 | 50 | 51 | int main(int argc, char *argv[]) 52 | { 53 | int root=0,full_domain_length,sub_domain_length; 54 | double global_max,local_max; 55 | double *full_domain,*sub_domain; 56 | int myid, numprocs, next, namelen, previous; 57 | int default_length=1000; 58 | char* sendBuffer=new char[BUFLEN]; 59 | char* recvBuffer=new char[BUFLEN]; 60 | char processor_name[MPI_MAX_PROCESSOR_NAME]; 61 | int my_mpi_comm_world=MPI_COMM_WORLD; 62 | int my_mpi_double=MPI_DOUBLE; 63 | int my_mpi_max=MPI_MAX; 64 | int my_mpi_int=MPI_INT; 65 | MPI_Init(&argc,&argv); 66 | // sleep(1); 67 | MPI_Comm_size(my_mpi_comm_world,&numprocs); 
68 | MPI_Comm_rank(my_mpi_comm_world,&myid); 69 | MPI_Get_processor_name(processor_name,&namelen); 70 | 71 | MPI_Get_processor_name(processor_name,&namelen); 72 | if( myid%2 == 0) sleep( 2 ); 73 | 74 | fprintf(stderr,"Process %d on %s\n",myid,processor_name); 75 | sprintf(sendBuffer,"hello there from %d on %s",myid,processor_name); 76 | next = myid+4; /* set a barrier here */ 77 | if(next>=numprocs) 78 | next-=numprocs; 79 | previous = myid-4; 80 | if(previous<0) 81 | previous+=numprocs; 82 | 83 | /* Part 1: Deadlock and code patching */ 84 | /* After hitting deadlock, enable eval points and restart program */ 85 | if (myid%2==0){ 86 | f1(sendBuffer, recvBuffer, next, previous); 87 | } else { 88 | g1(sendBuffer, recvBuffer, next, previous ); 89 | } 90 | 91 | fprintf(stderr,"%d get '%s'\n",myid,recvBuffer); 92 | MPI_Barrier(my_mpi_comm_world); 93 | 94 | /* Part 2: Collective communication */ 95 | /* 96 | * Root obtains full domain and broadcasts its length. 97 | */ 98 | if (myid == root) { 99 | if( argc > 2) full_domain_length=atoi((char*)argv[2]); 100 | else full_domain_length=default_length; 101 | full_domain=get_full_domain(full_domain_length); 102 | } 103 | MPI_Bcast(&full_domain_length, 1, my_mpi_int, root, my_mpi_comm_world); 104 | /* 105 | * Allocate subdomain memory. 106 | * Scatter the initial dataset among the processes. 107 | */ 108 | sub_domain_length = full_domain_length / numprocs; 109 | sub_domain = new double[sub_domain_length]; 110 | 111 | MPI_Scatter(full_domain, sub_domain_length, my_mpi_double, 112 | sub_domain, sub_domain_length, my_mpi_double, 113 | root, my_mpi_comm_world); 114 | /* 115 | * Loop computing and determining max values. 116 | Stop here and observe effect of Scatter. Dive on sub_domain and show array of size 117 | sub_domain_length. Laminate and visualize. Then hit Go. all subarrays will be sorted 118 | in parallel and their local max will be returned. 
// qsort() comparison callback for elements of type T.
// `a` and `b` point at two elements of the array being sorted.  Returns -1,
// 0 or 1 when *a is less than, equal to, or otherwise ordered after *b
// (ascending order).
template <class T>
int myCmp(const void *a, const void *b){
  const T& lhs=*static_cast<const T*>(a);
  const T& rhs=*static_cast<const T*>(b);
  if(lhs<rhs) return -1;
  return (lhs==rhs)?0:1;
}

// Sorts `array` (of `length` elements) in place in ascending order and
// returns its largest element.  The sort is a visible side effect: main()'s
// comments tell the user to inspect the sorted sub-arrays afterwards.
// An empty or null array cannot be indexed, so in that case the array is
// left untouched and a value-initialized T is returned.
template <class T>
T getMax(T* array,int length){
  if(!array || length<=0)
    return T();
  // T cannot be deduced from qsort's callback parameter, so it is named
  // explicitly.
  qsort(array,length,sizeof(T),&myCmp<T>);
  return array[length-1];
}
4 | 5 | - `free_twice.f90`: Free twice 6 | - `heap_overflow_underflow.f90`: Out of bound array references 7 | - `memory_leaks.f90`: memory leaks 8 | - `segfault.f90`: segfault 9 | 10 | ## Detecting memory leaks 11 | 12 | `memory_leaks.f90` has total memory leaks of (4n + 8n) × 10 13 | = 120n = 120 × 1000000 bytes = 120MB from 10+20=30 memory 14 | blocks. 15 | 16 | Build as follows: 17 | 18 | ``` 19 | $ ftn -g -O0 -o memory_leaks memory_leaks.f90 20 | ``` 21 | 22 | ### With Valgrind 23 | 24 | Run: 25 | 26 | ``` 27 | $ srun -n 4 valgrind --leak-check=full --log-file=memory_leaks.%q{SLURM_JOB_ID}.%q{SLURM_PROCID}.out ./memory_leaks 28 | 29 | $ cat memory_leaks.32347754.0.out 30 | ... 31 | ==1392038== HEAP SUMMARY: 32 | ==1392038== in use at exit: 120,095,997 bytes in 632 blocks 33 | ==1392038== total heap usage: 828 allocs, 196 frees, 161,833,032 bytes allocated 34 | ==1392038== 35 | ==1392038== 4,000,000 bytes in 1 blocks are possibly lost in loss record 603 of 607 36 | ==1392038== at 0x4E056A4: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) 37 | ==1392038== by 0x400F56: sub_bad_ (memory_leaks.f90:40) 38 | ==1392038== by 0x401273: MAIN__ (memory_leaks.f90:17) 39 | ==1392038== by 0x40134A: main (memory_leaks.f90:6) 40 | ==1392038== 41 | ==1392038== 4,000,000 bytes in 1 blocks are possibly lost in loss record 604 of 607 42 | ==1392038== at 0x4E056A4: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) 43 | ==1392038== by 0x400C18: sub_badx2_ (memory_leaks.f90:50) 44 | ==1392038== by 0x40129F: MAIN__ (memory_leaks.f90:20) 45 | ==1392038== by 0x40134A: main (memory_leaks.f90:6) 46 | ==1392038== 47 | ==1392038== 36,000,000 bytes in 9 blocks are definitely lost in loss record 605 of 607 48 | ==1392038== at 0x4E056A4: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) 49 | ==1392038== by 0x400F56: sub_bad_ (memory_leaks.f90:40) 50 | ==1392038== by 0x401273: MAIN__ (memory_leaks.f90:17) 51 | ==1392038== by 0x40134A: main 
(memory_leaks.f90:6) 52 | ==1392038== 53 | ==1392038== 36,000,000 bytes in 9 blocks are definitely lost in loss record 606 of 607 54 | ==1392038== at 0x4E056A4: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) 55 | ==1392038== by 0x400C18: sub_badx2_ (memory_leaks.f90:50) 56 | ==1392038== by 0x40129F: MAIN__ (memory_leaks.f90:20) 57 | ==1392038== by 0x40134A: main (memory_leaks.f90:6) 58 | ==1392038== 59 | ==1392038== 40,000,000 bytes in 10 blocks are definitely lost in loss record 607 of 607 60 | ==1392038== at 0x4E056A4: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so) 61 | ==1392038== by 0x400D97: sub_badx2_ (memory_leaks.f90:53) 62 | ==1392038== by 0x40129F: MAIN__ (memory_leaks.f90:20) 63 | ==1392038== by 0x40134A: main (memory_leaks.f90:6) 64 | ==1392038== 65 | ==1392038== LEAK SUMMARY: 66 | ==1392038== definitely lost: 112,000,000 bytes in 28 blocks 67 | ==1392038== indirectly lost: 0 bytes in 0 blocks 68 | ==1392038== possibly lost: 8,000,000 bytes in 2 blocks 69 | ==1392038== still reachable: 95,997 bytes in 602 blocks 70 | ==1392038== suppressed: 0 bytes in 0 blocks 71 | ... 72 | ``` 73 | 74 | The output is a bit difficult to understand. Some blocks are 75 | classified as definitely lost while others are classified as possibly 76 | lost. An important thing may be that the code has memory leaks of 77 | 112,000,000 + 8,000,000 = 120MB, as predicted. 78 | 79 | The tool also reports memory leaks of 95,997 bytes, probably in the 80 | system libraries. It's worthwhile trying to suppress such errors 81 | with the suppression files provided by HPE in Valgrind4hpc, 82 | `$VALGRIND4HPC_BASEDIR/share/suppressions/{known,libmpich_cray,libpmi,misc}.supp`. 83 | You can use these suppression files with the `--suppressions=...` 84 | flags and rerun the Valgrind run. 
85 | 86 | ### With Valgrind4hpc 87 | 88 | Valgrind4hpc is an HPE tool that aggregates duplicate Valgrind 89 | messages across MPI processes; it is explained in the Valgrind 90 | memcheck README's [Valgrind4hpc](../Valgrind/memcheck/README.md#Valgrind4hpc) section. 91 | Run it as follows. 92 | 93 | ``` 94 | $ module load valgrind4hpc 95 | $ valgrind4hpc -n 4 --valgrind-args="--leak-check=full" ./memory_leaks 96 | RANKS: <0..3> 97 | 98 | 4,000,000 bytes in 1 blocks are possibly lost in loss record 603 of 607 99 | at malloc (in vg_replace_malloc.c:393) 100 | by sub_badx2_ (in memory_leaks.f90:50) 101 | by MAIN__ (in memory_leaks.f90:20) 102 | by main (in memory_leaks.f90:6) 103 | 104 | RANKS: <0..3> 105 | 106 | 8,000,000 bytes in 2 blocks are possibly lost in loss record 604 of 607 107 | at malloc (in vg_replace_malloc.c:393) 108 | by sub_bad_ (in memory_leaks.f90:40) 109 | by MAIN__ (in memory_leaks.f90:17) 110 | by main (in memory_leaks.f90:6) 111 | 112 | RANKS: <0..3> 113 | 114 | 32,000,000 bytes in 8 blocks are definitely lost 115 | at malloc (in vg_replace_malloc.c:393) 116 | by sub_bad_ (in memory_leaks.f90:40) 117 | by MAIN__ (in memory_leaks.f90:17) 118 | by main (in memory_leaks.f90:6) 119 | 120 | RANKS: <0..3> 121 | 122 | 36,000,000 bytes in 9 blocks are definitely lost 123 | at malloc (in vg_replace_malloc.c:393) 124 | by sub_badx2_ (in memory_leaks.f90:50) 125 | by MAIN__ (in memory_leaks.f90:20) 126 | by main (in memory_leaks.f90:6) 127 | 128 | RANKS: <0..3> 129 | 130 | 40,000,000 bytes in 10 blocks are definitely lost 131 | at malloc (in vg_replace_malloc.c:393) 132 | by sub_badx2_ (in memory_leaks.f90:53) 133 | by MAIN__ (in memory_leaks.f90:20) 134 | by main (in memory_leaks.f90:6) 135 | 136 | RANKS: <0..3> 137 | 138 | HEAP SUMMARY: 139 | in use at exit: 120000000 bytes in 30 blocks 140 | 141 | LEAK SUMMARY: 142 | definitely lost: 108000000 bytes in 27 blocks 143 | indirectly lost: 0 bytes in 0 blocks 144 | possibly lost: 12000000 bytes in 3 blocks 145 | still reachable:
0 bytes in 0 blocks 146 | 147 | ERROR SUMMARY: 0 errors from 0 contexts (suppressed 601) 148 | ``` 149 | 150 | Again the report is a bit difficult to understand, and some blocks 151 | are classified as definitely lost while others are classified as 152 | possibly lost. An important thing again may be that there were 153 | memory leaks of total 120MB from 30 blocks. The memory leaks of 154 | 95,997 bytes reported by Valgrind in the previous section must be 155 | indeed due to leaks in system libraries. 156 | 157 | ## Other example codes with Valgrind 158 | 159 | We see the codes `free_twice.f90` and `heap_overflow_underflow.f90` 160 | fail without Valgrind giving useful info. 161 | 162 | -------------------------------------------------------------------------------- /TotalView/programs/combined.TVD.v4breakpoints: -------------------------------------------------------------------------------- 1 | # Magic: LR-70-3595585-9ER 2 | # Generated file -- DO NOT EDIT 3 | # Breakpoint list saved by Linux x86_64 TotalView 2017.2.11 4 | 5 | dset TV::Private::saved_breakpoint_actual_format 4 6 | dset TV::Private::saved_breakpoint_actual_revision 0 7 | 8 | namespace eval TV::Private { 9 | 10 | BP_start 1 11 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 12 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#66} 13 | BP_set InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#traverseArray(int)#$b1} 14 | BP_set InnerUnitLineOffset {0} 15 | BP_set Flags {g 1 p g g} 16 | BP_set SatSet {C} 17 | BP_set BlocksEnabled {0x3} 18 | BP_set BlockCount {3} 19 | BP_set BlockAddress {0x004018c9} 20 | BP_set SourceText { for (int i = 0; i < n; i++)} 21 | BP_set Instruction {movl $0,-20(%rbp)} 22 | BP_done 1 23 | 24 | BP_start 2 25 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 26 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#94} 27 | BP_set InnerUnitPath 
{##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#cleanup(void)} 28 | BP_set InnerUnitLineOffset {1} 29 | BP_set Flags {g 1 p g g} 30 | BP_set SatSet {C} 31 | BP_set SourceText { exit(-1);} 32 | BP_set Instruction {movl $-1,%edi} 33 | BP_done 2 34 | 35 | BP_start 3 36 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 37 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#154} 38 | BP_set InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#derived_class(void)#$b1} 39 | BP_set InnerUnitLineOffset {0} 40 | BP_set Flags {g 1 p g g} 41 | BP_set SatSet {C} 42 | BP_set SourceText { Base1 b1('A'), *base1Ptr;} 43 | BP_set Instruction {leal -129(%rbp),%rax} 44 | BP_done 3 45 | 46 | BP_start 4 47 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 48 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#169} 49 | BP_set InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#derived_class(void)#$b1} 50 | BP_set InnerUnitLineOffset {15} 51 | BP_set Flags {g 1 p g g} 52 | BP_set SatSet {C} 53 | BP_set SourceText { <<"\\nd contains " << dd ;} 54 | BP_set Instruction {movl $0x405eef,%esi} 55 | BP_done 4 56 | 57 | BP_start 5 58 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 59 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#224} 60 | BP_set InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#stl_view(void)#$b1} 61 | BP_set InnerUnitLineOffset {10} 62 | BP_set Flags {g 1 p g g} 63 | BP_set SatSet {C} 64 | BP_set SourceText { lb1.push_back(b1); } 65 | BP_set Instruction {leal -138(%rbp),%rdx} 66 | BP_done 5 67 | 68 | BP_start 6 69 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 70 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#244} 71 | BP_set InnerUnitPath 
{##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#stl_view(void)#$b1} 72 | BP_set InnerUnitLineOffset {30} 73 | BP_set Flags {g 1 p g g} 74 | BP_set SatSet {C} 75 | BP_set SourceText { s1="this is another string";} 76 | BP_set Instruction {leal -128(%rbp),%rax} 77 | BP_done 6 78 | 79 | BP_start 7 80 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 81 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#357} 82 | BP_set InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#user_templates(void)#$b1} 83 | BP_set InnerUnitLineOffset {5} 84 | BP_set Flags {g 1 p g g} 85 | BP_set SatSet {C} 86 | BP_set SourceText { vect i = vect(5, ia);} 87 | BP_set Instruction {leal -80(%rbp),%rdx} 88 | BP_done 7 89 | 90 | BP_start 8 91 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 92 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#364} 93 | BP_set InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#user_templates(void)#$b1} 94 | BP_set InnerUnitLineOffset {12} 95 | BP_set Flags {g 1 p g g} 96 | BP_set SatSet {C} 97 | BP_set SourceText { printArray(i.v, i.len);} 98 | BP_set Instruction {mov -112(%rbp),%edx} 99 | BP_done 8 100 | 101 | BP_start 9 102 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 103 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#514} 104 | BP_set InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#arrays(void)#$b1#$b1#$b1} 105 | BP_set InnerUnitLineOffset {3} 106 | BP_set Flags {g 1 p g g} 107 | BP_set SatSet {C} 108 | BP_set SourceText { vol[i][j] = cylinder.volume();} 109 | BP_set Instruction {mov %r12,%rbx} 110 | BP_done 9 111 | 112 | BP_start 10 113 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 114 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#592} 115 | BP_set InnerUnitPath 
{##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#combine_waves_worker(void*)#$b1} 116 | BP_set InnerUnitLineOffset {4} 117 | BP_set Flags {g 1 p g g} 118 | BP_set SatSet {C} 119 | BP_set SourceText { temp=arg->a[j]+arg->b[j];} 120 | BP_set Instruction {mov -8(%rbp),%rax} 121 | BP_done 10 122 | 123 | BP_start 11 124 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 125 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#608} 126 | BP_set InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#parallel_combine_waves(float*,float*)#$b1} 127 | BP_set InnerUnitLineOffset {6} 128 | BP_set Flags {g 1 p g g} 129 | BP_set SatSet {C} 130 | BP_set SourceText { args=initialize_args(a, b, args);} 131 | BP_set Instruction {mov -8(%rbp),%rdx} 132 | BP_done 11 133 | 134 | BP_start 12 135 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 136 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#670} 137 | BP_set InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#pthreads_loop(void)#$b1} 138 | BP_set InnerUnitLineOffset {7} 139 | BP_set Flags {g 1 p g g} 140 | BP_set SatSet {C} 141 | BP_set SourceText { simple_wave(component,period,amplitude);} 142 | BP_set Instruction {mov -804(%rbp),%edx} 143 | BP_done 12 144 | 145 | BP_start 13 146 | BP_set ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29} 147 | BP_set LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#717} 148 | BP_set InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#diveinall(void)#$b1} 149 | BP_set InnerUnitLineOffset {18} 150 | BP_set Flags {g 1 p g g} 151 | BP_set SatSet {C} 152 | BP_set SourceText { bb = cc = 0xffffffffLL;} 153 | BP_set Instruction {movl $-1,%eax} 154 | BP_done 13 155 | 156 | } 157 | -------------------------------------------------------------------------------- 
/Linaro-Forge/correctness/gpu-nvidia-mmult/common/exception.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | /* CUda UTility Library */ 29 | #ifndef COMMON_EXCEPTION_H_ 30 | #define COMMON_EXCEPTION_H_ 31 | 32 | // includes, system 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | //! 
//! Exception wrapper.
//! Derives from a standard exception type and can only be created through the
//! static throw_it() helpers, which prepend the throwing file and line to the
//! message returned by what().
//! @param Std_Exception Exception out of namespace std for easy typing.
template <class Std_Exception>
class Exception : public Std_Exception {
 public:
  //! @brief Static construction interface
  //! @return Always throws ( Located_Exception)
  //! @param file file in which the Exception occurs
  //! @param line line in which the Exception occurs
  //! @param detailed details on the code fragment causing the Exception
  static void throw_it(const char *file, const int line,
                       const char *detailed = "-");

  //! Static construction interface
  //! @return Always throws ( Located_Exception)
  //! @param file file in which the Exception occurs
  //! @param line line in which the Exception occurs
  //! @param detailed details on the code fragment causing the Exception
  static void throw_it(const char *file, const int line,
                       const std::string &detailed);

  //! Destructor
  virtual ~Exception() throw();

 private:
  //! Constructor, default (private)
  Exception();

  //! Constructor, standard
  //! @param str string returned by what()
  explicit Exception(const std::string &str);
};

////////////////////////////////////////////////////////////////////////////////
//! Exception handler function for arbitrary exceptions.
//! Prints ex.what() to std::cerr and terminates the process with EXIT_FAILURE.
//! @param ex exception to handle
////////////////////////////////////////////////////////////////////////////////
template <class Exception_Typ>
inline void handleException(const Exception_Typ &ex) {
  std::cerr << ex.what() << std::endl;

  exit(EXIT_FAILURE);
}

//! Convenience macros

//! Exception caused by dynamic program behavior, e.g. file does not exist
#define RUNTIME_EXCEPTION(msg) \
  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)

//! Logic exception in program, e.g. an assert failed
#define LOGIC_EXCEPTION(msg) \
  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)

//! Out of range exception
#define RANGE_EXCEPTION(msg) \
  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)

////////////////////////////////////////////////////////////////////////////////
//! Implementation

// includes, system
#include <sstream>

////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! @param Exception causing code fragment (file and line) and detailed infos.
////////////////////////////////////////////////////////////////////////////////
/*static*/ template <class Std_Exception>
void Exception<Std_Exception>::throw_it(const char *file, const int line,
                                        const char *detailed) {
  std::stringstream s;

  // Quite heavy-weight but exceptions are not for
  // performance / release versions
  s << "Exception in file '" << file << "' in line " << line << "\n"
    << "Detailed description: " << detailed << "\n";

  throw Exception(s.str());
}

////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! Forwards to the const char* overload.
//! @param Exception causing code fragment (file and line) and detailed infos.
////////////////////////////////////////////////////////////////////////////////
/*static*/ template <class Std_Exception>
void Exception<Std_Exception>::throw_it(const char *file, const int line,
                                        const std::string &msg) {
  throw_it(file, line, msg.c_str());
}

////////////////////////////////////////////////////////////////////////////////
//! Constructor, default (private).
////////////////////////////////////////////////////////////////////////////////
template <class Std_Exception>
Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}

////////////////////////////////////////////////////////////////////////////////
//! Constructor, standard (private).
//! String returned by what().
////////////////////////////////////////////////////////////////////////////////
template <class Std_Exception>
Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}

////////////////////////////////////////////////////////////////////////////////
//! Destructor
////////////////////////////////////////////////////////////////////////////////
template <class Std_Exception>
Exception<Std_Exception>::~Exception() throw() {}

// functions, exported
# In no event shall the authors or copyright holders be liable for any claim, damages
# or other liability, whether in action or contract, tort or otherwise, arising
# from, out of or in connection with the Software or the use of Software.
# ===============================================================================

#!/usr/bin/env python
from __future__ import division
from __future__ import print_function
from __future__ import with_statement



import argparse
import ctypes
import os
import sys
from ctypes import c_int

import numpy
import mpi4py

# Enable MPI SINGLE thread
# The mpi4py.rc options are only honoured if they are set before the
# mpi4py.MPI module is imported, so they must precede the import below.
mpi4py.rc.threaded = False
mpi4py.rc.thread_level = "single"

from mpi4py import MPI
from numpy.ctypeslib import ndpointer
from scipy.linalg import blas


# Check if C kernel has been compiled
try:
    # Only probing for existence: close the handle straight away.
    with open("libmmult_c.so", 'r'):
        pass
except FileNotFoundError:
    print("C kernel not found. Please run 'make -f mmult_py.makefile' "
          "to compile it before running this script")
    sys.exit(1)

# Check and load F90 kernel
sys.path.insert(0, '.')
try:
    import libmmult_f
except ImportError:
    print("F90 kernel not found. Please run 'make -f mmult_py.makefile' "
          "to compile it before running this script")
    sys.exit(1)

# Load C kernel
C_MMULT_LIB = ctypes.CDLL(os.path.join(os.path.dirname(__file__), "libmmult_c.so"))

# Declare ctype for ndarray pointer
arr_ptr_t_c = ndpointer(dtype=numpy.float64, ndim=1, flags='C')
C_MMULT_LIB.mmult.argtypes = [c_int, c_int, arr_ptr_t_c, arr_ptr_t_c, arr_ptr_t_c]
C_MMULT_LIB.mmult.restype = None

DEFAULT_SIZE = 64
DEFAULT_FN = "res_Py.mat"
DEFAULT_KERNEL = "C"
SOLVER_CHOICES = ["C", "F90", "Py"]


def minit(sz, fortran_style_array_order, A):
    """Fill the sz x sz matrix A in place with A(i, j) = i * (j + 1).

    When fortran_style_array_order is true, A is indexed as a 2-D array
    (A[i, j]); otherwise A is a flat buffer indexed row-major (A[i*sz + j]).
    """
    for i in range(0, sz):
        for j in range(0, sz):
            if fortran_style_array_order:
                A[i, j] = i*(j+1)
            else:
                A[i*sz+j] = i*(j+1)


def mwrite(A, fn):
    """Write array A to the text file fn, tab-separated, using "%g" formatting."""
    # The context manager closes the file even if tofile() raises.
    with open(fn, "w") as f:
        A.tofile(f, sep="\t", format="%g")


def main(sz, kernel, filename):
    """Run the MPI matrix-product demo.

    Rank 0 allocates and initializes three sz x sz matrices, sends every other
    rank its slice of A and C together with all of B, each rank runs the
    selected kernel on its data, and rank 0 collects the C slices and writes
    the result to filename.

    sz       -- requested matrix dimension; rounded down to a multiple of the
                number of processes
    kernel   -- one of SOLVER_CHOICES ("C", "F90" or "Py")
    filename -- output file written by rank 0
    """
    intercomm = MPI.Comm.Get_parent()

    comm = MPI.COMM_WORLD
    nproc = comm.size
    mr = comm.rank

    if mr == 0:
        print("-------------------------------------------------------------------\n"
              "This program contains an intentional bug. See the 'Worked Examples'\n"
              "section of the Linaro Forge user guide for more information:\n"
              "https://docs.linaroforge.com/latest/html/forge/index.html or\n"
              "../doc/userguide-forge.pdf\n"
              "-------------------------------------------------------------------\n")

    remainder = sz%nproc

    if remainder > 0:
        if mr == 0:
            print("{}: Info: reducing SIZE {} to {} to be a multiple of number of "
                  "processes ({})".format(mr, sz, sz-remainder, nproc))
        sz = sz-remainder

    # Elements per rank in the flat (C-order) layout, and columns per rank in
    # the 2-D (Fortran-order) layout.
    mslice = int(sz*sz/nproc)
    mslice_r = int(sz/nproc)

    fortran_style_array_order = kernel in ('F90', 'Py')

    if kernel == "F90":
        print("Skipping due to the Fortran function call ommiting some call arguments in Python")
        sys.exit(0)

    if mr == 0:
        print("{rank}: Size of the matrices: {size}x{size}".format(rank=mr, size=sz))
        print("{}: Kernel: {}".format(mr, kernel))

    if mr == 0:
        if fortran_style_array_order:
            mat_a = numpy.ndarray(shape=(sz, sz), dtype='d', order='F')
            mat_b = numpy.ndarray(shape=(sz, sz), dtype='d', order='F')
            mat_c = numpy.ndarray(shape=(sz, sz), dtype='d', order='F')
        else:
            mat_a = numpy.ndarray(shape=(sz*sz), dtype='d', order='C')
            mat_b = numpy.ndarray(shape=(sz*sz), dtype='d', order='C')
            mat_c = numpy.ndarray(shape=(sz*sz), dtype='d', order='C')

        print("{}: Initializing matrices...".format(mr))
        minit(sz, fortran_style_array_order, mat_a)
        minit(sz, fortran_style_array_order, mat_b)
        minit(sz, fortran_style_array_order, mat_c)

        print("{}: Sending matrices".format(mr))
        for i in range(1, nproc):
            # Get a slice from the mat_a and mat_c matrix
            if fortran_style_array_order:
                # NOTE(review): the A slice is taken from mat_c here, unlike the
                # C-order branch below which slices mat_a. The program announces
                # an intentional bug, so this is left untouched -- confirm.
                mat_a_slice = mat_c[:, i*mslice_r:(i+1)*mslice_r]
                mat_c_slice = mat_c[:, i*mslice_r:(i+1)*mslice_r]
            else:
                mat_a_slice = mat_a[i*mslice:(i+1)*mslice]
                mat_c_slice = mat_c[i*mslice:(i+1)*mslice]
            # Tags encode the payload: i -> A slice, 100+i -> B, 200+i -> C slice.
            comm.send(mat_a_slice, dest=i, tag=i)
            comm.send(mat_b, dest=i, tag=100+i)
            comm.send(mat_c_slice, dest=i, tag=200+i)
    else:
        print("{}: Receiving matrices".format(mr))
        if fortran_style_array_order:
            mat_a = numpy.ndarray(shape=(sz, mslice_r), dtype='d', order='F')
            mat_b = numpy.ndarray(shape=(sz, sz), dtype='d', order='F')
            mat_c = numpy.ndarray(shape=(sz, mslice_r), dtype='d', order='F')
        else:
            mat_a = numpy.ndarray(shape=(mslice), dtype='d', order='C')
            mat_b = numpy.ndarray(shape=(sz*sz), dtype='d', order='C')
            mat_c = numpy.ndarray(shape=(mslice), dtype='d', order='C')

        # recv() returns new objects, which replace the arrays allocated above.
        mat_a = comm.recv(source=0, tag=mr)
        mat_b = comm.recv(source=0, tag=100+mr)
        mat_c = comm.recv(source=0, tag=200+mr)

    # Processing
    print("{}: Processing..".format(mr))
    if kernel == "F90":
        # f2py makes sz parameter optional
        libmmult_f.mmult(nproc, mat_a, mat_b, mat_c)

    elif kernel == "Py":
        mat_c = blas.dgemm(alpha=1.0, a=mat_b, b=mat_a, beta=1.0, c=mat_c,
                           overwrite_c=True, trans_b=False)
    else:
        C_MMULT_LIB.mmult(sz, nproc, mat_a, mat_b, mat_c)

    if mr == 0:
        print("{}: Receiving result matrix...".format(mr))
        for i in range(1, nproc):
            if fortran_style_array_order:
                mat_c[:, i*mslice_r:(i+1)*mslice_r] = comm.recv(source=i, tag=500+i)
            else:
                mat_c[i*mslice:(i+1)*mslice] = comm.recv(source=i, tag=500+i)
    else:
        print("{}: Sending result matrix...".format(mr))
        comm.send(mat_c, dest=0, tag=500+mr)

    # Writing result
    if mr == 0:
        mwrite(mat_c, filename)

    if mr == 0:
        print("{}: Done".format(mr))

    # Only synchronize with a parent when this job was spawned by one.
    if intercomm != MPI.COMM_NULL:
        intercomm.Barrier()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Matrix product.")
    parser.add_argument("-k", dest="kernel", metavar="KERNEL", action="store", type=str,
                        help=("Solver. Options: [%s] (default is C)"
                              % "|".join(SOLVER_CHOICES)),
                        choices=SOLVER_CHOICES, default=DEFAULT_KERNEL)
    parser.add_argument("-s", dest="mat_size", metavar="SIZE", action="store", type=int,
                        help=("size of the matrix to compute (default is %d)"
                              % (DEFAULT_SIZE)),
                        default=DEFAULT_SIZE)
    parser.add_argument("-o", dest="fn", metavar="FILENAME", action="store", type=str,
                        help=("output matrix file name (default is %s)" % DEFAULT_FN),
                        default=DEFAULT_FN)

    args = parser.parse_args()

    main(args.mat_size, args.kernel, args.fn)
13 | ***********************************************************************/ 14 | #include "myClassA.hxx" 15 | #include "myClassB.hxx" 16 | 17 | #ifdef USEMPI 18 | #include 19 | 20 | int rank; 21 | 22 | #endif //USEMPI 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | void *myMalloc( int size ) 30 | { 31 | return malloc( size ); 32 | } 33 | 34 | void myFree( void *v ) 35 | { 36 | free( v ); 37 | } 38 | 39 | void myFunc() 40 | { 41 | int * alloc = 0; 42 | int i; 43 | 44 | alloc = (int *) myMalloc( 10 * sizeof(int) ); 45 | 46 | for( i=0; i<=10; i++ ) 47 | { 48 | alloc[i]=12-i; 49 | } 50 | 51 | alloc=0; 52 | } 53 | 54 | void double_free() 55 | { 56 | int *p; 57 | int length = 0xab; 58 | int junk = 0; 59 | 60 | p = (int*) malloc( length ); 61 | printf ( "malloced %4d (%#6x) bytes at %p\n", length, length, p ); 62 | 63 | // Breakpoint here 64 | // Show allocated annotation 65 | // Show block properties and enable notify when deallocated 66 | junk = 0; 67 | 68 | // Now release the memory the first time - legal 69 | free ( p ); 70 | 71 | // Breakpoint here 72 | // Show that the block is marked dangling 73 | // Show that the deallocation stack is available now 74 | junk = 0; 75 | 76 | // Now release the memory the second time - illegal 77 | #ifdef USEMPI 78 | if( rank == 1 ) 79 | #endif 80 | free ( p ); 81 | 82 | // Note: an evaluation point can be used to avoid the segv 83 | // and continue the demo. 84 | } 85 | 86 | void corrupt_data() 87 | { 88 | int i, j; 89 | int size; 90 | int * p0; 91 | int * p1; 92 | int * p2; 93 | 94 | // Breakpoint here. 95 | // Enable Guard Blocks on the Memory Debugging Configuration Page. 96 | // Use 8 byte pre and post guard size. 97 | 98 | size = 16; 99 | 100 | // Loop added around the allocates and the corrupt loop, 101 | // to make the memory report viewing more interesting. 
102 | // 9/2009 CS at the suggestion of MS and EH 103 | 104 | for ( j=0; j<6 ; j++ ) 105 | { 106 | 107 | p0 = (int *) malloc( size * sizeof( int ) ); 108 | p1 = (int *) malloc( size * sizeof( int ) ); 109 | p2 = (int *) malloc( size * sizeof( int ) ); 110 | 111 | // Common corruption cases. Oops in the for loop condition. 112 | for( i=0; i<=size; i++ ) 113 | { 114 | p1[i] = size - i; 115 | } 116 | } 117 | 118 | // Breakpoint here. 119 | // Check Heap Status Corrupt Guard Block View to scan 120 | // all blocks for corruption. 121 | i = 0; 122 | 123 | // Corrupt Guard Memory Event on free(). 124 | free( p1 ); 125 | } 126 | 127 | 128 | void corrupt_data_rz() 129 | { 130 | int i; 131 | int size; 132 | int * p0; 133 | int * p1; 134 | int * p2; 135 | 136 | // Code is mostly the same as corrupt_data(), just repeated 137 | // to demonstrate the ability to change from Guard Blocks to 138 | // RedZones on the fly, and compare and contrast the two. 139 | 140 | size = 16; 141 | 142 | p0 = (int *) malloc( size * sizeof( int ) ); 143 | p1 = (int *) malloc( size * sizeof( int ) ); 144 | p2 = (int *) malloc( size * sizeof( int ) ); 145 | 146 | // Common corruption cases. Oops in the for loop condition. 147 | 148 | for( i=0; i<=size; i++ ) 149 | { 150 | p1[i] = size - i; 151 | } 152 | 153 | free( p2 ); 154 | free( p1 ); 155 | free( p0 ); 156 | } 157 | 158 | 159 | void corrupt_data_sizes() 160 | { 161 | int i; 162 | int size; 163 | int * p0; 164 | int * p1; 165 | int * p2; 166 | 167 | // Like corrupt_data(), but allocates/corrupts different sizes. 168 | // In RedZones detailed options, set size limits to ignore the 169 | // smallest and largest size, and catch the middle size. 
170 | 171 | size = 64; 172 | p0 = (int *) malloc( size * sizeof( int ) ); 173 | for( i=0; i<=size; i++ ) 174 | { 175 | p0[i] = size - i; 176 | } 177 | 178 | size = 256; 179 | p1 = (int *) malloc( size * sizeof( int ) ); 180 | for( i=0; i<=size; i++ ) 181 | { 182 | p1[i] = size - i; 183 | } 184 | 185 | size = 128; 186 | p2 = (int *) malloc( size * sizeof( int ) ); 187 | for( i=0; i<=size; i++ ) 188 | { 189 | p2[i] = size - i; 190 | } 191 | 192 | free( p2 ); 193 | free( p1 ); 194 | free( p0 ); 195 | } 196 | 197 | 198 | void read_overrun() 199 | { 200 | int i; 201 | int size; 202 | int * p1; 203 | int j; 204 | 205 | // RedZones can catch out-of-bounds access even if only a read. 206 | // Make size such that RZ catches even if size limits from 207 | // corrupt_sizes method are still in place. 208 | 209 | size = 128; 210 | 211 | p1 = (int *) malloc( size * sizeof( int ) ); 212 | 213 | // Fill in with a correct loop. 214 | for( i=0; idestroy(); 327 | } 328 | else { 329 | delete b1; 330 | } 331 | } 332 | 333 | int * alloc2 = 0; 334 | for( int i = 1; i < 25; i++ ) 335 | { 336 | alloc2 = (int *) myMalloc((int)(i * (10+(int) rand()/(RAND_MAX+1.0)))); 337 | 338 | if( i % 5 ) 339 | { 340 | myFree(alloc2); 341 | } 342 | } 343 | 344 | myFunc(); 345 | 346 | } 347 | 348 | /* breakpoint here*/ 349 | /* discuss heap view (graphical and source) 350 | * show leak detection 351 | * show filtering 352 | */ 353 | 354 | printf("Reached the end of filterapp-leaks\n"); 355 | 356 | #ifdef USEMPI 357 | MPI_Finalize(); 358 | #endif 359 | 360 | return 0; 361 | } 362 | -------------------------------------------------------------------------------- /Linaro-Forge/performance/common.makefile: -------------------------------------------------------------------------------- 1 | ### Determine compiler invocation ### 2 | 3 | ifdef PE_ENV 4 | 5 | # Cray-specific invocations 6 | CC = cc 7 | CXX = CC 8 | MPICC = $(CC) 9 | MPICXX = $(CXX) 10 | FC = ftn 11 | F77 = $(FC) 12 | F90 = $(FC) 13 | MPIF77 = $(FC) 
14 | MPIF90 = $(FC) 15 | 16 | else 17 | 18 | ifneq ($(filter default undefined,$(origin FC)),) 19 | # default to GNU 20 | FC := gfortran 21 | endif 22 | F77 ?= $(FC) 23 | F90 ?= $(FC) 24 | 25 | # MPI C/C++ Compilers 26 | ifndef MPICC 27 | ifeq ($(shell which mpiicc > /dev/null 2>&1; echo $$?),0) 28 | MPICC := mpiicc 29 | else ifeq ($(shell which mpicc > /dev/null 2>&1; echo $$?),0) 30 | MPICC := mpicc 31 | endif 32 | endif 33 | # Only detect toolchain if MPICC is set, otherwise defer error to rule which invokes compiler 34 | ifdef MPICC 35 | # disable remark #10441: warning for deprecated Intel Compiler Classic 36 | MPICC_VERSION := $(shell $(MPICC) --version -diag-disable=10441 2> /dev/null || $(MPICC) --version 2> /dev/null || $(MPICC) -qversion 2> /dev/null) 37 | else 38 | MPICC = $(error Could not detect MPI C compiler in PATH - failed to make target $@) 39 | endif 40 | 41 | ifndef MPICXX 42 | ifeq ($(shell which mpiicpc > /dev/null 2>&1; echo $$?),0) 43 | MPICXX := mpiicpc 44 | else ifeq ($(shell which mpic++ > /dev/null 2>&1; echo $$?),0) 45 | MPICXX := mpic++ 46 | else ifeq ($(shell which mpicxx > /dev/null 2>&1; echo $$?),0) 47 | MPICXX := mpicxx 48 | endif 49 | endif 50 | MPICXX ?= $(error Could not detect MPI C++ compiler in PATH - failed to make target $@) 51 | 52 | # MPI Fortran Compilers 53 | ifndef MPIF90 54 | ifeq ($(shell which mpiifort > /dev/null 2>&1; echo $$?),0) 55 | MPIF90 := mpiifort 56 | else ifeq ($(shell which mpifc > /dev/null 2>&1; echo $$?),0) 57 | MPIF90 := mpifc 58 | else ifeq ($(shell which mpifort > /dev/null 2>&1; echo $$?),0) 59 | MPIF90 := mpifort 60 | else ifeq ($(shell which mpif90 > /dev/null 2>&1; echo $$?),0) 61 | MPIF90 := mpif90 62 | endif 63 | endif 64 | 65 | # Only detect toolchain if MPIF90 is set, otherwise defer error to rule which invokes compiler 66 | ifdef MPIF90 67 | # disable remark #10441: warning for deprecated Intel Compiler Classic 68 | MPIF90_VERSION := $(shell $(MPIF90) --version -diag-disable=10441 2> 
/dev/null || $(MPIF90) --version 2> /dev/null || $(MPIF90) -qversion 2> /dev/null) 69 | else 70 | MPIF90 = $(error Could not detect MPI Fortran compiler in PATH - failed to make target $@) 71 | endif 72 | 73 | ifndef MPIF77 74 | ifeq ($(shell which mpif77 > /dev/null 2>&1; echo $$?),0) 75 | MPIF77 := mpif77 76 | else 77 | MPIF77 = $(MPIF90) 78 | endif 79 | endif 80 | 81 | MPIFC ?= $(MPIF90) 82 | 83 | endif 84 | 85 | ### Recommended compiler flags ### 86 | 87 | # Flags for compiler inlining: MAP works whether inlining is on or off, 88 | # but you'll typically see more intuitive stacks with it turned off. 89 | # The major compilers are discussed here: 90 | # 91 | # Intel: -g -fno-inline -no-ip -no-ipo -fno-omit-frame-pointer -O3 is 92 | # recommended. At O3 the compiler doesn't produce enough unwind info even 93 | # with -debug inline-debug-info set. 94 | # 95 | # PGI: -g -O3 -Meh_frame -Mframe -Mnoautoinline is recommended. Other settings 96 | # don't produce enough unwind information for inlined functions. This 97 | # adds some performance penalty - around 8% is typical. 98 | # 99 | # The PGI C runtime static library contains an undefined reference to 100 | # __kmpc_fork_call, which will cause compilation to fail when linking 101 | # allinea-profiler.ld. Add --undefined __wrap___kmpc_fork_call to your link line 102 | # before linking to the Forge sampler to resolve this. 103 | # 104 | # GNU: -g -O3 -fno-inline is recommended. You might be lucky without -fno-inline, 105 | # as it should produce enough information to unwind those calls. You will see 106 | # my_function [inlined] in the MAP stack for functions that were inlined.
107 | # -fno-inline-functions appears with newer gnu compilers, just to confuse 108 | 109 | # Common OpenMP flags for supported compilers 110 | # -fopenmp for gnu 111 | # -qopenmp for intel 112 | # -mp for pgi 113 | # -qsmp=omp:noopt for IBM 114 | # -homp for cray (compiler) 115 | 116 | # Common pthread flags for supported compilers 117 | # -pthread for GNU 118 | # -lpthread for other compilers 119 | 120 | INTEL_LLVM_MAP_CFLAGS := -g -fno-inline -no-ipo -fno-omit-frame-pointer -O3 121 | INTEL_LLVM_DDT_CFLAGS := -g -Wall -O0 122 | INTEL_LLVM_OPENMP_CFLAG := -qopenmp 123 | INTEL_LLVM_MAP_FCFLAGS := $(INTEL_LLVM_MAP_CFLAGS) 124 | INTEL_LLVM_DDT_FCFLAGS := $(filter-out -Wall, $(INTEL_LLVM_DDT_CFLAGS)) -warn all 125 | INTEL_LLVM_OPENMP_FCFLAG := $(INTEL_LLVM_OPENMP_CFLAG) 126 | INTEL_LLVM_PTHREAD_CFLAG := -lpthread 127 | INTEL_LLVM_SHARED_LIBRARY_CFLAGS=-fPIC 128 | INTEL_LLVM_SHARED_LIBRARY_LINKER_FLAGS=-shared 129 | 130 | INTEL_MAP_CFLAGS := -g -fno-inline -no-ip -no-ipo -fno-omit-frame-pointer -O3 131 | INTEL_DDT_CFLAGS := -g -w3 -O0 132 | INTEL_OPENMP_CFLAG := -qopenmp 133 | INTEL_MAP_FCFLAGS := $(INTEL_MAP_CFLAGS) 134 | INTEL_DDT_FCFLAGS := $(filter-out -w3, $(INTEL_DDT_CFLAGS)) -warn all 135 | INTEL_OPENMP_FCFLAG := $(INTEL_OPENMP_CFLAG) 136 | INTEL_PTHREAD_CFLAG := -lpthread 137 | INTEL_SHARED_LIBRARY_CFLAGS=-fPIC 138 | INTEL_SHARED_LIBRARY_LINKER_FLAGS=-shared 139 | 140 | PGI_MAP_CFLAGS := -g -Meh_frame -Mframe -O3 -Mnoautoinline 141 | PGI_DDT_CFLAGS := -g -O0 142 | PGI_MAJOR_VERSION_GT_17 := $(shell expr `$(CC) --version 2> /dev/null | sed -nE 's/^pgcc ([0-9]+)\..*/\1/p'` \> 17 2> /dev/null) 143 | ifeq ($(PGI_MAJOR_VERSION_GT_17),1) 144 | PGI_MAP_CFLAGS := $(PGI_MAP_CFLAGS) -Wl,--undefined=__wrap___kmpc_fork_call 145 | endif 146 | PGI_OPENMP_CFLAG := -mp 147 | PGI_MAP_FCFLAGS := $(filter-out -Meh_frame, $(PGI_MAP_CFLAGS)) 148 | PGI_DDT_FCFLAGS := $(PGI_DDT_CFLAGS) 149 | PGI_OPENMP_FCFLAG := $(PGI_OPENMP_CFLAG) 150 | PGI_PTHREAD_CFLAG := -lpthread 151 | 
PGI_SHARED_LIBRARY_CFLAGS=-fPIC 152 | PGI_SHARED_LIBRARY_LINKER_FLAGS=-shared 153 | 154 | NVC_MAP_CFLAGS := -g -Meh_frame -Mframe -O3 -Mnoautoinline 155 | NVC_DDT_CFLAGS := -g -O0 156 | NVC_MAJOR_VERSION_GT_20 := $(shell expr `$(CC) --version 2> /dev/null | sed -nE 's/^pgcc ([0-9]+)\..*/\1/p'` \> 20 2> /dev/null) 157 | NVC_MAP_CFLAGS := $(PGI_MAP_CFLAGS) -Wl,--undefined=__wrap___kmpc_fork_call 158 | NVC_OPENMP_CFLAG := -mp 159 | NVC_MAP_FCFLAGS := $(filter-out -Meh_frame, $(PGI_MAP_CFLAGS)) 160 | NVC_DDT_FCFLAGS := $(PGI_DDT_CFLAGS) 161 | NVC_OPENMP_FCFLAG := $(PGI_OPENMP_CFLAG) 162 | NVC_PTHREAD_CFLAG := -lpthread 163 | NVC_SHARED_LIBRARY_CFLAGS=-fPIC 164 | NVC_SHARED_LIBRARY_LINKER_FLAGS=-shared 165 | 166 | IBM_MAP_CFLAGS := -g -O3 -qnoinline 167 | IBM_DDT_CFLAGS := -g -Werror -Weverything -O0 168 | IBM_OPENMP_CFLAG := -qsmp=omp:noopt 169 | IBM_MAP_FCFLAGS := $(IBM_MAP_CFLAGS) 170 | IBM_DDT_FCFLAGS := $(IBM_DDT_CFLAGS) 171 | IBM_OPENMP_FCFLAG := $(IBM_OPENMP_CFLAG) -qsmp=omp:noopt -qnohot -lxlf90 -lxlsmp -lxlfmath 172 | IBM_PTHREAD_CFLAG := -lpthread 173 | IBM_SHARED_LIBRARY_CFLAGS= 174 | IBM_SHARED_LIBRARY_LINKER_FLAGS=-qmkshrobj 175 | 176 | CRAY_MAP_CFLAGS := -g -O3 -hipa0 177 | CRAY_DDT_CFLAGS := -g -h msglevel_2 -O0 178 | CRAY_OPENMP_CFLAG := -homp 179 | CRAY_MAP_FCFLAGS := $(CRAY_MAP_CFLAGS) 180 | CRAY_DDT_FCFLAGS := -g -m 2 181 | CRAY_OPENMP_FCFLAG := $(CRAY_OPENMP_CFLAG) 182 | CRAY_PTHREAD_CFLAG := -lpthread 183 | CRAY_SHARED_LIBRARY_CFLAGS=-fPIC 184 | CRAY_SHARED_LIBRARY_LINKER_FLAGS=-shared 185 | 186 | GNU_MAP_CFLAGS := -g -O3 -fno-inline -fno-optimize-sibling-calls 187 | GNU_DDT_CFLAGS := -g -Wall -Werror -O0 188 | GNU_OPENMP_CFLAG := -fopenmp 189 | GNU_MAP_FCFLAGS := $(GNU_MAP_CFLAGS) 190 | GNU_DDT_FCFLAGS := $(GNU_DDT_CFLAGS) 191 | GNU_OPENMP_FCFLAG := $(GNU_OPENMP_CFLAG) 192 | GNU_PTHREAD_CFLAG := -pthread 193 | GNU_SHARED_LIBRARY_CFLAGS=-fPIC 194 | GNU_SHARED_LIBRARY_LINKER_FLAGS=-shared 195 | 196 | # GCC 10 is stricter on requiring 
standard-compliant Fortran, set this flag 197 | # when compiling older Fortran programs. 198 | GNU_LEGACY_STD_FCFLAG := -std=legacy 199 | 200 | ### Toolchain detection ### 201 | 202 | define get_compiler_toolchain 203 | $(if $(or $(findstring icx,$(1)),$(findstring ifx,$(1)),$(findstring Intel(R) oneAPI,$(1)),$(findstring INTEL,$(PE_ENV))), 204 | INTEL_LLVM, 205 | $(if $(or $(findstring icc,$(1)),$(findstring ifort,$(1)),$(findstring Intel,$(1)),$(findstring INTEL,$(PE_ENV))), 206 | INTEL, 207 | $(if $(or $(findstring pgcc,$(1)),$(findstring pgfortran,$(1)),$(findstring PGI,$(1)), $(findstring PGI,$(PE_ENV))), 208 | PGI, 209 | $(if $(or $(findstring nvc,$(1)),$(findstring nvfortran,$(1))), 210 | NVC, 211 | $(if $(or $(findstring xlc,$(1)),$(findstring xlf,$(1)),$(findstring IBM,$(1)),$(findstring IBM,$(PE_ENV))), 212 | IBM, 213 | $(if $(findstring CRAY,$(PE_ENV)), 214 | CRAY, 215 | GNU)))))) 216 | endef 217 | 218 | CC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(CC))) 219 | MPICC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(MPICC_VERSION))) 220 | FC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(FC))) 221 | MPIF90_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(MPIF90_VERSION))) 222 | 223 | ### Compiler flags for toolchain (allow overrides) ### 224 | 225 | MAP_CFLAGS ?= $($(CC_TOOLCHAIN)_MAP_CFLAGS) 226 | MAP_FCFLAGS ?= $($(FC_TOOLCHAIN)_MAP_FCFLAGS) 227 | DDT_CFLAGS ?= $($(CC_TOOLCHAIN)_DDT_CFLAGS) 228 | DDT_FCFLAGS ?= $($(FC_TOOLCHAIN)_DDT_FCFLAGS) 229 | OPENMP_CFLAG ?= $($(CC_TOOLCHAIN)_OPENMP_CFLAG) 230 | OPENMP_FCFLAG ?= $($(FC_TOOLCHAIN)_OPENMP_FCFLAG) 231 | PTHREAD_CFLAG ?= $($(CC_TOOLCHAIN)_PTHREAD_CFLAG) 232 | PTHREAD_FCFLAG ?= $($(FC_TOOLCHAIN)_PTHREAD_FCFLAG) 233 | SHARED_LIBRARY_CFLAGS ?= $($(FC_TOOLCHAIN)_SHARED_LIBRARY_CFLAGS) 234 | SHARED_LIBRARY_LINKER_FLAGS ?= $($(FC_TOOLCHAIN)_SHARED_LIBRARY_LINKER_FLAGS) 235 | LEGACY_STD_FCFLAG ?= $($(FC_TOOLCHAIN)_LEGACY_STD_FCFLAG) 236 | MPI_MAP_CFLAGS ?= 
$($(MPICC_TOOLCHAIN)_MAP_CFLAGS) 237 | MPI_MAP_FCFLAGS ?= $($(MPIF90_TOOLCHAIN)_MAP_FCFLAGS) 238 | MPI_DDT_CFLAGS ?= $($(MPICC_TOOLCHAIN)_DDT_CFLAGS) 239 | MPI_DDT_FCFLAGS ?= $($(MPIF90_TOOLCHAIN)_DDT_FCFLAGS) 240 | MPI_OPENMP_CFLAG ?= $($(MPICC_TOOLCHAIN)_OPENMP_CFLAG) 241 | MPI_OPENMP_FCFLAG ?= $($(MPIF90_TOOLCHAIN)_OPENMP_FCFLAG) 242 | MPI_PTHREAD_CFLAG ?= $($(MPICC_TOOLCHAIN)_PTHREAD_CFLAG) 243 | MPI_PTHREAD_FCFLAG ?= $($(MPIF90_TOOLCHAIN)_PTHREAD_FCFLAG) 244 | 245 | ## Link flags for static Forge sampler 246 | ifeq ($(CC_TOOLCHAIN),GNU) 247 | ifneq ($(shell $(CC) -dumpspecs 2>/dev/null | grep -e '[^f]no-pie'),) 248 | MAP_STATIC_C_LINKFLAGS := -no-pie 249 | endif 250 | endif 251 | 252 | ifeq ($(FC_TOOLCHAIN),GNU) 253 | ifneq ($(shell $(FC) -dumpspecs 2>/dev/null | grep -e '[^f]no-pie'),) 254 | MAP_STATIC_FC_LINKFLAGS := -no-pie 255 | endif 256 | endif 257 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/core-files/common.makefile: -------------------------------------------------------------------------------- 1 | ### Determine compiler invocation ### 2 | 3 | ifdef PE_ENV 4 | 5 | # Cray-specific invocations 6 | CC = cc 7 | CXX = CC 8 | MPICC = $(CC) 9 | MPICXX = $(CXX) 10 | FC = ftn 11 | F77 = $(FC) 12 | F90 = $(FC) 13 | MPIF77 = $(FC) 14 | MPIF90 = $(FC) 15 | 16 | else 17 | 18 | ifneq ($(filter default undefined,$(origin FC)),) 19 | # default to GNU 20 | FC := gfortran 21 | endif 22 | F77 ?= $(FC) 23 | F90 ?= $(FC) 24 | 25 | # MPI C/C++ Compilers 26 | ifndef MPICC 27 | ifeq ($(shell which mpiicc > /dev/null 2>&1; echo $$?),0) 28 | MPICC := mpiicc 29 | else ifeq ($(shell which mpicc > /dev/null 2>&1; echo $$?),0) 30 | MPICC := mpicc 31 | endif 32 | endif 33 | # Only detect toolchain if MPICC is set, otherwise defer error to rule which invokes compiler 34 | ifdef MPICC 35 | # disable remark #10441: warning for deprecated Intel Compiler Classic 36 | MPICC_VERSION := $(shell $(MPICC) --version 
-diag-disable=10441 2> /dev/null || $(MPICC) --version 2> /dev/null || $(MPICC) -qversion 2> /dev/null) 37 | else 38 | MPICC = $(error Could not detect MPI C compiler in PATH - failed to make target $@) 39 | endif 40 | 41 | ifndef MPICXX 42 | ifeq ($(shell which mpiicpc > /dev/null 2>&1; echo $$?),0) 43 | MPICXX := mpiicpc 44 | else ifeq ($(shell which mpic++ > /dev/null 2>&1; echo $$?),0) 45 | MPICXX := mpic++ 46 | else ifeq ($(shell which mpicxx > /dev/null 2>&1; echo $$?),0) 47 | MPICXX := mpicxx 48 | endif 49 | endif 50 | MPICXX ?= $(error Could not detect MPI C++ compiler in PATH - failed to make target $@) 51 | 52 | # MPI Fortran Compilers 53 | ifndef MPIF90 54 | ifeq ($(shell which mpiifort > /dev/null 2>&1; echo $$?),0) 55 | MPIF90 := mpiifort 56 | else ifeq ($(shell which mpifc > /dev/null 2>&1; echo $$?),0) 57 | MPIF90 := mpifc 58 | else ifeq ($(shell which mpifort > /dev/null 2>&1; echo $$?),0) 59 | MPIF90 := mpifort 60 | else ifeq ($(shell which mpif90 > /dev/null 2>&1; echo $$?),0) 61 | MPIF90 := mpif90 62 | endif 63 | endif 64 | 65 | # Only detect toolchain if MPIF90 is set, otherwise defer error to rule which invokes compiler 66 | ifdef MPIF90 67 | # disable remark #10441: warning for deprecated Intel Compiler Classic 68 | MPIF90_VERSION := $(shell $(MPIF90) --version -diag-disable=10441 2> /dev/null || $(MPIF90) --version 2> /dev/null || $(MPIF90) -qversion 2> /dev/null) 69 | else 70 | MPIF90 = $(error Could not detect MPI Fortran compiler in PATH - failed to make target $@) 71 | endif 72 | 73 | ifndef MPIF77 74 | ifeq ($(shell which mpif77 > /dev/null 2>&1; echo $$?),0) 75 | MPIF77 := mpif77 76 | else 77 | MPIF77 = $(MPIF90) 78 | endif 79 | endif 80 | 81 | MPIFC ?= $(MPIF90) 82 | 83 | endif 84 | 85 | ### Recommended compiler flags ### 86 | 87 | # Flags for compiler inlining: MAP works whether inlining is on or off, 88 | # but you'll typically see more intuitive stacks with it turned off. 
89 | # The major compilers are discussed here: 90 | # 91 | # Intel: -g -fno-inline -no-ip -no-ipo -fno-omit-frame-pointer -O3 is 92 | # recommended. At O3 the compiler doesn't produce enough unwind info even 93 | # with -debug inline-debug-info set. 94 | # 95 | # PGI: -g -O3 -Meh_frame -Mframe -Mnoautoinline is recommended. Other settings 96 | # don't produce enough unwind information for inlined functions. This 97 | # adds some performance penalty - around 8% is typical. 98 | # 99 | # The PGI C runtime static library contains an undefined reference to 100 | # __kmpc_fork_call, which will cause compilation to fail when linking 101 | # allinea-profiler.ld. Add --undefined __wrap___kmpc_fork_call to your link line 102 | # before linking to the Forge sampler to resolve this. 103 | # 104 | # GNU: -g -O3 -fno-inline is recommended. You might be lucky without -fno-inline, 105 | # as it should produce enough information to unwind those calls. You will see 106 | # my_function [inlined] in the MAP stack for functions that were inlined.
107 | # -fno-inline-functions appears with newer gnu compilers, just to confuse 108 | 109 | # Common OpenMP flags for supported compilers 110 | # -fopenmp for gnu 111 | # -qopenmp for intel 112 | # -mp for pgi 113 | # -qsmp=omp:noopt for IBM 114 | # -homp for cray (compiler) 115 | 116 | # Common pthread flags for supported compilers 117 | # -pthread for GNU 118 | # -lpthread for other compilers 119 | 120 | INTEL_LLVM_MAP_CFLAGS := -g -fno-inline -no-ipo -fno-omit-frame-pointer -O3 121 | INTEL_LLVM_DDT_CFLAGS := -g -Wall -O0 122 | INTEL_LLVM_OPENMP_CFLAG := -qopenmp 123 | INTEL_LLVM_MAP_FCFLAGS := $(INTEL_LLVM_MAP_CFLAGS) 124 | INTEL_LLVM_DDT_FCFLAGS := $(filter-out -Wall, $(INTEL_LLVM_DDT_CFLAGS)) -warn all 125 | INTEL_LLVM_OPENMP_FCFLAG := $(INTEL_LLVM_OPENMP_CFLAG) 126 | INTEL_LLVM_PTHREAD_CFLAG := -lpthread 127 | INTEL_LLVM_SHARED_LIBRARY_CFLAGS=-fPIC 128 | INTEL_LLVM_SHARED_LIBRARY_LINKER_FLAGS=-shared 129 | 130 | INTEL_MAP_CFLAGS := -g -fno-inline -no-ip -no-ipo -fno-omit-frame-pointer -O3 131 | INTEL_DDT_CFLAGS := -g -w3 -O0 132 | INTEL_OPENMP_CFLAG := -qopenmp 133 | INTEL_MAP_FCFLAGS := $(INTEL_MAP_CFLAGS) 134 | INTEL_DDT_FCFLAGS := $(filter-out -w3, $(INTEL_DDT_CFLAGS)) -warn all 135 | INTEL_OPENMP_FCFLAG := $(INTEL_OPENMP_CFLAG) 136 | INTEL_PTHREAD_CFLAG := -lpthread 137 | INTEL_SHARED_LIBRARY_CFLAGS=-fPIC 138 | INTEL_SHARED_LIBRARY_LINKER_FLAGS=-shared 139 | 140 | PGI_MAP_CFLAGS := -g -Meh_frame -Mframe -O3 -Mnoautoinline 141 | PGI_DDT_CFLAGS := -g -O0 142 | PGI_MAJOR_VERSION_GT_17 := $(shell expr `$(CC) --version 2> /dev/null | sed -nE 's/^pgcc ([0-9]+)\..*/\1/p'` \> 17 2> /dev/null) 143 | ifeq ($(PGI_MAJOR_VERSION_GT_17),1) 144 | PGI_MAP_CFLAGS := $(PGI_MAP_CFLAGS) -Wl,--undefined=__wrap___kmpc_fork_call 145 | endif 146 | PGI_OPENMP_CFLAG := -mp 147 | PGI_MAP_FCFLAGS := $(filter-out -Meh_frame, $(PGI_MAP_CFLAGS)) 148 | PGI_DDT_FCFLAGS := $(PGI_DDT_CFLAGS) 149 | PGI_OPENMP_FCFLAG := $(PGI_OPENMP_CFLAG) 150 | PGI_PTHREAD_CFLAG := -lpthread 151 | 
PGI_SHARED_LIBRARY_CFLAGS=-fPIC 152 | PGI_SHARED_LIBRARY_LINKER_FLAGS=-shared 153 | 154 | NVC_MAP_CFLAGS := -g -Meh_frame -Mframe -O3 -Mnoautoinline 155 | NVC_DDT_CFLAGS := -g -O0 156 | NVC_MAJOR_VERSION_GT_20 := $(shell expr `$(CC) --version 2> /dev/null | sed -nE 's/^pgcc ([0-9]+)\..*/\1/p'` \> 20 2> /dev/null) 157 | NVC_MAP_CFLAGS := $(PGI_MAP_CFLAGS) -Wl,--undefined=__wrap___kmpc_fork_call 158 | NVC_OPENMP_CFLAG := -mp 159 | NVC_MAP_FCFLAGS := $(filter-out -Meh_frame, $(PGI_MAP_CFLAGS)) 160 | NVC_DDT_FCFLAGS := $(PGI_DDT_CFLAGS) 161 | NVC_OPENMP_FCFLAG := $(PGI_OPENMP_CFLAG) 162 | NVC_PTHREAD_CFLAG := -lpthread 163 | NVC_SHARED_LIBRARY_CFLAGS=-fPIC 164 | NVC_SHARED_LIBRARY_LINKER_FLAGS=-shared 165 | 166 | IBM_MAP_CFLAGS := -g -O3 -qnoinline 167 | IBM_DDT_CFLAGS := -g -Werror -Weverything -O0 168 | IBM_OPENMP_CFLAG := -qsmp=omp:noopt 169 | IBM_MAP_FCFLAGS := $(IBM_MAP_CFLAGS) 170 | IBM_DDT_FCFLAGS := $(IBM_DDT_CFLAGS) 171 | IBM_OPENMP_FCFLAG := $(IBM_OPENMP_CFLAG) -qsmp=omp:noopt -qnohot -lxlf90 -lxlsmp -lxlfmath 172 | IBM_PTHREAD_CFLAG := -lpthread 173 | IBM_SHARED_LIBRARY_CFLAGS= 174 | IBM_SHARED_LIBRARY_LINKER_FLAGS=-qmkshrobj 175 | 176 | CRAY_MAP_CFLAGS := -g -O3 -hipa0 177 | CRAY_DDT_CFLAGS := -g -h msglevel_2 -O0 178 | CRAY_OPENMP_CFLAG := -homp 179 | CRAY_MAP_FCFLAGS := $(CRAY_MAP_CFLAGS) 180 | CRAY_DDT_FCFLAGS := -g -m 2 181 | CRAY_OPENMP_FCFLAG := $(CRAY_OPENMP_CFLAG) 182 | CRAY_PTHREAD_CFLAG := -lpthread 183 | CRAY_SHARED_LIBRARY_CFLAGS=-fPIC 184 | CRAY_SHARED_LIBRARY_LINKER_FLAGS=-shared 185 | 186 | GNU_MAP_CFLAGS := -g -O3 -fno-inline -fno-optimize-sibling-calls 187 | GNU_DDT_CFLAGS := -g -Wall -Werror -O0 188 | GNU_OPENMP_CFLAG := -fopenmp 189 | GNU_MAP_FCFLAGS := $(GNU_MAP_CFLAGS) 190 | GNU_DDT_FCFLAGS := $(GNU_DDT_CFLAGS) 191 | GNU_OPENMP_FCFLAG := $(GNU_OPENMP_CFLAG) 192 | GNU_PTHREAD_CFLAG := -pthread 193 | GNU_SHARED_LIBRARY_CFLAGS=-fPIC 194 | GNU_SHARED_LIBRARY_LINKER_FLAGS=-shared 195 | 196 | # GCC 10 is stricter on requiring 
standard-compliant Fortran, set this flag 197 | # when compiling older Fortran programs. 198 | GNU_LEGACY_STD_FCFLAG := -std=legacy 199 | 200 | ### Toolchain detection ### 201 | 202 | define get_compiler_toolchain 203 | $(if $(or $(findstring icx,$(1)),$(findstring ifx,$(1)),$(findstring Intel(R) oneAPI,$(1)),$(findstring INTEL,$(PE_ENV))), 204 | INTEL_LLVM, 205 | $(if $(or $(findstring icc,$(1)),$(findstring ifort,$(1)),$(findstring Intel,$(1)),$(findstring INTEL,$(PE_ENV))), 206 | INTEL, 207 | $(if $(or $(findstring pgcc,$(1)),$(findstring pgfortran,$(1)),$(findstring PGI,$(1)), $(findstring PGI,$(PE_ENV))), 208 | PGI, 209 | $(if $(or $(findstring nvc,$(1)),$(findstring nvfortran,$(1))), 210 | NVC, 211 | $(if $(or $(findstring xlc,$(1)),$(findstring xlf,$(1)),$(findstring IBM,$(1)),$(findstring IBM,$(PE_ENV))), 212 | IBM, 213 | $(if $(findstring CRAY,$(PE_ENV)), 214 | CRAY, 215 | GNU)))))) 216 | endef 217 | 218 | CC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(CC))) 219 | MPICC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(MPICC_VERSION))) 220 | FC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(FC))) 221 | MPIF90_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(MPIF90_VERSION))) 222 | 223 | ### Compiler flags for toolchain (allow overrides) ### 224 | 225 | MAP_CFLAGS ?= $($(CC_TOOLCHAIN)_MAP_CFLAGS) 226 | MAP_FCFLAGS ?= $($(FC_TOOLCHAIN)_MAP_FCFLAGS) 227 | DDT_CFLAGS ?= $($(CC_TOOLCHAIN)_DDT_CFLAGS) 228 | DDT_FCFLAGS ?= $($(FC_TOOLCHAIN)_DDT_FCFLAGS) 229 | OPENMP_CFLAG ?= $($(CC_TOOLCHAIN)_OPENMP_CFLAG) 230 | OPENMP_FCFLAG ?= $($(FC_TOOLCHAIN)_OPENMP_FCFLAG) 231 | PTHREAD_CFLAG ?= $($(CC_TOOLCHAIN)_PTHREAD_CFLAG) 232 | PTHREAD_FCFLAG ?= $($(FC_TOOLCHAIN)_PTHREAD_FCFLAG) 233 | SHARED_LIBRARY_CFLAGS ?= $($(FC_TOOLCHAIN)_SHARED_LIBRARY_CFLAGS) 234 | SHARED_LIBRARY_LINKER_FLAGS ?= $($(FC_TOOLCHAIN)_SHARED_LIBRARY_LINKER_FLAGS) 235 | LEGACY_STD_FCFLAG ?= $($(FC_TOOLCHAIN)_LEGACY_STD_FCFLAG) 236 | MPI_MAP_CFLAGS ?= 
$($(MPICC_TOOLCHAIN)_MAP_CFLAGS) 237 | MPI_MAP_FCFLAGS ?= $($(MPIF90_TOOLCHAIN)_MAP_FCFLAGS) 238 | MPI_DDT_CFLAGS ?= $($(MPICC_TOOLCHAIN)_DDT_CFLAGS) 239 | MPI_DDT_FCFLAGS ?= $($(MPIF90_TOOLCHAIN)_DDT_FCFLAGS) 240 | MPI_OPENMP_CFLAG ?= $($(MPICC_TOOLCHAIN)_OPENMP_CFLAG) 241 | MPI_OPENMP_FCFLAG ?= $($(MPIF90_TOOLCHAIN)_OPENMP_FCFLAG) 242 | MPI_PTHREAD_CFLAG ?= $($(MPICC_TOOLCHAIN)_PTHREAD_CFLAG) 243 | MPI_PTHREAD_FCFLAG ?= $($(MPIF90_TOOLCHAIN)_PTHREAD_FCFLAG) 244 | 245 | ## Link flags for static Forge sampler 246 | ifeq ($(CC_TOOLCHAIN),GNU) 247 | ifneq ($(shell $(CC) -dumpspecs 2>/dev/null | grep -e '[^f]no-pie'),) 248 | MAP_STATIC_C_LINKFLAGS := -no-pie 249 | endif 250 | endif 251 | 252 | ifeq ($(FC_TOOLCHAIN),GNU) 253 | ifneq ($(shell $(FC) -dumpspecs 2>/dev/null | grep -e '[^f]no-pie'),) 254 | MAP_STATIC_FC_LINKFLAGS := -no-pie 255 | endif 256 | endif 257 | -------------------------------------------------------------------------------- /Valgrind/memcheck/leak.h: -------------------------------------------------------------------------------- 1 | // These counters are used to get a delta between leak counts at startup 2 | // (eg. due to libc) and later on. Necessary to get reliable leak tests 3 | // across different platforms. 4 | #define DECLARE_LEAK_COUNTERS \ 5 | long L0_bytes = 0, L_bytes = 0, L0_blocks = 0, L_blocks = 0; \ 6 | long D0_bytes = 0, D_bytes = 0, D0_blocks = 0, D_blocks = 0; \ 7 | long R0_bytes = 0, R_bytes = 0, R0_blocks = 0, R_blocks = 0; \ 8 | long S0_bytes = 0, S_bytes = 0, S0_blocks = 0, S_blocks = 0 9 | 10 | // Set a baseline, in case allocations have already happened. 11 | #define GET_INITIAL_LEAK_COUNTS \ 12 | do { \ 13 | VALGRIND_DO_QUICK_LEAK_CHECK; \ 14 | VALGRIND_COUNT_LEAKS( L0_bytes, D0_bytes, R0_bytes, S0_bytes );\ 15 | VALGRIND_COUNT_LEAK_BLOCKS(L0_blocks, D0_blocks, R0_blocks, S0_blocks); \ 16 | } while (0) 17 | 18 | // Take the current counts and subtract the baseline, leaving the delta since GET_INITIAL_LEAK_COUNTS.
19 | #define GET_FINAL_LEAK_COUNTS \ 20 | do { \ 21 | VALGRIND_DO_QUICK_LEAK_CHECK; \ 22 | VALGRIND_COUNT_LEAKS( L_bytes, D_bytes, R_bytes, S_bytes ); \ 23 | VALGRIND_COUNT_LEAK_BLOCKS(L_blocks, D_blocks, R_blocks, S_blocks); \ 24 | L_bytes -= L0_bytes; L_blocks -= L0_blocks; \ 25 | D_bytes -= D0_bytes; D_blocks -= D0_blocks; \ 26 | R_bytes -= R0_bytes; R_blocks -= R0_blocks; \ 27 | S_bytes -= S0_bytes; S_blocks -= S0_blocks; \ 28 | } while (0) 29 | 30 | // Print leak counts. When used in conjunction with -q the normal counts 31 | // aren't shown, which is what we want. 32 | #define PRINT_LEAK_COUNTS(where) \ 33 | do { \ 34 | fprintf(where,"leaked: %3ld bytes in %2ld blocks\n", \ 35 | L_bytes,L_blocks); \ 36 | fprintf(where,"dubious: %3ld bytes in %2ld blocks\n", \ 37 | D_bytes,D_blocks); \ 38 | fprintf(where,"reachable: %3ld bytes in %2ld blocks\n", \ 39 | R_bytes,R_blocks); \ 40 | fprintf(where,"suppressed: %3ld bytes in %2ld blocks\n", \ 41 | S_bytes,S_blocks); \ 42 | } while (0) 43 | 44 | /* Upon a call to a function, some architectures store pointers into 45 | * registers. Valgrind may consider these registers when determining 46 | * whether an address is reachable, so we need to zero-out these registers 47 | * as needed. 
48 | */ 49 | #if defined __powerpc__ 50 | #define CLEAR_CALLER_SAVED_REGS \ 51 | do { \ 52 | __asm__ __volatile__( "li 3, 0" : : :/*trash*/"r3" ); \ 53 | __asm__ __volatile__( "li 4, 0" : : :/*trash*/"r4" ); \ 54 | __asm__ __volatile__( "li 5, 0" : : :/*trash*/"r5" ); \ 55 | __asm__ __volatile__( "li 6, 0" : : :/*trash*/"r6" ); \ 56 | __asm__ __volatile__( "li 7, 0" : : :/*trash*/"r7" ); \ 57 | __asm__ __volatile__( "li 8, 0" : : :/*trash*/"r8" ); \ 58 | __asm__ __volatile__( "li 9, 0" : : :/*trash*/"r9" ); \ 59 | __asm__ __volatile__( "li 10, 0" : : :/*trash*/"r10" ); \ 60 | __asm__ __volatile__( "li 11, 0" : : :/*trash*/"r11" ); \ 61 | __asm__ __volatile__( "li 12, 0" : : :/*trash*/"r12" ); \ 62 | } while (0) 63 | #elif defined(__nanomips__) 64 | #define CLEAR_CALLER_SAVED_REGS \ 65 | do { \ 66 | __asm__ __volatile__ (".set push \n\t" \ 67 | ".set noat \n\t" \ 68 | "move $at, $zero \n\t" \ 69 | "move $t4, $zero \n\t" \ 70 | "move $t5, $zero \n\t" \ 71 | "move $a0, $zero \n\t" \ 72 | "move $a1, $zero \n\t" \ 73 | "move $a2, $zero \n\t" \ 74 | "move $a3, $zero \n\t" \ 75 | "move $a4, $zero \n\t" \ 76 | "move $a5, $zero \n\t" \ 77 | "move $a6, $zero \n\t" \ 78 | "move $a7, $zero \n\t" \ 79 | "move $t0, $zero \n\t" \ 80 | "move $t1, $zero \n\t" \ 81 | "move $t2, $zero \n\t" \ 82 | "move $t3, $zero \n\t" \ 83 | "move $t8, $zero \n\t" \ 84 | "move $t9, $zero \n\t" \ 85 | ".set pop \n\t" \ 86 | : : : "$at", "$t4", "$t5", "$a0", "$a1", "$a2", \ 87 | "$a3", "$a4", "$a5", "$a6", "$a7", "$t0", \ 88 | "$t1", "$t2", "$t3", "$t8", "$t9"); \ 89 | } while (0) 90 | #elif (__mips == 32) 91 | #define CLEAR_CALLER_SAVED_REGS \ 92 | do { \ 93 | __asm__ __volatile__ (".set push \n\t" \ 94 | ".set noat \n\t" \ 95 | "move $1, $0 \n\t" /* at = 0 */ \ 96 | "move $2, $0 \n\t" /* v0 = 0 */ \ 97 | "move $3, $0 \n\t" /* v1 = 0 */ \ 98 | "move $4, $0 \n\t" /* a0 = 0 */ \ 99 | "move $5, $0 \n\t" /* a1 = 0 */ \ 100 | "move $6, $0 \n\t" /* a2 = 0 */ \ 101 | "move $7, $0 \n\t" /* a3 = 0 */ \ 102 | 
"move $8, $0 \n\t" /* t0 = 0 */ \ 103 | "move $9, $0 \n\t" /* t1 = 0 */ \ 104 | "move $10, $0 \n\t" /* t2 = 0 */ \ 105 | "move $11, $0 \n\t" /* t3 = 0 */ \ 106 | "move $12, $0 \n\t" /* t4 = 0 */ \ 107 | "move $13, $0 \n\t" /* t5 = 0 */ \ 108 | "move $14, $0 \n\t" /* t6 = 0 */ \ 109 | "move $15, $0 \n\t" /* t7 = 0 */ \ 110 | "move $24, $0 \n\t" /* t8 = 0 */ \ 111 | "move $25, $0 \n\t" /* t9 = 0 */ \ 112 | "move $31, $0 \n\t" /* ra = 0 */ \ 113 | ".set pop \n\t" \ 114 | : : : "$1", "$2", "$3", "$4", "$5", "$6", "$7", \ 115 | "$8", "$9", "$10", "$11", "$12", "$13", \ 116 | "$14", "$15", "$24", "$25", "$31"); \ 117 | } while (0) 118 | #elif (__mips == 64) 119 | #define CLEAR_CALLER_SAVED_REGS \ 120 | do { \ 121 | __asm__ __volatile__ (".set push \n\t" \ 122 | ".set noat \n\t" \ 123 | "move $1, $0 \n\t" /* at = 0 */ \ 124 | "move $2, $0 \n\t" /* v0 = 0 */ \ 125 | "move $3, $0 \n\t" /* v1 = 0 */ \ 126 | "move $4, $0 \n\t" /* a0 = 0 */ \ 127 | "move $5, $0 \n\t" /* a1 = 0 */ \ 128 | "move $6, $0 \n\t" /* a2 = 0 */ \ 129 | "move $7, $0 \n\t" /* a3 = 0 */ \ 130 | "move $8, $0 \n\t" /* a4 = 0 */ \ 131 | "move $9, $0 \n\t" /* a5 = 0 */ \ 132 | "move $10, $0 \n\t" /* a6 = 0 */ \ 133 | "move $11, $0 \n\t" /* a7 = 0 */ \ 134 | "move $12, $0 \n\t" /* t0 = 0 */ \ 135 | "move $13, $0 \n\t" /* t1 = 0 */ \ 136 | "move $14, $0 \n\t" /* t2 = 0 */ \ 137 | "move $15, $0 \n\t" /* t3 = 0 */ \ 138 | "move $24, $0 \n\t" /* t8 = 0 */ \ 139 | "move $25, $0 \n\t" /* t9 = 0 */ \ 140 | "move $31, $0 \n\t" /* ra = 0 */ \ 141 | ".set pop \n\t" \ 142 | : : : "$1", "$2", "$3", "$4", "$5", "$6", "$7", \ 143 | "$8", "$9", "$10", "$11", "$12", "$13", \ 144 | "$14", "$15", "$24", "$25", "$31"); \ 145 | } while (0) 146 | #elif defined (__clang__) && defined(VGA_x86) 147 | #define CLEAR_CALLER_SAVED_REGS \ 148 | do { \ 149 | __asm__ __volatile__ ("movl $0, %ecx\n\t"); \ 150 | } while (0) 151 | #elif defined(__arm__) 152 | /* 32bit arm */ 153 | #define CLEAR_CALLER_SAVED_REGS \ 154 | do { \ 155 | __asm__ 
__volatile__ ("mov %r0, $0\n\t"); \ 156 | __asm__ __volatile__ ("mov %r1, $0\n\t"); \ 157 | __asm__ __volatile__ ("mov %r2, $0\n\t"); \ 158 | __asm__ __volatile__ ("mov %r3, $0\n\t"); \ 159 | } while (0) 160 | #elif defined(__aarch64__) 161 | /* 64bit arm */ 162 | #define CLEAR_CALLER_SAVED_REGS \ 163 | do { \ 164 | __asm__ __volatile__ ("mov x0, 0\n\t"); \ 165 | __asm__ __volatile__ ("mov x1, 0\n\t"); \ 166 | __asm__ __volatile__ ("mov x2, 0\n\t"); \ 167 | __asm__ __volatile__ ("mov x3, 0\n\t"); \ 168 | __asm__ __volatile__ ("mov x4, 0\n\t"); \ 169 | __asm__ __volatile__ ("mov x5, 0\n\t"); \ 170 | __asm__ __volatile__ ("mov x6, 0\n\t"); \ 171 | __asm__ __volatile__ ("mov x7, 0\n\t"); \ 172 | __asm__ __volatile__ ("mov x8, 0\n\t"); \ 173 | __asm__ __volatile__ ("mov x9, 0\n\t"); \ 174 | __asm__ __volatile__ ("mov x10, 0\n\t"); \ 175 | __asm__ __volatile__ ("mov x11, 0\n\t"); \ 176 | __asm__ __volatile__ ("mov x12, 0\n\t"); \ 177 | __asm__ __volatile__ ("mov x13, 0\n\t"); \ 178 | __asm__ __volatile__ ("mov x14, 0\n\t"); \ 179 | __asm__ __volatile__ ("mov x15, 0\n\t"); \ 180 | __asm__ __volatile__ ("mov x16, 0\n\t"); \ 181 | __asm__ __volatile__ ("mov x17, 0\n\t"); \ 182 | __asm__ __volatile__ ("mov x18, 0\n\t"); \ 183 | } while (0) 184 | #else 185 | #define CLEAR_CALLER_SAVED_REGS /*nothing*/ 186 | #endif 187 | 188 | 189 | -------------------------------------------------------------------------------- /Linaro-Forge/correctness/gpu-nvidia-mmult/matrixMul.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | /** 29 | * Matrix multiplication: C = A * B. 30 | * Host code. 31 | * 32 | * This sample implements matrix multiplication which makes use of shared memory 33 | * to ensure data reuse, the matrix multiplication is done using tiling approach. 34 | * It has been written for clarity of exposition to illustrate various CUDA programming 35 | * principles, not with the goal of providing the most performant generic kernel for matrix multiplication. 36 | * See also: 37 | * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra," 38 | * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08), 39 | * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11. 
40 | */ 41 | 42 | // System includes 43 | #include <stdio.h> 44 | #include <assert.h> 45 | 46 | // CUDA runtime 47 | #include <cuda_runtime.h> 48 | #include <cuda_profiler_api.h> 49 | 50 | // Helper functions and utilities to work with CUDA 51 | #include <helper_functions.h> 52 | #include <helper_cuda.h> 53 | 54 | /** 55 | * Matrix multiplication (CUDA Kernel) on the device: C = A * B 56 | * wA is A's width and wB is B's width 57 | */ 58 | template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, 59 | float *B, int wA, 60 | int wB) { 61 | // Block index 62 | int bx = blockIdx.x; 63 | int by = blockIdx.y; 64 | 65 | // Thread index 66 | int tx = threadIdx.x; 67 | int ty = threadIdx.y; 68 | 69 | // Index of the first sub-matrix of A processed by the block 70 | int aBegin = wA * BLOCK_SIZE * by; 71 | 72 | // Index of the last sub-matrix of A processed by the block 73 | int aEnd = aBegin + wA - 1; 74 | 75 | // Step size used to iterate through the sub-matrices of A 76 | int aStep = BLOCK_SIZE; 77 | 78 | // Index of the first sub-matrix of B processed by the block 79 | int bBegin = BLOCK_SIZE * bx; 80 | 81 | // Step size used to iterate through the sub-matrices of B 82 | int bStep = BLOCK_SIZE * wB; 83 | 84 | // Csub is used to store the element of the block sub-matrix 85 | // that is computed by the thread 86 | float Csub = 0; 87 | 88 | // Loop over all the sub-matrices of A and B 89 | // required to compute the block sub-matrix 90 | for (int a = aBegin, b = bBegin; 91 | a <= aEnd; 92 | a += aStep, b += bStep) { 93 | // Declaration of the shared memory array As used to 94 | // store the sub-matrix of A 95 | __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; 96 | 97 | // Declaration of the shared memory array Bs used to 98 | // store the sub-matrix of B 99 | __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; 100 | 101 | // Load the matrices from device memory 102 | // to shared memory; each thread loads 103 | // one element of each matrix 104 | As[ty][tx] = A[a + wA * ty + tx]; 105 | Bs[ty][tx] = B[b + wB * ty + tx]; 106 | 107 | // Synchronize to make sure the matrices are 
loaded 108 | __syncthreads(); 109 | 110 | // Multiply the two matrices together; 111 | // each thread computes one element 112 | // of the block sub-matrix 113 | #pragma unroll 114 | 115 | for (int k = 0; k < BLOCK_SIZE; ++k) { 116 | Csub += As[ty][k] * Bs[k][tx]; 117 | } 118 | 119 | // Synchronize to make sure that the preceding 120 | // computation is done before loading two new 121 | // sub-matrices of A and B in the next iteration 122 | __syncthreads(); 123 | } 124 | 125 | // Write the block sub-matrix to device memory; 126 | // each thread writes one element 127 | int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; 128 | C[c + wB * ty + tx] = Csub; 129 | } 130 | 131 | void ConstantInit(float *data, int size, float val) { 132 | for (int i = 0; i < size; ++i) { 133 | data[i] = val; 134 | } 135 | } 136 | 137 | /** 138 | * Run a simple test of matrix multiplication using CUDA 139 | */ 140 | int MatrixMultiply(int argc, char **argv, 141 | int block_size, const dim3 &dimsA, 142 | const dim3 &dimsB) { 143 | // Allocate host memory for matrices A and B 144 | unsigned int size_A = dimsA.x * dimsA.y; 145 | unsigned int mem_size_A = sizeof(float) * size_A; 146 | float *h_A; 147 | checkCudaErrors(cudaMallocHost(&h_A, mem_size_A)); 148 | unsigned int size_B = dimsB.x * dimsB.y; 149 | unsigned int mem_size_B = sizeof(float) * size_B; 150 | float *h_B; 151 | checkCudaErrors(cudaMallocHost(&h_B, mem_size_B)); 152 | cudaStream_t stream; 153 | 154 | // Initialize host memory 155 | const float valB = 0.01f; 156 | ConstantInit(h_A, size_A, 1.0f); 157 | ConstantInit(h_B, size_B, valB); 158 | 159 | // Allocate device memory 160 | float *d_A, *d_B, *d_C; 161 | 162 | // Allocate host matrix C 163 | dim3 dimsC(dimsB.x, dimsA.y, 1); 164 | unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); 165 | float *h_C; 166 | checkCudaErrors(cudaMallocHost(&h_C, mem_size_C)); 167 | 168 | if (h_C == NULL) { 169 | fprintf(stderr, "Failed to allocate host matrix C!\n"); 170 | exit(EXIT_FAILURE); 
171 | } 172 | 173 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_A), mem_size_A)); 174 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_B), mem_size_B)); 175 | checkCudaErrors(cudaMalloc(reinterpret_cast(&d_C), mem_size_C)); 176 | // Allocate CUDA events that we'll use for timing 177 | cudaEvent_t start, stop; 178 | checkCudaErrors(cudaEventCreate(&start)); 179 | checkCudaErrors(cudaEventCreate(&stop)); 180 | 181 | checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 182 | 183 | // copy host memory to device 184 | checkCudaErrors( 185 | cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream)); 186 | checkCudaErrors( 187 | cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream)); 188 | 189 | // Setup execution parameters 190 | dim3 threads(block_size, block_size); 191 | dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); 192 | 193 | // Create and start timer 194 | printf("Computing result using CUDA Kernel...\n"); 195 | 196 | // Performs warmup operation using matrixMul CUDA kernel 197 | if (block_size == 16) { 198 | MatrixMulCUDA<16> 199 | <<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 200 | } else { 201 | MatrixMulCUDA<32> 202 | <<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 203 | } 204 | 205 | printf("done\n"); 206 | checkCudaErrors(cudaStreamSynchronize(stream)); 207 | 208 | // Record the start event 209 | checkCudaErrors(cudaEventRecord(start, stream)); 210 | 211 | // Execute the kernel 212 | int nIter = 300; 213 | 214 | for (int j = 0; j < nIter; j++) { 215 | if (block_size == 16) { 216 | MatrixMulCUDA<16> 217 | <<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 218 | } else { 219 | MatrixMulCUDA<32> 220 | <<>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 221 | } 222 | } 223 | 224 | // Record the stop event 225 | checkCudaErrors(cudaEventRecord(stop, stream)); 226 | 227 | // Wait for the stop event to complete 228 | checkCudaErrors(cudaEventSynchronize(stop)); 229 | 230 | float msecTotal = 0.0f; 231 | 
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); 232 | 233 | // Compute and print the performance 234 | float msecPerMatrixMul = msecTotal / nIter; 235 | double flopsPerMatrixMul = 2.0 * static_cast(dimsA.x) * 236 | static_cast(dimsA.y) * 237 | static_cast(dimsB.x); 238 | double gigaFlops = 239 | (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); 240 | printf( 241 | "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops," 242 | " WorkgroupSize= %u threads/block\n", 243 | gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y); 244 | 245 | // Copy result from device to host 246 | checkCudaErrors( 247 | cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream)); 248 | checkCudaErrors(cudaStreamSynchronize(stream)); 249 | 250 | printf("Checking computed result for correctness: "); 251 | bool correct = true; 252 | 253 | // test relative error by the formula 254 | // |_cpu - _gpu|/<|x|, |y|> < eps 255 | double eps = 1.e-6; // machine zero 256 | 257 | for (int i = 0; i < static_cast(dimsC.x * dimsC.y); i++) { 258 | double abs_err = fabs(h_C[i] - (dimsA.x * valB)); 259 | double dot_length = dimsA.x; 260 | double abs_val = fabs(h_C[i]); 261 | double rel_err = abs_err / abs_val / dot_length; 262 | 263 | if (rel_err > eps) { 264 | printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", 265 | i, h_C[i], dimsA.x * valB, eps); 266 | correct = false; 267 | } 268 | } 269 | 270 | printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); 271 | 272 | // Clean up memory 273 | checkCudaErrors(cudaFreeHost(h_A)); 274 | checkCudaErrors(cudaFreeHost(h_B)); 275 | checkCudaErrors(cudaFreeHost(h_C)); 276 | checkCudaErrors(cudaFree(d_A)); 277 | checkCudaErrors(cudaFree(d_B)); 278 | checkCudaErrors(cudaFree(d_C)); 279 | checkCudaErrors(cudaEventDestroy(start)); 280 | checkCudaErrors(cudaEventDestroy(stop)); 281 | printf( 282 | "\nNOTE: The CUDA Samples are not meant for performance " 283 | "measurements. 
Results may vary when GPU Boost is enabled.\n"); 284 | 285 | if (correct) { 286 | return EXIT_SUCCESS; 287 | } else { 288 | return EXIT_FAILURE; 289 | } 290 | } 291 | 292 | 293 | /** 294 | * Program main 295 | */ 296 | int main(int argc, char **argv) { 297 | printf("[Matrix Multiply Using CUDA] - Starting...\n"); 298 | 299 | if (checkCmdLineFlag(argc, (const char **)argv, "help") || 300 | checkCmdLineFlag(argc, (const char **)argv, "?")) { 301 | printf("Usage -device=n (n >= 0 for deviceID)\n"); 302 | printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); 303 | printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); 304 | printf(" Note: Outer matrix dimensions of A & B matrices" \ 305 | " must be equal.\n"); 306 | 307 | exit(EXIT_SUCCESS); 308 | } 309 | 310 | // This will pick the best possible CUDA capable device, otherwise 311 | // override the device ID based on input provided at the command line 312 | int dev = findCudaDevice(argc, (const char **)argv); 313 | 314 | int block_size = 32; 315 | 316 | dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1); 317 | dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1); 318 | 319 | // width of Matrix A 320 | if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { 321 | dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); 322 | } 323 | 324 | // height of Matrix A 325 | if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { 326 | dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); 327 | } 328 | 329 | // width of Matrix B 330 | if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { 331 | dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); 332 | } 333 | 334 | // height of Matrix B 335 | if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { 336 | dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); 337 | } 338 | 339 | if (dimsA.x != dimsB.y) { 340 | printf("Error: outer matrix dimensions must be equal. 
(%d != %d)\n", 341 | dimsA.x, dimsB.y); 342 | exit(EXIT_FAILURE); 343 | } 344 | 345 | printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, 346 | dimsB.x, dimsB.y); 347 | 348 | checkCudaErrors(cudaProfilerStart()); 349 | int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB); 350 | checkCudaErrors(cudaProfilerStop()); 351 | 352 | exit(matrix_result); 353 | } 354 | -------------------------------------------------------------------------------- /Valgrind/memcheck/memcheck.h: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | ---------------------------------------------------------------- 4 | 5 | Notice that the following BSD-style license applies to this one 6 | file (memcheck.h) only. The rest of Valgrind is licensed under the 7 | terms of the GNU General Public License, version 2, unless 8 | otherwise indicated. See the COPYING file in the source 9 | distribution for details. 10 | 11 | ---------------------------------------------------------------- 12 | 13 | This file is part of MemCheck, a heavyweight Valgrind tool for 14 | detecting memory errors. 15 | 16 | Copyright (C) 2000-2017 Julian Seward. All rights reserved. 17 | 18 | Redistribution and use in source and binary forms, with or without 19 | modification, are permitted provided that the following conditions 20 | are met: 21 | 22 | 1. Redistributions of source code must retain the above copyright 23 | notice, this list of conditions and the following disclaimer. 24 | 25 | 2. The origin of this software must not be misrepresented; you must 26 | not claim that you wrote the original software. If you use this 27 | software in a product, an acknowledgment in the product 28 | documentation would be appreciated but is not required. 29 | 30 | 3. Altered source versions must be plainly marked as such, and must 31 | not be misrepresented as being the original software. 32 | 33 | 4. 
The name of the author may not be used to endorse or promote 34 | products derived from this software without specific prior written 35 | permission. 36 | 37 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 38 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 39 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 40 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 41 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 42 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 43 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 44 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 45 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 46 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 47 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 48 | 49 | ---------------------------------------------------------------- 50 | 51 | Notice that the above BSD-style license applies to this one file 52 | (memcheck.h) only. The entire rest of Valgrind is licensed under 53 | the terms of the GNU General Public License, version 2. See the 54 | COPYING file in the source distribution for details. 55 | 56 | ---------------------------------------------------------------- 57 | */ 58 | 59 | 60 | #ifndef __MEMCHECK_H 61 | #define __MEMCHECK_H 62 | 63 | 64 | /* This file is for inclusion into client (your!) code. 65 | 66 | You can use these macros to manipulate and query memory permissions 67 | inside your own programs. 68 | 69 | See comment near the top of valgrind.h on how to use them. 70 | */ 71 | 72 | #include "valgrind.h" 73 | 74 | /* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! 75 | This enum comprises an ABI exported by Valgrind to programs 76 | which use client requests. DO NOT CHANGE THE ORDER OF THESE 77 | ENTRIES, NOR DELETE ANY -- add new ones at the end. 
*/
/* Request codes understood by Memcheck.  The first enumerator is anchored
   at VG_USERREQ_TOOL_BASE('M','C') and the rest number consecutively from
   it; the two internal-use codes restart at that base + 256. */
typedef
   enum {
      VG_USERREQ__MAKE_MEM_NOACCESS = VG_USERREQ_TOOL_BASE('M','C'),
      VG_USERREQ__MAKE_MEM_UNDEFINED,
      VG_USERREQ__MAKE_MEM_DEFINED,
      VG_USERREQ__DISCARD,
      VG_USERREQ__CHECK_MEM_IS_ADDRESSABLE,
      VG_USERREQ__CHECK_MEM_IS_DEFINED,
      VG_USERREQ__DO_LEAK_CHECK,
      VG_USERREQ__COUNT_LEAKS,

      VG_USERREQ__GET_VBITS,
      VG_USERREQ__SET_VBITS,

      VG_USERREQ__CREATE_BLOCK,

      VG_USERREQ__MAKE_MEM_DEFINED_IF_ADDRESSABLE,

      /* Not next to VG_USERREQ__COUNT_LEAKS because it was added later. */
      VG_USERREQ__COUNT_LEAK_BLOCKS,

      VG_USERREQ__ENABLE_ADDR_ERROR_REPORTING_IN_RANGE,
      VG_USERREQ__DISABLE_ADDR_ERROR_REPORTING_IN_RANGE,

      /* This is just for memcheck's internal use - don't use it */
      _VG_USERREQ__MEMCHECK_RECORD_OVERLAP_ERROR
         = VG_USERREQ_TOOL_BASE('M','C') + 256,
      _VG_USERREQ__MEMCHECK_VERIFY_ALIGNMENT
   } Vg_MemCheckClientRequest;



/* Client-code macros to manipulate the state of memory. */

/* Mark memory at _qzz_addr as unaddressable for _qzz_len bytes. */
#define VALGRIND_MAKE_MEM_NOACCESS(_qzz_addr,_qzz_len)           \
    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,      \
                            VG_USERREQ__MAKE_MEM_NOACCESS,       \
                            (_qzz_addr), (_qzz_len), 0, 0, 0)

/* Similarly, mark memory at _qzz_addr as addressable but undefined
   for _qzz_len bytes. */
#define VALGRIND_MAKE_MEM_UNDEFINED(_qzz_addr,_qzz_len)          \
    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,      \
                            VG_USERREQ__MAKE_MEM_UNDEFINED,      \
                            (_qzz_addr), (_qzz_len), 0, 0, 0)

/* Similarly, mark memory at _qzz_addr as addressable and defined
   for _qzz_len bytes. */
#define VALGRIND_MAKE_MEM_DEFINED(_qzz_addr,_qzz_len)            \
    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,      \
                            VG_USERREQ__MAKE_MEM_DEFINED,        \
                            (_qzz_addr), (_qzz_len), 0, 0, 0)

/* Similar to VALGRIND_MAKE_MEM_DEFINED except that addressability is
   not altered: bytes which are addressable are marked as defined,
   but those which are not addressable are left unchanged. */
#define VALGRIND_MAKE_MEM_DEFINED_IF_ADDRESSABLE(_qzz_addr,_qzz_len)     \
    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,              \
                            VG_USERREQ__MAKE_MEM_DEFINED_IF_ADDRESSABLE, \
                            (_qzz_addr), (_qzz_len), 0, 0, 0)

/* Create a block-description handle.  The description is an ascii
   string which is included in any messages pertaining to addresses
   within the specified memory range.  Has no other effect on the
   properties of the memory range. */
#define VALGRIND_CREATE_BLOCK(_qzz_addr,_qzz_len, _qzz_desc)	   \
    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,        \
                            VG_USERREQ__CREATE_BLOCK,              \
                            (_qzz_addr), (_qzz_len), (_qzz_desc),  \
                            0, 0)

/* Discard a block-description-handle. Returns 1 for an
   invalid handle, 0 for a valid handle. */
#define VALGRIND_DISCARD(_qzz_blkindex)                          \
    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,      \
                            VG_USERREQ__DISCARD,                 \
                            0, (_qzz_blkindex), 0, 0, 0)


/* Client-code macros to check the state of memory. */

/* Check that memory at _qzz_addr is addressable for _qzz_len bytes.
   If suitable addressability is not established, Valgrind prints an
   error message and returns the address of the first offending byte.
   Otherwise it returns zero. */
#define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(_qzz_addr,_qzz_len)      \
    VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                             \
                            VG_USERREQ__CHECK_MEM_IS_ADDRESSABLE,  \
                            (_qzz_addr), (_qzz_len), 0, 0, 0)

/* Check that memory at _qzz_addr is addressable and defined for
   _qzz_len bytes.  If suitable addressability and definedness are not
   established, Valgrind prints an error message and returns the
   address of the first offending byte.  Otherwise it returns zero. */
#define VALGRIND_CHECK_MEM_IS_DEFINED(_qzz_addr,_qzz_len)        \
    VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                           \
                            VG_USERREQ__CHECK_MEM_IS_DEFINED,    \
                            (_qzz_addr), (_qzz_len), 0, 0, 0)

/* Use this macro to force the definedness and addressability of an
   lvalue to be checked.  If suitable addressability and definedness
   are not established, Valgrind prints an error message and returns
   the address of the first offending byte.  Otherwise it returns
   zero.  The checked range is the sizeof(__lvalue) bytes starting at
   &(__lvalue). */
#define VALGRIND_CHECK_VALUE_IS_DEFINED(__lvalue)                \
   VALGRIND_CHECK_MEM_IS_DEFINED(                                \
      (volatile unsigned char *)&(__lvalue),                     \
                      (unsigned long)(sizeof (__lvalue)))


/* The five leak-check macros below all issue VG_USERREQ__DO_LEAK_CHECK
   and differ only in their first two arguments; they are statements,
   not expressions. */

/* Do a full memory leak check (like --leak-check=full) mid-execution. */
#define VALGRIND_DO_LEAK_CHECK                                   \
    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DO_LEAK_CHECK,   \
                                    0, 0, 0, 0, 0)

/* Same as VALGRIND_DO_LEAK_CHECK but only showing the entries for
   which there was an increase in leaked bytes or leaked nr of blocks
   since the previous leak search. */
#define VALGRIND_DO_ADDED_LEAK_CHECK                            \
    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DO_LEAK_CHECK,  \
                                    0, 1, 0, 0, 0)

/* Same as VALGRIND_DO_ADDED_LEAK_CHECK but showing entries with
   increased or decreased leaked bytes/blocks since previous leak
   search. */
#define VALGRIND_DO_CHANGED_LEAK_CHECK                          \
    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DO_LEAK_CHECK,  \
                                    0, 2, 0, 0, 0)

/* Same as VALGRIND_DO_LEAK_CHECK but only showing new entries
   i.e. loss records that were not there in the previous leak
   search. */
#define VALGRIND_DO_NEW_LEAK_CHECK                              \
    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DO_LEAK_CHECK,  \
                                    0, 3, 0, 0, 0)

/* Do a summary memory leak check (like --leak-check=summary) mid-execution. */
#define VALGRIND_DO_QUICK_LEAK_CHECK                             \
    VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DO_LEAK_CHECK,   \
                                    1, 0, 0, 0, 0)

/* Return number of leaked, dubious, reachable and suppressed bytes found by
   all previous leak checks.  They must be lvalues.
   NOTE: this expands to a bare { ... } block, not do { ... } while (0),
   so following it with ';' inside an unbraced if/else ends the if. */
#define VALGRIND_COUNT_LEAKS(leaked, dubious, reachable, suppressed)     \
   /* For safety on 64-bit platforms we assign the results to private
      unsigned long variables, then assign these to the lvalues the user
      specified, which works no matter what type 'leaked', 'dubious', etc
      are.  We also initialise '_qzz_leaked', etc because
      VG_USERREQ__COUNT_LEAKS doesn't mark the values returned as
      defined. */                                                        \
   {                                                                     \
    unsigned long _qzz_leaked    = 0, _qzz_dubious    = 0;               \
    unsigned long _qzz_reachable = 0, _qzz_suppressed = 0;               \
    VALGRIND_DO_CLIENT_REQUEST_STMT(                                     \
                               VG_USERREQ__COUNT_LEAKS,                  \
                               &_qzz_leaked, &_qzz_dubious,              \
                               &_qzz_reachable, &_qzz_suppressed, 0);    \
    leaked     = _qzz_leaked;                                            \
    dubious    = _qzz_dubious;                                           \
    reachable  = _qzz_reachable;                                         \
    suppressed = _qzz_suppressed;                                        \
   }

/* Return number of leaked, dubious, reachable and suppressed blocks found by
   all previous leak checks.  They must be lvalues.
   NOTE: like VALGRIND_COUNT_LEAKS, this expands to a bare { ... } block. */
#define VALGRIND_COUNT_LEAK_BLOCKS(leaked, dubious, reachable, suppressed) \
   /* For safety on 64-bit platforms we assign the results to private
      unsigned long variables, then assign these to the lvalues the user
      specified, which works no matter what type 'leaked', 'dubious', etc
      are.  We also initialise '_qzz_leaked', etc because
      VG_USERREQ__COUNT_LEAK_BLOCKS doesn't mark the values returned as
      defined. */                                                        \
   {                                                                     \
    unsigned long _qzz_leaked    = 0, _qzz_dubious    = 0;               \
    unsigned long _qzz_reachable = 0, _qzz_suppressed = 0;               \
    VALGRIND_DO_CLIENT_REQUEST_STMT(                                     \
                               VG_USERREQ__COUNT_LEAK_BLOCKS,            \
                               &_qzz_leaked, &_qzz_dubious,              \
                               &_qzz_reachable, &_qzz_suppressed, 0);    \
    leaked     = _qzz_leaked;                                            \
    dubious    = _qzz_dubious;                                           \
    reachable  = _qzz_reachable;                                         \
    suppressed = _qzz_suppressed;                                        \
   }


/* Get the validity data for addresses [zza..zza+zznbytes-1] and copy it
   into the provided zzvbits array.  Return values:
      0   if not running on valgrind
      1   success
      2   [previously indicated unaligned arrays;  these are now allowed]
      3   if any parts of zza/zzvbits are not addressable.
   The metadata is not copied in cases 0, 2 or 3 so it should be
   impossible to segfault your system by using this call.
*/
#define VALGRIND_GET_VBITS(zza,zzvbits,zznbytes)                \
    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                \
                                    VG_USERREQ__GET_VBITS,      \
                                    (const char*)(zza),         \
                                    (char*)(zzvbits),           \
                                    (zznbytes), 0, 0)

/* Set the validity data for addresses [zza..zza+zznbytes-1], copying it
   from the provided zzvbits array.  Return values:
      0   if not running on valgrind
      1   success
      2   [previously indicated unaligned arrays;  these are now allowed]
      3   if any parts of zza/zzvbits are not addressable.
   The metadata is not copied in cases 0, 2 or 3 so it should be
   impossible to segfault your system by using this call.
*/
#define VALGRIND_SET_VBITS(zza,zzvbits,zznbytes)                \
    (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                \
                                    VG_USERREQ__SET_VBITS,      \
                                    (const char*)(zza),         \
                                    (const char*)(zzvbits),     \
                                    (zznbytes), 0, 0 )

/* Disable and re-enable reporting of addressing errors in the
   specified address range. */
#define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(_qzz_addr,_qzz_len) \
    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,      \
       VG_USERREQ__DISABLE_ADDR_ERROR_REPORTING_IN_RANGE,      \
       (_qzz_addr), (_qzz_len), 0, 0, 0)

/* Counterpart of VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE: turn
   reporting back on for the given range. */
#define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(_qzz_addr,_qzz_len) \
    VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,      \
       VG_USERREQ__ENABLE_ADDR_ERROR_REPORTING_IN_RANGE,       \
       (_qzz_addr), (_qzz_len), 0, 0, 0)

#endif


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/gpu-nvidia-mmult/common/helper_string.h:
--------------------------------------------------------------------------------
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// End of the license header. */

// These are helper functions for the SDK samples (string parsing, timers, etc)
#ifndef COMMON_HELPER_STRING_H_
#define COMMON_HELPER_STRING_H_

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fstream>
#include <string>

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#ifndef _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_DEPRECATE
#endif
#ifndef STRCASECMP
#define STRCASECMP _stricmp
#endif
#ifndef STRNCASECMP
#define STRNCASECMP _strnicmp
#endif
#ifndef STRCPY
#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
#endif

#ifndef FOPEN
#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
#endif
#ifndef FOPEN_FAIL
#define FOPEN_FAIL(result) (result != 0)
#endif
#ifndef SSCANF
#define SSCANF sscanf_s
#endif
#ifndef SPRINTF
#define SPRINTF sprintf_s
#endif
#else  // Linux Includes
#include <strings.h>

#ifndef STRCASECMP
#define STRCASECMP strcasecmp
#endif
#ifndef STRNCASECMP
#define STRNCASECMP strncasecmp
#endif
#ifndef STRCPY
#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
#endif

#ifndef FOPEN
#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
#endif
#ifndef FOPEN_FAIL
#define FOPEN_FAIL(result) (result == NULL)
#endif
#ifndef SSCANF
#define SSCANF sscanf
#endif
#ifndef SPRINTF
#define SPRINTF sprintf
#endif
#endif

#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
#endif

// CUDA Utility Helper Functions

//! Count the leading @p delimiter characters of @p string.
//!
//! @return index of the first non-delimiter character, or 0 when the string
//!         is empty or consists only of delimiters (it is then left as is).
//!
//! The previous test compared against strlen(string) - 1, so a single
//! character after the delimiters ("-?", "-h", "--a") was treated as "nothing
//! left" and the delimiters were never stripped.
inline int stringRemoveDelimiter(char delimiter, const char *string) {
  int string_start = 0;

  // The '\0' test keeps the scan inside the string even for delimiter == 0.
  while (string[string_start] != '\0' && string[string_start] == delimiter) {
    string_start++;
  }

  if (string[string_start] == '\0') {
    return 0;
  }

  return string_start;
}

//! Locate the extension of @p filename (the text after its last '.').
//!
//! @param filename   NUL-terminated name to inspect
//! @param extension  receives a pointer into @p filename just past the last
//!                   '.', or NULL when there is no '.' or the only '.' is the
//!                   first character
//! @return index of the extension inside @p filename, or 0 when none
//!
//! The previous backwards scan read filename[-1] for an empty name and
//! reported no extension for names with a one-character stem ("a.b").
inline int getFileExtension(char *filename, char **extension) {
  char *last_dot = strrchr(filename, '.');

  if (last_dot == NULL || last_dot == filename) {
    *extension = NULL;
    return 0;
  }

  *extension = last_dot + 1;
  return static_cast<int>(*extension - filename);
}

//! Test whether flag @p string_ref is present on the command line.
//!
//! Leading '-' characters are ignored and only the part before an optional
//! '=' is compared (case-insensitively, whole name).
inline bool checkCmdLineFlag(const int argc, const char **argv,
                             const char *string_ref) {
  bool bFound = false;

  if (argc >= 1) {
    const int length = static_cast<int>(strlen(string_ref));

    for (int i = 1; i < argc; i++) {
      int string_start = stringRemoveDelimiter('-', argv[i]);
      const char *string_argv = &argv[i][string_start];

      const char *equal_pos = strchr(string_argv, '=');
      int argv_length = static_cast<int>(
          equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);

      if (length == argv_length &&
          !STRNCASECMP(string_argv, string_ref, length)) {
        bFound = true;
        break;  // one hit is enough
      }
    }
  }

  return bFound;
}

//! Parse the integer value of option @p string_ref into @p value.
//!
//! The FIRST argument whose name starts with @p string_ref
//! (case-insensitive prefix match) is used; the text after the name, minus
//! one optional '=', is converted with atoi.  @p value is left untouched when
//! the option is absent or has no text after its name.
//!
//! @return true when a matching argument was seen
template <class T>
inline bool getCmdLineArgumentValue(const int argc, const char **argv,
                                    const char *string_ref, T *value) {
  bool bFound = false;

  if (argc >= 1) {
    const int length = static_cast<int>(strlen(string_ref));

    for (int i = 1; i < argc; i++) {
      int string_start = stringRemoveDelimiter('-', argv[i]);
      const char *string_argv = &argv[i][string_start];

      if (!STRNCASECMP(string_argv, string_ref, length)) {
        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
          *value = (T)atoi(&string_argv[length + auto_inc]);
        }

        bFound = true;
        break;  // first match wins
      }
    }
  }

  return bFound;
}

//! Return the integer value of option @p string_ref.
//!
//! When several arguments match (case-insensitive prefix match) the LAST one
//! wins.  Returns 0 when the option is absent or has no text after its name.
inline int getCmdLineArgumentInt(const int argc, const char **argv,
                                 const char *string_ref) {
  int value = 0;

  if (argc >= 1) {
    const int length = static_cast<int>(strlen(string_ref));

    for (int i = 1; i < argc; i++) {
      int string_start = stringRemoveDelimiter('-', argv[i]);
      const char *string_argv = &argv[i][string_start];

      if (!STRNCASECMP(string_argv, string_ref, length)) {
        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
          value = atoi(&string_argv[length + auto_inc]);
        } else {
          value = 0;
        }
      }
    }
  }

  return value;
}

//! Return the floating-point value of option @p string_ref.
//!
//! Same matching rules as getCmdLineArgumentInt (last match wins); the value
//! is converted with atof.  Returns 0 when absent or empty.
inline float getCmdLineArgumentFloat(const int argc, const char **argv,
                                     const char *string_ref) {
  float value = 0.f;

  if (argc >= 1) {
    const int length = static_cast<int>(strlen(string_ref));

    for (int i = 1; i < argc; i++) {
      int string_start = stringRemoveDelimiter('-', argv[i]);
      const char *string_argv = &argv[i][string_start];

      if (!STRNCASECMP(string_argv, string_ref, length)) {
        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
          value = static_cast<float>(atof(&string_argv[length + auto_inc]));
        } else {
          value = 0.f;
        }
      }
    }
  }

  return value;
}

//! Fetch the string value of option @p string_ref.
//!
//! @param string_retval  receives a pointer INTO argv (not a copy) to the
//!                       text following the option name and one separator
//!                       character; an empty string when the argument is the
//!                       bare option name; NULL when the option is absent
//! @return true when a matching argument was seen (last match wins)
//!
//! Previously a bare option ("-file") produced a pointer one past the
//! terminating NUL.
inline bool getCmdLineArgumentString(const int argc, const char **argv,
                                     const char *string_ref,
                                     char **string_retval) {
  bool bFound = false;

  if (argc >= 1) {
    const int length = static_cast<int>(strlen(string_ref));

    for (int i = 1; i < argc; i++) {
      int string_start = stringRemoveDelimiter('-', argv[i]);
      char *string_argv = const_cast<char *>(&argv[i][string_start]);

      if (!STRNCASECMP(string_argv, string_ref, length)) {
        // Skip the separator only if there is one to skip.
        const int skip = (string_argv[length] == '\0') ? 0 : 1;
        *string_retval = &string_argv[length + skip];
        bFound = true;
      }
    }
  }

  if (!bFound) {
    *string_retval = NULL;
  }

  return bFound;
}

//////////////////////////////////////////////////////////////////////////////
//! Find the path for a file assuming that
//! files are found in the searchPath.
//!
//! Entries containing the <executable_name> placeholder are expanded with the
//! base name of @p executable_path and are skipped when it is NULL.
//!
//! @return a malloc'ed copy of the first path that can be opened (the caller
//!         releases it with free), otherwise 0
//! @param filename         name of the file
//! @param executable_path  optional absolute path of the executable
//////////////////////////////////////////////////////////////////////////////
inline char *sdkFindFilePath(const char *filename,
                             const char *executable_path) {
  // <executable_name> defines a variable that is replaced with the name of
  // the executable
  static const char kExecutableNameTag[] = "<executable_name>";

  // Typical relative search paths to locate needed companion files (e.g.
  // sample input data, or JIT source files) The origin for the relative search
  // may be the .exe file, a .bat file launching an .exe, a browser .exe
  // launching the .exe or .bat, etc
  const char *searchPath[] = {
      "./",       // same dir
      "./data/",  // same dir

      "../../../../Samples/<executable_name>/",  // up 4 in tree
      "../../../Samples/<executable_name>/",     // up 3 in tree
      "../../Samples/<executable_name>/",        // up 2 in tree

      "../../../../Samples/<executable_name>/data/",  // up 4 in tree
      "../../../Samples/<executable_name>/data/",     // up 3 in tree
      "../../Samples/<executable_name>/data/",        // up 2 in tree

      "../../../../Samples/0_Introduction/<executable_name>/",  // up 4 in tree
      "../../../Samples/0_Introduction/<executable_name>/",     // up 3 in tree
      "../../Samples/0_Introduction/<executable_name>/",        // up 2 in tree

      "../../../../Samples/1_Utilities/<executable_name>/",  // up 4 in tree
      "../../../Samples/1_Utilities/<executable_name>/",     // up 3 in tree
      "../../Samples/1_Utilities/<executable_name>/",        // up 2 in tree

      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/",  // up 4 in tree
      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/",     // up 3 in tree
      "../../Samples/2_Concepts_and_Techniques/<executable_name>/",        // up 2 in tree

      "../../../../Samples/3_CUDA_Features/<executable_name>/",  // up 4 in tree
      "../../../Samples/3_CUDA_Features/<executable_name>/",     // up 3 in tree
      "../../Samples/3_CUDA_Features/<executable_name>/",        // up 2 in tree

      "../../../../Samples/4_CUDA_Libraries/<executable_name>/",  // up 4 in tree
      "../../../Samples/4_CUDA_Libraries/<executable_name>/",     // up 3 in tree
      "../../Samples/4_CUDA_Libraries/<executable_name>/",        // up 2 in tree

      "../../../../Samples/5_Domain_Specific/<executable_name>/",  // up 4 in tree
      "../../../Samples/5_Domain_Specific/<executable_name>/",     // up 3 in tree
      "../../Samples/5_Domain_Specific/<executable_name>/",        // up 2 in tree

      "../../../../Samples/6_Performance/<executable_name>/",  // up 4 in tree
      "../../../Samples/6_Performance/<executable_name>/",     // up 3 in tree
      "../../Samples/6_Performance/<executable_name>/",        // up 2 in tree

      "../../../../Samples/0_Introduction/<executable_name>/data/",  // up 4 in tree
      "../../../Samples/0_Introduction/<executable_name>/data/",     // up 3 in tree
      "../../Samples/0_Introduction/<executable_name>/data/",        // up 2 in tree

      "../../../../Samples/1_Utilities/<executable_name>/data/",  // up 4 in tree
      "../../../Samples/1_Utilities/<executable_name>/data/",     // up 3 in tree
      "../../Samples/1_Utilities/<executable_name>/data/",        // up 2 in tree

      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",  // up 4 in tree
      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",     // up 3 in tree
      "../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",        // up 2 in tree

      "../../../../Samples/3_CUDA_Features/<executable_name>/data/",  // up 4 in tree
      "../../../Samples/3_CUDA_Features/<executable_name>/data/",     // up 3 in tree
      "../../Samples/3_CUDA_Features/<executable_name>/data/",        // up 2 in tree

      "../../../../Samples/4_CUDA_Libraries/<executable_name>/data/",  // up 4 in tree
      "../../../Samples/4_CUDA_Libraries/<executable_name>/data/",     // up 3 in tree
      "../../Samples/4_CUDA_Libraries/<executable_name>/data/",        // up 2 in tree

      "../../../../Samples/5_Domain_Specific/<executable_name>/data/",  // up 4 in tree
      "../../../Samples/5_Domain_Specific/<executable_name>/data/",     // up 3 in tree
      "../../Samples/5_Domain_Specific/<executable_name>/data/",        // up 2 in tree

      "../../../../Samples/6_Performance/<executable_name>/data/",  // up 4 in tree
      "../../../Samples/6_Performance/<executable_name>/data/",     // up 3 in tree
      "../../Samples/6_Performance/<executable_name>/data/",        // up 2 in tree

      "../../../../Common/data/",  // up 4 in tree
      "../../../Common/data/",     // up 3 in tree
      "../../Common/data/"         // up 2 in tree
  };

  // Extract the executable name
  std::string executable_name;

  if (executable_path != 0) {
    executable_name = std::string(executable_path);

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    // Windows path delimiter
    size_t delimiter_pos = executable_name.find_last_of('\\');
    executable_name.erase(0, delimiter_pos + 1);

    if (executable_name.rfind(".exe") != std::string::npos) {
      // we strip .exe, only if the .exe is found
      executable_name.resize(executable_name.size() - 4);
    }

#else
    // Linux & OSX path delimiter
    // (npos + 1 wraps to 0, so a name without '/' is kept whole)
    size_t delimiter_pos = executable_name.find_last_of('/');
    executable_name.erase(0, delimiter_pos + 1);
#endif
  }

  // Loop over all search paths and return the first hit
  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
    std::string path(searchPath[i]);
    size_t executable_name_pos = path.find(kExecutableNameTag);

    // If there is executable_name variable in the searchPath
    // replace it with the value
    if (executable_name_pos != std::string::npos) {
      if (executable_path != 0) {
        path.replace(executable_name_pos, strlen(kExecutableNameTag),
                     executable_name);
      } else {
        // Skip this path entry if no executable argument is given
        continue;
      }
    }

#ifdef _DEBUG
    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
#endif

    // Test if the file exists
    path.append(filename);
    FILE *fp = NULL;
    FOPEN(fp, path.c_str(), "rb");

    if (fp != NULL) {
      fclose(fp);
      // File found
      // returning an allocated array here for backwards compatibility reasons
      char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));

      if (file_path == NULL) {
        return 0;  // out of memory: report "not found" rather than crash
      }

      STRCPY(file_path, path.length() + 1, path.c_str());
      return file_path;
    }
  }

  // File not found
  printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename);
  return 0;
}

#endif  // COMMON_HELPER_STRING_H_