├── .gitignore
├── README.md
├── Linaro-Forge
    ├── correctness
    │   ├── core-files
    │   │   ├── gdb-dump-corefile.cmd
    │   │   ├── core.makefile
    │   │   ├── div-by-zero.c
    │   │   └── common.makefile
    │   ├── debug
    │   │   ├── Makefile
    │   │   ├── deadlock.c
    │   │   ├── memory_debugging.c
    │   │   ├── simple.c
    │   │   └── split.c
    │   └── gpu-nvidia-mmult
    │   │   ├── LICENSE
    │   │   ├── common
    │   │       ├── helper_functions.h
    │   │       ├── exception.h
    │   │       └── helper_string.h
    │   │   ├── README.md
    │   │   ├── Makefile
    │   │   └── matrixMul.cu
    ├── performance
    │   ├── mmult_py.makefile
    │   ├── mmultlib.c
    │   ├── mmultlib.f90
    │   ├── mmult.py
    │   └── common.makefile
    ├── README.md
    └── scripts
    │   └── submit-job.sh
├── .gitmodules
├── Sanitzers
    ├── AddressSanitizer
    │   ├── use-after-free.c
    │   ├── example_UseAfterFree.cc
    │   ├── illegalmemoryaccess.cpp
    │   └── README.md
    ├── LeakSanitizer
    │   ├── memory-leak.c
    │   └── README.md
    ├── MemorySanitizer
    │   ├── umr.cc
    │   ├── umr2.cc
    │   └── README.md
    ├── ThreadSanitizer
    │   ├── buggyreduction_omp.c
    │   ├── tiny_race.c
    │   └── README.md
    ├── Sanitizers4hpc
    │   ├── GPU
    │   │   ├── main.cc
    │   │   ├── README.md
    │   │   └── memcheck_demo.cu
    │   └── CPU
    │   │   ├── buggyreduction_mpiomp.c
    │   │   └── README.md
    └── README.md
├── Valgrind
    ├── memcheck
    │   ├── uninitialized.c
    │   ├── doublefree.c
    │   ├── manuel1.c
    │   ├── invalidparams.c
    │   ├── memoryleak.c
    │   ├── memoryleak_mpi.c
    │   ├── overlap.c
    │   ├── leak-cases.c
    │   ├── memalign.c
    │   ├── leak.h
    │   └── memcheck.h
    ├── massif
    │   └── example.c
    └── dhat
    │   ├── ad-hoc.c
    │   ├── dhat.out.1688970
    │   ├── dhat.out.2245130
    │   └── basic.c
├── TotalView
    ├── README.md
    ├── programs
    │   ├── TVcmd1
    │   ├── TVcmd2
    │   ├── TVcmd4
    │   ├── TVcmd3
    │   ├── TVcmd5
    │   ├── demoMpi_v2.TVD.v4breakpoints
    │   ├── Makefile
    │   └── combined.TVD.v4breakpoints
    └── src
    │   ├── array.h
    │   ├── simple.c
    │   ├── array.c
    │   ├── ReplayEngine_demo.cxx
    │   ├── myClassA.hxx
    │   ├── myClassB.hxx
    │   ├── myClassA.cxx
    │   ├── simple_threaded.c
    │   ├── myClassB.cxx
    │   ├── TVscript_demo.c
    │   ├── demoMpi_v2.C
    │   └── main.cxx
├── gdb4hpc
    └── README.md
├── CUDA
    └── CUDA-GDB
    │   └── README.md
└── fortran_memory
    ├── free_twice.f90
    ├── heap_overflow_underflow.f90
    ├── segfault.f90
    ├── memory_leaks.f90
    └── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | *.[oa]
2 | *.mod
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # debugging
2 | 
3 | Debugging example codes
4 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/core-files/gdb-dump-corefile.cmd:
--------------------------------------------------------------------------------
1 | handle SIGFPE stop
2 | set confirm off
3 | run
4 | gcore div-by-zero.core
5 | quit
6 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "CUDA/compute-sanitizer-samples"]
2 | 	path = CUDA/compute-sanitizer-samples
3 | 	url = https://github.com/NVIDIA/compute-sanitizer-samples.git
4 | 


--------------------------------------------------------------------------------
/Sanitzers/AddressSanitizer/use-after-free.c:
--------------------------------------------------------------------------------
1 | #include <stdlib.h>
2 | int main() {
3 |   char *x = (char*)malloc(10 * sizeof(char*));
4 |   free(x);
5 |   return x[5];
6 | }
7 | 


--------------------------------------------------------------------------------
/Sanitzers/LeakSanitizer/memory-leak.c:
--------------------------------------------------------------------------------
1 | #include <stdlib.h>
2 | void *p;
3 | int main() {
4 |   p = malloc(7);
5 |   p = 0; // The memory is leaked here.
6 |   return 0;
7 | }
8 | 


--------------------------------------------------------------------------------
/Sanitzers/AddressSanitizer/example_UseAfterFree.cc:
--------------------------------------------------------------------------------
1 | int main(int argc, char **argv) {
2 |   int *array = new int[100];
3 |   delete [] array;
4 |   return array[argc];  // BOOM
5 | }
6 | 


--------------------------------------------------------------------------------
/Valgrind/memcheck/uninitialized.c:
--------------------------------------------------------------------------------
 1 | /* Taken from Valgrind memcheck manual */
 2 | 
 3 | #include <stdio.h>
 4 | 
 5 | int main()
 6 | {
 7 |   int x;
 8 |   printf ("x = %d\n", x);
 9 |   return 0;
10 | }
11 | 


--------------------------------------------------------------------------------
/Sanitzers/MemorySanitizer/umr.cc:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | int main(int argc, char** argv) {
 4 |   int* a = new int[10];
 5 |   a[5] = 0;
 6 |   if (a[argc])
 7 |     printf("xx\n");
 8 |   return 0;
 9 | }
10 | 


--------------------------------------------------------------------------------
/Sanitzers/MemorySanitizer/umr2.cc:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | int main(int argc, char** argv) {
 4 |   int* a = new int[10];
 5 |   a[5] = 0;
 6 |   volatile int b = a[argc];
 7 |   if (b)
 8 |     printf("xx\n");
 9 |   return 0;
10 | }
11 | 


--------------------------------------------------------------------------------
/Valgrind/memcheck/doublefree.c:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | 
 5 | int main ( void )
 6 | {
 7 |    int i;
 8 |    void* p = malloc(177);
 9 |    for (i = 0; i < 2; i++)
10 |      free(p);
11 |    return 0;
12 | }
13 | 


--------------------------------------------------------------------------------
/Valgrind/memcheck/manuel1.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | int main ()
 4 | {
 5 |   int x;
 6 | 
 7 |   if (x==0xCAFEBABE)
 8 |   {
 9 |     printf ("x = %d\n", 99);
10 |   }
11 |   else
12 |   {
13 |     printf ("x = %d\n", 88);
14 |   }
15 | 
16 |   return 0;
17 | }
18 | 


--------------------------------------------------------------------------------
/TotalView/README.md:
--------------------------------------------------------------------------------
 1 | # TotalView training materials
 2 | 
 3 | See
 4 | 
 5 | - `/global/cfs/cdirs/training/2024/TotalView_May2024`
 6 | 
 7 | ## Build
 8 | 
 9 | The source codes are in the `src` directory.
10 | 
11 | ```
12 | $ cd programs
13 | $ make clean
14 | $ make
15 | ```
16 | 


--------------------------------------------------------------------------------
/TotalView/programs/TVcmd1:
--------------------------------------------------------------------------------
1 | echo 'rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 -create_actionpoint "TVscript_demo.c#138=print err_detail" ./TVscript_demo'
2 | rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 -create_actionpoint "TVscript_demo.c#138=print err_detail" ./TVscript_demo
3 | 


--------------------------------------------------------------------------------
/Sanitzers/ThreadSanitizer/buggyreduction_omp.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | int main (int argc, char **argv) {
 4 |   int sum = 0;
 5 |   #pragma omp parallel for shared(sum)
 6 |   for (int i=0; i<1000; i++)
 7 |     sum += i;
 8 | 
 9 |   printf("sum = %d\n", sum);
10 |   return 0;
11 | }
12 | 


--------------------------------------------------------------------------------
/TotalView/programs/TVcmd2:
--------------------------------------------------------------------------------
1 | echo 'rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 -create_actionpoint "TVscript_demo.c#129=print {mypi*numprocs}" ./TVscript_demo'
2 | rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 -create_actionpoint "TVscript_demo.c#129=print {mypi*numprocs}" ./TVscript_demo
3 | 


--------------------------------------------------------------------------------
/TotalView/programs/TVcmd4:
--------------------------------------------------------------------------------
1 | echo 'rm -f *log ; tvscript -mpi "Open MPI" -tasks 3 -event_action "error=>display_backtrace -show_arguments -show_locals" TVscript_demo'
2 | rm -f *log ; tvscript -mpi "Open MPI" -tasks 3 -event_action "error=>display_backtrace -show_arguments -show_locals" TVscript_demo
3 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/debug/Makefile:
--------------------------------------------------------------------------------
 1 | # -*- Mode: Makefile; -*-
 2 | #
 3 | # See COPYRIGHT in top-level directory.
 4 | #
 5 | 
 6 | CC=cc
 7 | # -O0 -g keeps the binaries debugger-friendly.
 8 | CFLAGS= -O0 -g -Wall -Wno-stringop-overflow
 9 | BINS=simple deadlock split memory_debugging
10 | 
11 | # all/clean are commands, not files: keep a stray file with either name from masking them.
12 | .PHONY: all clean
13 | 
14 | # Each binary is built from its same-named .c file by make's built-in rule.
15 | all: $(BINS)
16 | 
17 | clean:
18 | 	rm -f $(BINS)
19 | 


--------------------------------------------------------------------------------
/TotalView/programs/TVcmd3:
--------------------------------------------------------------------------------
1 | echo 'rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 -create_actionpoint "TVscript_demo.c#88=display_backtrace 1 -show_locals" ./TVscript_demo'
2 | rm -f *log ; tvscript -mpi "Open MPI" -tasks 4 -create_actionpoint "TVscript_demo.c#88=display_backtrace 1 -show_locals" ./TVscript_demo
3 | 


--------------------------------------------------------------------------------
/TotalView/programs/TVcmd5:
--------------------------------------------------------------------------------
1 | echo 'rm -f *log ; tvscript -memory_debugging -mpi "Open MPI" -tasks 4 -event_action "termination_notification=list_allocations" ./TVscript_demo'
2 | rm -f *log ; tvscript -memory_debugging -mpi "Open MPI" -tasks 4 -event_action "termination_notification=list_allocations" ./TVscript_demo
3 | 


--------------------------------------------------------------------------------
/Sanitzers/AddressSanitizer/illegalmemoryaccess.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | int main(int argc, char **argv) {
 4 |  int *array = new int[100];
 5 | 
 6 |   for (int i = 0; i < 110; ++i) // Access more than allocated memory.
 7 |       array[i] = i+1;
 8 | 
 9 |   delete [] array;
10 | 
11 |   return 0;
12 | }
13 | 


--------------------------------------------------------------------------------
/Sanitzers/ThreadSanitizer/tiny_race.c:
--------------------------------------------------------------------------------
 1 | #include <pthread.h>
 2 | int Global;
 3 | void *Thread1(void *x) {
 4 |   Global = 42;
 5 |   return x;
 6 | }
 7 | int main() {
 8 |   pthread_t t;
 9 |   pthread_create(&t, NULL, Thread1, NULL);
10 |   Global = 43;
11 |   pthread_join(t, NULL);
12 |   return Global;
13 | }
14 | 


--------------------------------------------------------------------------------
/Valgrind/memcheck/invalidparams.c:
--------------------------------------------------------------------------------
 1 | /* Taken from Valgrind memcheck manual */
 2 | 
 3 | #include <stdlib.h>
 4 | #include <unistd.h>
 5 | 
 6 | int main( void )
 7 | {
 8 |   char* arr = malloc(10);
 9 |   int* arr2 = malloc(sizeof(int));
10 | 
11 |   write( 1 /* stdout */, arr, 10 );
12 | 
13 |   exit(arr2[0]);
14 | }
15 | 


--------------------------------------------------------------------------------
/Valgrind/memcheck/memoryleak.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | 
 3 | void f(void)
 4 | {
 5 |    int* x = malloc(10 * sizeof(int));
 6 |    x[10] = 0;        // problem 1: heap block overrun
 7 | }                    // problem 2: memory leak -- x not freed
 8 | 
 9 | int main(void)
10 | {
11 |    f();
12 |    return 0;
13 | }
14 | 


--------------------------------------------------------------------------------
/gdb4hpc/README.md:
--------------------------------------------------------------------------------
1 | # gdb4hpc
2 | 
3 | - `HPE_Oct2024`
4 |     Materials taken from
5 |     `/global/cfs/cdirs/training/2024/HPE_Oct2024/gdb4hpc_lab`,
6 |     prepared by HPE for the hands-on session for gdb4hpc in the HPE
7 |     Perlmutter Training on User Environment and Profiling/Debugging,
8 |     October 14-15, 2024
9 | 


--------------------------------------------------------------------------------
/CUDA/CUDA-GDB/README.md:
--------------------------------------------------------------------------------
 1 | CUDA-GDB example codes in the CUDA-GDB User Manual:
 2 | 
 3 | -   `bitreverse.cu`
 4 |     ```
 5 |     $ nvcc -g -G -o bitreverse bitreverse.cu
 6 | 
 7 |     $ cuda-gdb ./bitreverse
 8 |     ```
 9 | -   `autostep.cu`
10 |     ```
11 |     $ nvcc -g -G -o autostep autostep.cu
12 | 
13 |     $ cuda-gdb ./autostep
14 |     ```
15 | 


--------------------------------------------------------------------------------
/Sanitzers/Sanitizers4hpc/GPU/main.cc:
--------------------------------------------------------------------------------
 1 | #include <mpi.h>
 2 | 
 3 | void launch_memcheck_demo();
 4 | 
 5 | int main (int argc, char **argv)
 6 | {
 7 |     int rank, nprocs;
 8 |     MPI_Init (&argc, &argv);
 9 |     MPI_Comm_rank (MPI_COMM_WORLD, &rank);
10 |     MPI_Comm_size (MPI_COMM_WORLD, &nprocs);
11 | 
12 |     launch_memcheck_demo();
13 |     MPI_Finalize();
14 |     return 0;
15 | }
16 | 


--------------------------------------------------------------------------------
/Sanitzers/Sanitizers4hpc/CPU/buggyreduction_mpiomp.c:
--------------------------------------------------------------------------------
 1 | #include "mpi.h"
 2 | #include <stdio.h>
 3 | 
 4 | int main (int argc, char **argv) {
 5 |   int rank;
 6 | 
 7 |   MPI_Init(&argc, &argv);
 8 |   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 9 | 
10 |   int sum = 0;
11 |   #pragma omp parallel for shared(sum)
12 |   for (int i=0; i<1000; i++)
13 |     sum += i;
14 | 
15 |   printf("%d: sum = %d\n", rank, sum);
16 | 
17 |   MPI_Finalize();
18 |   return 0;
19 | }
20 | 


--------------------------------------------------------------------------------
/Valgrind/massif/example.c:
--------------------------------------------------------------------------------
 1 | /* From Massif Valgrind manual */
 2 | 
 3 | #include <stdlib.h>
 4 | 
 5 | void g(void)
 6 | {
 7 |   malloc(4000);
 8 | }
 9 | 
10 | void f(void)     /* allocates 2000 bytes itself, then 4000 more via g() */
11 | {
12 |   malloc(2000);  /* result discarded: this block is never freed */
13 |   g();
14 | }
15 | 
16 | int main(void)
17 | {
18 | 
19 |   int i;
20 |   int* a[10];
21 | 
22 |   for (i = 0; i < 10; i++) {
23 |     a[i] = malloc(1000);
24 |   }
25 | 
26 |   f();
27 | 
28 |   g();
29 | 
30 |   for (i = 0; i < 10; i++) {
31 |     free(a[i]);
32 |   }
33 | 
34 |   return 0;
35 | }
36 | 


--------------------------------------------------------------------------------
/Valgrind/memcheck/memoryleak_mpi.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <mpi.h>
 3 | 
 4 | void f(void)
 5 | {
 6 |    int* x = malloc(25000 * sizeof(int));
 7 |    x[25000] = 0;     // problem 1: heap block overrun
 8 | }                    // problem 2: memory leak -- x not freed
 9 | 
10 | int main(int argc, char **argv)
11 | {
12 |    int nproc, me;
13 |    MPI_Init(&argc, &argv);
14 |    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
15 |    MPI_Comm_rank(MPI_COMM_WORLD, &me);
16 |    f();
17 |    MPI_Finalize();
18 |    return 0;
19 | }
20 | 


--------------------------------------------------------------------------------
/fortran_memory/free_twice.f90:
--------------------------------------------------------------------------------
 1 |       program free_twice
 2 | !...  A buggy code prepared for a debugger tutorial by NERSC
 3 |       use mpi
 4 |       integer, parameter :: n = 1024
 5 |       real, allocatable :: a(:), b(:)
 6 |       integer i, ierr
 7 |       call mpi_init(ierr)
 8 |       allocate (a(n), b(n))
 9 |       call random_number(a)
10 |       b = cos(a)
11 |       deallocate (a)
12 |       print *, sum(b)
13 |       deallocate (a,b)  ! Oops..., deallocating 'a' again
14 |       call mpi_finalize(ierr)
15 |       end
16 | 


--------------------------------------------------------------------------------
/TotalView/src/array.h:
--------------------------------------------------------------------------------
 1 | #ifndef ARRAY_H
 2 | #define ARRAY_H
 3 | #include <string.h>
 4 | 
 5 | // You can't look up the values of preprocessed macros in the debugger
 6 | // So, if you want to look at the value of JMAX and IMAX compile w/
 7 | // -DUSE_GLOBALS
 8 | #ifdef USE_GLOBALS
 9 |    int JMAX = 1000;   // NOTE(review): array.c declares `double b[IMAX]` at file scope,
10 |    int IMAX = 1000;   // which needs a constant size -- confirm this variant still builds
11 | #else
12 | //kah   #define JMAX 1000
13 |    #define JMAX 100
14 |    #define IMAX 100
15 | #endif
16 | 
17 | /*** some global vars **/
18 | extern double b[];
19 | 
20 | void array();
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/core-files/core.makefile:
--------------------------------------------------------------------------------
 1 | include common.makefile
 2 | 
 3 | CFLAGS = -g
 4 | GDB_PATH = gdb
 5 | 
 6 | all: div-by-zero
 7 | 	# Create two different core files so that loading multiple ones
 8 | 	# can be tested.
 9 | 	$(GDB_PATH) -x gdb-dump-corefile.cmd --args ./div-by-zero 1
10 | 	mv div-by-zero.core div-by-zero-1.core
11 | 	$(GDB_PATH) -x gdb-dump-corefile.cmd --args ./div-by-zero 2
12 | 	mv div-by-zero.core div-by-zero-2.core
13 | 
14 | div-by-zero : div-by-zero.c
15 | 	$(CC) $(CFLAGS) $< -o $@
16 | 
17 | clean:
18 | 	$(RM) div-by-zero div-by-zero-1.core div-by-zero-2.core
19 | 


--------------------------------------------------------------------------------
/Sanitzers/Sanitizers4hpc/CPU/README.md:
--------------------------------------------------------------------------------
 1 | # Sanitizers4hpc with CPU codes
 2 | 
 3 | Example code:
 4 | 
 5 | -   `buggyreduction_mpiomp.c`: Sanitizers4hpc with ThreadSanitizer
 6 |     example code
 7 | 
 8 | ## `buggyreduction_mpiomp.c`
 9 | 
10 | This is a simple MPI code based on `buggyreduction_omp.c`.
11 | 
12 | ```
13 | $ salloc -C cpu -n 2 -c 2 -q shared -t 20
14 | ...
15 | 
16 | $ cc -fsanitize=thread -g -O1 -fopenmp buggyreduction_mpiomp.c -o buggyreduction_mpiomp
17 | 
18 | $ export OMP_NUM_THREADS=2
19 | 
20 | $ module load sanitizers4hpc
21 | 
22 | $ sanitizers4hpc -l "-n 2 -c 2" -- ./buggyreduction_mpiomp
23 | ```
24 | 


--------------------------------------------------------------------------------
/Linaro-Forge/performance/mmult_py.makefile:
--------------------------------------------------------------------------------
 1 | include common.makefile
 2 | 
 3 | # The MPI compiler commands (typically mpicc and mpif90) are autodetected
 4 | # by common.makefile. You can override by uncommenting the following:
 5 | #MPICC=
 6 | #MPIF90=
 7 | 
 8 | CFLAGS = -O2
 9 | 
10 | targets = libmmult_c.so libmmult_f
11 | 
12 | .PHONY: all
13 | all: $(targets)
14 | 
15 | libmmult_c.so: mmultlib.c
16 | 	$(MPICC) -std=c99 -fPIC -shared $(CFLAGS) $^ -o $@
17 | 
18 | .PHONY: libmmult_f
19 | libmmult_f: mmultlib.f90
20 | 	f2py --opt="$(CFLAGS)" -c $^ -m $@
21 | 
22 | .PHONY: clean
23 | clean:
24 | 	$(RM) libmmult_c.so libmmult_f*.so res*.mat
25 | 
26 | 


--------------------------------------------------------------------------------
/Valgrind/dhat/ad-hoc.c:
--------------------------------------------------------------------------------
 1 | /* #include "dhat/dhat.h" */
 2 | #include "dhat.h"
 3 | #include <stdlib.h>
 4 | void g(void) {
 5 |    DHAT_AD_HOC_EVENT(30);
 6 | }
 7 | 
 8 | void f(void) {
 9 |    g();
10 |    DHAT_AD_HOC_EVENT(20);
11 |    g();
12 | }
13 | 
14 | int main(void) {
15 |    f();
16 |    DHAT_AD_HOC_EVENT(10);
17 |    f();
18 | 
19 |    // At one point malloc was broken with --mode=ad-hoc(!), and Valgrind was
20 |    // printing messages like "VG_USERREQ__CLIENT_CALL1: func=0x0" when malloc
21 |    // was called. So check that it's basically working...
22 |    char* p = malloc(100);
23 |    p = realloc(p, 200);
24 |    free(p);
25 | 
26 |    return 0;
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/Linaro-Forge/README.md:
--------------------------------------------------------------------------------
 1 | # Linaro Forge training materials
 2 | 
 3 | See
 4 | 
 5 | - `/global/cfs/cdirs/training/2025/linaro-forge-training`
 6 | - `/global/cfs/cdirs/training/2024/Forge_Mar2024`
 7 | 
 8 | # Build
 9 | 
10 | ## Debugging
11 | 
12 | ```
13 | $ cd correctness
14 | 
15 | $ cd core-files
16 | $ make -f core.makefile clean
17 | $ make -f core.makefile
18 | 
19 | $ cd ..
20 | 
21 | $ cd debug
22 | $ make clean
23 | $ make
24 | 
25 | $ cd ..
26 | 
27 | $ cd gpu-nvidia-mmult
28 | $ make clean
29 | $ make
30 | 
31 | $ cd ../..
32 | ```
33 | 
34 | ## Profiling
35 | 
36 | ```
37 | $ cd performance
38 | $ ml python
39 | $ make -f mmult_py.makefile clean
40 | $ make -f mmult_py.makefile
41 | ```
42 | 


--------------------------------------------------------------------------------
/TotalView/src/simple.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h> 
 2 | #include "array.h"
 3 | 
 4 | int main(int argc, char **argv)
 5 | { 
 6 |     /************* command line args ***/ 
 7 |     { 
 8 |         char command_line_string[80];
 9 |         if (argc > 1) 
10 |         { 
11 |             strcpy(command_line_string, argv[1]); 
12 |             printf("arg_2=%s\n", command_line_string); 
13 |         } 
14 |     }
15 |     
16 |     array();
17 | 
18 |     array();
19 | 
20 |     {
21 | 
22 |         char input[80];
23 |         scanf( "%s", input );
24 |         printf( "You entered: %s\n", input );
25 |         scanf( "%s", input );
26 |         printf( "Now you entered: %s\n", input );
27 | 
28 |     }
29 | 
30 |     return 0;
31 | }
32 | 


--------------------------------------------------------------------------------
/fortran_memory/heap_overflow_underflow.f90:
--------------------------------------------------------------------------------
 1 |       program heap_overflow_underflow
 2 | !...  A buggy code prepared for a debugger tutorial by NERSC
 3 |       use mpi
 4 |       integer, parameter :: n = 1024
 5 |       integer, parameter :: ouf = 8
 6 |       real, allocatable :: a(:), b(:)
 7 |       integer i, ierr
 8 |       call mpi_init(ierr)
 9 |       allocate (a(n), b(n))
10 |       call random_number(a)
11 |       b = cos(a)
12 |       b(1)     = cos(a(1-ouf))  ! read underflow
13 |       b(1-ouf) = cos(a(1))      ! write underflow
14 |       b(n) =     cos(a(n+ouf))  ! read overflow
15 |       b(n+ouf) = cos(a(n))      ! write overflow
16 |       print *, sum(b)
17 |       deallocate (a, b)
18 |       call mpi_finalize(ierr)
19 |       end
20 | 


--------------------------------------------------------------------------------
/Linaro-Forge/scripts/submit-job.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | 
 3 | #SBATCH -J linaro-forge-hands-on
 4 | #SBATCH -o stdout.%J.out
 5 | #SBATCH -e stderr.%J.err
 6 | #SBATCH -A ntrain7
 7 | #SBATCH -C cpu
 8 | #SBATCH --time=00:30:00
 9 | #SBATCH --nodes=1
10 | #SBATCH -c 32
11 | #SBATCH --reservation=forge_cpu
12 | 
13 | export SLURM_CPU_BIND="cores"
14 | 
15 | SIMPLEPATH=$FORGE_TRAINING/correctness/debug/simple
16 | MMULTPATH=$FORGE_TRAINING/performance/mmult.py
17 | 
18 | module load forge
19 | 
20 | # Debug a simple MPI program
21 | ddt --offline -o offline-debugging.html --break-at=simple.c:32 --break-at=simple.c:41 srun -n 4 $SIMPLEPATH
22 | 
23 | # Profile matrix multiplication example
24 | #cd `dirname $MMULTPATH`
25 | #map --profile srun -n 8 python3 $MMULTPATH -s 3072
26 | 


--------------------------------------------------------------------------------
/TotalView/src/array.c:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include <stdio.h>
 3 | #include "array.h"
 4 | 
 5 | 
 6 | double b[IMAX];
 7 | 
 8 | void dowork()
 9 | {
10 |        /**** some array operations ***/
11 |     {
12 |         int i, j, jmod;
13 |         double xi, xj, dx, scale = 100.0;
14 | 
15 |         for (j = 0; j < JMAX; j++)
16 |         {
17 |             jmod = (100*j) %JMAX;
18 |             xj = (double)jmod/(double)JMAX;
19 |             for (i = 0; i < IMAX; i++)
20 |             {
21 |                 xi = (double)i/(double)IMAX;
22 |                 dx = xi-xj;
23 |                 b[i] = 2.0/(1.0+exp(scale*dx*dx));
24 |             }
25 |             printf("counter %d\n", j);
26 |         }
27 |     }
28 |     return;
29 | }
30 |    
31 | 
32 | void array()
33 | {
34 |     dowork();
35 |     return;
36 | }
37 | 


--------------------------------------------------------------------------------
/fortran_memory/segfault.f90:
--------------------------------------------------------------------------------
 1 |       program segfault
 2 | !...  A buggy code prepared for a debugger tutorial by NERSC
 3 |       use mpi
 4 |       integer, parameter :: n = 10
 5 |       real, pointer :: a(:) => null()
 6 |       real, pointer :: b(:) => null()
 7 |       real, pointer :: c(:) => null()
 8 |       integer me, i, ierr
 9 |       call mpi_init(ierr)
10 |       call mpi_comm_rank(mpi_comm_world,me,ierr)
11 | !     allocate (a(n), b(n), c(n))  ! Oops, forgot to allocate...
12 |       call sub(a,b,c,n)
13 |       print *, sum(c)
14 |       deallocate (a, b, c)
15 |       call mpi_finalize(ierr)
16 |       end
17 | 
18 |       subroutine sub(a,b,c,n)
19 |       integer n
20 |       real a(n), b(n), c(n)
21 |       call random_number(a)
22 |       call random_number(b)
23 |       do i=1,n
24 |          c(i) = cos(a(i)) * sin(b(i))
25 |       end do
26 |       end
27 | 


--------------------------------------------------------------------------------
/Valgrind/dhat/dhat.out.1688970:
--------------------------------------------------------------------------------
 1 | {"dhatFileVersion":2
 2 | ,"mode":"heap","verb":"Allocated"
 3 | ,"bklt":true,"bkacc":true
 4 | ,"tu":"instrs","Mtu":"Minstr"
 5 | ,"tuth":500
 6 | ,"cmd":"./basic"
 7 | ,"pid":1688970
 8 | ,"te":341947
 9 | ,"tg":336617
10 | ,"pps":
11 |  [{"tb":4000,"tbk":2
12 |   ,"tl":7111
13 |   ,"mb":3000,"mbk":1
14 |   ,"gb":3000,"gbk":1
15 |   ,"eb":3000,"ebk":1
16 |   ,"rb":1008,"wb":1516
17 |   ,"fs":[1,2]
18 |   }
19 |  ,{"tb":3000,"tbk":2
20 |   ,"tl":1831
21 |   ,"mb":2000,"mbk":1
22 |   ,"gb":2000,"gbk":1
23 |   ,"eb":0,"ebk":0
24 |   ,"rb":2000,"wb":2000
25 |   ,"fs":[3,4]
26 |   }
27 |  ]
28 | ,"ftbl":
29 |  ["[root]"
30 |  ,"0x4E056A4: malloc (in /usr/lib/valgrind/vgpreload_dhat-amd64-linux.so)"
31 |  ,"0x40051F: main (basic.c:11)"
32 |  ,"0x4E0A571: calloc (in /usr/lib/valgrind/vgpreload_dhat-amd64-linux.so)"
33 |  ,"0x400540: main (basic.c:15)"
34 |  ]
35 | }
36 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/core-files/div-by-zero.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <sys/types.h>
 3 | #include <signal.h>
 4 | #include <stdio.h>
 5 | #include <sys/resource.h>
 6 | #include <unistd.h>
 7 | 
 8 | void print_fraction(int numerator, int denominator)
 9 | {
10 |     printf("%d\n", numerator / denominator); kill (getpid(), SIGFPE);
11 | } 
12 | 
13 | 
14 | int main(int argc, char* argv[])
15 | {
16 |     /* Take a value out of the arguments so that loading multiple */
17 |     /* core files with different values works in ddt. */
18 |     int numerator = 1;
19 |     if(argc == 2) {
20 |         numerator = atoi(argv[1]);
21 |     }
22 | 
23 |     /* ensure a core file is generated */
24 |     struct rlimit limit;
25 |     limit.rlim_cur = limit.rlim_max = RLIM_INFINITY;
26 |     setrlimit(RLIMIT_CORE, &limit);
27 |     /* divide by zero */
28 |     print_fraction(numerator, 0);
29 | 
30 |     return 0;
31 | }
32 | 


--------------------------------------------------------------------------------
/Valgrind/dhat/dhat.out.2245130:
--------------------------------------------------------------------------------
 1 | {"dhatFileVersion":2
 2 | ,"mode":"ad-hoc","verb":"Occurred"
 3 | ,"bklt":false,"bkacc":false
 4 | ,"bu":"unit","bsu":"units","bksu":"events"
 5 | ,"tu":"instrs","Mtu":"Minstr"
 6 | ,"cmd":"./ad-hoc"
 7 | ,"pid":2245130
 8 | ,"te":340933
 9 | ,"pps":
10 |  [{"tb":30,"tbk":1
11 |   ,"fs":[1,2,3]
12 |   }
13 |  ,{"tb":20,"tbk":1
14 |   ,"fs":[4,3]
15 |   }
16 |  ,{"tb":30,"tbk":1
17 |   ,"fs":[5,6,3]
18 |   }
19 |  ,{"tb":10,"tbk":1
20 |   ,"fs":[7]
21 |   }
22 |  ,{"tb":30,"tbk":1
23 |   ,"fs":[1,2,8]
24 |   }
25 |  ,{"tb":20,"tbk":1
26 |   ,"fs":[4,8]
27 |   }
28 |  ,{"tb":30,"tbk":1
29 |   ,"fs":[5,6,8]
30 |   }
31 |  ]
32 | ,"ftbl":
33 |  ["[root]"
34 |  ,"0x4006F6: g (ad-hoc.c:5)"
35 |  ,"0x4006F6: f (ad-hoc.c:9)"
36 |  ,"0x4004C8: main (ad-hoc.c:15)"
37 |  ,"0x400750: f (ad-hoc.c:10)"
38 |  ,"0x4007A9: g (ad-hoc.c:5)"
39 |  ,"0x4007A9: f (ad-hoc.c:11)"
40 |  ,"0x400519: main (ad-hoc.c:22)"
41 |  ,"0x40052C: main (ad-hoc.c:17)"
42 |  ]
43 | }
44 | 


--------------------------------------------------------------------------------
/Valgrind/dhat/basic.c:
--------------------------------------------------------------------------------
 1 | // Some basic allocations and accesses.
 2 | 
 3 | #include <stdint.h>
 4 | #include <stdlib.h>
 5 | #include <string.h>
 6 | /* #include "dhat/dhat.h" */
 7 | #include "dhat.h"
 8 | 
 9 | int main(void)
10 | {
11 |    int64_t* m = malloc(1000);
12 |    m[0] = 1;                     // write 8 bytes
13 |    m[10] = m[1];                 // read and write 8 bytes
14 | 
15 |    char* c = calloc(1, 2000);
16 |    for (int i = 0; i < 1000; i++) {
17 |       c[i + 1000] = c[i];        // read and write 1000 bytes
18 |    }
19 | 
20 |    char* r = realloc(m, 3000);   // read and write 1000 bytes (memcpy)
21 |    for (int i = 0; i < 500; i++) {
22 |       r[i + 2000] = 99;          // write 500 bytes
23 |    }
24 | 
25 |    c = realloc(c, 1000);         // read and write 1000 bytes (memcpy)
26 | 
27 |    free(c);
28 |                                  // totals: 3008 read, 3516 write
29 | 
30 |    // Should be ignored because we're not in ad hoc mode.
31 |    DHAT_AD_HOC_EVENT(100);
32 | 
33 |    return 0;
34 | }
35 | 


--------------------------------------------------------------------------------
/TotalView/src/ReplayEngine_demo.cxx:
--------------------------------------------------------------------------------
 1 | //
 2 | //                 ReplayEngine Demo
 3 | //
 4 | //
 5 | #include <stdlib.h>
 6 | #include <assert.h>
 7 | #include <math.h>
 8 | #include <map>
 9 | #include <list>
10 | #include <vector>
11 | #include <string>
12 | #include <stdio.h>
13 | #include <iostream>
14 | #include <pthread.h>
15 | 
16 | int funcA(int);
17 | int funcB(int);
18 | int badstuff();
19 | #define MAXDEPTH 20
20 | int arraylength=MAXDEPTH;
21 | 
22 | // Demo driver: runs funcA(0) with arraylength == MAXDEPTH, calls badstuff() to enlarge arraylength, then runs funcA(0) again.
23 | int main()
24 | {
25 |     funcA(0);
26 |     badstuff();
27 |     funcA(0);      // funcB's fill loop now runs with the enlarged arraylength
28 |     exit(1);       // the program always exits with status 1
29 | }
30 | 
31 | 
32 | // Adds 2 to a and returns funcB of that value; funcA and funcB call each other recursively.
33 | int funcA(int a){
34 |     int b;
35 |     b=a+2;
36 |     b=funcB(b);
37 |     return b;
38 | }
39 | 
40 | // Adds 2 to b, recurses through funcA while the sum is below MAXDEPTH, then fills the local array v and returns c.
41 | int funcB(int b){
42 |     int c;
43 |     int i;
44 |     int v[MAXDEPTH];
45 |     int *p;
46 | 
47 |     c=b+2;
48 |     p=&c;
49 | 
50 |     if( c<MAXDEPTH )  
51 |        c=funcA(c);
52 | 
53 |     for (i=arraylength-1; i>0; i--){   // writes v[arraylength-1] down to v[1]
54 |         v[i]=*p;                       // past the end of v once arraylength exceeds MAXDEPTH (see badstuff)
55 |     }
56 | 
57 |     return c; 
58 | }
59 | // Sets the global arraylength to 5*MAXDEPTH, larger than the MAXDEPTH-element array v that funcB indexes with it.
60 | int badstuff(){
61 |     arraylength=5*MAXDEPTH;
62 |     return 0;
63 | }
64 | 


--------------------------------------------------------------------------------
/TotalView/src/myClassA.hxx:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |  * Copyright 2000-2006 by Etnus, LLC. ALL RIGHTS RESERVED
 3 |  * No part of this material may be reproduced, stored in a retrieval
 4 |  * system, transmitted or used in any form or by any means, electronic,
 5 |  * mechanical, photocopying, recording, or otherwise, without the prior
 6 |  * written permission of, or express license from Etnus, LLC.
 7 |  ***********************************************************************
 8 |  * This file contains PROPRIETARY INFORMATION of Etnus, LLC.
 9 |  ***********************************************************************
10 |  * Copyright 1999 by Etnus, Inc.
11 |  * Copyright 1996-1998 by Dolphin Interconnect Solutions, Inc.
12 |  * Copyright 1989-1996 by BBN Inc.
13 |  ***********************************************************************/ 
14 | #ifndef __MY_CLASS_A__
15 | #define __MY_CLASS_A__
16 | // Holds a float buffer, an int buffer and their shared element count; the constructor allocates them and the destructor releases them.
17 | class myClassA {
18 | 
19 | public:
20 |   myClassA();
21 |   ~myClassA();
22 | 
23 | private:
24 |   float *float_p;   // float buffer
25 |   int   *int_p;     // int buffer
26 |   int    size;      // element count used for both buffers
27 | };
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/debug/deadlock.c:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
 2 | /*
 3 |  * See COPYRIGHT in top-level directory.
 4 |  */
 5 | 
 6 | #include <stdio.h>
 7 | #include <stdlib.h>
 8 | #include "mpi.h"
 9 | /* Rank 0 sends one int to every rank (itself included) with MPI_Ssend; only rank 0 posts a receive, and only after the send loop. */
10 | int main(int argc, char **argv)
11 | {
12 |     int rank, size;
13 |     int i, data;
14 | 
15 |     MPI_Init(&argc, &argv);
16 | 
17 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
18 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
19 | 
20 |     if (rank == 0) {
21 |         int *sendbuf;
22 |         MPI_Request *sendreqs;
23 | 
24 |         /* setup send operations */
25 |         sendreqs = malloc(sizeof(MPI_Request) * size);
26 |         sendbuf = malloc(sizeof(int) * size);
27 | 
28 |         for (i = 0; i < size; i++) {
29 |             sendbuf[i] = i * 10;
30 |             MPI_Ssend(&sendbuf[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD); /* synchronous send; for i == 0 the target is rank 0 itself, whose MPI_Recv is below this loop */
31 | 	    //FIX: MPI_Isend(&sendbuf[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD, &sendreqs[i]);
32 |         }
33 |         
34 | 	MPI_Recv(&data, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
35 |         
36 | 	MPI_Waitall(size, sendreqs, MPI_STATUSES_IGNORE);    /* sendreqs is only written by the commented-out MPI_Isend above */
37 |     }
38 | 
39 |     MPI_Finalize();
40 |     return 0;
41 | }
42 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/debug/memory_debugging.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <unistd.h>
 3 | #include "mpi.h"
 4 | 
 5 | int *chunk_a, *chunk_b;
 6 | // Sleeps for `size` seconds, then allocates 4000*size bytes into the global chunk_a and frees them immediately.
 7 | void func_a(int size) {
 8 |     sleep(size);
 9 |     chunk_a = malloc(4000 * size);
10 |     free(chunk_a);
11 | } 
12 | // Allocates 2000*size bytes into the global chunk_b, calls func_a(size) while that block is live, then frees it.
13 | void func_b(int size) {
14 |     chunk_b = malloc(2000 * size);
15 |     func_a(size);
16 |     free(chunk_b);
17 | }
18 | // MPI program that allocates memory on every rank and then misuses free() in the ways the inline comments describe.
19 | int main(int argc, char** argv) {
20 |     int rank, size, i;
21 |     int *a[10], *dynamicArray;
22 |     
23 |     MPI_Init(&argc, &argv);
24 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
25 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
26 | 
27 |     if (rank == 0) {
28 | 	for (i = 0; i < size; i++) {
29 |             dynamicArray = malloc(sizeof(int)*100000); // pointer is overwritten each iteration; only the last block stays reachable
30 | 	}
31 |     }
32 | 
33 |     for (i = 0; i < 10; i++) {
34 |         a[i] = malloc(1000);
35 |     }
36 | 
37 |     func_a(size);
38 |     func_b(size/2);
39 | 
40 |     // Address not mapped error
41 |     for (i = 0; i < 11; i++) { // a has 10 elements, so i == 10 reads past its end
42 |         free(a[i]);
43 |     }
44 | 
45 |     free(a[1]);         // Free previously freed pointer
46 |     free(dynamicArray); // Should only be freed on proc 0
47 | 
48 |     MPI_Finalize();
49 | 
50 |     return 0;
51 | }
52 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/debug/simple.c:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
 2 | /*
 3 |  * See COPYRIGHT in top-level directory.
 4 |  */
 5 | 
 6 | #include <mpi.h>
 7 | #include <stdio.h>
 8 | /* Ring exchange: each rank posts a non-blocking receive from rank-1 and a non-blocking send to rank+1 (both wrapping around), then waits for both requests. */
 9 | int main(int argc, char **argv)
10 | {
11 |     int rank, size, target, source;
12 |     int sendbuf, recvbuf;
13 |     MPI_Request reqs[2];
14 | 
15 |     MPI_Init(&argc, &argv);
16 | 
17 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
18 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
19 | 
20 |     /* get communication partners */
21 |     target = (rank + 1) % size;
22 |     source = (rank - 1);
23 |     if (source < 0)
24 |         source += size;
25 | 
26 |     sendbuf = 42;
27 |     recvbuf = 0;
28 | 
29 |     MPI_Irecv(&recvbuf, 3, MPI_INT, source, 0, MPI_COMM_WORLD, &reqs[0]); /* NOTE(review): count is 3 but recvbuf is a single int -- confirm whether this mismatch is intended for the exercise */
30 | 
31 |     /* stop here to view posted recvs */
32 |     printf("recvs posted, recvbuf = %d\n", recvbuf);
33 | 
34 |     MPI_Isend(&sendbuf, 3, MPI_INT, target, 0, MPI_COMM_WORLD, &reqs[1]); /* NOTE(review): count is 3 but sendbuf is a single int -- confirm as above */
35 | 
36 |     /* stop here to view send ops */
37 |     printf("sends issued\n");
38 | 
39 |     MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
40 | 
41 |     printf("communication complete, recvbuf = %d\n", recvbuf);
42 | 
43 |     MPI_Finalize();
44 |     return 0;
45 | }
46 | 


--------------------------------------------------------------------------------
/TotalView/src/myClassB.hxx:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |  * Copyright 2000-2006 by Etnus, LLC. ALL RIGHTS RESERVED
 3 |  * No part of this material may be reproduced, stored in a retrieval
 4 |  * system, transmitted or used in any form or by any means, electronic,
 5 |  * mechanical, photocopying, recording, or otherwise, without the prior
 6 |  * written permission of, or express license from Etnus, LLC.
 7 |  ***********************************************************************
 8 |  * This file contains PROPRIETARY INFORMATION of Etnus, LLC.
 9 |  ***********************************************************************
10 |  * Copyright 1999 by Etnus, Inc.
11 |  * Copyright 1996-1998 by Dolphin Interconnect Solutions, Inc.
12 |  * Copyright 1989-1996 by BBN Inc.
13 |  ***********************************************************************/
14 | 
15 | #ifndef __MY_CLASS_B__
16 | #define __MY_CLASS_B__
17 | 
18 | #include <vector>
19 | // Holds a pointer to a vector of C strings, an array of int buffers, and the element count used for both.
20 | class myClassB {
21 | 
22 | public:
23 |   myClassB();
24 |   ~myClassB();
25 |   void init(void);
26 |   void destroy(void);
27 | 
28 | private:
29 |   
30 |   std::vector<char *>  *vector_char_p;   // vector of C strings, held by pointer
31 |   int                  **b_pp;           // array of pointers to int buffers
32 |   int                    size;           // number of strings and of int buffers
33 | };
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/TotalView/src/myClassA.cxx:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |  * Copyright 2000-2006 by Etnus, LLC. ALL RIGHTS RESERVED
 3 |  * No part of this material may be reproduced, stored in a retrieval
 4 |  * system, transmitted or used in any form or by any means, electronic,
 5 |  * mechanical, photocopying, recording, or otherwise, without the prior
 6 |  * written permission of, or express license from Etnus, LLC.
 7 |  ***********************************************************************
 8 |  * This file contains PROPRIETARY INFORMATION of Etnus, LLC.
 9 |  ***********************************************************************
10 |  * Copyright 1999 by Etnus, Inc.
11 |  * Copyright 1996-1998 by Dolphin Interconnect Solutions, Inc.
12 |  * Copyright 1989-1996 by BBN Inc.
13 |  ***********************************************************************/ 
14 | #include <stdlib.h>
15 | #include "myClassA.hxx"
16 | // Sets size to 128, allocates float_p with new[] and int_p with malloc, and fills both buffers.
17 | myClassA::myClassA() : size (128) {
18 |   
19 |   float_p = new float[size];
20 | 
21 |   for(int i=0; i<size; i++) {
22 |     float_p[i] = (float) i;   // element i holds the value i
23 |   }
24 | 
25 |   int_p = (int *) malloc(size * sizeof(int));
26 | 
27 |   for(int i=0; i<size; i++) {
28 |     int_p[i] = size-i;        // counts down from size to 1
29 |   }
30 | }
31 | // Releases both buffers with the deallocator that matches how each one was allocated.
32 | myClassA::~myClassA () {
33 | 
34 |   delete[] float_p;   // allocated with new[] in the constructor
35 |   free (int_p);       // allocated with malloc in the constructor
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/Sanitzers/MemorySanitizer/README.md:
--------------------------------------------------------------------------------
 1 | # MemorySanitizer (MSAN)
 2 | 
 3 | Example codes:
 4 | 
 5 | -  `umr.cc`: From [https://github.com/google/sanitizers/wiki/MemorySanitizer](https://github.com/google/sanitizers/wiki/MemorySanitizer)
 6 | -  `umr2.cc`: To track origins of uninitialized values; from [Clang MemorySanitizer page](https://clang.llvm.org/docs/MemorySanitizer.html)
 7 | 
 8 | ## `umr.cc`
 9 | 
10 | The code doesn't initialize `a[1]` but uses its value in a conditional
11 | statement.
12 | 
13 | The GNU compilers don't support MSAN, so use a different compiler.
14 | This example uses `PrgEnv-cray`. Compile with `-fsanitize=memory`.
15 | 
16 | ```
17 | $ CC -fsanitize=memory -g -O1 -o umr umr.cc
18 | 
19 | $ export MSAN_OPTIONS="allow_addr2line=true"
20 | 
21 | $ ./umr         # no argument here so argc in the program is 1
22 | ==578284==WARNING: MemorySanitizer: use-of-uninitialized-value
23 |     #0 0x2cf202 in main /pscratch/sd/e/elvis/sanitizers/umr.cc:6:7
24 |     #1 0x7fc4fa63e24c in __libc_start_main (/lib64/libc.so.6+0x3524c)
25 |     #2 0x24e4b9 in _start /home/abuild/rpmbuild/BUILD/glibc-2.31/csu/../sysdeps/x86_64/start.S:120
26 | 
27 | SUMMARY: MemorySanitizer: use-of-uninitialized-value /pscratch/sd/e/elvis/sanitizers/umr.cc:6:7 in main
28 | Exiting
29 | ```
30 | 
31 | The `MSAN_OPTIONS` environment variable is set because, without it,
32 | source line info is not displayed in error messages in this
33 | programming environment.
34 | 


--------------------------------------------------------------------------------
/Linaro-Forge/performance/mmultlib.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |    ===============================================================================
 3 |    Copyright (C) March 2023 - Linaro Limited (or its affiliates). All rights reserved.
 4 |    Copyright (C) Arm Limited, 2019-2023 All rights reserved.
 5 |    The example code is provided to you as an aid to learning when working
 6 |    with Linaro Forge, including but not limited to programming tutorials.
 7 |    Linaro hereby grants to you, subject to the terms and conditions of this Licence,
 8 |    a non-exclusive, non-transferable, non-sub-licensable, free-of-charge licence,
 9 |    to use and copy the Software solely for the purpose of demonstration and
10 |    evaluation.
11 |    You accept that the Software has not been tested by Linaro therefore the Software
12 |    is provided “as is”, without warranty of any kind, express or implied. In no
13 |    event shall the authors or copyright holders be liable for any claim, damages
14 |    or other liability, whether in action or contract, tort or otherwise, arising
15 |    from, out of or in connection with the Software or the use of Software.
16 |    ===============================================================================
17 | */
18 | 
19 | #include <stdio.h>
20 | #include <stdlib.h>
21 | 
22 | /* Adds to C the product of the first sz/nslices rows of A with B; A, B and C are indexed as flat arrays with sz elements per row. */
23 | void mmult(int sz, int nslices, double *A, double *B, double *C)
24 | {
25 |   for(int i=0; i<sz/nslices; i++)
26 |   {
27 |     for(int j=0; j<sz; j++)
28 |     {
29 |       double res = 0.0;
30 | 
31 |       for(int k=0; k<sz; k++)
32 |       {
33 |         res += A[i*sz+k]*B[k*sz+j];
34 |       }
35 | 
36 |       C[i*sz+j] += res;   /* accumulates into the existing C value rather than overwriting it */
37 |     }
38 |   }
39 | }
40 | 
41 | 


--------------------------------------------------------------------------------
/Linaro-Forge/performance/mmultlib.f90:
--------------------------------------------------------------------------------
 1 | !  ===============================================================================
 2 | !  Copyright (C) March 2023 - Linaro Limited (or its affiliates). All rights reserved.
 3 | !  Copyright (C) Arm Limited, 2019-2023 All rights reserved.
 4 | !  The example code is provided to you as an aid to learning when working
 5 | !  with Linaro Forge, including but not limited to programming tutorials.
 6 | !  Linaro hereby grants to you, subject to the terms and conditions of this Licence,
 7 | !  a non-exclusive, non-transferable, non-sub-licensable, free-of-charge licence,
 8 | !  to use and copy the Software solely for the purpose of demonstration and
 9 | !  evaluation.
10 | !  You accept that the Software has not been tested by Linaro therefore the Software
11 | !  is provided “as is”, without warranty of any kind, express or implied. In no
12 | !  event shall the authors or copyright holders be liable for any claim, damages
13 | !  or other liability, whether in action or contract, tort or otherwise, arising
14 | !  from, out of or in connection with the Software or the use of Software.
15 | !  ===============================================================================
16 | 
17 | ! For i = 1..sz/nslices, accumulates C(j,i) = C(j,i) + A(k,i)*B(j,k) over all j and k in 1..sz.
18 | ! File mmult.F90
19 |   subroutine mmult(sz, nslices, A, B, C)
20 |     integer, intent(in)     :: sz, nslices
21 |     real(8), intent(in)     :: A(sz,sz), B(sz,sz)
22 |     real(8), intent(inout)  :: C(sz,sz)
23 |     integer                 :: i,j,k
24 | 
25 |     do i=1,sz/nslices
26 |       do k=1,sz
27 |         do j=1,sz
28 |           C(j,i)=A(k,i)*B(j,k)+C(j,i)   ! innermost loop runs over the first index of C and B
29 |         end do
30 |       end do
31 |     end do
32 | 
33 |   end subroutine mmult
34 | 


--------------------------------------------------------------------------------
/fortran_memory/memory_leaks.f90:
--------------------------------------------------------------------------------
 1 |       program memory_leaks
 2 | !...  Calls sub_ok, sub_bad and sub_badx2 ten times each with n = 1000000, accumulating into val.
 3 | !...  Buggy code prepared by NERSC User Service Group for a debugging tutorial
 4 | !...  February, 2012
 5 | 
 6 |       use mpi
 7 |       implicit none
 8 |       integer, parameter :: n = 1000000
 9 |       real val
10 |       integer i, ierr
11 |       call mpi_init(ierr)
12 |       val = 0.
13 |       do i=1,10
14 |          call sub_ok(val,n)
15 |       end do
16 |       do i=1,10
17 |          call sub_bad(val,n)
18 |       end do
19 |       do i=1,10
20 |          call sub_badx2(val,n)
21 |       end do
22 |       print *, val
23 |       call mpi_finalize(ierr)
24 |       end
25 | !...  Allocates an allocatable array of n reals, adds the sum of n random numbers to val, and deallocates the array.
26 |       subroutine sub_ok(val,n)      ! no memory leak
27 |       integer n
28 |       real val
29 |       real, allocatable :: a(:)
30 |       allocate (a(n))
31 |       call random_number(a)
32 |       val = val + sum(a)
33 |       deallocate(a)
34 |       end
35 | !...  Same work as sub_ok, but through a pointer array that is never deallocated.
36 |       subroutine sub_bad(val,n)     ! memory leak of 4*n bytes per call
37 |       integer n
38 |       real val
39 |       real, pointer :: a(:)
40 |       allocate (a(n))
41 |       call random_number(a)
42 |       val = val + sum(a)
43 | !     deallocate(a)                 ! not ok not to deallocate
44 |       end
45 | !...  Allocates the pointer array twice without deallocating, so both blocks are still allocated on return.
46 |       subroutine sub_badx2(val,n)   ! memory leak of 8*n bytes per call
47 |       integer n
48 |       real val
49 |       real, pointer :: a(:)
50 |       allocate (a(n))
51 |       call random_number(a)
52 |       val = val + sum(a)
53 |       allocate (a(n))               ! not ok to allocate again
54 |       call random_number(a)
55 |       val = val + sum(a)
56 | !     deallocate(a)                 ! not ok not to deallocate
57 |       end
58 | 


--------------------------------------------------------------------------------
/TotalView/src/simple_threaded.c:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <string>
 3 | #include <vector>
 4 | #include <stdlib.h>
 5 | #include <pthread.h>
 6 | #include <unistd.h>
 7 | #ifdef ADD_MPI
 8 | #include <mpi.h>
 9 | #endif //ADD_MPI
10 | // Appends between 100 and 999 rand() values to vec (a draw below 100 is replaced by 150), calling usleep(20) after each push.
11 | void random_vector(std::vector<int>& vec)
12 | {
13 |   size_t count = (size_t)rand() % 1000;
14 |   if(count < 100) count = 150;
15 | 
16 |   for(size_t i=0; i<count; ++i)
17 |   {
18 |     vec.push_back(rand());
19 |     usleep(20);
20 |   }
21 | 
22 | }
23 | // Thread body: fills a local vector via random_vector and prints a completion message.
24 | void runme()
25 | {
26 |   std::vector<int> vec;
27 | 
28 |   random_vector(vec);
29 | 
30 |   std::cout << "A thread has finished" << std::endl;
31 | }
32 | 
33 | 
34 | // Creates numThreads pthreads running runme (default 25, or argv[1] when given and non-negative) and joins them all; with ADD_MPI defined it also initializes and finalizes MPI.
35 | int main(int argc, char** argv)
36 | {
37 | #ifdef ADD_MPI
38 |   int rank, nnodes, nthreads;
39 |   MPI_Init (&argc, &argv);
40 |   MPI_Comm_rank (MPI_COMM_WORLD, &rank);
41 |   MPI_Comm_size (MPI_COMM_WORLD, &nnodes);
42 | #endif //ADD_MPI
43 | 
44 |   time_t tm = time(NULL); // NOTE(review): no <time.h>/<ctime> include is visible in this file -- presumably pulled in transitively; confirm
45 |   srand(tm);
46 |   int numThreads = 25;
47 |   if(argc >= 2)
48 |   {
49 |     numThreads = atoi(argv[1]);
50 |     if(numThreads < 0)
51 |       numThreads = 25;
52 |   }
53 | 
54 |   std::vector<pthread_t> threads;
55 |   threads.reserve(numThreads);
56 | 
57 |   std::cout << "Main is going to create " << numThreads << " threads." << std::endl;
58 | 
59 |   for(int i=0; i<numThreads; ++i)
60 |   {
61 |     pthread_t newThread;
62 |     pthread_create(&newThread, NULL, (void*(*)(void*))runme, NULL); // NOTE(review): runme is declared void(), so it is invoked through a cast to void*(*)(void*) -- confirm this is acceptable on the target platforms
63 |     threads.push_back(newThread);
64 |   }
65 | 
66 |   for(int i=0; i<numThreads; ++i)
67 |   {
68 |     pthread_join(threads[i], NULL);
69 |   }
70 | 
71 | #ifdef ADD_MPI
72 |   MPI_Barrier(MPI_COMM_WORLD);
73 |   MPI_Finalize();
74 | #endif //ADD_MPI
75 | 
76 |   return 0;
77 | }
78 | 
79 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/gpu-nvidia-mmult/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions
 5 | are met:
 6 |  * Redistributions of source code must retain the above copyright
 7 |    notice, this list of conditions and the following disclaimer.
 8 |  * Redistributions in binary form must reproduce the above copyright
 9 |    notice, this list of conditions and the following disclaimer in the
10 |    documentation and/or other materials provided with the distribution.
11 |  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |    contributors may be used to endorse or promote products derived
13 |    from this software without specific prior written permission.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | For additional information on the license terms, see the CUDA EULA at
28 | https://docs.nvidia.com/cuda/eula/index.html
29 | 


--------------------------------------------------------------------------------
/TotalView/programs/demoMpi_v2.TVD.v4breakpoints:
--------------------------------------------------------------------------------
 1 | # Magic: LR-70-3595585-9ER
 2 | # Generated file -- DO NOT EDIT
 3 | # Breakpoint list saved by Linux x86_64 TotalView 2020.1.13
 4 | 
 5 | dset TV::Private::saved_breakpoint_actual_format 4
 6 | dset TV::Private::saved_breakpoint_actual_revision 0
 7 | 
 8 | namespace eval TV::Private {
 9 | 
10 | BP_start 1
11 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/demoMpi_v2.C#53}
12 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/demoMpi_v2.C#61}
13 | BP_set	InnerUnitPath {##/home/stewart/Training/Labs/Lab3/demoMpi_v2#demoMpi_v2.C#main#$b1}
14 | BP_set	InnerUnitLineOffset {8}
15 | BP_set	Flags {g 1 p g g}
16 | BP_set	SatSet {C}
17 | BP_set	SourceText {    ompi_communicator_t* my_mpi_comm_world=MPI_COMM_WORLD;}
18 | BP_set	Instruction {movl     $0x6128a0,-72(%rbp)}
19 | BP_done 1
20 | 
21 | BP_start 2
22 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/demoMpi_v2.C#53}
23 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/demoMpi_v2.C#71}
24 | BP_set	InnerUnitPath {##/home/stewart/Training/Labs/Lab3/demoMpi_v2#demoMpi_v2.C#main#$b1}
25 | BP_set	InnerUnitLineOffset {18}
26 | BP_set	Flags {g 1 p g g}
27 | BP_set	SatSet {C}
28 | BP_set	SourceText {    MPI_Get_processor_name(processor_name,&namelen);}
29 | BP_set	Instruction {leal     -148(%rbp),%rdx}
30 | BP_done 2
31 | 
32 | BP_start 3
33 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/demoMpi_v2.C#53}
34 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/demoMpi_v2.C#80}
35 | BP_set	InnerUnitPath {##/home/stewart/Training/Labs/Lab3/demoMpi_v2#demoMpi_v2.C#main#$b1}
36 | BP_set	InnerUnitLineOffset {27}
37 | BP_set	Flags {g 1 p g g}
38 | BP_set	SatSet {C}
39 | BP_set	SourceText {    if(previous<0) }
40 | BP_set	Instruction {cmpl     $0,-32(%rbp)}
41 | BP_done 3
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/TotalView/src/myClassB.cxx:
--------------------------------------------------------------------------------
 1 | /***********************************************************************
 2 |  * Copyright 2000-2006 by Etnus, LLC. ALL RIGHTS RESERVED
 3 |  * No part of this material may be reproduced, stored in a retrieval
 4 |  * system, transmitted or used in any form or by any means, electronic,
 5 |  * mechanical, photocopying, recording, or otherwise, without the prior
 6 |  * written permission of, or express license from Etnus, LLC.
 7 |  ***********************************************************************
 8 |  * Copyright 1999 by Etnus, Inc.
 9 |  * Copyright 1996-1998 by Dolphin Interconnect Solutions, Inc.
10 |  * Copyright 1989-1996 by BBN Inc.
11 |  ***********************************************************************/ 
12 | #include "myClassB.hxx"
13 | #include <string.h>
14 | #include <stdlib.h>
15 | // Sets size to 256, fills a newly allocated vector with `size` strdup'd strings, and calls init() to allocate b_pp.
16 | myClassB::myClassB() : size(256) {
17 |  
18 |   vector_char_p = new std::vector<char *>();
19 | 
20 |   for(int i=0; i<size; i++) {
21 |     vector_char_p->push_back((char *) strdup("This is from calling strdup in myClassB."));
22 |   }
23 | 
24 |   init();
25 | }
26 | 
27 | // Frees every b_pp[i], frees every string stored in the vector, and deletes the vector itself.
28 | myClassB::~myClassB() {
29 |   
30 |   std::vector<char *>::iterator iter;
31 | 
32 |   for(int i=0; i<size; i++) {
33 |     free(b_pp[i]);   // NOTE(review): the b_pp pointer array itself is not freed in this destructor -- confirm whether intended
34 |   }
35 | 
36 |   for(iter = vector_char_p->begin(); iter != vector_char_p->end(); iter++) {
37 |     free(*iter);     // strings were created with strdup in the constructor
38 |   }
39 | 
40 |   delete vector_char_p;
41 | 
42 | }
43 | // Allocates b_pp as an array of `size` int pointers, each pointing to its own malloc'd buffer of 128 ints.
44 | void myClassB::init(void) {
45 | 
46 |   b_pp = (int **) malloc (size * sizeof(int *));
47 | 
48 |   for(int i=0; i<size; i++) {
49 |     b_pp[i] = (int *) malloc(128 * sizeof(int));
50 |   }
51 | }
52 | // Deletes the vector (without freeing the strings it holds) and frees only the even-indexed b_pp entries.
53 | void myClassB::destroy(void) {
54 | 
55 |   delete vector_char_p;           // the strdup'd strings stored in the vector are not freed here
56 |   
57 |   for(int i=0; i<size; i=i+2) {   // step of 2: odd-indexed buffers are left allocated
58 |     free(b_pp[i]);                // NOTE(review): pointers are not reset, so ~myClassB() would release them a second time -- presumably deliberate for the demo; confirm
59 |   }
60 | 
61 | }
62 | 


--------------------------------------------------------------------------------
/Sanitzers/ThreadSanitizer/README.md:
--------------------------------------------------------------------------------
 1 | # ThreadSanitizer (TSAN)
 2 | 
 3 | Example codes:
 4 | 
 5 | -   `buggyreduction_omp.c`: an OpenMP example code with a data race on the variable `sum`
 6 | -   `tiny_race.c`: a pthread example code; from [Clang ThreadSanitizer
 7 |     documentation](https://clang.llvm.org/docs/ThreadSanitizer.html)
 8 |     ```
 9 |     $ clang -fsanitize=thread -g -O1 tiny_race.c -o tiny_race
10 |     $ ./tiny_race
11 |     ```
12 | 
13 | ## `buggyreduction_omp.c`
14 | 
15 | This code doesn't have the `reduction` clause for the variable
16 | `sum`, so there is a race condition among OpenMP threads.
17 | 
18 | To build and run:
19 | 
20 | ```
21 | $ cc -fsanitize=thread -g -O1 -fopenmp buggyreduction_omp.c -o buggyreduction_omp
22 | 
23 | $ export OMP_NUM_THREADS=8
24 | 
25 | $ ./buggyreduction_omp
26 | =================
27 | WARNING: ThreadSanitizer: data race (pid=2240264)
28 |   Read of size 4 at 0x7ffdf6e678bc by thread T1:
29 |     #0 main._omp_fn.0 /pscratch/sd/e/elvis/sanitizers/buggyreduction_omp.c:6 (a.out+0x400895)
30 |     #1 <null> <null> (libgomp.so.1+0x1dd4d)
31 | 
32 |   Previous write of size 4 at 0x7ffdf6e678bc by main thread:
33 |     #0 main._omp_fn.0 /pscratch/sd/e/elvis/sanitizers/buggyreduction_omp.c:7 (a.out+0x4008aa)
34 |     #1 GOMP_parallel <null> (libgomp.so.1+0x14e95)
35 | 
36 |   Location is stack of main thread.
37 | 
38 |   Location is global '<null>' at 0x000000000000 ([stack]+0x1e8bc)
39 | 
40 |   Thread T1 (tid=2240266, running) created by main thread at:
41 |     #0 pthread_create <null> (libtsan.so.2+0x61be6)
42 |     #1 <null> <null> (libgomp.so.1+0x1e38f)
43 | 
44 | SUMMARY: ThreadSanitizer: data race /pscratch/sd/e/elvis/sanitizers/buggyreduction_omp.c:6 in main._omp_fn.0
45 | ==================
46 | sum = 335625
47 | ThreadSanitizer: reported 1 warnings
48 | ```
49 | 
50 | You may have to run it a few times to see the error (it is a race
51 | condition, after all!).
52 | 


--------------------------------------------------------------------------------
/Sanitzers/Sanitizers4hpc/GPU/README.md:
--------------------------------------------------------------------------------
 1 | # Sanitizers4hpc with GPU codes
 2 | 
 3 | Example code:
 4 | 
 5 | -   `main.cc` and `memcheck_demo.cu`: Sanitizers4hpc with Compute Sanitizer's Memcheck example code
 6 | 
 7 | ## `main.cc` and `memcheck_demo.cu`
 8 | 
 9 | This is a simple MPI adaptation of NVIDIA's Compute Sanitizer
10 | example code,
11 | [`memcheck_demo.cu`](https://github.com/NVIDIA/compute-sanitizer-samples/blob/master/Memcheck/memcheck_demo.cu).
12 | The code is for using Compute Sanitizer's Memcheck tool.
13 | 
14 | ```
15 | $ salloc -A <project> -C gpu -N 1 --gpus-per-node=4 -q debug -t 10 ...
16 | ...
17 | 
18 | $ cc -fsanitize=thread -g -O1 -fopenmp buggyreduction_mpiomp.c -o buggyreduction_mpiomp
19 | $ CC -c -g main.cc
20 | $ nvcc -Xcompiler -rdynamic -lineinfo -c memcheck_demo.cu
21 | $ CC -o memcheck_demo main.o memcheck_demo.o
22 | 
23 | $ module load sanitizers4hpc
24 | 
25 | $ sanitizers4hpc -l "-n 4 -c 32 --cpu-bind=cores --gpus-per-task=1 --gpu-bind=none" -m ${CUDA_HOME}/compute-sanitizer/compute-sanitizer -f -- ./memcheck_demo
26 | RANKS: <2,3>
27 | ...
28 | Saved host backtrace up to driver entry point at error
29 |     #0 0x2eae6f in /usr/local/cuda-12.2/compat/libcuda.so.1
30 |     #1 0xd8f0 in /home/jenkins/src/gtlt/cuda/gtlt_cuda_query.c:325:gtlt_cuda_pointer_type /opt/cray/pe/lib64/libmpi_gtl_cuda.so.0
31 | ...
32 | RANKS: <0-1>
33 | ...
34 | Saved host backtrace up to driver entry point at error
35 |     #0 0x2eae6f in /usr/local/cuda-12.2/compat/libcuda.so.1
36 |     #1 0xd8f0 in /home/jenkins/src/gtlt/cuda/gtlt_cuda_query.c:325:gtlt_cuda_pointer_type /opt/cray/pe/lib64/libmpi_gtl_cuda.so.0
37 | ...
38 | ```
39 | 
40 | The `-f` flag is needed to bypass `sanitizers4hpc`'s requirement
41 | that the executable is instrumented for an LLVM Sanitizer. This
42 | example code is basically a CUDA code that is not instrumented for
43 | an LLVM Sanitizer.
44 | 
45 | Aggregation of output will improve in CPE/24.07.
46 | 


--------------------------------------------------------------------------------
/Sanitzers/LeakSanitizer/README.md:
--------------------------------------------------------------------------------
 1 | # LeakSanitizer (LSAN)
 2 | 
 3 | Example code:
 4 | 
 5 | -   `memory-leak.c`
 6 | 
 7 | ## LeakSanitizer as a stand-alone sanitizer
 8 | 
 9 | `memory-leak.c` has a memory leak of 7 bytes as the memory block
10 | pointed to by the pointer `p` is not freed before setting it to
11 | `NULL` (0).
12 | 
13 | Here we try the `clang` compiler in the `PrgEnv-aocc` environment
14 | to demonstrate that the tool works in the environment but you are
15 | free to use a different compiler that supports LSAN. To build and
16 | run:
17 | 
18 | ```
19 | $ clang -fsanitize=leak -g -O0 -o memory-leak memory-leak.c
20 | 
21 | $ ./memory-leak
22 | =================================================================
23 | ==2335900==ERROR: LeakSanitizer: detected memory leaks
24 | 
25 | Direct leak of 7 byte(s) in 1 object(s) allocated from:
26 |     #0 0x55966653a842 in malloc /.../nersc/nersc-user-env/prgenv/llvm_src_17.0.6/compiler-rt/lib/lsan/lsan_interceptors.cpp:75:3
27 |     #1 0x559666565898 in main /pscratch/sd/e/elvis/addresssanitizer/memory-leak.c:4:7
28 |     #2 0x7efe8f83e24c in __libc_start_main (/lib64/libc.so.6+0x3524c) (BuildId: ddc393ac74ed8f90d4fdfff796432fbafd281e1b)
29 | 
30 | SUMMARY: LeakSanitizer: 7 byte(s) leaked in 1 allocation(s)
31 | ```
32 | 
33 | ## LeakSanitizer run under AddressSanitizer:
34 | 
35 | LeakSanitizer can be combined with AddressSanitizer to get both
36 | memory error and leak detection. Build with `-fsanitize=address`
37 | but run the executable with the environment variable `ASAN_OPTIONS`
38 | set to `detect_leaks=1`:
39 | 
40 | ```
41 | $ clang -fsanitize=address -g -o memory-leak memory-leak.c
42 | 
43 | $ ASAN_OPTIONS=detect_leaks=1 ./memory-leak
44 | =================================================================
45 | ==2339511==ERROR: LeakSanitizer: detected memory leaks
46 | 
47 | Direct leak of 7 byte(s) in 1 object(s) allocated from:
48 |     #0 0x56040740afde in malloc /.../nersc/nersc-user-env/prgenv/llvm_src_17.0.6/compiler-rt/lib/asan/asan_malloc_linux.cpp:69:3
49 |     #1 0x560407447a68 in main /pscratch/sd/e/elvis/addresssanitizer/memory-leak.c:4:7
50 |     #2 0x7fdab443e24c in __libc_start_main (/lib64/libc.so.6+0x3524c) (BuildId: ddc393ac74ed8f90d4fdfff796432fbafd281e1b)
51 | 
52 | SUMMARY: AddressSanitizer: 7 byte(s) leaked in 1 allocation(s)
53 | ```
54 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/gpu-nvidia-mmult/common/helper_functions.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | 
28 | // These are helper functions for the SDK samples (string parsing,
29 | // timers, image helpers, etc)
30 | #ifndef COMMON_HELPER_FUNCTIONS_H_
31 | #define COMMON_HELPER_FUNCTIONS_H_
32 | 
33 | #ifdef WIN32
34 | #pragma warning(disable : 4996)
35 | #endif
36 | 
37 | // includes, project
38 | #include <assert.h>
39 | #include <exception.h>
40 | #include <math.h>
41 | #include <stdio.h>
42 | #include <stdlib.h>
43 | 
44 | #include <algorithm>
45 | #include <fstream>
46 | #include <iostream>
47 | #include <string>
48 | #include <vector>
49 | 
50 | // includes, timer, string parsing, image helpers
51 | #include <helper_image.h>  // helper functions for image compare, dump, data comparisons
52 | #include <helper_string.h>  // helper functions for string parsing
53 | #include <helper_timer.h>   // helper functions for timers
54 | 
55 | #ifndef EXIT_WAIVED
56 | #define EXIT_WAIVED 2
57 | #endif
58 | 
59 | #endif  // COMMON_HELPER_FUNCTIONS_H_
60 | 


--------------------------------------------------------------------------------
/Sanitzers/Sanitizers4hpc/GPU/memcheck_demo.cu:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | 
28 | #include <iostream>
29 | 
30 | __device__ int x;
31 | 
32 | __global__ void unaligned_kernel(void)  // kernel for the memcheck demo: performs one misaligned store
33 | {
34 |     *(int*) ((char*)&x + 1) = 42;  // int-sized write at (address of x) + 1 byte, i.e. not int-aligned
35 | }
36 | 
37 | __device__ void out_of_bounds_function(void)  // device helper invoked by out_of_bounds_kernel
38 | {
39 |     *(int*) 0x87654320 = 42;  // int write through a hard-coded address, not through any allocation made in this file
40 | }
41 | 
42 | __global__ void out_of_bounds_kernel(void)  // kernel wrapper: does nothing but call the device helper
43 | {
44 |     out_of_bounds_function();  // the faulty write happens one call level below the kernel
45 | }
46 | 
47 | static void run_unaligned(void)  // launch unaligned_kernel and print the resulting CUDA status string
48 | {
49 |     std::cout << "Running unaligned_kernel: ";
50 |     unaligned_kernel<<<1,1>>>();  // one block of one thread
51 |     std::cout << cudaGetErrorString(cudaDeviceSynchronize()) << std::endl;  // status returned by the synchronize call, as text
52 | }
53 | 
54 | static void run_out_of_bounds(void)  // launch out_of_bounds_kernel and print the resulting CUDA status string
55 | {
56 |     std::cout << "Running out_of_bounds_kernel: ";
57 |     out_of_bounds_kernel<<<1,1>>>();  // one block of one thread
58 |     std::cout << cudaGetErrorString(cudaDeviceSynchronize()) << std::endl;  // status returned by the synchronize call, as text
59 | }
60 | 
61 | void launch_memcheck_demo() {  // entry point: allocate device memory, then run both faulty kernels
62 |     int *devMem = nullptr;
63 | 
64 |     std::cout << "Mallocing memory" << std::endl;
65 |     cudaMalloc((void**)&devMem, 1024);  // 1024 bytes; the return status is not checked and devMem is not passed to either kernel
66 | 
67 |     run_unaligned();
68 |     run_out_of_bounds();
69 | 
70 |     // Omitted to demo leakcheck
71 |     // cudaFree(devMem);
72 | }
73 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/debug/split.c:
--------------------------------------------------------------------------------
 1 | /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
 2 | /*
 3 |  * See COPYRIGHT in top-level directory.
 4 |  */
 5 | 
 6 | #include <mpi.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | #include <unistd.h>
10 | 
11 | int main(int argc, char **argv)
12 | {
13 |     int rank, size;
14 |     int color, split_rank, split_size;
15 |     int *sendbuf, *recvbuf;
16 |     MPI_Comm split_comm;
17 | 
18 |     MPI_Init(&argc, &argv);
19 | 
20 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
21 |     MPI_Comm_size(MPI_COMM_WORLD, &size);    /* size is not used after this call */
22 | 
23 |     /* color evens and odds: even world ranks get color 1, odd world ranks get color 0 */
24 |     color = (rank % 2 == 0);
25 |     MPI_Comm_split(MPI_COMM_WORLD, color, 0, &split_comm);
26 |     if (color == 1)
27 |         MPI_Comm_set_name(split_comm, "Even Comm");
28 |     else
29 |         MPI_Comm_set_name(split_comm, "Odd Comm");
30 | 
31 |     MPI_Comm_rank(split_comm, &split_rank);
32 |     MPI_Comm_size(split_comm, &split_size);
33 | 
34 |     /* setup some comm buffers: one int slot per member of split_comm (malloc results are not checked) */
35 |     sendbuf = malloc(sizeof(int) * split_size);
36 |     recvbuf = malloc(sizeof(int) * split_size);
37 | 
38 |     /* odd comm: every rank exchanges one int with every rank of split_comm, itself included */
39 |     if (color == 0) {
40 |         int i, curr, num_ops = split_size * 2 + 1;    /* split_size receives + split_size sends + 1 extra request used only by split rank 0 */
41 |         MPI_Request *reqs = malloc(sizeof(MPI_Request) * num_ops);
42 | 
43 |         for (i = 0, curr = 0; i < split_size; i++) {
44 |             MPI_Irecv(&recvbuf[i], 1, MPI_INT, i, 0, split_comm, &reqs[curr++]);
45 |         }
46 | 
47 |         for (i = 0; i < split_size; i++) {
48 |             sendbuf[i] = rand();
49 |             sleep(1);    /* one-second pause before each send */
50 |             MPI_Isend(&sendbuf[i], 1, MPI_INT, i, 0, split_comm, &reqs[curr++]);
51 |         }
52 | 
53 |         if (split_rank == 0) {
54 |             MPI_Isend(NULL, 0, MPI_INT, 0, 0, MPI_COMM_WORLD, &reqs[curr++]);    /* empty message to world rank 0, sent on MPI_COMM_WORLD rather than split_comm */
55 |             MPI_Waitall(num_ops, reqs, MPI_STATUSES_IGNORE);
56 |         } else {
57 |             MPI_Waitall(num_ops - 1, reqs, MPI_STATUSES_IGNORE);    /* the last slot of reqs was never filled on these ranks */
58 |         }
59 |         free(reqs);
60 |     }
61 | 
62 |     /* even comm: same exchange as the odd comm, but with a two-second pause per send */
63 |     if (color == 1) {
64 |         int i, curr, num_ops = split_size * 2 + 1;    /* split_size receives + split_size sends + 1 extra request used only by split rank 0 */
65 |         MPI_Request *reqs = malloc(sizeof(MPI_Request) * num_ops);
66 | 
67 |         for (i = 0, curr = 0; i < split_size; i++) {
68 |             MPI_Irecv(&recvbuf[i], 1, MPI_INT, i, 0, split_comm, &reqs[curr++]);
69 |         }
70 | 
71 |         for (i = 0; i < split_size; i++) {
72 |             sendbuf[i] = rand();
73 |             sleep(2);    /* two-second pause before each send */
74 |             MPI_Isend(&sendbuf[i], 1, MPI_INT, i, 0, split_comm, &reqs[curr++]);
75 |         }
76 | 
77 |         if (split_rank == 0) {
78 |             MPI_Irecv(NULL, 0, MPI_INT, 1, 0, MPI_COMM_WORLD, &reqs[curr++]);    /* empty message from world rank 1, so this path needs at least two processes */
79 |             MPI_Waitall(num_ops, reqs, MPI_STATUSES_IGNORE);
80 |         } else {
81 |             MPI_Waitall(num_ops - 1, reqs, MPI_STATUSES_IGNORE);    /* the last slot of reqs was never filled on these ranks */
82 |         }
83 |         free(reqs);
84 |     }
85 | 
86 |     free(sendbuf);
87 |     free(recvbuf);
88 | 
89 |     MPI_Finalize();    /* split_comm is not freed before finalization */
90 |     return 0;
91 | }
92 | 


--------------------------------------------------------------------------------
/Valgrind/memcheck/overlap.c:
--------------------------------------------------------------------------------
  1 | #include <string.h>
  2 | #include <stdio.h>
  3 | 
  4 | char b[50];
  5 | 
  6 | void reset_b(void)  // fill the global b with 49 '_' characters followed by a terminating NUL
  7 | {
  8 |    int i;
  9 | 
 10 |    for (i = 0; i < 50; i++)
 11 |       b[i] = '_';
 12 |    b[49] = '\0';  // replace the last '_' so b holds a valid 49-character string
 13 | }
 14 | 
 15 | void reset_b2(void)  // reset b as reset_b() does, then make it the string "ABCDEFG"
 16 | {
 17 |    reset_b();
 18 |    strcpy(b, "ABCDEFG");  // writes 7 letters plus NUL into b[0..7]; b[8..48] keep their '_' fill
 19 | }
 20 | 
 21 | int main(void)
 22 | {
 23 |    char x[100];
 24 |    char a[] = "abcdefghijklmnopqrstuvwxyz";   // 26 letters + NUL: a is 27 bytes
 25 |    int  i;
 26 | 
 27 |    /* testing memcpy/strcpy overlap */
 28 | 
 29 |    for (i = 0; i < 50; i++) {
 30 |       x[i] = i+1;    // don't put any zeroes in there
 31 |    }
 32 |    for (i = 50; i < 100; i++) {
 33 |       // because of the errors, the strcpy's will overrun, so put some
 34 |       // zeroes in the second half to stop them eventually
 35 |       x[i] = 0;  
 36 |                
 37 |    }
 38 | 
 39 |    memcpy(x+20, x, 20);    // ok
 40 |    memcpy(x+20, x, 21);    // overlap
 41 |    memcpy(x, x+20, 20);    // ok
 42 |    memcpy(x, x+20, 21);    // overlap
 43 | 
 44 |    strncpy(x+20, x, 20);    // ok
 45 |    strncpy(x+20, x, 21);    // overlap
 46 |    strncpy(x, x+20, 20);    // ok
 47 |    strncpy(x, x+20, 21);    // overlap
 48 |    
 49 |    x[39] = '\0';
 50 |    strcpy(x, x+20);    // ok
 51 | 
 52 |    x[39] = 39;
 53 |    x[40] = '\0';
 54 |    strcpy(x, x+20);    // overlap
 55 | 
 56 |    x[19] = '\0';
 57 |    strcpy(x+20, x);    // ok
 58 | 
 59 | /*
 60 |    x[19] = 19;
 61 |    x[20] = '\0';
 62 |    strcpy(x+20, x);    // overlap, but runs forever (or until it seg faults)
 63 | */
 64 | 
 65 |    /* testing strcpy, strncpy() */
 66 | 
 67 |    reset_b();
 68 |    printf("`%s'\n", b);
 69 | 
 70 |    strcpy(b, a);
 71 |    printf("`%s'\n", b);
 72 |    
 73 |    reset_b();
 74 |    strncpy(b, a, 25);   // 25 letters copied, no NUL written: the '_' fill from reset_b() follows
 75 |    printf("`%s'\n", b);
 76 | 
 77 |    reset_b();
 78 |    strncpy(b, a, 26);   // all 26 letters copied, still no NUL written
 79 |    printf("`%s'\n", b);
 80 | 
 81 |    reset_b();
 82 |    strncpy(b, a, 27);   // the 27th byte is a's NUL, so b ends right after 'z'
 83 |    printf("`%s'\n", b);
 84 | 
 85 |    printf("\n");
 86 | 
 87 |    /* testing strcat(), strncat() */
 88 | 
 89 |    reset_b2();
 90 |    printf("`%s'\n", b);
 91 |    
 92 |    reset_b2();
 93 |    strcat(b, a);
 94 |    printf("`%s'\n", b);
 95 |    
 96 |    reset_b2();
 97 |    strncat(b, a, 25);
 98 |    printf("`%s'\n", b);
 99 |    
100 |    reset_b2();
101 |    strncat(b, a, 26);
102 |    printf("`%s'\n", b);
103 |    
104 |    reset_b2();
105 |    strncat(b, a, 27);
106 |    printf("`%s'\n", b);
107 | 
108 |    /* Nb: can't actually get strcat warning -- if any overlap occurs, it will
109 |       always run forever, I think... */
110 | 
111 |    for ( i = 0; i < 2; i++) 
112 |       strncat(a+20, a, 21);    // run twice to check 2nd error isn't shown; note the append starts at a[26], so it also writes past the 27-byte array
113 |    strncat(a, a+20, 21);
114 | 
115 |    /* This is ok, but once gave a warning when strncpy() was wrong,
116 |       and used 'n' for the length, even when the src was shorter than 'n' */
117 |    {
118 |       char dest[64];
119 |       char src [16];
120 |       strcpy( src, "short" );
121 |       strncpy( dest, src, 20 );   // n (20) exceeds sizeof(src) (16), but the string in src is only 5 characters long
122 |    }
123 | 
124 |    return 0;
125 | }
126 | 


--------------------------------------------------------------------------------
/Sanitzers/AddressSanitizer/README.md:
--------------------------------------------------------------------------------
 1 | # AddressSanitizer (ASAN)
 2 | 
 3 | Example codes:
 4 | 
 5 | -   `illegalmemoryaccess.cpp`
 6 | -   `use-after-free.c`: from [https://github.com/google/sanitizers/wiki/AddressSanitizer](https://github.com/google/sanitizers/wiki/AddressSanitizer)
 7 | -   `example_UseAfterFree.cc`: from [https://clang.llvm.org/docs/AddressSanitizer.html](https://clang.llvm.org/docs/AddressSanitizer.html)
 8 | 
 9 | ## `illegalmemoryaccess.cpp`
10 | 
11 | This code attempts to write outside of the allocated block ("heap
12 | overflow"). In this example, the GNU compiler is used, but any compiler
13 | that supports ASAN can be used. Build with the `-fsanitize=address`
14 | flag:
15 | 
16 | ```
17 | $ g++ -O0 -g -fsanitize=address -o illegalmemoryaccess illegalmemoryaccess.cpp
18 | ```
19 | 
20 | Run:
21 | 
22 | ```
23 | $ ./illegalmemoryaccess
24 | =================================================================
25 | ==2267569==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x604000000038 at pc 0x0000004009df bp 0x7ffe9e373680 sp 0x7ffe9e373678
26 | WRITE of size 4 at 0x604000000038 thread T0
27 |     #0 0x4009de in main /pscratch/sd/e/elvis/addresssanitizer/illegalmemoryaccess.cpp:7
28 |     #1 0x7fbf17c3c24c in __libc_start_main (/lib64/libc.so.6+0x3524c)
29 |     #2 0x4008b9 in _start ../sysdeps/x86_64/start.S:120
30 | 
31 | 0x604000000038 is located 0 bytes to the right of 40-byte region [0x604000000010,0x604000000038)
32 | allocated by thread T0 here:
33 |     #0 0x7fbf188bba88 in operator new[](unsigned long) (/usr/lib64/libasan.so.8+0xbba88)
34 |     #1 0x40097e in main /pscratch/sd/e/elvis/addresssanitizer/illegalmemoryaccess.cpp:4
35 |     #2 0x7fbf17c3c24c in __libc_start_main (/lib64/libc.so.6+0x3524c)
36 | 
37 | SUMMARY: AddressSanitizer: heap-buffer-overflow /pscratch/sd/e/elvis/addresssanitizer/illegalmemoryaccess.cpp:7 in main
38 | Shadow bytes around the buggy address:
39 |   0x0c087fff7fb0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
40 |   0x0c087fff7fc0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
41 |   0x0c087fff7fd0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
42 |   0x0c087fff7fe0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
43 |   0x0c087fff7ff0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
44 | =>0x0c087fff8000: fa fa 00 00 00 00 00[fa]fa fa fa fa fa fa fa fa
45 |   0x0c087fff8010: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
46 |   0x0c087fff8020: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
47 |   0x0c087fff8030: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
48 |   0x0c087fff8040: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
49 |   0x0c087fff8050: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
50 | Shadow byte legend (one shadow byte represents 8 application bytes):
51 |   Addressable:           00
52 |   Partially addressable: 01 02 03 04 05 06 07
53 |   Heap left redzone:       fa
54 |   Freed heap region:       fd
55 |   ...
56 |   Right alloca redzone:    cb
57 | ==2267569==ABORTING
58 | ```
59 | 
60 | -   Heap-buffer-overflow for attempting to write 4 bytes outside
61 |     of the allocated memory block at line 7 of `illegalmemoryaccess.cpp`
62 | -   Memory block in question: 40 byte region
63 |     `[0x604000000010,0x604000000038)`, allocated at line 4
64 | -   Shadow bytes
65 |     -   Mapped to `[0xc087fff8002,0xc087fff8007)` via `Shadow = (Mem >> 3) + 0x7fff8000`
66 |     -   `00 00 00 00 00`: 5*8 bytes = 40 bytes
67 |     -   `[fa]`: attempted to write to the heap left redzone (`fa`)
68 | 
69 | 


--------------------------------------------------------------------------------
/Sanitzers/README.md:
--------------------------------------------------------------------------------
 1 | # Sanitizers
 2 | 
 3 | See NERSC docs page, [Sanitizers and Sanitizers4hpc](https://docs.nersc.gov/tools/debug/sanitizers/).
 4 | 
 5 | ## Introduction
 6 | 
 7 | LLVM Sanitizers are a group of debugging tools for detecting various
 8 | kinds of bugs in C and C++ codes. There are multiple tools, including
 9 | AddressSanitizer, LeakSanitizer, ThreadSanitizer, MemorySanitizer,
10 | each with a specific debugging capability.
11 | 
12 | A sanitizer consists of a compiler instrumentation module and a
13 | runtime library. To use a sanitizer, you first build an executable
14 | instrumented for the sanitizer, by specifying a compile flag. When
15 | the instrumented executable is run, the runtime intercepts relevant
16 | operations and inspects them. When it detects a problem, it generates
17 | a warning message.
18 | 
19 | Because of the instrumentation and the way the debugging work
20 | is carried out, memory usage can become several times bigger and the
21 | instrumented code can run several times slower. Therefore, it is
22 | important to rebuild your code without instrumentation after debugging
23 | is complete.
24 | 
25 | ### Supported Compilers
26 | 
27 | These tools can be used with more than just LLVM compilers: they
28 | are compatible with all compilers provided on Perlmutter, except
29 | the Nvidia compiler.
30 | 
31 | You don't need to change the way you compile your MPI code in order
32 | to use these tools (i.e., you can still use the Cray compiler
33 | wrappers `cc`/`CC`/`ftn` as normal).  For a non-MPI code, the
34 | following C/C++ base compilers can be used, too.
35 | 
36 | | GNU | Cray | Intel | AOCC | LLVM |
37 | |:---:|:----:|:-----:|:----:|:----:|
38 | | `gcc`/`g++` | `craycc`/`craycxx` | `icx`/`icpx` | `clang`/`clang++` | `clang`/`clang++` |
39 | 
40 | Note that Intel's `icc` and `icpc` do not work for the sanitizer
41 | tools as they are not Clang-based.
42 | 
43 | ### Sanitizer Flags
44 | 
45 | These compilers accept many LLVM sanitizer compile flags. Use the
46 | ones for your needs. For example, you don't have to instrument the
47 | entire code. Instead, you can exclude certain functions or source
48 | files from instrumentation with the `-fsanitize-blacklist=` or
49 | `-fsanitize-ignorelist=` option.
50 | 
51 | Runtime behavior of a tool can be controlled by setting the sanitizer
52 | environment variable to certain runtime flags. The variable is
53 | `ASAN_OPTIONS` for AddressSanitizer, `LSAN_OPTIONS` for LeakSanitizer,
54 | `TSAN_OPTIONS` for ThreadSanitizer, `MSAN_OPTIONS` for MemorySanitizer,
55 | etc.
56 | 
57 | You can find compile and runtime flags at the following web pages:
58 | 
59 | - [AddressSanitizer
60 |   Flags](https://github.com/google/sanitizers/wiki/AddressSanitizerFlags)
61 | - [ThreadSanitizer
62 |   Flags](https://github.com/google/sanitizers/wiki/ThreadSanitizerFlags)
63 | - [Sanitizer Common
64 |   Flags](https://github.com/google/sanitizers/wiki/SanitizerCommonFlags)
65 | 
66 | ## Sanitizers4hpc
67 | 
68 | HPE's `Sanitizers4hpc` is an aggregation tool to collect and analyze
69 | LLVM Sanitizer output from a distributed-memory parallel (e.g.,
70 | MPI) code at scale.  It makes the sanitizer output easier to understand
71 | by presenting it grouped by the MPI tasks that share the same pattern.
72 | 
73 | Currently it supports
74 | 
75 | - AddressSanitizer
76 | - LeakSanitizer
77 | - ThreadSanitizer
78 | 
79 | with the Cray and the GNU compilers. It also supports Nvidia Compute
80 | Sanitizer's Memcheck tool for CUDA codes (an example below).
81 | 
82 | To run an app with the tool, load the `sanitizers4hpc` module and
83 | then launch as follows:
84 | 
85 | ```
86 | sanitizers4hpc <sanitizers4hpc options> -- ./a.out <application arguments>
87 | ```
88 | 
89 | 


--------------------------------------------------------------------------------
/Valgrind/memcheck/leak-cases.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include "leak.h"
  4 | /* #include "../memcheck.h" */
  5 | #include "memcheck.h"
  6 | 
  7 | // Pointer chain          AAA Category/output BBB Category/output
  8 | // -------------          ------------------- ------------
  9 | // p1 ---> AAA            DR / R
 10 | // p2 ---> AAA ---> BBB   DR / R              IR / R
 11 | // p3      AAA            DL / L
 12 | // p4      AAA ---> BBB   DL / I              IL / L
 13 | // p5 -?-> AAA            (y)DR, (n)DL / P
 14 | // p6 ---> AAA -?-> BBB   DR / R              (y)IR, (n)DL / P
 15 | // p7 -?-> AAA ---> BBB   (y)DR, (n)DL / P    (y)IR, (n)IL / P
 16 | // p8 -?-> AAA -?-> BBB   (y)DR, (n)DL / P    (y,y)IR, (n,y)IL, (_,n)DL / P
 17 | // p9      AAA -?-> BBB   DL / L              (y)IL, (n)DL / I 
 18 | //
 19 | // Pointer chain legend:
 20 | // - pN: a root set pointer
 21 | // - AAA, BBB: heap blocks
 22 | // - --->: a start-pointer
 23 | // - -?->: an interior-pointer
 24 | //
 25 | // Category legend:
 26 | // - DR: Directly reachable
 27 | // - IR: Indirectly reachable
 28 | // - DL: Directly lost
 29 | // - IL: Indirectly lost
 30 | // - (y)XY: it's XY if the interior-pointer is a real pointer
 31 | // - (n)XY: it's XY if the interior-pointer is not a real pointer
 32 | // - (_)XY: it's XY in either case
 33 | //
 34 | // How we handle the 9 cases:
 35 | // - "directly lost":    case 3
 36 | // - "indirectly lost":  cases 4, 9
 37 | // - "possibly lost":    cases 5..8
 38 | // - "still reachable":  cases 1, 2
 39 | 
 40 | 
 41 | typedef
 42 |    struct _Node {
 43 |       struct _Node* next;
 44 |       // Padding ensures the structure is the same size on 32-bit and 64-bit
 45 |       // machines.
 46 |       char padding[8 - sizeof(struct _Node*)];   // pointer + padding = 8 bytes: 4 bytes of padding with 4-byte pointers, 0 with 8-byte pointers
 47 |    } Node;
 48 | 
 49 | Node* mk(Node* next)   // allocate a new heap block whose first Node links to `next`; returns the block
 50 | {
 51 |    // We allocate two nodes, so we can do p+1 and still point within the
 52 |    // block.
 53 |    Node* x = malloc(2 * sizeof(Node));   // result is not checked for NULL
 54 |    x->next = next;                       // only the first Node's `next` field is initialised
 55 |    return x;
 56 | }
 57 | 
 58 | // These are definite roots.
 59 | Node* p1;
 60 | Node* p2;
 61 | Node* p3;
 62 | Node* p4;
 63 | Node* p5;
 64 | Node* p6;
 65 | Node* p7;
 66 | Node* p8;
 67 | Node* p9;
 68 | 
 69 | void f(void)   // build the nine pointer-chain cases from the table at the top of this file
 70 | {
 71 |    p1 = mk(NULL);       // Case 1: 16/1 still reachable
 72 | 
 73 |    p2 = mk(mk(NULL));   // Case 2: 16/1 still reachable
 74 |                                 // 16/1 still reachable
 75 |    (void)mk(NULL);      // Case 3: 16/1 definitely lost
 76 | 
 77 |    (void)mk(mk(NULL));  // Case 4: 16/1 indirectly lost (counted again below!)
 78 |                                 // 32(16d,16i)/1 definitely lost (double count!)
 79 |    p5 = mk(NULL);       // Case 5: 16/1 possibly lost (ok)
 80 |    p5++;                        // p5 now points one Node into its block (interior pointer)
 81 | 
 82 |    p6 = mk(mk(NULL));   // Case 6: 16/1 still reachable
 83 |    (p6->next)++;                // 16/1 possibly lost
 84 | 
 85 |    p7 = mk(mk(NULL));   // Case 7: 16/1 possibly lost
 86 |    p7++;                        // 16/1 possibly lost
 87 | 
 88 |    p8 = mk(mk(NULL));   // Case 8: 16/1 possibly lost
 89 |    (p8->next)++;                // 16/1 possibly lost
 90 |    p8++;                        // both links of this chain are now interior pointers
 91 | 
 92 |    p9 = mk(mk(NULL));   // Case 9: 16/1 indirectly lost (counted again below!)
 93 |    (p9->next)++;                // 32(16d,16i)/1 definitely lost (double count!)
 94 |    p9 = NULL;                   // drop the root pointer to the case-9 chain
 95 | }
 96 | 
 97 | int main(void)   // NOTE(review): the leak-counter macros used below are not defined in this file; presumably they come from leak.h -- confirm
 98 | {
 99 |    DECLARE_LEAK_COUNTERS;
100 | 
101 |    GET_INITIAL_LEAK_COUNTS;
102 | 
103 |    // Originally, this program did all the work in main(), but on some
104 |    // platforms (x86/Darwin and AMD64/Linux with --enable-only32bit) stray
105 |    // pointers to supposedly-lost heap blocks were being left on the stack,
106 |    // thus making them reachable.  Doing the allocations in f() and the leak
107 |    // counting in main() avoids the problem.
108 |    f();
109 | 
110 |    CLEAR_CALLER_SAVED_REGS;
111 |    GET_FINAL_LEAK_COUNTS;
112 | 
113 |    PRINT_LEAK_COUNTS(stderr);   // the counts are written to stderr, not stdout
114 | 
115 |    return 0;
116 | }
117 | 


--------------------------------------------------------------------------------
/TotalView/programs/Makefile:
--------------------------------------------------------------------------------
  1 | SRC=../src
  2 | CXX=g++
  3 | OMP_CC=gcc
  4 | OMP_F77=gfortran
  5 | OMP_OPT=-fopenmp
  6 | CC=gcc
  7 | F90=gfortran
  8 | MPICC=cc
  9 | MPICXX=CC
 10 | CFLAGS= -g
 11 | CCFLAGS= -g
 12 | F90FLAGS= -g
 13 | MPIFLAGS=-DUSEMPI -DMPICH_IGNORE_CXX_SEEK
 14 | 
 15 | 
 16 | # Default build set: only these targets are built by `all` and removed by
 17 | # `clean`; every other target in this file must be requested by name.
 18 | PROGRAMS= simple combined demoMpi_v2 filterapp TVscript_demo ReplayEngine_demo simple_threaded  
 19 | 
 20 | 
 21 | all: ${PROGRAMS} 
 22 | 
 23 | simple: ${SRC}/simple.c ${SRC}/array.c ${SRC}/array.h
 24 | 	$(CC) $(CFLAGS) $(LDFLAGS) $(SRC)/simple.c $(SRC)/array.c -o $@ -lm
 25 | 
 26 | simple_threaded: ${SRC}/simple_threaded.c 
 27 | 	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/simple_threaded.c -o $@ -lpthread
 28 | 
 29 | combined: ${SRC}/combined.cxx
 30 | 	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/combined.cxx -lpthread -o $@
 31 | 
 32 | demoMpi: $(SRC)/demoMpi.C
 33 | 	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) $(SRC)/demoMpi.C -o $@
 34 | 
 35 | filterapp-mpi: $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx $(SRC)/myClassB.hxx $(SRC)/myClassA.hxx
 36 | 	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) -o $@ $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx
 37 | 
 38 | memory-mpi: $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx $(SRC)/myClassB.hxx $(SRC)/myClassA.hxx
 39 | 	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) -o $@ $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx
 40 | 
 41 | memory-comp: $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx $(SRC)/myClassB.hxx $(SRC)/myClassA.hxx
 42 | 	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) -o $@ $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx
 43 | 
 44 | memory-redzone: $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx $(SRC)/myClassB.hxx $(SRC)/myClassA.hxx
 45 | 	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) -o $@ $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx
 46 | 
 47 | demoMpi_v2: $(SRC)/demoMpi_v2.C
 48 | 	$(MPICXX) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) $(SRC)/demoMpi_v2.C -o $@
 49 | 
 50 | MPI_Replay_Engine_demo: $(SRC)/MPI_Replay_Engine_demo.C $(SRC)/merge.h
 51 | 	$(MPICXX) $(CCFLAGS) -I$(SRC) $(MPIFLAGS) $(LDFLAGS) $(SRC)/MPI_Replay_Engine_demo.C -o $@
 52 | 
 53 | filterapp: $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx $(SRC)/myClassB.hxx $(SRC)/myClassA.hxx
 54 | 	$(CXX) $(CCFLAGS) $(LDFLAGS) -o $@ $(SRC)/main.cxx $(SRC)/myClassA.cxx $(SRC)/myClassB.cxx
 55 | 
 56 | # Fortran demo, built with $(F90). It is not part of $(PROGRAMS), so it has
 57 | # to be requested explicitly (`make f90_demo`).
 58 | f90_demo: $(SRC)/f90_demo.f
 59 | 	$(F90) $(F90FLAGS)  $(LDFLAGS) -o $@ $(SRC)/f90_demo.f
 60 | # Serial springs demo: depends on the springs.c it compiles. Link flags come from ${LDFLAGS} as elsewhere in this file; ${LD_FLAGS} is kept for callers using the old spelling.
 61 | springs : $(SRC)/springs.c
 62 | 	${CC} ${CFLAGS} ${LDFLAGS} ${LD_FLAGS} $(SRC)/springs.c -o springs -lm
 63 | # OpenMP demos (${OMP_CC}/${OMP_F77} with ${OMP_OPT}); same ${LDFLAGS}/${LD_FLAGS} handling.
 64 | tx_omp_c_llnl3 : $(SRC)/tx_omp_c_llnl3.c
 65 | 	${OMP_CC} ${CFLAGS} ${OMP_OPT} ${LDFLAGS} ${LD_FLAGS} $(SRC)/tx_omp_c_llnl3.c -o tx_omp_c_llnl3 -lm
 66 | 
 67 | omp-springs : $(SRC)/omp-springs.c
 68 | 	${OMP_CC} ${CFLAGS} ${OMP_OPT} ${LDFLAGS} ${LD_FLAGS} $(SRC)/omp-springs.c -o omp-springs -lm
 69 | 
 70 | omp-springs-fort : $(SRC)/omp-springs.f
 71 | 	${OMP_F77} ${F90FLAGS} ${OMP_OPT} ${LDFLAGS} ${LD_FLAGS} $(SRC)/omp-springs.f -o omp-springs-fort -lm
 72 | 
 73 | txdining: $(SRC)/txdining.cxx $(SRC)/txdining.hxx
 74 | 	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/txdining.cxx -o $@ -lpthread -lrt
 75 | 
 76 | ReplayEngine_demo: $(SRC)/ReplayEngine_demo.cxx
 77 | 	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/ReplayEngine_demo.cxx -o $@
 78 | 
 79 | RedZone_demo: $(SRC)/RedZone_demo.cxx
 80 | 	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/RedZone_demo.cxx -o $@
 81 | 
 82 | TVscript_demo: $(SRC)/TVscript_demo.c
 83 | 	$(MPICC) $(CCFLAGS) $(MPIFLAGS) $(LDFLAGS) $(SRC)/TVscript_demo.c -o $@ -lm 
 84 | 
 85 | sudoku: ${SRC}/sudoku.c
 86 | 	$(CC) $(CFLAGS) $(LDFLAGS) $(SRC)/sudoku.c -o $@ -lm
 87 | 
 88 | mem_example: ${SRC}/mem_example.cpp
 89 | 	$(CXX) $(CFLAGS) $(LDFLAGS) $(SRC)/mem_example.cpp -o $@ -lm
 90 | 
 91 | 
 92 | cppview_demo:	${SRC}/cppview_demo.cxx ${SRC}/tv_data_display.c  ${SRC}/tv_data_display.h
 93 | 	$(CXX) $(CCFLAGS) $(LDFLAGS) ${SRC}/cppview_demo.cxx ${SRC}/tv_data_display.c -I${SRC} -o $@
 94 | # Libraries go after the source file so the linker can resolve its pthread symbols (same order as simple_threaded and txdining).
 95 | threads: ${SRC}/threads.cxx
 96 | 	$(CXX) $(CCFLAGS) $(LDFLAGS) $(SRC)/threads.cxx -o $@ -lpthread
 97 | 
 98 | # all/clean/cuda-clean never create a file of that name, so declare them phony.
 99 | .PHONY: all clean cuda-clean
100 | clean:
101 | 	rm -f $(PROGRAMS) *.o
102 | # NOTE: CUDA_PROGRAMS is not defined in this file; unless the caller supplies it, cuda-clean only removes *.o.
103 | cuda-clean:
104 | 	rm -f $(CUDA_PROGRAMS) *.o
105 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/gpu-nvidia-mmult/README.md:
--------------------------------------------------------------------------------
 1 | # matrixMul - Matrix Multiplication (CUDA Runtime API Version)
 2 | 
 3 | ## Description
 4 | 
 5 | This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication.  To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance matrix multiplication.
 6 | 
 7 | ## Key Concepts
 8 | 
 9 | CUDA Runtime API, Linear Algebra
10 | 
11 | ## Supported SM Architectures
12 | 
13 | [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.3 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.6 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 8.9 ](https://developer.nvidia.com/cuda-gpus)  [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
14 | 
15 | ## Supported OSes
16 | 
17 | Linux, Windows
18 | 
19 | ## Supported CPU Architecture
20 | 
21 | x86_64, ppc64le, armv7l, aarch64
22 | 
23 | ## CUDA APIs involved
24 | 
25 | ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
26 | cudaStreamCreateWithFlags, cudaProfilerStop, cudaMalloc, cudaFree, cudaMallocHost, cudaProfilerStart, cudaEventSynchronize, cudaEventRecord, cudaFreeHost, cudaStreamSynchronize, cudaEventDestroy, cudaEventElapsedTime, cudaMemcpyAsync, cudaEventCreate
27 | 
28 | ## Prerequisites
29 | 
30 | Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
31 | 
32 | ## Build and Run
33 | 
34 | ### Windows
35 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
36 | ```
37 | *_vs<version>.sln - for Visual Studio <version>
38 | ```
39 | Each individual sample has its own set of solution files in its directory.
40 | 
41 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
42 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
43 | 
44 | ### Linux
45 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
46 | ```
47 | $ cd <sample_dir>
48 | $ make
49 | ```
50 | The samples makefiles can take advantage of certain options:
51 | *  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64.
52 |     By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
53 | `$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/> `$ make TARGET_ARCH=aarch64` <br/>
54 |     See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
55 | *   **dbg=1** - build with debug symbols
56 |     ```
57 |     $ make dbg=1
58 |     ```
59 | *   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
60 |     ```
61 |     $ make SMS="50 60"
62 |     ```
63 | 
64 | *  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
65 | ```
66 |     $ make HOST_COMPILER=g++
67 | ```
68 | 
69 | ## References (for more details)
70 | 
71 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/gpu-nvidia-mmult/Makefile:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  3 | #
  4 | # Redistribution and use in source and binary forms, with or without
  5 | # modification, are permitted provided that the following conditions
  6 | # are met:
  7 | #  * Redistributions of source code must retain the above copyright
  8 | #    notice, this list of conditions and the following disclaimer.
  9 | #  * Redistributions in binary form must reproduce the above copyright
 10 | #    notice, this list of conditions and the following disclaimer in the
 11 | #    documentation and/or other materials provided with the distribution.
 12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
 13 | #    contributors may be used to endorse or promote products derived
 14 | #    from this software without specific prior written permission.
 15 | #
 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | #
 28 | ################################################################################
 29 | #
 30 | # Makefile project (only supported on Mac OS X and Linux Platforms)
 31 | #
 32 | ################################################################################
 33 | 
# Location of the CUDA Toolkit
# NOTE(review): CUDA_PATH is not referenced anywhere else in this Makefile;
# nvcc is found through `which nvcc` (see NVCC below).
CUDA_PATH ?= /opt/nvidia/hpc_sdk/Linux_x86_64/23.9/cuda

# architecture
# TARGET_ARCH defaults to the host architecture reported by `uname -m`;
# TARGET_SIZE is fixed at 64 and becomes the -m64 flag below.
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
TARGET_SIZE := 64

# operating system
# Lower-cased `uname -s` output; used only to build the bin/ output path.
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)

# The backticks are stored literally in NVCC; the shell that runs each recipe
# line evaluates `which nvcc`, so nvcc must be on PATH at build time.
HOST_COMPILER ?= g++
NVCC          := `which nvcc` -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

ifdef TARGET_OVERRIDE # cuda toolkit targets override
    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif

# Debug information is always enabled (-g for host code, -G for device code);
# BUILD_TYPE only names the output directory under bin/.
NVCCFLAGS += -g -G
BUILD_TYPE := debug

# Compile flags: nvcc flags, then user extras, then host-compiler flags
# forwarded through -Xcompiler.
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

# Always 1 here, so the EXEC override at the end of this section is inactive
# unless SAMPLE_ENABLED is overridden on the make command line.
SAMPLE_ENABLED := 1

# Link flags. ALL_LDFLAGS is simply expanded, so it captures ALL_CCFLAGS as it
# is at this point: the "--threads 0 --std=c++11" appended further down is NOT
# part of the link flags.
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -Icommon
LIBRARIES :=

################################################################################

# Gencode arguments
# Default SM architectures; `make SMS="A B ..."` on the command line replaces
# this list.
SMS = 50 52 60 61 70 75 80

# GENCODE_FLAGS is only generated when the caller has not supplied it.
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
# NOTE: $(sort) orders words lexically, so "highest" is only correct while all
# entries have the same number of digits.
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

# Compile-only additions (see the note on ALL_LDFLAGS above).
ALL_CCFLAGS += --threads 0 --std=c++11

# When the sample is disabled, every recipe line is prefixed with an echo so
# the commands are printed instead of executed.
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
 99 | 
100 | ################################################################################
101 | 
102 | # Target rules
# None of these targets produces a file of the same name. Declaring them phony
# keeps them working even if a file or directory called e.g. "build" or
# "clean" appears in this directory.
.PHONY: all build run testrun clean clobber

all: build

build: matrixMul

# Compile the single CUDA source into an object file.
matrixMul.o:matrixMul.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

# Link the executable, then copy it into the shared
# bin/<arch>/<os>/<build type> directory three levels up.
matrixMul: matrixMul.o
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)

run: build
	$(EXEC) ./matrixMul

# testrun only builds; it has no recipe of its own.
testrun: build

# Remove the local build products and the copy placed under bin/.
clean:
	rm -f matrixMul matrixMul.o
	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/matrixMul

clobber: clean
125 | 


--------------------------------------------------------------------------------
/TotalView/src/TVscript_demo.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | This is a derivative work of the cpi example program from MPICH2,
  3 | which includes the following copyright notice:
  4 | 
  5 | 				  COPYRIGHT
  6 | 
  7 | The following is a notice of limited availability of the code, and disclaimer
  8 | which must be included in the prologue of the code and in all source listings
  9 | of the code.
 10 | 
 11 | Copyright Notice
 12 |  + 2002 University of Chicago
 13 | 
 14 | Permission is hereby granted to use, reproduce, prepare derivative works, and
 15 | to redistribute to others.  This software was authored by:
 16 | 
 17 | Argonne National Laboratory Group
 18 | W. Gropp: (630) 252-4318; FAX: (630) 252-5986; e-mail: gropp@mcs.anl.gov
 19 | E. Lusk:  (630) 252-7852; FAX: (630) 252-5986; e-mail: lusk@mcs.anl.gov
 20 | Mathematics and Computer Science Division
 21 | Argonne National Laboratory, Argonne IL 60439
 22 | 
 23 | 
 24 | 			      GOVERNMENT LICENSE
 25 | 
 26 | Portions of this material resulted from work developed under a U.S.
 27 | Government Contract and are subject to the following license: the Government
 28 | is granted for itself and others acting on its behalf a paid-up, nonexclusive,
 29 | irrevocable worldwide license in this computer software to reproduce, prepare
 30 | derivative works, and perform publicly and display publicly.
 31 | 
 32 | 				  DISCLAIMER
 33 | 
 34 | This computer code material was prepared, in part, as an account of work
 35 | sponsored by an agency of the United States Government.  Neither the United
 36 | States, nor the University of Chicago, nor any of their employees, makes any
 37 | warranty express or implied, or assumes any legal liability or responsibility
 38 | for the accuracy, completeness, or usefulness of any information, apparatus,
 39 | product, or process disclosed, or represents that its use would not infringe
 40 | privately owned rights.
 41 | 
 42 | 
 43 | 
 44 | */
 45 | #include "mpi.h"
 46 | #include <stdio.h>
 47 | #include <stdlib.h>
 48 | #include <math.h>
 49 | #include <unistd.h>
 50 | #ifndef INTERVAL_START
 51 | #define INTERVAL_START 10
 52 | #endif
 53 | 
 54 | #ifndef INTERVAL_END
 55 | #define INTERVAL_END 1000000
 56 | #endif
 57 | 
 58 | typedef struct error_detail {
 59 |   int intervals;
 60 |   double almost_pi;
 61 |   double delta;
 62 | } error_detail;
 63 | 
 64 | double f( double );
 65 | 
 66 | double f( double a )
 67 | {
 68 |     return (4.0 / (1.0 + a*a));
 69 | }
 70 | 
 71 | double calc_error( int, double, double );
 72 | 
 73 | double calc_error( int n, double almost_pi, double last_error )
 74 | {
 75 |     float ref;
 76 |     ref = 3.141592653589793238462643;
 77 |     double pidiff;
 78 |     int numprocs;
 79 | 
 80 |     pidiff = fabs(almost_pi - ref);
 81 |     if ( pidiff > last_error ) {
 82 |         MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
 83 |         if ( numprocs == 3 )
 84 | 	    abort();
 85 |         else
 86 |             printf( "Error increased for intervals = %d!\n", n );
 87 |         }
 88 |     return pidiff;
 89 | }
 90 | 
/*
 * Approximates pi in parallel by summing f() at the midpoint of n equal
 * intervals on [0, 1], for n = INTERVAL_START, 10*INTERVAL_START, ... up to
 * INTERVAL_END.
 *
 * Every rank handles the intervals i = myid+1, myid+1+numprocs, ...; the
 * partial sums are combined on rank 0, which prints the approximation, the
 * error returned by calc_error() and the elapsed wall-clock time.
 */
int main( int argc, char *argv[] )
{
    /* 'done' is assigned below but never read in this function. */
    int done = 0, n, myid, numprocs, i;
    /* last_error starts at 4.0 and then tracks the previous step's error. */
    double mypi, pi, h, sum, x, pidiff, last_error = 4.;
    double startwtime=0.0, endwtime;
    int  namelen;
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    /* Filled on rank 0 each step but never read here.
       NOTE(review): presumably meant to be inspected from the debugger. */
    error_detail err_detail;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD,&myid);
    MPI_Get_processor_name(processor_name,&namelen);
    fprintf(stderr,"Process %d on %s\n",
	    myid, processor_name);

    n = INTERVAL_START;
    while (n <= INTERVAL_END)
    {
        if (myid == 0)
        {
	    startwtime = MPI_Wtime();
        }
        /* Every rank takes the interval count from rank 0. */
        MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
        if (n == 0)
            done = 1;
        else
        {
            h   = 1.0 / (double) n;
            sum = 0.0;
            /* Intervals are dealt out to the ranks round-robin. */
            for (i = myid + 1; i <= n; i += numprocs)
            {
                x = h * ((double)i - 0.5);   /* midpoint of interval i */
                x = f(x);
                sum += x;
            }
            mypi = h * sum;

            /* Sum of all partial results; only rank 0 receives it in 'pi'. */
            MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

            if (myid == 0)
	    {
	       /* calc_error() may abort or print when the error grew. */
	       pidiff = calc_error( n, pi, last_error );
               last_error = pidiff;
	       err_detail.intervals = n;
	       err_detail.almost_pi = pi;
	       err_detail.delta = pidiff;
               printf("the answer is approximately %.16f, Error is %.16f\n",
                       pi, pidiff);
	       endwtime = MPI_Wtime();
	       printf("wall clock time = %f\n",
		       endwtime-startwtime);
	    }
        }
        /* Ten times as many intervals for the next step. */
        n = n * 10;
    }
    MPI_Finalize();
    /* The process lingers for two seconds after MPI has been shut down. */
sleep(2);
    return 0;
}
151 | 
152 |             
153 | 


--------------------------------------------------------------------------------
/Valgrind/memcheck/memalign.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <stdio.h>
  3 | #include <assert.h>
  4 | /* #include "tests/malloc.h" */
  5 | #include "malloc.h"
  6 | #include <errno.h>
  7 | #include "../../../config.h"
  8 | 
/*
 * Exercises memalign() with a range of alignments and asserts on the
 * alignment of the returned pointers.
 *
 * Two variants are selected by MUSL_LIBC (NOTE(review): presumably provided
 * by config.h -- confirm):
 *   - without MUSL_LIBC, every call is expected to succeed; the asserts
 *     accept the requested alignment rounded up to a power of two, with a
 *     minimum of 8;
 *   - with MUSL_LIBC, alignments that are not a power of two (3, 5, 7, 9,
 *     31, 33, 4095, 4097) are expected to return NULL.
 *
 * Only the two big-alignment blocks at the start of each variant are freed;
 * every later result overwrites 'p' without being released. There is no
 * return statement.
 */
int main ( void )
{
   // Nb: assuming VG_MIN_MALLOC_SZB is 8 or more...
   int* p;
   int* piece;
   // Pointers are cast to long for the modulo checks below.
   assert(sizeof(long int) == sizeof(void*));

#if !defined(MUSL_LIBC)
   // Check behaviour of memalign/free for big alignment.
   // In particular, the below aims at checking that a
   // superblock with a big size is not marked as reclaimable
   // if the superblock is used to provide a big aligned block
   // (see bug 250101, comment #14).
   // Valgrind m_mallocfree.c will allocate a big superblock for the memalign
   // call and will split it in two. This split superblock was
   // wrongly marked as reclaimable, which was then causing
   // assert failures (as reclaimable blocks cannot be split).
   p = memalign(1024 * 1024, 4 * 1024 * 1024 + 1);
   assert(p && (0 == (long)p % (1024 * 1024)));
   // We allocate (and then free) a piece of memory smaller than
   // the hole created in the big superblock.
   // If the superblock is marked as reclaimable, the below free(s) will cause
   // an assert. Note that the test has to be run with a --free-list-vol
   // parameter smaller than the released blocks size to ensure the free is directly
   // executed (otherwise memcheck does not really release the memory and so
   // the bug is not properly tested).
   piece = malloc(1024 * 1000);
   assert (piece);
   free (piece);
   free (p);

   // Same as above but do the free in the reverse order.
   p = memalign(1024 * 1024, 4 * 1024 * 1024 + 1);
   assert(p && (0 == (long)p % (1024 * 1024)));
   piece = malloc(1024 * 100);
   assert (piece);
   free (p);
   free (piece);

   // Alignments up to 8: the result must be at least 8-byte aligned.
   p = memalign(0, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(1, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(2, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(3, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(4, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(5, 100);
   assert(p && (0 == (long)p % 8));

   p = memalign(7, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(8, 100);
   assert(p && (0 == (long)p % 8));
   // From here on a value just above a power of two is checked against the
   // next power of two (9 -> 16, 33 -> 64, 4097 -> 8192).
   p = memalign(9, 100);
   assert(p && (0 == (long)p % 16));

   p = memalign(31, 100);
   assert(p && (0 == (long)p % 32));
   p = memalign(32, 100);
   assert(p && (0 == (long)p % 32));
   p = memalign(33, 100);
   assert(p && (0 == (long)p % 64));

   p = memalign(4095, 100);
   assert(p && (0 == (long)p % 4096));
   p = memalign(4096, 100);
   assert(p && (0 == (long)p % 4096));
   p = memalign(4097, 100);
   assert(p && (0 == (long)p % 8192));

   p = memalign(4 * 1024 * 1024, 100);
   assert(p && (0 == (long)p % (4 * 1024 * 1024)));
   p = memalign(16 * 1024 * 1024, 100);
   assert(p && (0 == (long)p % (16 * 1024 * 1024)));

   // size 0
   p = memalign(256, 0);
   assert(p && (0 == (long)p % 256));
#else
   // Same two big-alignment allocate/free sequences as in the other variant.
   p = memalign(1024 * 1024, 4 * 1024 * 1024 + 1);
   assert(p && (0 == (long)p % (1024 * 1024)));
   piece = malloc(1024 * 1000); assert (piece);
   free (piece);
   free (p);
   p = memalign(1024 * 1024, 4 * 1024 * 1024 + 1);
   assert(p && (0 == (long)p % (1024 * 1024)));
   piece = malloc(1024 * 100);
   assert (piece);
   free (p);
   free (piece);

   // Alignments that are not a power of two must fail here. errno is reset
   // around those calls, but the EINVAL checks themselves are commented out.
   errno = 0;
   p = memalign(0, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(1, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(2, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(3, 100);
   assert(!p);
   //assert(errno == EINVAL);
   errno = 0;
   p = memalign(4, 100);
   assert(p && 0 == (long)p % 8);
   p = memalign(5, 100);
   assert(!p);
   //assert(errno == EINVAL);
   errno = 0;
   p = memalign(7, 100);
   assert(!p);
   //assert(errno == EINVAL);
   errno = 0;
   p = memalign(8, 100);
   assert(p && (0 == (long)p % 8));
   p = memalign(9, 100);
   assert(!p);
   //assert(errno == EINVAL);
   errno = 0;
   p = memalign(31, 100);
   assert(!p);
   //assert(errno == EINVAL);
   p = memalign(32, 100);
   assert(p && (0 == (long)p % 32));
   errno = 0;
   p = memalign(33, 100);
   assert(!p);
   //assert(errno == EINVAL);
   errno = 0;
   p = memalign(4095, 100);
   assert(!p);
   //assert(errno == EINVAL);
   p = memalign(4096, 100);
   assert(p && (0 == (long)p % 4096));
   errno = 0;
   p = memalign(4097, 100);
   assert(!p);
   //assert(errno == EINVAL);

   p = memalign(4 * 1024 * 1024, 100);
   assert(p && (0 == (long)p % (4 * 1024 * 1024)));
   p = memalign(16 * 1024 * 1024, 100);
   assert(p && (0 == (long)p % (16 * 1024 * 1024)));

   // size 0
   p = memalign(256, 0);
   assert(p && (0 == (long)p % 256));
#endif
}
160 | 


--------------------------------------------------------------------------------
/TotalView/src/demoMpi_v2.C:
--------------------------------------------------------------------------------
  1 | /* compile with mpiCC -o demoMpi demoMpi.C -g -lm */
  2 | /* run with mpirun -np 10 -tv demoMpi for Message Queue and
  3 |    run with mpirun -np 4 -tv demoMpi  for Lamination and broken links */ 
  4 | #include <mpi.h>
  5 | #include <stdio.h>
  6 | #include <string.h>
  7 | #include <stdlib.h>
  8 | #include <unistd.h>
  9 | 
 10 | 
 11 | template<class T>
 12 | T getMax(T* A,int  b);
 13 | 
 14 | template<class T>
 15 | T* get_full_domain(int a);
 16 | 
 17 | int BUFLEN=512, NSTEPS=10;
 18 | 
 19 | char *sendMessage(char *sbuf, char *recbuf, int dest, int source)
 20 | {
 21 |     MPI_Status status;
 22 |     int my_mpi_comm_world=MPI_COMM_WORLD;
 23 |     int my_mpi_char=MPI_CHAR;
 24 | 
 25 |     MPI_Recv(recbuf, BUFLEN, my_mpi_char, source, 99, my_mpi_comm_world, &status);
 26 |     MPI_Send(sbuf, strlen(sbuf)+1, my_mpi_char, dest, 99, my_mpi_comm_world);
 27 |     return recbuf;
 28 | }
 29 | 
 30 | char *f2(char *sbuf, char *recbuf, int dest, int source)
 31 | {
 32 |     return sendMessage(sbuf, recbuf, dest, source );
 33 | }
 34 |    
 35 | char *f1(char *sbuf, char *recbuf, int dest, int source)
 36 | {
 37 |     return f2(sbuf, recbuf, dest, source );
 38 | }
 39 | 
 40 | char *g2(char *sbuf, char *recbuf, int dest, int source)
 41 | {
 42 |     return sendMessage(sbuf, recbuf, dest, source );
 43 | }
 44 | 
 45 | char *g1(char *sbuf, char *recbuf, int dest, int source)
 46 | {
 47 |     return g2(sbuf, recbuf, dest, source );
 48 | }
 49 | 
 50 | 
 51 | int main(int argc, char *argv[])
 52 | {
 53 |     int		root=0,full_domain_length,sub_domain_length;
 54 |     double	global_max,local_max;
 55 |     double	*full_domain,*sub_domain;
 56 |     int myid, numprocs, next, namelen, previous;
 57 |     int default_length=1000;
 58 |     char* sendBuffer=new char[BUFLEN];
 59 |     char* recvBuffer=new char[BUFLEN];
 60 |     char processor_name[MPI_MAX_PROCESSOR_NAME];
 61 |     int my_mpi_comm_world=MPI_COMM_WORLD;
 62 |     int my_mpi_double=MPI_DOUBLE;
 63 |     int my_mpi_max=MPI_MAX;
 64 |     int my_mpi_int=MPI_INT;
 65 |     MPI_Init(&argc,&argv);
 66 |     // sleep(1);
 67 |     MPI_Comm_size(my_mpi_comm_world,&numprocs);
 68 |     MPI_Comm_rank(my_mpi_comm_world,&myid);
 69 |     MPI_Get_processor_name(processor_name,&namelen);
 70 | 
 71 |     MPI_Get_processor_name(processor_name,&namelen);
 72 |     if( myid%2 == 0) sleep( 2 );
 73 | 
 74 |     fprintf(stderr,"Process %d on %s\n",myid,processor_name);
 75 |     sprintf(sendBuffer,"hello there from %d on %s",myid,processor_name);
 76 |     next = myid+4; /* set a barrier here */
 77 |     if(next>=numprocs) 
 78 |       next-=numprocs;
 79 |     previous = myid-4;
 80 |     if(previous<0) 
 81 |       previous+=numprocs;
 82 |                
 83 |     /*  Part 1: Deadlock and code patching */    
 84 |     /* After hitting deadlock, enable eval points and restart program */ 
 85 |     if (myid%2==0){
 86 |         f1(sendBuffer, recvBuffer, next, previous);
 87 |     } else {
 88 |         g1(sendBuffer, recvBuffer, next, previous );
 89 |     }
 90 | 
 91 |     fprintf(stderr,"%d get '%s'\n",myid,recvBuffer);
 92 |     MPI_Barrier(my_mpi_comm_world);
 93 | 
 94 |    /* Part 2: Collective communication  */
 95 | /*
 96 |  * Root obtains full domain and broadcasts its length.
 97 |  */
 98 |     if (myid == root) {
 99 |       if( argc > 2) full_domain_length=atoi((char*)argv[2]);
100 |       else full_domain_length=default_length;
101 |       full_domain=get_full_domain<double>(full_domain_length);
102 |     }
103 |     MPI_Bcast(&full_domain_length, 1, my_mpi_int, root, my_mpi_comm_world);
104 | /*
105 |  * Allocate subdomain memory.
106 |  * Scatter the initial dataset among the processes.
107 |  */
108 |     sub_domain_length = full_domain_length / numprocs;
109 |     sub_domain = new double[sub_domain_length];
110 | 
111 |     MPI_Scatter(full_domain, sub_domain_length, my_mpi_double,
112 |    	    sub_domain, sub_domain_length, my_mpi_double,
113 |     	    root, my_mpi_comm_world);
114 | /*
115 |  * Loop computing and determining max values.
116 |    Stop here and observe effect of Scatter. Dive on sub_domain and show array of size
117 |    sub_domain_length. Laminate and visualize. Then hit Go. all subarrays will be sorted
118 |    in parallel and their local max will be returned.
119 |  */
120 |    local_max=getMax<double>(sub_domain, sub_domain_length);
121 |    MPI_Reduce(&local_max, &global_max, 1, my_mpi_double,
122 | 		my_mpi_max, root, my_mpi_comm_world);
123 | /*
124 |  * Gather final dataset.
125 |    Dive and Laminate local_max to observe different values for local subarray. Now
126 |    visualize again and observe sorted arrays.
127 |  */
128 |     MPI_Gather(sub_domain, sub_domain_length, my_mpi_double,
129 | 	    full_domain, sub_domain_length, my_mpi_double,
130 | 	    root, my_mpi_comm_world);
131 | 
132 |     /* Part 3 all to all comunication and "bottleneck problem" */
133 |     /* After you reached barrier hit Go and then Halt. Open Message
134 |        Queue Graph. You can clearly identify bottleneck at node 1. Enable eval and Go */
135 | 
136 |     if(myid==1){
137 |       int work=1;
138 |       while(work)
139 | 	work=1;
140 |     }
141 |     
142 |     MPI_Alltoall(sub_domain, sub_domain_length/numprocs, my_mpi_double,
143 | 		 sub_domain, sub_domain_length/numprocs, my_mpi_double,
144 | 		 my_mpi_comm_world);
145 |     
146 | 
147 | 
148 |     MPI_Finalize();
149 |     /* Before finishing you can Laminate and Visualize sub_array again to check AlltoAll execution */
150 |     return (0);
151 | }
152 | //// **************************************************************************************** ///
// Allocates a new array of 'size' elements and fills it, front to back, with
// successive rand() values converted to T. The caller owns the returned array.
template<class T>
T* get_full_domain(int size){
  T* const values=new T[size];
  T* const stop=values+size;
  for(T* slot=values;slot!=stop;++slot){
    *slot=rand();
  }
  return values;
}
// qsort() comparator for elements of type T: returns -1 when *a < *b, 0 when
// they compare equal and 1 otherwise.
template<class T>
int myCmp(const void *a, const void *b){
  T* i=(T*)a;
  T* j=(T*)b;
  return (*i<*j?-1:(*i==*j?0:1));
}

// Sorts 'array' in place in ascending order and returns its largest element.
// The in-place sort is a side effect callers rely on (main() gathers the
// sorted slices afterwards).
//
// An empty or negative 'length' leaves the array untouched and returns a
// value-initialised T instead of reading array[-1].
template<class T>
T getMax(T* array,int length){
  if(length<=0)
    return T();
  qsort(array,length,sizeof(T),&myCmp<T>);
  T lmax=array[length-1];
  return lmax;
}
174 | 


--------------------------------------------------------------------------------
/fortran_memory/README.md:
--------------------------------------------------------------------------------
  1 | # Fortran memory bug examples
  2 | 
  3 | The codes are from the 2012 NERSC training (see `/global/cfs/cdirs/training/2012/NUG2012/debugging`).
  4 | 
  5 | -   `free_twice.f90`: Free twice
  6 | -   `heap_overflow_underflow.f90`: Out of bound array references
  7 | -   `memory_leaks.f90`: memory leaks
  8 | -   `segfault.f90`: segfault
  9 | 
 10 | ## Detecting memory leaks
 11 | 
 12 | `memory_leaks.f90` leaks a total of (4n + 8n) &times; 10 = 120n bytes.
 13 | With n = 1,000,000 that is 120 &times; 1000000 bytes = 120MB, spread
 14 | over 10+20=30 memory blocks.
 15 | 
 16 | Build as follows:
 17 | 
 18 | ```
 19 | $ ftn -g -O0 -o memory_leaks memory_leaks.f90
 20 | ```
 21 | 
 22 | ### With Valgrind
 23 | 
 24 | Run:
 25 | 
 26 | ```
 27 | $ srun -n 4 valgrind --leak-check=full --log-file=memory_leaks.%q{SLURM_JOB_ID}.%q{SLURM_PROCID}.out ./memory_leaks
 28 | 
 29 | $ cat memory_leaks.32347754.0.out
 30 | ...
 31 | ==1392038== HEAP SUMMARY:
 32 | ==1392038==     in use at exit: 120,095,997 bytes in 632 blocks
 33 | ==1392038==   total heap usage: 828 allocs, 196 frees, 161,833,032 bytes allocated
 34 | ==1392038==
 35 | ==1392038== 4,000,000 bytes in 1 blocks are possibly lost in loss record 603 of 607
 36 | ==1392038==    at 0x4E056A4: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
 37 | ==1392038==    by 0x400F56: sub_bad_ (memory_leaks.f90:40)
 38 | ==1392038==    by 0x401273: MAIN__ (memory_leaks.f90:17)
 39 | ==1392038==    by 0x40134A: main (memory_leaks.f90:6)
 40 | ==1392038==
 41 | ==1392038== 4,000,000 bytes in 1 blocks are possibly lost in loss record 604 of 607
 42 | ==1392038==    at 0x4E056A4: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
 43 | ==1392038==    by 0x400C18: sub_badx2_ (memory_leaks.f90:50)
 44 | ==1392038==    by 0x40129F: MAIN__ (memory_leaks.f90:20)
 45 | ==1392038==    by 0x40134A: main (memory_leaks.f90:6)
 46 | ==1392038==
 47 | ==1392038== 36,000,000 bytes in 9 blocks are definitely lost in loss record 605 of 607
 48 | ==1392038==    at 0x4E056A4: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
 49 | ==1392038==    by 0x400F56: sub_bad_ (memory_leaks.f90:40)
 50 | ==1392038==    by 0x401273: MAIN__ (memory_leaks.f90:17)
 51 | ==1392038==    by 0x40134A: main (memory_leaks.f90:6)
 52 | ==1392038==
 53 | ==1392038== 36,000,000 bytes in 9 blocks are definitely lost in loss record 606 of 607
 54 | ==1392038==    at 0x4E056A4: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
 55 | ==1392038==    by 0x400C18: sub_badx2_ (memory_leaks.f90:50)
 56 | ==1392038==    by 0x40129F: MAIN__ (memory_leaks.f90:20)
 57 | ==1392038==    by 0x40134A: main (memory_leaks.f90:6)
 58 | ==1392038==
 59 | ==1392038== 40,000,000 bytes in 10 blocks are definitely lost in loss record 607 of 607
 60 | ==1392038==    at 0x4E056A4: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
 61 | ==1392038==    by 0x400D97: sub_badx2_ (memory_leaks.f90:53)
 62 | ==1392038==    by 0x40129F: MAIN__ (memory_leaks.f90:20)
 63 | ==1392038==    by 0x40134A: main (memory_leaks.f90:6)
 64 | ==1392038==
 65 | ==1392038== LEAK SUMMARY:
 66 | ==1392038==    definitely lost: 112,000,000 bytes in 28 blocks
 67 | ==1392038==    indirectly lost: 0 bytes in 0 blocks
 68 | ==1392038==      possibly lost: 8,000,000 bytes in 2 blocks
 69 | ==1392038==    still reachable: 95,997 bytes in 602 blocks
 70 | ==1392038==         suppressed: 0 bytes in 0 blocks
 71 | ...
 72 | ```
 73 | 
 74 | The output is a bit difficult to interpret: some blocks are
 75 | classified as definitely lost while others are classified as possibly
 76 | lost. The key point is that the code leaks a total of
 77 | 112,000,000 + 8,000,000 bytes = 120MB, as predicted.
 78 | 
 79 | The tool also reports memory leaks of 95,997 bytes, probably in the
 80 | system libraries. It's worthwhile trying to suppress such errors
 81 | with the suppression files provided by HPE in Valgrind4hpc,
 82 | `$VALGRIND4HPC_BASEDIR/share/suppressions/{known,libmpich_cray,libpmi,misc}.supp`.
 83 | You can use these suppression files with the `--suppressions=...`
 84 | flags and rerun the Valgrind run.
 85 | 
 86 | ### With Valgrind4hpc
 87 | 
 88 | Valgrind4hpc is an HPE tool that aggregates duplicate Valgrind
 89 | messages across MPI processes. It is described in the Valgrind
 90 | memcheck README's [Valgrind4hpc](../Valgrind/memcheck/README.md#Valgrind4hpc) section.
 91 | Run as follows.
 92 | 
 93 | ```
 94 | $ module load valgrind4hpc
 95 | $ valgrind4hpc -n 4 --valgrind-args="--leak-check=full" ./memory_leaks
 96 | RANKS: <0..3>
 97 | 
 98 | 4,000,000 bytes in 1 blocks are possibly lost in loss record 603 of 607
 99 |   at malloc (in vg_replace_malloc.c:393)
100 |   by sub_badx2_ (in memory_leaks.f90:50)
101 |   by MAIN__ (in memory_leaks.f90:20)
102 |   by main (in memory_leaks.f90:6)
103 | 
104 | RANKS: <0..3>
105 | 
106 | 8,000,000 bytes in 2 blocks are possibly lost in loss record 604 of 607
107 |   at malloc (in vg_replace_malloc.c:393)
108 |   by sub_bad_ (in memory_leaks.f90:40)
109 |   by MAIN__ (in memory_leaks.f90:17)
110 |   by main (in memory_leaks.f90:6)
111 | 
112 | RANKS: <0..3>
113 | 
114 | 32,000,000 bytes in 8 blocks are definitely lost
115 |   at malloc (in vg_replace_malloc.c:393)
116 |   by sub_bad_ (in memory_leaks.f90:40)
117 |   by MAIN__ (in memory_leaks.f90:17)
118 |   by main (in memory_leaks.f90:6)
119 | 
120 | RANKS: <0..3>
121 | 
122 | 36,000,000 bytes in 9 blocks are definitely lost
123 |   at malloc (in vg_replace_malloc.c:393)
124 |   by sub_badx2_ (in memory_leaks.f90:50)
125 |   by MAIN__ (in memory_leaks.f90:20)
126 |   by main (in memory_leaks.f90:6)
127 | 
128 | RANKS: <0..3>
129 | 
130 | 40,000,000 bytes in 10 blocks are definitely lost
131 |   at malloc (in vg_replace_malloc.c:393)
132 |   by sub_badx2_ (in memory_leaks.f90:53)
133 |   by MAIN__ (in memory_leaks.f90:20)
134 |   by main (in memory_leaks.f90:6)
135 | 
136 | RANKS: <0..3>
137 | 
138 | HEAP SUMMARY:
139 |   in use at exit: 120000000 bytes in 30 blocks
140 | 
141 | LEAK SUMMARY:
142 |    definitely lost: 108000000 bytes in 27 blocks
143 |    indirectly lost: 0 bytes in 0 blocks
144 |      possibly lost: 12000000 bytes in 3 blocks
145 |    still reachable: 0 bytes in 0 blocks
146 | 
147 | ERROR SUMMARY: 0 errors from 0 contexts (suppressed 601)
148 | ```
149 | 
150 | Again the report is a bit difficult to interpret: some blocks
151 | are classified as definitely lost while others are classified as
152 | possibly lost. The key point, once more, is that the code leaks a
153 | total of 120MB from 30 blocks. The additional 95,997 bytes
154 | reported by plain Valgrind in the previous section are therefore
155 | most likely due to leaks in system libraries.
156 | 
157 | ## Other example codes with Valgrind
158 | 
159 | When run under Valgrind, `free_twice.f90` and `heap_overflow_underflow.f90`
160 | fail without Valgrind reporting any useful information.
161 | 
162 | 


--------------------------------------------------------------------------------
/TotalView/programs/combined.TVD.v4breakpoints:
--------------------------------------------------------------------------------
  1 | # Magic: LR-70-3595585-9ER
  2 | # Generated file -- DO NOT EDIT
  3 | # Breakpoint list saved by Linux x86_64 TotalView 2017.2.11
  4 | 
  5 | dset TV::Private::saved_breakpoint_actual_format 4
  6 | dset TV::Private::saved_breakpoint_actual_revision 0
  7 | 
  8 | namespace eval TV::Private {
  9 | 
 10 | BP_start 1
 11 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
 12 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#66}
 13 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#traverseArray(int)#$b1}
 14 | BP_set	InnerUnitLineOffset {0}
 15 | BP_set	Flags {g 1 p g g}
 16 | BP_set	SatSet {C}
 17 | BP_set	BlocksEnabled {0x3}
 18 | BP_set	BlockCount {3}
 19 | BP_set	BlockAddress {0x004018c9}
 20 | BP_set	SourceText {	for (int i = 0; i < n; i++)}
 21 | BP_set	Instruction {movl     $0,-20(%rbp)}
 22 | BP_done 1
 23 | 
 24 | BP_start 2
 25 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
 26 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#94}
 27 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#cleanup(void)}
 28 | BP_set	InnerUnitLineOffset {1}
 29 | BP_set	Flags {g 1 p g g}
 30 | BP_set	SatSet {C}
 31 | BP_set	SourceText {	exit(-1);}
 32 | BP_set	Instruction {movl     $-1,%edi}
 33 | BP_done 2
 34 | 
 35 | BP_start 3
 36 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
 37 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#154}
 38 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#derived_class(void)#$b1}
 39 | BP_set	InnerUnitLineOffset {0}
 40 | BP_set	Flags {g 1 p g g}
 41 | BP_set	SatSet {C}
 42 | BP_set	SourceText {	Base1 b1('A'), *base1Ptr;}
 43 | BP_set	Instruction {leal     -129(%rbp),%rax}
 44 | BP_done 3
 45 | 
 46 | BP_start 4
 47 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
 48 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#169}
 49 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#derived_class(void)#$b1}
 50 | BP_set	InnerUnitLineOffset {15}
 51 | BP_set	Flags {g 1 p g g}
 52 | BP_set	SatSet {C}
 53 | BP_set	SourceText {			 <<"\\nd contains " << dd ;}
 54 | BP_set	Instruction {movl     $0x405eef,%esi}
 55 | BP_done 4
 56 | 
 57 | BP_start 5
 58 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
 59 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#224}
 60 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#stl_view(void)#$b1}
 61 | BP_set	InnerUnitLineOffset {10}
 62 | BP_set	Flags {g 1 p g g}
 63 | BP_set	SatSet {C}
 64 | BP_set	SourceText {        lb1.push_back(b1); }
 65 | BP_set	Instruction {leal     -138(%rbp),%rdx}
 66 | BP_done 5
 67 | 
 68 | BP_start 6
 69 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
 70 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#244}
 71 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#stl_view(void)#$b1}
 72 | BP_set	InnerUnitLineOffset {30}
 73 | BP_set	Flags {g 1 p g g}
 74 | BP_set	SatSet {C}
 75 | BP_set	SourceText {        s1="this is another string";}
 76 | BP_set	Instruction {leal     -128(%rbp),%rax}
 77 | BP_done 6
 78 | 
 79 | BP_start 7
 80 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
 81 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#357}
 82 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#user_templates(void)#$b1}
 83 | BP_set	InnerUnitLineOffset {5}
 84 | BP_set	Flags {g 1 p g g}
 85 | BP_set	SatSet {C}
 86 | BP_set	SourceText {	vect<int>  i = vect<int>(5, ia);}
 87 | BP_set	Instruction {leal     -80(%rbp),%rdx}
 88 | BP_done 7
 89 | 
 90 | BP_start 8
 91 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
 92 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#364}
 93 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#user_templates(void)#$b1}
 94 | BP_set	InnerUnitLineOffset {12}
 95 | BP_set	Flags {g 1 p g g}
 96 | BP_set	SatSet {C}
 97 | BP_set	SourceText {	printArray(i.v, i.len);}
 98 | BP_set	Instruction {mov      -112(%rbp),%edx}
 99 | BP_done 8
100 | 
101 | BP_start 9
102 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
103 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#514}
104 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#arrays(void)#$b1#$b1#$b1}
105 | BP_set	InnerUnitLineOffset {3}
106 | BP_set	Flags {g 1 p g g}
107 | BP_set	SatSet {C}
108 | BP_set	SourceText {	  vol[i][j] = cylinder.volume();}
109 | BP_set	Instruction {mov      %r12,%rbx}
110 | BP_done 9
111 | 
112 | BP_start 10
113 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
114 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#592}
115 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#combine_waves_worker(void*)#$b1}
116 | BP_set	InnerUnitLineOffset {4}
117 | BP_set	Flags {g 1 p g g}
118 | BP_set	SatSet {C}
119 | BP_set	SourceText {    temp=arg->a[j]+arg->b[j];}
120 | BP_set	Instruction {mov      -8(%rbp),%rax}
121 | BP_done 10
122 | 
123 | BP_start 11
124 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
125 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#608}
126 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#parallel_combine_waves(float*,float*)#$b1}
127 | BP_set	InnerUnitLineOffset {6}
128 | BP_set	Flags {g 1 p g g}
129 | BP_set	SatSet {C}
130 | BP_set	SourceText {  args=initialize_args(a, b, args);}
131 | BP_set	Instruction {mov      -8(%rbp),%rdx}
132 | BP_done 11
133 | 
134 | BP_start 12
135 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
136 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#670}
137 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#pthreads_loop(void)#$b1}
138 | BP_set	InnerUnitLineOffset {7}
139 | BP_set	Flags {g 1 p g g}
140 | BP_set	SatSet {C}
141 | BP_set	SourceText {  simple_wave(component,period,amplitude);}
142 | BP_set	Instruction {mov      -804(%rbp),%edx}
143 | BP_done 12
144 | 
145 | BP_start 13
146 | BP_set	ContextPath {/home/stewart/Projects/DemoDVD/src/combined.cxx#29}
147 | BP_set	LinePath {/home/stewart/Projects/DemoDVD/src/combined.cxx#717}
148 | BP_set	InnerUnitPath {##/home/stewart/Projects/DemoDVD/programs/combined#combined.cxx#diveinall(void)#$b1}
149 | BP_set	InnerUnitLineOffset {18}
150 | BP_set	Flags {g 1 p g g}
151 | BP_set	SatSet {C}
152 | BP_set	SourceText {  bb =  cc = 0xffffffffLL;}
153 | BP_set	Instruction {movl     $-1,%eax}
154 | BP_done 13
155 | 
156 | }
157 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/gpu-nvidia-mmult/common/exception.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | /* CUda UTility Library */
 29 | #ifndef COMMON_EXCEPTION_H_
 30 | #define COMMON_EXCEPTION_H_
 31 | 
 32 | // includes, system
 33 | #include <stdlib.h>
 34 | #include <exception>
 35 | #include <iostream>
 36 | #include <stdexcept>
 37 | #include <string>
 38 | 
 39 | //! Exception wrapper.
 40 | //! @tparam Std_Exception exception type from namespace std used as base class.
 41 | template <class Std_Exception>
 42 | class Exception : public Std_Exception {
 43 |  public:
 44 |   //! @brief Static construction interface
 45 |   //! @return Never returns: always throws an Exception<Std_Exception>
 46 |   //! @param file file in which the Exception occurs
 47 |   //! @param line line in which the Exception occurs
 48 |   //! @param detailed details on the code fragment causing the Exception
 49 |   static void throw_it(const char *file, const int line,
 50 |                        const char *detailed = "-");
 51 | 
 52 |   //! @brief Static construction interface (std::string overload)
 53 |   //! @return Never returns: always throws an Exception<Std_Exception>
 54 |   //! @param file file in which the Exception occurs
 55 |   //! @param line line in which the Exception occurs
 56 |   //! @param detailed details on the code fragment causing the Exception
 57 |   static void throw_it(const char *file, const int line,
 58 |                        const std::string &detailed);
 59 | 
 60 |   //! Destructor
 61 |   virtual ~Exception() throw();
 62 | 
 63 |  private:
 64 |   //! Constructor, default (private)
 65 |   Exception();
 66 | 
 67 |   //! Constructor, standard (private; throw_it constructs the instances)
 68 |   //! @param str string forwarded to the Std_Exception base, returned by what()
 69 |   explicit Exception(const std::string &str);
 70 | };
 71 | 
 72 | ////////////////////////////////////////////////////////////////////////////////
 73 | //! Exception handler for arbitrary exceptions: prints ex.what() to std::cerr
 74 | //! and exits with EXIT_FAILURE. @param ex exception to handle
 75 | ////////////////////////////////////////////////////////////////////////////////
 76 | template <class Exception_Typ>
 77 | inline void handleException(const Exception_Typ &ex) {
 78 |   std::cerr << ex.what() << std::endl;
 79 | 
 80 |   exit(EXIT_FAILURE);
 81 | }
 82 | 
 83 | //! Convenience macros
 84 | 
 85 | //! Exception caused by dynamic program behavior, e.g. file does not exist
 86 | #define RUNTIME_EXCEPTION(msg) \
 87 |   Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
 88 | 
 89 | //! Logic exception in program, e.g. an assert failed
 90 | #define LOGIC_EXCEPTION(msg) \
 91 |   Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
 92 | 
 93 | //! Out of range exception
 94 | #define RANGE_EXCEPTION(msg) \
 95 |   Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
 96 | 
 97 | ////////////////////////////////////////////////////////////////////////////////
 98 | //! Implementation
 99 | 
100 | // includes, system
101 | #include <sstream>
102 | 
103 | ////////////////////////////////////////////////////////////////////////////////
104 | //! Static construction interface: formats the message and throws.
105 | //! @param file, line, detailed  Source location and description put in what().
106 | ////////////////////////////////////////////////////////////////////////////////
107 | /*static*/ template <class Std_Exception>
108 | void Exception<Std_Exception>::throw_it(const char *file, const int line,
109 |                                         const char *detailed) {
110 |   std::stringstream s;
111 | 
112 |   // Quite heavy-weight but exceptions are not for
113 |   // performance / release versions
114 |   s << "Exception in file '" << file << "' in line " << line << "\n"
115 |     << "Detailed description: " << detailed << "\n";
116 | 
117 |   throw Exception(s.str());
118 | }
119 | 
120 | ////////////////////////////////////////////////////////////////////////////////
121 | //! Static construction interface, std::string overload.
122 | //! Forwards msg.c_str() to the const char* overload, which throws.
123 | ////////////////////////////////////////////////////////////////////////////////
124 | /*static*/ template <class Std_Exception>
125 | void Exception<Std_Exception>::throw_it(const char *file, const int line,
126 |                                         const std::string &msg) {
127 |   throw_it(file, line, msg.c_str());
128 | }
129 | 
130 | ////////////////////////////////////////////////////////////////////////////////
131 | //! Constructor, default (private): passes "Unknown Exception.\n" to the base.
132 | ////////////////////////////////////////////////////////////////////////////////
133 | template <class Std_Exception>
134 | Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
135 | 
136 | ////////////////////////////////////////////////////////////////////////////////
137 | //! Constructor, standard (private).
138 | //! @param s string forwarded unchanged to the Std_Exception base constructor.
139 | ////////////////////////////////////////////////////////////////////////////////
140 | template <class Std_Exception>
141 | Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
142 | 
143 | ////////////////////////////////////////////////////////////////////////////////
144 | //! Destructor (empty body; declared with a throw() specification)
145 | ////////////////////////////////////////////////////////////////////////////////
146 | template <class Std_Exception>
147 | Exception<Std_Exception>::~Exception() throw() {}
148 | 
149 |   // functions, exported
150 | 
151 | #endif  // COMMON_EXCEPTION_H_
152 | 


--------------------------------------------------------------------------------
/Linaro-Forge/performance/mmult.py:
--------------------------------------------------------------------------------
  1 | # ===============================================================================
  2 | # Copyright (C) March 2023 - Linaro Limited (or its affiliates). All rights reserved.
  3 | # Copyright (C) Arm Limited, 2019-2023 All rights reserved.
  4 | # The example code is provided to you as an aid to learning when working
  5 | # with Linaro Forge, including but not limited to programming tutorials.
  6 | # Linaro hereby grants to you, subject to the terms and conditions of this Licence,
  7 | # a non-exclusive, non-transferable, non-sub-licensable, free-of-charge licence,
  8 | # to use and copy the Software solely for the purpose of demonstration and
  9 | # evaluation.
 10 | # You accept that the Software has not been tested by Linaro therefore the Software
 11 | # is provided “as is”, without warranty of any kind, express or implied. In no
 12 | # event shall the authors or copyright holders be liable for any claim, damages
 13 | # or other liability, whether in action or contract, tort or otherwise, arising
 14 | # from, out of or in connection with the Software or the use of Software.
 15 | # ===============================================================================
 16 | 
 17 | #!/usr/bin/env python
 18 | from __future__ import division
 19 | from __future__ import print_function
 20 | from __future__ import with_statement
 21 | 
 22 | 
 23 | 
 24 | import argparse
 25 | import ctypes
 26 | import os
 27 | import sys
 28 | from ctypes import c_int
 29 | 
 30 | import numpy
 31 | import mpi4py
 32 | 
 33 | from mpi4py import MPI
 34 | from numpy.ctypeslib import ndpointer
 35 | from scipy.linalg import blas
 36 | 
 37 | 
 38 | # Enable MPI SINGLE thread
 39 | mpi4py.rc.threaded = False
 40 | mpi4py.rc.thread_level = "single"
 41 | 
 42 | 
 43 | 
 44 | 
 45 | 
 46 | # Check if C kernel has been compiled
 47 | try:
 48 |     open("libmmult_c.so", 'r')
 49 | except FileNotFoundError:
 50 |     print("C kernel not found. Please run 'make -f mmult_py.makefile' "
 51 |           "to compile it before running this script")
 52 |     sys.exit(1)
 53 | 
 54 | # Check and load F90 kernel
 55 | sys.path.insert(0, '.')
 56 | try:
 57 |     import libmmult_f
 58 | except ImportError:
 59 |     print("F90 kernel not found. Please run 'make -f mmult_py.makefile' "
 60 |           "to compile it before running this script")
 61 |     sys.exit(1)
 62 | 
 63 | # Load C kernel
 64 | C_MMULT_LIB = ctypes.CDLL(os.path.join(os.path.dirname(__file__), "libmmult_c.so"))
 65 | 
 66 | # Declare ctype for ndarray pointer
 67 | arr_ptr_t_c = ndpointer(dtype=numpy.float64, ndim=1, flags='C')
 68 | C_MMULT_LIB.mmult.argtypes = [c_int, c_int, arr_ptr_t_c, arr_ptr_t_c, arr_ptr_t_c]
 69 | C_MMULT_LIB.mmult.restype = None
 70 | 
 71 | DEFAULT_SIZE = 64
 72 | DEFAULT_FN = "res_Py.mat"
 73 | DEFAULT_KERNEL = "C"
 74 | SOLVER_CHOICES = ["C", "F90", "Py"]
 75 | 
 76 | 
 77 | def minit(sz, fortran_style_array_order, A):  # fill A in place with i*(j+1)
 78 |     for i in range(0, sz):
 79 |         for j in range(0, sz):
 80 |             if fortran_style_array_order:
 81 |                 A[i, j] = i*(j+1)  # A is indexed as a 2-D (sz, sz) array
 82 |             else:
 83 |                 A[i*sz+j] = i*(j+1)  # A is indexed as a flat array, row i at offset i*sz
 84 | 
 85 | 
 86 | def mwrite(A, fn):
 87 |     f = open(fn, "w")
 88 |     A.tofile(f, sep="\t", format="%g")
 89 |     f.close()
 90 | 
 91 | 
 92 | def main(sz, kernel, filename):  # rank 0 scatters A/B/C, all ranks multiply, rank 0 gathers C
 93 |     intercomm = MPI.Comm.Get_parent()  # compared with MPI.COMM_NULL at the end of main
 94 | 
 95 |     comm = MPI.COMM_WORLD
 96 |     nproc = comm.size
 97 |     mr = comm.rank
 98 | 
 99 |     if mr == 0:
100 |         print("-------------------------------------------------------------------\n"
101 |               "This program contains an intentional bug. See the 'Worked Examples'\n"
102 |               "section of the Linaro Forge user guide for more information:\n"
103 |               "https://docs.linaroforge.com/latest/html/forge/index.html or\n"
104 |               "../doc/userguide-forge.pdf\n"
105 |               "-------------------------------------------------------------------\n")
106 | 
107 |     remainder = sz%nproc
108 |     # Shrink sz to the nearest lower multiple of nproc so it divides evenly
109 |     if remainder > 0:
110 |         if mr == 0:
111 |             print("{}: Info: reducing SIZE {} to {} to be a multiple of number of "
112 |                   "processes ({})".format(mr, sz, sz-remainder, nproc))
113 |         sz = sz-remainder
114 | 
115 |     mslice = int(sz*sz/nproc)  # elements per rank when slicing the flat arrays
116 |     mslice_r = int(sz/nproc)  # columns per rank when slicing the 2-D arrays
117 | 
118 |     fortran_style_array_order = kernel in ('F90', 'Py')
119 | 
120 |     if kernel == "F90":
121 |         print("Skipping due to the Fortran function call ommiting some call arguments in Python")
122 |         sys.exit(0)  # not rank-guarded: every rank exits, so the F90 branch below is never reached
123 |     
124 |     if mr == 0:
125 |         print("{rank}: Size of the matrices: {size}x{size}".format(rank=mr, size=sz))
126 |         print("{}: Kernel: {}".format(mr, kernel))
127 | 
128 |     if mr == 0:
129 |         if fortran_style_array_order:
130 |             mat_a = numpy.ndarray(shape=(sz, sz), dtype='d', order='F')
131 |             mat_b = numpy.ndarray(shape=(sz, sz), dtype='d', order='F')
132 |             mat_c = numpy.ndarray(shape=(sz, sz), dtype='d', order='F')
133 |         else:
134 |             mat_a = numpy.ndarray(shape=(sz*sz), dtype='d', order='C')
135 |             mat_b = numpy.ndarray(shape=(sz*sz), dtype='d', order='C')
136 |             mat_c = numpy.ndarray(shape=(sz*sz), dtype='d', order='C')
137 | 
138 |         print("{}: Initializing matrices...".format(mr))
139 |         minit(sz, fortran_style_array_order, mat_a)
140 |         minit(sz, fortran_style_array_order, mat_b)
141 |         minit(sz, fortran_style_array_order, mat_c)
142 | 
143 |         print("{}: Sending matrices".format(mr))
144 |         for i in range(1, nproc):
145 |             # Get a slice from the mat_a and mat_c matrix
146 |             if fortran_style_array_order:
147 |                 mat_a_slice = mat_c[:, i*mslice_r:(i+1)*mslice_r]  # NOTE(review): slices mat_c, not mat_a - confirm intent
148 |                 mat_c_slice = mat_c[:, i*mslice_r:(i+1)*mslice_r]
149 |             else:
150 |                 mat_a_slice = mat_a[i*mslice:(i+1)*mslice]
151 |                 mat_c_slice = mat_c[i*mslice:(i+1)*mslice]
152 |             comm.send(mat_a_slice, dest=i, tag=i)  # tags: i = A slice, 100+i = full B, 200+i = C slice
153 |             comm.send(mat_b, dest=i, tag=100+i)
154 |             comm.send(mat_c_slice, dest=i, tag=200+i)
155 |     else:
156 |         print("{}: Receiving matrices".format(mr))
157 |         if fortran_style_array_order:
158 |             mat_a = numpy.ndarray(shape=(sz, mslice_r), dtype='d', order='F')
159 |             mat_b = numpy.ndarray(shape=(sz, sz), dtype='d', order='F')
160 |             mat_c = numpy.ndarray(shape=(sz, mslice_r), dtype='d', order='F')
161 |         else:
162 |             mat_a = numpy.ndarray(shape=(mslice), dtype='d', order='C')
163 |             mat_b = numpy.ndarray(shape=(sz*sz), dtype='d', order='C')
164 |             mat_c = numpy.ndarray(shape=(mslice), dtype='d', order='C')
165 |         # The names are rebound to the received objects; the arrays above are discarded
166 |         mat_a = comm.recv(source=0, tag=mr)
167 |         mat_b = comm.recv(source=0, tag=100+mr)
168 |         mat_c = comm.recv(source=0, tag=200+mr)
169 | 
170 |     # Processing
171 |     print("{}: Processing..".format(mr))
172 |     if kernel == "F90":
173 |         # f2py makes sz parameter optional
174 |         libmmult_f.mmult(nproc, mat_a, mat_b, mat_c)
175 | 
176 |     elif kernel == "Py":
177 |         mat_c = blas.dgemm(alpha=1.0, a=mat_b, b=mat_a, beta=1.0, c=mat_c,
178 |                            overwrite_c=True, trans_b=False)
179 |     else:
180 |         C_MMULT_LIB.mmult(sz, nproc, mat_a, mat_b, mat_c)
181 | 
182 |     if mr == 0:
183 |         print("{}: Receiving result matrix...".format(mr))
184 |         for i in range(1, nproc):
185 |             if fortran_style_array_order:
186 |                 mat_c[:, i*mslice_r:(i+1)*mslice_r] = comm.recv(source=i, tag=500+i)
187 |             else:
188 |                 mat_c[i*mslice:(i+1)*mslice] = comm.recv(source=i, tag=500+i)
189 |     else:
190 |         print("{}: Sending result matrix...".format(mr))
191 |         comm.send(mat_c, dest=0, tag=500+mr)  # tag 500+rank marks the result slice
192 | 
193 |     # Writing result
194 |     if mr == 0:
195 |         mwrite(mat_c, filename)
196 | 
197 |     if mr == 0:
198 |         print("{}: Done".format(mr))
199 |     # Only synchronise with a parent when Get_parent() returned a real communicator
200 |     if intercomm != MPI.COMM_NULL:
201 |         intercomm.Barrier()
202 | 
203 | 
204 | if __name__ == "__main__":
205 |     parser = argparse.ArgumentParser(description="Matrix product.")
206 |     parser.add_argument("-k", dest="kernel", metavar="KERNEL", action="store", type=str,
207 |                         help=("Solver. Options: [%s] (default is C)"
208 |                               % "|".join(SOLVER_CHOICES)),
209 |                         choices=SOLVER_CHOICES, default=DEFAULT_KERNEL)
210 |     parser.add_argument("-s", dest="mat_size", metavar="SIZE", action="store", type=int,
211 |                         help=("size of the matrix to compute (default is %d)"
212 |                               % (DEFAULT_SIZE)),
213 |                         default=DEFAULT_SIZE)
214 |     parser.add_argument("-o", dest="fn", metavar="FILENAME", action="store", type=str,
215 |                         help=("output matrix file name (default is %s)" % DEFAULT_FN),
216 |                         default=DEFAULT_FN)
217 | 
218 |     args = parser.parse_args()
219 | 
220 |     main(args.mat_size, args.kernel, args.fn)
221 | 


--------------------------------------------------------------------------------
/TotalView/src/main.cxx:
--------------------------------------------------------------------------------
  1 | /***********************************************************************
  2 |  * Copyright 2000-2007 by Etnus, LLC. ALL RIGHTS RESERVED
  3 |  * No part of this material may be reproduced, stored in a retrieval
  4 |  * system, transmitted or used in any form or by any means, electronic,
  5 |  * mechanical, photocopying, recording, or otherwise, without the prior
  6 |  * written permission of, or express license from Etnus, LLC.
  7 |  ***********************************************************************
  8 |  ***********************************************************************
  9 |  * Copyright 2007 by TotalView Technologies
 10 |  * Copyright 1999-2007 by Etnus, Inc.
 11 |  * Copyright 1996-1998 by Dolphin Interconnect Solutions, Inc.
 12 |  * Copyright 1989-1996 by BBN Inc.
 13 |  ***********************************************************************/ 
 14 | #include "myClassA.hxx"
 15 | #include "myClassB.hxx"
 16 | 
 17 | #ifdef USEMPI
 18 | #include <mpi.h>
 19 | 
 20 | int rank;
 21 | 
 22 | #endif //USEMPI
 23 | 
 24 | #include <unistd.h>
 25 | #include <stdio.h>
 26 | #include <stdlib.h>
 27 | #include <string.h>
 28 | 
 29 | void *myMalloc( int size )  // thin wrapper: forwards size to malloc()
 30 | {
 31 |   return malloc( size );
 32 | }
 33 | 
 34 | void myFree( void *v )  // thin wrapper: forwards v to free()
 35 | {
 36 |   free( v );
 37 | }
 38 | 
 39 | void myFunc()  // allocates 10 ints via myMalloc(), fills them, never frees
 40 | {  
 41 |   int * alloc = 0;
 42 |   int i;
 43 |   
 44 |   alloc = (int *) myMalloc( 10 * sizeof(int) );
 45 |   // NOTE: i<=10 also writes alloc[10], one int past the 10-int block
 46 |   for( i=0; i<=10; i++ )
 47 |     {
 48 |       alloc[i]=12-i;
 49 |     }
 50 |   // Only pointer to the block is dropped without free() (presumably a demo leak)
 51 |   alloc=0;
 52 | }
 53 | 
 54 | void double_free()  // mallocs one block and calls free() on it twice
 55 | {
 56 |   int	  	 *p;
 57 |   int            length    = 0xab;   // passed to malloc(): size in bytes
 58 |   int            junk      = 0;  // only assigned; presumably a statement for breakpoints
 59 |   
 60 |   p = (int*) malloc( length );
 61 |   printf ( "malloced %4d (%#6x) bytes at %p\n", length, length, p );
 62 |   
 63 |   // Breakpoint here
 64 |   // Show allocated annotation
 65 |   // Show block properties and enable notify when deallocated
 66 |   junk = 0;
 67 |   
 68 |   // Now release the memory the first time - legal
 69 |   free ( p );
 70 |   
 71 |   // Breakpoint here
 72 |   // Show that the block is marked dangling
 73 |   // Show that the deallocation stack is available now
 74 |   junk = 0;
 75 |   
 76 |   // Now release the memory the second time - illegal
 77 | #ifdef USEMPI
 78 |   if( rank == 1 )  // with USEMPI only rank 1 performs the second free
 79 | #endif
 80 |      free ( p );
 81 | 
 82 | // Note: an evaluation point can be used to avoid the segv 
 83 | // and continue the demo.
 84 | }
 85 | 
 86 | void corrupt_data()  // writes one int past the end of p1 in each of 6 passes
 87 | {
 88 |   int      i, j;
 89 |   int      size;
 90 |   int    * p0;
 91 |   int    * p1;
 92 |   int    * p2;
 93 | 
 94 |   // Breakpoint here.
 95 |   // Enable Guard Blocks on the Memory Debugging Configuration Page.
 96 |   // Use 8 byte pre and post guard size.
 97 | 
 98 |   size = 16;
 99 | 
100 |   // Loop added around the allocates and the corrupt loop,
101 |   // to make the memory report viewing more interesting.
102 |   // 9/2009 CS at the suggestion of MS and EH
103 | 
104 |   for ( j=0; j<6 ; j++ )
105 |     {
106 |       // Each pass overwrites p0/p1/p2; blocks from earlier passes are never freed
107 |       p0 = (int *) malloc( size * sizeof( int ) );
108 |       p1 = (int *) malloc( size * sizeof( int ) );
109 |       p2 = (int *) malloc( size * sizeof( int ) );
110 | 
111 |   // Common corruption cases.  Oops in the for loop condition.
112 |       for( i=0; i<=size; i++ )
113 | 	{
114 | 	  p1[i] = size - i;
115 | 	}
116 |     }
117 | 
118 |   // Breakpoint here.
119 |   // Check Heap Status Corrupt Guard Block View to scan 
120 |   // all blocks for corruption.
121 |   i = 0;
122 | 
123 |   // Corrupt Guard Memory Event on free().
124 |   free( p1 );  // only the last pass's p1 is freed
125 | }
126 | 
127 | 
128 | void corrupt_data_rz()  // single-pass variant of corrupt_data(); frees all three blocks
129 | {
130 |   int      i;
131 |   int      size;
132 |   int    * p0;
133 |   int    * p1;
134 |   int    * p2;
135 | 
136 |   // Code is mostly the same as corrupt_data(), just repeated
137 |   // to demonstrate the ability to change from Guard Blocks to
138 |   // RedZones on the fly, and compare and contrast the two.
139 | 
140 |   size = 16;
141 | 
142 |   p0 = (int *) malloc( size * sizeof( int ) );
143 |   p1 = (int *) malloc( size * sizeof( int ) );
144 |   p2 = (int *) malloc( size * sizeof( int ) );
145 | 
146 |   // Common corruption cases.  Oops in the for loop condition.
147 |   // i<=size also writes p1[size], one int past the end of p1.
148 |   for( i=0; i<=size; i++ )
149 |     {
150 |       p1[i] = size - i;
151 |     }
152 | 
153 |   free( p2 );
154 |   free( p1 );
155 |   free( p0 );
156 | }
157 | 
158 | 
159 | void corrupt_data_sizes()  // overruns three blocks of 64, 256 and 128 ints by one int each
160 | {
161 |   int      i;
162 |   int      size;
163 |   int    * p0;
164 |   int    * p1;
165 |   int    * p2;
166 | 
167 |   // Like corrupt_data(), but allocates/corrupts different sizes.
168 |   // In RedZones detailed options, set size limits to ignore the
169 |   // smallest and largest size, and catch the middle size.
170 | 
171 |   size = 64;  // 256 bytes assuming a 4-byte int
172 |   p0 = (int *) malloc( size * sizeof( int ) );
173 |   for( i=0; i<=size; i++ )
174 |     {
175 |       p0[i] = size - i;
176 |     }
177 | 
178 |   size = 256;  // 1024 bytes assuming a 4-byte int
179 |   p1 = (int *) malloc( size * sizeof( int ) );
180 |   for( i=0; i<=size; i++ )
181 |     {
182 |       p1[i] = size - i;
183 |     }
184 | 
185 |   size = 128;  // 512 bytes assuming a 4-byte int
186 |   p2 = (int *) malloc( size * sizeof( int ) );
187 |   for( i=0; i<=size; i++ )
188 |     {
189 |       p2[i] = size - i;
190 |     }
191 | 
192 |   free( p2 );
193 |   free( p1 );
194 |   free( p0 );
195 | }
196 | 
197 | 
198 | void read_overrun()  // fills a 128-int block correctly, then reads one int past its end
199 | {
200 |   int      i;
201 |   int      size;
202 |   int    * p1;
203 |   int      j;  // receives each value read back; not used otherwise
204 | 
205 |   // RedZones can catch out-of-bounds access even if only a read.
206 |   // Make size such that RZ catches even if size limits from
207 |   // corrupt_sizes method are still in place.
208 | 
209 |   size = 128;
210 | 
211 |   p1 = (int *) malloc( size * sizeof( int ) );
212 | 
213 |   // Fill in with a correct loop.
214 |   for( i=0; i<size; i++ )
215 |     {
216 |       p1[i] = size - i;
217 |     }
218 | 
219 |   // Read back, now with an overrun in the loop.
220 |   for( i=0; i<=size; i++ )
221 |     {
222 |       j = p1[i];
223 |     }
224 | 
225 |   free( p1 );
226 | }
227 | 
228 | 
229 | int main( int argc, char *argv[] ) 
230 | {
231 |   myClassA  *a1;
232 |   myClassA  *a2;
233 |   myClassB  *b1;
234 |   int      **int_pp;
235 |   bool loop = false, runforever, runRedZones=false;  // runforever is declared but no longer used
236 | 
237 |   // Pass an argument to make this run forever.
238 |   //  argc == 2 ? runforever = 1 : runforever = 0;
239 | 
240 |   // runforever was eliminated.  Instead, the last bunch of allocates/leaks
241 |   // are done in a counted loop with a usleep - tuning notes appear near the
242 |   // loop.  Here, we check for an argument starting with "R", in which case
243 |   // the RedZones sections are to be run.
244 | 
245 |   if ( argc == 2 )
246 |     {
247 |       if ( argv[1][0] == 'r' || argv[1][0] == 'R' ) runRedZones = true;
248 |     }
249 | 
250 |   // Without USEMPI the allocation loop below always runs; with USEMPI it
251 |   // runs on rank 1 only.
252 | #ifdef USEMPI
253 |   int numprocs;
254 |   MPI_Init(&argc,&argv);
255 |   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
256 |   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
257 |   // Do this so that one rank has a lot more allocations/leaks
258 |   // than the other processors.
259 |   loop = rank == 1;
260 | #else
261 |   loop = true;
262 | #endif
263 | 
264 |   // TotalView can raise events for some kinds of memory errors.
265 |   //   - double free
266 |   //   - free pointer that points to data on the stack.
267 |   //   - free interior pointer
268 |   //   - free an unknown block
269 |   //   - realloc an unknown block
270 |   //   - double allocation (malloc problem)
271 |   //   - misaligned allocation (malloc problem)
272 |   //   - out of memory ( ... you check if malloc returns non-null, right?)
273 |   //   - invalid aligned alloc request (problem calling memalign)
274 |   //   - guard block corruption
275 |   //   - free notification (tagging)
276 |   //   - realloc notification (tagging)
277 |   //
278 |   // double_free() demonstrates "double free" and "free notification".
279 | // #ifndef USEMPI
280 | //  double_free();
281 | // #endif
282 | 
283 |   // TotalView can detect writing outside array bounds conditions
284 |   // by using "Guard Blocks".  The "Guard Blocks" feature must be
285 |   // enabled for this portion of the demo.
286 | // #ifndef USEMPI
287 | //  corrupt_data();
288 | // #endif
289 | 
290 |   // Set of routines similar in flavor to above, but to demo RedZones.
291 |   // If doing RedZones demo, break here, disable Guard Blocks, and
292 |   // enable RedZones.  Also suggest pre-planting eval points to skip
293 |   // over these calls in sequence.
294 | 
295 |   if ( runRedZones )
296 |     {
297 |     corrupt_data_rz();
298 |     corrupt_data_sizes(); // Suggest RZ size lower limit of 512, upper 1020
299 |     read_overrun();
300 |     }
301 | 
302 |   // In the following loop, tune the trip count to control the size of
303 |   // the memory footprint, which will also determine the time it takes
304 |   // to generate memory reports.  Tune the usleep parameter to control
305 |   // the execution time of the loop.
306 | 
307 |   int j;
308 |   j = 0; // Target in case eval points want to jump here.
309 |   for( j=0; (j<160 && loop) ; j++) {
310 |     usleep(100000);
311 | 
312 |     // Generate some interesting allocations and leaks using
313 |     // C++ constructors and destructors.
314 |     int size = 4;
315 |     int_pp = (int **) malloc(size * sizeof(int *));
316 |     for( int i=0; i<size; i++ )
317 |       {
318 |         int_pp[i] = (int *) malloc(16 * sizeof(int));
319 |         // int_pp, int_pp[i] and a1 are never released anywhere in main()
320 |         a1 = new myClassA();
321 |         b1 = new myClassB();
322 |         a2 = new myClassA();
323 |       
324 |         delete a2;
325 |         if(i%2 == 0) {
326 |           b1->destroy();
327 |         } 
328 |         else {
329 |           delete b1;
330 |         }
331 |       }
332 |   
333 |     int * alloc2 = 0;
334 |     for( int i = 1; i < 25; i++ )
335 |       {
336 |         alloc2 = (int *) myMalloc((int)(i * (10+(int) rand()/(RAND_MAX+1.0))));
337 |         // rand()/(RAND_MAX+1.0) is in [0,1), so the request is 10*i up to 11*i bytes
338 |         if( i % 5 )  // blocks with i = 5, 10, 15, 20 are not freed
339 |           {
340 |             myFree(alloc2);
341 |           }
342 |       }
343 |   
344 |     myFunc();
345 | 
346 |   } 
347 | 
348 |   /* breakpoint here*/
349 |   /* discuss heap view (graphical and source)
350 |    * show leak detection 
351 |    * show filtering 
352 |    */
353 | 
354 | printf("Reached the end of filterapp-leaks\n");
355 | 
356 | #ifdef USEMPI
357 |   MPI_Finalize();
358 | #endif
359 | 
360 |   return 0;
361 | }
362 | 


--------------------------------------------------------------------------------
/Linaro-Forge/performance/common.makefile:
--------------------------------------------------------------------------------
  1 | ### Determine compiler invocation ###
  2 | 
  3 | ifdef PE_ENV
  4 | 
  5 | # Cray-specific invocations
  6 | CC = cc
  7 | CXX = CC
  8 | MPICC = $(CC)
  9 | MPICXX = $(CXX)
 10 | FC = ftn
 11 | F77 = $(FC)
 12 | F90 = $(FC)
 13 | MPIF77 = $(FC)
 14 | MPIF90 = $(FC)
 15 | 
 16 | else
 17 | 
 18 | ifneq ($(filter default undefined,$(origin FC)),)
 19 | # default to GNU
 20 | FC := gfortran
 21 | endif
 22 | F77 ?= $(FC)
 23 | F90 ?= $(FC)
 24 | 
 25 | # MPI C/C++ Compilers
 26 | ifndef MPICC
 27 | ifeq ($(shell which mpiicc > /dev/null 2>&1; echo $$?),0)
 28 | MPICC := mpiicc
 29 | else ifeq ($(shell which mpicc > /dev/null 2>&1; echo $$?),0)
 30 | MPICC := mpicc
 31 | endif
 32 | endif
 33 | # Only detect toolchain if MPICC is set, otherwise defer error to rule which invokes compiler
 34 | ifdef MPICC
 35 | # disable remark #10441: warning for deprecated Intel Compiler Classic
 36 | MPICC_VERSION := $(shell $(MPICC) --version -diag-disable=10441 2> /dev/null || $(MPICC) --version 2> /dev/null || $(MPICC) -qversion 2> /dev/null)
 37 | else
 38 | MPICC = $(error Could not detect MPI C compiler in PATH - failed to make target $@)
 39 | endif
 40 | 
 41 | ifndef MPICXX
 42 | ifeq ($(shell which mpiicpc > /dev/null 2>&1; echo $$?),0)
 43 | MPICXX := mpiicpc
 44 | else ifeq ($(shell which mpic++ > /dev/null 2>&1; echo $$?),0)
 45 | MPICXX := mpic++
 46 | else ifeq ($(shell which mpicxx > /dev/null 2>&1; echo $$?),0)
 47 | MPICXX := mpicxx
 48 | endif
 49 | endif
 50 | MPICXX ?= $(error Could not detect MPI C++ compiler in PATH - failed to make target $@)
 51 | 
 52 | # MPI Fortran Compilers
 53 | ifndef MPIF90
 54 | ifeq ($(shell which mpiifort > /dev/null 2>&1; echo $$?),0)
 55 | MPIF90 := mpiifort
 56 | else ifeq ($(shell which mpifc > /dev/null 2>&1; echo $$?),0)
 57 | MPIF90 := mpifc
 58 | else ifeq ($(shell which mpifort > /dev/null 2>&1; echo $$?),0)
 59 | MPIF90 := mpifort
 60 | else ifeq ($(shell which mpif90 > /dev/null 2>&1; echo $$?),0)
 61 | MPIF90 := mpif90
 62 | endif
 63 | endif
 64 | 
 65 | # Only detect toolchain if MPIF90 is set, otherwise defer error to rule which invokes compiler
 66 | ifdef MPIF90
 67 | # disable remark #10441: warning for deprecated Intel Compiler Classic
 68 | MPIF90_VERSION := $(shell $(MPIF90) --version -diag-disable=10441 2> /dev/null || $(MPIF90) --version 2> /dev/null || $(MPIF90) -qversion 2> /dev/null)
 69 | else
 70 | MPIF90 = $(error Could not detect MPI Fortran compiler in PATH - failed to make target $@)
 71 | endif
 72 | 
 73 | ifndef MPIF77
 74 | ifeq ($(shell which mpif77 > /dev/null 2>&1; echo $$?),0)
 75 | MPIF77 := mpif77
 76 | else
 77 | MPIF77 = $(MPIF90)
 78 | endif
 79 | endif
 80 | 
 81 | MPIFC ?= $(MPIF90)
 82 | 
 83 | endif
 84 | 
 85 | ### Recommended compiler flags ###
 86 | 
 87 | # Flags for compiler inlining: MAP works whether inlining is on or off,
 88 | # but you'll typically see more intuitive stacks with it turned off.
 89 | # The major compilers are discussed here:
 90 | #
 91 | # Intel: -g -fno-inline -no-ip -no-ipo -fno-omit-frame-pointer -O3 is
 92 | # recommended. At O3 the compiler doesn't produce enough unwind info even
 93 | # with -debug inline-debug-info set.
 94 | #
 95 | # PGI: -g -O3 -Meh_frame -Mframe -Mnoautoinline is recommended. Other settings
 96 | # don't produce enough unwind information for inlined functions otherwise. This
 97 | # adds some performance penalty - around 8% is typical.
 98 | #
 99 | # The PGI C runtime static library contains an undefined reference to
100 | # __kmpc_fork_call, which will cause compilation to fail when linking
101 | # allinea-profiler.ld. Add --undefined __wrap___kmpc_fork_call to your link line
102 | # before linking to the Forge sampler to resolve this.
103 | #
104 | # GNU: -g -O3 -fno-inline is recommended. You might be lucky without -fno-inline,
105 | # as it should produce enough information to unwind those calls. You will see
106 | # my_function [inlined] in the MAP stack for functions that were inline.
107 | # -fno-inline-functions appears with newer gnu compilers, just to confuse
108 | 
109 | # Common OpenMP flags for supported compilers
110 | # -fopenmp for gnu
111 | # -qopenmp for intel
112 | # -mp      for pgi
113 | # -qsmp=omp:noopt for IBM
114 | # -homp    for cray (compiler)
115 | 
116 | # Common pthread flags for supported compilers
117 | # -pthread for GNU
118 | # -lpthread for other compilers
119 | 
120 | INTEL_LLVM_MAP_CFLAGS := -g -fno-inline -no-ipo -fno-omit-frame-pointer -O3
121 | INTEL_LLVM_DDT_CFLAGS := -g -Wall -O0
122 | INTEL_LLVM_OPENMP_CFLAG := -qopenmp
123 | INTEL_LLVM_MAP_FCFLAGS := $(INTEL_LLVM_MAP_CFLAGS)
124 | INTEL_LLVM_DDT_FCFLAGS := $(filter-out -Wall, $(INTEL_LLVM_DDT_CFLAGS)) -warn all
125 | INTEL_LLVM_OPENMP_FCFLAG := $(INTEL_LLVM_OPENMP_CFLAG)
126 | INTEL_LLVM_PTHREAD_CFLAG := -lpthread
127 | INTEL_LLVM_SHARED_LIBRARY_CFLAGS=-fPIC
128 | INTEL_LLVM_SHARED_LIBRARY_LINKER_FLAGS=-shared
129 | 
130 | INTEL_MAP_CFLAGS := -g -fno-inline -no-ip -no-ipo -fno-omit-frame-pointer -O3
131 | INTEL_DDT_CFLAGS := -g -w3 -O0
132 | INTEL_OPENMP_CFLAG := -qopenmp
133 | INTEL_MAP_FCFLAGS := $(INTEL_MAP_CFLAGS)
134 | INTEL_DDT_FCFLAGS := $(filter-out -w3, $(INTEL_DDT_CFLAGS)) -warn all
135 | INTEL_OPENMP_FCFLAG := $(INTEL_OPENMP_CFLAG)
136 | INTEL_PTHREAD_CFLAG := -lpthread
137 | INTEL_SHARED_LIBRARY_CFLAGS=-fPIC
138 | INTEL_SHARED_LIBRARY_LINKER_FLAGS=-shared
139 | 
140 | PGI_MAP_CFLAGS := -g -Meh_frame -Mframe -O3 -Mnoautoinline
141 | PGI_DDT_CFLAGS := -g -O0
142 | PGI_MAJOR_VERSION_GT_17 := $(shell expr `$(CC) --version 2> /dev/null | sed -nE 's/^pgcc ([0-9]+)\..*/\1/p'` \> 17 2> /dev/null)
143 | ifeq ($(PGI_MAJOR_VERSION_GT_17),1)
144 | PGI_MAP_CFLAGS := $(PGI_MAP_CFLAGS) -Wl,--undefined=__wrap___kmpc_fork_call
145 | endif
146 | PGI_OPENMP_CFLAG := -mp
147 | PGI_MAP_FCFLAGS := $(filter-out -Meh_frame, $(PGI_MAP_CFLAGS))
148 | PGI_DDT_FCFLAGS := $(PGI_DDT_CFLAGS)
149 | PGI_OPENMP_FCFLAG := $(PGI_OPENMP_CFLAG)
150 | PGI_PTHREAD_CFLAG := -lpthread
151 | PGI_SHARED_LIBRARY_CFLAGS=-fPIC
152 | PGI_SHARED_LIBRARY_LINKER_FLAGS=-shared
153 | 
154 | NVC_MAP_CFLAGS := -g -Meh_frame -Mframe -O3 -Mnoautoinline
155 | NVC_DDT_CFLAGS := -g -O0
156 | NVC_MAJOR_VERSION_GT_20 := $(shell expr `$(CC) --version 2> /dev/null | sed -nE 's/^pgcc ([0-9]+)\..*/\1/p'` \> 20 2> /dev/null)
157 | NVC_MAP_CFLAGS := $(NVC_MAP_CFLAGS) -Wl,--undefined=__wrap___kmpc_fork_call
158 | NVC_OPENMP_CFLAG := -mp
159 | NVC_MAP_FCFLAGS := $(filter-out -Meh_frame, $(NVC_MAP_CFLAGS))
160 | NVC_DDT_FCFLAGS := $(NVC_DDT_CFLAGS)
161 | NVC_OPENMP_FCFLAG := $(NVC_OPENMP_CFLAG)
162 | NVC_PTHREAD_CFLAG := -lpthread
163 | NVC_SHARED_LIBRARY_CFLAGS=-fPIC
164 | NVC_SHARED_LIBRARY_LINKER_FLAGS=-shared
165 | 
166 | IBM_MAP_CFLAGS := -g -O3 -qnoinline
167 | IBM_DDT_CFLAGS := -g -Werror -Weverything -O0
168 | IBM_OPENMP_CFLAG := -qsmp=omp:noopt
169 | IBM_MAP_FCFLAGS := $(IBM_MAP_CFLAGS)
170 | IBM_DDT_FCFLAGS := $(IBM_DDT_CFLAGS)
171 | IBM_OPENMP_FCFLAG := $(IBM_OPENMP_CFLAG) -qsmp=omp:noopt -qnohot -lxlf90 -lxlsmp -lxlfmath
172 | IBM_PTHREAD_CFLAG := -lpthread
173 | IBM_SHARED_LIBRARY_CFLAGS=
174 | IBM_SHARED_LIBRARY_LINKER_FLAGS=-qmkshrobj
175 | 
176 | CRAY_MAP_CFLAGS := -g -O3 -hipa0
177 | CRAY_DDT_CFLAGS := -g -h msglevel_2 -O0
178 | CRAY_OPENMP_CFLAG := -homp
179 | CRAY_MAP_FCFLAGS := $(CRAY_MAP_CFLAGS)
180 | CRAY_DDT_FCFLAGS := -g -m 2
181 | CRAY_OPENMP_FCFLAG := $(CRAY_OPENMP_CFLAG)
182 | CRAY_PTHREAD_CFLAG := -lpthread
183 | CRAY_SHARED_LIBRARY_CFLAGS=-fPIC
184 | CRAY_SHARED_LIBRARY_LINKER_FLAGS=-shared
185 | 
186 | GNU_MAP_CFLAGS := -g -O3 -fno-inline -fno-optimize-sibling-calls
187 | GNU_DDT_CFLAGS := -g -Wall -Werror -O0
188 | GNU_OPENMP_CFLAG := -fopenmp
189 | GNU_MAP_FCFLAGS := $(GNU_MAP_CFLAGS)
190 | GNU_DDT_FCFLAGS := $(GNU_DDT_CFLAGS)
191 | GNU_OPENMP_FCFLAG := $(GNU_OPENMP_CFLAG)
192 | GNU_PTHREAD_CFLAG := -pthread
193 | GNU_SHARED_LIBRARY_CFLAGS=-fPIC
194 | GNU_SHARED_LIBRARY_LINKER_FLAGS=-shared
195 | 
196 | # GCC 10 is stricter on requiring standard-compliant Fortran, set this flag
197 | # when compiling older Fortran programs.
198 | GNU_LEGACY_STD_FCFLAG := -std=legacy
199 | 
200 | ### Toolchain detection ###
201 | 
202 | define get_compiler_toolchain
203 | $(if $(or $(findstring icx,$(1)),$(findstring ifx,$(1)),$(findstring Intel(R) oneAPI,$(1)),$(findstring INTEL,$(PE_ENV))),
204 | 	INTEL_LLVM,
205 | $(if $(or $(findstring icc,$(1)),$(findstring ifort,$(1)),$(findstring Intel,$(1)),$(findstring INTEL,$(PE_ENV))),
206 | 	INTEL,
207 | $(if $(or $(findstring pgcc,$(1)),$(findstring pgfortran,$(1)),$(findstring PGI,$(1)), $(findstring PGI,$(PE_ENV))),
208 | 	PGI,
209 | $(if $(or $(findstring nvc,$(1)),$(findstring nvfortran,$(1))),
210 | 	NVC,
211 | $(if $(or $(findstring xlc,$(1)),$(findstring xlf,$(1)),$(findstring IBM,$(1)),$(findstring IBM,$(PE_ENV))),
212 | 	IBM,
213 | $(if $(findstring CRAY,$(PE_ENV)),
214 | 	CRAY,
215 | 	GNU))))))
216 | endef
217 | 
218 | CC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(CC)))
219 | MPICC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(MPICC_VERSION)))
220 | FC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(FC)))
221 | MPIF90_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(MPIF90_VERSION)))
222 | 
223 | ### Compiler flags for toolchain (allow overrides) ###
224 | 
225 | MAP_CFLAGS ?= $($(CC_TOOLCHAIN)_MAP_CFLAGS)
226 | MAP_FCFLAGS ?= $($(FC_TOOLCHAIN)_MAP_FCFLAGS)
227 | DDT_CFLAGS ?= $($(CC_TOOLCHAIN)_DDT_CFLAGS)
228 | DDT_FCFLAGS ?= $($(FC_TOOLCHAIN)_DDT_FCFLAGS)
229 | OPENMP_CFLAG ?= $($(CC_TOOLCHAIN)_OPENMP_CFLAG)
230 | OPENMP_FCFLAG ?= $($(FC_TOOLCHAIN)_OPENMP_FCFLAG)
231 | PTHREAD_CFLAG ?= $($(CC_TOOLCHAIN)_PTHREAD_CFLAG)
232 | PTHREAD_FCFLAG ?= $($(FC_TOOLCHAIN)_PTHREAD_CFLAG)
233 | SHARED_LIBRARY_CFLAGS ?= $($(FC_TOOLCHAIN)_SHARED_LIBRARY_CFLAGS)
234 | SHARED_LIBRARY_LINKER_FLAGS ?= $($(FC_TOOLCHAIN)_SHARED_LIBRARY_LINKER_FLAGS)
235 | LEGACY_STD_FCFLAG ?= $($(FC_TOOLCHAIN)_LEGACY_STD_FCFLAG)
236 | MPI_MAP_CFLAGS ?= $($(MPICC_TOOLCHAIN)_MAP_CFLAGS)
237 | MPI_MAP_FCFLAGS ?= $($(MPIF90_TOOLCHAIN)_MAP_FCFLAGS)
238 | MPI_DDT_CFLAGS ?= $($(MPICC_TOOLCHAIN)_DDT_CFLAGS)
239 | MPI_DDT_FCFLAGS ?= $($(MPIF90_TOOLCHAIN)_DDT_FCFLAGS)
240 | MPI_OPENMP_CFLAG ?= $($(MPICC_TOOLCHAIN)_OPENMP_CFLAG)
241 | MPI_OPENMP_FCFLAG ?= $($(MPIF90_TOOLCHAIN)_OPENMP_FCFLAG)
242 | MPI_PTHREAD_CFLAG ?= $($(MPICC_TOOLCHAIN)_PTHREAD_CFLAG)
243 | MPI_PTHREAD_FCFLAG ?= $($(MPIF90_TOOLCHAIN)_PTHREAD_CFLAG)
244 | 
245 | ## Link flags for static Forge sampler
246 | ifeq ($(CC_TOOLCHAIN),GNU)
247 | ifneq ($(shell $(CC) -dumpspecs 2>/dev/null | grep -e '[^f]no-pie'),)
248 | MAP_STATIC_C_LINKFLAGS := -no-pie
249 | endif
250 | endif
251 | 
252 | ifeq ($(FC_TOOLCHAIN),GNU)
253 | ifneq ($(shell $(FC) -dumpspecs 2>/dev/null | grep -e '[^f]no-pie'),)
254 | MAP_STATIC_FC_LINKFLAGS := -no-pie
255 | endif
256 | endif
257 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/core-files/common.makefile:
--------------------------------------------------------------------------------
  1 | ### Determine compiler invocation ###
  2 | 
  3 | ifdef PE_ENV
  4 | 
  5 | # Cray-specific invocations
  6 | CC = cc
  7 | CXX = CC
  8 | MPICC = $(CC)
  9 | MPICXX = $(CXX)
 10 | FC = ftn
 11 | F77 = $(FC)
 12 | F90 = $(FC)
 13 | MPIF77 = $(FC)
 14 | MPIF90 = $(FC)
 15 | 
 16 | else
 17 | 
 18 | ifneq ($(filter default undefined,$(origin FC)),)
 19 | # default to GNU
 20 | FC := gfortran
 21 | endif
 22 | F77 ?= $(FC)
 23 | F90 ?= $(FC)
 24 | 
 25 | # MPI C/C++ Compilers
 26 | ifndef MPICC
 27 | ifeq ($(shell which mpiicc > /dev/null 2>&1; echo $$?),0)
 28 | MPICC := mpiicc
 29 | else ifeq ($(shell which mpicc > /dev/null 2>&1; echo $$?),0)
 30 | MPICC := mpicc
 31 | endif
 32 | endif
 33 | # Only detect toolchain if MPICC is set, otherwise defer error to rule which invokes compiler
 34 | ifdef MPICC
 35 | # disable remark #10441: warning for deprecated Intel Compiler Classic
 36 | MPICC_VERSION := $(shell $(MPICC) --version -diag-disable=10441 2> /dev/null || $(MPICC) --version 2> /dev/null || $(MPICC) -qversion 2> /dev/null)
 37 | else
 38 | MPICC = $(error Could not detect MPI C compiler in PATH - failed to make target $@)
 39 | endif
 40 | 
 41 | ifndef MPICXX
 42 | ifeq ($(shell which mpiicpc > /dev/null 2>&1; echo $$?),0)
 43 | MPICXX := mpiicpc
 44 | else ifeq ($(shell which mpic++ > /dev/null 2>&1; echo $$?),0)
 45 | MPICXX := mpic++
 46 | else ifeq ($(shell which mpicxx > /dev/null 2>&1; echo $$?),0)
 47 | MPICXX := mpicxx
 48 | endif
 49 | endif
 50 | MPICXX ?= $(error Could not detect MPI C++ compiler in PATH - failed to make target $@)
 51 | 
 52 | # MPI Fortran Compilers
 53 | ifndef MPIF90
 54 | ifeq ($(shell which mpiifort > /dev/null 2>&1; echo $$?),0)
 55 | MPIF90 := mpiifort
 56 | else ifeq ($(shell which mpifc > /dev/null 2>&1; echo $$?),0)
 57 | MPIF90 := mpifc
 58 | else ifeq ($(shell which mpifort > /dev/null 2>&1; echo $$?),0)
 59 | MPIF90 := mpifort
 60 | else ifeq ($(shell which mpif90 > /dev/null 2>&1; echo $$?),0)
 61 | MPIF90 := mpif90
 62 | endif
 63 | endif
 64 | 
 65 | # Only detect toolchain if MPIF90 is set, otherwise defer error to rule which invokes compiler
 66 | ifdef MPIF90
 67 | # disable remark #10441: warning for deprecated Intel Compiler Classic
 68 | MPIF90_VERSION := $(shell $(MPIF90) --version -diag-disable=10441 2> /dev/null || $(MPIF90) --version 2> /dev/null || $(MPIF90) -qversion 2> /dev/null)
 69 | else
 70 | MPIF90 = $(error Could not detect MPI Fortran compiler in PATH - failed to make target $@)
 71 | endif
 72 | 
 73 | ifndef MPIF77
 74 | ifeq ($(shell which mpif77 > /dev/null 2>&1; echo $$?),0)
 75 | MPIF77 := mpif77
 76 | else
 77 | MPIF77 = $(MPIF90)
 78 | endif
 79 | endif
 80 | 
 81 | MPIFC ?= $(MPIF90)
 82 | 
 83 | endif
 84 | 
 85 | ### Recommended compiler flags ###
 86 | 
 87 | # Flags for compiler inlining: MAP works whether inlining is on or off,
 88 | # but you'll typically see more intuitive stacks with it turned off.
 89 | # The major compilers are discussed here:
 90 | #
 91 | # Intel: -g -fno-inline -no-ip -no-ipo -fno-omit-frame-pointer -O3 is
 92 | # recommended. At O3 the compiler doesn't produce enough unwind info even
 93 | # with -debug inline-debug-info set.
 94 | #
 95 | # PGI: -g -O3 -Meh_frame -Mframe -Mnoautoinline is recommended. Other settings
 96 | # don't produce enough unwind information for inlined functions otherwise. This
 97 | # adds some performance penalty - around 8% is typical.
 98 | #
 99 | # The PGI C runtime static library contains an undefined reference to
100 | # __kmpc_fork_call, which will cause compilation to fail when linking
101 | # allinea-profiler.ld. Add --undefined __wrap___kmpc_fork_call to your link line
102 | # before linking to the Forge sampler to resolve this.
103 | #
104 | # GNU: -g -O3 -fno-inline is recommended. You might be lucky without -fno-inline,
105 | # as it should produce enough information to unwind those calls. You will see
106 | # my_function [inlined] in the MAP stack for functions that were inline.
107 | # -fno-inline-functions appears with newer gnu compilers, just to confuse
108 | 
109 | # Common OpenMP flags for supported compilers
110 | # -fopenmp for gnu
111 | # -qopenmp for intel
112 | # -mp      for pgi
113 | # -qsmp=omp:noopt for IBM
114 | # -homp    for cray (compiler)
115 | 
116 | # Common pthread flags for supported compilers
117 | # -pthread for GNU
118 | # -lpthread for other compilers
119 | 
120 | INTEL_LLVM_MAP_CFLAGS := -g -fno-inline -no-ipo -fno-omit-frame-pointer -O3
121 | INTEL_LLVM_DDT_CFLAGS := -g -Wall -O0
122 | INTEL_LLVM_OPENMP_CFLAG := -qopenmp
123 | INTEL_LLVM_MAP_FCFLAGS := $(INTEL_LLVM_MAP_CFLAGS)
124 | INTEL_LLVM_DDT_FCFLAGS := $(filter-out -Wall, $(INTEL_LLVM_DDT_CFLAGS)) -warn all
125 | INTEL_LLVM_OPENMP_FCFLAG := $(INTEL_LLVM_OPENMP_CFLAG)
126 | INTEL_LLVM_PTHREAD_CFLAG := -lpthread
127 | INTEL_LLVM_SHARED_LIBRARY_CFLAGS=-fPIC
128 | INTEL_LLVM_SHARED_LIBRARY_LINKER_FLAGS=-shared
129 | 
130 | INTEL_MAP_CFLAGS := -g -fno-inline -no-ip -no-ipo -fno-omit-frame-pointer -O3
131 | INTEL_DDT_CFLAGS := -g -w3 -O0
132 | INTEL_OPENMP_CFLAG := -qopenmp
133 | INTEL_MAP_FCFLAGS := $(INTEL_MAP_CFLAGS)
134 | INTEL_DDT_FCFLAGS := $(filter-out -w3, $(INTEL_DDT_CFLAGS)) -warn all
135 | INTEL_OPENMP_FCFLAG := $(INTEL_OPENMP_CFLAG)
136 | INTEL_PTHREAD_CFLAG := -lpthread
137 | INTEL_SHARED_LIBRARY_CFLAGS=-fPIC
138 | INTEL_SHARED_LIBRARY_LINKER_FLAGS=-shared
139 | 
140 | PGI_MAP_CFLAGS := -g -Meh_frame -Mframe -O3 -Mnoautoinline
141 | PGI_DDT_CFLAGS := -g -O0
142 | PGI_MAJOR_VERSION_GT_17 := $(shell expr `$(CC) --version 2> /dev/null | sed -nE 's/^pgcc ([0-9]+)\..*/\1/p'` \> 17 2> /dev/null)
143 | ifeq ($(PGI_MAJOR_VERSION_GT_17),1)
144 | PGI_MAP_CFLAGS := $(PGI_MAP_CFLAGS) -Wl,--undefined=__wrap___kmpc_fork_call
145 | endif
146 | PGI_OPENMP_CFLAG := -mp
147 | PGI_MAP_FCFLAGS := $(filter-out -Meh_frame, $(PGI_MAP_CFLAGS))
148 | PGI_DDT_FCFLAGS := $(PGI_DDT_CFLAGS)
149 | PGI_OPENMP_FCFLAG := $(PGI_OPENMP_CFLAG)
150 | PGI_PTHREAD_CFLAG := -lpthread
151 | PGI_SHARED_LIBRARY_CFLAGS=-fPIC
152 | PGI_SHARED_LIBRARY_LINKER_FLAGS=-shared
153 | 
154 | NVC_MAP_CFLAGS := -g -Meh_frame -Mframe -O3 -Mnoautoinline
155 | NVC_DDT_CFLAGS := -g -O0
156 | NVC_MAJOR_VERSION_GT_20 := $(shell expr `$(CC) --version 2> /dev/null | sed -nE 's/^pgcc ([0-9]+)\..*/\1/p'` \> 20 2> /dev/null)
157 | NVC_MAP_CFLAGS := $(NVC_MAP_CFLAGS) -Wl,--undefined=__wrap___kmpc_fork_call
158 | NVC_OPENMP_CFLAG := -mp
159 | NVC_MAP_FCFLAGS := $(filter-out -Meh_frame, $(NVC_MAP_CFLAGS))
160 | NVC_DDT_FCFLAGS := $(NVC_DDT_CFLAGS)
161 | NVC_OPENMP_FCFLAG := $(NVC_OPENMP_CFLAG)
162 | NVC_PTHREAD_CFLAG := -lpthread
163 | NVC_SHARED_LIBRARY_CFLAGS=-fPIC
164 | NVC_SHARED_LIBRARY_LINKER_FLAGS=-shared
165 | 
166 | IBM_MAP_CFLAGS := -g -O3 -qnoinline
167 | IBM_DDT_CFLAGS := -g -Werror -Weverything -O0
168 | IBM_OPENMP_CFLAG := -qsmp=omp:noopt
169 | IBM_MAP_FCFLAGS := $(IBM_MAP_CFLAGS)
170 | IBM_DDT_FCFLAGS := $(IBM_DDT_CFLAGS)
171 | IBM_OPENMP_FCFLAG := $(IBM_OPENMP_CFLAG) -qsmp=omp:noopt -qnohot -lxlf90 -lxlsmp -lxlfmath
172 | IBM_PTHREAD_CFLAG := -lpthread
173 | IBM_SHARED_LIBRARY_CFLAGS=
174 | IBM_SHARED_LIBRARY_LINKER_FLAGS=-qmkshrobj
175 | 
176 | CRAY_MAP_CFLAGS := -g -O3 -hipa0
177 | CRAY_DDT_CFLAGS := -g -h msglevel_2 -O0
178 | CRAY_OPENMP_CFLAG := -homp
179 | CRAY_MAP_FCFLAGS := $(CRAY_MAP_CFLAGS)
180 | CRAY_DDT_FCFLAGS := -g -m 2
181 | CRAY_OPENMP_FCFLAG := $(CRAY_OPENMP_CFLAG)
182 | CRAY_PTHREAD_CFLAG := -lpthread
183 | CRAY_SHARED_LIBRARY_CFLAGS=-fPIC
184 | CRAY_SHARED_LIBRARY_LINKER_FLAGS=-shared
185 | 
186 | GNU_MAP_CFLAGS := -g -O3 -fno-inline -fno-optimize-sibling-calls
187 | GNU_DDT_CFLAGS := -g -Wall -Werror -O0
188 | GNU_OPENMP_CFLAG := -fopenmp
189 | GNU_MAP_FCFLAGS := $(GNU_MAP_CFLAGS)
190 | GNU_DDT_FCFLAGS := $(GNU_DDT_CFLAGS)
191 | GNU_OPENMP_FCFLAG := $(GNU_OPENMP_CFLAG)
192 | GNU_PTHREAD_CFLAG := -pthread
193 | GNU_SHARED_LIBRARY_CFLAGS=-fPIC
194 | GNU_SHARED_LIBRARY_LINKER_FLAGS=-shared
195 | 
196 | # GCC 10 is stricter on requiring standard-compliant Fortran, set this flag
197 | # when compiling older Fortran programs.
198 | GNU_LEGACY_STD_FCFLAG := -std=legacy
199 | 
200 | ### Toolchain detection ###
201 | 
202 | define get_compiler_toolchain
203 | $(if $(or $(findstring icx,$(1)),$(findstring ifx,$(1)),$(findstring Intel(R) oneAPI,$(1)),$(findstring INTEL,$(PE_ENV))),
204 | 	INTEL_LLVM,
205 | $(if $(or $(findstring icc,$(1)),$(findstring ifort,$(1)),$(findstring Intel,$(1)),$(findstring INTEL,$(PE_ENV))),
206 | 	INTEL,
207 | $(if $(or $(findstring pgcc,$(1)),$(findstring pgfortran,$(1)),$(findstring PGI,$(1)), $(findstring PGI,$(PE_ENV))),
208 | 	PGI,
209 | $(if $(or $(findstring nvc,$(1)),$(findstring nvfortran,$(1))),
210 | 	NVC,
211 | $(if $(or $(findstring xlc,$(1)),$(findstring xlf,$(1)),$(findstring IBM,$(1)),$(findstring IBM,$(PE_ENV))),
212 | 	IBM,
213 | $(if $(findstring CRAY,$(PE_ENV)),
214 | 	CRAY,
215 | 	GNU))))))
216 | endef
217 | 
218 | CC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(CC)))
219 | MPICC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(MPICC_VERSION)))
220 | FC_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(FC)))
221 | MPIF90_TOOLCHAIN := $(strip $(call get_compiler_toolchain,$(MPIF90_VERSION)))
222 | 
223 | ### Compiler flags for toolchain (allow overrides) ###
224 | 
225 | MAP_CFLAGS ?= $($(CC_TOOLCHAIN)_MAP_CFLAGS)
226 | MAP_FCFLAGS ?= $($(FC_TOOLCHAIN)_MAP_FCFLAGS)
227 | DDT_CFLAGS ?= $($(CC_TOOLCHAIN)_DDT_CFLAGS)
228 | DDT_FCFLAGS ?= $($(FC_TOOLCHAIN)_DDT_FCFLAGS)
229 | OPENMP_CFLAG ?= $($(CC_TOOLCHAIN)_OPENMP_CFLAG)
230 | OPENMP_FCFLAG ?= $($(FC_TOOLCHAIN)_OPENMP_FCFLAG)
231 | PTHREAD_CFLAG ?= $($(CC_TOOLCHAIN)_PTHREAD_CFLAG)
232 | PTHREAD_FCFLAG ?= $($(FC_TOOLCHAIN)_PTHREAD_CFLAG)
233 | SHARED_LIBRARY_CFLAGS ?= $($(FC_TOOLCHAIN)_SHARED_LIBRARY_CFLAGS)
234 | SHARED_LIBRARY_LINKER_FLAGS ?= $($(FC_TOOLCHAIN)_SHARED_LIBRARY_LINKER_FLAGS)
235 | LEGACY_STD_FCFLAG ?= $($(FC_TOOLCHAIN)_LEGACY_STD_FCFLAG)
236 | MPI_MAP_CFLAGS ?= $($(MPICC_TOOLCHAIN)_MAP_CFLAGS)
237 | MPI_MAP_FCFLAGS ?= $($(MPIF90_TOOLCHAIN)_MAP_FCFLAGS)
238 | MPI_DDT_CFLAGS ?= $($(MPICC_TOOLCHAIN)_DDT_CFLAGS)
239 | MPI_DDT_FCFLAGS ?= $($(MPIF90_TOOLCHAIN)_DDT_FCFLAGS)
240 | MPI_OPENMP_CFLAG ?= $($(MPICC_TOOLCHAIN)_OPENMP_CFLAG)
241 | MPI_OPENMP_FCFLAG ?= $($(MPIF90_TOOLCHAIN)_OPENMP_FCFLAG)
242 | MPI_PTHREAD_CFLAG ?= $($(MPICC_TOOLCHAIN)_PTHREAD_CFLAG)
243 | MPI_PTHREAD_FCFLAG ?= $($(MPIF90_TOOLCHAIN)_PTHREAD_CFLAG)
244 | 
245 | ## Link flags for static Forge sampler
246 | ifeq ($(CC_TOOLCHAIN),GNU)
247 | ifneq ($(shell $(CC) -dumpspecs 2>/dev/null | grep -e '[^f]no-pie'),)
248 | MAP_STATIC_C_LINKFLAGS := -no-pie
249 | endif
250 | endif
251 | 
252 | ifeq ($(FC_TOOLCHAIN),GNU)
253 | ifneq ($(shell $(FC) -dumpspecs 2>/dev/null | grep -e '[^f]no-pie'),)
254 | MAP_STATIC_FC_LINKFLAGS := -no-pie
255 | endif
256 | endif
257 | 


--------------------------------------------------------------------------------
/Valgrind/memcheck/leak.h:
--------------------------------------------------------------------------------
  1 | // These counters are used to get a delta between leak counts at startup
  2 | // (eg. due to libc) and later on.  Necessary to get reliable leak tests
  3 | // across different platforms.
  4 | #define DECLARE_LEAK_COUNTERS \
  5 |    long L0_bytes = 0, L_bytes = 0, L0_blocks = 0, L_blocks = 0; \
  6 |    long D0_bytes = 0, D_bytes = 0, D0_blocks = 0, D_blocks = 0; \
  7 |    long R0_bytes = 0, R_bytes = 0, R0_blocks = 0, R_blocks = 0; \
  8 |    long S0_bytes = 0, S_bytes = 0, S0_blocks = 0, S_blocks = 0
  9 | 
 10 | // Set a baseline, in case allocations have already happened.
 11 | #define GET_INITIAL_LEAK_COUNTS \
 12 |    do { \
 13 |       VALGRIND_DO_QUICK_LEAK_CHECK; \
 14 |       VALGRIND_COUNT_LEAKS(      L0_bytes,  D0_bytes,  R0_bytes,  S0_bytes );\
 15 |       VALGRIND_COUNT_LEAK_BLOCKS(L0_blocks, D0_blocks, R0_blocks, S0_blocks); \
 16 |    } while (0)
 17 | 
 18 | // Get the final counts and subtract the baseline to obtain the delta.
 19 | #define GET_FINAL_LEAK_COUNTS \
 20 |    do { \
 21 |       VALGRIND_DO_QUICK_LEAK_CHECK; \
 22 |       VALGRIND_COUNT_LEAKS(      L_bytes,  D_bytes,  R_bytes,  S_bytes ); \
 23 |       VALGRIND_COUNT_LEAK_BLOCKS(L_blocks, D_blocks, R_blocks, S_blocks); \
 24 |       L_bytes -= L0_bytes;  L_blocks -= L0_blocks; \
 25 |       D_bytes -= D0_bytes;  D_blocks -= D0_blocks; \
 26 |       R_bytes -= R0_bytes;  R_blocks -= R0_blocks; \
 27 |       S_bytes -= S0_bytes;  S_blocks -= S0_blocks; \
 28 |    } while (0)
 29 | 
 30 | // Print leak counts.  When used in conjunction with -q the normal counts
 31 | // aren't shown, which is what we want.
 32 | #define PRINT_LEAK_COUNTS(where) \
 33 |    do { \
 34 |       fprintf(where,"leaked:     %3ld bytes in %2ld blocks\n", \
 35 |                      L_bytes,L_blocks); \
 36 |       fprintf(where,"dubious:    %3ld bytes in %2ld blocks\n", \
 37 |                      D_bytes,D_blocks); \
 38 |       fprintf(where,"reachable:  %3ld bytes in %2ld blocks\n", \
 39 |                      R_bytes,R_blocks); \
 40 |       fprintf(where,"suppressed: %3ld bytes in %2ld blocks\n", \
 41 |                      S_bytes,S_blocks); \
 42 |    } while (0)
 43 | 
 44 | /* Upon a call to a function, some architectures store pointers
 45 |  * into registers.  Valgrind may consider these registers when determining
 46 |  * whether an address is reachable, so we need to zero-out these registers
 47 |  * as needed.
 48 |  */
 49 | #if defined __powerpc__
 50 | #define CLEAR_CALLER_SAVED_REGS \
 51 |   do { \
 52 |    __asm__ __volatile__( "li 3, 0" : : :/*trash*/"r3" ); \
 53 |    __asm__ __volatile__( "li 4, 0" : : :/*trash*/"r4" ); \
 54 |    __asm__ __volatile__( "li 5, 0" : : :/*trash*/"r5" ); \
 55 |    __asm__ __volatile__( "li 6, 0" : : :/*trash*/"r6" ); \
 56 |    __asm__ __volatile__( "li 7, 0" : : :/*trash*/"r7" ); \
 57 |    __asm__ __volatile__( "li 8, 0" : : :/*trash*/"r8" ); \
 58 |    __asm__ __volatile__( "li 9, 0" : : :/*trash*/"r9" ); \
 59 |    __asm__ __volatile__( "li 10, 0" : : :/*trash*/"r10" ); \
 60 |    __asm__ __volatile__( "li 11, 0" : : :/*trash*/"r11" ); \
 61 |    __asm__ __volatile__( "li 12, 0" : : :/*trash*/"r12" ); \
 62 |   } while (0)
 63 | #elif defined(__nanomips__)
 64 | #define CLEAR_CALLER_SAVED_REGS                                             \
 65 |    do {                                                                     \
 66 |       __asm__ __volatile__ (".set push       \n\t"                          \
 67 |                             ".set noat       \n\t"                          \
 68 |                             "move $at, $zero \n\t"                          \
 69 |                             "move $t4, $zero \n\t"                          \
 70 |                             "move $t5, $zero \n\t"                          \
 71 |                             "move $a0, $zero \n\t"                          \
 72 |                             "move $a1, $zero \n\t"                          \
 73 |                             "move $a2, $zero \n\t"                          \
 74 |                             "move $a3, $zero \n\t"                          \
 75 |                             "move $a4, $zero \n\t"                          \
 76 |                             "move $a5, $zero \n\t"                          \
 77 |                             "move $a6, $zero \n\t"                          \
 78 |                             "move $a7, $zero \n\t"                          \
 79 |                             "move $t0, $zero \n\t"                          \
 80 |                             "move $t1, $zero \n\t"                          \
 81 |                             "move $t2, $zero \n\t"                          \
 82 |                             "move $t3, $zero \n\t"                          \
 83 |                             "move $t8, $zero \n\t"                          \
 84 |                             "move $t9, $zero \n\t"                          \
 85 |                             ".set pop        \n\t"                          \
 86 |                             : : : "$at", "$t4", "$t5", "$a0", "$a1", "$a2", \
 87 |                                   "$a3", "$a4", "$a5", "$a6", "$a7", "$t0", \
 88 |                                   "$t1", "$t2", "$t3", "$t8", "$t9");       \
 89 |    } while (0)
 90 | #elif (__mips == 32)
 91 | #define CLEAR_CALLER_SAVED_REGS                                              \
 92 |    do {                                                                      \
 93 |       __asm__ __volatile__ (".set push    \n\t"                              \
 94 |                             ".set noat    \n\t"                              \
 95 |                             "move $1,  $0 \n\t"   /* at = 0 */               \
 96 |                             "move $2,  $0 \n\t"   /* v0 = 0 */               \
 97 |                             "move $3,  $0 \n\t"   /* v1 = 0 */               \
 98 |                             "move $4,  $0 \n\t"   /* a0 = 0 */               \
 99 |                             "move $5,  $0 \n\t"   /* a1 = 0 */               \
100 |                             "move $6,  $0 \n\t"   /* a2 = 0 */               \
101 |                             "move $7,  $0 \n\t"   /* a3 = 0 */               \
102 |                             "move $8,  $0 \n\t"   /* t0 = 0 */               \
103 |                             "move $9,  $0 \n\t"   /* t1 = 0 */               \
104 |                             "move $10, $0 \n\t"   /* t2 = 0 */               \
105 |                             "move $11, $0 \n\t"   /* t3 = 0 */               \
106 |                             "move $12, $0 \n\t"   /* t4 = 0 */               \
107 |                             "move $13, $0 \n\t"   /* t5 = 0 */               \
108 |                             "move $14, $0 \n\t"   /* t6 = 0 */               \
109 |                             "move $15, $0 \n\t"   /* t7 = 0 */               \
110 |                             "move $24, $0 \n\t"   /* t8 = 0 */               \
111 |                             "move $25, $0 \n\t"   /* t9 = 0 */               \
112 |                             "move $31, $0 \n\t"   /* ra = 0 */               \
113 |                             ".set pop     \n\t"                              \
114 |                             : : : "$1", "$2", "$3", "$4", "$5", "$6", "$7",  \
115 |                                   "$8", "$9", "$10", "$11", "$12", "$13",    \
116 |                                   "$14", "$15", "$24", "$25", "$31");        \
117 |    } while (0)
118 | #elif (__mips == 64)
119 | #define CLEAR_CALLER_SAVED_REGS  /* MIPS64: zeroes $1-$15, $24, $25 and $31 */ \
120 |    do {  /* do/while(0) so the expansion behaves as a single statement */    \
121 |       __asm__ __volatile__ (".set push    \n\t"                              \
122 |                             ".set noat    \n\t"                              \
123 |                             "move $1,  $0 \n\t"  /* at = 0 */                \
124 |                             "move $2,  $0 \n\t"  /* v0 = 0 */                \
125 |                             "move $3,  $0 \n\t"  /* v1 = 0 */                \
126 |                             "move $4,  $0 \n\t"  /* a0 = 0 */                \
127 |                             "move $5,  $0 \n\t"  /* a1 = 0 */                \
128 |                             "move $6,  $0 \n\t"  /* a2 = 0 */                \
129 |                             "move $7,  $0 \n\t"  /* a3 = 0 */                \
130 |                             "move $8,  $0 \n\t"  /* a4 = 0 */                \
131 |                             "move $9,  $0 \n\t"  /* a5 = 0 */                \
132 |                             "move $10, $0 \n\t"  /* a6 = 0 */                \
133 |                             "move $11, $0 \n\t"  /* a7 = 0 */                \
134 |                             "move $12, $0 \n\t"  /* t0 = 0 */                \
135 |                             "move $13, $0 \n\t"  /* t1 = 0 */                \
136 |                             "move $14, $0 \n\t"  /* t2 = 0 */                \
137 |                             "move $15, $0 \n\t"  /* t3 = 0 */                \
138 |                             "move $24, $0 \n\t"  /* t8 = 0 */                \
139 |                             "move $25, $0 \n\t"  /* t9 = 0 */                \
140 |                             "move $31, $0 \n\t"  /* ra = 0 */                \
141 |                             ".set pop     \n\t"  /* clobber list follows */  \
142 |                             : : : "$1", "$2", "$3", "$4", "$5", "$6", "$7",  \
143 |                                   "$8", "$9", "$10", "$11", "$12", "$13",    \
144 |                                   "$14", "$15", "$24", "$25", "$31");        \
145 |    } while (0)
146 | #elif defined (__clang__) && defined(VGA_x86)
147 | #define CLEAR_CALLER_SAVED_REGS  /* clang on x86 (VGA_x86): zeroes only %ecx */ \
148 |    do {  /* NOTE(review): the asm declares no clobbers, unlike the MIPS variants -- confirm */ \
149 |       __asm__ __volatile__ ("movl $0, %ecx\n\t"); \
150 |    } while (0)
151 | #elif defined(__arm__)
152 | /* 32-bit ARM: zeroes r0-r3, one asm statement per register. */
153 | #define CLEAR_CALLER_SAVED_REGS                                              \
154 |    do {  /* NOTE(review): none of these asm statements lists clobbers -- confirm */ \
155 |       __asm__ __volatile__ ("mov %r0, $0\n\t");                              \
156 |       __asm__ __volatile__ ("mov %r1, $0\n\t");                              \
157 |       __asm__ __volatile__ ("mov %r2, $0\n\t");                              \
158 |       __asm__ __volatile__ ("mov %r3, $0\n\t");                              \
159 |    } while (0)
160 | #elif defined(__aarch64__)
161 | /* 64-bit ARM (AArch64): zeroes x0-x18, one asm statement per register. */
162 | #define CLEAR_CALLER_SAVED_REGS                                              \
163 |    do {  /* NOTE(review): no clobbers are declared on any statement -- confirm */ \
164 |       __asm__ __volatile__ ("mov x0, 0\n\t");                              \
165 |       __asm__ __volatile__ ("mov x1, 0\n\t");                              \
166 |       __asm__ __volatile__ ("mov x2, 0\n\t");                              \
167 |       __asm__ __volatile__ ("mov x3, 0\n\t");                              \
168 |       __asm__ __volatile__ ("mov x4, 0\n\t");                              \
169 |       __asm__ __volatile__ ("mov x5, 0\n\t");                              \
170 |       __asm__ __volatile__ ("mov x6, 0\n\t");                              \
171 |       __asm__ __volatile__ ("mov x7, 0\n\t");                              \
172 |       __asm__ __volatile__ ("mov x8, 0\n\t");                              \
173 |       __asm__ __volatile__ ("mov x9, 0\n\t");                              \
174 |       __asm__ __volatile__ ("mov x10, 0\n\t");                              \
175 |       __asm__ __volatile__ ("mov x11, 0\n\t");                              \
176 |       __asm__ __volatile__ ("mov x12, 0\n\t");                              \
177 |       __asm__ __volatile__ ("mov x13, 0\n\t");                              \
178 |       __asm__ __volatile__ ("mov x14, 0\n\t");                              \
179 |       __asm__ __volatile__ ("mov x15, 0\n\t");                              \
180 |       __asm__ __volatile__ ("mov x16, 0\n\t");                              \
181 |       __asm__ __volatile__ ("mov x17, 0\n\t");                              \
182 |       __asm__ __volatile__ ("mov x18, 0\n\t");                              \
183 |    } while (0)
184 | #else
185 | #define CLEAR_CALLER_SAVED_REGS  /* fallback for targets not matched above: expands to nothing */
186 | #endif
187 | 
188 | 
189 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/gpu-nvidia-mmult/matrixMul.cu:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | /**
 29 |  * Matrix multiplication: C = A * B.
 30 |  * Host code.
 31 |  *
 32 |  * This sample implements matrix multiplication which makes use of shared memory
 33 |  * to ensure data reuse, the matrix multiplication is done using tiling approach.
 34 |  * It has been written for clarity of exposition to illustrate various CUDA programming
 35 |  * principles, not with the goal of providing the most performant generic kernel for matrix multiplication.
 36 |  * See also:
 37 |  * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
 38 |  * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
 39 |  * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
 40 |  */
 41 | 
 42 | // System includes
 43 | #include <stdio.h>
 44 | #include <assert.h>
 45 | 
 46 | // CUDA runtime
 47 | #include <cuda_runtime.h>
 48 | #include <cuda_profiler_api.h>
 49 | 
 50 | // Helper functions and utilities to work with CUDA
 51 | #include <helper_functions.h>
 52 | #include <helper_cuda.h>
 53 | 
 54 | /**
 55 |  * Tiled matrix multiplication kernel: C = A * B.
 56 |  *
 57 |  * Every thread accumulates one element of C. Per loop iteration the block
 58 |  * stages one BLOCK_SIZE x BLOCK_SIZE tile of A and one of B in shared
 59 |  * memory, and each thread multiplies a tile row of A by a tile column of B.
 60 |  *
 61 |  * BLOCK_SIZE  tile edge length (template parameter)
 62 |  * C           output matrix, addressed with row stride wB
 63 |  * A, B        input matrices, addressed with row strides wA and wB
 64 |  * wA, wB      widths of A and B
 65 |  *
 66 |  * None of the accesses to A, B or C is bounds-checked, and threadIdx.x/y
 67 |  * are used directly as tile coordinates (NOTE(review): presumably the
 68 |  * launch must use BLOCK_SIZE x BLOCK_SIZE threads per block -- confirm).
 69 |  */
 70 | template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
 71 |     float *B, int wA,
 72 |     int wB) {
 73 |   // Tile coordinates of this block within the grid
 74 |   const int tileCol = blockIdx.x;
 75 |   const int tileRow = blockIdx.y;
 76 | 
 77 |   // Position of this thread inside a tile
 78 |   const int col = threadIdx.x;
 79 |   const int row = threadIdx.y;
 80 | 
 81 |   // Shared-memory staging areas for the current tile of A and of B
 82 |   __shared__ float tileA[BLOCK_SIZE][BLOCK_SIZE];
 83 |   __shared__ float tileB[BLOCK_SIZE][BLOCK_SIZE];
 84 | 
 85 |   // Tiles of A are visited from offset firstA up to lastA, moving
 86 |   // BLOCK_SIZE elements to the right each time
 87 |   const int firstA = wA * BLOCK_SIZE * tileRow;
 88 |   const int lastA = firstA + wA - 1;
 89 | 
 90 |   // Tiles of B start at column BLOCK_SIZE * tileCol and move down by
 91 |   // BLOCK_SIZE rows (strideB elements) each time
 92 |   const int strideB = BLOCK_SIZE * wB;
 93 |   int offB = BLOCK_SIZE * tileCol;
 94 | 
 95 |   // Offset of this thread's element relative to the start of a tile
 96 |   const int inTileA = wA * row + col;
 97 |   const int inTileB = wB * row + col;
 98 | 
 99 |   // Running sum for the one element of C owned by this thread
100 |   float acc = 0;
101 | 
102 |   int offA = firstA;
103 |   while (offA <= lastA) {
104 |     // Each thread copies one element of A and one of B into the tiles
105 |     tileA[row][col] = A[offA + inTileA];
106 |     tileB[row][col] = B[offB + inTileB];
107 | 
108 |     // Barrier: the tiles must be completely filled before anyone reads them
109 |     __syncthreads();
110 | 
111 |     // Add this tile pair's contribution to the running sum
112 | #pragma unroll
113 |     for (int k = 0; k < BLOCK_SIZE; ++k) {
114 |       acc += tileA[row][k] * tileB[k][col];
115 |     }
116 | 
117 |     // Barrier: every thread must be done reading before the tiles are
118 |     // overwritten by the next iteration
119 |     __syncthreads();
120 | 
121 |     // Step to the next tile of A and of B
122 |     offA += BLOCK_SIZE;
123 |     offB += strideB;
124 |   }
125 | 
126 |   // Write the result; cBase is the offset of this block's tile within C
127 |   const int cBase = wB * BLOCK_SIZE * tileRow + BLOCK_SIZE * tileCol;
128 |   C[cBase + inTileB] = acc;
129 | }
130 | 
131 | void ConstantInit(float *data, int size, float val) {
132 |   // Store val into data[0] .. data[size-1]; a non-positive size writes nothing
133 |   float *cursor = data;
134 |   for (int remaining = size; remaining > 0; --remaining) *cursor++ = val;
135 | }
136 | 
137 | /**
138 |  * Run a simple test of matrix multiplication using CUDA.
139 |  *
140 |  * Fills A with 1.0 and B with 0.01 on the host, copies both to the device,
141 |  * launches MatrixMulCUDA once as a warm-up and then nIter timed iterations,
142 |  * prints the measured throughput, copies C back and compares every element
143 |  * against dimsA.x * 0.01.
144 |  *
145 |  * argc, argv    not used by this function
146 |  * block_size    tile edge length; 16 selects MatrixMulCUDA<16>, any other
147 |  *               value selects MatrixMulCUDA<32>
148 |  * dimsA, dimsB  dimensions of A and B (x = width, y = height)
149 |  *
150 |  * Returns EXIT_SUCCESS if every element is within tolerance, EXIT_FAILURE
151 |  * otherwise.
152 |  */
153 | int MatrixMultiply(int argc, char **argv,
154 |                    int block_size, const dim3 &dimsA,
155 |                    const dim3 &dimsB) {
156 |   // Allocate host memory for matrices A and B
157 |   unsigned int size_A = dimsA.x * dimsA.y;
158 |   unsigned int mem_size_A = sizeof(float) * size_A;
159 |   float *h_A;
160 |   checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
161 |   unsigned int size_B = dimsB.x * dimsB.y;
162 |   unsigned int mem_size_B = sizeof(float) * size_B;
163 |   float *h_B;
164 |   checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
165 |   cudaStream_t stream;
166 | 
167 |   // Initialize host memory
168 |   const float valB = 0.01f;
169 |   ConstantInit(h_A, size_A, 1.0f);
170 |   ConstantInit(h_B, size_B, valB);
171 | 
172 |   // Device copies of A, B and C
173 |   float *d_A, *d_B, *d_C;
174 | 
175 |   // Allocate host matrix C
176 |   dim3 dimsC(dimsB.x, dimsA.y, 1);
177 |   unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
178 |   float *h_C;
179 |   checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));
180 |   if (h_C == NULL) {
181 |     fprintf(stderr, "Failed to allocate host matrix C!\n");
182 |     exit(EXIT_FAILURE);
183 |   }
184 | 
185 |   checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
186 |   checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
187 |   checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
188 |   // Allocate CUDA events that we'll use for timing
189 |   cudaEvent_t start, stop;
190 |   checkCudaErrors(cudaEventCreate(&start));
191 |   checkCudaErrors(cudaEventCreate(&stop));
192 |   checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
193 | 
194 |   // copy host memory to device
195 |   checkCudaErrors(
196 |       cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
197 |   checkCudaErrors(
198 |       cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
199 | 
200 |   // Setup execution parameters. The grid size uses integer division, so
201 |   // rows/columns beyond a multiple of block_size get no thread block.
202 |   dim3 threads(block_size, block_size);
203 |   dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
204 | 
205 |   printf("Computing result using CUDA Kernel...\n");
206 |   // Performs warmup operation using matrixMul CUDA kernel
207 |   if (block_size == 16) {
208 |     MatrixMulCUDA<16>
209 |         <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
210 |   } else {
211 |     MatrixMulCUDA<32>
212 |         <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
213 |   }
214 | 
215 |   printf("done\n");
216 |   checkCudaErrors(cudaStreamSynchronize(stream));
217 | 
218 |   // Record the start event
219 |   checkCudaErrors(cudaEventRecord(start, stream));
220 |   // Execute the kernel
221 |   int nIter = 300;
222 |   for (int j = 0; j < nIter; j++) {
223 |     if (block_size == 16) {
224 |       MatrixMulCUDA<16>
225 |           <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
226 |     } else {
227 |       MatrixMulCUDA<32>
228 |           <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
229 |     }
230 |   }
231 | 
232 |   // Record the stop event
233 |   checkCudaErrors(cudaEventRecord(stop, stream));
234 |   // Wait for the stop event to complete
235 |   checkCudaErrors(cudaEventSynchronize(stop));
236 |   float msecTotal = 0.0f;
237 |   checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
238 |   // Compute and print the performance
239 |   float msecPerMatrixMul = msecTotal / nIter;
240 |   double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
241 |                              static_cast<double>(dimsA.y) *
242 |                              static_cast<double>(dimsB.x);
243 |   double gigaFlops =
244 |       (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
245 |   printf(
246 |       "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
247 |       " WorkgroupSize= %u threads/block\n",
248 |       gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);
249 | 
250 |   // Copy result from device to host
251 |   checkCudaErrors(
252 |       cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
253 |   checkCudaErrors(cudaStreamSynchronize(stream));
254 |   printf("Checking computed result for correctness: ");
255 |   bool correct = true;
256 | 
257 |   // test relative error by the formula
258 |   //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
259 |   double eps = 1.e-6;  // machine zero
260 | 
261 |   for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
262 |     double abs_err = fabs(h_C[i] - (dimsA.x * valB));
263 |     double dot_length = dimsA.x;
264 |     double abs_val = fabs(h_C[i]);
265 |     double rel_err = abs_err / abs_val / dot_length;
266 | 
267 |     if (rel_err > eps) {
268 |       printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
269 |              i, h_C[i], dimsA.x * valB, eps);
270 |       correct = false;
271 |     }
272 |   }
273 | 
274 |   printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
275 |   // Clean up memory
276 |   checkCudaErrors(cudaFreeHost(h_A));
277 |   checkCudaErrors(cudaFreeHost(h_B));
278 |   checkCudaErrors(cudaFreeHost(h_C));
279 |   checkCudaErrors(cudaFree(d_A));
280 |   checkCudaErrors(cudaFree(d_B));
281 |   checkCudaErrors(cudaFree(d_C));
282 |   checkCudaErrors(cudaEventDestroy(start));
283 |   checkCudaErrors(cudaEventDestroy(stop));
284 |   // Release the stream created with cudaStreamCreateWithFlags above
285 |   checkCudaErrors(cudaStreamDestroy(stream));
286 |   printf(
287 |       "\nNOTE: The CUDA Samples are not meant for performance "
288 |       "measurements. Results may vary when GPU Boost is enabled.\n");
289 |   return correct ? EXIT_SUCCESS : EXIT_FAILURE;
290 | }
291 | 
292 | 
293 | /**
294 |  * Program main: reads the optional -wA/-hA/-wB/-hB overrides, checks that
295 |  * A's width equals B's height and runs MatrixMultiply between
296 |  * cudaProfilerStart() and cudaProfilerStop().
297 |  */
298 | int main(int argc, char **argv) {
299 |   printf("[Matrix Multiply Using CUDA] - Starting...\n");
300 | 
301 |   if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
302 |       checkCmdLineFlag(argc, (const char **)argv, "?")) {
303 |     printf("Usage -device=n (n >= 0 for deviceID)\n");
304 |     printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
305 |     printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
306 |     printf("  Note: Outer matrix dimensions of A & B matrices" \
307 |            " must be equal.\n");
308 |     exit(EXIT_SUCCESS);
309 |   }
310 | 
311 |   // This will pick the best possible CUDA capable device, otherwise
312 |   // override the device ID based on input provided at the command line
313 |   findCudaDevice(argc, (const char **)argv);  // returned ID is not needed
314 |   int block_size = 32;
315 | 
316 |   dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
317 |   dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
318 | 
319 |   // width of Matrix A
320 |   if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
321 |     dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
322 |   }
323 | 
324 |   // height of Matrix A
325 |   if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
326 |     dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
327 |   }
328 | 
329 |   // width of Matrix B
330 |   if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
331 |     dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
332 |   }
333 | 
334 |   // height of Matrix B
335 |   if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
336 |     dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
337 |   }
338 | 
339 |   // dim3 members are unsigned, hence %u in the two messages below
340 |   if (dimsA.x != dimsB.y) {
341 |     printf("Error: outer matrix dimensions must be equal. (%u != %u)\n",
342 |            dimsA.x, dimsB.y);
343 |     exit(EXIT_FAILURE);
344 |   }
345 | 
346 |   printf("MatrixA(%u,%u), MatrixB(%u,%u)\n", dimsA.x, dimsA.y,
347 |          dimsB.x, dimsB.y);
348 | 
349 |   checkCudaErrors(cudaProfilerStart());
350 |   int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
351 |   checkCudaErrors(cudaProfilerStop());
352 |   exit(matrix_result);
353 | }
354 | 


--------------------------------------------------------------------------------
/Valgrind/memcheck/memcheck.h:
--------------------------------------------------------------------------------
  1 | 
  2 | /*
  3 |    ----------------------------------------------------------------
  4 | 
  5 |    Notice that the following BSD-style license applies to this one
  6 |    file (memcheck.h) only.  The rest of Valgrind is licensed under the
  7 |    terms of the GNU General Public License, version 2, unless
  8 |    otherwise indicated.  See the COPYING file in the source
  9 |    distribution for details.
 10 | 
 11 |    ----------------------------------------------------------------
 12 | 
 13 |    This file is part of MemCheck, a heavyweight Valgrind tool for
 14 |    detecting memory errors.
 15 | 
 16 |    Copyright (C) 2000-2017 Julian Seward.  All rights reserved.
 17 | 
 18 |    Redistribution and use in source and binary forms, with or without
 19 |    modification, are permitted provided that the following conditions
 20 |    are met:
 21 | 
 22 |    1. Redistributions of source code must retain the above copyright
 23 |       notice, this list of conditions and the following disclaimer.
 24 | 
 25 |    2. The origin of this software must not be misrepresented; you must 
 26 |       not claim that you wrote the original software.  If you use this 
 27 |       software in a product, an acknowledgment in the product 
 28 |       documentation would be appreciated but is not required.
 29 | 
 30 |    3. Altered source versions must be plainly marked as such, and must
 31 |       not be misrepresented as being the original software.
 32 | 
 33 |    4. The name of the author may not be used to endorse or promote 
 34 |       products derived from this software without specific prior written 
 35 |       permission.
 36 | 
 37 |    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 38 |    OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 39 |    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 40 |    ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 41 |    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 42 |    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 43 |    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 44 |    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 45 |    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 46 |    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 47 |    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 48 | 
 49 |    ----------------------------------------------------------------
 50 | 
 51 |    Notice that the above BSD-style license applies to this one file
 52 |    (memcheck.h) only.  The entire rest of Valgrind is licensed under
 53 |    the terms of the GNU General Public License, version 2.  See the
 54 |    COPYING file in the source distribution for details.
 55 | 
 56 |    ---------------------------------------------------------------- 
 57 | */
 58 | 
 59 | 
 60 | #ifndef __MEMCHECK_H
 61 | #define __MEMCHECK_H
 62 | 
 63 | 
 64 | /* This file is for inclusion into client (your!) code.
 65 | 
 66 |    You can use these macros to manipulate and query memory permissions
 67 |    inside your own programs.
 68 | 
 69 |    See comment near the top of valgrind.h on how to use them.
 70 | */
 71 | 
 72 | #include "valgrind.h"
 73 | 
 74 | /* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !!
 75 |    This enum comprises an ABI exported by Valgrind to programs which use
 76 |    client requests.  DO NOT CHANGE THE ORDER OF THESE ENTRIES, NOR DELETE
 77 |    ANY -- add new ones at the end.  Values count up from the tool base. */
 78 | typedef
 79 |    enum { 
 80 |       VG_USERREQ__MAKE_MEM_NOACCESS = VG_USERREQ_TOOL_BASE('M','C'), /* mark range unaddressable */
 81 |       VG_USERREQ__MAKE_MEM_UNDEFINED,        /* mark range addressable but undefined */
 82 |       VG_USERREQ__MAKE_MEM_DEFINED,          /* mark range addressable and defined */
 83 |       VG_USERREQ__DISCARD,                   /* discard a block-description handle */
 84 |       VG_USERREQ__CHECK_MEM_IS_ADDRESSABLE,  /* report first unaddressable byte */
 85 |       VG_USERREQ__CHECK_MEM_IS_DEFINED,      /* report first unaddressable/undefined byte */
 86 |       VG_USERREQ__DO_LEAK_CHECK,             /* leak check mid-execution */
 87 |       VG_USERREQ__COUNT_LEAKS,               /* issued by VALGRIND_COUNT_LEAKS */
 88 | 
 89 |       VG_USERREQ__GET_VBITS,                 /* copy validity data of a range out */
 90 |       VG_USERREQ__SET_VBITS,
 91 | 
 92 |       VG_USERREQ__CREATE_BLOCK,              /* attach a description to a range */
 93 | 
 94 |       VG_USERREQ__MAKE_MEM_DEFINED_IF_ADDRESSABLE, /* define only addressable bytes */
 95 | 
 96 |       /* Not next to VG_USERREQ__COUNT_LEAKS because it was added later. */
 97 |       VG_USERREQ__COUNT_LEAK_BLOCKS,         /* issued by VALGRIND_COUNT_LEAK_BLOCKS */
 98 | 
 99 |       VG_USERREQ__ENABLE_ADDR_ERROR_REPORTING_IN_RANGE,
100 |       VG_USERREQ__DISABLE_ADDR_ERROR_REPORTING_IN_RANGE,
101 | 
102 |       /* This is just for memcheck's internal use - don't use it */
103 |       _VG_USERREQ__MEMCHECK_RECORD_OVERLAP_ERROR 
104 |          = VG_USERREQ_TOOL_BASE('M','C') + 256,
105 |       _VG_USERREQ__MEMCHECK_VERIFY_ALIGNMENT  /* implicitly tool base + 257 */
106 |    } Vg_MemCheckClientRequest;
107 | 
108 | 
109 | 
110 | /* Client-code macros to manipulate the state of memory.  Each expands to a
111 |    VALGRIND_DO_CLIENT_REQUEST_EXPR expression whose default value is 0. */
112 | /* Mark memory at _qzz_addr as unaddressable for _qzz_len bytes. */
113 | #define VALGRIND_MAKE_MEM_NOACCESS(_qzz_addr,_qzz_len)           \
114 |     VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,      \
115 |                             VG_USERREQ__MAKE_MEM_NOACCESS,       \
116 |                             (_qzz_addr), (_qzz_len), 0, 0, 0)
117 | 
118 | /* Similarly, mark memory at _qzz_addr as addressable but undefined
119 |    for _qzz_len bytes. */
120 | #define VALGRIND_MAKE_MEM_UNDEFINED(_qzz_addr,_qzz_len)          \
121 |     VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,      \
122 |                             VG_USERREQ__MAKE_MEM_UNDEFINED,      \
123 |                             (_qzz_addr), (_qzz_len), 0, 0, 0)
124 | 
125 | /* Similarly, mark memory at _qzz_addr as addressable and defined
126 |    for _qzz_len bytes. */
127 | #define VALGRIND_MAKE_MEM_DEFINED(_qzz_addr,_qzz_len)            \
128 |     VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,      \
129 |                             VG_USERREQ__MAKE_MEM_DEFINED,        \
130 |                             (_qzz_addr), (_qzz_len), 0, 0, 0)
131 | 
132 | /* Similar to VALGRIND_MAKE_MEM_DEFINED except that addressability is
133 |    not altered: bytes which are addressable are marked as defined,
134 |    but those which are not addressable are left unchanged. */
135 | #define VALGRIND_MAKE_MEM_DEFINED_IF_ADDRESSABLE(_qzz_addr,_qzz_len)     \
136 |     VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,              \
137 |                             VG_USERREQ__MAKE_MEM_DEFINED_IF_ADDRESSABLE, \
138 |                             (_qzz_addr), (_qzz_len), 0, 0, 0)
139 | 
140 | /* Create a block-description handle.  The description is an ASCII
141 |    string which is included in any messages pertaining to addresses
142 |    within the specified memory range.  Has no other effect on the
143 |    properties of the memory range. */
144 | #define VALGRIND_CREATE_BLOCK(_qzz_addr,_qzz_len, _qzz_desc)	   \
145 |     VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,        \
146 |                             VG_USERREQ__CREATE_BLOCK,              \
147 |                             (_qzz_addr), (_qzz_len), (_qzz_desc),  \
148 |                             0, 0)
149 | 
150 | /* Discard a block-description-handle. Returns 1 for an invalid handle, 0 for
151 |    a valid handle.  The handle travels as the 2nd request argument (1st is 0). */
152 | #define VALGRIND_DISCARD(_qzz_blkindex)                          \
153 |     VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,      \
154 |                             VG_USERREQ__DISCARD,                 \
155 |                             0, (_qzz_blkindex), 0, 0, 0)
156 | 
157 | 
158 | /* Client-code macros to check the state of memory. */
159 | 
160 | /* Check that memory at _qzz_addr is addressable for _qzz_len bytes.
161 |    If suitable addressability is not established, Valgrind prints an
162 |    error message and returns the address of the first offending byte.
163 |    Otherwise it returns zero. */
164 | #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(_qzz_addr,_qzz_len)      \
165 |     VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                             \
166 |                             VG_USERREQ__CHECK_MEM_IS_ADDRESSABLE,  \
167 |                             (_qzz_addr), (_qzz_len), 0, 0, 0)
168 | 
169 | /* Check that memory at _qzz_addr is addressable and defined for
170 |    _qzz_len bytes.  If suitable addressability and definedness are not
171 |    established, Valgrind prints an error message and returns the
172 |    address of the first offending byte.  Otherwise it returns zero. */
173 | #define VALGRIND_CHECK_MEM_IS_DEFINED(_qzz_addr,_qzz_len)        \
174 |     VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                           \
175 |                             VG_USERREQ__CHECK_MEM_IS_DEFINED,    \
176 |                             (_qzz_addr), (_qzz_len), 0, 0, 0)
177 | 
178 | /* Use this macro to force the definedness and addressability of an
179 |    lvalue to be checked.  If suitable addressability and definedness
180 |    are not established, Valgrind prints an error message and returns
181 |    the address of the first offending byte.  Otherwise it returns
182 |    zero.  The range checked is &(__lvalue) for sizeof(__lvalue) bytes. */
183 | #define VALGRIND_CHECK_VALUE_IS_DEFINED(__lvalue)                \
184 |    VALGRIND_CHECK_MEM_IS_DEFINED(                                \
185 |       (volatile unsigned char *)&(__lvalue),                     \
186 |                       (unsigned long)(sizeof (__lvalue)))
187 | 
188 | 
189 | /* Do a full memory leak check (like --leak-check=full) mid-execution (request args 0, 0). */
190 | #define VALGRIND_DO_LEAK_CHECK                                   \
191 |     VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DO_LEAK_CHECK,   \
192 |                                     0, 0, 0, 0, 0)
193 | 
194 | /* Same as VALGRIND_DO_LEAK_CHECK but only showing the entries for
195 |    which there was an increase in leaked bytes or leaked nr of blocks
196 |    since the previous leak search (request args 0, 1). */
197 | #define VALGRIND_DO_ADDED_LEAK_CHECK                            \
198 |     VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DO_LEAK_CHECK,  \
199 |                                     0, 1, 0, 0, 0)
200 | 
201 | /* Same as VALGRIND_DO_ADDED_LEAK_CHECK but showing entries with
202 |    increased or decreased leaked bytes/blocks since previous leak
203 |    search (request args 0, 2). */
204 | #define VALGRIND_DO_CHANGED_LEAK_CHECK                          \
205 |     VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DO_LEAK_CHECK,  \
206 |                                     0, 2, 0, 0, 0)
207 | 
208 | /* Same as VALGRIND_DO_LEAK_CHECK but only showing new entries
209 |    i.e. loss records that were not there in the previous leak
210 |    search (request args 0, 3). */
211 | #define VALGRIND_DO_NEW_LEAK_CHECK                              \
212 |     VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DO_LEAK_CHECK,  \
213 |                                     0, 3, 0, 0, 0)
214 | 
215 | /* Do a summary memory leak check (like --leak-check=summary) mid-execution (request args 1, 0). */
216 | #define VALGRIND_DO_QUICK_LEAK_CHECK                             \
217 |     VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DO_LEAK_CHECK,   \
218 |                                     1, 0, 0, 0, 0)
219 | 
220 | /* Return number of leaked, dubious, reachable and suppressed bytes found by all
221 |    previous leak checks.  They must be lvalues.  Expands to a bare { } block. */
222 | #define VALGRIND_COUNT_LEAKS(leaked, dubious, reachable, suppressed)     \
223 |    /* For safety on 64-bit platforms we assign the results to private
224 |       unsigned long variables, then assign these to the lvalues the user
225 |       specified, which works no matter what type 'leaked', 'dubious', etc
226 |       are.  We also initialise '_qzz_leaked', etc because
227 |       VG_USERREQ__COUNT_LEAKS doesn't mark the values returned as
228 |       defined. */                                                        \
229 |    {                                                                     \
230 |     unsigned long _qzz_leaked    = 0, _qzz_dubious    = 0;               \
231 |     unsigned long _qzz_reachable = 0, _qzz_suppressed = 0;               \
232 |     VALGRIND_DO_CLIENT_REQUEST_STMT(                                     \
233 |                                VG_USERREQ__COUNT_LEAKS,                  \
234 |                                &_qzz_leaked, &_qzz_dubious,              \
235 |                                &_qzz_reachable, &_qzz_suppressed, 0);    \
236 |     leaked     = _qzz_leaked;                                            \
237 |     dubious    = _qzz_dubious;                                           \
238 |     reachable  = _qzz_reachable;                                         \
239 |     suppressed = _qzz_suppressed;                                        \
240 |    }
241 | 
242 | /* Return number of leaked, dubious, reachable and suppressed blocks found by all
243 |    previous leak checks.  They must be lvalues.  Expands to a bare { } block. */
244 | #define VALGRIND_COUNT_LEAK_BLOCKS(leaked, dubious, reachable, suppressed) \
245 |    /* For safety on 64-bit platforms we assign the results to private
246 |       unsigned long variables, then assign these to the lvalues the user
247 |       specified, which works no matter what type 'leaked', 'dubious', etc
248 |       are.  We also initialise '_qzz_leaked', etc because
249 |       VG_USERREQ__COUNT_LEAKS doesn't mark the values returned as
250 |       defined.  NOTE(review): presumably meant VG_USERREQ__COUNT_LEAK_BLOCKS. */ \
251 |    {                                                                     \
252 |     unsigned long _qzz_leaked    = 0, _qzz_dubious    = 0;               \
253 |     unsigned long _qzz_reachable = 0, _qzz_suppressed = 0;               \
254 |     VALGRIND_DO_CLIENT_REQUEST_STMT(                                     \
255 |                                VG_USERREQ__COUNT_LEAK_BLOCKS,            \
256 |                                &_qzz_leaked, &_qzz_dubious,              \
257 |                                &_qzz_reachable, &_qzz_suppressed, 0);    \
258 |     leaked     = _qzz_leaked;                                            \
259 |     dubious    = _qzz_dubious;                                           \
260 |     reachable  = _qzz_reachable;                                         \
261 |     suppressed = _qzz_suppressed;                                        \
262 |    }
263 | 
264 | 
265 | /* Get the validity data for addresses [zza..zza+zznbytes-1] and copy it
266 |    into the provided zzvbits array.  Return values:
267 |       0   if not running on valgrind
268 |       1   success
269 |       2   [previously indicated unaligned arrays;  these are now allowed]
270 |       3   if any parts of zza/zzvbits are not addressable.
271 |    The metadata is not copied in cases 0, 2 or 3 so it should be
272 |    impossible to segfault your system by using this call.
273 | */
274 | #define VALGRIND_GET_VBITS(zza,zzvbits,zznbytes)                \
275 |     (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                \
276 |                                     VG_USERREQ__GET_VBITS,      \
277 |                                     (const char*)(zza),         \
278 |                                     (char*)(zzvbits),           \
279 |                                     (zznbytes), 0, 0)
280 | 
281 | /* Set the validity data for addresses [zza..zza+zznbytes-1], copying it
282 |    from the provided zzvbits array.  Return values:
283 |       0   if not running on valgrind
284 |       1   success
285 |       2   [previously indicated unaligned arrays;  these are now allowed]
286 |       3   if any parts of zza/zzvbits are not addressable.
287 |    The metadata is not copied in cases 0, 2 or 3 so it should be
288 |    impossible to segfault your system by using this call.
289 | */
290 | #define VALGRIND_SET_VBITS(zza,zzvbits,zznbytes)                \
291 |     (unsigned)VALGRIND_DO_CLIENT_REQUEST_EXPR(0,                \
292 |                                     VG_USERREQ__SET_VBITS,      \
293 |                                     (const char*)(zza),         \
294 |                                     (const char*)(zzvbits),     \
295 |                                     (zznbytes), 0, 0 )
296 | 
297 | /* Disable and re-enable reporting of addressing errors in the
298 |    specified address range. */
299 | #define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(_qzz_addr,_qzz_len) \
300 |     VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,    \
301 |        VG_USERREQ__DISABLE_ADDR_ERROR_REPORTING_IN_RANGE,      \
302 |        (_qzz_addr), (_qzz_len), 0, 0, 0)
303 | 
304 | #define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(_qzz_addr,_qzz_len) \
305 |     VALGRIND_DO_CLIENT_REQUEST_EXPR(0 /* default return */,    \
306 |        VG_USERREQ__ENABLE_ADDR_ERROR_REPORTING_IN_RANGE,       \
307 |        (_qzz_addr), (_qzz_len), 0, 0, 0)
308 | 
309 | #endif
310 | 
311 | 


--------------------------------------------------------------------------------
/Linaro-Forge/correctness/gpu-nvidia-mmult/common/helper_string.h:
--------------------------------------------------------------------------------
  1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
  2 |  *
  3 |  * Redistribution and use in source and binary forms, with or without
  4 |  * modification, are permitted provided that the following conditions
  5 |  * are met:
  6 |  *  * Redistributions of source code must retain the above copyright
  7 |  *    notice, this list of conditions and the following disclaimer.
  8 |  *  * Redistributions in binary form must reproduce the above copyright
  9 |  *    notice, this list of conditions and the following disclaimer in the
 10 |  *    documentation and/or other materials provided with the distribution.
 11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
 12 |  *    contributors may be used to endorse or promote products derived
 13 |  *    from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  */
 27 | 
 28 | // These are helper functions for the SDK samples (string parsing, timers, etc)
 29 | #ifndef COMMON_HELPER_STRING_H_
 30 | #define COMMON_HELPER_STRING_H_
 31 | 
 32 | #include <stdio.h>
 33 | #include <stdlib.h>
 34 | #include <fstream>
 35 | #include <string>
 36 | 
 37 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 38 | #ifndef _CRT_SECURE_NO_DEPRECATE
 39 | #define _CRT_SECURE_NO_DEPRECATE
 40 | #endif
 41 | #ifndef STRCASECMP
 42 | #define STRCASECMP _stricmp
 43 | #endif
 44 | #ifndef STRNCASECMP
 45 | #define STRNCASECMP _strnicmp
 46 | #endif
 47 | #ifndef STRCPY
 48 | #define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
 49 | #endif
 50 | 
 51 | #ifndef FOPEN
 52 | #define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
 53 | #endif
 54 | #ifndef FOPEN_FAIL
 55 | #define FOPEN_FAIL(result) (result != 0)
 56 | #endif
 57 | #ifndef SSCANF
 58 | #define SSCANF sscanf_s
 59 | #endif
 60 | #ifndef SPRINTF
 61 | #define SPRINTF sprintf_s
 62 | #endif
 63 | #else  // Linux Includes
 64 | #include <string.h>
 65 | #include <strings.h>
 66 | 
 67 | #ifndef STRCASECMP
 68 | #define STRCASECMP strcasecmp
 69 | #endif
 70 | #ifndef STRNCASECMP
 71 | #define STRNCASECMP strncasecmp
 72 | #endif
 73 | #ifndef STRCPY
 74 | #define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
 75 | #endif
 76 | 
 77 | #ifndef FOPEN
 78 | #define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
 79 | #endif
 80 | #ifndef FOPEN_FAIL
 81 | #define FOPEN_FAIL(result) (result == NULL)
 82 | #endif
 83 | #ifndef SSCANF
 84 | #define SSCANF sscanf
 85 | #endif
 86 | #ifndef SPRINTF
 87 | #define SPRINTF sprintf
 88 | #endif
 89 | #endif
 90 | 
 91 | #ifndef EXIT_WAIVED
 92 | #define EXIT_WAIVED 2
 93 | #endif
 94 | 
 95 | // CUDA Utility Helper Functions
 96 | inline int stringRemoveDelimiter(char delimiter, const char *string) {
 97 |   int string_start = 0;
 98 | 
 99 |   while (string[string_start] == delimiter) {
100 |     string_start++;
101 |   }
102 | 
103 |   if (string_start >= static_cast<int>(strlen(string) - 1)) {
104 |     return 0;
105 |   }
106 | 
107 |   return string_start;
108 | }
109 | 
// Locates the extension of `filename`, i.e. the text after its last '.'.
//
// On success `*extension` points into `filename` at the first character after
// the last '.', and the index of that character is returned.  When there is no
// '.', when the only '.' is the very first character (e.g. ".bashrc"), or when
// `filename` is NULL, `*extension` is set to NULL and 0 is returned.
//
// The whole string is searched, so a '.' inside a directory component
// ("dir.d/file") is still treated as the extension separator.
//
// @param filename   NUL-terminated file name (not modified)
// @param extension  receives a pointer into `filename`, or NULL
// @return index of the first extension character, or 0 if there is none
inline int getFileExtension(char *filename, char **extension) {
  // strrchr replaces the hand-written backwards scan, which started at the
  // terminator with a post-decrement and therefore missed a '.' at index 1
  // ("a.txt") and read filename[-1] for an empty string.
  char *last_dot = (filename != NULL) ? strrchr(filename, '.') : NULL;

  if (last_dot == NULL || last_dot == filename) {
    *extension = NULL;
    return 0;
  }

  *extension = last_dot + 1;
  return static_cast<int>(*extension - filename);
}
126 | 
127 | inline bool checkCmdLineFlag(const int argc, const char **argv,
128 |                              const char *string_ref) {
129 |   bool bFound = false;
130 | 
131 |   if (argc >= 1) {
132 |     for (int i = 1; i < argc; i++) {
133 |       int string_start = stringRemoveDelimiter('-', argv[i]);
134 |       const char *string_argv = &argv[i][string_start];
135 | 
136 |       const char *equal_pos = strchr(string_argv, '=');
137 |       int argv_length = static_cast<int>(
138 |           equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
139 | 
140 |       int length = static_cast<int>(strlen(string_ref));
141 | 
142 |       if (length == argv_length &&
143 |           !STRNCASECMP(string_argv, string_ref, length)) {
144 |         bFound = true;
145 |         continue;
146 |       }
147 |     }
148 |   }
149 | 
150 |   return bFound;
151 | }
152 | 
153 | // This function wraps the CUDA Driver API into a template function
154 | template <class T>
155 | inline bool getCmdLineArgumentValue(const int argc, const char **argv,
156 |                                     const char *string_ref, T *value) {
157 |   bool bFound = false;
158 | 
159 |   if (argc >= 1) {
160 |     for (int i = 1; i < argc; i++) {
161 |       int string_start = stringRemoveDelimiter('-', argv[i]);
162 |       const char *string_argv = &argv[i][string_start];
163 |       int length = static_cast<int>(strlen(string_ref));
164 | 
165 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
166 |         if (length + 1 <= static_cast<int>(strlen(string_argv))) {
167 |           int auto_inc = (string_argv[length] == '=') ? 1 : 0;
168 |           *value = (T)atoi(&string_argv[length + auto_inc]);
169 |         }
170 | 
171 |         bFound = true;
172 |         i = argc;
173 |       }
174 |     }
175 |   }
176 | 
177 |   return bFound;
178 | }
179 | 
180 | inline int getCmdLineArgumentInt(const int argc, const char **argv,
181 |                                  const char *string_ref) {
182 |   bool bFound = false;
183 |   int value = -1;
184 | 
185 |   if (argc >= 1) {
186 |     for (int i = 1; i < argc; i++) {
187 |       int string_start = stringRemoveDelimiter('-', argv[i]);
188 |       const char *string_argv = &argv[i][string_start];
189 |       int length = static_cast<int>(strlen(string_ref));
190 | 
191 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
192 |         if (length + 1 <= static_cast<int>(strlen(string_argv))) {
193 |           int auto_inc = (string_argv[length] == '=') ? 1 : 0;
194 |           value = atoi(&string_argv[length + auto_inc]);
195 |         } else {
196 |           value = 0;
197 |         }
198 | 
199 |         bFound = true;
200 |         continue;
201 |       }
202 |     }
203 |   }
204 | 
205 |   if (bFound) {
206 |     return value;
207 |   } else {
208 |     return 0;
209 |   }
210 | }
211 | 
212 | inline float getCmdLineArgumentFloat(const int argc, const char **argv,
213 |                                      const char *string_ref) {
214 |   bool bFound = false;
215 |   float value = -1;
216 | 
217 |   if (argc >= 1) {
218 |     for (int i = 1; i < argc; i++) {
219 |       int string_start = stringRemoveDelimiter('-', argv[i]);
220 |       const char *string_argv = &argv[i][string_start];
221 |       int length = static_cast<int>(strlen(string_ref));
222 | 
223 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
224 |         if (length + 1 <= static_cast<int>(strlen(string_argv))) {
225 |           int auto_inc = (string_argv[length] == '=') ? 1 : 0;
226 |           value = static_cast<float>(atof(&string_argv[length + auto_inc]));
227 |         } else {
228 |           value = 0.f;
229 |         }
230 | 
231 |         bFound = true;
232 |         continue;
233 |       }
234 |     }
235 |   }
236 | 
237 |   if (bFound) {
238 |     return value;
239 |   } else {
240 |     return 0;
241 |   }
242 | }
243 | 
244 | inline bool getCmdLineArgumentString(const int argc, const char **argv,
245 |                                      const char *string_ref,
246 |                                      char **string_retval) {
247 |   bool bFound = false;
248 | 
249 |   if (argc >= 1) {
250 |     for (int i = 1; i < argc; i++) {
251 |       int string_start = stringRemoveDelimiter('-', argv[i]);
252 |       char *string_argv = const_cast<char *>(&argv[i][string_start]);
253 |       int length = static_cast<int>(strlen(string_ref));
254 | 
255 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
256 |         *string_retval = &string_argv[length + 1];
257 |         bFound = true;
258 |         continue;
259 |       }
260 |     }
261 |   }
262 | 
263 |   if (!bFound) {
264 |     *string_retval = NULL;
265 |   }
266 | 
267 |   return bFound;
268 | }
269 | 
270 | //////////////////////////////////////////////////////////////////////////////
271 | //! Find the path for a file assuming that
272 | //! files are found in the searchPath.
273 | //!
274 | //! @return the path if succeeded, otherwise 0
275 | //! @param filename         name of the file
276 | //! @param executable_path  optional absolute path of the executable
277 | //////////////////////////////////////////////////////////////////////////////
278 | inline char *sdkFindFilePath(const char *filename,
279 |                              const char *executable_path) {
280 |   // <executable_name> defines a variable that is replaced with the name of the
281 |   // executable
282 | 
283 |   // Typical relative search paths to locate needed companion files (e.g. sample
284 |   // input data, or JIT source files) The origin for the relative search may be
285 |   // the .exe file, a .bat file launching an .exe, a browser .exe launching the
286 |   // .exe or .bat, etc
287 |   const char *searchPath[] = {
288 |       "./",                                           // same dir
289 |       "./data/",                                      // same dir
290 | 
291 |       "../../../../Samples/<executable_name>/",       // up 4 in tree
292 |       "../../../Samples/<executable_name>/",          // up 3 in tree
293 |       "../../Samples/<executable_name>/",             // up 2 in tree
294 | 
295 |       "../../../../Samples/<executable_name>/data/",  // up 4 in tree
296 |       "../../../Samples/<executable_name>/data/",     // up 3 in tree
297 |       "../../Samples/<executable_name>/data/",        // up 2 in tree
298 | 
299 |       "../../../../Samples/0_Introduction/<executable_name>/",  // up 4 in tree
300 |       "../../../Samples/0_Introduction/<executable_name>/",     // up 3 in tree
301 |       "../../Samples/0_Introduction/<executable_name>/",        // up 2 in tree
302 | 
303 |       "../../../../Samples/1_Utilities/<executable_name>/",  // up 4 in tree
304 |       "../../../Samples/1_Utilities/<executable_name>/",     // up 3 in tree
305 |       "../../Samples/1_Utilities/<executable_name>/",        // up 2 in tree
306 | 
307 |       "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/",  // up 4 in tree
308 |       "../../../Samples/2_Concepts_and_Techniques/<executable_name>/",     // up 3 in tree
309 |       "../../Samples/2_Concepts_and_Techniques/<executable_name>/",        // up 2 in tree
310 | 
311 |       "../../../../Samples/3_CUDA_Features/<executable_name>/",  // up 4 in tree
312 |       "../../../Samples/3_CUDA_Features/<executable_name>/",     // up 3 in tree
313 |       "../../Samples/3_CUDA_Features/<executable_name>/",        // up 2 in tree
314 | 
315 |       "../../../../Samples/4_CUDA_Libraries/<executable_name>/",  // up 4 in tree
316 |       "../../../Samples/4_CUDA_Libraries/<executable_name>/",     // up 3 in tree
317 |       "../../Samples/4_CUDA_Libraries/<executable_name>/",        // up 2 in tree
318 | 
319 |       "../../../../Samples/5_Domain_Specific/<executable_name>/",  // up 4 in tree
320 |       "../../../Samples/5_Domain_Specific/<executable_name>/",     // up 3 in tree
321 |       "../../Samples/5_Domain_Specific/<executable_name>/",        // up 2 in tree
322 | 
323 |       "../../../../Samples/6_Performance/<executable_name>/",  // up 4 in tree
324 |       "../../../Samples/6_Performance/<executable_name>/",     // up 3 in tree
325 |       "../../Samples/6_Performance/<executable_name>/",        // up 2 in tree
326 | 
327 |       "../../../../Samples/0_Introduction/<executable_name>/data/",  // up 4 in tree
328 |       "../../../Samples/0_Introduction/<executable_name>/data/",     // up 3 in tree
329 |       "../../Samples/0_Introduction/<executable_name>/data/",        // up 2 in tree
330 | 
331 |       "../../../../Samples/1_Utilities/<executable_name>/data/",  // up 4 in tree
332 |       "../../../Samples/1_Utilities/<executable_name>/data/",     // up 3 in tree
333 |       "../../Samples/1_Utilities/<executable_name>/data/",        // up 2 in tree
334 | 
335 |       "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",  // up 4 in tree
336 |       "../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",     // up 3 in tree
337 |       "../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",        // up 2 in tree
338 | 
339 |       "../../../../Samples/3_CUDA_Features/<executable_name>/data/",  // up 4 in tree
340 |       "../../../Samples/3_CUDA_Features/<executable_name>/data/",     // up 3 in tree
341 |       "../../Samples/3_CUDA_Features/<executable_name>/data/",        // up 2 in tree
342 | 
343 |       "../../../../Samples/4_CUDA_Libraries/<executable_name>/data/",  // up 4 in tree
344 |       "../../../Samples/4_CUDA_Libraries/<executable_name>/data/",     // up 3 in tree
345 |       "../../Samples/4_CUDA_Libraries/<executable_name>/data/",        // up 2 in tree
346 | 
347 |       "../../../../Samples/5_Domain_Specific/<executable_name>/data/",  // up 4 in tree
348 |       "../../../Samples/5_Domain_Specific/<executable_name>/data/",     // up 3 in tree
349 |       "../../Samples/5_Domain_Specific/<executable_name>/data/",        // up 2 in tree
350 | 
351 |       "../../../../Samples/6_Performance/<executable_name>/data/",  // up 4 in tree
352 |       "../../../Samples/6_Performance/<executable_name>/data/",     // up 3 in tree
353 |       "../../Samples/6_Performance/<executable_name>/data/",        // up 2 in tree
354 | 
355 |       "../../../../Common/data/",                     // up 4 in tree
356 |       "../../../Common/data/",                        // up 3 in tree
357 |       "../../Common/data/"                            // up 2 in tree
358 |   };
359 | 
360 |   // Extract the executable name
361 |   std::string executable_name;
362 | 
363 |   if (executable_path != 0) {
364 |     executable_name = std::string(executable_path);
365 | 
366 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
367 |     // Windows path delimiter
368 |     size_t delimiter_pos = executable_name.find_last_of('\\');
369 |     executable_name.erase(0, delimiter_pos + 1);
370 | 
371 |     if (executable_name.rfind(".exe") != std::string::npos) {
372 |       // we strip .exe, only if the .exe is found
373 |       executable_name.resize(executable_name.size() - 4);
374 |     }
375 | 
376 | #else
377 |     // Linux & OSX path delimiter
378 |     size_t delimiter_pos = executable_name.find_last_of('/');
379 |     executable_name.erase(0, delimiter_pos + 1);
380 | #endif
381 |   }
382 | 
383 |   // Loop over all search paths and return the first hit
384 |   for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
385 |     std::string path(searchPath[i]);
386 |     size_t executable_name_pos = path.find("<executable_name>");
387 | 
388 |     // If there is executable_name variable in the searchPath
389 |     // replace it with the value
390 |     if (executable_name_pos != std::string::npos) {
391 |       if (executable_path != 0) {
392 |         path.replace(executable_name_pos, strlen("<executable_name>"),
393 |                      executable_name);
394 |       } else {
395 |         // Skip this path entry if no executable argument is given
396 |         continue;
397 |       }
398 |     }
399 | 
400 | #ifdef _DEBUG
401 |     printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
402 | #endif
403 | 
404 |     // Test if the file exists
405 |     path.append(filename);
406 |     FILE *fp;
407 |     FOPEN(fp, path.c_str(), "rb");
408 | 
409 |     if (fp != NULL) {
410 |       fclose(fp);
411 |       // File found
412 |       // returning an allocated array here for backwards compatibility reasons
413 |       char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
414 |       STRCPY(file_path, path.length() + 1, path.c_str());
415 |       return file_path;
416 |     }
417 | 
418 |     if (fp) {
419 |       fclose(fp);
420 |     }
421 |   }
422 | 
423 |   // File not found
424 |   printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename);
425 |   return 0;
426 | }
427 | 
428 | #endif  // COMMON_HELPER_STRING_H_
429 | 


--------------------------------------------------------------------------------