├── 03_taskwait ├── src │ ├── Makefile.am │ └── taskwait.c ├── Makefile.am ├── docs │ ├── Doxyfile.in │ ├── UserManual.md │ └── Makefile.am ├── tests │ ├── Makefile.am │ └── taskwait_real_00.sh ├── README.md └── configure.ac ├── 00_build_OpenMP_offload ├── Clang │ ├── 00_check_gpu │ │ ├── realscript.sh │ │ ├── tesla.sh │ │ └── tesla.log │ ├── bugs.md │ └── build_clang_offload.md └── GCC │ ├── 00_check_gpu │ ├── realscript.sh │ ├── tesla.sh │ └── tesla.log │ ├── 02_build │ ├── build.sh │ └── realscript.sh │ ├── bugs.md │ ├── 01_download │ └── download.sh │ └── build_gcc_offload.md ├── 09_matAdd ├── src │ ├── Makefile.am │ ├── matAddAB.h │ ├── matAdd.c │ └── matAddAB.c ├── Makefile.am ├── tests │ ├── matAdd_real_00.sh │ ├── Makefile.am │ └── matAdd_real_00.sh.5422334.out ├── docs │ ├── Doxyfile.in │ ├── Makefile.am │ └── UserManual.md ├── README.md └── configure.ac ├── 10_matMul ├── src │ ├── Makefile.am │ ├── matMulAB.h │ ├── matMul.c │ └── matMulAB.c ├── Makefile.am ├── tests │ ├── matMul_real_00.sh │ ├── Makefile.am │ └── matMul_real_00.sh.5422392.out ├── docs │ ├── Doxyfile.in │ ├── Makefile.am │ └── UserManual.md ├── README.md └── configure.ac ├── 01_accelQuery ├── src │ ├── Makefile.am │ ├── prtAccelInfo.h │ ├── accelQuery.c │ └── prtAccelInfo.c ├── Makefile.am ├── tests │ ├── accelQuery_real_00.sh │ └── Makefile.am ├── docs │ ├── Doxyfile.in │ ├── UserManual.md │ └── Makefile.am ├── README.md └── configure.ac ├── 02_dataTransRate ├── src │ ├── Makefile.am │ ├── check1ns.c │ ├── check1ns.h │ └── dataTransRate.c ├── Makefile.am ├── tests │ ├── dataTransRate_real_00.sh │ └── Makefile.am ├── docs │ ├── Doxyfile.in │ ├── UserManual.md │ └── Makefile.am ├── README.md └── configure.ac ├── 04_scalarAddition ├── src │ ├── Makefile.am │ ├── check1ns.c │ ├── check1ns.h │ └── scalarAddition.c ├── Makefile.am ├── tests │ ├── scalarAddition_real_00.sh │ └── Makefile.am ├── docs │ ├── Doxyfile.in │ ├── UserManual.md │ └── Makefile.am ├── README.md └── configure.ac ├── 05_saxpy ├── Makefile.am ├── tests │ ├── saxpy_real_00.sh │ ├── Makefile.am │ └── saxpy_real_00.sh.5422320.out ├── src │ ├── Makefile.am │ ├── wtcalc.c │ ├── wtcalc.h │ ├── check1ns.c │ ├── check1ns.h │ ├── hsaxpy.h │ ├── asaxpy.h │ ├── hsaxpy.c │ ├── saxpy.c │ └── asaxpy.c ├── docs │ ├── Doxyfile.in │ ├── Makefile.am │ └── UserManual.md ├── README.md └── configure.ac ├── 08_distThreads ├── src │ ├── Makefile.am │ ├── gpuThreads.h │ ├── distThreads.c │ └── gpuThreads.c ├── Makefile.am ├── tests │ ├── distThreads_real_00.sh │ └── Makefile.am ├── docs │ ├── Doxyfile.in │ ├── Makefile.am │ └── UserManual.md ├── README.md └── configure.ac ├── simplifiedCode ├── 01_accelQuery │ └── accelQuery.c ├── 04_scalarAddition │ └── scalarAddition.c ├── 05_saxpy │ └── saxpy.c └── 02_dataTransRate │ └── dataTransRate.c └── README.md /03_taskwait/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = taskwait 2 | taskwait_SOURCES = taskwait.c 3 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/Clang/00_check_gpu/realscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Now, on $(hostname)" 3 | nvidia-smi 4 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/00_check_gpu/realscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Now, on $(hostname)" 3 | 
nvidia-smi 4 | -------------------------------------------------------------------------------- /09_matAdd/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = matAdd 2 | matAdd_SOURCES = matAdd.c \ 3 | matAddAB.h \ 4 | matAddAB.c 5 | -------------------------------------------------------------------------------- /10_matMul/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = matMul 2 | matMul_SOURCES = matMul.c \ 3 | matMulAB.h \ 4 | matMulAB.c 5 | -------------------------------------------------------------------------------- /01_accelQuery/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = accelQuery 2 | accelQuery_SOURCES = accelQuery.c \ 3 | prtAccelInfo.h \ 4 | prtAccelInfo.c 5 | -------------------------------------------------------------------------------- /02_dataTransRate/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = dataTransRate 2 | dataTransRate_SOURCES = dataTransRate.c \ 3 | check1ns.h \ 4 | check1ns.c 5 | -------------------------------------------------------------------------------- /03_taskwait/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /04_scalarAddition/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = scalarAddition 2 | scalarAddition_SOURCES = scalarAddition.c \ 3 | check1ns.h \ 4 | check1ns.c 5 | -------------------------------------------------------------------------------- /05_saxpy/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /08_distThreads/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = distThreads 2 | distThreads_SOURCES = distThreads.c \ 3 | gpuThreads.h \ 4 | gpuThreads.c 5 | -------------------------------------------------------------------------------- /09_matAdd/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /10_matMul/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /01_accelQuery/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /02_dataTransRate/Makefile.am: 
-------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /08_distThreads/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /04_scalarAddition/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /05_saxpy/tests/saxpy_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N saxpy 3 | #CCS -t 600m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/saxpy 9 | -------------------------------------------------------------------------------- /10_matMul/tests/matMul_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N matMul 3 | #CCS -t 600m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/matMul $((2**12)) 9 | -------------------------------------------------------------------------------- /01_accelQuery/tests/accelQuery_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N accelQuery 3 | #CCS -t 10m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/accelQuery 9 | -------------------------------------------------------------------------------- /08_distThreads/tests/distThreads_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N distThreads 3 | #CCS -t 10m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/distThreads 9 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/Clang/00_check_gpu/tesla.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N nvidia_smi 3 | #CCS -t 1m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:ncpus=1:mem=8g:vmem=16g:tesla=1 6 | 7 | sh -x realscript.sh 2>&1 | tee tesla.log 8 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/00_check_gpu/tesla.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N nvidia_smi 3 | #CCS -t 1m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:ncpus=1:mem=8g:vmem=16g:tesla=1 6 | 7 | sh -x realscript.sh 2>&1 | tee tesla.log 8 | -------------------------------------------------------------------------------- /02_dataTransRate/tests/dataTransRate_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N dataTransRate 3 | #CCS -t 10m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 
6 | 7 | echo "hallo from $(hostname)" 8 | ../src/dataTransRate 9 | -------------------------------------------------------------------------------- /05_saxpy/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = saxpy 2 | saxpy_SOURCES = saxpy.c \ 3 | check1ns.h \ 4 | check1ns.c \ 5 | hsaxpy.h \ 6 | hsaxpy.c \ 7 | asaxpy.h \ 8 | asaxpy.c \ 9 | wtcalc.h \ 10 | wtcalc.c 11 | -------------------------------------------------------------------------------- /04_scalarAddition/tests/scalarAddition_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N scalarAddition 3 | #CCS -t 10m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:ncpus=1:mem=4g:vmem=8g:gtx1080=2 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/scalarAddition 9 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/02_build/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N build 3 | #CCS -t 600m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:ncpus=16:mem=32g:vmem=32g:tesla=1 6 | 7 | module load system/CUDA/10.1.105 8 | sh -x realscript.sh 2>&1 | tee build.log 9 | -------------------------------------------------------------------------------- /09_matAdd/tests/matAdd_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N matAdd 3 | #CCS -t 600m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/matAdd $((2**12)) 9 | ../src/matAdd $((2**13)) 10 | -------------------------------------------------------------------------------- /05_saxpy/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /09_matAdd/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /10_matMul/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /01_accelQuery/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /03_taskwait/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | 
-------------------------------------------------------------------------------- /08_distThreads/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /02_dataTransRate/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /04_scalarAddition/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /05_saxpy/src/wtcalc.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file wtcalc.c 3 | * 4 | * @brief Global variable for walltime of the calculation kernel. 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 05.04.2020 8 | * @copyright CC BY-SA 2.0 9 | */ 10 | 11 | #include "wtcalc.h" 12 | 13 | double wtcalc; 14 | -------------------------------------------------------------------------------- /04_scalarAddition/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: scalarAddition 3 | author: Xin Wu (PC²) 4 | date: 08.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `scalarAddition` adds two integers on host and accelerator, and also compares 10 | the performance. 11 | 12 | # Usage 13 | 14 | ```bash 15 | scalarAddition 16 | ``` 17 | 18 | -------------------------------------------------------------------------------- /05_saxpy/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = saxpy_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | saxpy_test_00.sh: $(top_srcdir)/src/saxpy 12 | echo "${cmdrun} saxpy_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = saxpy_test_00.sh 16 | -------------------------------------------------------------------------------- /01_accelQuery/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: accelQuery 3 | author: Xin Wu (PC²) 4 | date: 04.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `accelQuery` searches accelerator(s) on a heterogeneous computer. 10 | Accelerator(s), if found, will be enumerated with some basic info. 
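The enumeration logic follows the pattern in `src/accelQuery.c` (shown in full later in this repository); a minimal self-contained sketch, with the `prtAccelInfo()` call replaced by a plain `printf()` so that the snippet compiles on its own, is:

```c
#include <stdio.h>
#include <omp.h>

int main(void)
{
  int naccel = omp_get_num_devices(); /* accelerators visible to the OpenMP runtime */

  if (0 == naccel) return 0;          /* host-only system: nothing to enumerate */
  printf("%d Accelerator(s) found\n", naccel);
  for (int iaccel = 0; iaccel < naccel; iaccel++) {
    printf("accelerator %d\n", iaccel); /* the real code calls prtAccelInfo(iaccel) here */
  }
  return 0;
}
```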
11 | 12 | # Usage 13 | 14 | ```bash 15 | accelQuery 16 | ``` 17 | 18 | -------------------------------------------------------------------------------- /09_matAdd/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = matAdd_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | matAdd_test_00.sh: $(top_srcdir)/src/matAdd 12 | echo "${cmdrun} matAdd_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = matAdd_test_00.sh 16 | -------------------------------------------------------------------------------- /10_matMul/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = matMul_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | matMul_test_00.sh: $(top_srcdir)/src/matMul 12 | echo "${cmdrun} matMul_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = matMul_test_00.sh 16 | -------------------------------------------------------------------------------- /03_taskwait/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: taskwait 3 | author: Xin Wu (PC²) 4 | date: 08.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `taskwait` checks the `taskwait` construct for the deferred target task. At the 10 | time of writing, this hasn't been implemented in the GCC 9.2 compiler. 11 | 12 | # Usage 13 | 14 | ```bash 15 | taskwait 16 | ``` 17 | 18 | -------------------------------------------------------------------------------- /03_taskwait/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = taskwait_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | taskwait_test_00.sh: $(top_srcdir)/src/taskwait 12 | echo "${cmdrun} taskwait_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = taskwait_test_00.sh 16 | -------------------------------------------------------------------------------- /01_accelQuery/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = accelQuery_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | accelQuery_test_00.sh: $(top_srcdir)/src/accelQuery 12 | echo "${cmdrun} accelQuery_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = accelQuery_test_00.sh 16 | -------------------------------------------------------------------------------- /08_distThreads/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = distThreads_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | distThreads_test_00.sh: $(top_srcdir)/src/distThreads 12 | echo "${cmdrun} distThreads_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = distThreads_test_00.sh 16 | -------------------------------------------------------------------------------- /02_dataTransRate/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | 
endif 6 | 7 | check_SCRIPTS = dataTransRate_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | dataTransRate_test_00.sh: $(top_srcdir)/src/dataTransRate 12 | echo "${cmdrun} dataTransRate_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = dataTransRate_test_00.sh 16 | -------------------------------------------------------------------------------- /04_scalarAddition/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = scalarAddition_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | scalarAddition_test_00.sh: $(top_srcdir)/src/scalarAddition 12 | echo "${cmdrun} scalarAddition_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = scalarAddition_test_00.sh 16 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/bugs.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Bugs Found in GCC 3 | author: Xin Wu (PC²) 4 | date: 15.01.2020 5 | --- 6 | 7 | # Asynchronous Offloading Execution 8 | 9 | This has not been fully implemented in GCC. See `03_taskwait`. 10 | 11 | # Limitation of the Number of GPU Threads in a Team 12 | 13 | The number of GPU threads in a team (a contention group) is limited to 8. See 14 | `05_saxpy_v1` and `06_saxpy_v2`. 15 | -------------------------------------------------------------------------------- /02_dataTransRate/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: dataTransRate 3 | author: Xin Wu (PC²) 4 | date: 07.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `dataTransRate` gives the data transfer rate (in MB/sec) from `src` to `dst`. 10 | 11 | The possible situations are: 12 | 13 | * h2h: `src` = host and `dst` = host 14 | * h2a: `src` = host and `dst` = accel 15 | * a2a: `src` = accel and `dst` = accel 16 | 17 | # Usage 18 | 19 | ```bash 20 | export CUDA_LAUNCH_BLOCKING=1 21 | dataTransRate 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /03_taskwait/tests/taskwait_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N taskwait 3 | #CCS -t 10m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:ncpus=1:mem=4g:vmem=8g:gtx1080=2 6 | 7 | echo "hallo from $(hostname)" 8 | if [ 0 -eq 1 ]; then 9 | # 10 | # Asynchronous offloading is not available in GCC 9.2.0. 11 | # 12 | notImpld=$(../src/taskwait 2>&1 | grep "GOMP_OFFLOAD_async_run") 13 | [[ $notImpld =~ "unimplemented" ]] 14 | else 15 | # 16 | # Asynchronous offloading is available in Clang/LLVM 9.0.1. 17 | # 18 | ../src/taskwait 19 | fi 20 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/Clang/bugs.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Bugs Found in Clang/LLVM 3 | author: Xin Wu (PC²) 4 | date: 15.01.2020 5 | --- 6 | 7 | # Activation of Accelerator 8 | 9 | `omp_get_num_devices()` always returns 0 if the accelerator(s) have not been 10 | activated by an OpenMP directive, even though there are accelerator(s) in the 11 | computing system. See `02_dataTransRate`. 12 | 13 | NOTE: This bug has been fixed in Clang 11. But the data transfer rate within 14 | an accelerator's DRAM, e.g. `a2a`, is still much lower than expected.
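For the affected Clang versions, a workaround consistent with the pattern used in `01_accelQuery` (a sketch, not a file from this repository) is to activate the device with an empty `target` region, which initializes the offload runtime before the query:

```c
#include <stdio.h>
#include <omp.h>

int main(void)
{
  /* An empty target region activates the accelerator(s), so that the
   * subsequent query can report the real device count. */
  #pragma omp target
  { }
  printf("%d accelerator(s) found.\n", omp_get_num_devices());
  return 0;
}
```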
15 | -------------------------------------------------------------------------------- /04_scalarAddition/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: scalarAddition 3 | author: Xin Wu (PC²) 4 | date: 08.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `scalarAddition` adds two integers on host and accelerator, and also compares 10 | the performance. 11 | 12 | # Build 13 | 14 | ```bash 15 | autoreconf -i; ./configure; make; make check; sudo make install; 16 | ``` 17 | 18 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS). 19 | 20 | # Documentation 21 | 22 | * docs/html/index.html: Source code documentation generated by Doxygen. 23 | 24 | * docs/UserManual.md: User Manual. 25 | 26 | -------------------------------------------------------------------------------- /05_saxpy/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /05_saxpy/src/wtcalc.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file wtcalc.h 3 | * 4 | * @brief Global variable for walltime of the calculation kernel. 
5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 05.04.2020 8 | * @copyright CC BY-SA 2.0 9 | */ 10 | 11 | #ifdef __cplusplus 12 | extern "C" { 13 | #endif 14 | 15 | #ifndef WTCALC_H 16 | #define WTCALC_H 17 | 18 | /* 19 | * wtcalc: walltime for the calculation kernel 20 | * 21 | * - wtcalc < 0.0: reset and disable the timer 22 | * - wtcalc == 0.0: enable the timer 23 | */ 24 | extern double wtcalc; 25 | 26 | #endif 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | -------------------------------------------------------------------------------- /01_accelQuery/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /03_taskwait/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /09_matAdd/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /10_matMul/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | 
endif 28 | -------------------------------------------------------------------------------- /01_accelQuery/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: accelQuery 3 | author: Xin Wu (PC²) 4 | date: 04.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `accelQuery` searches accelerator(s) on a heterogeneous computer. 10 | Accelerator(s), if found, will be enumerated with some basic info. 11 | 12 | # Build 13 | 14 | ```bash 15 | autoreconf -i; ./configure; make; make check 16 | ``` 17 | 18 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS). 19 | 20 | # Documentation 21 | 22 | * docs/html/index.html: Source code documentation generated by Doxygen. 23 | 24 | * docs/UserManual.md: User Manual. 25 | 26 | -------------------------------------------------------------------------------- /02_dataTransRate/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /04_scalarAddition/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /08_distThreads/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /03_taskwait/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: taskwait 3 | author: Xin Wu (PC²) 4 | date: 08.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `taskwait` checks the `taskwait` construct for the deferred target task. 
At the 10 | time of writing, this hasn't been implemented in the GCC 9.2 compiler. 11 | 12 | # Build 13 | 14 | ```bash 15 | autoreconf -i; ./configure; make; make check; sudo make install; 16 | ``` 17 | 18 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS). 19 | 20 | # Documentation 21 | 22 | * docs/html/index.html: Source code documentation generated by Doxygen. 23 | 24 | * docs/UserManual.md: User Manual. 25 | 26 | -------------------------------------------------------------------------------- /05_saxpy/src/check1ns.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.c 3 | * @brief Function definition for checking 1 ns time resolution on the system. 4 | * 5 | * This source file contains function definition for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 07.01.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #include <stdio.h> 14 | #include <stdlib.h> 15 | #include <time.h> 16 | #include <assert.h> 17 | #include "check1ns.h" 18 | 19 | void check1ns(void) 20 | { 21 | struct timespec res; 22 | 23 | if (0 != clock_getres(CLOCK_REALTIME, &res)) { 24 | printf("error: clock_getres\n"); 25 | exit(EXIT_FAILURE); 26 | } 27 | assert(1l == res.tv_nsec); 28 | } 29 | -------------------------------------------------------------------------------- /02_dataTransRate/src/check1ns.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.c 3 | * @brief Function definition for checking 1 ns time resolution on the system. 4 | * 5 | * This source file contains function definition for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 07.01.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #include <stdio.h> 14 | #include <stdlib.h> 15 | #include <time.h> 16 | #include <assert.h> 17 | #include "check1ns.h" 18 | 19 | void check1ns(void) 20 | { 21 | struct timespec res; 22 | 23 | if (0 != clock_getres(CLOCK_REALTIME, &res)) { 24 | printf("error: clock_getres\n"); 25 | exit(EXIT_FAILURE); 26 | } 27 | assert(1l == res.tv_nsec); 28 | } 29 | -------------------------------------------------------------------------------- /04_scalarAddition/src/check1ns.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.c 3 | * @brief Function definition for checking 1 ns time resolution on the system. 4 | * 5 | * This source file contains function definition for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 07.01.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #include <stdio.h> 14 | #include <stdlib.h> 15 | #include <time.h> 16 | #include <assert.h> 17 | #include "check1ns.h" 18 | 19 | void check1ns(void) 20 | { 21 | struct timespec res; 22 | 23 | if (0 != clock_getres(CLOCK_REALTIME, &res)) { 24 | printf("error: clock_getres\n"); 25 | exit(EXIT_FAILURE); 26 | } 27 | assert(1l == res.tv_nsec); 28 | } 29 | -------------------------------------------------------------------------------- /08_distThreads/src/gpuThreads.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file gpuThreads.h 3 | * @brief Function prototype for organizing GPU threads. 4 | * 5 | * This header file contains function prototype for organizing GPU threads.
6 | * 7 | * @author Xin Wu (PC²) 8 | * @date 12.03.2020 9 | * @copyright CC BY-SA 2.0 10 | */ 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | #ifndef GPUTHREADS_H 17 | #define GPUTHREADS_H 18 | 19 | void gpuThreads(int i); 20 | /**< 21 | * @brief Show the organization of GPU threads. 22 | * 23 | * The ith organization of GPU threads is shown. 24 | * 25 | * @param i The ith organization. 26 | * 27 | * @return \c void. 28 | */ 29 | 30 | #endif 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | -------------------------------------------------------------------------------- /02_dataTransRate/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: dataTransRate 3 | author: Xin Wu (PC²) 4 | date: 07.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `dataTransRate` gives the data transfer rate (in MB/sec) from `src` to `dst`. 10 | 11 | The possible situations are: 12 | 13 | * h2h: `src` = host and `dst` = host 14 | * h2a: `src` = host and `dst` = accel 15 | * a2a: `src` = accel and `dst` = accel 16 | 17 | # Build 18 | 19 | ```bash 20 | autoreconf -i; ./configure; make; make check 21 | ``` 22 | 23 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS). 24 | 25 | # Documentation 26 | 27 | * docs/html/index.html: Source code documentation generated by Doxygen. 28 | 29 | * docs/UserManual.md: User Manual. 30 | 31 | -------------------------------------------------------------------------------- /01_accelQuery/src/prtAccelInfo.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file prtAccelInfo.h 3 | * @brief Function prototype for prtAccelInfo. 4 | * 5 | * This header file contains function prototype for prtAccelInfo. 6 | * 7 | * @author Xin Wu (PC²) 8 | * @date 04.01.2020 9 | * @copyright CC BY-SA 2.0 10 | */ 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | #ifndef PRTACCELINFO_H 17 | #define PRTACCELINFO_H 18 | 19 | void prtAccelInfo(int iaccel); 20 | /**< 21 | * @brief Print some basic info of an accelerator. 22 | * 23 | * Strictly speaking, \c prtAccelInfo() can only print the basic info of an 24 | * Nvidia CUDA device. 25 | * 26 | * @param iaccel The index of an accelerator. 27 | * 28 | * @return \c void. 29 | */ 30 | 31 | #endif 32 | 33 | #ifdef __cplusplus 34 | } 35 | #endif 36 | -------------------------------------------------------------------------------- /simplifiedCode/01_accelQuery/accelQuery.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file accelQuery.c 3 | * 4 | * @brief accelQuery searches accelerator(s) on a heterogeneous computer. 5 | * 6 | * Host-only: 7 | * gcc -Wall -fopenmp -foffload=disable accelQuery.c 8 | * 9 | * Offload to GPU: 10 | * gcc -Wall -fopenmp -foffload=nvptx-none accelQuery.c 11 | * 12 | */ 13 | 14 | #include <stdio.h> 15 | #include <omp.h> 16 | 17 | int main(void) 18 | { 19 | #pragma omp target 20 | { 21 | if (omp_is_initial_device()) { 22 | printf("Hello World from Host.\n"); 23 | } else { 24 | printf("Hello World from Accelerator.\n"); 25 | } 26 | /* 27 | * Question: Why may this give the _wrong_ number of accelerators?
FIXME 28 | */ 29 | printf("%d accelerator found.\n", omp_get_num_devices()); 30 | } 31 | return 0; 32 | } 33 | -------------------------------------------------------------------------------- /05_saxpy/src/check1ns.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.h 3 | * @brief Function prototype for checking 1 ns time resolution on the system. 4 | * 5 | * This header file contains function prototype for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 07.01.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | #ifndef CHECK1NS_H 18 | #define CHECK1NS_H 19 | 20 | void check1ns(void); 21 | /**< 22 | * @brief Check whether 1 ns time resolution is available on the system. 23 | * 24 | * We need 1 ns time resolution. If it's available, program continues normally. 25 | * Otherwise, program terminates. 26 | * 27 | * @return \c void. 28 | */ 29 | 30 | #endif 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | -------------------------------------------------------------------------------- /02_dataTransRate/src/check1ns.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.h 3 | * @brief Function prototype for checking 1 ns time resolution on the system. 4 | * 5 | * This header file contains function prototype for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 12.03.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | #ifndef CHECK1NS_H 18 | #define CHECK1NS_H 19 | 20 | void check1ns(void); 21 | /**< 22 | * @brief Check whether 1 ns time resolution is available on the system. 23 | * 24 | * We need 1 ns time resolution. If it's available, program continues normally. 25 | * Otherwise, program terminates. 26 | * 27 | * @return \c void. 28 | */ 29 | 30 | #endif 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | -------------------------------------------------------------------------------- /04_scalarAddition/src/check1ns.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.h 3 | * @brief Function prototype for checking 1 ns time resolution on the system. 4 | * 5 | * This header file contains function prototype for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 07.01.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | #ifndef CHECK1NS_H 18 | #define CHECK1NS_H 19 | 20 | void check1ns(void); 21 | /**< 22 | * @brief Check whether 1 ns time resolution is available on the system. 23 | * 24 | * We need 1 ns time resolution. If it's available, program continues normally. 25 | * Otherwise, program terminates. 26 | * 27 | * @return \c void. 28 | */ 29 | 30 | #endif 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | -------------------------------------------------------------------------------- /03_taskwait/src/taskwait.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file taskwait.c 3 | * 4 | * @mainpage taskwait 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 08.01.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * taskwait checks the taskwait construct for the deferred target task. 
At the 11 | * time of writing, this hasn't been implemented in the GCC 9.2 compiler. 12 | */ 13 | 14 | #include <assert.h> 15 | #ifdef _OPENMP 16 | #include <omp.h> 17 | #endif 18 | 19 | /** 20 | * @brief Main entry point for taskwait. 21 | */ 22 | int main(int argc, char *argv[]) 23 | { 24 | int a, b, c, 25 | x, y, z; 26 | 27 | a = x = 2; 28 | b = y = 4; 29 | #pragma omp target map(a, b, c) nowait 30 | { 31 | c = a + b; /* This is executed on accelerator. */ 32 | } 33 | z = x + y; /* This is executed on host. */ 34 | #pragma omp taskwait 35 | assert(c == z); 36 | return 0; 37 | } 38 | -------------------------------------------------------------------------------- /05_saxpy/tests/saxpy_real_00.sh.5422320.out: -------------------------------------------------------------------------------- 1 | hallo from gpu029 2 | The system supports 1 ns time resolution 3 | total size of x and y is 512.0 MB 4 | tests are averaged over 32 loops 5 | saxpy on host (0) : 65092.0 MB/s 65093.6 MB/s maxabserr = 0.0 6 | saxpy on host (1) : 70769.4 MB/s 70772.0 MB/s maxabserr = 0.0 7 | saxpy on accl (1) : 1400.7 MB/s 4648.9 MB/s maxabserr = 0.0 8 | saxpy on accl (2) : 1371.6 MB/s 4653.9 MB/s maxabserr = 0.0 9 | saxpy on accl (3) : 2046.7 MB/s 227586.6 MB/s maxabserr = 0.0 10 | saxpy on accl (4) : 2062.4 MB/s 224540.3 MB/s maxabserr = 0.0 11 | saxpy on accl (5) : 2073.9 MB/s 276659.5 MB/s maxabserr = 0.0 12 | saxpy on accl (6) : 2045.0 MB/s 271431.4 MB/s maxabserr = 0.0 13 | saxpy on accl (7) : 2025.2 MB/s 280631.7 MB/s maxabserr = 0.0 14 | saxpy on accl (8) : 2025.7 MB/s 279577.4 MB/s maxabserr = 0.0 15 | -------------------------------------------------------------------------------- /10_matMul/tests/matMul_real_00.sh.5422392.out: -------------------------------------------------------------------------------- 1 | hallo from gpu028 2 | matrix dim: 4096 x 4096 3 | time averaged over 16 loops 4 | matMulAB (0) : 24.9 GFLOPS 25.5 GFLOPS maxabserr = 0.0 5 | matMulAB (1) : 9.8 GFLOPS 9.9 GFLOPS maxabserr = 0.0 6 | matMulAB (2) : 184.5 GFLOPS 228.9 GFLOPS maxabserr = 0.0 7 | matMulAB (3) : 5.0 GFLOPS 5.1 GFLOPS maxabserr = 1018.4 8 | matMulAB (4) : 176.1 GFLOPS 216.2 GFLOPS maxabserr = 0.0 9 | matMulAB (5) : 340.9 GFLOPS 531.9 GFLOPS maxabserr = 0.0 10 | matMulAB (6) : 610.3 GFLOPS 1708.9 GFLOPS maxabserr = 0.0 11 | matMulAB (7) : 218.9 GFLOPS 284.6 GFLOPS maxabserr = 0.0 12 | matMulAB (8) : 233.8 GFLOPS 310.4 GFLOPS maxabserr = 0.0 13 | matMulAB (9) : 254.5 GFLOPS 348.1 GFLOPS maxabserr = 0.0 14 | matMulAB (10) : 931.6 GFLOPS 10126.1 GFLOPS maxabserr = 0.0 15 | -------------------------------------------------------------------------------- /09_matAdd/src/matAddAB.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matAddAB.h 3 | * 4 | * @brief Function prototype for matrix addition (A += B) in single-precision. 5 | * 6 | * This header file contains function prototype for matrix addition (A += B) 7 | * in single-precision. 8 | * 9 | * @author Xin Wu (PC²) 10 | * @date 07.02.2020 11 | * @copyright CC BY-SA 2.0 12 | */ 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | #ifndef MATADDAB_H 19 | #define MATADDAB_H 20 | 21 | void matAddAB_accl(float *a, 22 | float *b, 23 | int n, 24 | int ial); 25 | /**< 26 | * @brief Perform matrix addition (A += B) on accl. 27 | * 28 | * @return \c void.
29 | */ 30 | 31 | /* 32 | * wtcalc: walltime for the calculation kernel on GPU 33 | * 34 | * - wtcalc < 0.0: reset and disable the timer 35 | * - wtcalc == 0.0: enable the timer 36 | */ 37 | extern double wtcalc; 38 | 39 | #endif 40 | 41 | #ifdef __cplusplus 42 | } 43 | #endif 44 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/01_download/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # nvptx-tools 4 | # 5 | echo "nvptx-tools" 6 | git clone https://github.com/MentorEmbedded/nvptx-tools.git 7 | cd nvptx-tools 8 | git checkout -b gcc9_gpu 5f6f343a302d620b0868edab376c00b15741e39e 9 | cd .. 10 | # 11 | # nvptx-newlib 12 | # 13 | echo "nvptx-newlib" 14 | git clone https://github.com/MentorEmbedded/nvptx-newlib.git 15 | cd nvptx-newlib 16 | git checkout -b gcc9_gpu 66dd175a9d3aea387715f00ff18ef7e535cd1272 17 | cd .. 18 | # 19 | # openacc-gcc-9-branch 20 | # 21 | echo "openacc-gcc-9-branch" 22 | wget https://github.com/gcc-mirror/gcc/archive/gcc-9_2_0-release.tar.gz 23 | tar xf gcc-9_2_0-release.tar.gz 24 | cd gcc-gcc-9_2_0-release 25 | ./contrib/download_prerequisites 26 | ln -s ../nvptx-newlib/newlib newlib 27 | cd .. 28 | # 29 | # Done 30 | # 31 | echo "Done" 32 | -------------------------------------------------------------------------------- /10_matMul/src/matMulAB.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matMulAB.h 3 | * 4 | * @brief Function prototype for matrix multiplication in single-precision. 5 | * 6 | * This header file contains function prototype for matrix multiplication 7 | * in single-precision. 8 | * 9 | * @author Xin Wu (PC²) 10 | * @date 07.02.2020 11 | * @copyright CC BY-SA 2.0 12 | */ 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | #ifndef MATMULAB_H 19 | #define MATMULAB_H 20 | 21 | void matMulAB_accl(float *a, 22 | float *b, 23 | float *c, 24 | int n, 25 | int ial); 26 | /**< 27 | * @brief Perform matrix multiplication on accl. 28 | * 29 | * @return \c void. 30 | */ 31 | 32 | /* 33 | * wtcalc: walltime for the calculation kernel on GPU 34 | * 35 | * - wtcalc < 0.0: reset and disable the timer 36 | * - wtcalc == 0.0: enable the timer 37 | */ 38 | extern double wtcalc; 39 | 40 | #endif 41 | 42 | #ifdef __cplusplus 43 | } 44 | #endif 45 | -------------------------------------------------------------------------------- /01_accelQuery/src/accelQuery.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file accelQuery.c 3 | * 4 | * @mainpage accelQuery 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 04.01.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * accelQuery searches accelerator(s) on a heterogeneous computer. 11 | * Accelerator(s), if found, will be enumerated with some basic info. 12 | */ 13 | 14 | #include <stdio.h> 15 | #ifdef _OPENMP 16 | #include <omp.h> 17 | #endif 18 | #include "prtAccelInfo.h" 19 | 20 | /** 21 | * @brief Main entry point for accelQuery. 22 | */ 23 | int main(int argc, char *argv[]) 24 | { 25 | int iaccel, naccel; 26 | 27 | /* 28 | * NOTE: The behavior of an OpenMP API function may be defined differently 29 | * for inside and outside of the target region.
30 | */ 31 | #pragma omp target 32 | { 33 | if (omp_is_initial_device()) { 34 | printf("Hello World from Host.\n"); 35 | } else { 36 | printf("Hello World from Accelerator(s).\n"); 37 | } 38 | } 39 | // no accelerator 40 | if (0 == (naccel = omp_get_num_devices())) return 0; 41 | // one or more accelerator(s) 42 | printf("\n%d Accelerator(s) found\n", naccel); 43 | for (iaccel = 0; iaccel < naccel; iaccel++) { 44 | prtAccelInfo(iaccel); 45 | } 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /09_matAdd/tests/matAdd_real_00.sh.5422334.out: -------------------------------------------------------------------------------- 1 | hallo from gpu028 2 | matrix dim: 4096 x 4096 3 | time averaged over 64 loops 4 | matAddAB (0) : 1.9 GB/s 86.6 GB/s maxabserr = 0.0 5 | matAddAB (1) : 1.6 GB/s 35.7 GB/s maxabserr = 0.0 6 | matAddAB (2) : 1.6 GB/s 48.1 GB/s maxabserr = 0.0 7 | matAddAB (3) : 1.7 GB/s 166.6 GB/s maxabserr = 0.0 8 | matAddAB (4) : 2.0 GB/s 183.3 GB/s maxabserr = 0.0 9 | matAddAB (5) : 1.9 GB/s 183.7 GB/s maxabserr = 0.0 10 | matAddAB (6) : 1.9 GB/s 185.3 GB/s maxabserr = 0.0 11 | matAddAB (7) : 1.8 GB/s 185.4 GB/s maxabserr = 0.0 12 | matrix dim: 8192 x 8192 13 | time averaged over 64 loops 14 | matAddAB (0) : 1.9 GB/s 172.2 GB/s maxabserr = 0.0 15 | matAddAB (1) : 1.9 GB/s 34.0 GB/s maxabserr = 0.0 16 | matAddAB (2) : 1.6 GB/s 8.4 GB/s maxabserr = 0.0 17 | matAddAB (3) : 1.9 GB/s 265.8 GB/s maxabserr = 0.0 18 | matAddAB (4) : 1.9 GB/s 265.4 GB/s maxabserr = 0.0 19 | matAddAB (5) : 1.9 GB/s 265.8 GB/s maxabserr = 0.0 20 | matAddAB (6) : 1.9 GB/s 264.9 GB/s maxabserr = 0.0 21 | matAddAB (7) : 1.9 GB/s 269.0 GB/s maxabserr = 0.0 22 | -------------------------------------------------------------------------------- /05_saxpy/src/hsaxpy.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file hsaxpy.h 3 | * @brief Function prototype for performing the \c saxpy operation on host. 4 | * 5 | * This header file contains function prototype for the \c saxpy operation, 6 | * which is defined as: 7 | * 8 | * y := a * x + y 9 | * 10 | * where: 11 | * 12 | * - a is a scalar. 13 | * - x and y are single-precision vectors each with n elements. 14 | * 15 | * @author Xin Wu (PC²) 16 | * @date 05.04.2020 17 | * @copyright CC BY-SA 2.0 18 | */ 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #ifndef HSAXPY_H 25 | #define HSAXPY_H 26 | 27 | void hsaxpy(const int n, 28 | const float a, 29 | const float *x, 30 | float *y, 31 | const int ial); 32 | /**< 33 | * @brief Performs the \c saxpy operation on host. 34 | * 35 | * The \c saxpy operation is defined as: 36 | * 37 | * y := a * x + y 38 | * 39 | * where: 40 | * 41 | * - a is a scalar. 42 | * - x and y are single-precision vectors each with n elements. 43 | * 44 | * @param n The number of elements in \p x and \p y. 45 | * @param a The scalar for multiplication. 46 | * @param x The vector \p x in \c saxpy. 47 | * @param y The vector \p y in \c saxpy. 48 | * @param ial The ial-th implementation. 49 | * 50 | * @return \c void. 51 | */ 52 | 53 | #endif 54 | 55 | #ifdef __cplusplus 56 | } 57 | #endif 58 | -------------------------------------------------------------------------------- /05_saxpy/src/asaxpy.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file asaxpy.h 3 | * @brief Function prototype for performing the \c saxpy operation on accelerator.
4 | * 5 | * This header file contains function prototype for the \c saxpy operation, 6 | * which is defined as: 7 | * 8 | * y := a * x + y 9 | * 10 | * where: 11 | * 12 | * - a is a scalar. 13 | * - x and y are single-precision vectors each with n elements. 14 | * 15 | * @author Xin Wu (PC²) 16 | * @date 05.04.2020 17 | * @copyright CC BY-SA 2.0 18 | */ 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #ifndef ASAXPY_H 25 | #define ASAXPY_H 26 | 27 | void asaxpy(const int n, 28 | const float a, 29 | const float *x, 30 | float *y, 31 | const int ial); 32 | /**< 33 | * @brief Performs the \c saxpy operation on accelerator. 34 | * 35 | * The \c saxpy operation is defined as: 36 | * 37 | * y := a * x + y 38 | * 39 | * where: 40 | * 41 | * - a is a scalar. 42 | * - x and y are single-precision vectors each with n elements. 43 | * 44 | * @param n The number of elements in \p x and \p y. 45 | * @param a The scalar for multiplication. 46 | * @param x The vector \p x in \c saxpy. 47 | * @param y The vector \p y in \c saxpy. 48 | * @param ial The ial-th implementation. 49 | * 50 | * @return \c void. 51 | */ 52 | 53 | #endif 54 | 55 | #ifdef __cplusplus 56 | } 57 | #endif 58 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/00_check_gpu/tesla.log: -------------------------------------------------------------------------------- 1 | ++ hostname 2 | + echo 'Now, on gpu003' 3 | Now, on gpu003 4 | + nvidia-smi 5 | Tue Dec 17 08:34:20 2019 6 | +-----------------------------------------------------------------------------+ 7 | | NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 | 8 | |-------------------------------+----------------------+----------------------+ 9 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 10 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 11 | |===============================+======================+======================| 12 | | 0 Tesla K20Xm Off | 00000000:84:00.0 Off | 0 | 13 | | N/A 28C P0 59W / 235W | 0MiB / 5700MiB | 81% Default | 14 | +-------------------------------+----------------------+----------------------+ 15 | 16 | +-----------------------------------------------------------------------------+ 17 | | Processes: GPU Memory | 18 | | GPU PID Type Process name Usage | 19 | |=============================================================================| 20 | | No running processes found | 21 | +-----------------------------------------------------------------------------+ 22 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/Clang/00_check_gpu/tesla.log: -------------------------------------------------------------------------------- 1 | ++ hostname 2 | + echo 'Now, on gpu003' 3 | Now, on gpu003 4 | + nvidia-smi 5 | Tue Dec 17 08:34:20 2019 6 | +-----------------------------------------------------------------------------+ 7 | | NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 | 8 | |-------------------------------+----------------------+----------------------+ 9 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 10 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M.
| 11 | |===============================+======================+======================| 12 | | 0 Tesla K20Xm Off | 00000000:84:00.0 Off | 0 | 13 | | N/A 28C P0 59W / 235W | 0MiB / 5700MiB | 81% Default | 14 | +-------------------------------+----------------------+----------------------+ 15 | 16 | +-----------------------------------------------------------------------------+ 17 | | Processes: GPU Memory | 18 | | GPU PID Type Process name Usage | 19 | |=============================================================================| 20 | | No running processes found | 21 | +-----------------------------------------------------------------------------+ 22 | -------------------------------------------------------------------------------- /05_saxpy/src/hsaxpy.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file hsaxpy.c 3 | * @brief Function definition for performing the \c saxpy operation on host. 4 | * 5 | * This source file contains function definition for the \c saxpy operation, 6 | * which is defined as: 7 | * 8 | * y := a * x + y 9 | * 10 | * where: 11 | * 12 | * - a is a scalar. 13 | * - x and y are single-precision vectors each with n elements. 14 | * 15 | * @author Xin Wu (PC²) 16 | * @date 05.04.2020 17 | * @copyright CC BY-SA 2.0 18 | */ 19 | 20 | #include <time.h> 21 | #ifdef _OPENMP 22 | #include <omp.h> 23 | #endif 24 | #include "mkl.h" 25 | #include "wtcalc.h" 26 | #include "hsaxpy.h" 27 | 28 | void hsaxpy(const int n, 29 | const float a, 30 | const float *x, 31 | float *y, 32 | const int ial) 33 | { 34 | struct timespec rt[2]; 35 | 36 | switch (ial) { 37 | case 0: 38 | /* 39 | * - naive implementation 40 | */ 41 | clock_gettime(CLOCK_REALTIME, rt + 0); 42 | #pragma omp parallel for simd schedule(simd:static) \ 43 | default(none) shared(a, n, x, y) 44 | for (int i = 0; i < n; i++) { 45 | y[i] = a * x[i] + y[i]; 46 | } 47 | clock_gettime(CLOCK_REALTIME, rt + 1); 48 | break; 49 | default: 50 | /* 51 | * - saxpy in MKL 52 | */ 53 | clock_gettime(CLOCK_REALTIME, rt + 0); 54 | cblas_saxpy(n, a, x, 1, y, 1); 55 | clock_gettime(CLOCK_REALTIME, rt + 1); 56 | break; 57 | } /* end switch (ial) */ 58 | if (wtcalc >= 0.0) { 59 | wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /simplifiedCode/04_scalarAddition/scalarAddition.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file scalarAddition.c 3 | * 4 | * @brief scalarAddition adds two integers on host and accelerator, and also 5 | * compares the performance. 6 | * 7 | * Offload to GPU: 8 | * gcc -Wall -fopenmp -foffload=nvptx-none scalarAddition.c 9 | * 10 | */ 11 | 12 | #include <stdio.h> 13 | #include <stdlib.h> 14 | #include <time.h> 15 | #include <assert.h> 16 | 17 | /** 18 | * @brief Main entry point for scalarAddition.
19 | */
20 | int main(int argc, char *argv[])
21 | {
22 | /*
23 | * data on host
24 | */
25 | int a, b, c, // c = a + b;
26 | y, z; // z = x + y; (x in device data environment)
27 | struct timespec rt[2];
28 | 
29 | /*
30 | * scalar addition on host
31 | */
32 | clock_gettime(CLOCK_REALTIME, rt + 0);
33 | a = 2;
34 | b = 4;
35 | c = a + b;
36 | clock_gettime(CLOCK_REALTIME, rt + 1);
37 | printf("scalar addition on host: %12.9f s\n",
38 | (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec));
39 | /*
40 | * scalar addition on accelerator
41 | */
42 | y = 4;
43 | clock_gettime(CLOCK_REALTIME, rt + 0);
44 | #pragma omp target map(to:y) map(from:z)
45 | {
46 | int x; // only accessible from accelerator
47 | x = 2;
48 | z = x + y;
49 | }
50 | clock_gettime(CLOCK_REALTIME, rt + 1);
51 | printf("scalar addition on accelerator: %12.9f s\n",
52 | (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec));
53 | /*
54 | * Question: How to measure the walltime for H-A data transfer rate? FIXME
55 | * Question: How to measure the walltime for a kernel launch on GPU? FIXME
56 | * Question: How to monitor this tiny calculation on GPU? FIXME
57 | */
58 | /*
59 | * check the result
60 | */
61 | assert(c == z);
62 | return 0;
63 | }
64 | 
--------------------------------------------------------------------------------
/04_scalarAddition/src/scalarAddition.c:
--------------------------------------------------------------------------------
1 | /**
2 | * @file scalarAddition.c
3 | *
4 | * @mainpage scalarAddition
5 | *
6 | * @author Xin Wu (PC²)
7 | * @date 08.01.2020
8 | * @copyright CC BY-SA 2.0
9 | *
10 | * scalarAddition adds two integers on host and accelerator, and also compares
11 | * the performance.
12 | *
13 | */
14 | 
15 | #include <assert.h>
16 | #include <stdio.h>
17 | #include <stdlib.h>
18 | #include <time.h>
19 | #ifdef _OPENMP
20 | #include <omp.h>
21 | #endif
22 | #include "check1ns.h"
23 | 
24 | /**
25 | * @brief Main entry point for scalarAddition.
26 | */
27 | int main(int argc, char *argv[])
28 | {
29 | /*
30 | * data on host
31 | */
32 | int a, b, c, // c = a + b;
33 | y, z; // z = x + y; (x in device data environment)
34 | struct timespec rt[2];
35 | 
36 | /*
37 | * We need 1 ns time resolution.
38 | */
39 | check1ns();
40 | printf("The system supports 1 ns time resolution\n");
41 | /*
42 | * check the number of accelerators
43 | */
44 | if (0 == omp_get_num_devices()) {
45 | printf("No accelerator found ...
exit\n"); 46 | exit(EXIT_FAILURE); 47 | } 48 | /* 49 | * scalar addition on host 50 | */ 51 | clock_gettime(CLOCK_REALTIME, rt + 0); 52 | a = 2; 53 | b = 4; 54 | c = a + b; 55 | clock_gettime(CLOCK_REALTIME, rt + 1); 56 | printf("scalar addition on host: %12.9f s\n", 57 | (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec)); 58 | /* 59 | * scalar addition on accelerator 60 | */ 61 | y = 4; 62 | clock_gettime(CLOCK_REALTIME, rt + 0); 63 | #pragma omp target map(to:y) map(from:z) 64 | { 65 | int x; // only accessible from accelerator 66 | x = 2; 67 | z = x + y; 68 | } 69 | clock_gettime(CLOCK_REALTIME, rt + 1); 70 | printf("scalar addition on accelerator: %12.9f s\n", 71 | (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec)); 72 | /* 73 | * check the result 74 | */ 75 | assert(c == z); 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /08_distThreads/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: distThreads 3 | author: Xin Wu (PC²) 4 | date: 12.03.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `distThreads` demonstrates the organization of threads and teams in a league on 10 | GPU. 11 | 12 | * Column-major is assumed thru the entire code! 13 | 14 | * The following tables only summarize the most important points. For more 15 | details on the ith organization of the GPU threads see comments in 16 | `gpuThreads.c`. 17 | 18 | | i | matrix league | GPU threads | 19 | |:-:|:---------------:|:-------------------:| 20 | | | nrow x ncol | nthrds x lteams | 21 | | 0 | 3 x 5 | 3 x 5 | 22 | | 1 | 3 x 5 | 3 x 5 | 23 | | 2 | 3 x 5 | 3 x 5 | 24 | | 3 | 3 x 5 | 3 x 5 | 25 | | 4 | 7 x 7 | 3 x 5 | 26 | | 5 | 7 x 7 | 3 x 5 | 27 | | 6 | 12 x 6 | 3 x 6 | 28 | | 7 | 12 x 6 | 3 x 6 | 29 | | 8 | 12 x 6 | 3 x 3 | 30 | 31 | | i | Remarks | 32 | |:-:|:----------------------------------------------------------------| 33 | | 0 | Used as Reference. No loop at all. | 34 | | 1 | Incorrect nested loop impl. | 35 | | 2 | Correct impl. Manually linearized loop. | 36 | | 3 | Correct impl. Nested loop with collapse(2). | 37 | | 4 | Irreg. matrix. Default chunk_size. Some GPU threads are idle. | 38 | | 5 | Irreg. matrix. chunk_size = nthrds. Better performance. | 39 | | 6 | CPU-like 2x irow-loop unrolling. Uncoalesced GPU memory access. | 40 | | 7 | 2x irow-loop unrolling. Nested loop with collapse(3). | 41 | | | Coalesced GPU memory access. | 42 | | 8 | 2x icol-loop unrolling. 2x irow-loop unrolling. | 43 | | | Nested loop with collapse(3). Best Performance. | 44 | 45 | # Usage 46 | 47 | ```bash 48 | distThreads 49 | ``` 50 | 51 | -------------------------------------------------------------------------------- /05_saxpy/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: saxpy 3 | author: Xin Wu (PC²) 4 | date: 05.04.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `saxpy` performs the `saxpy` operation on host as well as accelerator. 10 | The performance (in MB/s) for different implementations is also compared. 11 | 12 | The `saxpy` operation is defined as: 13 | 14 | $$ y := a * x + y $$ 15 | 16 | where: 17 | 18 | * `a` is a scalar. 19 | * `x` and `y` are single-precision vectors each with n elements. 20 | * For testing n is assumed to be $2^{26}$. 21 | * The following table only summarizes the most important points. 
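In the accelerator table, `<<<a, b>>>` is CUDA-style launch notation for `a` teams of `b` threads each; e.g. `<<<2^15, 2^7>>>` with 16x loop unrolling covers all $2^{15} \cdot 2^7 \cdot 16 = 2^{26}$ elements.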
For more
22 | details on the ial-th implementation see comments in `hsaxpy.c` (on host)
23 | and `asaxpy.c` (on accelerator).
24 | 
25 | - on host
26 | 
27 | | ial | Remarks |
28 | |:---:|------------------------------------------------------------------------|
29 | | 0 | naive implementation |
30 | | 1 | saxpy in MKL |
31 | 
32 | - on accl
33 | 
34 | | ial | Remarks |
35 | |:---:|------------------------------------------------------------------------|
36 | | 0 | <<<2^0 , 2^0 >>>, TOO SLOW! not tested |
37 | | 1 | <<<2^0 , 2^7 >>>, auto scheduling |
38 | | 2 | <<<2^7 , 2^0 >>>, auto scheduling |
39 | | 3 | <<<2^7 , 2^7 >>>, auto scheduling |
40 | | 4 | <<<2^16, 2^10>>>, manual scheduling |
41 | | 5 | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling |
42 | | | (2^15*2^7*16==2^26) |
43 | | 6 | <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling |
44 | | 7 | de-linearize the vector gives slightly better performance than CUBLAS |
45 | | 8 | cublasSaxpy in CUBLAS |
46 | 
47 | # Usage
48 | 
49 | ```bash
50 | saxpy
51 | ```
52 | 
53 | 
--------------------------------------------------------------------------------
/09_matAdd/docs/UserManual.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: matAdd
3 | author: Xin Wu (PC²)
4 | date: 12.03.2020
5 | ---
6 | 
7 | # Introduction
8 | 
9 | `matAdd` performs matrix addition (A += B) in single-precision on GPU.
10 | The performance (in GB/s) for different implementations is compared and
11 | the numerical results are also verified.
12 | 
13 | * Column-major is assumed thru the entire code!
14 | 
15 | * For testing, the dimensions of all matrices are assumed to be 4096 x 4096.
16 | 
17 | * The following table only summarizes the most important points. For more
18 | details on the ial-th OpenMP GPU implementation see comments in `matAddAB.c`.
19 | 
20 | | ial | Remarks |
21 | |:---:|------------------------------------------------------------------------|
22 | | 0 | ij-loop, 2^9 threads * 2^3 teams, |
23 | | | coalesced memory access |
24 | | 1 | ji-loop, 2^9 threads * 2^3 teams, |
25 | | | uncoalesced memory access |
26 | | 2 | ij-loop, 2^9 threads * 2^f teams, collapse(2), |
27 | | | uncoalesced memory access |
28 | | 3 | ji-loop, 2^9 threads * 2^f teams, collapse(2), |
29 | | | coalesced memory access |
30 | | 4 | ji-loop, 2^8 threads * 2^f teams, collapse(3), |
31 | | | 2x i-loop unrolling (stride of 2^8 rows) |
32 | | 5 | ji-loop, 2^8 threads * 2^f teams, collapse(2), |
33 | | | 2x i-loop unrolling (stride of n/2 rows) |
34 | | 6 | ji-loop, 2^8 threads * 2^e teams, collapse(3), |
35 | | | 2x i-loop unrolling (stride of 2^8 rows), |
36 | | | 2x j-loop unrolling (stride of 1 col ) |
37 | | 7 | cublasSaxpy in CUBLAS |
38 | 
39 | # Usage
40 | 
41 | ```bash
42 | matAdd $((2**12))
43 | ```
44 | 
45 | 
--------------------------------------------------------------------------------
/00_build_OpenMP_offload/GCC/02_build/realscript.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # clean up and copy
4 | #
5 | echo "Copy files ..."
6 | for i in gcc-gcc-9_2_0-release nvptx-newlib nvptx-tools; do
7 | echo $i
8 | rm -fr $i
9 | cp -afr /scratch/pc2-mitarbeiter/xinwu/GCC_OpenMP_OpenACC/01_download/$i .
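  # Each source tree is removed and re-copied from the 01_download cache
  # above, so that every build starts from a pristine checkout.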
10 | done 11 | echo "Finish copy files" 12 | # 13 | # environment variables 14 | # 15 | TARGSYS=$(gcc-gcc-9_2_0-release/config.guess) 16 | CUDADIR=/cm/shared/apps/pc2/EB-SW/software/system/CUDA/10.1.105 17 | ##INSTDIR=/scratch/pc2-mitarbeiter/xinwu/GCC_OpenMP_OpenACC/99_gcc9_gpu 18 | INSTDIR=/cm/shared/apps/pc2/GCC/9.2.0-offload 19 | # 20 | # nvptx-tools 21 | # 22 | echo "build nvptx-tools ..." 23 | cd nvptx-tools 24 | ./configure \ 25 | --with-cuda-driver-include=$CUDADIR/include \ 26 | --with-cuda-driver-lib=$CUDADIR/lib64 \ 27 | --prefix=$INSTDIR 28 | make 29 | make install 30 | cd .. 31 | echo "Finish build nvptx-tools" 32 | # 33 | # Accel_GCC 34 | # 35 | echo "build Accel_GCC ..." 36 | mkdir Accel_GCC 37 | cd Accel_GCC 38 | ../gcc-gcc-9_2_0-release/configure \ 39 | --target=nvptx-none \ 40 | --enable-as-accelerator-for=$TARGSYS \ 41 | --with-build-time-tools=$INSTDIR/nvptx-none/bin \ 42 | --disable-sjlj-exceptions \ 43 | --enable-newlib-io-long-long \ 44 | --enable-languages="c,c++,fortran,lto" \ 45 | --prefix=$INSTDIR 46 | make -j16 47 | make install 48 | cd .. 49 | echo "Finish build Accel_GCC" 50 | # 51 | # Host_GCC 52 | # 53 | echo "Host_GCC ..." 54 | mkdir Host_GCC 55 | cd Host_GCC 56 | ../gcc-gcc-9_2_0-release/configure \ 57 | --enable-offload-targets=nvptx-none \ 58 | --with-cuda-driver-include=$CUDADIR/include \ 59 | --with-cuda-driver-lib=$CUDADIR/lib64 \ 60 | --disable-bootstrap \ 61 | --disable-multilib \ 62 | --enable-languages="c,c++,fortran,lto" \ 63 | --prefix=$INSTDIR 64 | make -j16 65 | make install 66 | cd .. 67 | echo "Finish Host_GCC" 68 | # 69 | # Done 70 | # 71 | echo "Done" 72 | -------------------------------------------------------------------------------- /08_distThreads/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: distThreads 3 | author: Xin Wu (PC²) 4 | date: 12.03.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `distThreads` demonstrates the organization of threads and teams in a league on 10 | GPU. 11 | 12 | * Column-major is assumed thru the entire code! 13 | 14 | * The following tables only summarize the most important points. For more 15 | details on the ith organization of the GPU threads see comments in 16 | `gpuThreads.c`. 17 | 18 | | i | matrix league | GPU threads | 19 | |:-:|:---------------:|:-------------------:| 20 | | | nrow x ncol | nthrds x lteams | 21 | | 0 | 3 x 5 | 3 x 5 | 22 | | 1 | 3 x 5 | 3 x 5 | 23 | | 2 | 3 x 5 | 3 x 5 | 24 | | 3 | 3 x 5 | 3 x 5 | 25 | | 4 | 7 x 7 | 3 x 5 | 26 | | 5 | 7 x 7 | 3 x 5 | 27 | | 6 | 12 x 6 | 3 x 6 | 28 | | 7 | 12 x 6 | 3 x 6 | 29 | | 8 | 12 x 6 | 3 x 3 | 30 | 31 | | i | Remarks | 32 | |:-:|:----------------------------------------------------------------| 33 | | 0 | Used as Reference. No loop at all. | 34 | | 1 | Incorrect nested loop impl. | 35 | | 2 | Correct impl. Manually linearized loop. | 36 | | 3 | Correct impl. Nested loop with collapse(2). | 37 | | 4 | Irreg. matrix. Default chunk_size. Some GPU threads are idle. | 38 | | 5 | Irreg. matrix. chunk_size = nthrds. Better performance. | 39 | | 6 | CPU-like 2x irow-loop unrolling. Uncoalesced GPU memory access. | 40 | | 7 | 2x irow-loop unrolling. Nested loop with collapse(3). | 41 | | | Coalesced GPU memory access. | 42 | | 8 | 2x icol-loop unrolling. 2x irow-loop unrolling. | 43 | | | Nested loop with collapse(3). Best Performance. 
|
44
45 | # Build
46 | 
47 | ```bash
48 | autoreconf -i; ./configure; make; make check
49 | ```
50 | 
51 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS).
52 | 
53 | # Documentation
54 | 
55 | * docs/html/index.html: Source code documentation generated by Doxygen.
56 | 
57 | * docs/UserManual.md: User Manual.
58 | 
59 | 
--------------------------------------------------------------------------------
/05_saxpy/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: saxpy
3 | author: Xin Wu (PC²)
4 | date: 05.04.2020
5 | ---
6 | 
7 | # Introduction
8 | 
9 | `saxpy` performs the `saxpy` operation on host as well as accelerator.
10 | The performance (in MB/s) for different implementations is also compared.
11 | 
12 | The `saxpy` operation is defined as:
13 | 
14 | $$ y := a * x + y $$
15 | 
16 | where:
17 | 
18 | * `a` is a scalar.
19 | * `x` and `y` are single-precision vectors each with n elements.
20 | * For testing n is assumed to be $2^{26}$.
21 | * The following table only summarizes the most important points. For more
22 | details on the ial-th implementation see comments in `hsaxpy.c` (on host)
23 | and `asaxpy.c` (on accelerator).
24 | 
25 | - on host
26 | 
27 | | ial | Remarks |
28 | |:---:|------------------------------------------------------------------------|
29 | | 0 | naive implementation |
30 | | 1 | saxpy in MKL |
31 | 
32 | - on accl
33 | 
34 | | ial | Remarks |
35 | |:---:|------------------------------------------------------------------------|
36 | | 0 | <<<2^0 , 2^0 >>>, TOO SLOW! not tested |
37 | | 1 | <<<2^0 , 2^7 >>>, auto scheduling |
38 | | 2 | <<<2^7 , 2^0 >>>, auto scheduling |
39 | | 3 | <<<2^7 , 2^7 >>>, auto scheduling |
40 | | 4 | <<<2^16, 2^10>>>, manual scheduling |
41 | | 5 | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling |
42 | | | (2^15*2^7*16==2^26) |
43 | | 6 | <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling |
44 | | 7 | de-linearize the vector gives slightly better performance than CUBLAS |
45 | | 8 | cublasSaxpy in CUBLAS |
46 | 
47 | # Build
48 | 
49 | ```bash
50 | autoreconf -i; ./configure; make; make check;
51 | ```
52 | 
53 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS).
54 | 
55 | # Documentation
56 | 
57 | * docs/html/index.html: Source code documentation generated by Doxygen.
58 | 
59 | * docs/UserManual.md: User Manual.
60 | 
61 | 
--------------------------------------------------------------------------------
/09_matAdd/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: matAdd
3 | author: Xin Wu (PC²)
4 | date: 19.03.2020
5 | ---
6 | 
7 | # Introduction
8 | 
9 | `matAdd` performs matrix addition (A += B) in single-precision on GPU.
10 | The performance (in GB/s) for different implementations is compared and
11 | the numerical results are also verified.
12 | 
13 | * Column-major is assumed thru the entire code!
14 | 
15 | * For testing, the dimensions of all matrices are assumed to be 4096 x 4096.
16 | 
17 | * The following table only summarizes the most important points. For more
18 | details on the ial-th OpenMP GPU implementation see comments in `matAddAB.c`.
19 | 
20 | | ial | Remarks |
21 | |:---:|------------------------------------------------------------------------|
22 | | 0 | ij-loop, 2^9 threads * 2^3 teams, |
23 | | | coalesced memory access |
24 | | 1 | ji-loop, 2^9 threads * 2^3 teams, |
25 | | | uncoalesced memory access |
26 | | 2 | ij-loop, 2^9 threads * 2^f teams, collapse(2), |
27 | | | uncoalesced memory access |
28 | | 3 | ji-loop, 2^9 threads * 2^f teams, collapse(2), |
29 | | | coalesced memory access |
30 | | 4 | ji-loop, 2^8 threads * 2^f teams, collapse(3), |
31 | | | 2x i-loop unrolling (stride of 2^8 rows) |
32 | | 5 | ji-loop, 2^8 threads * 2^f teams, collapse(2), |
33 | | | 2x i-loop unrolling (stride of n/2 rows) |
34 | | 6 | ji-loop, 2^8 threads * 2^e teams, collapse(3), |
35 | | | 2x i-loop unrolling (stride of 2^8 rows), |
36 | | | 2x j-loop unrolling (stride of 1 col ) |
37 | | 7 | cublasSaxpy in CUBLAS |
38 | 
39 | # Build
40 | 
41 | ```bash
42 | autoreconf -i; ./configure; make; make check
43 | ```
44 | 
45 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS).
46 | 
47 | # Documentation
48 | 
49 | * docs/html/index.html: Source code documentation generated by Doxygen.
50 | 
51 | * docs/UserManual.md: User Manual.
52 | 
53 | 
--------------------------------------------------------------------------------
/00_build_OpenMP_offload/GCC/build_gcc_offload.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Build GCC with OpenMP Support for Nvidia GPU Offloading
3 | author: Xin Wu (PC²)
4 | date: 06.01.2020
5 | ---
6 | 
7 | # Check Nvidia GPU
8 | 
9 | The build procedure was carried out on a Tesla node of OCuLUS at PC². It
10 | features an Nvidia Tesla K20X GPU. Thus it's necessary to check the Tesla K20X
11 | GPU on the compute node, before building GCC with OpenMP support for offloading
12 | computation on Nvidia GPU.
13 | 
14 | The relevant scripts and log files can be found in `00_check_gpu`.
15 | 
16 | `tesla.sh` is a driver script and should be submitted with `ccsalloc`:
17 | 
18 | ```bash
19 | ccsalloc tesla.sh
20 | ```
21 | 
22 | `realscript.sh` does the real job and the output can be found in `tesla.log`.
23 | 
24 | # Download Packages and Preparation
25 | 
26 | The required packages for this GCC build are:
27 | 
28 | * nvptx-tools:[^nvptxtools]
29 | 
30 | [^nvptxtools]: At the time of writing, there is no release of nvptx-tools on
31 | GitHub. For reproducibility the `HEAD` was checked out explicitly.
32 | 
33 | * nvptx-newlib:[^nvptxnewlib]
34 | 
35 | [^nvptxnewlib]: At the time of writing, there is no release of nvptx-newlib on
36 | GitHub. For reproducibility the `HEAD` was checked out explicitly.
37 | 
38 | * openacc-gcc-9-branch:[^gcc9]
39 | 
40 | [^gcc9]: This Git-branch is used for development of OpenACC support and related
41 | functionality. For more info, see .
42 | 
43 | It's faster to download these packages from the frontend nodes of OCuLUS at PC².
44 | `download.sh` (in `01_download`) is a convenient script to download these
45 | packages as well as to prepare other setups for our build of GCC with OpenMP for
46 | offloading on GPUs.
47 | 
48 | # Build and Install Packages
49 | 
50 | ## Load CUDA module
51 | 
52 | Because the GPU-backend of GCC depends on CUDA, we need to load the CUDA module
53 | on OCuLUS.
54 | 
55 | ```bash
56 | module load system/CUDA/10.1.105
57 | ```
58 | 
59 | ## Build `nvptx-tools`, accelerator and host GCC compilers
60 | 
61 | The build scripts can be found in `02_build`.
`build.sh` is a driver script for
62 | `ccsalloc` and `realscript.sh` carries out the real build procedure.
63 | 
64 | Before running
65 | 
66 | ```bash
67 | ccsalloc build.sh
68 | ```
69 | 
70 | it's necessary to adapt some settings in `realscript.sh`, e.g. `CUDADIR`,
71 | `INSTDIR`, and perhaps `make -j` with an appropriate number of processors,
72 | to your working system.
73 | 
74 | Now, we're done.
75 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | 
3 | The directories in this repository contain code examples for the course on
4 | OpenMP GPU-offloading at Paderborn Center for Parallel Computing (PC²),
5 | Paderborn University. The sub-directories are generally organized as:
6 | 
7 | * src: source code
8 | * docs: documentation
9 | * tests: some tests
10 | 
11 | Some highlights of the codes in this repository:
12 | 
13 | * The performance of our `saxpy` implemented by using OpenMP GPU-offloading is
14 | as good as `cublasSaxpy` in CUBLAS. See `case 7` in `05_saxpy/src/asaxpy.c`
15 | for details.
16 | 
17 | * The GPU shared memory has not been standardized in OpenMP API Specification
18 | (Version 5.0 Nov. 2018). To optimize the performance of matrix multiplication
19 | by using OpenMP GPU-offloading, i) `case 6` in `10_matMul/src/matMulAB.c`
20 | implements a register blocking algorithm and ii) `case 8` in the same source
21 | code file implements a common GPU-based tiled algorithm by blocking the local
22 | shared memory in a very tricky manner, and the OpenMP code resembles CUDA.
23 | 
24 | # List of Projects
25 | 
26 | * 00_build_OpenMP_offload
27 | 
28 | Documentation and scripts for building GCC as well as Clang/LLVM with OpenMP
29 | support for Nvidia GPU offloading.
30 | 
31 | * 01_accelQuery
32 | 
33 | `accelQuery` searches accelerator(s) on a heterogeneous computer.
34 | Accelerator(s), if found, will be enumerated with some basic info.
35 | 
36 | * 02_dataTransRate
37 | 
38 | `dataTransRate` gives the data transfer rate (in MB/sec) from `src` to `dst`.
39 | 
40 | The possible situations are:
41 | 
42 | * h2h: `src` = host and `dst` = host
43 | * h2a: `src` = host and `dst` = accel
44 | * a2a: `src` = accel and `dst` = accel
45 | 
46 | NOTE:
47 | 
48 | * A bug in Clang 9.0.1 has been fixed in Clang 11.
49 | * The data transfer rate for `a2a` is still lower than our expectation.
50 | 
51 | * 03_taskwait
52 | 
53 | `taskwait` checks the `taskwait` construct for the deferred target task.
54 | 
55 | NOTE:
56 | 
57 | * Asynchronous offloading hasn't been implemented in the GCC 9.2 compiler.
58 | * Asynchronous offloading is available in Clang 11.
59 | 
60 | * 04_scalarAddition
61 | 
62 | `scalarAddition` adds two integers on host and accelerator, and also compares
63 | the performance.
64 | 
65 | * 05_saxpy
66 | 
67 | `saxpy` performs the `saxpy` operation on host as well as accelerator.
68 | The performance (in MB/s) for different implementations is also compared.
69 | 
70 | * 08_distThreads
71 | 
72 | `distThreads` demonstrates the organization of threads and teams in a league
73 | on GPU.
74 | 
75 | * 09_matAdd
76 | 
77 | `matAdd` performs matrix addition (A += B) in single-precision on GPU. The
78 | performance (in GB/s) for different implementations is compared and the
79 | numerical results are also verified.
80 | 
81 | * 10_matMul
82 | 
83 | `matMul` performs matrix multiplication in single-precision on GPU.
The
84 | performance (in GFLOPS) for different implementations is compared and the
85 | numerical results are also verified.
86 | 
--------------------------------------------------------------------------------
/08_distThreads/src/distThreads.c:
--------------------------------------------------------------------------------
1 | /**
2 | * @file distThreads.c
3 | *
4 | * @mainpage distThreads
5 | *
6 | * @author Xin Wu (PC²)
7 | * @date 12.03.2020
8 | * @copyright CC BY-SA 2.0
9 | *
10 | * distThreads demonstrates the organization of threads and teams in a league on
11 | * GPU.
12 | */
13 | 
14 | #include "gpuThreads.h"
15 | 
16 | /**
17 | * @brief Main entry point for distThreads.
18 | */
19 | int main(int argc, char *argv[])
20 | {
21 | for (int i = 0; i < 9; ++i) {
22 | /*
23 | * - Column-major is assumed thru the entire code!
24 | *
25 | * The following tables only summarize the most important points. For more
26 | * details on the ith organization of the GPU threads see comments in
27 | * \c gpuThreads.c.
28 | */
29 | gpuThreads(i);
30 | /*
31 | * ===========================================================================
32 | * i matrix league GPU threads
33 | * nrow ncol nthrds lteams
34 | * ---------------------------------------------------------------------------
35 | * 0 3 5 3 5
36 | * 1 3 5 3 5
37 | * 2 3 5 3 5
38 | * 3 3 5 3 5
39 | * ---------------------------------------------------------------------------
40 | * 4 7 7 3 5
41 | * 5 7 7 3 5
42 | * ---------------------------------------------------------------------------
43 | * 6 12 6 3 6
44 | * 7 12 6 3 6
45 | * 8 12 6 3 3
46 | * ===========================================================================
47 | *
48 | * ===========================================================================
49 | * i Remarks
50 | * ---------------------------------------------------------------------------
51 | * 0 Used as Reference. No loop at all.
52 | * 1 Incorrect nested loop impl.
53 | * 2 Correct impl. Manually linearized loop.
54 | * 3 Correct impl. Nested loop with collapse(2).
55 | * ---------------------------------------------------------------------------
56 | * 4 Irreg. matrix. Default chunk_size. Some GPU threads are idle.
57 | * 5 Irreg. matrix. chunk_size = nthrds. Better performance.
58 | * ---------------------------------------------------------------------------
59 | * 6 CPU-like 2x irow-loop unrolling. Uncoalesced GPU memory access.
60 | * 7 2x irow-loop unrolling. Nested loop with collapse(3).
61 | * Coalesced GPU memory access.
62 | * 8 2x icol-loop unrolling. 2x irow-loop unrolling.
63 | * Nested loop with collapse(3). Best Performance.
64 | * ===========================================================================
65 | */
66 | }
67 | return 0;
68 | }
69 | 
--------------------------------------------------------------------------------
/simplifiedCode/05_saxpy/saxpy.c:
--------------------------------------------------------------------------------
1 | /**
2 | * @file saxpy.c
3 | *
4 | * @brief saxpy performs the \c axpy computation in single-precision on both
5 | * host and accelerator. The performance (in MFLOPS) on host and accelerator is
6 | * compared and the numerical results are also verified for consistency.
7 | *
8 | * The \c axpy computation is defined as:
9 | *
10 | * y := a * x + y
11 | *
12 | * where:
13 | *
14 | * - a is a scalar.
15 | * - x and y are vectors each with n elements.
16 | *
17 | * Please note that in this version only one GPU thread is used.
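 *
 * (A bare "#pragma omp target" executes its body sequentially on the
 * device. A hedged sketch of a multi-threaded variant, not used in this
 * simplified version, would be:
 *
 *   #pragma omp target teams distribute parallel for \
 *           map(to:n, a, x[0:n]) map(tofrom:z[0:n])
 *
 * which distributes the loop iterations across teams and threads on the
 * GPU.)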
18 | *
19 | * Offload to GPU:
20 | *
21 | * gcc -fopenmp -foffload=nvptx-none saxpy.c
22 | *
23 | */
24 | 
25 | #include <assert.h>
26 | #include <omp.h>
27 | #include <stdio.h>
28 | #include <stdlib.h>
29 | #include <time.h>
30 | 
31 | #define TWO02 (1 << 2)
32 | #define TWO04 (1 << 4)
33 | #define TWO08 (1 << 8)
34 | #define TWO27 (1 << 27)
35 | 
36 | int main(int argc, char *argv[])
37 | {
38 | int i, n = TWO27,
39 | iret = 0;
40 | float a = 101.0f / TWO02,
41 | *x, *y, *z;
42 | struct timespec rt[2];
43 | double wt; // walltime
44 | 
45 | /*
46 | * 0. prepare x, y, and z
47 | *
48 | * y := a * x + y (on host)
49 | * z := a * x + z (on accel)
50 | */
51 | if (NULL == (x = (float *) malloc(sizeof(*x) * n))) {
52 | printf("error: memory allocation for 'x'\n");
53 | iret = -1;
54 | }
55 | if (NULL == (y = (float *) malloc(sizeof(*y) * n))) {
56 | printf("error: memory allocation for 'y'\n");
57 | iret = -1;
58 | }
59 | if (NULL == (z = (float *) malloc(sizeof(*z) * n))) {
60 | printf("error: memory allocation for 'z'\n");
61 | iret = -1;
62 | }
63 | if (0 != iret) {
64 | free(x);
65 | free(y);
66 | free(z);
67 | exit(EXIT_FAILURE);
68 | }
69 | for (i = 0; i < n; i++) {
70 | x[i] = rand() % TWO04 / (float) TWO02;
71 | y[i] = z[i] = rand() % TWO08 / (float) TWO04;
72 | }
73 | /*
74 | * 1. saxpy on host
75 | */
76 | clock_gettime(CLOCK_REALTIME, rt + 0);
77 | #pragma omp parallel \
78 | default(none) shared(n, a, x, y) private(i)
79 | {
80 | #pragma omp for simd schedule(simd:static)
81 | for (i = 0; i < n; i++) {
82 | y[i] = a * x[i] + y[i];
83 | }
84 | }
85 | clock_gettime(CLOCK_REALTIME, rt + 1);
86 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
87 | printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
88 | /*
89 | * 2. saxpy on accel
90 | */
91 | clock_gettime(CLOCK_REALTIME, rt + 0);
92 | #pragma omp target device(0) \
93 | map(to:n, a, x[0:n]) map(tofrom:z[0:n]) private(i)
94 | {
95 | for (i = 0; i < n; i++) {
96 | z[i] = a * x[i] + z[i];
97 | }
98 | }
99 | clock_gettime(CLOCK_REALTIME, rt + 1);
100 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
101 | printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
102 | /*
103 | * 3. verify numerical consistency
104 | */
105 | for (i = 0; i < n; i++) {
106 | iret = *(int *) (y + i) ^ *(int *) (z + i);
107 | assert(iret == 0);
108 | }
109 | return 0;
110 | }
111 | 
--------------------------------------------------------------------------------
/10_matMul/docs/UserManual.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: matMul
3 | author: Xin Wu (PC²)
4 | date: 12.03.2020
5 | ---
6 | 
7 | # Introduction
8 | 
9 | `matMul` performs matrix multiplication in single-precision on GPU. The
10 | performance (in GFLOPS) for different implementations is compared and the
11 | numerical results are also verified.
12 | 
13 | * Column-major is assumed thru the entire code!
14 | 
15 | * For testing, the dimensions of all matrices are assumed to be 4096 x 4096.
16 | 
17 | * The following table only summarizes the most important points. For more
18 | details on the ial-th OpenMP GPU implementation see comments in `matMulAB.c`.
19 | 
20 | | ial | Remarks |
21 | |:---:|------------------------------------------------------------------------|
22 | | 0 | jik-loop, 2^9 threads * 2^3 teams, |
23 | | | uncoalesced memory access |
24 | | 1 | jki-loop, 2^9 threads * 2^3 teams, |
25 | | | uncoalesced memory access, uncoalesced r&w in innermost loop |
26 | | 2 | jik-loop, 2^9 threads * 2^f teams, collapse(2) |
27 | | 3 | jki-loop, 2^9 threads * 2^f teams, collapse(2), |
28 | | | race condition for writing c! |
29 | | 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), |
30 | | | 4x k-loop unrolling |
31 | | 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), |
32 | | | 4x i-loop unrolling (stride of 2^7 rows), |
33 | | | 4x k-loop unrolling, |
34 | | | rb: 4x data reuse |
35 | | 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), |
36 | | | 4x j-loop unrolling (stride of 1 col ), |
37 | | | 4x i-loop unrolling (stride of 2^7 rows), |
38 | | | 4x k-loop unrolling, |
39 | | | ra: 4x data reuse, |
40 | | | rb: 4x data reuse, |
41 | | | register blocking |
42 | | 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) |
43 | | 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), |
44 | | | GPU shared memory for data re-use, 16x k-loop unrolling, |
45 | | | shared memory blocking |
46 | | 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), |
47 | | | 4x i-loop unrolling (stride of n/4 rows), |
48 | | | 4x k-loop unrolling, |
49 | | | rb: 4x data reuse |
50 | | 10 | cublasSgemm in CUBLAS |
51 | 
52 | # Usage
53 | 
54 | ```bash
55 | matMul $((2**12))
56 | ```
57 | 
58 | 
--------------------------------------------------------------------------------
/10_matMul/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: matMul
3 | author: Xin Wu (PC²)
4 | date: 19.03.2020
5 | ---
6 | 
7 | # Introduction
8 | 
9 | `matMul` performs matrix multiplication in single-precision on GPU. The
10 | performance (in GFLOPS) for different implementations is compared and the
11 | numerical results are also verified.
12 | 
13 | * Column-major is assumed thru the entire code!
14 | 
15 | * `i` and `j` are indices for row and column, respectively.
16 | 
17 | * For testing, the dimensions of all matrices are assumed to be 4096 x 4096.
18 | 
19 | * The following table only summarizes the most important points. For more
20 | details on the ial-th OpenMP GPU implementation see comments in `matMulAB.c`.
21 | 
22 | | ial | Remarks |
23 | |:---:|------------------------------------------------------------------------|
24 | | 0 | jik-loop, 2^9 threads * 2^3 teams, |
25 | | | uncoalesced memory access |
26 | | 1 | jki-loop, 2^9 threads * 2^3 teams, |
27 | | | uncoalesced memory access, uncoalesced r&w in innermost loop |
28 | | 2 | jik-loop, 2^9 threads * 2^f teams, collapse(2) |
29 | | 3 | jki-loop, 2^9 threads * 2^f teams, collapse(2), |
30 | | | race condition for writing c!
| 31 | | 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), | 32 | | | 4x k-loop unrolling | 33 | | 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), | 34 | | | 4x i-loop unrolling (stride of 2^7 rows), | 35 | | | 4x k-loop unrolling, | 36 | | | rb: 4x data reuse | 37 | | 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), | 38 | | | 4x j-loop unrolling (stride of 1 col ), | 39 | | | 4x i-loop unrolling (stride of 2^7 rows), | 40 | | | 4x k-loop unrolling, | 41 | | | ra: 4x data reuse, | 42 | | | rb: 4x data reuse, | 43 | | | register blocking | 44 | | 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) | 45 | | 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), | 46 | | | GPU shared memory for data re-use, 16x k-loop unrolling, | 47 | | | shared memory blocking | 48 | | 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), | 49 | | | 4x i-loop unrolling (stride of n/4 rows), | 50 | | | 4x k-loop unrolling, | 51 | | | rb: 4x data reuse | 52 | | 10 | cublasSgemm in CUBLAS | 53 | 54 | # Build 55 | 56 | ```bash 57 | autoreconf -i; ./configure; make; make check 58 | ``` 59 | 60 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS). 61 | 62 | # Documentation 63 | 64 | * docs/html/index.html: Source code documentation generated by Doxygen. 65 | 66 | * docs/UserManual.md: User Manual. 67 | 68 | -------------------------------------------------------------------------------- /simplifiedCode/02_dataTransRate/dataTransRate.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file dataTransRate.c 3 | * 4 | * @brief dataTransRate gives the data transfer rate from src to dst. 5 | * 6 | * The possible situations are: 7 | * 8 | * - h2h: src = host and dst = host 9 | * - h2a: src = host and dst = accel 10 | * - a2a: src = accel and dst = accel 11 | * 12 | * Offload to GPU: 13 | * gcc -Wall -fopenmp -foffload=nvptx-none dataTransRate.c 14 | * 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #define TWO27 (1 << 27) 23 | 24 | int main(void) 25 | { 26 | // host 27 | int ihost, *hdat[2]; 28 | // accelerator 29 | int iaccel, *adat[2]; 30 | size_t ndat; 31 | struct timespec rt[2]; 32 | double wt; // walltime 33 | int i, n = TWO27, iret = 0; 34 | 35 | /* 36 | * prepare data on host and accelerator 37 | */ 38 | ihost = omp_get_initial_device(); // index of the host 39 | iaccel = 0; // index of the 1st accel 40 | ndat = sizeof(*hdat[0]) * n; 41 | for (i = 0; i < 2; i++) { 42 | if (NULL == (hdat[i] = (int *) omp_target_alloc(ndat, ihost))) { 43 | printf("error: memory allocation for hdat[%d] ...", i); 44 | iret = 1; 45 | } 46 | if (NULL == (adat[i] = (int *) omp_target_alloc(ndat, iaccel))) { 47 | printf("error: memory allocation for adat[%d] ...", i); 48 | iret = 1; 49 | } 50 | } 51 | if (1 == iret) { 52 | for (i = 0; i < 2; i++) { 53 | omp_target_free(hdat[i], ihost); 54 | omp_target_free(adat[i], iaccel); 55 | } 56 | exit(EXIT_FAILURE); 57 | } 58 | for (i = 0; i < n; i++) { 59 | (hdat[0])[i] = rand(); 60 | } 61 | /* 62 | * data transfer rate: h2h, h2a, and a2a 63 | */ 64 | printf("\nData Transfer Rate\n\n"); 65 | printf("================================\n"); 66 | printf(" src dst DTR \n"); 67 | printf("------- ------- ----------------\n"); 68 | /* 69 | * h2h 70 | */ 71 | clock_gettime(CLOCK_REALTIME, rt + 0); 72 | iret = omp_target_memcpy(hdat[1], hdat[0], ndat, 0x0, 0x0, ihost, ihost); 73 | clock_gettime(CLOCK_REALTIME, rt + 1); 74 | if (0 != iret) { 75 | 
printf("error: omp_target_memcpy (h2h)\n");
76 | exit(EXIT_FAILURE);
77 | }
78 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
79 | printf(" host host %8.1f MB/sec\n", 1.0e-6 * ndat / wt);
80 | /*
81 | * h2a
82 | */
83 | clock_gettime(CLOCK_REALTIME, rt + 0);
84 | iret = omp_target_memcpy(adat[0], hdat[0], ndat, 0x0, 0x0, iaccel, ihost);
85 | clock_gettime(CLOCK_REALTIME, rt + 1);
86 | if (0 != iret) {
87 | printf("error: omp_target_memcpy (h2a)\n");
88 | exit(EXIT_FAILURE);
89 | }
90 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
91 | printf(" host accel %8.1f MB/sec\n", 1.0e-6 * ndat / wt);
92 | /*
93 | * a2a
94 | */
95 | clock_gettime(CLOCK_REALTIME, rt + 0);
96 | iret = omp_target_memcpy(adat[1], adat[0], ndat, 0x0, 0x0, iaccel, iaccel);
97 | /*
98 | * Question: How to get the correct A-A data transfer rate? FIXME
99 | */
100 | clock_gettime(CLOCK_REALTIME, rt + 1);
101 | if (0 != iret) {
102 | printf("error: omp_target_memcpy (a2a)\n");
103 | exit(EXIT_FAILURE);
104 | }
105 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
106 | printf(" accel accel %8.1f MB/sec\n", 1.0e-6 * ndat / wt);
107 | printf("================================\n\n");
108 | /*
109 | * release the data
110 | */
111 | for (i = 0; i < 2; i++) {
112 | omp_target_free(hdat[i], ihost);
113 | omp_target_free(adat[i], iaccel);
114 | }
115 | return 0;
116 | }
117 | 
--------------------------------------------------------------------------------
/01_accelQuery/src/prtAccelInfo.c:
--------------------------------------------------------------------------------
1 | /**
2 | * @file prtAccelInfo.c
3 | * @brief Function definition for prtAccelInfo.
4 | *
5 | * This source file contains the function definition for prtAccelInfo.
6 | *
7 | * @author Xin Wu (PC²)
8 | * @date 04.01.2020
9 | * @copyright CC BY-SA 2.0
10 | */
11 | 
12 | #include <stdio.h>
13 | #include <stdlib.h>
14 | #include <cuda_runtime.h>
15 | #include "prtAccelInfo.h"
16 | 
17 | #define CUDAErrorCheck(funcall) \
18 | do { \
19 | cudaError_t ierr = funcall; \
20 | if (cudaSuccess != ierr) { \
21 | fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
22 | __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr)); \
23 | exit(ierr); \
24 | } \
25 | } while (0)
26 | 
27 | static inline int _corePerSM(int major, int minor)
28 | /**
29 | * @brief Give the number of CUDA cores per streaming multiprocessor (SM).
30 | *
31 | * The number of CUDA cores per SM is determined by the compute capability.
32 | *
33 | * @param major Major revision number of the compute capability.
34 | * @param minor Minor revision number of the compute capability.
35 | *
36 | * @return The number of CUDA cores per SM.
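 *
 * For example, compute capability 3.5 (Kepler, e.g. the Tesla K20X used in
 * 00_build_OpenMP_offload) maps to 192 CUDA cores per SM, and -1 is
 * returned for an unknown compute capability.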
37 | */ 38 | { 39 | if (1 == major) { 40 | if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8; 41 | } 42 | if (2 == major) { 43 | if (0 == minor) return 32; 44 | if (1 == minor) return 48; 45 | } 46 | if (3 == major) { 47 | if (0 == minor || 5 == minor || 7 == minor) return 192; 48 | } 49 | if (5 == major) { 50 | if (0 == minor || 2 == minor) return 128; 51 | } 52 | if (6 == major) { 53 | if (0 == minor) return 64; 54 | if (1 == minor || 2 == minor) return 128; 55 | } 56 | if (7 == major) { 57 | if (0 == minor || 2 == minor || 5 == minor) return 64; 58 | } 59 | return -1; 60 | } 61 | 62 | void prtAccelInfo(int iaccel) 63 | { 64 | int corePerSM; 65 | struct cudaDeviceProp dev; 66 | 67 | CUDAErrorCheck(cudaSetDevice(iaccel)); 68 | CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel)); 69 | corePerSM = _corePerSM(dev.major, dev.minor); 70 | printf("\n"); 71 | printf("============================================================\n"); 72 | printf("CUDA Device name : \"%s\"\n", dev.name); 73 | printf("------------------------------------------------------------\n"); 74 | printf("Comp. Capability : %d.%d\n", dev.major, dev.minor); 75 | printf("max clock rate : %.0f MHz\n", dev.clockRate * 1.e-3f); 76 | printf("number of SMs : %d\n", dev.multiProcessorCount); 77 | printf("cores / SM : %d\n", corePerSM); 78 | printf("# of CUDA cores : %d\n", corePerSM * dev.multiProcessorCount); 79 | printf("------------------------------------------------------------\n"); 80 | printf("global memory : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f); 81 | printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f); 82 | printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor); 83 | printf("------------------------------------------------------------\n"); 84 | printf("max # of threads / SM : %d\n", dev.maxThreadsPerMultiProcessor); 85 | printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock); 86 | printf("max dim. of block : (%d, %d, %d)\n", 87 | dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]); 88 | printf("max dim. 
of grid : (%d, %d, %d)\n", 89 | dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]); 90 | printf("warp size : %d\n", dev.warpSize); 91 | printf("============================================================\n"); 92 | } 93 | -------------------------------------------------------------------------------- /03_taskwait/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([taskwait], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/taskwait.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check C compiler 22 | # 23 | ##############################################################################80 24 | AC_PROG_CC([clang gcc]) 25 | AS_IF([test "${CC}" = gcc], 26 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" 27 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 28 | AS_IF([test "${CC}" = clang], 29 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ 30 | -Xopenmp-target -march=sm_61 $CFLAGS" 31 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 32 | ##############################################################################80 33 | # 34 | # check archiver 35 | # 36 | ##############################################################################80 37 | AC_PROG_RANLIB 38 | AM_PROG_AR 39 | ##############################################################################80 40 | # 41 | # check headers 42 | # 43 | ##############################################################################80 44 | AC_CHECK_HEADER([cuda_runtime.h], [], 45 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 46 | ##############################################################################80 47 | # 48 | # check libraries 49 | # 50 | ##############################################################################80 51 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 52 | [AC_MSG_ERROR([libcudart required, but not found])], []) 53 | ##############################################################################80 54 | # 55 | # check Doxygen 56 | # 57 | ##############################################################################80 58 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 59 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 60 | AS_IF([test -z "${DOXYGEN}"], 61 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 62 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 63 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 64 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 65 | ##############################################################################80 66 | # 67 | # check ccsalloc (in OpenCCS) 68 | # 69 | ##############################################################################80 70 | AC_ARG_VAR([CCSALLOC], [OpenCCS: 
Open Computing Center Software]) 71 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 72 | AS_IF([test -z "${CCSALLOC}"], 73 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 74 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 75 | ##############################################################################80 76 | # 77 | # create final files 78 | # 79 | ##############################################################################80 80 | AC_CONFIG_HEADERS([config.h]) 81 | AC_CONFIG_FILES([Makefile 82 | src/Makefile 83 | tests/Makefile]) 84 | AC_OUTPUT 85 | 86 | echo " 87 | //============================================================================80 88 | 89 | Configuration: 90 | 91 | CC : ${CC} 92 | CFLAGS : ${CFLAGS} 93 | LDFLAGS : ${LDFLAGS} 94 | LIBS : ${LIBS} 95 | 96 | //============================================================================80 97 | 98 | Now, type make to build ..." 99 | -------------------------------------------------------------------------------- /01_accelQuery/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([accelQuery], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/accelQuery.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check C compiler 22 | # 23 | ##############################################################################80 24 | AC_PROG_CC([clang gcc]) 25 | AS_IF([test "${CC}" = gcc], 26 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" 27 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 28 | AS_IF([test "${CC}" = clang], 29 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ 30 | -Xopenmp-target -march=sm_61 $CFLAGS" 31 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 32 | ##############################################################################80 33 | # 34 | # check archiver 35 | # 36 | ##############################################################################80 37 | AC_PROG_RANLIB 38 | AM_PROG_AR 39 | ##############################################################################80 40 | # 41 | # check headers 42 | # 43 | ##############################################################################80 44 | AC_CHECK_HEADER([cuda_runtime.h], [], 45 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 46 | ##############################################################################80 47 | # 48 | # check libraries 49 | # 50 | ##############################################################################80 51 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 52 | [AC_MSG_ERROR([libcudart required, but not found])], []) 53 | ##############################################################################80 54 | # 55 | # check Doxygen 56 | # 57 | 
##############################################################################80 58 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 59 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 60 | AS_IF([test -z "${DOXYGEN}"], 61 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 62 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 63 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 64 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 65 | ##############################################################################80 66 | # 67 | # check ccsalloc (in OpenCCS) 68 | # 69 | ##############################################################################80 70 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 71 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 72 | AS_IF([test -z "${CCSALLOC}"], 73 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 74 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 75 | ##############################################################################80 76 | # 77 | # create final files 78 | # 79 | ##############################################################################80 80 | AC_CONFIG_HEADERS([config.h]) 81 | AC_CONFIG_FILES([Makefile 82 | src/Makefile 83 | tests/Makefile]) 84 | AC_OUTPUT 85 | 86 | echo " 87 | //============================================================================80 88 | 89 | Configuration: 90 | 91 | CC : ${CC} 92 | CFLAGS : ${CFLAGS} 93 | LDFLAGS : ${LDFLAGS} 94 | LIBS : ${LIBS} 95 | 96 | //============================================================================80 97 | 98 | Now, type make to build ..." 99 | -------------------------------------------------------------------------------- /08_distThreads/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([distThreads], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/distThreads.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check C compiler 22 | # 23 | ##############################################################################80 24 | AC_PROG_CC([clang gcc]) 25 | AS_IF([test "${CC}" = gcc], 26 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" 27 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 28 | AS_IF([test "${CC}" = clang], 29 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ 30 | -Xopenmp-target -march=sm_61 $CFLAGS" 31 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 32 | ##############################################################################80 33 | # 34 | # check archiver 35 | # 36 | ##############################################################################80 37 | AC_PROG_RANLIB 38 | AM_PROG_AR 39 | 
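# (AM_PROG_AR selects the archiver interface; with -Wall -Werror passed to
# AM_INIT_AUTOMAKE above, automake >= 1.12 would otherwise complain when a
# static archive is built.)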
##############################################################################80 40 | # 41 | # check headers 42 | # 43 | ##############################################################################80 44 | AC_CHECK_HEADER([cuda_runtime.h], [], 45 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 46 | ##############################################################################80 47 | # 48 | # check libraries 49 | # 50 | ##############################################################################80 51 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 52 | [AC_MSG_ERROR([libcudart required, but not found])], []) 53 | ##############################################################################80 54 | # 55 | # check Doxygen 56 | # 57 | ##############################################################################80 58 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 59 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 60 | AS_IF([test -z "${DOXYGEN}"], 61 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 62 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 63 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 64 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 65 | ##############################################################################80 66 | # 67 | # check ccsalloc (in OpenCCS) 68 | # 69 | ##############################################################################80 70 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 71 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 72 | AS_IF([test -z "${CCSALLOC}"], 73 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 74 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 75 | ##############################################################################80 76 | # 77 | # create final files 78 | # 79 | ##############################################################################80 80 | AC_CONFIG_HEADERS([config.h]) 81 | AC_CONFIG_FILES([Makefile 82 | src/Makefile 83 | tests/Makefile]) 84 | AC_OUTPUT 85 | 86 | echo " 87 | //============================================================================80 88 | 89 | Configuration: 90 | 91 | CC : ${CC} 92 | CFLAGS : ${CFLAGS} 93 | LDFLAGS : ${LDFLAGS} 94 | LIBS : ${LIBS} 95 | 96 | //============================================================================80 97 | 98 | Now, type make to build ..." 
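# Illustrative compile-and-link line implied by the checks above (an
# assumption for illustration only, not part of the generated build):
#   gcc -Wall -O2 -fopenmp -foffload=nvptx-none -I$CUDAINC \
#       src/distThreads.c src/gpuThreads.c -L$CUDALIB -lcudart -o distThreads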
99 | -------------------------------------------------------------------------------- /02_dataTransRate/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([dataTransRate], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/dataTransRate.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check C compiler 22 | # 23 | ##############################################################################80 24 | AC_PROG_CC([clang gcc]) 25 | AS_IF([test "${CC}" = gcc], 26 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" 27 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 28 | AS_IF([test "${CC}" = clang], 29 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ 30 | -Xopenmp-target -march=sm_61 $CFLAGS" 31 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 32 | ##############################################################################80 33 | # 34 | # check archiver 35 | # 36 | ##############################################################################80 37 | AC_PROG_RANLIB 38 | AM_PROG_AR 39 | ##############################################################################80 40 | # 41 | # check headers 42 | # 43 | ##############################################################################80 44 | AC_CHECK_HEADER([cuda_runtime.h], [], 45 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 46 | ##############################################################################80 47 | # 48 | # check libraries 49 | # 50 | ##############################################################################80 51 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 52 | [AC_MSG_ERROR([libcudart required, but not found])], []) 53 | ##############################################################################80 54 | # 55 | # check Doxygen 56 | # 57 | ##############################################################################80 58 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 59 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 60 | AS_IF([test -z "${DOXYGEN}"], 61 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 62 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 63 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 64 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 65 | ##############################################################################80 66 | # 67 | # check ccsalloc (in OpenCCS) 68 | # 69 | ##############################################################################80 70 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 71 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 72 | AS_IF([test -z "${CCSALLOC}"], 73 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS 
support])]) 74 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 75 | ##############################################################################80 76 | # 77 | # create final files 78 | # 79 | ##############################################################################80 80 | AC_CONFIG_HEADERS([config.h]) 81 | AC_CONFIG_FILES([Makefile 82 | src/Makefile 83 | tests/Makefile]) 84 | AC_OUTPUT 85 | 86 | echo " 87 | //============================================================================80 88 | 89 | Configuration: 90 | 91 | CC : ${CC} 92 | CFLAGS : ${CFLAGS} 93 | LDFLAGS : ${LDFLAGS} 94 | LIBS : ${LIBS} 95 | 96 | //============================================================================80 97 | 98 | Now, type make to build ..." 99 | -------------------------------------------------------------------------------- /04_scalarAddition/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([scalarAddition], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/scalarAddition.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check C compiler 22 | # 23 | ##############################################################################80 24 | AC_PROG_CC([clang gcc]) 25 | AS_IF([test "${CC}" = gcc], 26 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" 27 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 28 | AS_IF([test "${CC}" = clang], 29 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ 30 | -Xopenmp-target -march=sm_61 $CFLAGS" 31 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 32 | ##############################################################################80 33 | # 34 | # check archiver 35 | # 36 | ##############################################################################80 37 | AC_PROG_RANLIB 38 | AM_PROG_AR 39 | ##############################################################################80 40 | # 41 | # check headers 42 | # 43 | ##############################################################################80 44 | AC_CHECK_HEADER([cuda_runtime.h], [], 45 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 46 | ##############################################################################80 47 | # 48 | # check libraries 49 | # 50 | ##############################################################################80 51 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 52 | [AC_MSG_ERROR([libcudart required, but not found])], []) 53 | ##############################################################################80 54 | # 55 | # check Doxygen 56 | # 57 | ##############################################################################80 58 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 59 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 60 | AS_IF([test 
-z "${DOXYGEN}"], 61 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 62 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 63 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 64 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 65 | ##############################################################################80 66 | # 67 | # check ccsalloc (in OpenCCS) 68 | # 69 | ##############################################################################80 70 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 71 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 72 | AS_IF([test -z "${CCSALLOC}"], 73 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 74 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 75 | ##############################################################################80 76 | # 77 | # create final files 78 | # 79 | ##############################################################################80 80 | AC_CONFIG_HEADERS([config.h]) 81 | AC_CONFIG_FILES([Makefile 82 | src/Makefile 83 | tests/Makefile]) 84 | AC_OUTPUT 85 | 86 | echo " 87 | //============================================================================80 88 | 89 | Configuration: 90 | 91 | CC : ${CC} 92 | CFLAGS : ${CFLAGS} 93 | LDFLAGS : ${LDFLAGS} 94 | LIBS : ${LIBS} 95 | 96 | //============================================================================80 97 | 98 | Now, type make to build ..." 99 | -------------------------------------------------------------------------------- /09_matAdd/src/matAdd.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matAdd.c 3 | * 4 | * @mainpage matAdd 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 19.03.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * matAdd performs matrix addition (A += B) in single-precision on GPU. 11 | * The performance (in GB/s) for different implementations is compared and 12 | * the numerical results are also verified. 13 | */ 14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <time.h> 20 | #ifdef _OPENMP 21 | #include <omp.h> 22 | #endif 23 | #include "mkl.h" 24 | #include "matAddAB.h" 25 | 26 | #define NLUP (64) 27 | 28 | /** 29 | * @brief Main entry point for matAdd.
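 *
 * A minimal run, assuming the build above succeeded (note that the matrix
 * dimension is read unchecked from argv[1]):
 *
 *   ./matAdd 4096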
30 | */ 31 | int main(int argc, char *argv[]) 32 | { 33 | int ial, idx, n, 34 | iret = 0; 35 | size_t n2bytes; 36 | float *a, *b, 37 | *ahost, // a matrix on host (as reference) 38 | *aaccl, // a matrix on accl 39 | maxabserr; 40 | struct timespec rt[2]; 41 | double wt; // walltime 42 | 43 | /* 44 | * preparation 45 | */ 46 | n = atoi(argv[1]); // 4096 is used for test 47 | n2bytes = sizeof(float) * n * n; 48 | if (NULL == (a = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 49 | if (NULL == (b = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 50 | if (NULL == (ahost = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 51 | if (NULL == (aaccl = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 52 | if (iret != 0) { 53 | printf("error: memory allocation\n"); 54 | mkl_free(a); mkl_free(b); 55 | mkl_free(ahost); mkl_free(aaccl); 56 | exit(EXIT_FAILURE); 57 | } 58 | #pragma omp parallel for default(none) \ 59 | shared(a, b, ahost, aaccl, n) private(idx) 60 | for (idx = 0; idx < n * n; ++idx) { 61 | a[idx] = rand() % 32 / 32.0f; 62 | b[idx] = rand() % 32 / 32.0f; 63 | ahost[idx] = 0.0f; 64 | aaccl[idx] = 0.0f; 65 | } 66 | printf("matrix dim: %d x %d\ntime averaged over %d loops\n", n, n, NLUP); 67 | /* 68 | * matAdd on host (ahost will be used as ref. value for checking aaccl) 69 | */ 70 | memcpy(ahost, a, n2bytes); 71 | cblas_saxpy(n * n, 1.0f, b, 1, ahost, 1); 72 | /* 73 | * matAdd on accl 74 | */ 75 | for (ial = 0; ial < 8; ++ial) { 76 | /* 77 | * See matAddAB.c for details: 78 | * 79 | * ial: 80 | * 81 | * 0: ij-loop, 2^9 threads * 2^3 teams, 82 | * coalesced memory access 83 | * 84 | * 1: ji-loop, 2^9 threads * 2^3 teams, 85 | * uncoalesced memory access 86 | * 87 | * 2: ij-loop, 2^9 threads * 2^f teams, collapse(2), 88 | * uncoalesced memory access 89 | * 90 | * 3: ji-loop, 2^9 threads * 2^f teams, collapse(2), 91 | * coalesced memory access 92 | * 93 | * 4: ji-loop, 2^8 threads * 2^f teams, collapse(3), 94 | * 2x i-loop unrolling 95 | * 96 | * 5: ji-loop, 2^8 threads * 2^f teams, collapse(2), 97 | * 2x i-loop unrolling 98 | * 99 | * 6: ji-loop, 2^8 threads * 2^e teams, collapse(3), 100 | * 2x i-loop unrolling, 2x j-loop unrolling 101 | * 102 | * otherwise: cublasSaxpy in CUBLAS 103 | */ 104 | memcpy(aaccl, a, n2bytes); 105 | wtcalc = -1.0; 106 | // skip 1st run for timing 107 | matAddAB_accl(aaccl, b, n, ial); 108 | // check aaccl 109 | maxabserr = -1.0f; 110 | for (idx = 0; idx < n * n; ++idx) { 111 | maxabserr = fabsf(aaccl[idx] - ahost[idx]) > maxabserr? 
112 | fabsf(aaccl[idx] - ahost[idx]) : maxabserr; 113 | } 114 | // skip 2nd run for timing 115 | matAddAB_accl(aaccl, b, n, ial); 116 | // timing : start 117 | wtcalc = 0.0; 118 | clock_gettime(CLOCK_REALTIME, rt + 0); 119 | for (int i = 0; i < NLUP; ++i) { 120 | matAddAB_accl(aaccl, b, n, ial); 121 | } 122 | clock_gettime(CLOCK_REALTIME, rt + 1); 123 | wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 124 | printf("matAddAB (%d) : %9.1f GB/s %9.1f GB/s maxabserr = %9.1f\n", ial, 125 | NLUP * 3.0 * n2bytes / ((1 << 30) * wt), 126 | NLUP * 3.0 * n2bytes / ((1 << 30) * wtcalc), maxabserr); 127 | } 128 | /* 129 | * release memory 130 | */ 131 | mkl_free(a); mkl_free(b); 132 | mkl_free(ahost); mkl_free(aaccl); 133 | return 0; 134 | } 135 | -------------------------------------------------------------------------------- /02_dataTransRate/src/dataTransRate.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file dataTransRate.c 3 | * 4 | * @mainpage dataTransRate 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 12.03.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * dataTransRate gives the data transfer rate (in MB/sec) from src to dst. 11 | * 12 | * The possible situations are: 13 | * 14 | * - h2h: src = host and dst = host 15 | * - h2a: src = host and dst = accel 16 | * - a2a: src = accel and dst = accel 17 | */ 18 | 19 | #include <stdio.h> 20 | #include <stdlib.h> 21 | #include <time.h> 22 | #ifdef _OPENMP 23 | #include <omp.h> 24 | #endif 25 | #include "check1ns.h" 26 | 27 | /** 28 | * @brief Main entry point for dataTransRate. 29 | */ 30 | int main(int argc, char *argv[]) 31 | { 32 | // host 33 | int ihost, 34 | *hdat[2]; 35 | // accelerator 36 | int iaccel, naccel, 37 | *adat[2]; 38 | int nMB; 39 | size_t data; 40 | struct timespec rt[2]; 41 | double wt; // walltime 42 | int i, iret = 0; 43 | 44 | /* 45 | * We need 1 ns time resolution. 46 | */ 47 | check1ns(); 48 | printf("The system supports 1 ns time resolution\n"); 49 | /* 50 | * check the number of accelerators 51 | */ 52 | naccel = omp_get_num_devices(); 53 | if (0 == naccel) { 54 | printf("No accelerator found ... exit\n"); 55 | exit(EXIT_FAILURE); 56 | } else { 57 | printf("%d accelerator(s) found ... 
continue\n", naccel); 58 | } 59 | /* 60 | * prepare data (default to 512 MB), host, and accel 61 | */ 62 | if (1 == argc) { 63 | nMB = 512; 64 | } else { 65 | nMB = atoi(argv[1]); 66 | } 67 | data = nMB * (1 << 20); 68 | ihost = omp_get_initial_device(); // index of the host 69 | iaccel = 0; // index of the 1st accel 70 | for (i = 0; i < 2; i++) { 71 | if (NULL == (hdat[i] = (int *) omp_target_alloc(data, ihost))) { 72 | printf("error: memory allocation for hdat[%d] ...", i); 73 | iret = -1; 74 | } 75 | if (NULL == (adat[i] = (int *) omp_target_alloc(data, iaccel))) { 76 | printf("error: memory allocation for adat[%d] ...", i); 77 | iret = -1; 78 | } 79 | } 80 | if (0 != iret) { 81 | for (i = 0; i < 2; i++) { 82 | omp_target_free(hdat[i], ihost); 83 | omp_target_free(adat[i], iaccel); 84 | } 85 | printf(" exit\n"); 86 | exit(EXIT_FAILURE); 87 | } 88 | printf("%d MB data will be transferred", nMB); 89 | for (i = 0; i < data / sizeof(*hdat[0]); i++) { 90 | hdat[0][i] = rand(); 91 | } 92 | /* 93 | * data transfer rate: h2h, h2a, and a2a 94 | */ 95 | printf("\nData Transfer Rate\n\n"); 96 | printf("================================\n"); 97 | printf(" src dst DTR \n"); 98 | printf("------- ------- ----------------\n"); 99 | /* 100 | * h2h 101 | */ 102 | clock_gettime(CLOCK_REALTIME, rt + 0); 103 | iret = omp_target_memcpy(hdat[1], hdat[0], data, 0x0, 0x0, ihost, ihost); 104 | clock_gettime(CLOCK_REALTIME, rt + 1); 105 | if (0 != iret) { 106 | printf("error: omp_target_memcpy (h2h)\n"); 107 | exit(EXIT_FAILURE); 108 | } 109 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 110 | printf(" host host %8.1f MB/sec\n", nMB / wt); 111 | /* 112 | * h2a 113 | */ 114 | clock_gettime(CLOCK_REALTIME, rt + 0); 115 | iret = omp_target_memcpy(adat[0], hdat[0], data, 0x0, 0x0, iaccel, ihost); 116 | clock_gettime(CLOCK_REALTIME, rt + 1); 117 | if (0 != iret) { 118 | printf("error: omp_target_memcpy (h2a)\n"); 119 | exit(EXIT_FAILURE); 120 | } 121 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 122 | printf(" host accel %8.1f MB/sec\n", nMB / wt); 123 | /* 124 | * a2a 125 | * 126 | * - Synchronous execution has been fixed in Clang 11. 127 | * - Data transfer rate is somehow lower than our expectation. 128 | */ 129 | clock_gettime(CLOCK_REALTIME, rt + 0); 130 | iret = omp_target_memcpy(adat[1], adat[0], data, 0x0, 0x0, iaccel, iaccel); 131 | clock_gettime(CLOCK_REALTIME, rt + 1); 132 | if (0 != iret) { 133 | printf("error: omp_target_memcpy (a2a)\n"); 134 | exit(EXIT_FAILURE); 135 | } 136 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 137 | printf(" accel accel %8.1f MB/sec\n", nMB / wt); 138 | printf("================================\n\n"); 139 | /* 140 | * release the data 141 | */ 142 | for (i = 0; i < 2; i++) { 143 | omp_target_free(hdat[i], ihost); 144 | omp_target_free(adat[i], iaccel); 145 | } 146 | return 0; 147 | } 148 | -------------------------------------------------------------------------------- /10_matMul/src/matMul.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matMul.c 3 | * 4 | * @mainpage matMul 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 19.03.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * matMul performs matrix multiplication in single-precision on GPU. The 11 | * performance (in GFLOPS) for different implementations is compared and the 12 | * numerical results are also verified. 
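 *
 * (The GFLOPS figures are computed from the usual operation count of an
 * n x n sgemm, i.e. 2 * n^3 floating-point operations per call; see the
 * timing printf in main below.)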
13 | */ 14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <time.h> 20 | #ifdef _OPENMP 21 | #include <omp.h> 22 | #endif 23 | #include "mkl.h" 24 | #include "matMulAB.h" 25 | 26 | #define NLUP (16) 27 | 28 | /** 29 | * @brief Main entry point for matMul. 30 | */ 31 | int main(int argc, char *argv[]) 32 | { 33 | int ial, idx, n, 34 | iret = 0; 35 | size_t n2bytes; 36 | float *a, *b, *c, 37 | *chost, // c matrix on host (as reference) 38 | *caccl, // c matrix on accl 39 | maxabserr; 40 | struct timespec rt[2]; 41 | double wt; // walltime 42 | 43 | /* 44 | * preparation 45 | */ 46 | n = atoi(argv[1]); // 4096 is used for test 47 | n2bytes = sizeof(float) * n * n; 48 | if (NULL == (a = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 49 | if (NULL == (b = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 50 | if (NULL == (c = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 51 | if (NULL == (chost = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 52 | if (NULL == (caccl = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 53 | if (iret != 0) { 54 | printf("error: memory allocation\n"); 55 | mkl_free(a); mkl_free(b); mkl_free(c); 56 | mkl_free(chost); mkl_free(caccl); 57 | exit(EXIT_FAILURE); 58 | } 59 | #pragma omp parallel for default(none) \ 60 | shared(a, b, c, chost, caccl, n) private(idx) 61 | for (idx = 0; idx < n * n; idx++) { 62 | a[idx] = rand() % 32 / 32.0f; 63 | b[idx] = rand() % 32 / 32.0f; 64 | c[idx] = rand() % 32 / 32.0f; 65 | chost[idx] = 0.0f; 66 | caccl[idx] = 0.0f; 67 | } 68 | printf("matrix dim: %d x %d\ntime averaged over %d loops\n", n, n, NLUP); 69 | /* 70 | * matMul on host (chost will be used as ref. value for checking caccl) 71 | */ 72 | memcpy(chost, c, n2bytes); 73 | cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, 74 | n, n, n, 1.0f, a, n, b, n, 1.0f, chost, n); 75 | /* 76 | * matMul on accl 77 | */ 78 | for (ial = 0; ial < 11; ++ial) { 79 | /* 80 | * See matMulAB.c for details: 81 | * 82 | * ial: 83 | * 84 | * 0: jik-loop, 2^9 threads * 2^3 teams, 85 | * uncoalesced memory access 86 | * 87 | * 1: jki-loop, 2^9 threads * 2^3 teams, 88 | * uncoalesced memory access, uncoalesced r&w in innermost loop 89 | * 90 | * 2: jik-loop, 2^9 threads * 2^f teams, collapse(2) 91 | * 92 | * 3: jki-loop, 2^9 threads * 2^f teams, collapse(2), 93 | * race condition for writing c!
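 *    (after collapse(2) the j- and k-loop both run in parallel, so
 *    threads with the same j but different k update the same element
 *    c[j * n + i] concurrently - hence the race)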
94 | * 95 | * 4: jik-loop, 2^9 threads * 2^f teams, collapse(2), 96 | * 4x k-loop unrolling 97 | * 98 | * 5: jik-loop, 2^7 threads * 2^f teams, collapse(3), 99 | * 4x i-loop unrolling (stride of 2^7 rows), 100 | * 4x k-loop unrolling, 101 | * rb: 4x data reuse 102 | * 103 | * 6: jik-loop, 2^7 threads * 2^d teams, collapse(3), 104 | * 4x j-loop unrolling (stride of 1 col ), 105 | * 4x i-loop unrolling (stride of 2^7 rows), 106 | * 4x k-loop unrolling, 107 | * rb: 4x data reuse, 108 | * ra: 4x data reuse, 109 | * register blocking 110 | * 111 | * 7: based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) 112 | * 113 | * 8: based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2) 114 | * GPU shared memory for data re-use, 115 | * 16x k-loop unrolling, 116 | * shared memory blocking 117 | * 118 | * 9: based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), 119 | * 4x i-loop unrolling (stride of n/4 rows), 120 | * 4x k-loop unrolling, 121 | * rb: 4x data reuse 122 | * 123 | * otherwise: cublasSgemm in CUBLAS 124 | */ 125 | memcpy(caccl, c, n2bytes); 126 | wtcalc = -1.0; 127 | // skip 1st run for timing 128 | matMulAB_accl(a, b, caccl, n, ial); 129 | // check caccl 130 | maxabserr = -1.0f; 131 | for (idx = 0; idx < n * n; idx++) { 132 | maxabserr = fabsf(caccl[idx] - chost[idx]) > maxabserr? 133 | fabsf(caccl[idx] - chost[idx]) : maxabserr; 134 | } 135 | // skip 2nd run for timing 136 | matMulAB_accl(a, b, caccl, n, ial); 137 | // timing : start 138 | wtcalc = 0.0; 139 | clock_gettime(CLOCK_REALTIME, rt + 0); 140 | for (int i = 0; i < NLUP; ++i) { 141 | matMulAB_accl(a, b, caccl, n, ial); 142 | } 143 | clock_gettime(CLOCK_REALTIME, rt + 1); 144 | wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 145 | printf("matMulAB (%d) : %9.1f GFLOPS %9.1f GFLOPS maxabserr = %9.1f\n", ial, 146 | NLUP * 2.0e-9 * n * n * n / wt, NLUP * 2.0e-9 * n * n * n / wtcalc, 147 | maxabserr); 148 | } 149 | /* 150 | * release memory 151 | */ 152 | mkl_free(a); mkl_free(b); mkl_free(c); 153 | mkl_free(chost); mkl_free(caccl); 154 | return 0; 155 | } 156 | -------------------------------------------------------------------------------- /05_saxpy/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([saxpy], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/saxpy.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check MKL 22 | # 23 | ##############################################################################80 24 | AC_ARG_VAR([MKLINC], [The PATH wherein mkl.h can be found]) 25 | if test -z "${MKLINC}"; then 26 | AC_SUBST([MKLINC], [${MKLROOT}/include]) 27 | fi 28 | AC_ARG_VAR([MKLLIB], [The PATH wherein MKL library can be found]) 29 | if test -z "${MKLLIB}"; then 30 | AC_SUBST([MKLLIB], [${MKLROOT}/lib/intel64]) 31 | fi 32 | 
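# (MKLROOT and CUDA_ROOT are assumed to be set in the environment, e.g. by
# the cluster's module system; MKLINC/MKLLIB and CUDAINC/CUDALIB only need
# to be passed explicitly when they are not.)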
##############################################################################80 33 | # 34 | # check C compiler 35 | # 36 | ##############################################################################80 37 | CFLAGS+="-I${CUDAINC} -I${MKLINC}" 38 | LDFLAGS+="-L${CUDALIB} -L${MKLLIB}" 39 | # 40 | AC_PROG_CC([clang gcc]) 41 | AS_IF([test "${CC}" = gcc], 42 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"]) 43 | AS_IF([test "${CC}" = clang], 44 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \ 45 | -Xopenmp-target -march=sm_61 $CFLAGS"]) 46 | ##############################################################################80 47 | # 48 | # check archiver 49 | # 50 | ##############################################################################80 51 | AC_PROG_RANLIB 52 | AM_PROG_AR 53 | ##############################################################################80 54 | # 55 | # check headers 56 | # 57 | ##############################################################################80 58 | AC_CHECK_HEADER([cuda_runtime.h], [], 59 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 60 | AC_CHECK_HEADER([cublas_v2.h], [], 61 | [AC_MSG_ERROR([cublas_v2.h required, but not found])], []) 62 | AC_CHECK_HEADER([mkl.h], [], 63 | [AC_MSG_ERROR([mkl.h required, but not found])], []) 64 | ##############################################################################80 65 | # 66 | # check libraries 67 | # 68 | ##############################################################################80 69 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 70 | [AC_MSG_ERROR([libcudart required, but not found])], []) 71 | AC_CHECK_LIB([cublas], [cublasSaxpy], [], 72 | [AC_MSG_ERROR([libcublas required, but not found])], []) 73 | AC_CHECK_LIB([pthread], [pthread_create], [], 74 | [AC_MSG_ERROR([libpthread required, but not found])], []) 75 | AC_CHECK_LIB([iomp5], [omp_set_num_threads], [], 76 | [AC_MSG_ERROR([libiomp5 required, but not found])], []) 77 | AC_CHECK_LIB([mkl_core], [mkl_blas_xsaxpy], [], 78 | [AC_MSG_ERROR([libmkl_core required, but not found])], 79 | [-lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lm]) 80 | AC_CHECK_LIB([mkl_intel_thread], [mkl_blas_saxpy], [], 81 | [AC_MSG_ERROR([libmkl_intel_thread required, but not found])], 82 | [-lmkl_intel_lp64 -lmkl_core -liomp5 -lm]) 83 | AC_CHECK_LIB([mkl_intel_lp64], [saxpy], [], 84 | [AC_MSG_ERROR([libmkl_intel_lp64 required, but not found])], 85 | [-lmkl_intel_thread -lmkl_core -liomp5 -lm]) 86 | ##############################################################################80 87 | # 88 | # check Doxygen 89 | # 90 | ##############################################################################80 91 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 92 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 93 | AS_IF([test -z "${DOXYGEN}"], 94 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 95 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 96 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 97 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 98 | ##############################################################################80 99 | # 100 | # check ccsalloc (in OpenCCS) 101 | # 102 | ##############################################################################80 103 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 104 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 105 | AS_IF([test -z "${CCSALLOC}"], 
106 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 107 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 108 | ##############################################################################80 109 | # 110 | # create final files 111 | # 112 | ##############################################################################80 113 | AC_CONFIG_HEADERS([config.h]) 114 | AC_CONFIG_FILES([Makefile 115 | src/Makefile 116 | tests/Makefile]) 117 | AC_OUTPUT 118 | 119 | echo " 120 | //============================================================================80 121 | 122 | Configuration: 123 | 124 | CC : ${CC} 125 | CFLAGS : ${CFLAGS} 126 | LDFLAGS : ${LDFLAGS} 127 | LIBS : ${LIBS} 128 | 129 | //============================================================================80 130 | 131 | Now, type make to build ..." 132 | -------------------------------------------------------------------------------- /09_matAdd/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([matAdd], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/matAdd.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check MKL 22 | # 23 | ##############################################################################80 24 | AC_ARG_VAR([MKLINC], [The PATH wherein mkl.h can be found]) 25 | if test -z "${MKLINC}"; then 26 | AC_SUBST([MKLINC], [${MKLROOT}/include]) 27 | fi 28 | AC_ARG_VAR([MKLLIB], [The PATH wherein MKL library can be found]) 29 | if test -z "${MKLLIB}"; then 30 | AC_SUBST([MKLLIB], [${MKLROOT}/lib/intel64]) 31 | fi 32 | ##############################################################################80 33 | # 34 | # check C compiler 35 | # 36 | ##############################################################################80 37 | CFLAGS+="-I${CUDAINC} -I${MKLINC}" 38 | LDFLAGS+="-L${CUDALIB} -L${MKLLIB}" 39 | # 40 | AC_PROG_CC([clang gcc]) 41 | AS_IF([test "${CC}" = gcc], 42 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"]) 43 | AS_IF([test "${CC}" = clang], 44 | [CFLAGS="-Wall -Werror -O2 -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ 45 | -Xopenmp-target -march=sm_61 $CFLAGS"]) 46 | ##############################################################################80 47 | # 48 | # check archiver 49 | # 50 | ##############################################################################80 51 | AC_PROG_RANLIB 52 | AM_PROG_AR 53 | ##############################################################################80 54 | # 55 | # check headers 56 | # 57 | ##############################################################################80 58 | AC_CHECK_HEADER([cuda_runtime.h], [], 59 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 60 | AC_CHECK_HEADER([cublas_v2.h], [], 61 | [AC_MSG_ERROR([cublas_v2.h required, but not 
found])], []) 62 | AC_CHECK_HEADER([mkl.h], [], 63 | [AC_MSG_ERROR([mkl.h required, but not found])], []) 64 | ##############################################################################80 65 | # 66 | # check libraries 67 | # 68 | ##############################################################################80 69 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 70 | [AC_MSG_ERROR([libcudart required, but not found])], []) 71 | AC_CHECK_LIB([cublas], [cublasSaxpy], [], 72 | [AC_MSG_ERROR([libcublas required, but not found])], []) 73 | AC_CHECK_LIB([pthread], [pthread_create], [], 74 | [AC_MSG_ERROR([libpthread required, but not found])], []) 75 | AC_CHECK_LIB([iomp5], [omp_set_num_threads], [], 76 | [AC_MSG_ERROR([libiomp5 required, but not found])], []) 77 | AC_CHECK_LIB([mkl_core], [mkl_blas_xsaxpy], [], 78 | [AC_MSG_ERROR([libmkl_core required, but not found])], 79 | [-lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lm]) 80 | AC_CHECK_LIB([mkl_intel_thread], [mkl_blas_saxpy], [], 81 | [AC_MSG_ERROR([libmkl_intel_thread required, but not found])], 82 | [-lmkl_intel_lp64 -lmkl_core -liomp5 -lm]) 83 | AC_CHECK_LIB([mkl_intel_lp64], [saxpy], [], 84 | [AC_MSG_ERROR([libmkl_intel_lp64 required, but not found])], 85 | [-lmkl_intel_thread -lmkl_core -liomp5 -lm]) 86 | ##############################################################################80 87 | # 88 | # check Doxygen 89 | # 90 | ##############################################################################80 91 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 92 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 93 | AS_IF([test -z "${DOXYGEN}"], 94 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 95 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 96 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 97 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 98 | ##############################################################################80 99 | # 100 | # check ccsalloc (in OpenCCS) 101 | # 102 | ##############################################################################80 103 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 104 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 105 | AS_IF([test -z "${CCSALLOC}"], 106 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 107 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 108 | ##############################################################################80 109 | # 110 | # create final files 111 | # 112 | ##############################################################################80 113 | AC_CONFIG_HEADERS([config.h]) 114 | AC_CONFIG_FILES([Makefile 115 | src/Makefile 116 | tests/Makefile]) 117 | AC_OUTPUT 118 | 119 | echo " 120 | //============================================================================80 121 | 122 | Configuration: 123 | 124 | CC : ${CC} 125 | CFLAGS : ${CFLAGS} 126 | LDFLAGS : ${LDFLAGS} 127 | LIBS : ${LIBS} 128 | 129 | //============================================================================80 130 | 131 | Now, type make to build ..." 
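#
# Since AC_PROG_CC([clang gcc]) above searches for clang first, the compiler
# can be forced at configure time, e.g.:
#
#   ./configure CC=gcc
#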
132 | -------------------------------------------------------------------------------- /10_matMul/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([matMul], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/matMul.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check MKL 22 | # 23 | ##############################################################################80 24 | AC_ARG_VAR([MKLINC], [The PATH wherein mkl.h can be found]) 25 | if test -z "${MKLINC}"; then 26 | AC_SUBST([MKLINC], [${MKLROOT}/include]) 27 | fi 28 | AC_ARG_VAR([MKLLIB], [The PATH wherein MKL library can be found]) 29 | if test -z "${MKLLIB}"; then 30 | AC_SUBST([MKLLIB], [${MKLROOT}/lib/intel64]) 31 | fi 32 | ##############################################################################80 33 | # 34 | # check C compiler 35 | # 36 | ##############################################################################80 37 | CFLAGS+="-I${CUDAINC} -I${MKLINC}" 38 | LDFLAGS+="-L${CUDALIB} -L${MKLLIB}" 39 | # 40 | AC_PROG_CC([clang gcc]) 41 | AS_IF([test "${CC}" = gcc], 42 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"]) 43 | AS_IF([test "${CC}" = clang], 44 | [CFLAGS="-Wall -Werror -O2 -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ 45 | -Xopenmp-target -march=sm_61 $CFLAGS"]) 46 | ##############################################################################80 47 | # 48 | # check archiver 49 | # 50 | ##############################################################################80 51 | AC_PROG_RANLIB 52 | AM_PROG_AR 53 | ##############################################################################80 54 | # 55 | # check headers 56 | # 57 | ##############################################################################80 58 | AC_CHECK_HEADER([cuda_runtime.h], [], 59 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 60 | AC_CHECK_HEADER([cublas_v2.h], [], 61 | [AC_MSG_ERROR([cublas_v2.h required, but not found])], []) 62 | AC_CHECK_HEADER([mkl.h], [], 63 | [AC_MSG_ERROR([mkl.h required, but not found])], []) 64 | ##############################################################################80 65 | # 66 | # check libraries 67 | # 68 | ##############################################################################80 69 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 70 | [AC_MSG_ERROR([libcudart required, but not found])], []) 71 | AC_CHECK_LIB([cublas], [cublasSgemm], [], 72 | [AC_MSG_ERROR([libcublas required, but not found])], []) 73 | AC_CHECK_LIB([pthread], [pthread_create], [], 74 | [AC_MSG_ERROR([libpthread required, but not found])], []) 75 | AC_CHECK_LIB([iomp5], [omp_set_num_threads], [], 76 | [AC_MSG_ERROR([libiomp5 required, but not found])], []) 77 | AC_CHECK_LIB([mkl_core], [mkl_blas_xsgemm], [], 78 | [AC_MSG_ERROR([libmkl_core 
required, but not found])], 79 | [-lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lm]) 80 | AC_CHECK_LIB([mkl_intel_thread], [mkl_blas_sgemm], [], 81 | [AC_MSG_ERROR([libmkl_intel_thread required, but not found])], 82 | [-lmkl_intel_lp64 -lmkl_core -liomp5 -lm]) 83 | AC_CHECK_LIB([mkl_intel_lp64], [sgemm], [], 84 | [AC_MSG_ERROR([libmkl_intel_lp64 required, but not found])], 85 | [-lmkl_intel_thread -lmkl_core -liomp5 -lm]) 86 | ##############################################################################80 87 | # 88 | # check Doxygen 89 | # 90 | ##############################################################################80 91 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 92 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 93 | AS_IF([test -z "${DOXYGEN}"], 94 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 95 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 96 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 97 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 98 | ##############################################################################80 99 | # 100 | # check ccsalloc (in OpenCCS) 101 | # 102 | ##############################################################################80 103 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 104 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 105 | AS_IF([test -z "${CCSALLOC}"], 106 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 107 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 108 | ##############################################################################80 109 | # 110 | # create final files 111 | # 112 | ##############################################################################80 113 | AC_CONFIG_HEADERS([config.h]) 114 | AC_CONFIG_FILES([Makefile 115 | src/Makefile 116 | tests/Makefile]) 117 | AC_OUTPUT 118 | 119 | echo " 120 | //============================================================================80 121 | 122 | Configuration: 123 | 124 | CC : ${CC} 125 | CFLAGS : ${CFLAGS} 126 | LDFLAGS : ${LDFLAGS} 127 | LIBS : ${LIBS} 128 | 129 | //============================================================================80 130 | 131 | Now, type make to build ..." 132 | -------------------------------------------------------------------------------- /05_saxpy/src/saxpy.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file saxpy.c 3 | * 4 | * @mainpage saxpy 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 05.04.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * saxpy performs the \c saxpy operation on host as well as accelerator. 11 | * The performance (in MB/s) for different implementations is also compared. 12 | * 13 | * The \c saxpy operation is defined as: 14 | * 15 | * y := a * x + y 16 | * 17 | * where: 18 | * 19 | * - a is a scalar. 20 | * - x and y are single-precision vectors each with n elements. 21 | */ 22 | 23 | #include <stdio.h> 24 | #include <stdlib.h> 25 | #include <string.h> 26 | #include <math.h> 27 | #include <time.h> 28 | #ifdef _OPENMP 29 | #include <omp.h> 30 | #endif 31 | #include "mkl.h" 32 | #include "hsaxpy.h" 33 | #include "asaxpy.h" 34 | #include "check1ns.h" 35 | #include "wtcalc.h" 36 | 37 | #define TWO26 (1 << 26) 38 | #define NLUP (32) 39 | 40 | /** 41 | * @brief Main entry point for saxpy.
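 *
 * (The MB/s figures count 3 * nbytes of memory traffic per saxpy call:
 * x is read, y is read, and y is written back; see the timing printf in
 * main below.)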
42 | */ 43 | int main(int argc, char *argv[]) 44 | { 45 | int i, n, 46 | iret, 47 | ial; 48 | size_t nbytes; 49 | float a = 2.0f, 50 | *x, *y, 51 | *yhost, 52 | *yaccl, 53 | maxabserr; 54 | struct timespec rt[2]; 55 | double wt; // walltime 56 | 57 | /* 58 | * We need 1 ns time resolution. 59 | */ 60 | check1ns(); 61 | printf("The system supports 1 ns time resolution\n"); 62 | /* 63 | * check the number of accelerators 64 | */ 65 | if (0 == omp_get_num_devices()) { 66 | printf("No accelerator found ... exit\n"); 67 | exit(EXIT_FAILURE); 68 | } 69 | /* 70 | * preparation 71 | */ 72 | n = TWO26; 73 | nbytes = sizeof(float) * n; 74 | iret = 0; 75 | if (NULL == (x = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; 76 | if (NULL == (y = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; 77 | if (NULL == (yhost = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; 78 | if (NULL == (yaccl = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; 79 | if (0 != iret) { 80 | printf("error: memory allocation\n"); 81 | mkl_free(x); mkl_free(y); 82 | mkl_free(yhost); mkl_free(yaccl); 83 | exit(EXIT_FAILURE); 84 | } 85 | #pragma omp parallel for default(none) \ 86 | shared(a, x, y, yhost, yaccl, n) private(i) 87 | for (i = 0; i < n; ++i) { 88 | x[i] = rand() % 32 / 32.0f; 89 | y[i] = rand() % 32 / 32.0f; 90 | yhost[i] = a * x[i] + y[i]; // yhost will be used as reference value 91 | yaccl[i] = 0.0f; 92 | } 93 | printf("total size of x and y is %9.1f MB\n", 2.0 * nbytes / (1 << 20)); 94 | printf("tests are averaged over %2d loops\n", NLUP); 95 | /* 96 | * saxpy on host 97 | */ 98 | for (ial = 0; ial < 2; ++ial) { 99 | /* 100 | * See hsaxpy.c for details: 101 | * 102 | * ial: 103 | * 104 | * 0: naive implementation 105 | * otherwise: saxpy in MKL 106 | */ 107 | memcpy(yaccl, y, nbytes); 108 | wtcalc = -1.0; 109 | // skip 1st run for timing 110 | hsaxpy(n, a, x, yaccl, ial); 111 | // check yaccl 112 | maxabserr = -1.0f; 113 | for (i = 0; i < n; ++i) { 114 | maxabserr = fabsf(yaccl[i] - yhost[i]) > maxabserr? 115 | fabsf(yaccl[i] - yhost[i]) : maxabserr; 116 | } 117 | // skip 2nd run for timing 118 | hsaxpy(n, a, x, yaccl, ial); 119 | // timing : start 120 | wtcalc = 0.0; 121 | clock_gettime(CLOCK_REALTIME, rt + 0); 122 | for (int ilup = 0; ilup < NLUP; ++ilup) { 123 | hsaxpy(n, a, x, yaccl, ial); 124 | } 125 | clock_gettime(CLOCK_REALTIME, rt + 1); 126 | wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 127 | printf("saxpy on host (%d) : %9.1f MB/s %9.1f MB/s maxabserr = %9.1f\n", 128 | ial, NLUP * 3.0 * nbytes / ((1 << 20) * wt), 129 | NLUP * 3.0 * nbytes / ((1 << 20) * wtcalc), maxabserr); 130 | } 131 | /* 132 | * saxpy on accl 133 | */ 134 | for (ial = 1; ial < 9; ++ial) { 135 | /* 136 | * See asaxpy.c for details: 137 | * 138 | * ial: 139 | * 140 | * 0: <<<2^0 , 2^0 >>>, TOO SLOW! not tested 141 | * 1: <<<2^0 , 2^7 >>>, auto scheduling 142 | * 2: <<<2^7 , 2^0 >>>, auto scheduling 143 | * 3: <<<2^7 , 2^7 >>>, auto scheduling 144 | * 4: <<<2^16, 2^10>>>, manual scheduling 145 | * 5: <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling (2^15*2^7*16==2^26) 146 | * 6: <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling 147 | * 7: de-linearize the vector and then collapse the ji-loop. 
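 *    (a sketch of the idea in 7: the length-n vector is viewed as an
 *    m x 16 matrix with m = n / 16, and the resulting loop nest is
 *    collapsed; see asaxpy.c, where m = (n >> 4) is set up for this)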
148 | * otherwise: cublasSaxpy in CUBLAS 149 | */ 150 | memcpy(yaccl, y, nbytes); 151 | wtcalc = -1.0; 152 | // skip 1st run for timing 153 | asaxpy(n, a, x, yaccl, ial); 154 | // check yaccl 155 | maxabserr = -1.0f; 156 | for (i = 0; i < n; ++i) { 157 | maxabserr = fabsf(yaccl[i] - yhost[i]) > maxabserr? 158 | fabsf(yaccl[i] - yhost[i]) : maxabserr; 159 | } 160 | // skip 2nd run for timing 161 | asaxpy(n, a, x, yaccl, ial); 162 | // timing : start 163 | wtcalc = 0.0; 164 | clock_gettime(CLOCK_REALTIME, rt + 0); 165 | for (int ilup = 0; ilup < NLUP; ++ilup) { 166 | asaxpy(n, a, x, yaccl, ial); 167 | } 168 | clock_gettime(CLOCK_REALTIME, rt + 1); 169 | wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 170 | printf("saxpy on accl (%d) : %9.1f MB/s %9.1f MB/s maxabserr = %9.1f\n", 171 | ial, NLUP * 3.0 * nbytes / ((1 << 20) * wt), 172 | NLUP * 3.0 * nbytes / ((1 << 20) * wtcalc), maxabserr); 173 | } 174 | /* 175 | * release memory 176 | */ 177 | mkl_free(x); mkl_free(y); 178 | mkl_free(yhost); mkl_free(yaccl); 179 | return 0; 180 | } 181 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/Clang/build_clang_offload.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Build Clang/LLVM with OpenMP Support for Nvidia GPU Offloading 3 | author: Xin Wu (PC²) 4 | date: 28.01.2020 5 | --- 6 | 7 | # Check Nvidia GPU 8 | 9 | The build procedure was carried out on a Tesla node of OCuLUS at PC². It 10 | features an Nvidia Tesla K20X GPU. Thus it's necessary to check the Tesla K20X 11 | GPU on the compute node before building Clang with OpenMP support for offloading 12 | computation to the Nvidia GPU. 13 | 14 | The relevant scripts and log files can be found in `00_check_gpu`. 15 | 16 | `tesla.sh` is a driver script and should be submitted with `ccsalloc`: 17 | 18 | ```bash 19 | ccsalloc tesla.sh 20 | ``` 21 | 22 | `realscript.sh` does the real job and the output can be found in `tesla.log`. 23 | 24 | # Build Clang and Necessary Toolchains 25 | 26 | The necessary toolchains for building Clang need to be built first. For this 27 | purpose we have built GCC 8.3.0,[^gcc830] binutils, autoconf, automake, OpenSSL, 28 | CMake, and ncurses. 29 | 30 | [^gcc830]: At the time of writing, GCC 9.2.0 is not supported for building Clang 31 | with OpenMP offloading to GPU. 32 | 33 | After the toolchains have been built, Clang can be built with GCC 8.3.0 by using 34 | the following script: 35 | 36 | ```bash 37 | pkgname="llvmorg-9.0.1" 38 | curl -L -O https://github.com/llvm/llvm-project/archive/${pkgname}.tar.gz 39 | tar xf ${pkgname}.tar.gz 40 | BUILDIR="GCC" 41 | rm -fr $BUILDIR 42 | mkdir -p $BUILDIR 43 | cd $BUILDIR 44 | cmake \ 45 | -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;libcxx;libcxxabi;lld;openmp" \ 46 | -DCMAKE_PREFIX_PATH="${TOOLCHAINS}" \ 47 | -DCMAKE_BUILD_TYPE=Release \ 48 | -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \ 49 | -DCMAKE_INSTALL_PREFIX=${DESTDIR} \ 50 | -DCLANG_OPENMP_NVPTX_DEFAULT_ARCH=sm_61 \ 51 | -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=35,37,50,52,60,61,70,75 \ 52 | -DCMAKE_C_COMPILER=gcc \ 53 | -DCMAKE_CXX_COMPILER=g++ \ 54 | -G "Unix Makefiles" ../llvm-project-${pkgname}/llvm 2>&1 | tee ${pkgname}.${BUILDIR}.cmak.logfile 55 | make -j 16 2>&1 | tee ${pkgname}.${BUILDIR}.make.logfile 56 | make install 2>&1 | tee ${pkgname}.${BUILDIR}.inst.logfile 57 | cd ..
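# (Key options above: LLVM_ENABLE_PROJECTS pulls in clang and the OpenMP
# runtime, LLVM_TARGETS_TO_BUILD="X86;NVPTX" enables PTX code generation,
# and LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES lists the GPU compute
# capabilities the offload runtime is built for.)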
58 | ``` 59 | 60 | # Bootstrap Clang with `libc++` 61 | 62 | We need to bootstrap Clang for OpenMP offloading. The following script 63 | bootstraps Clang with its own `libc++`: 64 | 65 | ```bash 66 | pkgname="llvmorg-9.0.1" 67 | curl -L -O https://github.com/llvm/llvm-project/archive/${pkgname}.tar.gz 68 | tar xf ${pkgname}.tar.gz 69 | BUILDIR="LIBCXX" 70 | rm -fr $BUILDIR 71 | mkdir -p $BUILDIR 72 | cd $BUILDIR 73 | cmake \ 74 | -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;libcxx;libcxxabi;lld;openmp" \ 75 | -DCMAKE_PREFIX_PATH="${TOOLCHAINS}" \ 76 | -DCMAKE_BUILD_TYPE=Release \ 77 | -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \ 78 | -DCMAKE_INSTALL_PREFIX=${DESTDIR} \ 79 | -DCLANG_OPENMP_NVPTX_DEFAULT_ARCH=sm_61 \ 80 | -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=35,37,50,52,60,61,70,75 \ 81 | -DCMAKE_C_COMPILER=clang \ 82 | -DCMAKE_CXX_COMPILER=clang++ \ 83 | -DCMAKE_CXX_FLAGS="-stdlib=libc++" \ 84 | -DCMAKE_CXX_LINK_FLAGS="-stdlib=libc++" \ 85 | -G "Unix Makefiles" ../llvm-project-${pkgname}/llvm 2>&1 | tee ${pkgname}.${BUILDIR}.cmak.logfile 86 | make -j 16 2>&1 | tee ${pkgname}.${BUILDIR}.make.logfile 87 | make install 2>&1 | tee ${pkgname}.${BUILDIR}.inst.logfile 88 | cd .. 89 | ``` 90 | 91 | To access this version of Clang on OCuLUS: 92 | 93 | ```bash 94 | module load clang/9.0.1_BS_libcxx_CUDA10.1 95 | ``` 96 | 97 | # Bootstrap Clang with `libstdc++` 98 | 99 | Clang can also be bootstrapped with GNU's `libstdc++` with the following script: 100 | 101 | ```bash 102 | pkgname="llvmorg-9.0.1" 103 | curl -L -O https://github.com/llvm/llvm-project/archive/${pkgname}.tar.gz 104 | tar xf ${pkgname}.tar.gz 105 | BUILDIR="LIBSTDCXX" 106 | rm -fr $BUILDIR 107 | mkdir -p $BUILDIR 108 | cd $BUILDIR 109 | cmake \ 110 | -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;libcxx;libcxxabi;lld;openmp" \ 111 | -DCMAKE_PREFIX_PATH="${TOOLCHAINS}" \ 112 | -DCMAKE_BUILD_TYPE=Release \ 113 | -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \ 114 | -DCMAKE_INSTALL_PREFIX=${DESTDIR} \ 115 | -DCLANG_OPENMP_NVPTX_DEFAULT_ARCH=sm_61 \ 116 | -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=35,37,50,52,60,61,70,75 \ 117 | -DCMAKE_C_COMPILER=clang \ 118 | -DCMAKE_CXX_COMPILER=clang++ \ 119 | -G "Unix Makefiles" ../llvm-project-${pkgname}/llvm 2>&1 | tee ${pkgname}.${BUILDIR}.cmak.logfile 120 | make -j 16 2>&1 | tee ${pkgname}.${BUILDIR}.make.logfile 121 | make install 2>&1 | tee ${pkgname}.${BUILDIR}.inst.logfile 122 | cd .. 123 | ``` 124 | 125 | To access this version of Clang on OCuLUS: 126 | 127 | ```bash 128 | module load clang/9.0.1_BS_libstdcxx_CUDA10.1 129 | ``` 130 | 131 | -------------------------------------------------------------------------------- /09_matAdd/src/matAddAB.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matAddAB.c 3 | * 4 | * @brief Function definition for matrix addition (A += B) in single-precision. 5 | * 6 | * This source file contains function definition for matrix addition (A += B) 7 | * in single-precision. 
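 * The global wtcalc (presumably declared extern in matAddAB.h) accumulates
 * kernel-only walltime: the caller sets it to 0.0 to enable timing or to a
 * negative value to disable it, and each call adds its measured interval.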
8 | * 9 | * @author Xin Wu (PC²) 10 | * @date 07.02.2020 11 | * @copyright CC BY-SA 2.0 12 | */ 13 | 14 | #include <stdio.h> 15 | #include <stdlib.h> 16 | #include <time.h> 17 | #ifdef _OPENMP 18 | #include <omp.h> 19 | #endif 20 | #include <cuda_runtime.h> 21 | #include "cublas_v2.h" 22 | #include "matAddAB.h" 23 | 24 | #define NTHRDS7 (1 << 0x7) /* 2^{7} */ 25 | #define NTHRDS8 (1 << 0x8) /* 2^{8} */ 26 | #define NTHRDS9 (1 << 0x9) /* 2^{9} */ 27 | 28 | #define LTEAMSD (1 << 0xD) /* 2^{13} */ 29 | #define LTEAMSE (1 << 0xE) /* 2^{14} */ 30 | #define LTEAMSF (1 << 0xF) /* 2^{15} */ 31 | 32 | #define BLKROW (512) /* 2x number of threads in each team */ 33 | 34 | double wtcalc; 35 | 36 | void matAddAB_accl(float *a, 37 | float *b, 38 | int n, 39 | int ial) 40 | { 41 | cublasHandle_t handle; 42 | float alfa = 1.0f, 43 | *a_dev = NULL, 44 | *b_dev = NULL; 45 | struct timespec rt[2]; 46 | 47 | switch (ial) { 48 | case 0: 49 | /* 50 | * - ij-loop 51 | * - 2^9 threads per team and 2^3 teams 52 | * - coalesced memory access 53 | */ 54 | #pragma omp target data device(0) \ 55 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 56 | { 57 | clock_gettime(CLOCK_REALTIME, rt + 0); 58 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 59 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 60 | default(none) shared(a, b, n) 61 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 62 | dist_schedule(static, NTHRDS9) \ 63 | default(none) shared(a, b, n) 64 | for (int i = 0; i < n; ++i) { /* parallel */ 65 | for (int j = 0; j < n; ++j) { /* sequential */ 66 | a[j * n + i] += b[j * n + i]; 67 | } /* end j-loop */ 68 | } /* end i-loop */ 69 | clock_gettime(CLOCK_REALTIME, rt + 1); 70 | } 71 | break; 72 | case 1: 73 | /* 74 | * - ji-loop 75 | * - 2^9 threads per team and 2^3 teams 76 | * - n-stride memory read for a and b 77 | * - n-stride memory write for a 78 | */ 79 | #pragma omp target data device(0) \ 80 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 81 | { 82 | clock_gettime(CLOCK_REALTIME, rt + 0); 83 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 84 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 85 | default(none) shared(a, b, n) 86 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 87 | dist_schedule(static, NTHRDS9) \ 88 | default(none) shared(a, b, n) 89 | for (int j = 0; j < n; ++j) { /* parallel */ 90 | for (int i = 0; i < n; ++i) { /* sequential */ 91 | a[j * n + i] += b[j * n + i]; 92 | } /* end i-loop */ 93 | } /* end j-loop */ 94 | clock_gettime(CLOCK_REALTIME, rt + 1); 95 | } 96 | break; 97 | case 2: 98 | /* 99 | * - ij-loop 100 | * - 2^9 threads per team and 2^f teams 101 | * - collapse(2) 102 | * - n-stride memory read for a and b 103 | * - n-stride memory write for a 104 | */ 105 | #pragma omp target data device(0) \ 106 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 107 | { 108 | clock_gettime(CLOCK_REALTIME, rt + 0); 109 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 110 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 111 | default(none) shared(a, b, n) 112 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 113 | dist_schedule(static, NTHRDS9) collapse(2) \ 114 | default(none) shared(a, b, n) 115 | for (int i = 0; i < n; ++i) { 116 | for (int j = 0; j < n; ++j) { 117 | a[j * n + i] += b[j * n + i]; 118 | } /* end j-loop */ 119 | } /* end i-loop */ 120 | clock_gettime(CLOCK_REALTIME, rt + 1); 121 | } 122 | break; 123 | case 3: 124 | /* 125 | * - ji-loop 126 | * - 2^9 threads per team and 2^f teams 127 | * - collapse(2) 128 | * 
- coalesced memory access 129 | */ 130 | #pragma omp target data device(0) \ 131 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 132 | { 133 | clock_gettime(CLOCK_REALTIME, rt + 0); 134 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 135 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 136 | default(none) shared(a, b, n) 137 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 138 | dist_schedule(static, NTHRDS9) collapse(2) \ 139 | default(none) shared(a, b, n) 140 | for (int j = 0; j < n; ++j) { 141 | for (int i = 0; i < n; ++i) { 142 | a[j * n + i] += b[j * n + i]; 143 | } /* end i-loop */ 144 | } /* end j-loop */ 145 | clock_gettime(CLOCK_REALTIME, rt + 1); 146 | } 147 | break; 148 | case 4: 149 | /* 150 | * - ji-loop 151 | * - 2^8 threads per team and 2^f teams 152 | * - collapse(3) 153 | * - 2x i-loop unrolling (stride of 2^8 rows) 154 | */ 155 | #pragma omp target data device(0) \ 156 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 157 | { 158 | clock_gettime(CLOCK_REALTIME, rt + 0); 159 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS8) \ 160 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 161 | default(none) shared(a, b, n) 162 | #pragma omp distribute parallel for num_threads(NTHRDS8) \ 163 | dist_schedule(static, NTHRDS8) collapse(3) \ 164 | default(none) shared(a, b, n) 165 | for (int j = 0; j < n; ++j) { 166 | for (int iblk = 0; iblk < n / BLKROW; ++iblk) { 167 | for (int i = 0; i < NTHRDS8; ++i) { /* 2x unrolling */ 168 | a[j * n + iblk * BLKROW + i ] += 169 | b[j * n + iblk * BLKROW + i ]; 170 | a[j * n + iblk * BLKROW + i + NTHRDS8] += 171 | b[j * n + iblk * BLKROW + i + NTHRDS8]; 172 | } /* end i-loop */ 173 | } /* end iblk-loop */ 174 | } /* end j-loop */ 175 | clock_gettime(CLOCK_REALTIME, rt + 1); 176 | } 177 | break; 178 | case 5: 179 | /* 180 | * - ji-loop 181 | * - 2^8 threads per team and 2^f teams 182 | * - collapse(2) 183 | * - 2x i-loop unrolling (stride of n/2 rows) 184 | */ 185 | #pragma omp target data device(0) \ 186 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 187 | { 188 | clock_gettime(CLOCK_REALTIME, rt + 0); 189 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS8) \ 190 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 191 | default(none) shared(a, b, n) 192 | #pragma omp distribute parallel for num_threads(NTHRDS8) \ 193 | dist_schedule(static, NTHRDS8) collapse(2) \ 194 | default(none) shared(a, b, n) 195 | for (int j = 0; j < n; ++j) { 196 | for (int i = 0; i < (n >> 1); ++i) { /* 2x unrolling */ 197 | a[j * n + i ] += 198 | b[j * n + i ]; 199 | a[j * n + i + (n >> 1)] += 200 | b[j * n + i + (n >> 1)]; 201 | } /* end i-loop */ 202 | } /* end j-loop */ 203 | clock_gettime(CLOCK_REALTIME, rt + 1); 204 | } 205 | break; 206 | case 6: 207 | /* 208 | * - ji-loop 209 | * - 2^8 threads per team and 2^14 teams 210 | * - collapse(3) 211 | * - 2x j-loop unrolling (stride of 1 col ) 212 | * - 2x i-loop unrolling (stride of 2^8 rows) 213 | */ 214 | #pragma omp target data device(0) \ 215 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 216 | { 217 | clock_gettime(CLOCK_REALTIME, rt + 0); 218 | #pragma omp target teams device(0) num_teams(LTEAMSE) thread_limit(NTHRDS8) \ 219 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 220 | default(none) shared(a, b, n) 221 | #pragma omp distribute parallel for num_threads(NTHRDS8) \ 222 | dist_schedule(static, NTHRDS8) collapse(3) \ 223 | default(none) shared(a, b, n) 224 | for (int j = 0; j < n; j += 2) { /* 2x unrolling */ 225 | for 
(int iblk = 0; iblk < n / BLKROW; ++iblk) { 226 | for (int i = 0; i < NTHRDS8; ++i) { /* 2x unrolling */ 227 | a[ j * n + iblk * BLKROW + i ] += 228 | b[ j * n + iblk * BLKROW + i ]; 229 | a[ j * n + iblk * BLKROW + i + NTHRDS8] += 230 | b[ j * n + iblk * BLKROW + i + NTHRDS8]; 231 | a[(j + 1) * n + iblk * BLKROW + i ] += 232 | b[(j + 1) * n + iblk * BLKROW + i ]; 233 | a[(j + 1) * n + iblk * BLKROW + i + NTHRDS8] += 234 | b[(j + 1) * n + iblk * BLKROW + i + NTHRDS8]; 235 | } /* end i-loop */ 236 | } /* end iblk-loop */ 237 | } /* end j-loop */ 238 | clock_gettime(CLOCK_REALTIME, rt + 1); 239 | } 240 | break; 241 | default: 242 | /* 243 | * cublasSaxpy in CUBLAS 244 | */ 245 | if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle)) { 246 | printf("error: initialization (CUBLAS)\n"); 247 | cublasDestroy(handle); 248 | exit(EXIT_FAILURE); 249 | } 250 | if (cudaSuccess != cudaMalloc((void **) &a_dev, sizeof(*a) * n * n) || 251 | cudaSuccess != cudaMalloc((void **) &b_dev, sizeof(*b) * n * n)) { 252 | printf("error: memory allocation (CUDA)\n"); 253 | cudaFree(a_dev); cudaFree(b_dev); 254 | cublasDestroy(handle); 255 | exit(EXIT_FAILURE); 256 | } 257 | if (CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*a), a, n, a_dev, n) || 258 | CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*b), b, n, b_dev, n)) { 259 | printf("error: host --> accl (CUBLAS)\n"); 260 | cudaFree(a_dev); cudaFree(b_dev); 261 | cublasDestroy(handle); 262 | exit(EXIT_FAILURE); 263 | } 264 | clock_gettime(CLOCK_REALTIME, rt + 0); 265 | if (CUBLAS_STATUS_SUCCESS != cublasSaxpy(handle, n * n, &alfa, b_dev, 1, a_dev, 1)) { 266 | printf("error: cublasSaxpy (CUBLAS)\n"); 267 | cudaFree(a_dev); cudaFree(b_dev); 268 | cublasDestroy(handle); 269 | exit(EXIT_FAILURE); 270 | } 271 | if (cudaSuccess != cudaDeviceSynchronize()) { 272 | printf("error: device synchronization (CUDA)\n"); 273 | cudaFree(a_dev); cudaFree(b_dev); 274 | cublasDestroy(handle); 275 | exit(EXIT_FAILURE); 276 | } 277 | clock_gettime(CLOCK_REALTIME, rt + 1); 278 | if (CUBLAS_STATUS_SUCCESS != cublasGetMatrix(n, n, sizeof(*a), a_dev, n, a, n)) { 279 | printf("error: accl --> host (CUBLAS)\n"); 280 | cudaFree(a_dev); cudaFree(b_dev); 281 | cublasDestroy(handle); 282 | exit(EXIT_FAILURE); 283 | } 284 | cudaFree(a_dev); cudaFree(b_dev); 285 | cublasDestroy(handle); 286 | break; 287 | } /* end switch (ial) */ 288 | if (wtcalc >= 0.0) { 289 | wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 290 | } 291 | } 292 | -------------------------------------------------------------------------------- /05_saxpy/src/asaxpy.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file asaxpy.c 3 | * @brief Function definition for performing the \c saxpy operation on accelerator. 4 | * 5 | * This source file contains function definition for the \c saxpy operation, 6 | * which is defined as: 7 | * 8 | * y := a * x + y 9 | * 10 | * where: 11 | * 12 | * - a is a scalar. 13 | * - x and y are single-precision vectors each with n elements. 
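 * (The <<<t, b>>> notation in the per-case comments below mirrors CUDA's
 * launch syntax: t teams of b threads each, expressed in OpenMP via
 * num_teams(t) and num_threads(b).)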
14 | * 15 | * @author Xin Wu (PC²) 16 | * @date 05.04.2020 17 | * @copyright CC BY-SA 2.0 18 | */ 19 | 20 | #include <stdio.h> 21 | #include <stdlib.h> 22 | #include <time.h> 23 | #ifdef _OPENMP 24 | #include <omp.h> 25 | #endif 26 | #include <cuda_runtime.h> 27 | #include "cublas_v2.h" 28 | #include "wtcalc.h" 29 | #include "asaxpy.h" 30 | 31 | void asaxpy(const int n, 32 | const float a, 33 | const float *x, 34 | float *y, 35 | const int ial) 36 | { 37 | cublasHandle_t handle; 38 | float alfa = a, 39 | *x_dev = NULL, 40 | *y_dev = NULL; 41 | struct timespec rt[2]; 42 | int m = (n >> 4); 43 | 44 | switch (ial) { 45 | case 0: 46 | /* 47 | * - <<<2^0 , 2^0 >>>, TOO SLOW! not tested 48 | */ 49 | #pragma omp target data device(0) \ 50 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) 51 | { 52 | clock_gettime(CLOCK_REALTIME, rt + 0); 53 | #pragma omp target teams device(0) num_teams(1) \ 54 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ 55 | default(none) shared(a, n, x, y) 56 | #pragma omp distribute parallel for num_threads(1) \ 57 | dist_schedule(static, 1) \ 58 | default(none) shared(a, n, x, y) 59 | for (int i = 0; i < n; ++i) { 60 | y[i] = a * x[i] + y[i]; 61 | } 62 | clock_gettime(CLOCK_REALTIME, rt + 1); 63 | } 64 | break; 65 | case 1: 66 | /* 67 | * - <<<2^0 , 2^7 >>>, auto scheduling 68 | */ 69 | #pragma omp target data device(0) \ 70 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) 71 | { 72 | clock_gettime(CLOCK_REALTIME, rt + 0); 73 | #pragma omp target teams device(0) num_teams(1) \ 74 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ 75 | default(none) shared(a, n, x, y) 76 | #pragma omp distribute parallel for num_threads(128) \ 77 | dist_schedule(static, 128) \ 78 | default(none) shared(a, n, x, y) 79 | for (int i = 0; i < n; ++i) { 80 | y[i] = a * x[i] + y[i]; 81 | } 82 | clock_gettime(CLOCK_REALTIME, rt + 1); 83 | } 84 | break; 85 | case 2: 86 | /* 87 | * - <<<2^7 , 2^0 >>>, auto scheduling 88 | */ 89 | #pragma omp target data device(0) \ 90 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) 91 | { 92 | clock_gettime(CLOCK_REALTIME, rt + 0); 93 | #pragma omp target teams device(0) num_teams(128) \ 94 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ 95 | default(none) shared(a, n, x, y) 96 | #pragma omp distribute parallel for num_threads(1) \ 97 | dist_schedule(static, 1) \ 98 | default(none) shared(a, n, x, y) 99 | for (int i = 0; i < n; ++i) { 100 | y[i] = a * x[i] + y[i]; 101 | } 102 | clock_gettime(CLOCK_REALTIME, rt + 1); 103 | } 104 | break; 105 | case 3: 106 | /* 107 | * - <<<2^7 , 2^7 >>>, auto scheduling 108 | */ 109 | #pragma omp target data device(0) \ 110 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) 111 | { 112 | clock_gettime(CLOCK_REALTIME, rt + 0); 113 | #pragma omp target teams device(0) num_teams(128) \ 114 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ 115 | default(none) shared(a, n, x, y) 116 | #pragma omp distribute parallel for num_threads(128) \ 117 | dist_schedule(static, 128) \ 118 | default(none) shared(a, n, x, y) 119 | for (int i = 0; i < n; ++i) { 120 | y[i] = a * x[i] + y[i]; 121 | } 122 | clock_gettime(CLOCK_REALTIME, rt + 1); 123 | } 124 | break; 125 | case 4: 126 | /* 127 | * - <<<2^16, 2^10>>>, manual scheduling 128 | */ 129 | #pragma omp target data device(0) \ 130 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) 131 | { 132 | clock_gettime(CLOCK_REALTIME, rt + 0); 133 | #pragma omp target teams device(0) num_teams(65536) \ 134 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ 135 | default(none) shared(a, n, x, y) 136 | #pragma omp distribute parallel for num_threads(1024) \ 137 | dist_schedule(static, 1024) \ 138 | default(none) shared(a, n, x, y)
139 | for (int i = 0; i < n; ++i) { 140 | y[i] = a * x[i] + y[i]; 141 | } 142 | clock_gettime(CLOCK_REALTIME, rt + 1); 143 | } 144 | break; 145 | case 5: 146 | /* 147 | * - <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling (2^15*2^7*16==2^26) 148 | */ 149 | #pragma omp target data device(0) \ 150 | map(to:a, m, x[0:n]) map(tofrom:y[0:n]) 151 | { 152 | clock_gettime(CLOCK_REALTIME, rt + 0); 153 | #pragma omp target teams device(0) num_teams(32768) \ 154 | map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \ 155 | default(none) shared(a, m, x, y) 156 | #pragma omp distribute parallel for num_threads(128) \ 157 | dist_schedule(static, 128) \ 158 | default(none) shared(a, m, x, y) 159 | for (int i = 0; i < m; ++i) { 160 | y[i ] = a * x[i ] + y[i ]; 161 | y[i + m] = a * x[i + m] + y[i + m]; 162 | y[i + 0x2 * m] = a * x[i + 0x2 * m] + y[i + 0x2 * m]; 163 | y[i + 0x3 * m] = a * x[i + 0x3 * m] + y[i + 0x3 * m]; 164 | y[i + 0x4 * m] = a * x[i + 0x4 * m] + y[i + 0x4 * m]; 165 | y[i + 0x5 * m] = a * x[i + 0x5 * m] + y[i + 0x5 * m]; 166 | y[i + 0x6 * m] = a * x[i + 0x6 * m] + y[i + 0x6 * m]; 167 | y[i + 0x7 * m] = a * x[i + 0x7 * m] + y[i + 0x7 * m]; 168 | y[i + 0x8 * m] = a * x[i + 0x8 * m] + y[i + 0x8 * m]; 169 | y[i + 0x9 * m] = a * x[i + 0x9 * m] + y[i + 0x9 * m]; 170 | y[i + 0xa * m] = a * x[i + 0xa * m] + y[i + 0xa * m]; 171 | y[i + 0xb * m] = a * x[i + 0xb * m] + y[i + 0xb * m]; 172 | y[i + 0xc * m] = a * x[i + 0xc * m] + y[i + 0xc * m]; 173 | y[i + 0xd * m] = a * x[i + 0xd * m] + y[i + 0xd * m]; 174 | y[i + 0xe * m] = a * x[i + 0xe * m] + y[i + 0xe * m]; 175 | y[i + 0xf * m] = a * x[i + 0xf * m] + y[i + 0xf * m]; 176 | } 177 | clock_gettime(CLOCK_REALTIME, rt + 1); 178 | } 179 | break; 180 | case 6: 181 | /* 182 | * - <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling 183 | */ 184 | #pragma omp target data device(0) \ 185 | map(to:a, m, x[0:n]) map(tofrom:y[0:n]) 186 | { 187 | clock_gettime(CLOCK_REALTIME, rt + 0); 188 | #pragma omp target teams device(0) num_teams(4096) \ 189 | map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \ 190 | default(none) shared(a, m, x, y) 191 | #pragma omp distribute parallel for num_threads(128) \ 192 | dist_schedule(static, 128) \ 193 | default(none) shared(a, m, x, y) 194 | for (int i = 0; i < m; ++i) { 195 | y[i ] = a * x[i ] + y[i ]; 196 | y[i + m] = a * x[i + m] + y[i + m]; 197 | y[i + 0x2 * m] = a * x[i + 0x2 * m] + y[i + 0x2 * m]; 198 | y[i + 0x3 * m] = a * x[i + 0x3 * m] + y[i + 0x3 * m]; 199 | y[i + 0x4 * m] = a * x[i + 0x4 * m] + y[i + 0x4 * m]; 200 | y[i + 0x5 * m] = a * x[i + 0x5 * m] + y[i + 0x5 * m]; 201 | y[i + 0x6 * m] = a * x[i + 0x6 * m] + y[i + 0x6 * m]; 202 | y[i + 0x7 * m] = a * x[i + 0x7 * m] + y[i + 0x7 * m]; 203 | y[i + 0x8 * m] = a * x[i + 0x8 * m] + y[i + 0x8 * m]; 204 | y[i + 0x9 * m] = a * x[i + 0x9 * m] + y[i + 0x9 * m]; 205 | y[i + 0xa * m] = a * x[i + 0xa * m] + y[i + 0xa * m]; 206 | y[i + 0xb * m] = a * x[i + 0xb * m] + y[i + 0xb * m]; 207 | y[i + 0xc * m] = a * x[i + 0xc * m] + y[i + 0xc * m]; 208 | y[i + 0xd * m] = a * x[i + 0xd * m] + y[i + 0xd * m]; 209 | y[i + 0xe * m] = a * x[i + 0xe * m] + y[i + 0xe * m]; 210 | y[i + 0xf * m] = a * x[i + 0xf * m] + y[i + 0xf * m]; 211 | } 212 | clock_gettime(CLOCK_REALTIME, rt + 1); 213 | } 214 | break; 215 | case 7: 216 | /* 217 | * - <<<2^16, 2^9>>>: 218 | * * de-linearize the vector (convert the vector to matrix) 219 | * * collapse the ji-loop 220 | * * 2x i-loop unrolling 221 | */ 222 | #pragma omp target data device(0) \ 223 | map(to:a, x[0:n]) map(tofrom:y[0:n]) 224 | { 225 | 
clock_gettime(CLOCK_REALTIME, rt + 0); 226 | #pragma omp target teams device(0) num_teams(65536) thread_limit(512) \ 227 | map(to:a, x[0:n]) map(tofrom:y[0:n]) \ 228 | default(none) shared(a, x, y) 229 | #pragma omp distribute parallel for num_threads(512) \ 230 | dist_schedule(static, 512) collapse(2) \ 231 | default(none) shared(a, x, y) 232 | for (int j = 0; j < 65536; ++j) { 233 | for (int i = 0; i < 512; ++i) { /* 2x i-loop unrolling */ 234 | y[j * 1024 + i ] += a * x[j * 1024 + i ]; 235 | y[j * 1024 + i + 512] += a * x[j * 1024 + i + 512]; 236 | } 237 | } 238 | clock_gettime(CLOCK_REALTIME, rt + 1); 239 | } 240 | break; 241 | default: 242 | /* 243 | * cublasSaxpy in CUBLAS 244 | */ 245 | if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle)) { 246 | printf("error: initialization (CUBLAS)\n"); 247 | cublasDestroy(handle); 248 | exit(EXIT_FAILURE); 249 | } 250 | if (cudaSuccess != cudaMalloc((void **) &x_dev, sizeof(*x) * n) || 251 | cudaSuccess != cudaMalloc((void **) &y_dev, sizeof(*y) * n)) { 252 | printf("error: memory allocation (CUDA)\n"); 253 | cudaFree(x_dev); cudaFree(y_dev); 254 | cublasDestroy(handle); 255 | exit(EXIT_FAILURE); 256 | } 257 | if (CUBLAS_STATUS_SUCCESS != cublasSetVector(n, sizeof(*x), x, 1, x_dev, 1) || 258 | CUBLAS_STATUS_SUCCESS != cublasSetVector(n, sizeof(*y), y, 1, y_dev, 1)) { 259 | printf("error: host --> accl (CUBLAS)\n"); 260 | cudaFree(x_dev); cudaFree(y_dev); 261 | cublasDestroy(handle); 262 | exit(EXIT_FAILURE); 263 | } 264 | clock_gettime(CLOCK_REALTIME, rt + 0); 265 | if (CUBLAS_STATUS_SUCCESS != cublasSaxpy(handle, n, &alfa, x_dev, 1, y_dev, 1)) { 266 | printf("error: cublasSaxpy (CUBLAS)\n"); 267 | cudaFree(x_dev); cudaFree(y_dev); 268 | cublasDestroy(handle); 269 | exit(EXIT_FAILURE); 270 | } 271 | if (cudaSuccess != cudaDeviceSynchronize()) { 272 | printf("error: device synchronization (CUDA)\n"); 273 | cudaFree(x_dev); cudaFree(y_dev); 274 | cublasDestroy(handle); 275 | exit(EXIT_FAILURE); 276 | } 277 | clock_gettime(CLOCK_REALTIME, rt + 1); 278 | if (CUBLAS_STATUS_SUCCESS != cublasGetVector(n, sizeof(*y), y_dev, 1, y, 1)) { 279 | printf("error: accl --> host (CUBLAS)\n"); 280 | cudaFree(x_dev); cudaFree(y_dev); 281 | cublasDestroy(handle); 282 | exit(EXIT_FAILURE); 283 | } 284 | cudaFree(x_dev); cudaFree(y_dev); 285 | cublasDestroy(handle); 286 | break; 287 | } /* end switch (ial) */ 288 | if (wtcalc >= 0.0) { 289 | wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 290 | } 291 | } 292 | -------------------------------------------------------------------------------- /08_distThreads/src/gpuThreads.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file gpuThreads.c 3 | * @brief Function definition for organizing GPU threads. 4 | * 5 | * This source file contains function definition for organizing GPU threads. 6 | * 7 | * thread_limit for the teams construct is omitted for clarity. 8 | * 9 | * @author Xin Wu (PC²) 10 | * @date 12.03.2020 11 | * @copyright CC BY-SA 2.0 12 | */ 13 | 14 | #include <stdio.h> 15 | #include <stdlib.h> 16 | #ifdef _OPENMP 17 | #include <omp.h> 18 | #endif 19 | #include "gpuThreads.h" 20 | 21 | typedef struct League { 22 | int itd; // index of a thread 23 | int ntd; // number of threads in a team 24 | int itm; // index of a team 25 | int ltm; // number of teams in a league 26 | } League; 27 | 28 | static void initLeague(League *league, 29 | int ncol, 30 | int nrow) 31 | /** 32 | * @brief Initialize a league of GPU threads.
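 * (Illustration added for clarity: the element in row irow and column icol of the league matrix is stored column-major at league[icol * nrow + irow], which is also how gpuThreads() prints it at the end of this file.)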
33 | * 34 | * Every element in a league is initialized as -1. 35 | * 36 | * @param league A league of GPU threads. 37 | * @param ncol Number of columns in a league. 38 | * @param nrow Number of rows in a league. 39 | * 40 | * @return \c void. 41 | */ 42 | { 43 | int icol, 44 | irow; 45 | 46 | for (icol = 0; icol < ncol; ++icol) { 47 | for (irow = 0; irow < nrow; ++irow) { 48 | league[icol * nrow + irow].itd = 49 | league[icol * nrow + irow].ntd = 50 | league[icol * nrow + irow].itm = 51 | league[icol * nrow + irow].ltm = -1; 52 | } 53 | } 54 | } 55 | 56 | void gpuThreads(int i) 57 | { 58 | League *league; 59 | int icol, 60 | irow, 61 | ncol, 62 | nrow; 63 | int lteams, 64 | nthrds; 65 | int wblk; /* width of unrolled loop block */ 66 | 67 | /* 68 | * Initialize and assign GPU threads 69 | */ 70 | switch (i) 71 | { 72 | case 0: 73 | /* 74 | * 1. Dim of matrix league : 3 x 5 75 | * 2. Dim of GPU threads : 3 threads/team 76 | * 5 teams 77 | * 3. All GPU threads run thru this code block. 78 | * `distribute` is not needed, because there is no for-loop. 79 | * 4. Each GPU thread fills the corresponding element. 80 | */ 81 | ncol = 5; 82 | nrow = 3; 83 | lteams = 5; 84 | nthrds = 3; 85 | league = (League *) malloc(sizeof(League) * ncol * nrow); 86 | initLeague(league, ncol, nrow); 87 | #pragma omp target teams device(0) num_teams(lteams) \ 88 | map(to: nrow) map(tofrom:league[0:nrow * ncol]) \ 89 | default(none) shared(nrow, lteams, nthrds, league) 90 | #pragma omp parallel num_threads(nthrds) \ 91 | default(none) shared(nrow, lteams, nthrds, league) 92 | { 93 | int itd, 94 | itm; 95 | itd = omp_get_thread_num(); 96 | itm = omp_get_team_num(); 97 | league[itm * nrow + itd].itd = itd; 98 | league[itm * nrow + itd].ntd = omp_get_num_threads(); 99 | league[itm * nrow + itd].itm = itm; 100 | league[itm * nrow + itd].ltm = omp_get_num_teams(); 101 | } 102 | break; 103 | case 1: 104 | /* 105 | * 1. Dim of matrix league : 3 x 5 106 | * 2. Dim of GPU threads : 3 threads/team 107 | * 5 teams 108 | * 3. Incorrect nested loop implementation. 109 | * 4. The number of teams equals the number of icol-loop iterations. 110 | * 5. Only one thread in each team will run thru the irow-loop. 111 | * 6. Other threads in each team are idle. 112 | */ 113 | ncol = 5; 114 | nrow = 3; 115 | lteams = 5; 116 | nthrds = 3; 117 | league = (League *) malloc(sizeof(League) * ncol * nrow); 118 | initLeague(league, ncol, nrow); 119 | #pragma omp target teams device(0) num_teams(lteams) \ 120 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 121 | default(none) shared(ncol, nrow, lteams, nthrds, league) 122 | #pragma omp distribute parallel for num_threads(nthrds) \ 123 | dist_schedule(static) \ 124 | default(none) shared(ncol, nrow, lteams, nthrds, league) 125 | for (int icol = 0; icol < ncol; ++icol) { 126 | for (int irow = 0; irow < nrow; ++irow) { 127 | league[icol * nrow + irow].itd = omp_get_thread_num(); 128 | league[icol * nrow + irow].ntd = omp_get_num_threads(); 129 | league[icol * nrow + irow].itm = omp_get_team_num(); 130 | league[icol * nrow + irow].ltm = omp_get_num_teams(); 131 | } 132 | } 133 | break; 134 | case 2: 135 | /* 136 | * 1. Dim of matrix league : 3 x 5 137 | * 2. Dim of GPU threads : 3 threads/team 138 | * 5 teams 139 | * 3. The previous icol- and irow-loops are linearized manually. 140 | * 4. The total number of GPU threads equals the number of iterations in 141 | * the linearized loop. 142 | * 5. All GPU threads will be distributed and fill the matrix league. 
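 * (Note added for illustration: with nrow = 3 the manual linearization is invertible, e.g. idx = 7 maps back to icol = idx / nrow = 2 and irow = idx % nrow = 1, i.e. element (1,2) of the column-major league matrix.)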
143 | */ 144 | ncol = 5; 145 | nrow = 3; 146 | lteams = 5; 147 | nthrds = 3; 148 | league = (League *) malloc(sizeof(League) * ncol * nrow); 149 | initLeague(league, ncol, nrow); 150 | #pragma omp target teams device(0) num_teams(lteams) \ 151 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 152 | default(none) shared(ncol, nrow, lteams, nthrds, league) 153 | #pragma omp distribute parallel for num_threads(nthrds) \ 154 | dist_schedule(static) \ 155 | default(none) shared(ncol, nrow, lteams, nthrds, league) 156 | for (int idx = 0; idx < nrow * ncol; ++idx) { 157 | league[idx].itd = omp_get_thread_num(); 158 | league[idx].ntd = omp_get_num_threads(); 159 | league[idx].itm = omp_get_team_num(); 160 | league[idx].ltm = omp_get_num_teams(); 161 | } 162 | break; 163 | case 3: 164 | /* 165 | * 1. Dim of matrix league : 3 x 5 166 | * 2. Dim of GPU threads : 3 threads/team 167 | * 5 teams 168 | * 3. Not everyone wants to linearize loops manually. 169 | * 4. The icol- and irow-loops are collapsed. 170 | * 5. All GPU threads will be distributed and fill the matrix league. 171 | * 6. Please note that the GPU threads are organized such that the index 172 | * increases continuously with respect to irow (the loop index of the 173 | * innermost loop). 174 | */ 175 | ncol = 5; 176 | nrow = 3; 177 | lteams = 5; 178 | nthrds = 3; 179 | league = (League *) malloc(sizeof(League) * ncol * nrow); 180 | initLeague(league, ncol, nrow); 181 | #pragma omp target teams device(0) num_teams(lteams) \ 182 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 183 | default(none) shared(ncol, nrow, lteams, nthrds, league) 184 | #pragma omp distribute parallel for num_threads(nthrds) \ 185 | dist_schedule(static) collapse(2) \ 186 | default(none) shared(ncol, nrow, lteams, nthrds, league) 187 | for (int icol = 0; icol < ncol; ++icol) { 188 | for (int irow = 0; irow < nrow; ++irow) { 189 | league[icol * nrow + irow].itd = omp_get_thread_num(); 190 | league[icol * nrow + irow].ntd = omp_get_num_threads(); 191 | league[icol * nrow + irow].itm = omp_get_team_num(); 192 | league[icol * nrow + irow].ltm = omp_get_num_teams(); 193 | } 194 | } 195 | break; 196 | case 4: 197 | /* 198 | * 1. Dim of matrix league : 7 x 7 199 | * 2. Dim of GPU threads : 3 threads/team 200 | * 5 teams 201 | * 3. The size of matrix league does not match the number of GPU threads. 202 | * 4. dist_schedule(kind, chunk_size) 203 | * - kind: must be static 204 | * - chunk_size: When no chunk_size is specified, the iterations are divided 205 | * into chunks of approximately equal size. 206 | * 5. Please note that in some teams *not* all GPU threads are working!
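 * (Worked example added for illustration: collapse(2) merges the loops into 7 * 7 = 49 iterations; with 5 teams and no chunk_size, the chunks are of approximately equal size, e.g. 10, 10, 10, 10 and 9.)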
207 | */ 208 | ncol = 7; 209 | nrow = 7; 210 | lteams = 5; 211 | nthrds = 3; 212 | league = (League *) malloc(sizeof(League) * ncol * nrow); 213 | initLeague(league, ncol, nrow); 214 | #pragma omp target teams device(0) num_teams(lteams) \ 215 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 216 | default(none) shared(ncol, nrow, lteams, nthrds, league) 217 | #pragma omp distribute parallel for num_threads(nthrds) \ 218 | dist_schedule(static) collapse(2) \ 219 | default(none) shared(ncol, nrow, lteams, nthrds, league) 220 | for (int icol = 0; icol < ncol; ++icol) { 221 | for (int irow = 0; irow < nrow; ++irow) { 222 | league[icol * nrow + irow].itd = omp_get_thread_num(); 223 | league[icol * nrow + irow].ntd = omp_get_num_threads(); 224 | league[icol * nrow + irow].itm = omp_get_team_num(); 225 | league[icol * nrow + irow].ltm = omp_get_num_teams(); 226 | } 227 | } 228 | break; 229 | case 5: 230 | /* 231 | * 1. Dim of matrix league : 7 x 7 232 | * 2. Dim of GPU threads : 3 threads/team 233 | * 5 teams 234 | * 3. The size of matrix league does not match the number of GPU threads. 235 | * 4. dist_schedule(kind, chunk_size) 236 | * - kind: must be static 237 | * - chunk_size: If specified, iterations are divided into chunks of size 238 | * chunk_size. Chunks are then assigned to the GPU thread teams in 239 | * a round-robin fashion. 240 | * 5. The different ways of organizing GPU threads will impact 241 | * the performance of GPU memory access. 242 | */ 243 | ncol = 7; 244 | nrow = 7; 245 | lteams = 5; 246 | nthrds = 3; 247 | league = (League *) malloc(sizeof(League) * ncol * nrow); 248 | initLeague(league, ncol, nrow); 249 | #pragma omp target teams device(0) num_teams(lteams) \ 250 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 251 | default(none) shared(ncol, nrow, lteams, nthrds, league) 252 | #pragma omp distribute parallel for num_threads(nthrds) \ 253 | dist_schedule(static, nthrds) collapse(2) \ 254 | default(none) shared(ncol, nrow, lteams, nthrds, league) 255 | for (int icol = 0; icol < ncol; ++icol) { 256 | for (int irow = 0; irow < nrow; ++irow) { 257 | league[icol * nrow + irow].itd = omp_get_thread_num(); 258 | league[icol * nrow + irow].ntd = omp_get_num_threads(); 259 | league[icol * nrow + irow].itm = omp_get_team_num(); 260 | league[icol * nrow + irow].ltm = omp_get_num_teams(); 261 | } 262 | } 263 | break; 264 | case 6: 265 | /* 266 | * 1. Dim of matrix league : 12 x 6 267 | * 2. Dim of GPU threads : 3 threads/team 268 | * 6 teams 269 | * 3. icol-loop: intact 270 | * 4. irow-loop: CPU-like 2x loop unrolling. 271 | * 5. It results in uncoalesced GPU memory access and reduced performance. 272 | * 6. +10 to each unrolled thread is used to label the 2x irow-loop unrolling.
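 * (Note added for illustration: because of irow += 2, one GPU thread writes elements irow and irow + 1 back to back, so consecutive threads of a warp no longer touch consecutive addresses; case 7 below recovers coalesced access by unrolling with a stride of nthrds instead.)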
273 | */ 274 | ncol = 6; 275 | nrow =12; 276 | lteams = 6; 277 | nthrds = 3; 278 | league = (League *) malloc(sizeof(League) * ncol * nrow); 279 | initLeague(league, ncol, nrow); 280 | #pragma omp target teams device(0) num_teams(lteams) \ 281 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 282 | default(none) shared(ncol, nrow, lteams, nthrds, league) 283 | #pragma omp distribute parallel for num_threads(nthrds) \ 284 | dist_schedule(static, nthrds) collapse(2) \ 285 | default(none) shared(ncol, nrow, lteams, nthrds, league) 286 | for (int icol = 0; icol < ncol; ++icol) { 287 | for (int irow = 0; irow < nrow; irow += 2) { 288 | league[icol * nrow + irow ].itd = omp_get_thread_num(); 289 | league[icol * nrow + irow ].ntd = omp_get_num_threads(); 290 | league[icol * nrow + irow ].itm = omp_get_team_num(); 291 | league[icol * nrow + irow ].ltm = omp_get_num_teams(); 292 | league[icol * nrow + irow + 1].itd = omp_get_thread_num() + 10; 293 | league[icol * nrow + irow + 1].ntd = omp_get_num_threads(); 294 | league[icol * nrow + irow + 1].itm = omp_get_team_num(); 295 | league[icol * nrow + irow + 1].ltm = omp_get_num_teams(); 296 | } 297 | } 298 | break; 299 | case 7: 300 | /* 301 | * 1. Dim of matrix league : 12 x 6 302 | * 2. Dim of GPU threads : 3 threads/team 303 | * 6 teams 304 | * 3. icol-loop: intact 305 | * 4. irow-loop: 2x loop unrolling. 306 | * 5. Nested loop with collapse(3). 307 | * 6. It features coalesced GPU memory access and good performance. 308 | * 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling. 309 | * 310 | * Caveat: especially for the innermost loop 311 | * 312 | * OpenMP API Specification: Version 5.0 November 2018 313 | * 314 | * https://www.openmp.org/spec-html/5.0/openmpsu44.html 315 | * 316 | * If a collapse clause is specified with a parameter value greater than 1, then 317 | * the iterations of the associated loops to which the clause applies are 318 | * collapsed into one larger iteration space with *unspecified ordering*. 319 | * 320 | */ 321 | ncol = 6; 322 | nrow =12; 323 | lteams = 6; 324 | nthrds = 3; 325 | wblk = nthrds * 2; 326 | league = (League *) malloc(sizeof(League) * ncol * nrow); 327 | initLeague(league, ncol, nrow); 328 | #pragma omp target teams device(0) num_teams(lteams) \ 329 | map(to: ncol, nrow, wblk) map(tofrom:league[0:nrow * ncol]) \ 330 | default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) 331 | #pragma omp distribute parallel for num_threads(nthrds) \ 332 | dist_schedule(static, wblk) collapse(3) \ 333 | default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) 334 | for (int icol = 0; icol < ncol; ++icol) { 335 | for (int iblk = 0; iblk < nrow / wblk; ++iblk) { 336 | for (int irow = 0; irow < nthrds; ++irow) { 337 | league[icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); 338 | league[icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); 339 | league[icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num(); 340 | league[icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); 341 | league[icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; 342 | league[icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); 343 | league[icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num(); 344 | league[icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); 345 | } 346 | } 347 | } 348 | break; 349 | case 8: 350 | /* 351 | * 1. Dim of matrix league : 12 x 6 352 | * 2. 
Dim of GPU threads : 3 threads/team 353 | * 3 teams 354 | * 3. icol-loop: 2x loop unrolling. 355 | * 4. irow-loop: 2x loop unrolling. 356 | * 5. Nested loop with collapse(3). 357 | * 6. +10 to each unrolled team is used to label the 2x icol-loop unrolling. 358 | * 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling. 359 | * 360 | * More work for each thread is an approach to achieve high performance. 361 | * 362 | */ 363 | ncol = 6; 364 | nrow =12; 365 | lteams = 3; 366 | nthrds = 3; 367 | wblk = nthrds * 2; 368 | league = (League *) malloc(sizeof(League) * ncol * nrow); 369 | initLeague(league, ncol, nrow); 370 | #pragma omp target teams device(0) num_teams(lteams) \ 371 | map(to: ncol, nrow, wblk) map(tofrom:league[0:nrow * ncol]) \ 372 | default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) 373 | #pragma omp distribute parallel for num_threads(nthrds) \ 374 | dist_schedule(static, wblk) collapse(3) \ 375 | default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) 376 | for (int icol = 0; icol < ncol; icol += 2) { 377 | for (int iblk = 0; iblk < nrow / wblk; ++iblk) { 378 | for (int irow = 0; irow < nthrds; ++irow) { 379 | league[ icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); 380 | league[ icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); 381 | league[ icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num(); 382 | league[ icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); 383 | league[ icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; 384 | league[ icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); 385 | league[ icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num(); 386 | league[ icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); 387 | league[(icol + 1) * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); 388 | league[(icol + 1) * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); 389 | league[(icol + 1) * nrow + iblk * wblk + irow ].itm = omp_get_team_num() + 10; 390 | league[(icol + 1) * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); 391 | league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; 392 | league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); 393 | league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num() + 10; 394 | league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); 395 | } 396 | } 397 | } 398 | break; 399 | default: 400 | printf("Tschüß!\n"); 401 | exit(EXIT_SUCCESS); 402 | break; 403 | } 404 | /* 405 | * Show the organization of GPU threads 406 | */ 407 | printf("%dth GPU threads organization:\n", i); 408 | printf("\n"); 409 | printf("No. of rows : %3d\n", nrow); 410 | printf("No. of cols : %3d\n", ncol); 411 | printf("No. of threads : %3d\n", league[0].ntd); 412 | printf("No. of teams : %3d\n", league[0].ltm); 413 | printf("\n"); 414 | for (irow = 0; irow < nrow; ++irow) { 415 | for (icol = 0; icol < ncol; ++icol) { 416 | printf("(%2d,%2d):[%2d,%2d]%s", irow, icol, 417 | league[icol * nrow + irow].itd, 418 | league[icol * nrow + irow].itm, 419 | icol == ncol - 1 ? 
"\n" : " "); 420 | } 421 | } 422 | printf("\n"); 423 | /* 424 | * Release the memory 425 | */ 426 | free(league); 427 | } 428 | -------------------------------------------------------------------------------- /10_matMul/src/matMulAB.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matMulAB.c 3 | * 4 | * @brief Function definition for matrix multiplication in single-precision. 5 | * 6 | * This source file contains function definition for matrix multiplication 7 | * in single-precision. 8 | * 9 | * @author Xin Wu (PC²) 10 | * @date 07.02.2020 11 | * @copyright CC BY-SA 2.0 12 | */ 13 | 14 | #include 15 | #include 16 | #include 17 | #ifdef _OPENMP 18 | #include 19 | #endif 20 | #include 21 | #include "cublas_v2.h" 22 | #include "matMulAB.h" 23 | 24 | #define NTHRDS7 (1 << 0x7) /* 2^{7} */ 25 | #define NTHRDS8 (1 << 0x8) /* 2^{8} */ 26 | #define NTHRDS9 (1 << 0x9) /* 2^{9} */ 27 | 28 | #define LTEAMSD (1 << 0xD) /* 2^{13} */ 29 | #define LTEAMSE (1 << 0xE) /* 2^{14} */ 30 | #define LTEAMSF (1 << 0xF) /* 2^{15} */ 31 | #define LTEAMSG (1 << 020) /* 2^{16} */ 32 | 33 | #define BLKROW (512) /* 4x number of threads in each team */ 34 | #define BLKDIM (16) 35 | 36 | double wtcalc; 37 | 38 | void matMulAB_accl(float *a, 39 | float *b, 40 | float *c, 41 | int n, 42 | int ial) 43 | { 44 | cublasHandle_t handle; 45 | float alfa = 1.0f, 46 | beta = 1.0f, 47 | *a_dev = NULL, 48 | *b_dev = NULL, 49 | *c_dev = NULL; 50 | struct timespec rt[2]; 51 | 52 | switch (ial) { 53 | case 0: 54 | /* 55 | * - jik-loop 56 | * - 2^9 threads per team and 2^3 teams 57 | * - n-stride memory read for c (then in rc) 58 | * - n-stride memory read for b (innermost loop) 59 | * - n-stride memory write for c 60 | */ 61 | #pragma omp target data device(0) \ 62 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 63 | { 64 | clock_gettime(CLOCK_REALTIME, rt + 0); 65 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 66 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 67 | default(none) shared(a, b, c, n) 68 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 69 | dist_schedule(static, NTHRDS9) \ 70 | default(none) shared(a, b, c, n) 71 | for (int j = 0; j < n; ++j) { /* parallel */ 72 | for (int i = 0; i < n; ++i) { /* sequential */ 73 | float rc; 74 | rc = c[j * n + i]; 75 | for (int k = 0; k < n; ++k) { 76 | rc += a[k * n + i] * b[j * n + k]; 77 | } 78 | c[j * n + i] = rc; 79 | } /* end i-loop */ 80 | } /* end j-loop */ 81 | clock_gettime(CLOCK_REALTIME, rt + 1); 82 | } 83 | break; 84 | case 1: 85 | /* 86 | * - jki-loop 87 | * - 2^9 threads per team and 2^3 teams 88 | * - n-stride memory read for b (then in rb) 89 | * - n-stride memory read for c (innermost loop) 90 | * - n-stride memory write for c (innermost loop) 91 | */ 92 | #pragma omp target data device(0) \ 93 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 94 | { 95 | clock_gettime(CLOCK_REALTIME, rt + 0); 96 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 97 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 98 | default(none) shared(a, b, c, n) 99 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 100 | dist_schedule(static, NTHRDS9) \ 101 | default(none) shared(a, b, c, n) 102 | for (int j = 0; j < n; ++j) { /* parallel */ 103 | for (int k = 0; k < n; ++k) { /* sequential */ 104 | float rb; 105 | rb = b[j * n + k]; 106 | for (int i = 0; i < n; ++i) { 107 | c[j * n + i] += a[k * n + i] * rb; /* 
uncoalesced r&w */ 108 | } 109 | } /* end k-loop */ 110 | } /* end j-loop */ 111 | clock_gettime(CLOCK_REALTIME, rt + 1); 112 | } 113 | break; 114 | case 2: 115 | /* 116 | * - jik-loop 117 | * - 2^9 threads per team and 2^15 teams 118 | * - collapse(2) 119 | * - no race condition 120 | */ 121 | #pragma omp target data device(0) \ 122 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 123 | { 124 | clock_gettime(CLOCK_REALTIME, rt + 0); 125 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 126 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 127 | default(none) shared(a, b, c, n) 128 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 129 | dist_schedule(static, NTHRDS9) collapse(2) \ 130 | default(none) shared(a, b, c, n) 131 | for (int j = 0; j < n; ++j) { /* parallel */ 132 | for (int i = 0; i < n; ++i) { /* parallel */ 133 | float rc; 134 | rc = c[j * n + i]; 135 | for (int k = 0; k < n; ++k) { /* sequential */ 136 | rc += a[k * n + i] * b[j * n + k]; 137 | } 138 | c[j * n + i] = rc; 139 | } /* end i-loop */ 140 | } /* end j-loop */ 141 | clock_gettime(CLOCK_REALTIME, rt + 1); 142 | } 143 | break; 144 | case 3: 145 | /* 146 | * - jki-loop 147 | * - 2^9 threads per team and 2^15 teams 148 | * - collapse(2) 149 | * - race condition for writing c: not only one thread has the index j, a total 150 | * of n GPU threads have the index j. (n / 32) warps are then scheduled on GPU. 151 | */ 152 | #pragma omp target data device(0) \ 153 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 154 | { 155 | clock_gettime(CLOCK_REALTIME, rt + 0); 156 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 157 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 158 | default(none) shared(a, b, c, n) 159 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 160 | dist_schedule(static, NTHRDS9) collapse(2) \ 161 | default(none) shared(a, b, c, n) 162 | for (int j = 0; j < n; ++j) { /* parallel */ 163 | for (int k = 0; k < n; ++k) { /* parallel */ 164 | float rb; 165 | rb = b[j * n + k]; 166 | for (int i = 0; i < n; ++i) { 167 | c[j * n + i] += a[k * n + i] * rb; /* race condition between diff. warps */ 168 | } 169 | } /* end k-loop */ 170 | } /* end j-loop */ 171 | clock_gettime(CLOCK_REALTIME, rt + 1); 172 | } 173 | break; 174 | case 4: 175 | /* 176 | * - jik-loop 177 | * - 2^9 threads per team and 2^15 teams 178 | * - 4x k-loop unrolling 179 | * 180 | * good: more work for one thread per iteration. 181 | * bad : one thread must read b 4 times in k-loop. 182 | * all threads in a team do the same read of b (waste of instructions). 183 | * tips: each thread reads the corresponding element in b and 184 | * saves it in shared memory.
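 * (Cross-reference added for clarity: this is what case 8 below implements with the ashm/bshm tiles in GPU shared memory.)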
185 | */ 186 | #pragma omp target data device(0) \ 187 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 188 | { 189 | clock_gettime(CLOCK_REALTIME, rt + 0); 190 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 191 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 192 | default(none) shared(a, b, c, n) 193 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 194 | dist_schedule(static, NTHRDS9) collapse(2) \ 195 | default(none) shared(a, b, c, n) 196 | for (int j = 0; j < n; ++j) { 197 | for (int i = 0; i < n; ++i) { 198 | float rc; 199 | rc = c[j * n + i]; 200 | for (int k = 0; k < n; k += 4) { /* 4x unrolling */ 201 | rc += a[ k * n + i] * b[j * n + k ]; 202 | rc += a[(k + 1) * n + i] * b[j * n + k + 1]; 203 | rc += a[(k + 2) * n + i] * b[j * n + k + 2]; 204 | rc += a[(k + 3) * n + i] * b[j * n + k + 3]; 205 | } 206 | c[j * n + i] = rc; 207 | } /* end i-loop */ 208 | } /* end j-loop */ 209 | clock_gettime(CLOCK_REALTIME, rt + 1); 210 | } 211 | break; 212 | case 5: 213 | /* 214 | * - jik-loop 215 | * - 2^7 threads per team and 2^15 teams 216 | * - collapse(3) 217 | * - 4x i-loop unrolling (stride of 2^7 rows) 218 | * - 4x k-loop unrolling 219 | * - rb: 4x data re-use 220 | * 221 | * The integer calculation of matrix indices looks ugly. But considering the GPU 222 | * hardware architecture, e.g. many separate INT32 units, these calculations are 223 | * much faster than accessing GPU global memory and save the precious registers. 224 | * 225 | */ 226 | #pragma omp target data device(0) \ 227 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 228 | { 229 | clock_gettime(CLOCK_REALTIME, rt + 0); 230 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS7) \ 231 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 232 | default(none) shared(a, b, c, n) 233 | #pragma omp distribute parallel for num_threads(NTHRDS7) \ 234 | dist_schedule(static, NTHRDS7) collapse(3) \ 235 | default(none) shared(a, b, c, n) 236 | for (int j = 0; j < n; ++j) { 237 | for (int iblk = 0; iblk < n / BLKROW; ++iblk) { 238 | for (int i = 0; i < NTHRDS7; ++i) { /* 4x unrolling */ 239 | float rc0, rc1, rc2, rc3; 240 | rc0 = c[j * n + iblk * BLKROW + i ]; 241 | rc1 = c[j * n + iblk * BLKROW + i + NTHRDS7 ]; 242 | rc2 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 2]; 243 | rc3 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 3]; 244 | for (int k = 0; k < n; k += 4) { /* 4x unrolling */ 245 | /* register for b: 4x k-loop */ 246 | float rb0, rb1, rb2, rb3; 247 | rb0 = b[j * n + k ]; 248 | rb1 = b[j * n + k + 1]; 249 | rb2 = b[j * n + k + 2]; 250 | rb3 = b[j * n + k + 3]; 251 | rc0 += a[ k * n + iblk * BLKROW + i ] * rb0; 252 | rc0 += a[(k + 1) * n + iblk * BLKROW + i ] * rb1; 253 | rc0 += a[(k + 2) * n + iblk * BLKROW + i ] * rb2; 254 | rc0 += a[(k + 3) * n + iblk * BLKROW + i ] * rb3; 255 | rc1 += a[ k * n + iblk * BLKROW + i + NTHRDS7 ] * rb0; 256 | rc1 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 ] * rb1; 257 | rc1 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 ] * rb2; 258 | rc1 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 ] * rb3; 259 | rc2 += a[ k * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb0; 260 | rc2 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb1; 261 | rc2 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb2; 262 | rc2 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb3; 263 | rc3 += a[ k * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb0; 264 | rc3 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb1; 265 | 
rc3 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb2; 266 | rc3 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb3; 267 | } 268 | c[j * n + iblk * BLKROW + i ] = rc0; 269 | c[j * n + iblk * BLKROW + i + NTHRDS7 ] = rc1; 270 | c[j * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc2; 271 | c[j * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc3; 272 | } /* end i-loop */ 273 | } /* end iblk-loop */ 274 | } /* end j-loop */ 275 | clock_gettime(CLOCK_REALTIME, rt + 1); 276 | } 277 | break; 278 | case 6: 279 | /* 280 | * - jik-loop 281 | * - 2^7 threads per team and 2^13 teams 282 | * - collapse(3) 283 | * - 4x j-loop unrolling (stride of 1 col ) 284 | * - 4x i-loop unrolling (stride of 2^7 rows) 285 | * - 4x k-loop unrolling 286 | * - rb: 4x data re-use 287 | * - ra: 4x data re-use 288 | * - register blocking 289 | */ 290 | #pragma omp target data device(0) \ 291 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 292 | { 293 | clock_gettime(CLOCK_REALTIME, rt + 0); 294 | #pragma omp target teams device(0) num_teams(LTEAMSD) thread_limit(NTHRDS7) \ 295 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 296 | default(none) shared(a, b, c, n) 297 | #pragma omp distribute parallel for num_threads(NTHRDS7) \ 298 | dist_schedule(static, NTHRDS7) collapse(3) \ 299 | default(none) shared(a, b, c, n) 300 | for (int j = 0; j < n; j += 4) { /* 4x unrolling */ 301 | for (int iblk = 0; iblk < n / BLKROW; ++iblk) { 302 | for (int i = 0; i < NTHRDS7; ++i) { /* 4x unrolling */ 303 | /* register for c: 4x j-loop * 4x i-loop */ 304 | float rc0, rc1, rc2, rc3, 305 | rc4, rc5, rc6, rc7, 306 | rc8, rc9, rca, rcb, 307 | rcc, rcd, rce, rcf; 308 | rc0 = c[ j * n + iblk * BLKROW + i ]; 309 | rc1 = c[ j * n + iblk * BLKROW + i + NTHRDS7 ]; 310 | rc2 = c[ j * n + iblk * BLKROW + i + NTHRDS7 * 2]; 311 | rc3 = c[ j * n + iblk * BLKROW + i + NTHRDS7 * 3]; 312 | rc4 = c[(j + 1) * n + iblk * BLKROW + i ]; 313 | rc5 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 ]; 314 | rc6 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 315 | rc7 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 316 | rc8 = c[(j + 2) * n + iblk * BLKROW + i ]; 317 | rc9 = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 ]; 318 | rca = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 319 | rcb = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 320 | rcc = c[(j + 3) * n + iblk * BLKROW + i ]; 321 | rcd = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 ]; 322 | rce = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 323 | rcf = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 324 | for (int k = 0; k < n; k += 4) { /* 4x unrolling */ 325 | /* register for b: 4x j-loop * 4x k-loop */ 326 | float rb0, rb1, rb2, rb3, 327 | rb4, rb5, rb6, rb7, 328 | rb8, rb9, rba, rbb, 329 | rbc, rbd, rbe, rbf; 330 | rb0 = b[ j * n + k ]; 331 | rb1 = b[ j * n + k + 1]; 332 | rb2 = b[ j * n + k + 2]; 333 | rb3 = b[ j * n + k + 3]; 334 | rb4 = b[(j + 1) * n + k ]; 335 | rb5 = b[(j + 1) * n + k + 1]; 336 | rb6 = b[(j + 1) * n + k + 2]; 337 | rb7 = b[(j + 1) * n + k + 3]; 338 | rb8 = b[(j + 2) * n + k ]; 339 | rb9 = b[(j + 2) * n + k + 1]; 340 | rba = b[(j + 2) * n + k + 2]; 341 | rbb = b[(j + 2) * n + k + 3]; 342 | rbc = b[(j + 3) * n + k ]; 343 | rbd = b[(j + 3) * n + k + 1]; 344 | rbe = b[(j + 3) * n + k + 2]; 345 | rbf = b[(j + 3) * n + k + 3]; 346 | /* register for a: 4x i-loop * 4x k-loop */ 347 | float ra0, ra1, ra2, ra3, 348 | ra4, ra5, ra6, ra7, 349 | ra8, ra9, raa, rab, 350 | rac, rad, rae, raf; 351 | ra0 = a[ k * n + iblk * BLKROW + i ]; 352 | ra1 = a[ k * 
n + iblk * BLKROW + i + NTHRDS7 ]; 353 | ra2 = a[ k * n + iblk * BLKROW + i + NTHRDS7 * 2]; 354 | ra3 = a[ k * n + iblk * BLKROW + i + NTHRDS7 * 3]; 355 | ra4 = a[(k + 1) * n + iblk * BLKROW + i ]; 356 | ra5 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 ]; 357 | ra6 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 358 | ra7 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 359 | ra8 = a[(k + 2) * n + iblk * BLKROW + i ]; 360 | ra9 = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 ]; 361 | raa = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 362 | rab = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 363 | rac = a[(k + 3) * n + iblk * BLKROW + i ]; 364 | rad = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 ]; 365 | rae = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 366 | raf = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 367 | /* 368 | * register blocking 369 | */ 370 | // col 1 of c: 371 | rc0 += ra0 * rb0; 372 | rc0 += ra4 * rb1; 373 | rc0 += ra8 * rb2; 374 | rc0 += rac * rb3; 375 | rc1 += ra1 * rb0; 376 | rc1 += ra5 * rb1; 377 | rc1 += ra9 * rb2; 378 | rc1 += rad * rb3; 379 | rc2 += ra2 * rb0; 380 | rc2 += ra6 * rb1; 381 | rc2 += raa * rb2; 382 | rc2 += rae * rb3; 383 | rc3 += ra3 * rb0; 384 | rc3 += ra7 * rb1; 385 | rc3 += rab * rb2; 386 | rc3 += raf * rb3; 387 | // col 2 of c: 388 | rc4 += ra0 * rb4; 389 | rc4 += ra4 * rb5; 390 | rc4 += ra8 * rb6; 391 | rc4 += rac * rb7; 392 | rc5 += ra1 * rb4; 393 | rc5 += ra5 * rb5; 394 | rc5 += ra9 * rb6; 395 | rc5 += rad * rb7; 396 | rc6 += ra2 * rb4; 397 | rc6 += ra6 * rb5; 398 | rc6 += raa * rb6; 399 | rc6 += rae * rb7; 400 | rc7 += ra3 * rb4; 401 | rc7 += ra7 * rb5; 402 | rc7 += rab * rb6; 403 | rc7 += raf * rb7; 404 | // col 3 of c: 405 | rc8 += ra0 * rb8; 406 | rc8 += ra4 * rb9; 407 | rc8 += ra8 * rba; 408 | rc8 += rac * rbb; 409 | rc9 += ra1 * rb8; 410 | rc9 += ra5 * rb9; 411 | rc9 += ra9 * rba; 412 | rc9 += rad * rbb; 413 | rca += ra2 * rb8; 414 | rca += ra6 * rb9; 415 | rca += raa * rba; 416 | rca += rae * rbb; 417 | rcb += ra3 * rb8; 418 | rcb += ra7 * rb9; 419 | rcb += rab * rba; 420 | rcb += raf * rbb; 421 | // col 4 of c: 422 | rcc += ra0 * rbc; 423 | rcc += ra4 * rbd; 424 | rcc += ra8 * rbe; 425 | rcc += rac * rbf; 426 | rcd += ra1 * rbc; 427 | rcd += ra5 * rbd; 428 | rcd += ra9 * rbe; 429 | rcd += rad * rbf; 430 | rce += ra2 * rbc; 431 | rce += ra6 * rbd; 432 | rce += raa * rbe; 433 | rce += rae * rbf; 434 | rcf += ra3 * rbc; 435 | rcf += ra7 * rbd; 436 | rcf += rab * rbe; 437 | rcf += raf * rbf; 438 | } 439 | c[ j * n + iblk * BLKROW + i ] = rc0; 440 | c[ j * n + iblk * BLKROW + i + NTHRDS7 ] = rc1; 441 | c[ j * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc2; 442 | c[ j * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc3; 443 | c[(j + 1) * n + iblk * BLKROW + i ] = rc4; 444 | c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 ] = rc5; 445 | c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc6; 446 | c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc7; 447 | c[(j + 2) * n + iblk * BLKROW + i ] = rc8; 448 | c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 ] = rc9; 449 | c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rca; 450 | c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcb; 451 | c[(j + 3) * n + iblk * BLKROW + i ] = rcc; 452 | c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 ] = rcd; 453 | c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rce; 454 | c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcf; 455 | } /* end i-loop */ 456 | } /* end iblk-loop */ 457 | } /* end j-loop */ 458 | clock_gettime(CLOCK_REALTIME, rt + 1); 459 | 
} 460 | break; 461 | case 7: 462 | /* 463 | * - based on case 2 464 | * - jik-loop 465 | * - 2^8 threads per team and 2^16 teams 466 | * - collapse(2) 467 | * - no race condition 468 | */ 469 | #pragma omp target data device(0) \ 470 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 471 | { 472 | clock_gettime(CLOCK_REALTIME, rt + 0); 473 | #pragma omp target teams device(0) num_teams(LTEAMSG) thread_limit(NTHRDS8) \ 474 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 475 | default(none) shared(a, b, c, n) 476 | #pragma omp distribute parallel for num_threads(NTHRDS8) \ 477 | dist_schedule(static, NTHRDS8) collapse(2) \ 478 | default(none) shared(a, b, c, n) 479 | for (int j = 0; j < n; ++j) { /* parallel */ 480 | for (int i = 0; i < n; ++i) { /* parallel */ 481 | float rc; 482 | rc = c[j * n + i]; 483 | for (int k = 0; k < n; ++k) { /* sequential */ 484 | rc += a[k * n + i] * b[j * n + k]; 485 | } 486 | c[j * n + i] = rc; 487 | } /* end i-loop */ 488 | } /* end j-loop */ 489 | clock_gettime(CLOCK_REALTIME, rt + 1); 490 | } 491 | break; 492 | case 8: 493 | /* 494 | * - based on case 7 495 | * - jik-loop 496 | * - 2^8 threads per team and 2^16 teams 497 | * - collapse(2) 498 | * - GPU shared memory for data re-use 499 | * - 16x k-loop unrolling 500 | */ 501 | #pragma omp target data device(0) \ 502 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 503 | { 504 | clock_gettime(CLOCK_REALTIME, rt + 0); 505 | #pragma omp target teams device(0) num_teams(LTEAMSG) thread_limit(NTHRDS8) \ 506 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 507 | default(none) shared(a, b, c, n) 508 | { 509 | // GPU shared memory for each team 510 | /* 511 | * I have tested the bank conflict-free version, but it gives worse results, 512 | * e.g. ~ 290 GFLOPS (20 GFLOPS less than the bank conflict version). 513 | * I cannot explain ... 514 | * 515 | float ashm[BLKDIM][BLKDIM + 1], 516 | bshm[BLKDIM][BLKDIM + 1]; 517 | */ 518 | float ashm[BLKDIM][BLKDIM], 519 | bshm[BLKDIM][BLKDIM]; 520 | #pragma omp distribute dist_schedule(static, 1) collapse(2) 521 | for (int j = 0; j < n / BLKDIM; ++j) { 522 | for (int i = 0; i < n / BLKDIM; ++i) { 523 | #pragma omp parallel num_threads(NTHRDS8) \ 524 | default(none) shared(a, b, c, n, ashm, bshm, i, j) 525 | { 526 | /* 527 | * The code here resembles CUDA. 
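 * (Annotation added for clarity: an OpenMP team plays the role of a CUDA thread block, omp_get_thread_num() that of threadIdx, the ashm/bshm arrays that of __shared__ tiles, and the omp barrier that of __syncthreads().)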
528 | */ 529 | int td = omp_get_thread_num(); 530 | // de-linearize the thread number 531 | int it, // thread number along the row 532 | jt; // thread number along the col 533 | it = td % BLKDIM; 534 | jt = td / BLKDIM; 535 | int ib, // row at the beginning of block 536 | jb; // col at the beginning of block 537 | ib = i * BLKDIM; 538 | jb = j * BLKDIM; 539 | int ii, // the real row 540 | jj; // the real col 541 | ii = ib + it; 542 | jj = jb + jt; 543 | float rc = c[jj * n + ii]; // c in register 544 | /* 545 | * the k blocks 546 | */ 547 | for (int k = 0; k < n / BLKDIM; ++k) { 548 | // read the global data to shared memory 549 | ashm[jt][it] = a[(k * 16 + jt) * n + ii]; 550 | bshm[jt][it] = b[jj * n + (k * 16 + it)]; 551 | #pragma omp barrier 552 | // shared memory blocking and 16x k-loop unrolling 553 | rc += ashm[0x0][it] * bshm[jt][0x0]; 554 | rc += ashm[0x1][it] * bshm[jt][0x1]; 555 | rc += ashm[0x2][it] * bshm[jt][0x2]; 556 | rc += ashm[0x3][it] * bshm[jt][0x3]; 557 | rc += ashm[0x4][it] * bshm[jt][0x4]; 558 | rc += ashm[0x5][it] * bshm[jt][0x5]; 559 | rc += ashm[0x6][it] * bshm[jt][0x6]; 560 | rc += ashm[0x7][it] * bshm[jt][0x7]; 561 | rc += ashm[0x8][it] * bshm[jt][0x8]; 562 | rc += ashm[0x9][it] * bshm[jt][0x9]; 563 | rc += ashm[0xa][it] * bshm[jt][0xa]; 564 | rc += ashm[0xb][it] * bshm[jt][0xb]; 565 | rc += ashm[0xc][it] * bshm[jt][0xc]; 566 | rc += ashm[0xd][it] * bshm[jt][0xd]; 567 | rc += ashm[0xe][it] * bshm[jt][0xe]; 568 | rc += ashm[0xf][it] * bshm[jt][0xf]; 569 | #pragma omp barrier 570 | } /* end k-loop */ 571 | c[jj * n + ii] =rc; 572 | } /* end omp parallel */ 573 | } /* end i-loop */ 574 | } /* end j-loop */ 575 | } /* end omp target teams */ 576 | clock_gettime(CLOCK_REALTIME, rt + 1); 577 | } 578 | break; 579 | case 9: 580 | /* 581 | * - based on case 5 582 | * - only diffs are listed here: 583 | * * collapse(2) 584 | * * 4x i-loop unrolling (stride of n/4 rows) 585 | */ 586 | #pragma omp target data device(0) \ 587 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 588 | { 589 | clock_gettime(CLOCK_REALTIME, rt + 0); 590 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS7) \ 591 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 592 | default(none) shared(a, b, c, n) 593 | #pragma omp distribute parallel for num_threads(NTHRDS7) \ 594 | dist_schedule(static, NTHRDS7) collapse(2) \ 595 | default(none) shared(a, b, c, n) 596 | for (int j = 0; j < n; ++j) { 597 | for (int i = 0; i < (n >> 2); ++i) { /* 4x unrolling */ 598 | float rc0, rc1, rc2, rc3; 599 | rc0 = c[j * n + i ]; 600 | rc1 = c[j * n + i + (n >> 2) ]; 601 | rc2 = c[j * n + i + (n >> 2) * 2]; 602 | rc3 = c[j * n + i + (n >> 2) * 3]; 603 | for (int k = 0; k < n; k += 4) { /* 4x unrolling */ 604 | /* register for b: 4x k-loop */ 605 | float rb0, rb1, rb2, rb3; 606 | rb0 = b[j * n + k ]; 607 | rb1 = b[j * n + k + 1]; 608 | rb2 = b[j * n + k + 2]; 609 | rb3 = b[j * n + k + 3]; 610 | rc0 += a[ k * n + i ] * rb0; 611 | rc0 += a[(k + 1) * n + i ] * rb1; 612 | rc0 += a[(k + 2) * n + i ] * rb2; 613 | rc0 += a[(k + 3) * n + i ] * rb3; 614 | rc1 += a[ k * n + i + (n >> 2) ] * rb0; 615 | rc1 += a[(k + 1) * n + i + (n >> 2) ] * rb1; 616 | rc1 += a[(k + 2) * n + i + (n >> 2) ] * rb2; 617 | rc1 += a[(k + 3) * n + i + (n >> 2) ] * rb3; 618 | rc2 += a[ k * n + i + (n >> 2) * 2] * rb0; 619 | rc2 += a[(k + 1) * n + i + (n >> 2) * 2] * rb1; 620 | rc2 += a[(k + 2) * n + i + (n >> 2) * 2] * rb2; 621 | rc2 += a[(k + 3) * n + i + (n >> 2) * 2] * rb3; 622 | rc3 += a[ k * n + i + (n >> 2) * 
3] * rb0; 623 | rc3 += a[(k + 1) * n + i + (n >> 2) * 3] * rb1; 624 | rc3 += a[(k + 2) * n + i + (n >> 2) * 3] * rb2; 625 | rc3 += a[(k + 3) * n + i + (n >> 2) * 3] * rb3; 626 | } 627 | c[j * n + i ] = rc0; 628 | c[j * n + i + (n >> 2) ] = rc1; 629 | c[j * n + i + (n >> 2) * 2] = rc2; 630 | c[j * n + i + (n >> 2) * 3] = rc3; 631 | } /* end i-loop */ 632 | } /* end j-loop */ 633 | clock_gettime(CLOCK_REALTIME, rt + 1); 634 | } 635 | break; 636 | default: 637 | /* 638 | * cublasSgemm in CUBLAS 639 | */ 640 | if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle)) { 641 | printf("error: initialization (CUBLAS)\n"); 642 | cublasDestroy(handle); 643 | exit(EXIT_FAILURE); 644 | } 645 | if (cudaSuccess != cudaMalloc((void **) &a_dev, sizeof(*a) * n * n) || 646 | cudaSuccess != cudaMalloc((void **) &b_dev, sizeof(*b) * n * n) || 647 | cudaSuccess != cudaMalloc((void **) &c_dev, sizeof(*c) * n * n)) { 648 | printf("error: memory allocation (CUDA)\n"); 649 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 650 | cublasDestroy(handle); 651 | exit(EXIT_FAILURE); 652 | } 653 | if (CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*a), a, n, a_dev, n) || 654 | CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*b), b, n, b_dev, n) || 655 | CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*c), c, n, c_dev, n)) { 656 | printf("error: host --> accl (CUBLAS)\n"); 657 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 658 | cublasDestroy(handle); 659 | exit(EXIT_FAILURE); 660 | } 661 | clock_gettime(CLOCK_REALTIME, rt + 0); 662 | if (CUBLAS_STATUS_SUCCESS != cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 663 | n, n, n, &alfa, a_dev, n, b_dev, n, &beta, c_dev, n)) { 664 | printf("error: cublasSgemm (CUBLAS)\n"); 665 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 666 | cublasDestroy(handle); 667 | exit(EXIT_FAILURE); 668 | } 669 | if (cudaSuccess != cudaDeviceSynchronize()) { 670 | printf("error: device synchronization (CUDA)\n"); 671 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 672 | cublasDestroy(handle); 673 | exit(EXIT_FAILURE); 674 | } 675 | clock_gettime(CLOCK_REALTIME, rt + 1); 676 | if (CUBLAS_STATUS_SUCCESS != cublasGetMatrix(n, n, sizeof(*c), c_dev, n, c, n)) { 677 | printf("error: accl --> host (CUBLAS)\n"); 678 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 679 | cublasDestroy(handle); 680 | exit(EXIT_FAILURE); 681 | } 682 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 683 | cublasDestroy(handle); 684 | break; 685 | } /* end switch (ial) */ 686 | if (wtcalc >= 0.0) { 687 | wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 688 | } 689 | } 690 | --------------------------------------------------------------------------------
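matMulAB_accl() above accumulates its kernel walltime into the global wtcalc whenever wtcalc is non-negative. A hedged usage sketch follows: a hypothetical driver.c (not part of the repository) that seeds wtcalc, runs the shared-memory variant (ial == 8), and checks the result against a naive host-side reference. The file name driver.c and the chosen n are assumptions for illustration; only matMulAB_accl, wtcalc, and the BLKROW/BLKDIM size constraints come from the source file above.

/*
 * driver.c (hypothetical, not part of the repository): minimal
 * host-side sketch for matMulAB_accl(). It assumes the prototype from
 * matMulAB.h, the wtcalc accumulator defined in matMulAB.c,
 * column-major n x n matrices, and an n that is a multiple of
 * BLKROW (512), so that the unrolled kernels stay in bounds.
 * Link against cuBLAS/CUDA and libm.
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "matMulAB.h"

extern double wtcalc; /* walltime accumulator defined in matMulAB.c */

int main(void)
{
  int n = 1024; /* assumed size: multiple of BLKROW == 512 and BLKDIM == 16 */
  float *a = malloc(sizeof(*a) * n * n),
        *b = malloc(sizeof(*b) * n * n),
        *c = malloc(sizeof(*c) * n * n),
        *r = malloc(sizeof(*r) * n * n);
  if (NULL == a || NULL == b || NULL == c || NULL == r) {
    printf("error: memory allocation\n");
    return EXIT_FAILURE;
  }
  for (int i = 0; i < n * n; ++i) {
    a[i] = (float) rand() / RAND_MAX;
    b[i] = (float) rand() / RAND_MAX;
    c[i] = r[i] = 0.0f;
  }
  wtcalc = 0.0; /* enable walltime accumulation in matMulAB_accl */
  matMulAB_accl(a, b, c, n, 8); /* ial == 8: shared-memory version */
  /* naive column-major reference on the host: r := a * b + r */
  for (int j = 0; j < n; ++j)
    for (int k = 0; k < n; ++k)
      for (int i = 0; i < n; ++i)
        r[j * n + i] += a[k * n + i] * b[j * n + k];
  float err = 0.0f;
  for (int i = 0; i < n * n; ++i)
    err = fmaxf(err, fabsf(c[i] - r[i]));
  printf("max. abs. error: %e\n", err);
  printf("kernel walltime: %f s (%.1f GFLOPS)\n",
         wtcalc, 2.0 * n * n * n / (wtcalc * 1.0e9));
  free(a); free(b); free(c); free(r);
  return EXIT_SUCCESS;
}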