├── 03_taskwait ├── src │ ├── Makefile.am │ └── taskwait.c ├── Makefile.am ├── docs │ ├── Doxyfile.in │ ├── UserManual.md │ └── Makefile.am ├── tests │ ├── Makefile.am │ └── taskwait_real_00.sh ├── README.md └── configure.ac ├── 00_build_OpenMP_offload ├── Clang │ ├── 00_check_gpu │ │ ├── realscript.sh │ │ ├── tesla.sh │ │ └── tesla.log │ ├── bugs.md │ └── build_clang_offload.md └── GCC │ ├── 00_check_gpu │ ├── realscript.sh │ ├── tesla.sh │ └── tesla.log │ ├── 02_build │ ├── build.sh │ └── realscript.sh │ ├── bugs.md │ ├── 01_download │ └── download.sh │ └── build_gcc_offload.md ├── 09_matAdd ├── src │ ├── Makefile.am │ ├── matAddAB.h │ ├── matAdd.c │ └── matAddAB.c ├── Makefile.am ├── tests │ ├── matAdd_real_00.sh │ ├── Makefile.am │ └── matAdd_real_00.sh.5422334.out ├── docs │ ├── Doxyfile.in │ ├── Makefile.am │ └── UserManual.md ├── README.md └── configure.ac ├── 10_matMul ├── src │ ├── Makefile.am │ ├── matMulAB.h │ ├── matMul.c │ └── matMulAB.c ├── Makefile.am ├── tests │ ├── matMul_real_00.sh │ ├── Makefile.am │ └── matMul_real_00.sh.5422392.out ├── docs │ ├── Doxyfile.in │ ├── Makefile.am │ └── UserManual.md ├── README.md └── configure.ac ├── 01_accelQuery ├── src │ ├── Makefile.am │ ├── prtAccelInfo.h │ ├── accelQuery.c │ └── prtAccelInfo.c ├── Makefile.am ├── tests │ ├── accelQuery_real_00.sh │ └── Makefile.am ├── docs │ ├── Doxyfile.in │ ├── UserManual.md │ └── Makefile.am ├── README.md └── configure.ac ├── 02_dataTransRate ├── src │ ├── Makefile.am │ ├── check1ns.c │ ├── check1ns.h │ └── dataTransRate.c ├── Makefile.am ├── tests │ ├── dataTransRate_real_00.sh │ └── Makefile.am ├── docs │ ├── Doxyfile.in │ ├── UserManual.md │ └── Makefile.am ├── README.md └── configure.ac ├── 04_scalarAddition ├── src │ ├── Makefile.am │ ├── check1ns.c │ ├── check1ns.h │ └── scalarAddition.c ├── Makefile.am ├── tests │ ├── scalarAddition_real_00.sh │ └── Makefile.am ├── docs │ ├── Doxyfile.in │ ├── UserManual.md │ └── Makefile.am ├── README.md └── configure.ac ├── 05_saxpy ├── Makefile.am ├── tests │ ├── saxpy_real_00.sh │ ├── Makefile.am │ └── saxpy_real_00.sh.5422320.out ├── src │ ├── Makefile.am │ ├── wtcalc.c │ ├── wtcalc.h │ ├── check1ns.c │ ├── check1ns.h │ ├── hsaxpy.h │ ├── asaxpy.h │ ├── hsaxpy.c │ ├── saxpy.c │ └── asaxpy.c ├── docs │ ├── Doxyfile.in │ ├── Makefile.am │ └── UserManual.md ├── README.md └── configure.ac ├── 08_distThreads ├── src │ ├── Makefile.am │ ├── gpuThreads.h │ ├── distThreads.c │ └── gpuThreads.c ├── Makefile.am ├── tests │ ├── distThreads_real_00.sh │ └── Makefile.am ├── docs │ ├── Doxyfile.in │ ├── Makefile.am │ └── UserManual.md ├── README.md └── configure.ac ├── simplifiedCode ├── 01_accelQuery │ └── accelQuery.c ├── 04_scalarAddition │ └── scalarAddition.c ├── 05_saxpy │ └── saxpy.c └── 02_dataTransRate │ └── dataTransRate.c └── README.md /03_taskwait/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = taskwait 2 | taskwait_SOURCES = taskwait.c 3 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/Clang/00_check_gpu/realscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Now, on $(hostname)" 3 | nvidia-smi 4 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/00_check_gpu/realscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Now, on $(hostname)" 3 | 
nvidia-smi 4 | -------------------------------------------------------------------------------- /09_matAdd/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = matAdd 2 | matAdd_SOURCES = matAdd.c \ 3 | matAddAB.h \ 4 | matAddAB.c 5 | -------------------------------------------------------------------------------- /10_matMul/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = matMul 2 | matMul_SOURCES = matMul.c \ 3 | matMulAB.h \ 4 | matMulAB.c 5 | -------------------------------------------------------------------------------- /01_accelQuery/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = accelQuery 2 | accelQuery_SOURCES = accelQuery.c \ 3 | prtAccelInfo.h \ 4 | prtAccelInfo.c 5 | -------------------------------------------------------------------------------- /02_dataTransRate/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = dataTransRate 2 | dataTransRate_SOURCES = dataTransRate.c \ 3 | check1ns.h \ 4 | check1ns.c 5 | -------------------------------------------------------------------------------- /03_taskwait/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /04_scalarAddition/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = scalarAddition 2 | scalarAddition_SOURCES = scalarAddition.c \ 3 | check1ns.h \ 4 | check1ns.c 5 | -------------------------------------------------------------------------------- /05_saxpy/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /08_distThreads/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = distThreads 2 | distThreads_SOURCES = distThreads.c \ 3 | gpuThreads.h \ 4 | gpuThreads.c 5 | -------------------------------------------------------------------------------- /09_matAdd/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /10_matMul/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /01_accelQuery/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /02_dataTransRate/Makefile.am: 
-------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /08_distThreads/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /04_scalarAddition/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | dist_doc_DATA = README.md 3 | if HAVE_DOXYGEN 4 | SUBDIRS += docs 5 | endif 6 | SUBDIRS += tests 7 | -------------------------------------------------------------------------------- /05_saxpy/tests/saxpy_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N saxpy 3 | #CCS -t 600m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/saxpy 9 | -------------------------------------------------------------------------------- /10_matMul/tests/matMul_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N matMul 3 | #CCS -t 600m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/matMul $((2**12)) 9 | -------------------------------------------------------------------------------- /01_accelQuery/tests/accelQuery_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N accelQuery 3 | #CCS -t 10m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/accelQuery 9 | -------------------------------------------------------------------------------- /08_distThreads/tests/distThreads_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N distThreads 3 | #CCS -t 10m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/distThreads 9 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/Clang/00_check_gpu/tesla.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N nvidia_smi 3 | #CCS -t 1m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:ncpus=1:mem=8g:vmem=16g:tesla=1 6 | 7 | sh -x realscript.sh 2>&1 | tee tesla.log 8 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/00_check_gpu/tesla.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N nvidia_smi 3 | #CCS -t 1m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:ncpus=1:mem=8g:vmem=16g:tesla=1 6 | 7 | sh -x realscript.sh 2>&1 | tee tesla.log 8 | -------------------------------------------------------------------------------- /02_dataTransRate/tests/dataTransRate_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N dataTransRate 3 | #CCS -t 10m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 
6 | 7 | echo "hallo from $(hostname)" 8 | ../src/dataTransRate 9 | -------------------------------------------------------------------------------- /05_saxpy/src/Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS = saxpy 2 | saxpy_SOURCES = saxpy.c \ 3 | check1ns.h \ 4 | check1ns.c \ 5 | hsaxpy.h \ 6 | hsaxpy.c \ 7 | asaxpy.h \ 8 | asaxpy.c \ 9 | wtcalc.h \ 10 | wtcalc.c 11 | -------------------------------------------------------------------------------- /04_scalarAddition/tests/scalarAddition_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N scalarAddition 3 | #CCS -t 10m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:ncpus=1:mem=4g:vmem=8g:gtx1080=2 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/scalarAddition 9 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/02_build/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N build 3 | #CCS -t 600m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:ncpus=16:mem=32g:vmem=32g:tesla=1 6 | 7 | module load system/CUDA/10.1.105 8 | sh -x realscript.sh 2>&1 | tee build.log 9 | -------------------------------------------------------------------------------- /09_matAdd/tests/matAdd_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N matAdd 3 | #CCS -t 600m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:gtx1080=1,place=:excl 6 | 7 | echo "hallo from $(hostname)" 8 | ../src/matAdd $((2**12)) 9 | ../src/matAdd $((2**13)) 10 | -------------------------------------------------------------------------------- /05_saxpy/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /09_matAdd/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /10_matMul/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /01_accelQuery/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /03_taskwait/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | 
-------------------------------------------------------------------------------- /08_distThreads/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /02_dataTransRate/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /04_scalarAddition/docs/Doxyfile.in: -------------------------------------------------------------------------------- 1 | PROJECT_NAME = @PACKAGE_NAME@ 2 | PROJECT_NUMBER = @PACKAGE_VERSION@ 3 | INPUT = @top_srcdir@/src 4 | RECURSIVE = YES 5 | GENERATE_LATEX = NO 6 | QUIET = YES 7 | -------------------------------------------------------------------------------- /05_saxpy/src/wtcalc.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file wtcalc.c 3 | * 4 | * @brief Global variable for walltime of the calculation kernel. 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 05.04.2020 8 | * @copyright CC BY-SA 2.0 9 | */ 10 | 11 | #include "wtcalc.h" 12 | 13 | double wtcalc; 14 | -------------------------------------------------------------------------------- /04_scalarAddition/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: scalarAddition 3 | author: Xin Wu (PC²) 4 | date: 08.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `scalarAddition` adds two integers on host and accelerator, and also compares 10 | the performance. 11 | 12 | # Usage 13 | 14 | ```bash 15 | scalarAddition 16 | ``` 17 | 18 | -------------------------------------------------------------------------------- /05_saxpy/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = saxpy_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | saxpy_test_00.sh: $(top_srcdir)/src/saxpy 12 | echo "${cmdrun} saxpy_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = saxpy_test_00.sh 16 | -------------------------------------------------------------------------------- /01_accelQuery/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: accelQuery 3 | author: Xin Wu (PC²) 4 | date: 04.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `accelQuery` searches accelerator(s) on a heterogeneous computer. 10 | Accelerator(s), if found, will be enumerated with some basic info. 
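The enumeration logic follows the pattern in `src/accelQuery.c` (shown in full later in this repository); a minimal self-contained sketch, with the `prtAccelInfo()` call replaced by a plain `printf()` so that the snippet compiles on its own, is:

```c
#include <stdio.h>
#include <omp.h>

int main(void)
{
  int naccel = omp_get_num_devices(); /* accelerators visible to the OpenMP runtime */

  if (0 == naccel) return 0;          /* host-only system: nothing to enumerate */
  printf("%d Accelerator(s) found\n", naccel);
  for (int iaccel = 0; iaccel < naccel; iaccel++) {
    printf("accelerator %d\n", iaccel); /* the real code calls prtAccelInfo(iaccel) here */
  }
  return 0;
}
```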
11 | 12 | # Usage 13 | 14 | ```bash 15 | accelQuery 16 | ``` 17 | 18 | -------------------------------------------------------------------------------- /09_matAdd/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = matAdd_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | matAdd_test_00.sh: $(top_srcdir)/src/matAdd 12 | echo "${cmdrun} matAdd_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = matAdd_test_00.sh 16 | -------------------------------------------------------------------------------- /10_matMul/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = matMul_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | matMul_test_00.sh: $(top_srcdir)/src/matMul 12 | echo "${cmdrun} matMul_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = matMul_test_00.sh 16 | -------------------------------------------------------------------------------- /03_taskwait/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: taskwait 3 | author: Xin Wu (PC²) 4 | date: 08.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `taskwait` checks the `taskwait` construct for the deferred target task. At the 10 | time of writing, this hasn't been implemented in the GCC 9.2 compiler. 11 | 12 | # Usage 13 | 14 | ```bash 15 | taskwait 16 | ``` 17 | 18 | -------------------------------------------------------------------------------- /03_taskwait/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = taskwait_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | taskwait_test_00.sh: $(top_srcdir)/src/taskwait 12 | echo "${cmdrun} taskwait_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = taskwait_test_00.sh 16 | -------------------------------------------------------------------------------- /01_accelQuery/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = accelQuery_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | accelQuery_test_00.sh: $(top_srcdir)/src/accelQuery 12 | echo "${cmdrun} accelQuery_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = accelQuery_test_00.sh 16 | -------------------------------------------------------------------------------- /08_distThreads/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = distThreads_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | distThreads_test_00.sh: $(top_srcdir)/src/distThreads 12 | echo "${cmdrun} distThreads_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = distThreads_test_00.sh 16 | -------------------------------------------------------------------------------- /02_dataTransRate/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | 
endif 6 | 7 | check_SCRIPTS = dataTransRate_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | dataTransRate_test_00.sh: $(top_srcdir)/src/dataTransRate 12 | echo "${cmdrun} dataTransRate_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = dataTransRate_test_00.sh 16 | -------------------------------------------------------------------------------- /04_scalarAddition/tests/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_CCSALLOC 2 | cmdrun = ${CCSALLOC} 3 | else 4 | cmdrun = `which bash` 5 | endif 6 | 7 | check_SCRIPTS = scalarAddition_test_00.sh 8 | 9 | TESTS = $(check_SCRIPTS) 10 | 11 | scalarAddition_test_00.sh: $(top_srcdir)/src/scalarAddition 12 | echo "${cmdrun} scalarAddition_real_00.sh" > $@ 13 | chmod +x $@ 14 | 15 | CLEANFILES = scalarAddition_test_00.sh 16 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/bugs.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Bugs Found in GCC 3 | author: Xin Wu (PC²) 4 | date: 15.01.2020 5 | --- 6 | 7 | # Asynchronous Offloading Execution 8 | 9 | This has not been fully implemented in GCC. See `03_taskwait`. 10 | 11 | # Limitation of the Number of GPU Threads in a Team 12 | 13 | The number of GPU threads in a team (a contention group) is limited to 8. See 14 | `05_saxpy_v1` and `06_saxpy_v2`. 15 | -------------------------------------------------------------------------------- /02_dataTransRate/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: dataTransRate 3 | author: Xin Wu (PC²) 4 | date: 07.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `dataTransRate` gives the data transfer rate (in MB/sec) from `src` to `dst`. 10 | 11 | The possible situations are: 12 | 13 | * h2h: `src` = host and `dst` = host 14 | * h2a: `src` = host and `dst` = accel 15 | * a2a: `src` = accel and `dst` = accel 16 | 17 | # Usage 18 | 19 | ```bash 20 | export CUDA_LAUNCH_BLOCKING=1 21 | dataTransRate 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /03_taskwait/tests/taskwait_real_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #CCS -N taskwait 3 | #CCS -t 10m 4 | #CCS -g pc2-mitarbeiter 5 | #CCS --res=rset=1:ncpus=1:mem=4g:vmem=8g:gtx1080=2 6 | 7 | echo "hallo from $(hostname)" 8 | if [ 0 -eq 1 ]; then 9 | # 10 | # Asynchronous offloading is not available in GCC 9.2.0. 11 | # 12 | notImpld=$(../src/taskwait 2>&1 | grep "GOMP_OFFLOAD_async_run") 13 | [[ $notImpld =~ "unimplemented" ]] 14 | else 15 | # 16 | # Asynchronous offloading is available in Clang/LLVM 9.0.1. 17 | # 18 | ../src/taskwait 19 | fi 20 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/Clang/bugs.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Bugs Found in Clang/LLVM 3 | author: Xin Wu (PC²) 4 | date: 15.01.2020 5 | --- 6 | 7 | # Activation of Accelerator 8 | 9 | `omp_get_num_devices()` always returns 0 if the accelerator(s) have not been 10 | activated by an OpenMP directive, even though there are accelerator(s) in the 11 | computing system. See `02_dataTransRate`. 12 | 13 | NOTE: This bug has been fixed in Clang 11. But the data transfer rate within 14 | an accelerator's DRAM, e.g. `a2a`, is still much lower than expected.
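For the affected Clang versions, a workaround consistent with the pattern used in `01_accelQuery` (a sketch, not a file from this repository) is to activate the device with an empty `target` region, which initializes the offload runtime before the query:

```c
#include <stdio.h>
#include <omp.h>

int main(void)
{
  /* An empty target region activates the accelerator(s), so that the
   * subsequent query can report the real device count. */
  #pragma omp target
  { }
  printf("%d accelerator(s) found.\n", omp_get_num_devices());
  return 0;
}
```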
15 | -------------------------------------------------------------------------------- /04_scalarAddition/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: scalarAddition 3 | author: Xin Wu (PC²) 4 | date: 08.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `scalarAddition` adds two integers on host and accelerator, and also compares 10 | the performance. 11 | 12 | # Build 13 | 14 | ```bash 15 | autoreconf -i; ./configure; make; make check; sudo make install; 16 | ``` 17 | 18 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS). 19 | 20 | # Documentation 21 | 22 | * docs/html/index.html: Source code documentation generated by Doxygen. 23 | 24 | * docs/UserManual.md: User Manual. 25 | 26 | -------------------------------------------------------------------------------- /05_saxpy/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /05_saxpy/src/wtcalc.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file wtcalc.h 3 | * 4 | * @brief Global variable for walltime of the calculation kernel. 
5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 05.04.2020 8 | * @copyright CC BY-SA 2.0 9 | */ 10 | 11 | #ifdef __cplusplus 12 | extern "C" { 13 | #endif 14 | 15 | #ifndef WTCALC_H 16 | #define WTCALC_H 17 | 18 | /* 19 | * wtcalc: walltime for the calculation kernel 20 | * 21 | * - wtcalc < 0.0: reset and disable the timer 22 | * - wtcalc == 0.0: enable the timer 23 | */ 24 | extern double wtcalc; 25 | 26 | #endif 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | -------------------------------------------------------------------------------- /01_accelQuery/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /03_taskwait/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /09_matAdd/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /10_matMul/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | 
endif 28 | -------------------------------------------------------------------------------- /01_accelQuery/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: accelQuery 3 | author: Xin Wu (PC²) 4 | date: 04.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `accelQuery` searches accelerator(s) on a heterogeneous computer. 10 | Accelerator(s), if found, will be enumerated with some basic info. 11 | 12 | # Build 13 | 14 | ```bash 15 | autoreconf -i; ./configure; make; make check 16 | ``` 17 | 18 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS). 19 | 20 | # Documentation 21 | 22 | * docs/html/index.html: Source code documentation generated by Doxygen. 23 | 24 | * docs/UserManual.md: User Manual. 25 | 26 | -------------------------------------------------------------------------------- /02_dataTransRate/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /04_scalarAddition/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /08_distThreads/docs/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_doc_DATA = UserManual.md 2 | 3 | if HAVE_DOXYGEN 4 | 5 | htmlpkg = $(PACKAGE_NAME)-$(PACKAGE_VERSION)-html.tar.gz 6 | doc_DATA = $(htmlpkg) 7 | docstamp = doc.stamp 8 | 9 | $(htmlpkg): $(docstamp) 10 | tar chof - html | gzip -9 -c > $@ 11 | 12 | $(docstamp): Doxyfile 13 | $(DOXYGEN) $< 14 | echo Timestamp > $@ 15 | 16 | install-data-hook: 17 | cd $(DESTDIR)$(docdir) && tar xf $(htmlpkg) 18 | 19 | uninstall-hook: 20 | cd $(DESTDIR)$(docdir) && $(RM) -fr html 21 | 22 | CLEANFILES = $(docstamp) $(htmlpkg) 23 | 24 | clean-local: 25 | $(RM) -fr html 26 | 27 | endif 28 | -------------------------------------------------------------------------------- /03_taskwait/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: taskwait 3 | author: Xin Wu (PC²) 4 | date: 08.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `taskwait` checks the `taskwait` construct for the deferred target task. 
At the 10 | time of writing, this hasn't been implemented in the GCC 9.2 compiler. 11 | 12 | # Build 13 | 14 | ```bash 15 | autoreconf -i; ./configure; make; make check; sudo make install; 16 | ``` 17 | 18 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS). 19 | 20 | # Documentation 21 | 22 | * docs/html/index.html: Source code documentation generated by Doxygen. 23 | 24 | * docs/UserManual.md: User Manual. 25 | 26 | -------------------------------------------------------------------------------- /05_saxpy/src/check1ns.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.c 3 | * @brief Function definition for checking 1 ns time resolution on the system. 4 | * 5 | * This source file contains function definition for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 07.01.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #include <stdio.h> 14 | #include <stdlib.h> 15 | #include <time.h> 16 | #include <assert.h> 17 | #include "check1ns.h" 18 | 19 | void check1ns(void) 20 | { 21 | struct timespec res; 22 | 23 | if (0 != clock_getres(CLOCK_REALTIME, &res)) { 24 | printf("error: clock_getres\n"); 25 | exit(EXIT_FAILURE); 26 | } 27 | assert(1l == res.tv_nsec); 28 | } 29 | -------------------------------------------------------------------------------- /02_dataTransRate/src/check1ns.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.c 3 | * @brief Function definition for checking 1 ns time resolution on the system. 4 | * 5 | * This source file contains function definition for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 07.01.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #include <stdio.h> 14 | #include <stdlib.h> 15 | #include <time.h> 16 | #include <assert.h> 17 | #include "check1ns.h" 18 | 19 | void check1ns(void) 20 | { 21 | struct timespec res; 22 | 23 | if (0 != clock_getres(CLOCK_REALTIME, &res)) { 24 | printf("error: clock_getres\n"); 25 | exit(EXIT_FAILURE); 26 | } 27 | assert(1l == res.tv_nsec); 28 | } 29 | -------------------------------------------------------------------------------- /04_scalarAddition/src/check1ns.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.c 3 | * @brief Function definition for checking 1 ns time resolution on the system. 4 | * 5 | * This source file contains function definition for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 07.01.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #include <stdio.h> 14 | #include <stdlib.h> 15 | #include <time.h> 16 | #include <assert.h> 17 | #include "check1ns.h" 18 | 19 | void check1ns(void) 20 | { 21 | struct timespec res; 22 | 23 | if (0 != clock_getres(CLOCK_REALTIME, &res)) { 24 | printf("error: clock_getres\n"); 25 | exit(EXIT_FAILURE); 26 | } 27 | assert(1l == res.tv_nsec); 28 | } 29 | -------------------------------------------------------------------------------- /08_distThreads/src/gpuThreads.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file gpuThreads.h 3 | * @brief Function prototype for organizing GPU threads. 4 | * 5 | * This header file contains function prototype for organizing GPU threads.
6 | * 7 | * @author Xin Wu (PC²) 8 | * @date 12.03.2020 9 | * @copyright CC BY-SA 2.0 10 | */ 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | #ifndef GPUTHREADS_H 17 | #define GPUTHREADS_H 18 | 19 | void gpuThreads(int i); 20 | /**< 21 | * @brief Show the organization of GPU threads. 22 | * 23 | * The ith organization of GPU threads is shown. 24 | * 25 | * @param i The ith organization. 26 | * 27 | * @return \c void. 28 | */ 29 | 30 | #endif 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | -------------------------------------------------------------------------------- /02_dataTransRate/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: dataTransRate 3 | author: Xin Wu (PC²) 4 | date: 07.01.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `dataTransRate` gives the data transfer rate (in MB/sec) from `src` to `dst`. 10 | 11 | The possible situations are: 12 | 13 | * h2h: `src` = host and `dst` = host 14 | * h2a: `src` = host and `dst` = accel 15 | * a2a: `src` = accel and `dst` = accel 16 | 17 | # Build 18 | 19 | ```bash 20 | autoreconf -i; ./configure; make; make check 21 | ``` 22 | 23 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS). 24 | 25 | # Documentation 26 | 27 | * docs/html/index.html: Source code documentation generated by Doxygen. 28 | 29 | * docs/UserManual.md: User Manual. 30 | 31 | -------------------------------------------------------------------------------- /01_accelQuery/src/prtAccelInfo.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file prtAccelInfo.h 3 | * @brief Function prototype for prtAccelInfo. 4 | * 5 | * This header file contains function prototype for prtAccelInfo. 6 | * 7 | * @author Xin Wu (PC²) 8 | * @date 04.01.2020 9 | * @copyright CC BY-SA 2.0 10 | */ 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | #ifndef PRTACCELINFO_H 17 | #define PRTACCELINFO_H 18 | 19 | void prtAccelInfo(int iaccel); 20 | /**< 21 | * @brief Print some basic info of an accelerator. 22 | * 23 | * Strictly speaking, \c prtAccelInfo() can only print the basic info of an 24 | * Nvidia CUDA device. 25 | * 26 | * @param iaccel The index of an accelerator. 27 | * 28 | * @return \c void. 29 | */ 30 | 31 | #endif 32 | 33 | #ifdef __cplusplus 34 | } 35 | #endif 36 | -------------------------------------------------------------------------------- /simplifiedCode/01_accelQuery/accelQuery.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file accelQuery.c 3 | * 4 | * @brief accelQuery searches accelerator(s) on a heterogeneous computer. 5 | * 6 | * Host-only: 7 | * gcc -Wall -fopenmp -foffload=disable accelQuery.c 8 | * 9 | * Offload to GPU: 10 | * gcc -Wall -fopenmp -foffload=nvptx-none accelQuery.c 11 | * 12 | */ 13 | 14 | #include <stdio.h> 15 | #include <omp.h> 16 | 17 | int main(void) 18 | { 19 | #pragma omp target 20 | { 21 | if (omp_is_initial_device()) { 22 | printf("Hello World from Host.\n"); 23 | } else { 24 | printf("Hello World from Accelerator.\n"); 25 | } 26 | /* 27 | * Question: Why may this give the _wrong_ number of accelerators?
FIXME 28 | */ 29 | printf("%d accelerator found.\n", omp_get_num_devices()); 30 | } 31 | return 0; 32 | } 33 | -------------------------------------------------------------------------------- /05_saxpy/src/check1ns.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.h 3 | * @brief Function prototype for checking 1 ns time resolution on the system. 4 | * 5 | * This header file contains function prototype for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 07.01.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | #ifndef CHECK1NS_H 18 | #define CHECK1NS_H 19 | 20 | void check1ns(void); 21 | /**< 22 | * @brief Check whether 1 ns time resolution is available on the system. 23 | * 24 | * We need 1 ns time resolution. If it's available, program continues normally. 25 | * Otherwise, program terminates. 26 | * 27 | * @return \c void. 28 | */ 29 | 30 | #endif 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | -------------------------------------------------------------------------------- /02_dataTransRate/src/check1ns.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.h 3 | * @brief Function prototype for checking 1 ns time resolution on the system. 4 | * 5 | * This header file contains function prototype for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 12.03.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | #ifndef CHECK1NS_H 18 | #define CHECK1NS_H 19 | 20 | void check1ns(void); 21 | /**< 22 | * @brief Check whether 1 ns time resolution is available on the system. 23 | * 24 | * We need 1 ns time resolution. If it's available, program continues normally. 25 | * Otherwise, program terminates. 26 | * 27 | * @return \c void. 28 | */ 29 | 30 | #endif 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | -------------------------------------------------------------------------------- /04_scalarAddition/src/check1ns.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file check1ns.h 3 | * @brief Function prototype for checking 1 ns time resolution on the system. 4 | * 5 | * This header file contains function prototype for checking 1 ns time 6 | * resolution on the system. 7 | * 8 | * @author Xin Wu (PC²) 9 | * @date 07.01.2020 10 | * @copyright CC BY-SA 2.0 11 | */ 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | #ifndef CHECK1NS_H 18 | #define CHECK1NS_H 19 | 20 | void check1ns(void); 21 | /**< 22 | * @brief Check whether 1 ns time resolution is available on the system. 23 | * 24 | * We need 1 ns time resolution. If it's available, program continues normally. 25 | * Otherwise, program terminates. 26 | * 27 | * @return \c void. 28 | */ 29 | 30 | #endif 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | -------------------------------------------------------------------------------- /03_taskwait/src/taskwait.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file taskwait.c 3 | * 4 | * @mainpage taskwait 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 08.01.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * taskwait checks the taskwait construct for the deferred target task. 
At the 11 | * time of writing, this hasn't been implemented in the GCC 9.2 compiler. 12 | */ 13 | 14 | #include <assert.h> 15 | #ifdef _OPENMP 16 | #include <omp.h> 17 | #endif 18 | 19 | /** 20 | * @brief Main entry point for taskwait. 21 | */ 22 | int main(int argc, char *argv[]) 23 | { 24 | int a, b, c, 25 | x, y, z; 26 | 27 | a = x = 2; 28 | b = y = 4; 29 | #pragma omp target map(a, b, c) nowait 30 | { 31 | c = a + b; /* This is executed on accelerator. */ 32 | } 33 | z = x + y; /* This is executed on host. */ 34 | #pragma omp taskwait 35 | assert(c == z); 36 | return 0; 37 | } 38 | -------------------------------------------------------------------------------- /05_saxpy/tests/saxpy_real_00.sh.5422320.out: -------------------------------------------------------------------------------- 1 | hallo from gpu029 2 | The system supports 1 ns time resolution 3 | total size of x and y is 512.0 MB 4 | tests are averaged over 32 loops 5 | saxpy on host (0) : 65092.0 MB/s 65093.6 MB/s maxabserr = 0.0 6 | saxpy on host (1) : 70769.4 MB/s 70772.0 MB/s maxabserr = 0.0 7 | saxpy on accl (1) : 1400.7 MB/s 4648.9 MB/s maxabserr = 0.0 8 | saxpy on accl (2) : 1371.6 MB/s 4653.9 MB/s maxabserr = 0.0 9 | saxpy on accl (3) : 2046.7 MB/s 227586.6 MB/s maxabserr = 0.0 10 | saxpy on accl (4) : 2062.4 MB/s 224540.3 MB/s maxabserr = 0.0 11 | saxpy on accl (5) : 2073.9 MB/s 276659.5 MB/s maxabserr = 0.0 12 | saxpy on accl (6) : 2045.0 MB/s 271431.4 MB/s maxabserr = 0.0 13 | saxpy on accl (7) : 2025.2 MB/s 280631.7 MB/s maxabserr = 0.0 14 | saxpy on accl (8) : 2025.7 MB/s 279577.4 MB/s maxabserr = 0.0 15 | -------------------------------------------------------------------------------- /10_matMul/tests/matMul_real_00.sh.5422392.out: -------------------------------------------------------------------------------- 1 | hallo from gpu028 2 | matrix dim: 4096 x 4096 3 | time averaged over 16 loops 4 | matMulAB (0) : 24.9 GFLOPS 25.5 GFLOPS maxabserr = 0.0 5 | matMulAB (1) : 9.8 GFLOPS 9.9 GFLOPS maxabserr = 0.0 6 | matMulAB (2) : 184.5 GFLOPS 228.9 GFLOPS maxabserr = 0.0 7 | matMulAB (3) : 5.0 GFLOPS 5.1 GFLOPS maxabserr = 1018.4 8 | matMulAB (4) : 176.1 GFLOPS 216.2 GFLOPS maxabserr = 0.0 9 | matMulAB (5) : 340.9 GFLOPS 531.9 GFLOPS maxabserr = 0.0 10 | matMulAB (6) : 610.3 GFLOPS 1708.9 GFLOPS maxabserr = 0.0 11 | matMulAB (7) : 218.9 GFLOPS 284.6 GFLOPS maxabserr = 0.0 12 | matMulAB (8) : 233.8 GFLOPS 310.4 GFLOPS maxabserr = 0.0 13 | matMulAB (9) : 254.5 GFLOPS 348.1 GFLOPS maxabserr = 0.0 14 | matMulAB (10) : 931.6 GFLOPS 10126.1 GFLOPS maxabserr = 0.0 15 | -------------------------------------------------------------------------------- /09_matAdd/src/matAddAB.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matAddAB.h 3 | * 4 | * @brief Function prototype for matrix addition (A += B) in single-precision. 5 | * 6 | * This header file contains function prototype for matrix addition (A += B) 7 | * in single-precision. 8 | * 9 | * @author Xin Wu (PC²) 10 | * @date 07.02.2020 11 | * @copyright CC BY-SA 2.0 12 | */ 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | #ifndef MATADDAB_H 19 | #define MATADDAB_H 20 | 21 | void matAddAB_accl(float *a, 22 | float *b, 23 | int n, 24 | int ial); 25 | /**< 26 | * @brief Perform matrix addition (A += B) on accl. 27 | * 28 | * @return \c void.
29 | */ 30 | 31 | /* 32 | * wtcalc: walltime for the calculation kernel on GPU 33 | * 34 | * - wtcalc < 0.0: reset and disable the timer 35 | * - wtcalc == 0.0: enable the timer 36 | */ 37 | extern double wtcalc; 38 | 39 | #endif 40 | 41 | #ifdef __cplusplus 42 | } 43 | #endif 44 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/01_download/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # nvptx-tools 4 | # 5 | echo "nvptx-tools" 6 | git clone https://github.com/MentorEmbedded/nvptx-tools.git 7 | cd nvptx-tools 8 | git checkout -b gcc9_gpu 5f6f343a302d620b0868edab376c00b15741e39e 9 | cd .. 10 | # 11 | # nvptx-newlib 12 | # 13 | echo "nvptx-newlib" 14 | git clone https://github.com/MentorEmbedded/nvptx-newlib.git 15 | cd nvptx-newlib 16 | git checkout -b gcc9_gpu 66dd175a9d3aea387715f00ff18ef7e535cd1272 17 | cd .. 18 | # 19 | # openacc-gcc-9-branch 20 | # 21 | echo "openacc-gcc-9-branch" 22 | wget https://github.com/gcc-mirror/gcc/archive/gcc-9_2_0-release.tar.gz 23 | tar xf gcc-9_2_0-release.tar.gz 24 | cd gcc-gcc-9_2_0-release 25 | ./contrib/download_prerequisites 26 | ln -s ../nvptx-newlib/newlib newlib 27 | cd .. 28 | # 29 | # Done 30 | # 31 | echo "Done" 32 | -------------------------------------------------------------------------------- /10_matMul/src/matMulAB.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matMulAB.h 3 | * 4 | * @brief Function prototype for matrix multiplication in single-precision. 5 | * 6 | * This header file contains function prototype for matrix multiplication 7 | * in single-precision. 8 | * 9 | * @author Xin Wu (PC²) 10 | * @date 07.02.2020 11 | * @copyright CC BY-SA 2.0 12 | */ 13 | 14 | #ifdef __cplusplus 15 | extern "C" { 16 | #endif 17 | 18 | #ifndef MATMULAB_H 19 | #define MATMULAB_H 20 | 21 | void matMulAB_accl(float *a, 22 | float *b, 23 | float *c, 24 | int n, 25 | int ial); 26 | /**< 27 | * @brief Perform matrix multiplication on accl. 28 | * 29 | * @return \c void. 30 | */ 31 | 32 | /* 33 | * wtcalc: walltime for the calculation kernel on GPU 34 | * 35 | * - wtcalc < 0.0: reset and disable the timer 36 | * - wtcalc == 0.0: enable the timer 37 | */ 38 | extern double wtcalc; 39 | 40 | #endif 41 | 42 | #ifdef __cplusplus 43 | } 44 | #endif 45 | -------------------------------------------------------------------------------- /01_accelQuery/src/accelQuery.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file accelQuery.c 3 | * 4 | * @mainpage accelQuery 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 04.01.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * accelQuery searches accelerator(s) on a heterogeneous computer. 11 | * Accelerator(s), if found, will be enumerated with some basic info. 12 | */ 13 | 14 | #include <stdio.h> 15 | #ifdef _OPENMP 16 | #include <omp.h> 17 | #endif 18 | #include "prtAccelInfo.h" 19 | 20 | /** 21 | * @brief Main entry point for accelQuery. 22 | */ 23 | int main(int argc, char *argv[]) 24 | { 25 | int iaccel, naccel; 26 | 27 | /* 28 | * NOTE: The behavior of an OpenMP API function may be defined differently 29 | * for inside and outside of the target region.
30 | */ 31 | #pragma omp target 32 | { 33 | if (omp_is_initial_device()) { 34 | printf("Hello World from Host.\n"); 35 | } else { 36 | printf("Hello World from Accelerator(s).\n"); 37 | } 38 | } 39 | // no accelerator 40 | if (0 == (naccel = omp_get_num_devices())) return 0; 41 | // one or more accelerator(s) 42 | printf("\n%d Accelerator(s) found\n", naccel); 43 | for (iaccel = 0; iaccel < naccel; iaccel++) { 44 | prtAccelInfo(iaccel); 45 | } 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /09_matAdd/tests/matAdd_real_00.sh.5422334.out: -------------------------------------------------------------------------------- 1 | hallo from gpu028 2 | matrix dim: 4096 x 4096 3 | time averaged over 64 loops 4 | matAddAB (0) : 1.9 GB/s 86.6 GB/s maxabserr = 0.0 5 | matAddAB (1) : 1.6 GB/s 35.7 GB/s maxabserr = 0.0 6 | matAddAB (2) : 1.6 GB/s 48.1 GB/s maxabserr = 0.0 7 | matAddAB (3) : 1.7 GB/s 166.6 GB/s maxabserr = 0.0 8 | matAddAB (4) : 2.0 GB/s 183.3 GB/s maxabserr = 0.0 9 | matAddAB (5) : 1.9 GB/s 183.7 GB/s maxabserr = 0.0 10 | matAddAB (6) : 1.9 GB/s 185.3 GB/s maxabserr = 0.0 11 | matAddAB (7) : 1.8 GB/s 185.4 GB/s maxabserr = 0.0 12 | matrix dim: 8192 x 8192 13 | time averaged over 64 loops 14 | matAddAB (0) : 1.9 GB/s 172.2 GB/s maxabserr = 0.0 15 | matAddAB (1) : 1.9 GB/s 34.0 GB/s maxabserr = 0.0 16 | matAddAB (2) : 1.6 GB/s 8.4 GB/s maxabserr = 0.0 17 | matAddAB (3) : 1.9 GB/s 265.8 GB/s maxabserr = 0.0 18 | matAddAB (4) : 1.9 GB/s 265.4 GB/s maxabserr = 0.0 19 | matAddAB (5) : 1.9 GB/s 265.8 GB/s maxabserr = 0.0 20 | matAddAB (6) : 1.9 GB/s 264.9 GB/s maxabserr = 0.0 21 | matAddAB (7) : 1.9 GB/s 269.0 GB/s maxabserr = 0.0 22 | -------------------------------------------------------------------------------- /05_saxpy/src/hsaxpy.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file hsaxpy.h 3 | * @brief Function prototype for performing the \c saxpy operation on host. 4 | * 5 | * This header file contains function prototype for the \c saxpy operation, 6 | * which is defined as: 7 | * 8 | * y := a * x + y 9 | * 10 | * where: 11 | * 12 | * - a is a scalar. 13 | * - x and y are single-precision vectors each with n elements. 14 | * 15 | * @author Xin Wu (PC²) 16 | * @date 05.04.2020 17 | * @copyright CC BY-SA 2.0 18 | */ 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #ifndef HSAXPY_H 25 | #define HSAXPY_H 26 | 27 | void hsaxpy(const int n, 28 | const float a, 29 | const float *x, 30 | float *y, 31 | const int ial); 32 | /**< 33 | * @brief Performs the \c saxpy operation on host. 34 | * 35 | * The \c saxpy operation is defined as: 36 | * 37 | * y := a * x + y 38 | * 39 | * where: 40 | * 41 | * - a is a scalar. 42 | * - x and y are single-precision vectors each with n elements. 43 | * 44 | * @param n The number of elements in \p x and \p y. 45 | * @param a The scalar for multiplication. 46 | * @param x The vector \p x in \c saxpy. 47 | * @param y The vector \p y in \c saxpy. 48 | * @param ial The ial-th implementation. 49 | * 50 | * @return \c void. 51 | */ 52 | 53 | #endif 54 | 55 | #ifdef __cplusplus 56 | } 57 | #endif 58 | -------------------------------------------------------------------------------- /05_saxpy/src/asaxpy.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file asaxpy.h 3 | * @brief Function prototype for performing the \c saxpy operation on accelerator.
4 | * 5 | * This header file contains function prototype for the \c saxpy operation, 6 | * which is defined as: 7 | * 8 | * y := a * x + y 9 | * 10 | * where: 11 | * 12 | * - a is a scalar. 13 | * - x and y are single-precision vectors each with n elements. 14 | * 15 | * @author Xin Wu (PC²) 16 | * @date 05.04.2020 17 | * @copyright CC BY-SA 2.0 18 | */ 19 | 20 | #ifdef __cplusplus 21 | extern "C" { 22 | #endif 23 | 24 | #ifndef ASAXPY_H 25 | #define ASAXPY_H 26 | 27 | void asaxpy(const int n, 28 | const float a, 29 | const float *x, 30 | float *y, 31 | const int ial); 32 | /**< 33 | * @brief Performs the \c saxpy operation on accelerator. 34 | * 35 | * The \c saxpy operation is defined as: 36 | * 37 | * y := a * x + y 38 | * 39 | * where: 40 | * 41 | * - a is a scalar. 42 | * - x and y are single-precision vectors each with n elements. 43 | * 44 | * @param n The number of elements in \p x and \p y. 45 | * @param a The scalar for multiplication. 46 | * @param x The vector \p x in \c saxpy. 47 | * @param y The vector \p y in \c saxpy. 48 | * @param ial The ial-th implementation. 49 | * 50 | * @return \c void. 51 | */ 52 | 53 | #endif 54 | 55 | #ifdef __cplusplus 56 | } 57 | #endif 58 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/GCC/00_check_gpu/tesla.log: -------------------------------------------------------------------------------- 1 | ++ hostname 2 | + echo 'Now, on gpu003' 3 | Now, on gpu003 4 | + nvidia-smi 5 | Tue Dec 17 08:34:20 2019 6 | +-----------------------------------------------------------------------------+ 7 | | NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 | 8 | |-------------------------------+----------------------+----------------------+ 9 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 10 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 11 | |===============================+======================+======================| 12 | | 0 Tesla K20Xm Off | 00000000:84:00.0 Off | 0 | 13 | | N/A 28C P0 59W / 235W | 0MiB / 5700MiB | 81% Default | 14 | +-------------------------------+----------------------+----------------------+ 15 | 16 | +-----------------------------------------------------------------------------+ 17 | | Processes: GPU Memory | 18 | | GPU PID Type Process name Usage | 19 | |=============================================================================| 20 | | No running processes found | 21 | +-----------------------------------------------------------------------------+ 22 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/Clang/00_check_gpu/tesla.log: -------------------------------------------------------------------------------- 1 | ++ hostname 2 | + echo 'Now, on gpu003' 3 | Now, on gpu003 4 | + nvidia-smi 5 | Tue Dec 17 08:34:20 2019 6 | +-----------------------------------------------------------------------------+ 7 | | NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 | 8 | |-------------------------------+----------------------+----------------------+ 9 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 10 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M.
| 11 | |===============================+======================+======================| 12 | | 0 Tesla K20Xm Off | 00000000:84:00.0 Off | 0 | 13 | | N/A 28C P0 59W / 235W | 0MiB / 5700MiB | 81% Default | 14 | +-------------------------------+----------------------+----------------------+ 15 | 16 | +-----------------------------------------------------------------------------+ 17 | | Processes: GPU Memory | 18 | | GPU PID Type Process name Usage | 19 | |=============================================================================| 20 | | No running processes found | 21 | +-----------------------------------------------------------------------------+ 22 | -------------------------------------------------------------------------------- /05_saxpy/src/hsaxpy.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file hsaxpy.c 3 | * @brief Function definition for performing the \c saxpy operation on host. 4 | * 5 | * This source file contains function definition for the \c saxpy operation, 6 | * which is defined as: 7 | * 8 | * y := a * x + y 9 | * 10 | * where: 11 | * 12 | * - a is a scalar. 13 | * - x and y are single-precision vectors each with n elements. 14 | * 15 | * @author Xin Wu (PC²) 16 | * @date 05.04.2020 17 | * @copyright CC BY-SA 2.0 18 | */ 19 | 20 | #include <time.h> 21 | #ifdef _OPENMP 22 | #include <omp.h> 23 | #endif 24 | #include "mkl.h" 25 | #include "wtcalc.h" 26 | #include "hsaxpy.h" 27 | 28 | void hsaxpy(const int n, 29 | const float a, 30 | const float *x, 31 | float *y, 32 | const int ial) 33 | { 34 | struct timespec rt[2]; 35 | 36 | switch (ial) { 37 | case 0: 38 | /* 39 | * - naive implementation 40 | */ 41 | clock_gettime(CLOCK_REALTIME, rt + 0); 42 | #pragma omp parallel for simd schedule(simd:static) \ 43 | default(none) shared(a, n, x, y) 44 | for (int i = 0; i < n; i++) { 45 | y[i] = a * x[i] + y[i]; 46 | } 47 | clock_gettime(CLOCK_REALTIME, rt + 1); 48 | break; 49 | default: 50 | /* 51 | * - saxpy in MKL 52 | */ 53 | clock_gettime(CLOCK_REALTIME, rt + 0); 54 | cblas_saxpy(n, a, x, 1, y, 1); 55 | clock_gettime(CLOCK_REALTIME, rt + 1); 56 | break; 57 | } /* end switch (ial) */ 58 | if (wtcalc >= 0.0) { 59 | wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /simplifiedCode/04_scalarAddition/scalarAddition.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file scalarAddition.c 3 | * 4 | * @brief scalarAddition adds two integers on host and accelerator, and also 5 | * compares the performance. 6 | * 7 | * Offload to GPU: 8 | * gcc -Wall -fopenmp -foffload=nvptx-none scalarAddition.c 9 | * 10 | */ 11 | 12 | #include <stdio.h> 13 | #include <stdlib.h> 14 | #include <time.h> 15 | #include <assert.h> 16 | 17 | /** 18 | * @brief Main entry point for scalarAddition.
19 | */
20 | int main(int argc, char *argv[])
21 | {
22 | /*
23 | * data on host
24 | */
25 | int a, b, c, // c = a + b;
26 | y, z; // z = x + y; (x in device data environment)
27 | struct timespec rt[2];
28 | 
29 | /*
30 | * scalar addition on host
31 | */
32 | clock_gettime(CLOCK_REALTIME, rt + 0);
33 | a = 2;
34 | b = 4;
35 | c = a + b;
36 | clock_gettime(CLOCK_REALTIME, rt + 1);
37 | printf("scalar addition on host: %12.9f s\n",
38 | (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec));
39 | /*
40 | * scalar addition on accelerator
41 | */
42 | y = 4;
43 | clock_gettime(CLOCK_REALTIME, rt + 0);
44 | #pragma omp target map(to:y) map(from:z)
45 | {
46 | int x; // only accessible from accelerator
47 | x = 2;
48 | z = x + y;
49 | }
50 | clock_gettime(CLOCK_REALTIME, rt + 1);
51 | printf("scalar addition on accelerator: %12.9f s\n",
52 | (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec));
53 | /*
54 | * Question: How to measure the walltime for H-A data transfer rate? FIXME
55 | * Question: How to measure the walltime for a kernel launch on GPU? FIXME
56 | * Question: How to monitor this tiny calculation on GPU? FIXME
57 | */
58 | /*
59 | * check the result
60 | */
61 | assert(c == z);
62 | return 0;
63 | }
64 | 
--------------------------------------------------------------------------------
/04_scalarAddition/src/scalarAddition.c:
--------------------------------------------------------------------------------
1 | /**
2 | * @file scalarAddition.c
3 | *
4 | * @mainpage scalarAddition
5 | *
6 | * @author Xin Wu (PC²)
7 | * @date 08.01.2020
8 | * @copyright CC BY-SA 2.0
9 | *
10 | * scalarAddition adds two integers on host and accelerator, and also compares
11 | * the performance.
12 | *
13 | */
14 | 
15 | #include <assert.h>
16 | #include <stdio.h>
17 | #include <stdlib.h>
18 | #include <time.h>
19 | #ifdef _OPENMP
20 | #include <omp.h>
21 | #endif
22 | #include "check1ns.h"
23 | 
24 | /**
25 | * @brief Main entry point for scalarAddition.
26 | */
27 | int main(int argc, char *argv[])
28 | {
29 | /*
30 | * data on host
31 | */
32 | int a, b, c, // c = a + b;
33 | y, z; // z = x + y; (x in device data environment)
34 | struct timespec rt[2];
35 | 
36 | /*
37 | * We need 1 ns time resolution.
38 | */
39 | check1ns();
40 | printf("The system supports 1 ns time resolution\n");
41 | /*
42 | * check the number of accelerators
43 | */
44 | if (0 == omp_get_num_devices()) {
45 | printf("No accelerator found ...
exit\n"); 46 | exit(EXIT_FAILURE); 47 | } 48 | /* 49 | * scalar addition on host 50 | */ 51 | clock_gettime(CLOCK_REALTIME, rt + 0); 52 | a = 2; 53 | b = 4; 54 | c = a + b; 55 | clock_gettime(CLOCK_REALTIME, rt + 1); 56 | printf("scalar addition on host: %12.9f s\n", 57 | (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec)); 58 | /* 59 | * scalar addition on accelerator 60 | */ 61 | y = 4; 62 | clock_gettime(CLOCK_REALTIME, rt + 0); 63 | #pragma omp target map(to:y) map(from:z) 64 | { 65 | int x; // only accessible from accelerator 66 | x = 2; 67 | z = x + y; 68 | } 69 | clock_gettime(CLOCK_REALTIME, rt + 1); 70 | printf("scalar addition on accelerator: %12.9f s\n", 71 | (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec)); 72 | /* 73 | * check the result 74 | */ 75 | assert(c == z); 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /08_distThreads/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: distThreads 3 | author: Xin Wu (PC²) 4 | date: 12.03.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `distThreads` demonstrates the organization of threads and teams in a league on 10 | GPU. 11 | 12 | * Column-major is assumed thru the entire code! 13 | 14 | * The following tables only summarize the most important points. For more 15 | details on the ith organization of the GPU threads see comments in 16 | `gpuThreads.c`. 17 | 18 | | i | matrix league | GPU threads | 19 | |:-:|:---------------:|:-------------------:| 20 | | | nrow x ncol | nthrds x lteams | 21 | | 0 | 3 x 5 | 3 x 5 | 22 | | 1 | 3 x 5 | 3 x 5 | 23 | | 2 | 3 x 5 | 3 x 5 | 24 | | 3 | 3 x 5 | 3 x 5 | 25 | | 4 | 7 x 7 | 3 x 5 | 26 | | 5 | 7 x 7 | 3 x 5 | 27 | | 6 | 12 x 6 | 3 x 6 | 28 | | 7 | 12 x 6 | 3 x 6 | 29 | | 8 | 12 x 6 | 3 x 3 | 30 | 31 | | i | Remarks | 32 | |:-:|:----------------------------------------------------------------| 33 | | 0 | Used as Reference. No loop at all. | 34 | | 1 | Incorrect nested loop impl. | 35 | | 2 | Correct impl. Manually linearized loop. | 36 | | 3 | Correct impl. Nested loop with collapse(2). | 37 | | 4 | Irreg. matrix. Default chunk_size. Some GPU threads are idle. | 38 | | 5 | Irreg. matrix. chunk_size = nthrds. Better performance. | 39 | | 6 | CPU-like 2x irow-loop unrolling. Uncoalesced GPU memory access. | 40 | | 7 | 2x irow-loop unrolling. Nested loop with collapse(3). | 41 | | | Coalesced GPU memory access. | 42 | | 8 | 2x icol-loop unrolling. 2x irow-loop unrolling. | 43 | | | Nested loop with collapse(3). Best Performance. | 44 | 45 | # Usage 46 | 47 | ```bash 48 | distThreads 49 | ``` 50 | 51 | -------------------------------------------------------------------------------- /05_saxpy/docs/UserManual.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: saxpy 3 | author: Xin Wu (PC²) 4 | date: 05.04.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `saxpy` performs the `saxpy` operation on host as well as accelerator. 10 | The performance (in MB/s) for different implementations is also compared. 11 | 12 | The `saxpy` operation is defined as: 13 | 14 | $$ y := a * x + y $$ 15 | 16 | where: 17 | 18 | * `a` is a scalar. 19 | * `x` and `y` are single-precision vectors each with n elements. 20 | * For testing n is assumed to be $2^{26}$. 21 | * The following table only summarizes the most important points. 
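In the accelerator table, `<<<a, b>>>` is CUDA-style launch notation for `a` teams of `b` threads each; e.g. `<<<2^15, 2^7>>>` with 16x loop unrolling covers all $2^{15} \cdot 2^7 \cdot 16 = 2^{26}$ elements.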
For more
22 | details on the ial-th implementation see comments in `hsaxpy.c` (on host)
23 | and `asaxpy.c` (on accelerator).
24 | 
25 | - on host
26 | 
27 | | ial | Remarks |
28 | |:---:|------------------------------------------------------------------------|
29 | | 0 | naive implementation |
30 | | 1 | saxpy in MKL |
31 | 
32 | - on accl
33 | 
34 | | ial | Remarks |
35 | |:---:|------------------------------------------------------------------------|
36 | | 0 | <<<2^0 , 2^0 >>>, TOO SLOW! not tested |
37 | | 1 | <<<2^0 , 2^7 >>>, auto scheduling |
38 | | 2 | <<<2^7 , 2^0 >>>, auto scheduling |
39 | | 3 | <<<2^7 , 2^7 >>>, auto scheduling |
40 | | 4 | <<<2^16, 2^10>>>, manual scheduling |
41 | | 5 | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling |
42 | | | (2^15*2^7*16==2^26) |
43 | | 6 | <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling |
44 | | 7 | de-linearize the vector gives slightly better performance than CUBLAS |
45 | | 8 | cublasSaxpy in CUBLAS |
46 | 
47 | # Usage
48 | 
49 | ```bash
50 | saxpy
51 | ```
52 | 
53 | 
--------------------------------------------------------------------------------
/09_matAdd/docs/UserManual.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: matAdd
3 | author: Xin Wu (PC²)
4 | date: 12.03.2020
5 | ---
6 | 
7 | # Introduction
8 | 
9 | `matAdd` performs matrix addition (A += B) in single-precision on GPU.
10 | The performance (in GB/s) for different implementations is compared and
11 | the numerical results are also verified.
12 | 
13 | * Column-major is assumed thru the entire code!
14 | 
15 | * For testing, the dimensions of all matrices are assumed to be 4096 x 4096.
16 | 
17 | * The following table only summarizes the most important points. For more
18 | details on the ial-th OpenMP GPU implementation see comments in `matAddAB.c`.
19 | 
20 | | ial | Remarks |
21 | |:---:|------------------------------------------------------------------------|
22 | | 0 | ij-loop, 2^9 threads * 2^3 teams, |
23 | | | coalesced memory access |
24 | | 1 | ji-loop, 2^9 threads * 2^3 teams, |
25 | | | uncoalesced memory access |
26 | | 2 | ij-loop, 2^9 threads * 2^f teams, collapse(2), |
27 | | | uncoalesced memory access |
28 | | 3 | ji-loop, 2^9 threads * 2^f teams, collapse(2), |
29 | | | coalesced memory access |
30 | | 4 | ji-loop, 2^8 threads * 2^f teams, collapse(3), |
31 | | | 2x i-loop unrolling (stride of 2^8 rows) |
32 | | 5 | ji-loop, 2^8 threads * 2^f teams, collapse(2), |
33 | | | 2x i-loop unrolling (stride of n/2 rows) |
34 | | 6 | ji-loop, 2^8 threads * 2^e teams, collapse(3), |
35 | | | 2x i-loop unrolling (stride of 2^8 rows), |
36 | | | 2x j-loop unrolling (stride of 1 col ) |
37 | | 7 | cublasSaxpy in CUBLAS |
38 | 
39 | # Usage
40 | 
41 | ```bash
42 | matAdd $((2**12))
43 | ```
44 | 
45 | 
--------------------------------------------------------------------------------
/00_build_OpenMP_offload/GCC/02_build/realscript.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # clean up and copy
4 | #
5 | echo "Copy files ..."
6 | for i in gcc-gcc-9_2_0-release nvptx-newlib nvptx-tools; do
7 | echo $i
8 | rm -fr $i
9 | cp -afr /scratch/pc2-mitarbeiter/xinwu/GCC_OpenMP_OpenACC/01_download/$i .
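  # Each source tree is removed and re-copied from the 01_download cache
  # above, so that every build starts from a pristine checkout.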
10 | done 11 | echo "Finish copy files" 12 | # 13 | # environment variables 14 | # 15 | TARGSYS=$(gcc-gcc-9_2_0-release/config.guess) 16 | CUDADIR=/cm/shared/apps/pc2/EB-SW/software/system/CUDA/10.1.105 17 | ##INSTDIR=/scratch/pc2-mitarbeiter/xinwu/GCC_OpenMP_OpenACC/99_gcc9_gpu 18 | INSTDIR=/cm/shared/apps/pc2/GCC/9.2.0-offload 19 | # 20 | # nvptx-tools 21 | # 22 | echo "build nvptx-tools ..." 23 | cd nvptx-tools 24 | ./configure \ 25 | --with-cuda-driver-include=$CUDADIR/include \ 26 | --with-cuda-driver-lib=$CUDADIR/lib64 \ 27 | --prefix=$INSTDIR 28 | make 29 | make install 30 | cd .. 31 | echo "Finish build nvptx-tools" 32 | # 33 | # Accel_GCC 34 | # 35 | echo "build Accel_GCC ..." 36 | mkdir Accel_GCC 37 | cd Accel_GCC 38 | ../gcc-gcc-9_2_0-release/configure \ 39 | --target=nvptx-none \ 40 | --enable-as-accelerator-for=$TARGSYS \ 41 | --with-build-time-tools=$INSTDIR/nvptx-none/bin \ 42 | --disable-sjlj-exceptions \ 43 | --enable-newlib-io-long-long \ 44 | --enable-languages="c,c++,fortran,lto" \ 45 | --prefix=$INSTDIR 46 | make -j16 47 | make install 48 | cd .. 49 | echo "Finish build Accel_GCC" 50 | # 51 | # Host_GCC 52 | # 53 | echo "Host_GCC ..." 54 | mkdir Host_GCC 55 | cd Host_GCC 56 | ../gcc-gcc-9_2_0-release/configure \ 57 | --enable-offload-targets=nvptx-none \ 58 | --with-cuda-driver-include=$CUDADIR/include \ 59 | --with-cuda-driver-lib=$CUDADIR/lib64 \ 60 | --disable-bootstrap \ 61 | --disable-multilib \ 62 | --enable-languages="c,c++,fortran,lto" \ 63 | --prefix=$INSTDIR 64 | make -j16 65 | make install 66 | cd .. 67 | echo "Finish Host_GCC" 68 | # 69 | # Done 70 | # 71 | echo "Done" 72 | -------------------------------------------------------------------------------- /08_distThreads/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: distThreads 3 | author: Xin Wu (PC²) 4 | date: 12.03.2020 5 | --- 6 | 7 | # Introduction 8 | 9 | `distThreads` demonstrates the organization of threads and teams in a league on 10 | GPU. 11 | 12 | * Column-major is assumed thru the entire code! 13 | 14 | * The following tables only summarize the most important points. For more 15 | details on the ith organization of the GPU threads see comments in 16 | `gpuThreads.c`. 17 | 18 | | i | matrix league | GPU threads | 19 | |:-:|:---------------:|:-------------------:| 20 | | | nrow x ncol | nthrds x lteams | 21 | | 0 | 3 x 5 | 3 x 5 | 22 | | 1 | 3 x 5 | 3 x 5 | 23 | | 2 | 3 x 5 | 3 x 5 | 24 | | 3 | 3 x 5 | 3 x 5 | 25 | | 4 | 7 x 7 | 3 x 5 | 26 | | 5 | 7 x 7 | 3 x 5 | 27 | | 6 | 12 x 6 | 3 x 6 | 28 | | 7 | 12 x 6 | 3 x 6 | 29 | | 8 | 12 x 6 | 3 x 3 | 30 | 31 | | i | Remarks | 32 | |:-:|:----------------------------------------------------------------| 33 | | 0 | Used as Reference. No loop at all. | 34 | | 1 | Incorrect nested loop impl. | 35 | | 2 | Correct impl. Manually linearized loop. | 36 | | 3 | Correct impl. Nested loop with collapse(2). | 37 | | 4 | Irreg. matrix. Default chunk_size. Some GPU threads are idle. | 38 | | 5 | Irreg. matrix. chunk_size = nthrds. Better performance. | 39 | | 6 | CPU-like 2x irow-loop unrolling. Uncoalesced GPU memory access. | 40 | | 7 | 2x irow-loop unrolling. Nested loop with collapse(3). | 41 | | | Coalesced GPU memory access. | 42 | | 8 | 2x icol-loop unrolling. 2x irow-loop unrolling. | 43 | | | Nested loop with collapse(3). Best Performance. 
|
44
45 | # Build
46 | 
47 | ```bash
48 | autoreconf -i; ./configure; make; make check
49 | ```
50 | 
51 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS).
52 | 
53 | # Documentation
54 | 
55 | * docs/html/index.html: Source code documentation generated by Doxygen.
56 | 
57 | * docs/UserManual.md: User Manual.
58 | 
59 | 
--------------------------------------------------------------------------------
/05_saxpy/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: saxpy
3 | author: Xin Wu (PC²)
4 | date: 05.04.2020
5 | ---
6 | 
7 | # Introduction
8 | 
9 | `saxpy` performs the `saxpy` operation on host as well as accelerator.
10 | The performance (in MB/s) for different implementations is also compared.
11 | 
12 | The `saxpy` operation is defined as:
13 | 
14 | $$ y := a * x + y $$
15 | 
16 | where:
17 | 
18 | * `a` is a scalar.
19 | * `x` and `y` are single-precision vectors each with n elements.
20 | * For testing n is assumed to be $2^{26}$.
21 | * The following table only summarizes the most important points. For more
22 | details on the ial-th implementation see comments in `hsaxpy.c` (on host)
23 | and `asaxpy.c` (on accelerator).
24 | 
25 | - on host
26 | 
27 | | ial | Remarks |
28 | |:---:|------------------------------------------------------------------------|
29 | | 0 | naive implementation |
30 | | 1 | saxpy in MKL |
31 | 
32 | - on accl
33 | 
34 | | ial | Remarks |
35 | |:---:|------------------------------------------------------------------------|
36 | | 0 | <<<2^0 , 2^0 >>>, TOO SLOW! not tested |
37 | | 1 | <<<2^0 , 2^7 >>>, auto scheduling |
38 | | 2 | <<<2^7 , 2^0 >>>, auto scheduling |
39 | | 3 | <<<2^7 , 2^7 >>>, auto scheduling |
40 | | 4 | <<<2^16, 2^10>>>, manual scheduling |
41 | | 5 | <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling |
42 | | | (2^15*2^7*16==2^26) |
43 | | 6 | <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling |
44 | | 7 | de-linearize the vector gives slightly better performance than CUBLAS |
45 | | 8 | cublasSaxpy in CUBLAS |
46 | 
47 | # Build
48 | 
49 | ```bash
50 | autoreconf -i; ./configure; make; make check;
51 | ```
52 | 
53 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS).
54 | 
55 | # Documentation
56 | 
57 | * docs/html/index.html: Source code documentation generated by Doxygen.
58 | 
59 | * docs/UserManual.md: User Manual.
60 | 
61 | 
--------------------------------------------------------------------------------
/09_matAdd/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: matAdd
3 | author: Xin Wu (PC²)
4 | date: 19.03.2020
5 | ---
6 | 
7 | # Introduction
8 | 
9 | `matAdd` performs matrix addition (A += B) in single-precision on GPU.
10 | The performance (in GB/s) for different implementations is compared and
11 | the numerical results are also verified.
12 | 
13 | * Column-major is assumed thru the entire code!
14 | 
15 | * For testing, the dimensions of all matrices are assumed to be 4096 x 4096.
16 | 
17 | * The following table only summarizes the most important points. For more
18 | details on the ial-th OpenMP GPU implementation see comments in `matAddAB.c`.
19 | 
20 | | ial | Remarks |
21 | |:---:|------------------------------------------------------------------------|
22 | | 0 | ij-loop, 2^9 threads * 2^3 teams, |
23 | | | coalesced memory access |
24 | | 1 | ji-loop, 2^9 threads * 2^3 teams, |
25 | | | uncoalesced memory access |
26 | | 2 | ij-loop, 2^9 threads * 2^f teams, collapse(2), |
27 | | | uncoalesced memory access |
28 | | 3 | ji-loop, 2^9 threads * 2^f teams, collapse(2), |
29 | | | coalesced memory access |
30 | | 4 | ji-loop, 2^8 threads * 2^f teams, collapse(3), |
31 | | | 2x i-loop unrolling (stride of 2^8 rows) |
32 | | 5 | ji-loop, 2^8 threads * 2^f teams, collapse(2), |
33 | | | 2x i-loop unrolling (stride of n/2 rows) |
34 | | 6 | ji-loop, 2^8 threads * 2^e teams, collapse(3), |
35 | | | 2x i-loop unrolling (stride of 2^8 rows), |
36 | | | 2x j-loop unrolling (stride of 1 col ) |
37 | | 7 | cublasSaxpy in CUBLAS |
38 | 
39 | # Build
40 | 
41 | ```bash
42 | autoreconf -i; ./configure; make; make check
43 | ```
44 | 
45 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS).
46 | 
47 | # Documentation
48 | 
49 | * docs/html/index.html: Source code documentation generated by Doxygen.
50 | 
51 | * docs/UserManual.md: User Manual.
52 | 
53 | 
--------------------------------------------------------------------------------
/00_build_OpenMP_offload/GCC/build_gcc_offload.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Build GCC with OpenMP Support for Nvidia GPU Offloading
3 | author: Xin Wu (PC²)
4 | date: 06.01.2020
5 | ---
6 | 
7 | # Check Nvidia GPU
8 | 
9 | The build procedure was carried out on a Tesla node of OCuLUS at PC². It
10 | features an Nvidia Tesla K20X GPU. Thus it's necessary to check the Tesla K20X
11 | GPU on the compute node, before building GCC with OpenMP support for offloading
12 | computation on Nvidia GPU.
13 | 
14 | The relevant scripts and log files can be found in `00_check_gpu`.
15 | 
16 | `tesla.sh` is a driver script and should be submitted with `ccsalloc`:
17 | 
18 | ```bash
19 | ccsalloc tesla.sh
20 | ```
21 | 
22 | `realscript.sh` does the real job and the output can be found in `tesla.log`.
23 | 
24 | # Download Packages and Preparation
25 | 
26 | The required packages for this GCC build are:
27 | 
28 | * nvptx-tools:[^nvptxtools]
29 | 
30 | [^nvptxtools]: At the time of writing, there is no release of nvptx-tools on
31 | GitHub. For reproducibility the `HEAD` was checked out explicitly.
32 | 
33 | * nvptx-newlib:[^nvptxnewlib]
34 | 
35 | [^nvptxnewlib]: At the time of writing, there is no release of nvptx-newlib on
36 | GitHub. For reproducibility the `HEAD` was checked out explicitly.
37 | 
38 | * openacc-gcc-9-branch:[^gcc9]
39 | 
40 | [^gcc9]: This Git-branch is used for development of OpenACC support and related
41 | functionality. For more info, see .
42 | 
43 | It's faster to download these packages from the frontend nodes of OCuLUS at PC².
44 | `download.sh` (in `01_download`) is a convenient script to download these
45 | packages as well as to prepare other setups for our build of GCC with OpenMP for
46 | offloading on GPUs.
47 | 
48 | # Build and Install Packages
49 | 
50 | ## Load CUDA module
51 | 
52 | Because the GPU-backend of GCC depends on CUDA, we need to load the CUDA module
53 | on OCuLUS.
54 | 
55 | ```bash
56 | module load system/CUDA/10.1.105
57 | ```
58 | 
59 | ## Build `nvptx-tools`, accelerator and host GCC compilers
60 | 
61 | The build scripts can be found in `02_build`.
`build.sh` is a driver script for
62 | `ccsalloc` and `realscript.sh` carries out the real build procedure.
63 | 
64 | Before running
65 | 
66 | ```bash
67 | ccsalloc build.sh
68 | ```
69 | 
70 | it's necessary to adapt some settings in `realscript.sh`, e.g. `CUDADIR`,
71 | `INSTDIR`, and perhaps `make -j` with an appropriate number of processors,
72 | to your working system.
73 | 
74 | Now, we're done.
75 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | 
3 | The directories in this repository contain code examples for the course on
4 | OpenMP GPU-offloading at Paderborn Center for Parallel Computing (PC²),
5 | Paderborn University. The sub-directories are generally organized as:
6 | 
7 | * src: source code
8 | * docs: documentation
9 | * tests: some tests
10 | 
11 | Some highlights of the codes in this repository:
12 | 
13 | * The performance of our `saxpy` implemented by using OpenMP GPU-offloading is
14 | as good as `cublasSaxpy` in CUBLAS. See `case 7` in `05_saxpy/src/asaxpy.c`
15 | for details.
16 | 
17 | * The GPU shared memory has not been standardized in OpenMP API Specification
18 | (Version 5.0 Nov. 2018). To optimize the performance of matrix multiplication
19 | by using OpenMP GPU-offloading, i) `case 6` in `10_matMul/src/matMulAB.c`
20 | implements a register blocking algorithm and ii) `case 8` in the same source
21 | code file implements a common GPU-based tiled algorithm by blocking the local
22 | shared memory in a very tricky manner, and the OpenMP code resembles CUDA.
23 | 
24 | # List of Projects
25 | 
26 | * 00_build_OpenMP_offload
27 | 
28 | Documentation and scripts for building GCC as well as Clang/LLVM with OpenMP
29 | support for Nvidia GPU offloading.
30 | 
31 | * 01_accelQuery
32 | 
33 | `accelQuery` searches accelerator(s) on a heterogeneous computer.
34 | Accelerator(s), if found, will be enumerated with some basic info.
35 | 
36 | * 02_dataTransRate
37 | 
38 | `dataTransRate` gives the data transfer rate (in MB/sec) from `src` to `dst`.
39 | 
40 | The possible situations are:
41 | 
42 | * h2h: `src` = host and `dst` = host
43 | * h2a: `src` = host and `dst` = accel
44 | * a2a: `src` = accel and `dst` = accel
45 | 
46 | NOTE:
47 | 
48 | * A bug in Clang 9.0.1 has been fixed in Clang 11.
49 | * The data transfer rate for `a2a` is still lower than our expectation.
50 | 
51 | * 03_taskwait
52 | 
53 | `taskwait` checks the `taskwait` construct for the deferred target task.
54 | 
55 | NOTE:
56 | 
57 | * Asynchronous offloading hasn't been implemented in the GCC 9.2 compiler.
58 | * Asynchronous offloading is available in Clang 11.
59 | 
60 | * 04_scalarAddition
61 | 
62 | `scalarAddition` adds two integers on host and accelerator, and also compares
63 | the performance.
64 | 
65 | * 05_saxpy
66 | 
67 | `saxpy` performs the `saxpy` operation on host as well as accelerator.
68 | The performance (in MB/s) for different implementations is also compared.
69 | 
70 | * 08_distThreads
71 | 
72 | `distThreads` demonstrates the organization of threads and teams in a league
73 | on GPU.
74 | 
75 | * 09_matAdd
76 | 
77 | `matAdd` performs matrix addition (A += B) in single-precision on GPU. The
78 | performance (in GB/s) for different implementations is compared and the
79 | numerical results are also verified.
80 | 
81 | * 10_matMul
82 | 
83 | `matMul` performs matrix multiplication in single-precision on GPU.
The
84 | performance (in GFLOPS) for different implementations is compared and the
85 | numerical results are also verified.
86 | 
--------------------------------------------------------------------------------
/08_distThreads/src/distThreads.c:
--------------------------------------------------------------------------------
1 | /**
2 | * @file distThreads.c
3 | *
4 | * @mainpage distThreads
5 | *
6 | * @author Xin Wu (PC²)
7 | * @date 12.03.2020
8 | * @copyright CC BY-SA 2.0
9 | *
10 | * distThreads demonstrates the organization of threads and teams in a league on
11 | * GPU.
12 | */
13 | 
14 | #include "gpuThreads.h"
15 | 
16 | /**
17 | * @brief Main entry point for distThreads.
18 | */
19 | int main(int argc, char *argv[])
20 | {
21 | for (int i = 0; i < 9; ++i) {
22 | /*
23 | * - Column-major is assumed thru the entire code!
24 | *
25 | * The following tables only summarize the most important points. For more
26 | * details on the ith organization of the GPU threads see comments in
27 | * \c gpuThreads.c.
28 | */
29 | gpuThreads(i);
30 | /*
31 | * ===========================================================================
32 | * i matrix league GPU threads
33 | * nrow ncol nthrds lteams
34 | * ---------------------------------------------------------------------------
35 | * 0 3 5 3 5
36 | * 1 3 5 3 5
37 | * 2 3 5 3 5
38 | * 3 3 5 3 5
39 | * ---------------------------------------------------------------------------
40 | * 4 7 7 3 5
41 | * 5 7 7 3 5
42 | * ---------------------------------------------------------------------------
43 | * 6 12 6 3 6
44 | * 7 12 6 3 6
45 | * 8 12 6 3 3
46 | * ===========================================================================
47 | *
48 | * ===========================================================================
49 | * i Remarks
50 | * ---------------------------------------------------------------------------
51 | * 0 Used as Reference. No loop at all.
52 | * 1 Incorrect nested loop impl.
53 | * 2 Correct impl. Manually linearized loop.
54 | * 3 Correct impl. Nested loop with collapse(2).
55 | * ---------------------------------------------------------------------------
56 | * 4 Irreg. matrix. Default chunk_size. Some GPU threads are idle.
57 | * 5 Irreg. matrix. chunk_size = nthrds. Better performance.
58 | * ---------------------------------------------------------------------------
59 | * 6 CPU-like 2x irow-loop unrolling. Uncoalesced GPU memory access.
60 | * 7 2x irow-loop unrolling. Nested loop with collapse(3).
61 | * Coalesced GPU memory access.
62 | * 8 2x icol-loop unrolling. 2x irow-loop unrolling.
63 | * Nested loop with collapse(3). Best Performance.
64 | * ===========================================================================
65 | */
66 | }
67 | return 0;
68 | }
69 | 
--------------------------------------------------------------------------------
/simplifiedCode/05_saxpy/saxpy.c:
--------------------------------------------------------------------------------
1 | /**
2 | * @file saxpy.c
3 | *
4 | * @brief saxpy performs the \c axpy computation in single-precision on both
5 | * host and accelerator. The performance (in MFLOPS) on host and accelerator is
6 | * compared and the numerical results are also verified for consistency.
7 | *
8 | * The \c axpy computation is defined as:
9 | *
10 | * y := a * x + y
11 | *
12 | * where:
13 | *
14 | * - a is a scalar.
15 | * - x and y are vectors each with n elements.
16 | *
17 | * Please note that in this version only one GPU thread is used.
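 *
 * (A bare "#pragma omp target" executes its body sequentially on the
 * device. A hedged sketch of a multi-threaded variant, not used in this
 * simplified version, would be:
 *
 *   #pragma omp target teams distribute parallel for \
 *           map(to:n, a, x[0:n]) map(tofrom:z[0:n])
 *
 * which distributes the loop iterations across teams and threads on the
 * GPU.)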
18 | *
19 | * Offload to GPU:
20 | *
21 | * gcc -fopenmp -foffload=nvptx-none saxpy.c
22 | *
23 | */
24 | 
25 | #include <assert.h>
26 | #include <omp.h>
27 | #include <stdio.h>
28 | #include <stdlib.h>
29 | #include <time.h>
30 | 
31 | #define TWO02 (1 << 2)
32 | #define TWO04 (1 << 4)
33 | #define TWO08 (1 << 8)
34 | #define TWO27 (1 << 27)
35 | 
36 | int main(int argc, char *argv[])
37 | {
38 | int i, n = TWO27,
39 | iret = 0;
40 | float a = 101.0f / TWO02,
41 | *x, *y, *z;
42 | struct timespec rt[2];
43 | double wt; // walltime
44 | 
45 | /*
46 | * 0. prepare x, y, and z
47 | *
48 | * y := a * x + y (on host)
49 | * z := a * x + z (on accel)
50 | */
51 | if (NULL == (x = (float *) malloc(sizeof(*x) * n))) {
52 | printf("error: memory allocation for 'x'\n");
53 | iret = -1;
54 | }
55 | if (NULL == (y = (float *) malloc(sizeof(*y) * n))) {
56 | printf("error: memory allocation for 'y'\n");
57 | iret = -1;
58 | }
59 | if (NULL == (z = (float *) malloc(sizeof(*z) * n))) {
60 | printf("error: memory allocation for 'z'\n");
61 | iret = -1;
62 | }
63 | if (0 != iret) {
64 | free(x);
65 | free(y);
66 | free(z);
67 | exit(EXIT_FAILURE);
68 | }
69 | for (i = 0; i < n; i++) {
70 | x[i] = rand() % TWO04 / (float) TWO02;
71 | y[i] = z[i] = rand() % TWO08 / (float) TWO04;
72 | }
73 | /*
74 | * 1. saxpy on host
75 | */
76 | clock_gettime(CLOCK_REALTIME, rt + 0);
77 | #pragma omp parallel \
78 | default(none) shared(n, a, x, y) private(i)
79 | {
80 | #pragma omp for simd schedule(simd:static)
81 | for (i = 0; i < n; i++) {
82 | y[i] = a * x[i] + y[i];
83 | }
84 | }
85 | clock_gettime(CLOCK_REALTIME, rt + 1);
86 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
87 | printf("saxpy on host : %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
88 | /*
89 | * 2. saxpy on accel
90 | */
91 | clock_gettime(CLOCK_REALTIME, rt + 0);
92 | #pragma omp target device(0) \
93 | map(to:n, a, x[0:n]) map(tofrom:z[0:n]) private(i)
94 | {
95 | for (i = 0; i < n; i++) {
96 | z[i] = a * x[i] + z[i];
97 | }
98 | }
99 | clock_gettime(CLOCK_REALTIME, rt + 1);
100 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
101 | printf("saxpy on accel: %9.3f sec %9.1f MFLOPS\n", wt, 2.0 * n / (1.0e6 * wt));
102 | /*
103 | * 3. verify numerical consistency
104 | */
105 | for (i = 0; i < n; i++) {
106 | iret = *(int *) (y + i) ^ *(int *) (z + i);
107 | assert(iret == 0);
108 | }
109 | return 0;
110 | }
111 | 
--------------------------------------------------------------------------------
/10_matMul/docs/UserManual.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: matMul
3 | author: Xin Wu (PC²)
4 | date: 12.03.2020
5 | ---
6 | 
7 | # Introduction
8 | 
9 | `matMul` performs matrix multiplication in single-precision on GPU. The
10 | performance (in GFLOPS) for different implementations is compared and the
11 | numerical results are also verified.
12 | 
13 | * Column-major is assumed thru the entire code!
14 | 
15 | * For testing, the dimensions of all matrices are assumed to be 4096 x 4096.
16 | 
17 | * The following table only summarizes the most important points. For more
18 | details on the ial-th OpenMP GPU implementation see comments in `matMulAB.c`.
19 | 
20 | | ial | Remarks |
21 | |:---:|------------------------------------------------------------------------|
22 | | 0 | jik-loop, 2^9 threads * 2^3 teams, |
23 | | | uncoalesced memory access |
24 | | 1 | jki-loop, 2^9 threads * 2^3 teams, |
25 | | | uncoalesced memory access, uncoalesced r&w in innermost loop |
26 | | 2 | jik-loop, 2^9 threads * 2^f teams, collapse(2) |
27 | | 3 | jki-loop, 2^9 threads * 2^f teams, collapse(2), |
28 | | | race condition for writing c! |
29 | | 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), |
30 | | | 4x k-loop unrolling |
31 | | 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), |
32 | | | 4x i-loop unrolling (stride of 2^7 rows), |
33 | | | 4x k-loop unrolling, |
34 | | | rb: 4x data reuse |
35 | | 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), |
36 | | | 4x j-loop unrolling (stride of 1 col ), |
37 | | | 4x i-loop unrolling (stride of 2^7 rows), |
38 | | | 4x k-loop unrolling, |
39 | | | ra: 4x data reuse, |
40 | | | rb: 4x data reuse, |
41 | | | register blocking |
42 | | 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) |
43 | | 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), |
44 | | | GPU shared memory for data re-use, 16x k-loop unrolling, |
45 | | | shared memory blocking |
46 | | 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), |
47 | | | 4x i-loop unrolling (stride of n/4 rows), |
48 | | | 4x k-loop unrolling, |
49 | | | rb: 4x data reuse |
50 | | 10 | cublasSgemm in CUBLAS |
51 | 
52 | # Usage
53 | 
54 | ```bash
55 | matMul $((2**12))
56 | ```
57 | 
58 | 
--------------------------------------------------------------------------------
/10_matMul/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: matMul
3 | author: Xin Wu (PC²)
4 | date: 19.03.2020
5 | ---
6 | 
7 | # Introduction
8 | 
9 | `matMul` performs matrix multiplication in single-precision on GPU. The
10 | performance (in GFLOPS) for different implementations is compared and the
11 | numerical results are also verified.
12 | 
13 | * Column-major is assumed thru the entire code!
14 | 
15 | * `i` and `j` are indices for row and column, respectively.
16 | 
17 | * For testing, the dimensions of all matrices are assumed to be 4096 x 4096.
18 | 
19 | * The following table only summarizes the most important points. For more
20 | details on the ial-th OpenMP GPU implementation see comments in `matMulAB.c`.
21 | 
22 | | ial | Remarks |
23 | |:---:|------------------------------------------------------------------------|
24 | | 0 | jik-loop, 2^9 threads * 2^3 teams, |
25 | | | uncoalesced memory access |
26 | | 1 | jki-loop, 2^9 threads * 2^3 teams, |
27 | | | uncoalesced memory access, uncoalesced r&w in innermost loop |
28 | | 2 | jik-loop, 2^9 threads * 2^f teams, collapse(2) |
29 | | 3 | jki-loop, 2^9 threads * 2^f teams, collapse(2), |
30 | | | race condition for writing c!
| 31 | | 4 | jik-loop, 2^9 threads * 2^f teams, collapse(2), | 32 | | | 4x k-loop unrolling | 33 | | 5 | jik-loop, 2^7 threads * 2^f teams, collapse(3), | 34 | | | 4x i-loop unrolling (stride of 2^7 rows), | 35 | | | 4x k-loop unrolling, | 36 | | | rb: 4x data reuse | 37 | | 6 | jik-loop, 2^7 threads * 2^d teams, collapse(3), | 38 | | | 4x j-loop unrolling (stride of 1 col ), | 39 | | | 4x i-loop unrolling (stride of 2^7 rows), | 40 | | | 4x k-loop unrolling, | 41 | | | ra: 4x data reuse, | 42 | | | rb: 4x data reuse, | 43 | | | register blocking | 44 | | 7 | based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) | 45 | | 8 | based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2), | 46 | | | GPU shared memory for data re-use, 16x k-loop unrolling, | 47 | | | shared memory blocking | 48 | | 9 | based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), | 49 | | | 4x i-loop unrolling (stride of n/4 rows), | 50 | | | 4x k-loop unrolling, | 51 | | | rb: 4x data reuse | 52 | | 10 | cublasSgemm in CUBLAS | 53 | 54 | # Build 55 | 56 | ```bash 57 | autoreconf -i; ./configure; make; make check 58 | ``` 59 | 60 | `make check` has been tested on OCuLUS (with OpenCCS) and P53s (without OpenCCS). 61 | 62 | # Documentation 63 | 64 | * docs/html/index.html: Source code documentation generated by Doxygen. 65 | 66 | * docs/UserManual.md: User Manual. 67 | 68 | -------------------------------------------------------------------------------- /simplifiedCode/02_dataTransRate/dataTransRate.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file dataTransRate.c 3 | * 4 | * @brief dataTransRate gives the data transfer rate from src to dst. 5 | * 6 | * The possible situations are: 7 | * 8 | * - h2h: src = host and dst = host 9 | * - h2a: src = host and dst = accel 10 | * - a2a: src = accel and dst = accel 11 | * 12 | * Offload to GPU: 13 | * gcc -Wall -fopenmp -foffload=nvptx-none dataTransRate.c 14 | * 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #define TWO27 (1 << 27) 23 | 24 | int main(void) 25 | { 26 | // host 27 | int ihost, *hdat[2]; 28 | // accelerator 29 | int iaccel, *adat[2]; 30 | size_t ndat; 31 | struct timespec rt[2]; 32 | double wt; // walltime 33 | int i, n = TWO27, iret = 0; 34 | 35 | /* 36 | * prepare data on host and accelerator 37 | */ 38 | ihost = omp_get_initial_device(); // index of the host 39 | iaccel = 0; // index of the 1st accel 40 | ndat = sizeof(*hdat[0]) * n; 41 | for (i = 0; i < 2; i++) { 42 | if (NULL == (hdat[i] = (int *) omp_target_alloc(ndat, ihost))) { 43 | printf("error: memory allocation for hdat[%d] ...", i); 44 | iret = 1; 45 | } 46 | if (NULL == (adat[i] = (int *) omp_target_alloc(ndat, iaccel))) { 47 | printf("error: memory allocation for adat[%d] ...", i); 48 | iret = 1; 49 | } 50 | } 51 | if (1 == iret) { 52 | for (i = 0; i < 2; i++) { 53 | omp_target_free(hdat[i], ihost); 54 | omp_target_free(adat[i], iaccel); 55 | } 56 | exit(EXIT_FAILURE); 57 | } 58 | for (i = 0; i < n; i++) { 59 | (hdat[0])[i] = rand(); 60 | } 61 | /* 62 | * data transfer rate: h2h, h2a, and a2a 63 | */ 64 | printf("\nData Transfer Rate\n\n"); 65 | printf("================================\n"); 66 | printf(" src dst DTR \n"); 67 | printf("------- ------- ----------------\n"); 68 | /* 69 | * h2h 70 | */ 71 | clock_gettime(CLOCK_REALTIME, rt + 0); 72 | iret = omp_target_memcpy(hdat[1], hdat[0], ndat, 0x0, 0x0, ihost, ihost); 73 | clock_gettime(CLOCK_REALTIME, rt + 1); 74 | if (0 != iret) { 75 | 
printf("error: omp_target_memcpy (h2h)\n");
76 | exit(EXIT_FAILURE);
77 | }
78 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
79 | printf(" host host %8.1f MB/sec\n", 1.0e-6 * ndat / wt);
80 | /*
81 | * h2a
82 | */
83 | clock_gettime(CLOCK_REALTIME, rt + 0);
84 | iret = omp_target_memcpy(adat[0], hdat[0], ndat, 0x0, 0x0, iaccel, ihost);
85 | clock_gettime(CLOCK_REALTIME, rt + 1);
86 | if (0 != iret) {
87 | printf("error: omp_target_memcpy (h2a)\n");
88 | exit(EXIT_FAILURE);
89 | }
90 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
91 | printf(" host accel %8.1f MB/sec\n", 1.0e-6 * ndat / wt);
92 | /*
93 | * a2a
94 | */
95 | clock_gettime(CLOCK_REALTIME, rt + 0);
96 | iret = omp_target_memcpy(adat[1], adat[0], ndat, 0x0, 0x0, iaccel, iaccel);
97 | /*
98 | * Question: How to get the correct A-A data transfer rate? FIXME
99 | */
100 | clock_gettime(CLOCK_REALTIME, rt + 1);
101 | if (0 != iret) {
102 | printf("error: omp_target_memcpy (a2a)\n");
103 | exit(EXIT_FAILURE);
104 | }
105 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec);
106 | printf(" accel accel %8.1f MB/sec\n", 1.0e-6 * ndat / wt);
107 | printf("================================\n\n");
108 | /*
109 | * release the data
110 | */
111 | for (i = 0; i < 2; i++) {
112 | omp_target_free(hdat[i], ihost);
113 | omp_target_free(adat[i], iaccel);
114 | }
115 | return 0;
116 | }
117 | 
--------------------------------------------------------------------------------
/01_accelQuery/src/prtAccelInfo.c:
--------------------------------------------------------------------------------
1 | /**
2 | * @file prtAccelInfo.c
3 | * @brief Function definition for prtAccelInfo.
4 | *
5 | * This source file contains the function definition for prtAccelInfo.
6 | *
7 | * @author Xin Wu (PC²)
8 | * @date 04.01.2020
9 | * @copyright CC BY-SA 2.0
10 | */
11 | 
12 | #include <stdio.h>
13 | #include <stdlib.h>
14 | #include <cuda_runtime.h>
15 | #include "prtAccelInfo.h"
16 | 
17 | #define CUDAErrorCheck(funcall) \
18 | do { \
19 | cudaError_t ierr = funcall; \
20 | if (cudaSuccess != ierr) { \
21 | fprintf(stderr, "%s(line %d) : CUDA RT API error : %s(%d) -> %s\n", \
22 | __FILE__, __LINE__, #funcall, ierr, cudaGetErrorString(ierr)); \
23 | exit(ierr); \
24 | } \
25 | } while (0)
26 | 
27 | static inline int _corePerSM(int major, int minor)
28 | /**
29 | * @brief Give the number of CUDA cores per streaming multiprocessor (SM).
30 | *
31 | * The number of CUDA cores per SM is determined by the compute capability.
32 | *
33 | * @param major Major revision number of the compute capability.
34 | * @param minor Minor revision number of the compute capability.
35 | *
36 | * @return The number of CUDA cores per SM.
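 *
 * For example, compute capability 3.5 (Kepler, e.g. the Tesla K20X used in
 * 00_build_OpenMP_offload) maps to 192 CUDA cores per SM, and -1 is
 * returned for an unknown compute capability.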
37 | */ 38 | { 39 | if (1 == major) { 40 | if (0 == minor || 1 == minor || 2 == minor || 3 == minor) return 8; 41 | } 42 | if (2 == major) { 43 | if (0 == minor) return 32; 44 | if (1 == minor) return 48; 45 | } 46 | if (3 == major) { 47 | if (0 == minor || 5 == minor || 7 == minor) return 192; 48 | } 49 | if (5 == major) { 50 | if (0 == minor || 2 == minor) return 128; 51 | } 52 | if (6 == major) { 53 | if (0 == minor) return 64; 54 | if (1 == minor || 2 == minor) return 128; 55 | } 56 | if (7 == major) { 57 | if (0 == minor || 2 == minor || 5 == minor) return 64; 58 | } 59 | return -1; 60 | } 61 | 62 | void prtAccelInfo(int iaccel) 63 | { 64 | int corePerSM; 65 | struct cudaDeviceProp dev; 66 | 67 | CUDAErrorCheck(cudaSetDevice(iaccel)); 68 | CUDAErrorCheck(cudaGetDeviceProperties(&dev, iaccel)); 69 | corePerSM = _corePerSM(dev.major, dev.minor); 70 | printf("\n"); 71 | printf("============================================================\n"); 72 | printf("CUDA Device name : \"%s\"\n", dev.name); 73 | printf("------------------------------------------------------------\n"); 74 | printf("Comp. Capability : %d.%d\n", dev.major, dev.minor); 75 | printf("max clock rate : %.0f MHz\n", dev.clockRate * 1.e-3f); 76 | printf("number of SMs : %d\n", dev.multiProcessorCount); 77 | printf("cores / SM : %d\n", corePerSM); 78 | printf("# of CUDA cores : %d\n", corePerSM * dev.multiProcessorCount); 79 | printf("------------------------------------------------------------\n"); 80 | printf("global memory : %5.0f MBytes\n", dev.totalGlobalMem / 1048576.0f); 81 | printf("shared mem. / SM : %5.1f KBytes\n", dev.sharedMemPerMultiprocessor / 1024.0f); 82 | printf("32-bit reg. / SM : %d\n", dev.regsPerMultiprocessor); 83 | printf("------------------------------------------------------------\n"); 84 | printf("max # of threads / SM : %d\n", dev.maxThreadsPerMultiProcessor); 85 | printf("max # of threads / block : %d\n", dev.maxThreadsPerBlock); 86 | printf("max dim. of block : (%d, %d, %d)\n", 87 | dev.maxThreadsDim[0], dev.maxThreadsDim[1], dev.maxThreadsDim[2]); 88 | printf("max dim. 
of grid : (%d, %d, %d)\n", 89 | dev.maxGridSize[0], dev.maxGridSize[1], dev.maxGridSize[2]); 90 | printf("warp size : %d\n", dev.warpSize); 91 | printf("============================================================\n"); 92 | } 93 | -------------------------------------------------------------------------------- /03_taskwait/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([taskwait], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/taskwait.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check C compiler 22 | # 23 | ##############################################################################80 24 | AC_PROG_CC([clang gcc]) 25 | AS_IF([test "${CC}" = gcc], 26 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" 27 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 28 | AS_IF([test "${CC}" = clang], 29 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ 30 | -Xopenmp-target -march=sm_61 $CFLAGS" 31 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 32 | ##############################################################################80 33 | # 34 | # check archiver 35 | # 36 | ##############################################################################80 37 | AC_PROG_RANLIB 38 | AM_PROG_AR 39 | ##############################################################################80 40 | # 41 | # check headers 42 | # 43 | ##############################################################################80 44 | AC_CHECK_HEADER([cuda_runtime.h], [], 45 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 46 | ##############################################################################80 47 | # 48 | # check libraries 49 | # 50 | ##############################################################################80 51 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 52 | [AC_MSG_ERROR([libcudart required, but not found])], []) 53 | ##############################################################################80 54 | # 55 | # check Doxygen 56 | # 57 | ##############################################################################80 58 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 59 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 60 | AS_IF([test -z "${DOXYGEN}"], 61 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 62 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 63 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 64 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 65 | ##############################################################################80 66 | # 67 | # check ccsalloc (in OpenCCS) 68 | # 69 | ##############################################################################80 70 | AC_ARG_VAR([CCSALLOC], [OpenCCS: 
Open Computing Center Software]) 71 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 72 | AS_IF([test -z "${CCSALLOC}"], 73 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 74 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 75 | ##############################################################################80 76 | # 77 | # create final files 78 | # 79 | ##############################################################################80 80 | AC_CONFIG_HEADERS([config.h]) 81 | AC_CONFIG_FILES([Makefile 82 | src/Makefile 83 | tests/Makefile]) 84 | AC_OUTPUT 85 | 86 | echo " 87 | //============================================================================80 88 | 89 | Configuration: 90 | 91 | CC : ${CC} 92 | CFLAGS : ${CFLAGS} 93 | LDFLAGS : ${LDFLAGS} 94 | LIBS : ${LIBS} 95 | 96 | //============================================================================80 97 | 98 | Now, type make to build ..." 99 | -------------------------------------------------------------------------------- /01_accelQuery/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([accelQuery], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/accelQuery.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check C compiler 22 | # 23 | ##############################################################################80 24 | AC_PROG_CC([clang gcc]) 25 | AS_IF([test "${CC}" = gcc], 26 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" 27 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 28 | AS_IF([test "${CC}" = clang], 29 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ 30 | -Xopenmp-target -march=sm_61 $CFLAGS" 31 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 32 | ##############################################################################80 33 | # 34 | # check archiver 35 | # 36 | ##############################################################################80 37 | AC_PROG_RANLIB 38 | AM_PROG_AR 39 | ##############################################################################80 40 | # 41 | # check headers 42 | # 43 | ##############################################################################80 44 | AC_CHECK_HEADER([cuda_runtime.h], [], 45 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 46 | ##############################################################################80 47 | # 48 | # check libraries 49 | # 50 | ##############################################################################80 51 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 52 | [AC_MSG_ERROR([libcudart required, but not found])], []) 53 | ##############################################################################80 54 | # 55 | # check Doxygen 56 | # 57 | 
##############################################################################80 58 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 59 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 60 | AS_IF([test -z "${DOXYGEN}"], 61 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 62 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 63 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 64 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 65 | ##############################################################################80 66 | # 67 | # check ccsalloc (in OpenCCS) 68 | # 69 | ##############################################################################80 70 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 71 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 72 | AS_IF([test -z "${CCSALLOC}"], 73 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 74 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 75 | ##############################################################################80 76 | # 77 | # create final files 78 | # 79 | ##############################################################################80 80 | AC_CONFIG_HEADERS([config.h]) 81 | AC_CONFIG_FILES([Makefile 82 | src/Makefile 83 | tests/Makefile]) 84 | AC_OUTPUT 85 | 86 | echo " 87 | //============================================================================80 88 | 89 | Configuration: 90 | 91 | CC : ${CC} 92 | CFLAGS : ${CFLAGS} 93 | LDFLAGS : ${LDFLAGS} 94 | LIBS : ${LIBS} 95 | 96 | //============================================================================80 97 | 98 | Now, type make to build ..." 99 | -------------------------------------------------------------------------------- /08_distThreads/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([distThreads], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/distThreads.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check C compiler 22 | # 23 | ##############################################################################80 24 | AC_PROG_CC([clang gcc]) 25 | AS_IF([test "${CC}" = gcc], 26 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" 27 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 28 | AS_IF([test "${CC}" = clang], 29 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ 30 | -Xopenmp-target -march=sm_61 $CFLAGS" 31 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 32 | ##############################################################################80 33 | # 34 | # check archiver 35 | # 36 | ##############################################################################80 37 | AC_PROG_RANLIB 38 | AM_PROG_AR 39 | 
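# (AM_PROG_AR selects the archiver interface; with -Wall -Werror passed to
# AM_INIT_AUTOMAKE above, automake >= 1.12 would otherwise complain when a
# static archive is built.)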
##############################################################################80 40 | # 41 | # check headers 42 | # 43 | ##############################################################################80 44 | AC_CHECK_HEADER([cuda_runtime.h], [], 45 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 46 | ##############################################################################80 47 | # 48 | # check libraries 49 | # 50 | ##############################################################################80 51 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 52 | [AC_MSG_ERROR([libcudart required, but not found])], []) 53 | ##############################################################################80 54 | # 55 | # check Doxygen 56 | # 57 | ##############################################################################80 58 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 59 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 60 | AS_IF([test -z "${DOXYGEN}"], 61 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 62 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 63 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 64 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 65 | ##############################################################################80 66 | # 67 | # check ccsalloc (in OpenCCS) 68 | # 69 | ##############################################################################80 70 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 71 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 72 | AS_IF([test -z "${CCSALLOC}"], 73 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 74 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 75 | ##############################################################################80 76 | # 77 | # create final files 78 | # 79 | ##############################################################################80 80 | AC_CONFIG_HEADERS([config.h]) 81 | AC_CONFIG_FILES([Makefile 82 | src/Makefile 83 | tests/Makefile]) 84 | AC_OUTPUT 85 | 86 | echo " 87 | //============================================================================80 88 | 89 | Configuration: 90 | 91 | CC : ${CC} 92 | CFLAGS : ${CFLAGS} 93 | LDFLAGS : ${LDFLAGS} 94 | LIBS : ${LIBS} 95 | 96 | //============================================================================80 97 | 98 | Now, type make to build ..." 
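# Illustrative compile-and-link line implied by the checks above (an
# assumption for illustration only, not part of the generated build):
#   gcc -Wall -O2 -fopenmp -foffload=nvptx-none -I$CUDAINC \
#       src/distThreads.c src/gpuThreads.c -L$CUDALIB -lcudart -o distThreads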
99 | -------------------------------------------------------------------------------- /02_dataTransRate/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([dataTransRate], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/dataTransRate.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check C compiler 22 | # 23 | ##############################################################################80 24 | AC_PROG_CC([clang gcc]) 25 | AS_IF([test "${CC}" = gcc], 26 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" 27 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 28 | AS_IF([test "${CC}" = clang], 29 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ 30 | -Xopenmp-target -march=sm_61 $CFLAGS" 31 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 32 | ##############################################################################80 33 | # 34 | # check archiver 35 | # 36 | ##############################################################################80 37 | AC_PROG_RANLIB 38 | AM_PROG_AR 39 | ##############################################################################80 40 | # 41 | # check headers 42 | # 43 | ##############################################################################80 44 | AC_CHECK_HEADER([cuda_runtime.h], [], 45 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 46 | ##############################################################################80 47 | # 48 | # check libraries 49 | # 50 | ##############################################################################80 51 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 52 | [AC_MSG_ERROR([libcudart required, but not found])], []) 53 | ##############################################################################80 54 | # 55 | # check Doxygen 56 | # 57 | ##############################################################################80 58 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 59 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 60 | AS_IF([test -z "${DOXYGEN}"], 61 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 62 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 63 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 64 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 65 | ##############################################################################80 66 | # 67 | # check ccsalloc (in OpenCCS) 68 | # 69 | ##############################################################################80 70 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 71 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 72 | AS_IF([test -z "${CCSALLOC}"], 73 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS 
support])]) 74 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 75 | ##############################################################################80 76 | # 77 | # create final files 78 | # 79 | ##############################################################################80 80 | AC_CONFIG_HEADERS([config.h]) 81 | AC_CONFIG_FILES([Makefile 82 | src/Makefile 83 | tests/Makefile]) 84 | AC_OUTPUT 85 | 86 | echo " 87 | //============================================================================80 88 | 89 | Configuration: 90 | 91 | CC : ${CC} 92 | CFLAGS : ${CFLAGS} 93 | LDFLAGS : ${LDFLAGS} 94 | LIBS : ${LIBS} 95 | 96 | //============================================================================80 97 | 98 | Now, type make to build ..." 99 | -------------------------------------------------------------------------------- /04_scalarAddition/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([scalarAddition], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/scalarAddition.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check C compiler 22 | # 23 | ##############################################################################80 24 | AC_PROG_CC([clang gcc]) 25 | AS_IF([test "${CC}" = gcc], 26 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none -I${CUDAINC} $CFLAGS" 27 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 28 | AS_IF([test "${CC}" = clang], 29 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -I${CUDAINC} \ 30 | -Xopenmp-target -march=sm_61 $CFLAGS" 31 | LDFLAGS="-L${CUDALIB} $LDFLAGS"]) 32 | ##############################################################################80 33 | # 34 | # check archiver 35 | # 36 | ##############################################################################80 37 | AC_PROG_RANLIB 38 | AM_PROG_AR 39 | ##############################################################################80 40 | # 41 | # check headers 42 | # 43 | ##############################################################################80 44 | AC_CHECK_HEADER([cuda_runtime.h], [], 45 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 46 | ##############################################################################80 47 | # 48 | # check libraries 49 | # 50 | ##############################################################################80 51 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 52 | [AC_MSG_ERROR([libcudart required, but not found])], []) 53 | ##############################################################################80 54 | # 55 | # check Doxygen 56 | # 57 | ##############################################################################80 58 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 59 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 60 | AS_IF([test 
-z "${DOXYGEN}"], 61 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 62 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 63 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 64 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 65 | ##############################################################################80 66 | # 67 | # check ccsalloc (in OpenCCS) 68 | # 69 | ##############################################################################80 70 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 71 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 72 | AS_IF([test -z "${CCSALLOC}"], 73 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 74 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 75 | ##############################################################################80 76 | # 77 | # create final files 78 | # 79 | ##############################################################################80 80 | AC_CONFIG_HEADERS([config.h]) 81 | AC_CONFIG_FILES([Makefile 82 | src/Makefile 83 | tests/Makefile]) 84 | AC_OUTPUT 85 | 86 | echo " 87 | //============================================================================80 88 | 89 | Configuration: 90 | 91 | CC : ${CC} 92 | CFLAGS : ${CFLAGS} 93 | LDFLAGS : ${LDFLAGS} 94 | LIBS : ${LIBS} 95 | 96 | //============================================================================80 97 | 98 | Now, type make to build ..." 99 | -------------------------------------------------------------------------------- /09_matAdd/src/matAdd.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matAdd.c 3 | * 4 | * @mainpage matAdd 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 19.03.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * matAdd performs matrix addition (A += B) in single-precision on GPU. 11 | * The performance (in GB/s) for different implementations is compared and 12 | * the numerical results are also verified. 13 | */ 14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <time.h> 20 | #ifdef _OPENMP 21 | #include <omp.h> 22 | #endif 23 | #include "mkl.h" 24 | #include "matAddAB.h" 25 | 26 | #define NLUP (64) 27 | 28 | /** 29 | * @brief Main entry point for matAdd.
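 *
 * A minimal run, assuming the build above succeeded (note that the matrix
 * dimension is read unchecked from argv[1]):
 *
 *   ./matAdd 4096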
30 | */ 31 | int main(int argc, char *argv[]) 32 | { 33 | int ial, idx, n, 34 | iret = 0; 35 | size_t n2bytes; 36 | float *a, *b, 37 | *ahost, // a matrix on host (as reference) 38 | *aaccl, // a matrix on accl 39 | maxabserr; 40 | struct timespec rt[2]; 41 | double wt; // walltime 42 | 43 | /* 44 | * preparation 45 | */ 46 | n = atoi(argv[1]); // 4096 is used for test 47 | n2bytes = sizeof(float) * n * n; 48 | if (NULL == (a = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 49 | if (NULL == (b = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 50 | if (NULL == (ahost = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 51 | if (NULL == (aaccl = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 52 | if (iret != 0) { 53 | printf("error: memory allocation\n"); 54 | mkl_free(a); mkl_free(b); 55 | mkl_free(ahost); mkl_free(aaccl); 56 | exit(EXIT_FAILURE); 57 | } 58 | #pragma omp parallel for default(none) \ 59 | shared(a, b, ahost, aaccl, n) private(idx) 60 | for (idx = 0; idx < n * n; ++idx) { 61 | a[idx] = rand() % 32 / 32.0f; 62 | b[idx] = rand() % 32 / 32.0f; 63 | ahost[idx] = 0.0f; 64 | aaccl[idx] = 0.0f; 65 | } 66 | printf("matrix dim: %d x %d\ntime averaged over %d loops\n", n, n, NLUP); 67 | /* 68 | * matAdd on host (ahost will be used as ref. value for checking aaccl) 69 | */ 70 | memcpy(ahost, a, n2bytes); 71 | cblas_saxpy(n * n, 1.0f, b, 1, ahost, 1); 72 | /* 73 | * matAdd on accl 74 | */ 75 | for (ial = 0; ial < 8; ++ial) { 76 | /* 77 | * See matAddAB.c for details: 78 | * 79 | * ial: 80 | * 81 | * 0: ij-loop, 2^9 threads * 2^3 teams, 82 | * coalesced memory access 83 | * 84 | * 1: ji-loop, 2^9 threads * 2^3 teams, 85 | * uncoalesced memory access 86 | * 87 | * 2: ij-loop, 2^9 threads * 2^f teams, collapse(2), 88 | * uncoalesced memory access 89 | * 90 | * 3: ji-loop, 2^9 threads * 2^f teams, collapse(2), 91 | * coalesced memory access 92 | * 93 | * 4: ji-loop, 2^8 threads * 2^f teams, collapse(3), 94 | * 2x i-loop unrolling 95 | * 96 | * 5: ji-loop, 2^8 threads * 2^f teams, collapse(2), 97 | * 2x i-loop unrolling 98 | * 99 | * 6: ji-loop, 2^8 threads * 2^e teams, collapse(3), 100 | * 2x i-loop unrolling, 2x j-loop unrolling 101 | * 102 | * otherwise: cublasSaxpy in CUBLAS 103 | */ 104 | memcpy(aaccl, a, n2bytes); 105 | wtcalc = -1.0; 106 | // skip 1st run for timing 107 | matAddAB_accl(aaccl, b, n, ial); 108 | // check aaccl 109 | maxabserr = -1.0f; 110 | for (idx = 0; idx < n * n; ++idx) { 111 | maxabserr = fabsf(aaccl[idx] - ahost[idx]) > maxabserr? 
112 | fabsf(aaccl[idx] - ahost[idx]) : maxabserr; 113 | } 114 | // skip 2nd run for timing 115 | matAddAB_accl(aaccl, b, n, ial); 116 | // timing : start 117 | wtcalc = 0.0; 118 | clock_gettime(CLOCK_REALTIME, rt + 0); 119 | for (int i = 0; i < NLUP; ++i) { 120 | matAddAB_accl(aaccl, b, n, ial); 121 | } 122 | clock_gettime(CLOCK_REALTIME, rt + 1); 123 | wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 124 | printf("matAddAB (%d) : %9.1f GB/s %9.1f GB/s maxabserr = %9.1f\n", ial, 125 | NLUP * 3.0 * n2bytes / ((1 << 30) * wt), 126 | NLUP * 3.0 * n2bytes / ((1 << 30) * wtcalc), maxabserr); 127 | } 128 | /* 129 | * release memory 130 | */ 131 | mkl_free(a); mkl_free(b); 132 | mkl_free(ahost); mkl_free(aaccl); 133 | return 0; 134 | } 135 | -------------------------------------------------------------------------------- /02_dataTransRate/src/dataTransRate.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file dataTransRate.c 3 | * 4 | * @mainpage dataTransRate 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 12.03.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * dataTransRate gives the data transfer rate (in MB/sec) from src to dst. 11 | * 12 | * The possible situations are: 13 | * 14 | * - h2h: src = host and dst = host 15 | * - h2a: src = host and dst = accel 16 | * - a2a: src = accel and dst = accel 17 | */ 18 | 19 | #include <stdio.h> 20 | #include <stdlib.h> 21 | #include <time.h> 22 | #ifdef _OPENMP 23 | #include <omp.h> 24 | #endif 25 | #include "check1ns.h" 26 | 27 | /** 28 | * @brief Main entry point for dataTransRate. 29 | */ 30 | int main(int argc, char *argv[]) 31 | { 32 | // host 33 | int ihost, 34 | *hdat[2]; 35 | // accelerator 36 | int iaccel, naccel, 37 | *adat[2]; 38 | int nMB; 39 | size_t data; 40 | struct timespec rt[2]; 41 | double wt; // walltime 42 | int i, iret = 0; 43 | 44 | /* 45 | * We need 1 ns time resolution. 46 | */ 47 | check1ns(); 48 | printf("The system supports 1 ns time resolution\n"); 49 | /* 50 | * check the number of accelerators 51 | */ 52 | naccel = omp_get_num_devices(); 53 | if (0 == naccel) { 54 | printf("No accelerator found ... exit\n"); 55 | exit(EXIT_FAILURE); 56 | } else { 57 | printf("%d accelerator(s) found ... 
continue\n", naccel); 58 | } 59 | /* 60 | * prepare data (default to 512 MB), host, and accel 61 | */ 62 | if (1 == argc) { 63 | nMB = 512; 64 | } else { 65 | nMB = atoi(argv[1]); 66 | } 67 | data = nMB * (1 << 20); 68 | ihost = omp_get_initial_device(); // index of the host 69 | iaccel = 0; // index of the 1st accel 70 | for (i = 0; i < 2; i++) { 71 | if (NULL == (hdat[i] = (int *) omp_target_alloc(data, ihost))) { 72 | printf("error: memory allocation for hdat[%d] ...", i); 73 | iret = -1; 74 | } 75 | if (NULL == (adat[i] = (int *) omp_target_alloc(data, iaccel))) { 76 | printf("error: memory allocation for adat[%d] ...", i); 77 | iret = -1; 78 | } 79 | } 80 | if (0 != iret) { 81 | for (i = 0; i < 2; i++) { 82 | omp_target_free(hdat[i], ihost); 83 | omp_target_free(adat[i], iaccel); 84 | } 85 | printf(" exit\n"); 86 | exit(EXIT_FAILURE); 87 | } 88 | printf("%d MB data will be transferred", nMB); 89 | for (i = 0; i < data / sizeof(*hdat[0]); i++) { 90 | hdat[0][i] = rand(); 91 | } 92 | /* 93 | * data transfer rate: h2h, h2a, and a2a 94 | */ 95 | printf("\nData Transfer Rate\n\n"); 96 | printf("================================\n"); 97 | printf(" src dst DTR \n"); 98 | printf("------- ------- ----------------\n"); 99 | /* 100 | * h2h 101 | */ 102 | clock_gettime(CLOCK_REALTIME, rt + 0); 103 | iret = omp_target_memcpy(hdat[1], hdat[0], data, 0x0, 0x0, ihost, ihost); 104 | clock_gettime(CLOCK_REALTIME, rt + 1); 105 | if (0 != iret) { 106 | printf("error: omp_target_memcpy (h2h)\n"); 107 | exit(EXIT_FAILURE); 108 | } 109 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 110 | printf(" host host %8.1f MB/sec\n", nMB / wt); 111 | /* 112 | * h2a 113 | */ 114 | clock_gettime(CLOCK_REALTIME, rt + 0); 115 | iret = omp_target_memcpy(adat[0], hdat[0], data, 0x0, 0x0, iaccel, ihost); 116 | clock_gettime(CLOCK_REALTIME, rt + 1); 117 | if (0 != iret) { 118 | printf("error: omp_target_memcpy (h2a)\n"); 119 | exit(EXIT_FAILURE); 120 | } 121 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 122 | printf(" host accel %8.1f MB/sec\n", nMB / wt); 123 | /* 124 | * a2a 125 | * 126 | * - Synchronous execution has been fixed in Clang 11. 127 | * - Data transfer rate is somehow lower than our expectation. 128 | */ 129 | clock_gettime(CLOCK_REALTIME, rt + 0); 130 | iret = omp_target_memcpy(adat[1], adat[0], data, 0x0, 0x0, iaccel, iaccel); 131 | clock_gettime(CLOCK_REALTIME, rt + 1); 132 | if (0 != iret) { 133 | printf("error: omp_target_memcpy (a2a)\n"); 134 | exit(EXIT_FAILURE); 135 | } 136 | wt = (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 137 | printf(" accel accel %8.1f MB/sec\n", nMB / wt); 138 | printf("================================\n\n"); 139 | /* 140 | * release the data 141 | */ 142 | for (i = 0; i < 2; i++) { 143 | omp_target_free(hdat[i], ihost); 144 | omp_target_free(adat[i], iaccel); 145 | } 146 | return 0; 147 | } 148 | -------------------------------------------------------------------------------- /10_matMul/src/matMul.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matMul.c 3 | * 4 | * @mainpage matMul 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 19.03.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * matMul performs matrix multiplication in single-precision on GPU. The 11 | * performance (in GFLOPS) for different implementations is compared and the 12 | * numerical results are also verified. 
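 *
 * (The GFLOPS figures are computed from the usual operation count of an
 * n x n sgemm, i.e. 2 * n^3 floating-point operations per call; see the
 * timing printf in main below.)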
13 | */ 14 | 15 | #include <stdio.h> 16 | #include <stdlib.h> 17 | #include <string.h> 18 | #include <math.h> 19 | #include <time.h> 20 | #ifdef _OPENMP 21 | #include <omp.h> 22 | #endif 23 | #include "mkl.h" 24 | #include "matMulAB.h" 25 | 26 | #define NLUP (16) 27 | 28 | /** 29 | * @brief Main entry point for matMul. 30 | */ 31 | int main(int argc, char *argv[]) 32 | { 33 | int ial, idx, n, 34 | iret = 0; 35 | size_t n2bytes; 36 | float *a, *b, *c, 37 | *chost, // c matrix on host (as reference) 38 | *caccl, // c matrix on accl 39 | maxabserr; 40 | struct timespec rt[2]; 41 | double wt; // walltime 42 | 43 | /* 44 | * preparation 45 | */ 46 | n = atoi(argv[1]); // 4096 is used for test 47 | n2bytes = sizeof(float) * n * n; 48 | if (NULL == (a = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 49 | if (NULL == (b = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 50 | if (NULL == (c = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 51 | if (NULL == (chost = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 52 | if (NULL == (caccl = (float *) mkl_malloc(n2bytes, (16 * 256)))) iret = -1; 53 | if (iret != 0) { 54 | printf("error: memory allocation\n"); 55 | mkl_free(a); mkl_free(b); mkl_free(c); 56 | mkl_free(chost); mkl_free(caccl); 57 | exit(EXIT_FAILURE); 58 | } 59 | #pragma omp parallel for default(none) \ 60 | shared(a, b, c, chost, caccl, n) private(idx) 61 | for (idx = 0; idx < n * n; idx++) { 62 | a[idx] = rand() % 32 / 32.0f; 63 | b[idx] = rand() % 32 / 32.0f; 64 | c[idx] = rand() % 32 / 32.0f; 65 | chost[idx] = 0.0f; 66 | caccl[idx] = 0.0f; 67 | } 68 | printf("matrix dim: %d x %d\ntime averaged over %d loops\n", n, n, NLUP); 69 | /* 70 | * matMul on host (chost will be used as ref. value for checking caccl) 71 | */ 72 | memcpy(chost, c, n2bytes); 73 | cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, 74 | n, n, n, 1.0f, a, n, b, n, 1.0f, chost, n); 75 | /* 76 | * matMul on accl 77 | */ 78 | for (ial = 0; ial < 11; ++ial) { 79 | /* 80 | * See matMulAB.c for details: 81 | * 82 | * ial: 83 | * 84 | * 0: jik-loop, 2^9 threads * 2^3 teams, 85 | * uncoalesced memory access 86 | * 87 | * 1: jki-loop, 2^9 threads * 2^3 teams, 88 | * uncoalesced memory access, uncoalesced r&w in innermost loop 89 | * 90 | * 2: jik-loop, 2^9 threads * 2^f teams, collapse(2) 91 | * 92 | * 3: jki-loop, 2^9 threads * 2^f teams, collapse(2), 93 | * race condition for writing c!
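 *    (after collapse(2) the j- and k-loop both run in parallel, so
 *    threads with the same j but different k update the same element
 *    c[j * n + i] concurrently - hence the race)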
94 | * 95 | * 4: jik-loop, 2^9 threads * 2^f teams, collapse(2), 96 | * 4x k-loop unrolling 97 | * 98 | * 5: jik-loop, 2^7 threads * 2^f teams, collapse(3), 99 | * 4x i-loop unrolling (stride of 2^7 rows), 100 | * 4x k-loop unrolling, 101 | * rb: 4x data reuse 102 | * 103 | * 6: jik-loop, 2^7 threads * 2^d teams, collapse(3), 104 | * 4x j-loop unrolling (stride of 1 col ), 105 | * 4x i-loop unrolling (stride of 2^7 rows), 106 | * 4x k-loop unrolling, 107 | * rb: 4x data reuse, 108 | * ra: 4x data reuse, 109 | * register blocking 110 | * 111 | * 7: based on (2), jik-loop, 2^8 threads * 2^g teams, collapse(2) 112 | * 113 | * 8: based on (7), jik-loop, 2^8 threads * 2^g teams, collapse(2) 114 | * GPU shared memory for data re-use, 115 | * 16x k-loop unrolling, 116 | * shared memory blocking 117 | * 118 | * 9: based on (5), jik-loop, 2^7 threads * 2^f teams, collapse(2), 119 | * 4x i-loop unrolling (stride of n/4 rows), 120 | * 4x k-loop unrolling, 121 | * rb: 4x data reuse 122 | * 123 | * otherwise: cublasSgemm in CUBLAS 124 | */ 125 | memcpy(caccl, c, n2bytes); 126 | wtcalc = -1.0; 127 | // skip 1st run for timing 128 | matMulAB_accl(a, b, caccl, n, ial); 129 | // check caccl 130 | maxabserr = -1.0f; 131 | for (idx = 0; idx < n * n; idx++) { 132 | maxabserr = fabsf(caccl[idx] - chost[idx]) > maxabserr? 133 | fabsf(caccl[idx] - chost[idx]) : maxabserr; 134 | } 135 | // skip 2nd run for timing 136 | matMulAB_accl(a, b, caccl, n, ial); 137 | // timing : start 138 | wtcalc = 0.0; 139 | clock_gettime(CLOCK_REALTIME, rt + 0); 140 | for (int i = 0; i < NLUP; ++i) { 141 | matMulAB_accl(a, b, caccl, n, ial); 142 | } 143 | clock_gettime(CLOCK_REALTIME, rt + 1); 144 | wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 145 | printf("matMulAB (%d) : %9.1f GFLOPS %9.1f GFLOPS maxabserr = %9.1f\n", ial, 146 | NLUP * 2.0e-9 * n * n * n / wt, NLUP * 2.0e-9 * n * n * n / wtcalc, 147 | maxabserr); 148 | } 149 | /* 150 | * release memory 151 | */ 152 | mkl_free(a); mkl_free(b); mkl_free(c); 153 | mkl_free(chost); mkl_free(caccl); 154 | return 0; 155 | } 156 | -------------------------------------------------------------------------------- /05_saxpy/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([saxpy], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/saxpy.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check MKL 22 | # 23 | ##############################################################################80 24 | AC_ARG_VAR([MKLINC], [The PATH wherein mkl.h can be found]) 25 | if test -z "${MKLINC}"; then 26 | AC_SUBST([MKLINC], [${MKLROOT}/include]) 27 | fi 28 | AC_ARG_VAR([MKLLIB], [The PATH wherein MKL library can be found]) 29 | if test -z "${MKLLIB}"; then 30 | AC_SUBST([MKLLIB], [${MKLROOT}/lib/intel64]) 31 | fi 32 | 
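# (MKLROOT and CUDA_ROOT are assumed to be set in the environment, e.g. by
# the cluster's module system; MKLINC/MKLLIB and CUDAINC/CUDALIB only need
# to be passed explicitly when they are not.)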
##############################################################################80 33 | # 34 | # check C compiler 35 | # 36 | ##############################################################################80 37 | CFLAGS+="-I${CUDAINC} -I${MKLINC}" 38 | LDFLAGS+="-L${CUDALIB} -L${MKLLIB}" 39 | # 40 | AC_PROG_CC([clang gcc]) 41 | AS_IF([test "${CC}" = gcc], 42 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"]) 43 | AS_IF([test "${CC}" = clang], 44 | [CFLAGS="-Wall -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \ 45 | -Xopenmp-target -march=sm_61 $CFLAGS"]) 46 | ##############################################################################80 47 | # 48 | # check archiver 49 | # 50 | ##############################################################################80 51 | AC_PROG_RANLIB 52 | AM_PROG_AR 53 | ##############################################################################80 54 | # 55 | # check headers 56 | # 57 | ##############################################################################80 58 | AC_CHECK_HEADER([cuda_runtime.h], [], 59 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 60 | AC_CHECK_HEADER([cublas_v2.h], [], 61 | [AC_MSG_ERROR([cublas_v2.h required, but not found])], []) 62 | AC_CHECK_HEADER([mkl.h], [], 63 | [AC_MSG_ERROR([mkl.h required, but not found])], []) 64 | ##############################################################################80 65 | # 66 | # check libraries 67 | # 68 | ##############################################################################80 69 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 70 | [AC_MSG_ERROR([libcudart required, but not found])], []) 71 | AC_CHECK_LIB([cublas], [cublasSaxpy], [], 72 | [AC_MSG_ERROR([libcublas required, but not found])], []) 73 | AC_CHECK_LIB([pthread], [pthread_create], [], 74 | [AC_MSG_ERROR([libpthread required, but not found])], []) 75 | AC_CHECK_LIB([iomp5], [omp_set_num_threads], [], 76 | [AC_MSG_ERROR([libiomp5 required, but not found])], []) 77 | AC_CHECK_LIB([mkl_core], [mkl_blas_xsaxpy], [], 78 | [AC_MSG_ERROR([libmkl_core required, but not found])], 79 | [-lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lm]) 80 | AC_CHECK_LIB([mkl_intel_thread], [mkl_blas_saxpy], [], 81 | [AC_MSG_ERROR([libmkl_intel_thread required, but not found])], 82 | [-lmkl_intel_lp64 -lmkl_core -liomp5 -lm]) 83 | AC_CHECK_LIB([mkl_intel_lp64], [saxpy], [], 84 | [AC_MSG_ERROR([libmkl_intel_lp64 required, but not found])], 85 | [-lmkl_intel_thread -lmkl_core -liomp5 -lm]) 86 | ##############################################################################80 87 | # 88 | # check Doxygen 89 | # 90 | ##############################################################################80 91 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 92 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 93 | AS_IF([test -z "${DOXYGEN}"], 94 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 95 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 96 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 97 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 98 | ##############################################################################80 99 | # 100 | # check ccsalloc (in OpenCCS) 101 | # 102 | ##############################################################################80 103 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 104 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 105 | AS_IF([test -z "${CCSALLOC}"], 
106 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 107 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 108 | ##############################################################################80 109 | # 110 | # create final files 111 | # 112 | ##############################################################################80 113 | AC_CONFIG_HEADERS([config.h]) 114 | AC_CONFIG_FILES([Makefile 115 | src/Makefile 116 | tests/Makefile]) 117 | AC_OUTPUT 118 | 119 | echo " 120 | //============================================================================80 121 | 122 | Configuration: 123 | 124 | CC : ${CC} 125 | CFLAGS : ${CFLAGS} 126 | LDFLAGS : ${LDFLAGS} 127 | LIBS : ${LIBS} 128 | 129 | //============================================================================80 130 | 131 | Now, type make to build ..." 132 | -------------------------------------------------------------------------------- /09_matAdd/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([matAdd], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/matAdd.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check MKL 22 | # 23 | ##############################################################################80 24 | AC_ARG_VAR([MKLINC], [The PATH wherein mkl.h can be found]) 25 | if test -z "${MKLINC}"; then 26 | AC_SUBST([MKLINC], [${MKLROOT}/include]) 27 | fi 28 | AC_ARG_VAR([MKLLIB], [The PATH wherein MKL library can be found]) 29 | if test -z "${MKLLIB}"; then 30 | AC_SUBST([MKLLIB], [${MKLROOT}/lib/intel64]) 31 | fi 32 | ##############################################################################80 33 | # 34 | # check C compiler 35 | # 36 | ##############################################################################80 37 | CFLAGS+="-I${CUDAINC} -I${MKLINC}" 38 | LDFLAGS+="-L${CUDALIB} -L${MKLLIB}" 39 | # 40 | AC_PROG_CC([clang gcc]) 41 | AS_IF([test "${CC}" = gcc], 42 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"]) 43 | AS_IF([test "${CC}" = clang], 44 | [CFLAGS="-Wall -Werror -O2 -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ 45 | -Xopenmp-target -march=sm_61 $CFLAGS"]) 46 | ##############################################################################80 47 | # 48 | # check archiver 49 | # 50 | ##############################################################################80 51 | AC_PROG_RANLIB 52 | AM_PROG_AR 53 | ##############################################################################80 54 | # 55 | # check headers 56 | # 57 | ##############################################################################80 58 | AC_CHECK_HEADER([cuda_runtime.h], [], 59 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 60 | AC_CHECK_HEADER([cublas_v2.h], [], 61 | [AC_MSG_ERROR([cublas_v2.h required, but not 
found])], []) 62 | AC_CHECK_HEADER([mkl.h], [], 63 | [AC_MSG_ERROR([mkl.h required, but not found])], []) 64 | ##############################################################################80 65 | # 66 | # check libraries 67 | # 68 | ##############################################################################80 69 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 70 | [AC_MSG_ERROR([libcudart required, but not found])], []) 71 | AC_CHECK_LIB([cublas], [cublasSaxpy], [], 72 | [AC_MSG_ERROR([libcublas required, but not found])], []) 73 | AC_CHECK_LIB([pthread], [pthread_create], [], 74 | [AC_MSG_ERROR([libpthread required, but not found])], []) 75 | AC_CHECK_LIB([iomp5], [omp_set_num_threads], [], 76 | [AC_MSG_ERROR([libiomp5 required, but not found])], []) 77 | AC_CHECK_LIB([mkl_core], [mkl_blas_xsaxpy], [], 78 | [AC_MSG_ERROR([libmkl_core required, but not found])], 79 | [-lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lm]) 80 | AC_CHECK_LIB([mkl_intel_thread], [mkl_blas_saxpy], [], 81 | [AC_MSG_ERROR([libmkl_intel_thread required, but not found])], 82 | [-lmkl_intel_lp64 -lmkl_core -liomp5 -lm]) 83 | AC_CHECK_LIB([mkl_intel_lp64], [saxpy], [], 84 | [AC_MSG_ERROR([libmkl_intel_lp64 required, but not found])], 85 | [-lmkl_intel_thread -lmkl_core -liomp5 -lm]) 86 | ##############################################################################80 87 | # 88 | # check Doxygen 89 | # 90 | ##############################################################################80 91 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 92 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 93 | AS_IF([test -z "${DOXYGEN}"], 94 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 95 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 96 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 97 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 98 | ##############################################################################80 99 | # 100 | # check ccsalloc (in OpenCCS) 101 | # 102 | ##############################################################################80 103 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 104 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 105 | AS_IF([test -z "${CCSALLOC}"], 106 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 107 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 108 | ##############################################################################80 109 | # 110 | # create final files 111 | # 112 | ##############################################################################80 113 | AC_CONFIG_HEADERS([config.h]) 114 | AC_CONFIG_FILES([Makefile 115 | src/Makefile 116 | tests/Makefile]) 117 | AC_OUTPUT 118 | 119 | echo " 120 | //============================================================================80 121 | 122 | Configuration: 123 | 124 | CC : ${CC} 125 | CFLAGS : ${CFLAGS} 126 | LDFLAGS : ${LDFLAGS} 127 | LIBS : ${LIBS} 128 | 129 | //============================================================================80 130 | 131 | Now, type make to build ..." 
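#
# Since AC_PROG_CC([clang gcc]) above searches for clang first, the compiler
# can be forced at configure time, e.g.:
#
#   ./configure CC=gcc
#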
132 | -------------------------------------------------------------------------------- /10_matMul/configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.63]) 2 | AC_INIT([matMul], [1.0], [xinwu@mail.uni-paderborn.de]) 3 | AC_CONFIG_AUX_DIR([build-aux]) 4 | AC_CONFIG_SRCDIR([src/matMul.c]) 5 | AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) 6 | ##############################################################################80 7 | # 8 | # check CUDA 9 | # 10 | ##############################################################################80 11 | AC_ARG_VAR([CUDAINC], [The PATH wherein cuda_runtime.h can be found]) 12 | if test -z "${CUDAINC}"; then 13 | AC_SUBST([CUDAINC], [${CUDA_ROOT}/include]) 14 | fi 15 | AC_ARG_VAR([CUDALIB], [The PATH wherein libcudart.so can be found]) 16 | if test -z "${CUDALIB}"; then 17 | AC_SUBST([CUDALIB], [${CUDA_ROOT}/lib64]) 18 | fi 19 | ##############################################################################80 20 | # 21 | # check MKL 22 | # 23 | ##############################################################################80 24 | AC_ARG_VAR([MKLINC], [The PATH wherein mkl.h can be found]) 25 | if test -z "${MKLINC}"; then 26 | AC_SUBST([MKLINC], [${MKLROOT}/include]) 27 | fi 28 | AC_ARG_VAR([MKLLIB], [The PATH wherein MKL library can be found]) 29 | if test -z "${MKLLIB}"; then 30 | AC_SUBST([MKLLIB], [${MKLROOT}/lib/intel64]) 31 | fi 32 | ##############################################################################80 33 | # 34 | # check C compiler 35 | # 36 | ##############################################################################80 37 | CFLAGS+="-I${CUDAINC} -I${MKLINC}" 38 | LDFLAGS+="-L${CUDALIB} -L${MKLLIB}" 39 | # 40 | AC_PROG_CC([clang gcc]) 41 | AS_IF([test "${CC}" = gcc], 42 | [CFLAGS="-Wall -O2 -fopenmp -foffload=nvptx-none $CFLAGS"]) 43 | AS_IF([test "${CC}" = clang], 44 | [CFLAGS="-Wall -Werror -O2 -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ 45 | -Xopenmp-target -march=sm_61 $CFLAGS"]) 46 | ##############################################################################80 47 | # 48 | # check archiver 49 | # 50 | ##############################################################################80 51 | AC_PROG_RANLIB 52 | AM_PROG_AR 53 | ##############################################################################80 54 | # 55 | # check headers 56 | # 57 | ##############################################################################80 58 | AC_CHECK_HEADER([cuda_runtime.h], [], 59 | [AC_MSG_ERROR([cuda_runtime.h required, but not found])], []) 60 | AC_CHECK_HEADER([cublas_v2.h], [], 61 | [AC_MSG_ERROR([cublas_v2.h required, but not found])], []) 62 | AC_CHECK_HEADER([mkl.h], [], 63 | [AC_MSG_ERROR([mkl.h required, but not found])], []) 64 | ##############################################################################80 65 | # 66 | # check libraries 67 | # 68 | ##############################################################################80 69 | AC_CHECK_LIB([cudart], [cudaSetDevice], [], 70 | [AC_MSG_ERROR([libcudart required, but not found])], []) 71 | AC_CHECK_LIB([cublas], [cublasSgemm], [], 72 | [AC_MSG_ERROR([libcublas required, but not found])], []) 73 | AC_CHECK_LIB([pthread], [pthread_create], [], 74 | [AC_MSG_ERROR([libpthread required, but not found])], []) 75 | AC_CHECK_LIB([iomp5], [omp_set_num_threads], [], 76 | [AC_MSG_ERROR([libiomp5 required, but not found])], []) 77 | AC_CHECK_LIB([mkl_core], [mkl_blas_xsgemm], [], 78 | [AC_MSG_ERROR([libmkl_core 
required, but not found])], 79 | [-lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lm]) 80 | AC_CHECK_LIB([mkl_intel_thread], [mkl_blas_sgemm], [], 81 | [AC_MSG_ERROR([libmkl_intel_thread required, but not found])], 82 | [-lmkl_intel_lp64 -lmkl_core -liomp5 -lm]) 83 | AC_CHECK_LIB([mkl_intel_lp64], [sgemm], [], 84 | [AC_MSG_ERROR([libmkl_intel_lp64 required, but not found])], 85 | [-lmkl_intel_thread -lmkl_core -liomp5 -lm]) 86 | ##############################################################################80 87 | # 88 | # check Doxygen 89 | # 90 | ##############################################################################80 91 | AC_ARG_VAR([DOXYGEN], [Doxygen: source documentation generation program]) 92 | AC_CHECK_PROG([DOXYGEN], [doxygen], [doxygen], [], [], []) 93 | AS_IF([test -z "${DOXYGEN}"], 94 | [AC_MSG_WARN([doxygen not found - continue without doxygen support])]) 95 | AM_CONDITIONAL([HAVE_DOXYGEN], [test -n "${DOXYGEN}"]) 96 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Doxyfile])]) 97 | AM_COND_IF([HAVE_DOXYGEN], [AC_CONFIG_FILES([docs/Makefile])]) 98 | ##############################################################################80 99 | # 100 | # check ccsalloc (in OpenCCS) 101 | # 102 | ##############################################################################80 103 | AC_ARG_VAR([CCSALLOC], [OpenCCS: Open Computing Center Software]) 104 | AC_CHECK_PROG([CCSALLOC], [ccsalloc], [ccsalloc], [], [], []) 105 | AS_IF([test -z "${CCSALLOC}"], 106 | [AC_MSG_WARN([ccsalloc not found - continue without OpenCCS support])]) 107 | AM_CONDITIONAL([HAVE_CCSALLOC], [test -n "${CCSALLOC}"]) 108 | ##############################################################################80 109 | # 110 | # create final files 111 | # 112 | ##############################################################################80 113 | AC_CONFIG_HEADERS([config.h]) 114 | AC_CONFIG_FILES([Makefile 115 | src/Makefile 116 | tests/Makefile]) 117 | AC_OUTPUT 118 | 119 | echo " 120 | //============================================================================80 121 | 122 | Configuration: 123 | 124 | CC : ${CC} 125 | CFLAGS : ${CFLAGS} 126 | LDFLAGS : ${LDFLAGS} 127 | LIBS : ${LIBS} 128 | 129 | //============================================================================80 130 | 131 | Now, type make to build ..." 132 | -------------------------------------------------------------------------------- /05_saxpy/src/saxpy.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file saxpy.c 3 | * 4 | * @mainpage saxpy 5 | * 6 | * @author Xin Wu (PC²) 7 | * @date 05.04.2020 8 | * @copyright CC BY-SA 2.0 9 | * 10 | * saxpy performs the \c saxpy operation on host as well as accelerator. 11 | * The performance (in MB/s) for different implementations is also compared. 12 | * 13 | * The \c saxpy operation is defined as: 14 | * 15 | * y := a * x + y 16 | * 17 | * where: 18 | * 19 | * - a is a scalar. 20 | * - x and y are single-precision vectors each with n elements. 21 | */ 22 | 23 | #include <stdio.h> 24 | #include <stdlib.h> 25 | #include <string.h> 26 | #include <math.h> 27 | #include <time.h> 28 | #ifdef _OPENMP 29 | #include <omp.h> 30 | #endif 31 | #include "mkl.h" 32 | #include "hsaxpy.h" 33 | #include "asaxpy.h" 34 | #include "check1ns.h" 35 | #include "wtcalc.h" 36 | 37 | #define TWO26 (1 << 26) 38 | #define NLUP (32) 39 | 40 | /** 41 | * @brief Main entry point for saxpy.
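 *
 * (The MB/s figures count 3 * nbytes of memory traffic per saxpy call:
 * x is read, y is read, and y is written back; see the timing printf in
 * main below.)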
42 | */ 43 | int main(int argc, char *argv[]) 44 | { 45 | int i, n, 46 | iret, 47 | ial; 48 | size_t nbytes; 49 | float a = 2.0f, 50 | *x, *y, 51 | *yhost, 52 | *yaccl, 53 | maxabserr; 54 | struct timespec rt[2]; 55 | double wt; // walltime 56 | 57 | /* 58 | * We need 1 ns time resolution. 59 | */ 60 | check1ns(); 61 | printf("The system supports 1 ns time resolution\n"); 62 | /* 63 | * check the number of accelerators 64 | */ 65 | if (0 == omp_get_num_devices()) { 66 | printf("No accelerator found ... exit\n"); 67 | exit(EXIT_FAILURE); 68 | } 69 | /* 70 | * preparation 71 | */ 72 | n = TWO26; 73 | nbytes = sizeof(float) * n; 74 | iret = 0; 75 | if (NULL == (x = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; 76 | if (NULL == (y = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; 77 | if (NULL == (yhost = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; 78 | if (NULL == (yaccl = (float *) mkl_malloc(nbytes, (16 * 256)))) iret = -1; 79 | if (0 != iret) { 80 | printf("error: memory allocation\n"); 81 | mkl_free(x); mkl_free(y); 82 | mkl_free(yhost); mkl_free(yaccl); 83 | exit(EXIT_FAILURE); 84 | } 85 | #pragma omp parallel for default(none) \ 86 | shared(a, x, y, yhost, yaccl, n) private(i) 87 | for (i = 0; i < n; ++i) { 88 | x[i] = rand() % 32 / 32.0f; 89 | y[i] = rand() % 32 / 32.0f; 90 | yhost[i] = a * x[i] + y[i]; // yhost will be used as reference value 91 | yaccl[i] = 0.0f; 92 | } 93 | printf("total size of x and y is %9.1f MB\n", 2.0 * nbytes / (1 << 20)); 94 | printf("tests are averaged over %2d loops\n", NLUP); 95 | /* 96 | * saxpy on host 97 | */ 98 | for (ial = 0; ial < 2; ++ial) { 99 | /* 100 | * See hsaxpy.c for details: 101 | * 102 | * ial: 103 | * 104 | * 0: naive implementation 105 | * otherwise: saxpy in MKL 106 | */ 107 | memcpy(yaccl, y, nbytes); 108 | wtcalc = -1.0; 109 | // skip 1st run for timing 110 | hsaxpy(n, a, x, yaccl, ial); 111 | // check yaccl 112 | maxabserr = -1.0f; 113 | for (i = 0; i < n; ++i) { 114 | maxabserr = fabsf(yaccl[i] - yhost[i]) > maxabserr? 115 | fabsf(yaccl[i] - yhost[i]) : maxabserr; 116 | } 117 | // skip 2nd run for timing 118 | hsaxpy(n, a, x, yaccl, ial); 119 | // timing : start 120 | wtcalc = 0.0; 121 | clock_gettime(CLOCK_REALTIME, rt + 0); 122 | for (int ilup = 0; ilup < NLUP; ++ilup) { 123 | hsaxpy(n, a, x, yaccl, ial); 124 | } 125 | clock_gettime(CLOCK_REALTIME, rt + 1); 126 | wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 127 | printf("saxpy on host (%d) : %9.1f MB/s %9.1f MB/s maxabserr = %9.1f\n", 128 | ial, NLUP * 3.0 * nbytes / ((1 << 20) * wt), 129 | NLUP * 3.0 * nbytes / ((1 << 20) * wtcalc), maxabserr); 130 | } 131 | /* 132 | * saxpy on accl 133 | */ 134 | for (ial = 1; ial < 9; ++ial) { 135 | /* 136 | * See asaxpy.c for details: 137 | * 138 | * ial: 139 | * 140 | * 0: <<<2^0 , 2^0 >>>, TOO SLOW! not tested 141 | * 1: <<<2^0 , 2^7 >>>, auto scheduling 142 | * 2: <<<2^7 , 2^0 >>>, auto scheduling 143 | * 3: <<<2^7 , 2^7 >>>, auto scheduling 144 | * 4: <<<2^16, 2^10>>>, manual scheduling 145 | * 5: <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling (2^15*2^7*16==2^26) 146 | * 6: <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling 147 | * 7: de-linearize the vector and then collapse the ji-loop. 
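 *    (a sketch of the idea in 7: the length-n vector is viewed as an
 *    m x 16 matrix with m = n / 16, and the resulting loop nest is
 *    collapsed; see asaxpy.c, where m = (n >> 4) is set up for this)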
148 | * otherwise: cublasSaxpy in CUBLAS 149 | */ 150 | memcpy(yaccl, y, nbytes); 151 | wtcalc = -1.0; 152 | // skip 1st run for timing 153 | asaxpy(n, a, x, yaccl, ial); 154 | // check yaccl 155 | maxabserr = -1.0f; 156 | for (i = 0; i < n; ++i) { 157 | maxabserr = fabsf(yaccl[i] - yhost[i]) > maxabserr? 158 | fabsf(yaccl[i] - yhost[i]) : maxabserr; 159 | } 160 | // skip 2nd run for timing 161 | asaxpy(n, a, x, yaccl, ial); 162 | // timing : start 163 | wtcalc = 0.0; 164 | clock_gettime(CLOCK_REALTIME, rt + 0); 165 | for (int ilup = 0; ilup < NLUP; ++ilup) { 166 | asaxpy(n, a, x, yaccl, ial); 167 | } 168 | clock_gettime(CLOCK_REALTIME, rt + 1); 169 | wt=(rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 170 | printf("saxpy on accl (%d) : %9.1f MB/s %9.1f MB/s maxabserr = %9.1f\n", 171 | ial, NLUP * 3.0 * nbytes / ((1 << 20) * wt), 172 | NLUP * 3.0 * nbytes / ((1 << 20) * wtcalc), maxabserr); 173 | } 174 | /* 175 | * release memory 176 | */ 177 | mkl_free(x); mkl_free(y); 178 | mkl_free(yhost); mkl_free(yaccl); 179 | return 0; 180 | } 181 | -------------------------------------------------------------------------------- /00_build_OpenMP_offload/Clang/build_clang_offload.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Build Clang/LLVM with OpenMP Support for Nvidia GPU Offloading 3 | author: Xin Wu (PC²) 4 | date: 28.01.2020 5 | --- 6 | 7 | # Check Nvidia GPU 8 | 9 | The build procedure was carried out on a Tesla node of OCuLUS at PC². It 10 | features an Nvidia Tesla K20X GPU. Thus it's necessary to check the Tesla K20X 11 | GPU on the compute node before building Clang with OpenMP support for offloading 12 | computation to the Nvidia GPU. 13 | 14 | The relevant scripts and log files can be found in `00_check_gpu`. 15 | 16 | `tesla.sh` is a driver script and should be submitted with `ccsalloc`: 17 | 18 | ```bash 19 | ccsalloc tesla.sh 20 | ``` 21 | 22 | `realscript.sh` does the real job and the output can be found in `tesla.log`. 23 | 24 | # Build Clang and Necessary Toolchains 25 | 26 | The necessary toolchains for building Clang need to be built first. For this 27 | purpose we have built GCC 8.3.0,[^gcc830] binutils, autoconf, automake, OpenSSL, 28 | CMake, and ncurses. 29 | 30 | [^gcc830]: At the time of writing, GCC 9.2.0 is not supported for building Clang 31 | with OpenMP offloading to GPU. 32 | 33 | After the toolchains have been built, Clang can be built with GCC 8.3.0 by using 34 | the following script: 35 | 36 | ```bash 37 | pkgname="llvmorg-9.0.1" 38 | curl -L -O https://github.com/llvm/llvm-project/archive/${pkgname}.tar.gz 39 | tar xf ${pkgname}.tar.gz 40 | BUILDIR="GCC" 41 | rm -fr $BUILDIR 42 | mkdir -p $BUILDIR 43 | cd $BUILDIR 44 | cmake \ 45 | -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;libcxx;libcxxabi;lld;openmp" \ 46 | -DCMAKE_PREFIX_PATH="${TOOLCHAINS}" \ 47 | -DCMAKE_BUILD_TYPE=Release \ 48 | -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \ 49 | -DCMAKE_INSTALL_PREFIX=${DESTDIR} \ 50 | -DCLANG_OPENMP_NVPTX_DEFAULT_ARCH=sm_61 \ 51 | -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=35,37,50,52,60,61,70,75 \ 52 | -DCMAKE_C_COMPILER=gcc \ 53 | -DCMAKE_CXX_COMPILER=g++ \ 54 | -G "Unix Makefiles" ../llvm-project-${pkgname}/llvm 2>&1 | tee ${pkgname}.${BUILDIR}.cmak.logfile 55 | make -j 16 2>&1 | tee ${pkgname}.${BUILDIR}.make.logfile 56 | make install 2>&1 | tee ${pkgname}.${BUILDIR}.inst.logfile 57 | cd ..
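# (Key options above: LLVM_ENABLE_PROJECTS pulls in clang and the OpenMP
# runtime, LLVM_TARGETS_TO_BUILD="X86;NVPTX" enables PTX code generation,
# and LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES lists the GPU compute
# capabilities the offload runtime is built for.)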
58 | ``` 59 | 60 | # Bootstrap Clang with `libc++` 61 | 62 | We need to bootstrap Clang for OpenMP offloading. The following script 63 | bootstraps Clang with its own `libc++`: 64 | 65 | ```bash 66 | pkgname="llvmorg-9.0.1" 67 | curl -L -O https://github.com/llvm/llvm-project/archive/${pkgname}.tar.gz 68 | tar xf ${pkgname}.tar.gz 69 | BUILDIR="LIBCXX" 70 | rm -fr $BUILDIR 71 | mkdir -p $BUILDIR 72 | cd $BUILDIR 73 | cmake \ 74 | -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;libcxx;libcxxabi;lld;openmp" \ 75 | -DCMAKE_PREFIX_PATH="${TOOLCHAINS}" \ 76 | -DCMAKE_BUILD_TYPE=Release \ 77 | -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \ 78 | -DCMAKE_INSTALL_PREFIX=${DESTDIR} \ 79 | -DCLANG_OPENMP_NVPTX_DEFAULT_ARCH=sm_61 \ 80 | -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=35,37,50,52,60,61,70,75 \ 81 | -DCMAKE_C_COMPILER=clang \ 82 | -DCMAKE_CXX_COMPILER=clang++ \ 83 | -DCMAKE_CXX_FLAGS="-stdlib=libc++" \ 84 | -DCMAKE_CXX_LINK_FLAGS="-stdlib=libc++" \ 85 | -G "Unix Makefiles" ../llvm-project-${pkgname}/llvm 2>&1 | tee ${pkgname}.${BUILDIR}.cmak.logfile 86 | make -j 16 2>&1 | tee ${pkgname}.${BUILDIR}.make.logfile 87 | make install 2>&1 | tee ${pkgname}.${BUILDIR}.inst.logfile 88 | cd .. 89 | ``` 90 | 91 | To access this version of Clang on OCuLUS: 92 | 93 | ```bash 94 | module load clang/9.0.1_BS_libcxx_CUDA10.1 95 | ``` 96 | 97 | # Bootstrap Clang with `libstdc++` 98 | 99 | Clang can also be bootstrapped with GNU's `libstdc++` with the following script: 100 | 101 | ```bash 102 | pkgname="llvmorg-9.0.1" 103 | curl -L -O https://github.com/llvm/llvm-project/archive/${pkgname}.tar.gz 104 | tar xf ${pkgname}.tar.gz 105 | BUILDIR="LIBSTDCXX" 106 | rm -fr $BUILDIR 107 | mkdir -p $BUILDIR 108 | cd $BUILDIR 109 | cmake \ 110 | -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;libcxx;libcxxabi;lld;openmp" \ 111 | -DCMAKE_PREFIX_PATH="${TOOLCHAINS}" \ 112 | -DCMAKE_BUILD_TYPE=Release \ 113 | -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \ 114 | -DCMAKE_INSTALL_PREFIX=${DESTDIR} \ 115 | -DCLANG_OPENMP_NVPTX_DEFAULT_ARCH=sm_61 \ 116 | -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=35,37,50,52,60,61,70,75 \ 117 | -DCMAKE_C_COMPILER=clang \ 118 | -DCMAKE_CXX_COMPILER=clang++ \ 119 | -G "Unix Makefiles" ../llvm-project-${pkgname}/llvm 2>&1 | tee ${pkgname}.${BUILDIR}.cmak.logfile 120 | make -j 16 2>&1 | tee ${pkgname}.${BUILDIR}.make.logfile 121 | make install 2>&1 | tee ${pkgname}.${BUILDIR}.inst.logfile 122 | cd .. 123 | ``` 124 | 125 | To access this version of Clang on OCuLUS: 126 | 127 | ```bash 128 | module load clang/9.0.1_BS_libstdcxx_CUDA10.1 129 | ``` 130 | 131 | -------------------------------------------------------------------------------- /09_matAdd/src/matAddAB.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matAddAB.c 3 | * 4 | * @brief Function definition for matrix addition (A += B) in single-precision. 5 | * 6 | * This source file contains function definition for matrix addition (A += B) 7 | * in single-precision. 
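 * The global wtcalc (presumably declared extern in matAddAB.h) accumulates
 * kernel-only walltime: the caller sets it to 0.0 to enable timing or to a
 * negative value to disable it, and each call adds its measured interval.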
8 | * 9 | * @author Xin Wu (PC²) 10 | * @date 07.02.2020 11 | * @copyright CC BY-SA 2.0 12 | */ 13 | 14 | #include <stdio.h> 15 | #include <stdlib.h> 16 | #include <time.h> 17 | #ifdef _OPENMP 18 | #include <omp.h> 19 | #endif 20 | #include <cuda_runtime.h> 21 | #include "cublas_v2.h" 22 | #include "matAddAB.h" 23 | 24 | #define NTHRDS7 (1 << 0x7) /* 2^{7} */ 25 | #define NTHRDS8 (1 << 0x8) /* 2^{8} */ 26 | #define NTHRDS9 (1 << 0x9) /* 2^{9} */ 27 | 28 | #define LTEAMSD (1 << 0xD) /* 2^{13} */ 29 | #define LTEAMSE (1 << 0xE) /* 2^{14} */ 30 | #define LTEAMSF (1 << 0xF) /* 2^{15} */ 31 | 32 | #define BLKROW (512) /* 2x number of threads in each team */ 33 | 34 | double wtcalc; 35 | 36 | void matAddAB_accl(float *a, 37 | float *b, 38 | int n, 39 | int ial) 40 | { 41 | cublasHandle_t handle; 42 | float alfa = 1.0f, 43 | *a_dev = NULL, 44 | *b_dev = NULL; 45 | struct timespec rt[2]; 46 | 47 | switch (ial) { 48 | case 0: 49 | /* 50 | * - ij-loop 51 | * - 2^9 threads per team and 2^3 teams 52 | * - coalesced memory access 53 | */ 54 | #pragma omp target data device(0) \ 55 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 56 | { 57 | clock_gettime(CLOCK_REALTIME, rt + 0); 58 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 59 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 60 | default(none) shared(a, b, n) 61 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 62 | dist_schedule(static, NTHRDS9) \ 63 | default(none) shared(a, b, n) 64 | for (int i = 0; i < n; ++i) { /* parallel */ 65 | for (int j = 0; j < n; ++j) { /* sequential */ 66 | a[j * n + i] += b[j * n + i]; 67 | } /* end j-loop */ 68 | } /* end i-loop */ 69 | clock_gettime(CLOCK_REALTIME, rt + 1); 70 | } 71 | break; 72 | case 1: 73 | /* 74 | * - ji-loop 75 | * - 2^9 threads per team and 2^3 teams 76 | * - n-stride memory read for a and b 77 | * - n-stride memory write for a 78 | */ 79 | #pragma omp target data device(0) \ 80 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 81 | { 82 | clock_gettime(CLOCK_REALTIME, rt + 0); 83 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 84 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 85 | default(none) shared(a, b, n) 86 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 87 | dist_schedule(static, NTHRDS9) \ 88 | default(none) shared(a, b, n) 89 | for (int j = 0; j < n; ++j) { /* parallel */ 90 | for (int i = 0; i < n; ++i) { /* sequential */ 91 | a[j * n + i] += b[j * n + i]; 92 | } /* end i-loop */ 93 | } /* end j-loop */ 94 | clock_gettime(CLOCK_REALTIME, rt + 1); 95 | } 96 | break; 97 | case 2: 98 | /* 99 | * - ij-loop 100 | * - 2^9 threads per team and 2^f teams 101 | * - collapse(2) 102 | * - n-stride memory read for a and b 103 | * - n-stride memory write for a 104 | */ 105 | #pragma omp target data device(0) \ 106 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 107 | { 108 | clock_gettime(CLOCK_REALTIME, rt + 0); 109 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 110 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 111 | default(none) shared(a, b, n) 112 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 113 | dist_schedule(static, NTHRDS9) collapse(2) \ 114 | default(none) shared(a, b, n) 115 | for (int i = 0; i < n; ++i) { 116 | for (int j = 0; j < n; ++j) { 117 | a[j * n + i] += b[j * n + i]; 118 | } /* end j-loop */ 119 | } /* end i-loop */ 120 | clock_gettime(CLOCK_REALTIME, rt + 1); 121 | } 122 | break; 123 | case 3: 124 | /* 125 | * - ji-loop 126 | * - 2^9 threads per team and 2^f teams 127 | * - collapse(2) 128 | * 
- coalesced memory access 129 | */ 130 | #pragma omp target data device(0) \ 131 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 132 | { 133 | clock_gettime(CLOCK_REALTIME, rt + 0); 134 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 135 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 136 | default(none) shared(a, b, n) 137 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 138 | dist_schedule(static, NTHRDS9) collapse(2) \ 139 | default(none) shared(a, b, n) 140 | for (int j = 0; j < n; ++j) { 141 | for (int i = 0; i < n; ++i) { 142 | a[j * n + i] += b[j * n + i]; 143 | } /* end i-loop */ 144 | } /* end j-loop */ 145 | clock_gettime(CLOCK_REALTIME, rt + 1); 146 | } 147 | break; 148 | case 4: 149 | /* 150 | * - ji-loop 151 | * - 2^8 threads per team and 2^f teams 152 | * - collapse(3) 153 | * - 2x i-loop unrolling (stride of 2^8 rows) 154 | */ 155 | #pragma omp target data device(0) \ 156 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 157 | { 158 | clock_gettime(CLOCK_REALTIME, rt + 0); 159 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS8) \ 160 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 161 | default(none) shared(a, b, n) 162 | #pragma omp distribute parallel for num_threads(NTHRDS8) \ 163 | dist_schedule(static, NTHRDS8) collapse(3) \ 164 | default(none) shared(a, b, n) 165 | for (int j = 0; j < n; ++j) { 166 | for (int iblk = 0; iblk < n / BLKROW; ++iblk) { 167 | for (int i = 0; i < NTHRDS8; ++i) { /* 2x unrolling */ 168 | a[j * n + iblk * BLKROW + i ] += 169 | b[j * n + iblk * BLKROW + i ]; 170 | a[j * n + iblk * BLKROW + i + NTHRDS8] += 171 | b[j * n + iblk * BLKROW + i + NTHRDS8]; 172 | } /* end i-loop */ 173 | } /* end iblk-loop */ 174 | } /* end j-loop */ 175 | clock_gettime(CLOCK_REALTIME, rt + 1); 176 | } 177 | break; 178 | case 5: 179 | /* 180 | * - ji-loop 181 | * - 2^8 threads per team and 2^f teams 182 | * - collapse(2) 183 | * - 2x i-loop unrolling (stride of n/2 rows) 184 | */ 185 | #pragma omp target data device(0) \ 186 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 187 | { 188 | clock_gettime(CLOCK_REALTIME, rt + 0); 189 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS8) \ 190 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 191 | default(none) shared(a, b, n) 192 | #pragma omp distribute parallel for num_threads(NTHRDS8) \ 193 | dist_schedule(static, NTHRDS8) collapse(2) \ 194 | default(none) shared(a, b, n) 195 | for (int j = 0; j < n; ++j) { 196 | for (int i = 0; i < (n >> 1); ++i) { /* 2x unrolling */ 197 | a[j * n + i ] += 198 | b[j * n + i ]; 199 | a[j * n + i + (n >> 1)] += 200 | b[j * n + i + (n >> 1)]; 201 | } /* end i-loop */ 202 | } /* end j-loop */ 203 | clock_gettime(CLOCK_REALTIME, rt + 1); 204 | } 205 | break; 206 | case 6: 207 | /* 208 | * - ji-loop 209 | * - 2^8 threads per team and 2^14 teams 210 | * - collapse(3) 211 | * - 2x j-loop unrolling (stride of 1 col ) 212 | * - 2x i-loop unrolling (stride of 2^8 rows) 213 | */ 214 | #pragma omp target data device(0) \ 215 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) 216 | { 217 | clock_gettime(CLOCK_REALTIME, rt + 0); 218 | #pragma omp target teams device(0) num_teams(LTEAMSE) thread_limit(NTHRDS8) \ 219 | map(to:n, b[0:n * n]) map(tofrom:a[0:n * n]) \ 220 | default(none) shared(a, b, n) 221 | #pragma omp distribute parallel for num_threads(NTHRDS8) \ 222 | dist_schedule(static, NTHRDS8) collapse(3) \ 223 | default(none) shared(a, b, n) 224 | for (int j = 0; j < n; j += 2) { /* 2x unrolling */ 225 | for 
(int iblk = 0; iblk < n / BLKROW; ++iblk) { 226 | for (int i = 0; i < NTHRDS8; ++i) { /* 2x unrolling */ 227 | a[ j * n + iblk * BLKROW + i ] += 228 | b[ j * n + iblk * BLKROW + i ]; 229 | a[ j * n + iblk * BLKROW + i + NTHRDS8] += 230 | b[ j * n + iblk * BLKROW + i + NTHRDS8]; 231 | a[(j + 1) * n + iblk * BLKROW + i ] += 232 | b[(j + 1) * n + iblk * BLKROW + i ]; 233 | a[(j + 1) * n + iblk * BLKROW + i + NTHRDS8] += 234 | b[(j + 1) * n + iblk * BLKROW + i + NTHRDS8]; 235 | } /* end i-loop */ 236 | } /* end iblk-loop */ 237 | } /* end j-loop */ 238 | clock_gettime(CLOCK_REALTIME, rt + 1); 239 | } 240 | break; 241 | default: 242 | /* 243 | * cublasSaxpy in CUBLAS 244 | */ 245 | if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle)) { 246 | printf("error: initialization (CUBLAS)\n"); 247 | cublasDestroy(handle); 248 | exit(EXIT_FAILURE); 249 | } 250 | if (cudaSuccess != cudaMalloc((void **) &a_dev, sizeof(*a) * n * n) || 251 | cudaSuccess != cudaMalloc((void **) &b_dev, sizeof(*b) * n * n)) { 252 | printf("error: memory allocation (CUDA)\n"); 253 | cudaFree(a_dev); cudaFree(b_dev); 254 | cublasDestroy(handle); 255 | exit(EXIT_FAILURE); 256 | } 257 | if (CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*a), a, n, a_dev, n) || 258 | CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*b), b, n, b_dev, n)) { 259 | printf("error: host --> accl (CUBLAS)\n"); 260 | cudaFree(a_dev); cudaFree(b_dev); 261 | cublasDestroy(handle); 262 | exit(EXIT_FAILURE); 263 | } 264 | clock_gettime(CLOCK_REALTIME, rt + 0); 265 | if (CUBLAS_STATUS_SUCCESS != cublasSaxpy(handle, n * n, &alfa, b_dev, 1, a_dev, 1)) { 266 | printf("error: cublasSaxpy (CUBLAS)\n"); 267 | cudaFree(a_dev); cudaFree(b_dev); 268 | cublasDestroy(handle); 269 | exit(EXIT_FAILURE); 270 | } 271 | if (cudaSuccess != cudaDeviceSynchronize()) { 272 | printf("error: device synchronization (CUDA)\n"); 273 | cudaFree(a_dev); cudaFree(b_dev); 274 | cublasDestroy(handle); 275 | exit(EXIT_FAILURE); 276 | } 277 | clock_gettime(CLOCK_REALTIME, rt + 1); 278 | if (CUBLAS_STATUS_SUCCESS != cublasGetMatrix(n, n, sizeof(*a), a_dev, n, a, n)) { 279 | printf("error: accl --> host (CUBLAS)\n"); 280 | cudaFree(a_dev); cudaFree(b_dev); 281 | cublasDestroy(handle); 282 | exit(EXIT_FAILURE); 283 | } 284 | cudaFree(a_dev); cudaFree(b_dev); 285 | cublasDestroy(handle); 286 | break; 287 | } /* end switch (ial) */ 288 | if (wtcalc >= 0.0) { 289 | wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 290 | } 291 | } 292 | -------------------------------------------------------------------------------- /05_saxpy/src/asaxpy.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file asaxpy.c 3 | * @brief Function definition for performing the \c saxpy operation on accelerator. 4 | * 5 | * This source file contains function definition for the \c saxpy operation, 6 | * which is defined as: 7 | * 8 | * y := a * x + y 9 | * 10 | * where: 11 | * 12 | * - a is a scalar. 13 | * - x and y are single-precision vectors each with n elements. 
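 * (The <<<t, b>>> notation in the per-case comments below mirrors CUDA's
 * launch syntax: t teams of b threads each, expressed in OpenMP via
 * num_teams(t) and num_threads(b).)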
14 | * 15 | * @author Xin Wu (PC²) 16 | * @date 05.04.2020 17 | * @copyright CC BY-SA 2.0 18 | */ 19 | 20 | #include <stdio.h> 21 | #include <stdlib.h> 22 | #include <time.h> 23 | #ifdef _OPENMP 24 | #include <omp.h> 25 | #endif 26 | #include <cuda_runtime.h> 27 | #include "cublas_v2.h" 28 | #include "wtcalc.h" 29 | #include "asaxpy.h" 30 | 31 | void asaxpy(const int n, 32 | const float a, 33 | const float *x, 34 | float *y, 35 | const int ial) 36 | { 37 | cublasHandle_t handle; 38 | float alfa = a, 39 | *x_dev = NULL, 40 | *y_dev = NULL; 41 | struct timespec rt[2]; 42 | int m = (n >> 4); 43 | 44 | switch (ial) { 45 | case 0: 46 | /* 47 | * - <<<2^0 , 2^0 >>>, TOO SLOW! not tested 48 | */ 49 | #pragma omp target data device(0) \ 50 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) 51 | { 52 | clock_gettime(CLOCK_REALTIME, rt + 0); 53 | #pragma omp target teams device(0) num_teams(1) \ 54 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ 55 | default(none) shared(a, n, x, y) 56 | #pragma omp distribute parallel for num_threads(1) \ 57 | dist_schedule(static, 1) \ 58 | default(none) shared(a, n, x, y) 59 | for (int i = 0; i < n; ++i) { 60 | y[i] = a * x[i] + y[i]; 61 | } 62 | clock_gettime(CLOCK_REALTIME, rt + 1); 63 | } 64 | break; 65 | case 1: 66 | /* 67 | * - <<<2^0 , 2^7 >>>, auto scheduling 68 | */ 69 | #pragma omp target data device(0) \ 70 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) 71 | { 72 | clock_gettime(CLOCK_REALTIME, rt + 0); 73 | #pragma omp target teams device(0) num_teams(1) \ 74 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ 75 | default(none) shared(a, n, x, y) 76 | #pragma omp distribute parallel for num_threads(128) \ 77 | dist_schedule(static, 128) \ 78 | default(none) shared(a, n, x, y) 79 | for (int i = 0; i < n; ++i) { 80 | y[i] = a * x[i] + y[i]; 81 | } 82 | clock_gettime(CLOCK_REALTIME, rt + 1); 83 | } 84 | break; 85 | case 2: 86 | /* 87 | * - <<<2^7 , 2^0 >>>, auto scheduling 88 | */ 89 | #pragma omp target data device(0) \ 90 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) 91 | { 92 | clock_gettime(CLOCK_REALTIME, rt + 0); 93 | #pragma omp target teams device(0) num_teams(128) \ 94 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ 95 | default(none) shared(a, n, x, y) 96 | #pragma omp distribute parallel for num_threads(1) \ 97 | dist_schedule(static, 1) \ 98 | default(none) shared(a, n, x, y) 99 | for (int i = 0; i < n; ++i) { 100 | y[i] = a * x[i] + y[i]; 101 | } 102 | clock_gettime(CLOCK_REALTIME, rt + 1); 103 | } 104 | break; 105 | case 3: 106 | /* 107 | * - <<<2^7 , 2^7 >>>, auto scheduling 108 | */ 109 | #pragma omp target data device(0) \ 110 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) 111 | { 112 | clock_gettime(CLOCK_REALTIME, rt + 0); 113 | #pragma omp target teams device(0) num_teams(128) \ 114 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ 115 | default(none) shared(a, n, x, y) 116 | #pragma omp distribute parallel for num_threads(128) \ 117 | dist_schedule(static, 128) \ 118 | default(none) shared(a, n, x, y) 119 | for (int i = 0; i < n; ++i) { 120 | y[i] = a * x[i] + y[i]; 121 | } 122 | clock_gettime(CLOCK_REALTIME, rt + 1); 123 | } 124 | break; 125 | case 4: 126 | /* 127 | * - <<<2^16, 2^10>>>, manual scheduling 128 | */ 129 | #pragma omp target data device(0) \ 130 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) 131 | { 132 | clock_gettime(CLOCK_REALTIME, rt + 0); 133 | #pragma omp target teams device(0) num_teams(65536) \ 134 | map(to:a, n, x[0:n]) map(tofrom:y[0:n]) \ 135 | default(none) shared(a, n, x, y) 136 | #pragma omp distribute parallel for num_threads(1024) \ 137 | dist_schedule(static, 1024) \ 138 | default(none) shared(a, n, x, y)
139 | for (int i = 0; i < n; ++i) { 140 | y[i] = a * x[i] + y[i]; 141 | } 142 | clock_gettime(CLOCK_REALTIME, rt + 1); 143 | } 144 | break; 145 | case 5: 146 | /* 147 | * - <<<2^15, 2^7 >>>, manual scheduling, 16x loop unrolling (2^15*2^7*16==2^26) 148 | */ 149 | #pragma omp target data device(0) \ 150 | map(to:a, m, x[0:n]) map(tofrom:y[0:n]) 151 | { 152 | clock_gettime(CLOCK_REALTIME, rt + 0); 153 | #pragma omp target teams device(0) num_teams(32768) \ 154 | map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \ 155 | default(none) shared(a, m, x, y) 156 | #pragma omp distribute parallel for num_threads(128) \ 157 | dist_schedule(static, 128) \ 158 | default(none) shared(a, m, x, y) 159 | for (int i = 0; i < m; ++i) { 160 | y[i ] = a * x[i ] + y[i ]; 161 | y[i + m] = a * x[i + m] + y[i + m]; 162 | y[i + 0x2 * m] = a * x[i + 0x2 * m] + y[i + 0x2 * m]; 163 | y[i + 0x3 * m] = a * x[i + 0x3 * m] + y[i + 0x3 * m]; 164 | y[i + 0x4 * m] = a * x[i + 0x4 * m] + y[i + 0x4 * m]; 165 | y[i + 0x5 * m] = a * x[i + 0x5 * m] + y[i + 0x5 * m]; 166 | y[i + 0x6 * m] = a * x[i + 0x6 * m] + y[i + 0x6 * m]; 167 | y[i + 0x7 * m] = a * x[i + 0x7 * m] + y[i + 0x7 * m]; 168 | y[i + 0x8 * m] = a * x[i + 0x8 * m] + y[i + 0x8 * m]; 169 | y[i + 0x9 * m] = a * x[i + 0x9 * m] + y[i + 0x9 * m]; 170 | y[i + 0xa * m] = a * x[i + 0xa * m] + y[i + 0xa * m]; 171 | y[i + 0xb * m] = a * x[i + 0xb * m] + y[i + 0xb * m]; 172 | y[i + 0xc * m] = a * x[i + 0xc * m] + y[i + 0xc * m]; 173 | y[i + 0xd * m] = a * x[i + 0xd * m] + y[i + 0xd * m]; 174 | y[i + 0xe * m] = a * x[i + 0xe * m] + y[i + 0xe * m]; 175 | y[i + 0xf * m] = a * x[i + 0xf * m] + y[i + 0xf * m]; 176 | } 177 | clock_gettime(CLOCK_REALTIME, rt + 1); 178 | } 179 | break; 180 | case 6: 181 | /* 182 | * - <<<2^12, 2^7 >>>, auto scheduling, 16x loop unrolling 183 | */ 184 | #pragma omp target data device(0) \ 185 | map(to:a, m, x[0:n]) map(tofrom:y[0:n]) 186 | { 187 | clock_gettime(CLOCK_REALTIME, rt + 0); 188 | #pragma omp target teams device(0) num_teams(4096) \ 189 | map(to:a, m, x[0:n]) map(tofrom:y[0:n]) \ 190 | default(none) shared(a, m, x, y) 191 | #pragma omp distribute parallel for num_threads(128) \ 192 | dist_schedule(static, 128) \ 193 | default(none) shared(a, m, x, y) 194 | for (int i = 0; i < m; ++i) { 195 | y[i ] = a * x[i ] + y[i ]; 196 | y[i + m] = a * x[i + m] + y[i + m]; 197 | y[i + 0x2 * m] = a * x[i + 0x2 * m] + y[i + 0x2 * m]; 198 | y[i + 0x3 * m] = a * x[i + 0x3 * m] + y[i + 0x3 * m]; 199 | y[i + 0x4 * m] = a * x[i + 0x4 * m] + y[i + 0x4 * m]; 200 | y[i + 0x5 * m] = a * x[i + 0x5 * m] + y[i + 0x5 * m]; 201 | y[i + 0x6 * m] = a * x[i + 0x6 * m] + y[i + 0x6 * m]; 202 | y[i + 0x7 * m] = a * x[i + 0x7 * m] + y[i + 0x7 * m]; 203 | y[i + 0x8 * m] = a * x[i + 0x8 * m] + y[i + 0x8 * m]; 204 | y[i + 0x9 * m] = a * x[i + 0x9 * m] + y[i + 0x9 * m]; 205 | y[i + 0xa * m] = a * x[i + 0xa * m] + y[i + 0xa * m]; 206 | y[i + 0xb * m] = a * x[i + 0xb * m] + y[i + 0xb * m]; 207 | y[i + 0xc * m] = a * x[i + 0xc * m] + y[i + 0xc * m]; 208 | y[i + 0xd * m] = a * x[i + 0xd * m] + y[i + 0xd * m]; 209 | y[i + 0xe * m] = a * x[i + 0xe * m] + y[i + 0xe * m]; 210 | y[i + 0xf * m] = a * x[i + 0xf * m] + y[i + 0xf * m]; 211 | } 212 | clock_gettime(CLOCK_REALTIME, rt + 1); 213 | } 214 | break; 215 | case 7: 216 | /* 217 | * - <<<2^16, 2^9>>>: 218 | * * de-linearize the vector (convert the vector to matrix) 219 | * * collapse the ji-loop 220 | * * 2x i-loop unrolling 221 | */ 222 | #pragma omp target data device(0) \ 223 | map(to:a, x[0:n]) map(tofrom:y[0:n]) 224 | { 225 | 
clock_gettime(CLOCK_REALTIME, rt + 0); 226 | #pragma omp target teams device(0) num_teams(65536) thread_limit(512) \ 227 | map(to:a, x[0:n]) map(tofrom:y[0:n]) \ 228 | default(none) shared(a, x, y) 229 | #pragma omp distribute parallel for num_threads(512) \ 230 | dist_schedule(static, 512) collapse(2) \ 231 | default(none) shared(a, x, y) 232 | for (int j = 0; j < 65536; ++j) { 233 | for (int i = 0; i < 512; ++i) { /* 2x i-loop unrolling */ 234 | y[j * 1024 + i ] += a * x[j * 1024 + i ]; 235 | y[j * 1024 + i + 512] += a * x[j * 1024 + i + 512]; 236 | } 237 | } 238 | clock_gettime(CLOCK_REALTIME, rt + 1); 239 | } 240 | break; 241 | default: 242 | /* 243 | * cublasSaxpy in CUBLAS 244 | */ 245 | if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle)) { 246 | printf("error: initialization (CUBLAS)\n"); 247 | cublasDestroy(handle); 248 | exit(EXIT_FAILURE); 249 | } 250 | if (cudaSuccess != cudaMalloc((void **) &x_dev, sizeof(*x) * n) || 251 | cudaSuccess != cudaMalloc((void **) &y_dev, sizeof(*y) * n)) { 252 | printf("error: memory allocation (CUDA)\n"); 253 | cudaFree(x_dev); cudaFree(y_dev); 254 | cublasDestroy(handle); 255 | exit(EXIT_FAILURE); 256 | } 257 | if (CUBLAS_STATUS_SUCCESS != cublasSetVector(n, sizeof(*x), x, 1, x_dev, 1) || 258 | CUBLAS_STATUS_SUCCESS != cublasSetVector(n, sizeof(*y), y, 1, y_dev, 1)) { 259 | printf("error: host --> accl (CUBLAS)\n"); 260 | cudaFree(x_dev); cudaFree(y_dev); 261 | cublasDestroy(handle); 262 | exit(EXIT_FAILURE); 263 | } 264 | clock_gettime(CLOCK_REALTIME, rt + 0); 265 | if (CUBLAS_STATUS_SUCCESS != cublasSaxpy(handle, n, &alfa, x_dev, 1, y_dev, 1)) { 266 | printf("error: cublasSaxpy (CUBLAS)\n"); 267 | cudaFree(x_dev); cudaFree(y_dev); 268 | cublasDestroy(handle); 269 | exit(EXIT_FAILURE); 270 | } 271 | if (cudaSuccess != cudaDeviceSynchronize()) { 272 | printf("error: device synchronization (CUDA)\n"); 273 | cudaFree(x_dev); cudaFree(y_dev); 274 | cublasDestroy(handle); 275 | exit(EXIT_FAILURE); 276 | } 277 | clock_gettime(CLOCK_REALTIME, rt + 1); 278 | if (CUBLAS_STATUS_SUCCESS != cublasGetVector(n, sizeof(*y), y_dev, 1, y, 1)) { 279 | printf("error: accl --> host (CUBLAS)\n"); 280 | cudaFree(x_dev); cudaFree(y_dev); 281 | cublasDestroy(handle); 282 | exit(EXIT_FAILURE); 283 | } 284 | cudaFree(x_dev); cudaFree(y_dev); 285 | cublasDestroy(handle); 286 | break; 287 | } /* end switch (ial) */ 288 | if (wtcalc >= 0.0) { 289 | wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 290 | } 291 | } 292 | -------------------------------------------------------------------------------- /08_distThreads/src/gpuThreads.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file gpuThreads.c 3 | * @brief Function definition for organizing GPU threads. 4 | * 5 | * This source file contains function definition for organizing GPU threads. 6 | * 7 | * thread_limit for the teams construct is omitted for clarity. 8 | * 9 | * @author Xin Wu (PC²) 10 | * @date 12.03.2020 11 | * @copyright CC BY-SA 2.0 12 | */ 13 | 14 | #include <stdio.h> 15 | #include <stdlib.h> 16 | #ifdef _OPENMP 17 | #include <omp.h> 18 | #endif 19 | #include "gpuThreads.h" 20 | 21 | typedef struct League { 22 | int itd; // index of a thread 23 | int ntd; // number of threads in a team 24 | int itm; // index of a team 25 | int ltm; // number of teams in a league 26 | } League; 27 | 28 | static void initLeague(League *league, 29 | int ncol, 30 | int nrow) 31 | /** 32 | * @brief Initialize a league of GPU threads.
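 * (Illustration added for clarity: the element in row irow and column icol of the league matrix is stored column-major at league[icol * nrow + irow], which is also how gpuThreads() prints it at the end of this file.)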
33 | * 34 | * Every element in a league is initialized as -1. 35 | * 36 | * @param league A league of GPU threads. 37 | * @param ncol Number of columns in a league. 38 | * @param nrow Number of rows in a league. 39 | * 40 | * @return \c void. 41 | */ 42 | { 43 | int icol, 44 | irow; 45 | 46 | for (icol = 0; icol < ncol; ++icol) { 47 | for (irow = 0; irow < nrow; ++irow) { 48 | league[icol * nrow + irow].itd = 49 | league[icol * nrow + irow].ntd = 50 | league[icol * nrow + irow].itm = 51 | league[icol * nrow + irow].ltm = -1; 52 | } 53 | } 54 | } 55 | 56 | void gpuThreads(int i) 57 | { 58 | League *league; 59 | int icol, 60 | irow, 61 | ncol, 62 | nrow; 63 | int lteams, 64 | nthrds; 65 | int wblk; /* width of unrolled loop block */ 66 | 67 | /* 68 | * Initialize and assign GPU threads 69 | */ 70 | switch (i) 71 | { 72 | case 0: 73 | /* 74 | * 1. Dim of matrix league : 3 x 5 75 | * 2. Dim of GPU threads : 3 threads/team 76 | * 5 teams 77 | * 3. All GPU threads run thru this code block. 78 | * `distribute` is not needed, because there is no for-loop. 79 | * 4. Each GPU thread fills the corresponding element. 80 | */ 81 | ncol = 5; 82 | nrow = 3; 83 | lteams = 5; 84 | nthrds = 3; 85 | league = (League *) malloc(sizeof(League) * ncol * nrow); 86 | initLeague(league, ncol, nrow); 87 | #pragma omp target teams device(0) num_teams(lteams) \ 88 | map(to: nrow) map(tofrom:league[0:nrow * ncol]) \ 89 | default(none) shared(nrow, lteams, nthrds, league) 90 | #pragma omp parallel num_threads(nthrds) \ 91 | default(none) shared(nrow, lteams, nthrds, league) 92 | { 93 | int itd, 94 | itm; 95 | itd = omp_get_thread_num(); 96 | itm = omp_get_team_num(); 97 | league[itm * nrow + itd].itd = itd; 98 | league[itm * nrow + itd].ntd = omp_get_num_threads(); 99 | league[itm * nrow + itd].itm = itm; 100 | league[itm * nrow + itd].ltm = omp_get_num_teams(); 101 | } 102 | break; 103 | case 1: 104 | /* 105 | * 1. Dim of matrix league : 3 x 5 106 | * 2. Dim of GPU threads : 3 threads/team 107 | * 5 teams 108 | * 3. Incorrect nested loop implementation. 109 | * 4. The number of teams equals the number of icol-loop iterations. 110 | * 5. Only one thread in each team will run thru the irow-loop. 111 | * 6. Other threads in each team are idle. 112 | */ 113 | ncol = 5; 114 | nrow = 3; 115 | lteams = 5; 116 | nthrds = 3; 117 | league = (League *) malloc(sizeof(League) * ncol * nrow); 118 | initLeague(league, ncol, nrow); 119 | #pragma omp target teams device(0) num_teams(lteams) \ 120 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 121 | default(none) shared(ncol, nrow, lteams, nthrds, league) 122 | #pragma omp distribute parallel for num_threads(nthrds) \ 123 | dist_schedule(static) \ 124 | default(none) shared(ncol, nrow, lteams, nthrds, league) 125 | for (int icol = 0; icol < ncol; ++icol) { 126 | for (int irow = 0; irow < nrow; ++irow) { 127 | league[icol * nrow + irow].itd = omp_get_thread_num(); 128 | league[icol * nrow + irow].ntd = omp_get_num_threads(); 129 | league[icol * nrow + irow].itm = omp_get_team_num(); 130 | league[icol * nrow + irow].ltm = omp_get_num_teams(); 131 | } 132 | } 133 | break; 134 | case 2: 135 | /* 136 | * 1. Dim of matrix league : 3 x 5 137 | * 2. Dim of GPU threads : 3 threads/team 138 | * 5 teams 139 | * 3. The previous icol- and irow-loops are linearized manually. 140 | * 4. The total number of GPU threads equals the number of iterations in 141 | * the linearized loop. 142 | * 5. All GPU threads will be distributed and fill the matrix league. 
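 * (Note added for illustration: with nrow = 3 the manual linearization is invertible, e.g. idx = 7 maps back to icol = idx / nrow = 2 and irow = idx % nrow = 1, i.e. element (1,2) of the column-major league matrix.)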
143 | */ 144 | ncol = 5; 145 | nrow = 3; 146 | lteams = 5; 147 | nthrds = 3; 148 | league = (League *) malloc(sizeof(League) * ncol * nrow); 149 | initLeague(league, ncol, nrow); 150 | #pragma omp target teams device(0) num_teams(lteams) \ 151 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 152 | default(none) shared(ncol, nrow, lteams, nthrds, league) 153 | #pragma omp distribute parallel for num_threads(nthrds) \ 154 | dist_schedule(static) \ 155 | default(none) shared(ncol, nrow, lteams, nthrds, league) 156 | for (int idx = 0; idx < nrow * ncol; ++idx) { 157 | league[idx].itd = omp_get_thread_num(); 158 | league[idx].ntd = omp_get_num_threads(); 159 | league[idx].itm = omp_get_team_num(); 160 | league[idx].ltm = omp_get_num_teams(); 161 | } 162 | break; 163 | case 3: 164 | /* 165 | * 1. Dim of matrix league : 3 x 5 166 | * 2. Dim of GPU threads : 3 threads/team 167 | * 5 teams 168 | * 3. Not everyone wants to linearize loops manually. 169 | * 4. The icol- and irow-loops are collapsed. 170 | * 5. All GPU threads will be distributed and fill the matrix league. 171 | * 6. Please note that the GPU threads are organized such that the index 172 | * increases continuously with respect to irow (the loop index of the 173 | * innermost loop). 174 | */ 175 | ncol = 5; 176 | nrow = 3; 177 | lteams = 5; 178 | nthrds = 3; 179 | league = (League *) malloc(sizeof(League) * ncol * nrow); 180 | initLeague(league, ncol, nrow); 181 | #pragma omp target teams device(0) num_teams(lteams) \ 182 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 183 | default(none) shared(ncol, nrow, lteams, nthrds, league) 184 | #pragma omp distribute parallel for num_threads(nthrds) \ 185 | dist_schedule(static) collapse(2) \ 186 | default(none) shared(ncol, nrow, lteams, nthrds, league) 187 | for (int icol = 0; icol < ncol; ++icol) { 188 | for (int irow = 0; irow < nrow; ++irow) { 189 | league[icol * nrow + irow].itd = omp_get_thread_num(); 190 | league[icol * nrow + irow].ntd = omp_get_num_threads(); 191 | league[icol * nrow + irow].itm = omp_get_team_num(); 192 | league[icol * nrow + irow].ltm = omp_get_num_teams(); 193 | } 194 | } 195 | break; 196 | case 4: 197 | /* 198 | * 1. Dim of matrix league : 7 x 7 199 | * 2. Dim of GPU threads : 3 threads/team 200 | * 5 teams 201 | * 3. The size of matrix league does not match the number of GPU threads. 202 | * 4. dist_schedule(kind, chunk_size) 203 | * - kind: must be static 204 | * - chunk_size: When no chunk_size is specified, the iterations are divided 205 | * into chunks of approximately equal size. 206 | * 5. Please note that in some teams *not* all GPU threads are working!
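 * (Worked example added for illustration: collapse(2) merges the loops into 7 * 7 = 49 iterations; with 5 teams and no chunk_size, the chunks are of approximately equal size, e.g. 10, 10, 10, 10 and 9.)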
207 | */ 208 | ncol = 7; 209 | nrow = 7; 210 | lteams = 5; 211 | nthrds = 3; 212 | league = (League *) malloc(sizeof(League) * ncol * nrow); 213 | initLeague(league, ncol, nrow); 214 | #pragma omp target teams device(0) num_teams(lteams) \ 215 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 216 | default(none) shared(ncol, nrow, lteams, nthrds, league) 217 | #pragma omp distribute parallel for num_threads(nthrds) \ 218 | dist_schedule(static) collapse(2) \ 219 | default(none) shared(ncol, nrow, lteams, nthrds, league) 220 | for (int icol = 0; icol < ncol; ++icol) { 221 | for (int irow = 0; irow < nrow; ++irow) { 222 | league[icol * nrow + irow].itd = omp_get_thread_num(); 223 | league[icol * nrow + irow].ntd = omp_get_num_threads(); 224 | league[icol * nrow + irow].itm = omp_get_team_num(); 225 | league[icol * nrow + irow].ltm = omp_get_num_teams(); 226 | } 227 | } 228 | break; 229 | case 5: 230 | /* 231 | * 1. Dim of matrix league : 7 x 7 232 | * 2. Dim of GPU threads : 3 threads/team 233 | * 5 teams 234 | * 3. The size of matrix league does not match the number of GPU threads. 235 | * 4. dist_schedule(kind, chunk_size) 236 | * - kind: must be static 237 | * - chunk_size: If specified, iterations are divided into chunks of size 238 | * chunk_size. Chunks are then assigned to the GPU thread teams in 239 | * a round-robin fashion. 240 | * 5. The different ways of organizing GPU threads will impact 241 | * the performance of GPU memory access. 242 | */ 243 | ncol = 7; 244 | nrow = 7; 245 | lteams = 5; 246 | nthrds = 3; 247 | league = (League *) malloc(sizeof(League) * ncol * nrow); 248 | initLeague(league, ncol, nrow); 249 | #pragma omp target teams device(0) num_teams(lteams) \ 250 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 251 | default(none) shared(ncol, nrow, lteams, nthrds, league) 252 | #pragma omp distribute parallel for num_threads(nthrds) \ 253 | dist_schedule(static, nthrds) collapse(2) \ 254 | default(none) shared(ncol, nrow, lteams, nthrds, league) 255 | for (int icol = 0; icol < ncol; ++icol) { 256 | for (int irow = 0; irow < nrow; ++irow) { 257 | league[icol * nrow + irow].itd = omp_get_thread_num(); 258 | league[icol * nrow + irow].ntd = omp_get_num_threads(); 259 | league[icol * nrow + irow].itm = omp_get_team_num(); 260 | league[icol * nrow + irow].ltm = omp_get_num_teams(); 261 | } 262 | } 263 | break; 264 | case 6: 265 | /* 266 | * 1. Dim of matrix league : 12 x 6 267 | * 2. Dim of GPU threads : 3 threads/team 268 | * 6 teams 269 | * 3. icol-loop: intact 270 | * 4. irow-loop: CPU-like 2x loop unrolling. 271 | * 5. It results in uncoalesced GPU memory access and reduced performance. 272 | * 6. +10 to each unrolled thread is used to label the 2x irow-loop unrolling.
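 * (Note added for illustration: because of irow += 2, one GPU thread writes elements irow and irow + 1 back to back, so consecutive threads of a warp no longer touch consecutive addresses; case 7 below recovers coalesced access by unrolling with a stride of nthrds instead.)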
273 | */ 274 | ncol = 6; 275 | nrow =12; 276 | lteams = 6; 277 | nthrds = 3; 278 | league = (League *) malloc(sizeof(League) * ncol * nrow); 279 | initLeague(league, ncol, nrow); 280 | #pragma omp target teams device(0) num_teams(lteams) \ 281 | map(to: ncol, nrow) map(tofrom:league[0:nrow * ncol]) \ 282 | default(none) shared(ncol, nrow, lteams, nthrds, league) 283 | #pragma omp distribute parallel for num_threads(nthrds) \ 284 | dist_schedule(static, nthrds) collapse(2) \ 285 | default(none) shared(ncol, nrow, lteams, nthrds, league) 286 | for (int icol = 0; icol < ncol; ++icol) { 287 | for (int irow = 0; irow < nrow; irow += 2) { 288 | league[icol * nrow + irow ].itd = omp_get_thread_num(); 289 | league[icol * nrow + irow ].ntd = omp_get_num_threads(); 290 | league[icol * nrow + irow ].itm = omp_get_team_num(); 291 | league[icol * nrow + irow ].ltm = omp_get_num_teams(); 292 | league[icol * nrow + irow + 1].itd = omp_get_thread_num() + 10; 293 | league[icol * nrow + irow + 1].ntd = omp_get_num_threads(); 294 | league[icol * nrow + irow + 1].itm = omp_get_team_num(); 295 | league[icol * nrow + irow + 1].ltm = omp_get_num_teams(); 296 | } 297 | } 298 | break; 299 | case 7: 300 | /* 301 | * 1. Dim of matrix league : 12 x 6 302 | * 2. Dim of GPU threads : 3 threads/team 303 | * 6 teams 304 | * 3. icol-loop: intact 305 | * 4. irow-loop: 2x loop unrolling. 306 | * 5. Nested loop with collapse(3). 307 | * 6. It features coalesced GPU memory access and good performance. 308 | * 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling. 309 | * 310 | * Caveat: especially for the innermost loop 311 | * 312 | * OpenMP API Specification: Version 5.0 November 2018 313 | * 314 | * https://www.openmp.org/spec-html/5.0/openmpsu44.html 315 | * 316 | * If a collapse clause is specified with a parameter value greater than 1, then 317 | * the iterations of the associated loops to which the clause applies are 318 | * collapsed into one larger iteration space with *unspecified ordering*. 319 | * 320 | */ 321 | ncol = 6; 322 | nrow =12; 323 | lteams = 6; 324 | nthrds = 3; 325 | wblk = nthrds * 2; 326 | league = (League *) malloc(sizeof(League) * ncol * nrow); 327 | initLeague(league, ncol, nrow); 328 | #pragma omp target teams device(0) num_teams(lteams) \ 329 | map(to: ncol, nrow, wblk) map(tofrom:league[0:nrow * ncol]) \ 330 | default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) 331 | #pragma omp distribute parallel for num_threads(nthrds) \ 332 | dist_schedule(static, wblk) collapse(3) \ 333 | default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) 334 | for (int icol = 0; icol < ncol; ++icol) { 335 | for (int iblk = 0; iblk < nrow / wblk; ++iblk) { 336 | for (int irow = 0; irow < nthrds; ++irow) { 337 | league[icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); 338 | league[icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); 339 | league[icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num(); 340 | league[icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); 341 | league[icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; 342 | league[icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); 343 | league[icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num(); 344 | league[icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); 345 | } 346 | } 347 | } 348 | break; 349 | case 8: 350 | /* 351 | * 1. Dim of matrix league : 12 x 6 352 | * 2. 
Dim of GPU threads : 3 threads/team 353 | * 3 teams 354 | * 3. icol-loop: 2x loop unrolling. 355 | * 4. irow-loop: 2x loop unrolling. 356 | * 5. Nested loop with collapse(3). 357 | * 6. +10 to each unrolled team is used to label the 2x icol-loop unrolling. 358 | * 7. +10 to each unrolled thread is used to label the 2x irow-loop unrolling. 359 | * 360 | * More work for each thread is an approach to achieve high performance. 361 | * 362 | */ 363 | ncol = 6; 364 | nrow =12; 365 | lteams = 3; 366 | nthrds = 3; 367 | wblk = nthrds * 2; 368 | league = (League *) malloc(sizeof(League) * ncol * nrow); 369 | initLeague(league, ncol, nrow); 370 | #pragma omp target teams device(0) num_teams(lteams) \ 371 | map(to: ncol, nrow, wblk) map(tofrom:league[0:nrow * ncol]) \ 372 | default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) 373 | #pragma omp distribute parallel for num_threads(nthrds) \ 374 | dist_schedule(static, wblk) collapse(3) \ 375 | default(none) shared(ncol, nrow, wblk, lteams, nthrds, league) 376 | for (int icol = 0; icol < ncol; icol += 2) { 377 | for (int iblk = 0; iblk < nrow / wblk; ++iblk) { 378 | for (int irow = 0; irow < nthrds; ++irow) { 379 | league[ icol * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); 380 | league[ icol * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); 381 | league[ icol * nrow + iblk * wblk + irow ].itm = omp_get_team_num(); 382 | league[ icol * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); 383 | league[ icol * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; 384 | league[ icol * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); 385 | league[ icol * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num(); 386 | league[ icol * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); 387 | league[(icol + 1) * nrow + iblk * wblk + irow ].itd = omp_get_thread_num(); 388 | league[(icol + 1) * nrow + iblk * wblk + irow ].ntd = omp_get_num_threads(); 389 | league[(icol + 1) * nrow + iblk * wblk + irow ].itm = omp_get_team_num() + 10; 390 | league[(icol + 1) * nrow + iblk * wblk + irow ].ltm = omp_get_num_teams(); 391 | league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itd = omp_get_thread_num() + 10; 392 | league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ntd = omp_get_num_threads(); 393 | league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].itm = omp_get_team_num() + 10; 394 | league[(icol + 1) * nrow + iblk * wblk + irow + nthrds].ltm = omp_get_num_teams(); 395 | } 396 | } 397 | } 398 | break; 399 | default: 400 | printf("Tschüß!\n"); 401 | exit(EXIT_SUCCESS); 402 | break; 403 | } 404 | /* 405 | * Show the organization of GPU threads 406 | */ 407 | printf("%dth GPU threads organization:\n", i); 408 | printf("\n"); 409 | printf("No. of rows : %3d\n", nrow); 410 | printf("No. of cols : %3d\n", ncol); 411 | printf("No. of threads : %3d\n", league[0].ntd); 412 | printf("No. of teams : %3d\n", league[0].ltm); 413 | printf("\n"); 414 | for (irow = 0; irow < nrow; ++irow) { 415 | for (icol = 0; icol < ncol; ++icol) { 416 | printf("(%2d,%2d):[%2d,%2d]%s", irow, icol, 417 | league[icol * nrow + irow].itd, 418 | league[icol * nrow + irow].itm, 419 | icol == ncol - 1 ? 
"\n" : " "); 420 | } 421 | } 422 | printf("\n"); 423 | /* 424 | * Release the memory 425 | */ 426 | free(league); 427 | } 428 | -------------------------------------------------------------------------------- /10_matMul/src/matMulAB.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file matMulAB.c 3 | * 4 | * @brief Function definition for matrix multiplication in single-precision. 5 | * 6 | * This source file contains function definition for matrix multiplication 7 | * in single-precision. 8 | * 9 | * @author Xin Wu (PC²) 10 | * @date 07.02.2020 11 | * @copyright CC BY-SA 2.0 12 | */ 13 | 14 | #include 15 | #include 16 | #include 17 | #ifdef _OPENMP 18 | #include 19 | #endif 20 | #include 21 | #include "cublas_v2.h" 22 | #include "matMulAB.h" 23 | 24 | #define NTHRDS7 (1 << 0x7) /* 2^{7} */ 25 | #define NTHRDS8 (1 << 0x8) /* 2^{8} */ 26 | #define NTHRDS9 (1 << 0x9) /* 2^{9} */ 27 | 28 | #define LTEAMSD (1 << 0xD) /* 2^{13} */ 29 | #define LTEAMSE (1 << 0xE) /* 2^{14} */ 30 | #define LTEAMSF (1 << 0xF) /* 2^{15} */ 31 | #define LTEAMSG (1 << 020) /* 2^{16} */ 32 | 33 | #define BLKROW (512) /* 4x number of threads in each team */ 34 | #define BLKDIM (16) 35 | 36 | double wtcalc; 37 | 38 | void matMulAB_accl(float *a, 39 | float *b, 40 | float *c, 41 | int n, 42 | int ial) 43 | { 44 | cublasHandle_t handle; 45 | float alfa = 1.0f, 46 | beta = 1.0f, 47 | *a_dev = NULL, 48 | *b_dev = NULL, 49 | *c_dev = NULL; 50 | struct timespec rt[2]; 51 | 52 | switch (ial) { 53 | case 0: 54 | /* 55 | * - jik-loop 56 | * - 2^9 threads per team and 2^3 teams 57 | * - n-stride memory read for c (then in rc) 58 | * - n-stride memory read for b (innermost loop) 59 | * - n-stride memory write for c 60 | */ 61 | #pragma omp target data device(0) \ 62 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 63 | { 64 | clock_gettime(CLOCK_REALTIME, rt + 0); 65 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 66 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 67 | default(none) shared(a, b, c, n) 68 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 69 | dist_schedule(static, NTHRDS9) \ 70 | default(none) shared(a, b, c, n) 71 | for (int j = 0; j < n; ++j) { /* parallel */ 72 | for (int i = 0; i < n; ++i) { /* sequential */ 73 | float rc; 74 | rc = c[j * n + i]; 75 | for (int k = 0; k < n; ++k) { 76 | rc += a[k * n + i] * b[j * n + k]; 77 | } 78 | c[j * n + i] = rc; 79 | } /* end i-loop */ 80 | } /* end j-loop */ 81 | clock_gettime(CLOCK_REALTIME, rt + 1); 82 | } 83 | break; 84 | case 1: 85 | /* 86 | * - jki-loop 87 | * - 2^9 threads per team and 2^3 teams 88 | * - n-stride memory read for b (then in rb) 89 | * - n-stride memory read for c (innermost loop) 90 | * - n-stride memory write for c (innermost loop) 91 | */ 92 | #pragma omp target data device(0) \ 93 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 94 | { 95 | clock_gettime(CLOCK_REALTIME, rt + 0); 96 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 97 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 98 | default(none) shared(a, b, c, n) 99 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 100 | dist_schedule(static, NTHRDS9) \ 101 | default(none) shared(a, b, c, n) 102 | for (int j = 0; j < n; ++j) { /* parallel */ 103 | for (int k = 0; k < n; ++k) { /* sequential */ 104 | float rb; 105 | rb = b[j * n + k]; 106 | for (int i = 0; i < n; ++i) { 107 | c[j * n + i] += a[k * n + i] * rb; /* 
uncoalesced r&w */ 108 | } 109 | } /* end k-loop */ 110 | } /* end j-loop */ 111 | clock_gettime(CLOCK_REALTIME, rt + 1); 112 | } 113 | break; 114 | case 2: 115 | /* 116 | * - jik-loop 117 | * - 2^9 threads per team and 2^15 teams 118 | * - collapse(2) 119 | * - no race condition 120 | */ 121 | #pragma omp target data device(0) \ 122 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 123 | { 124 | clock_gettime(CLOCK_REALTIME, rt + 0); 125 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 126 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 127 | default(none) shared(a, b, c, n) 128 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 129 | dist_schedule(static, NTHRDS9) collapse(2) \ 130 | default(none) shared(a, b, c, n) 131 | for (int j = 0; j < n; ++j) { /* parallel */ 132 | for (int i = 0; i < n; ++i) { /* parallel */ 133 | float rc; 134 | rc = c[j * n + i]; 135 | for (int k = 0; k < n; ++k) { /* sequential */ 136 | rc += a[k * n + i] * b[j * n + k]; 137 | } 138 | c[j * n + i] = rc; 139 | } /* end i-loop */ 140 | } /* end j-loop */ 141 | clock_gettime(CLOCK_REALTIME, rt + 1); 142 | } 143 | break; 144 | case 3: 145 | /* 146 | * - jki-loop 147 | * - 2^9 threads per team and 2^15 teams 148 | * - collapse(2) 149 | * - race condition for writing c: not only one thread has the index j, a total 150 | * of n GPU threads have the index j. (n / 32) warps are then scheduled on GPU. 151 | */ 152 | #pragma omp target data device(0) \ 153 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 154 | { 155 | clock_gettime(CLOCK_REALTIME, rt + 0); 156 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 157 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 158 | default(none) shared(a, b, c, n) 159 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 160 | dist_schedule(static, NTHRDS9) collapse(2) \ 161 | default(none) shared(a, b, c, n) 162 | for (int j = 0; j < n; ++j) { /* parallel */ 163 | for (int k = 0; k < n; ++k) { /* parallel */ 164 | float rb; 165 | rb = b[j * n + k]; 166 | for (int i = 0; i < n; ++i) { 167 | c[j * n + i] += a[k * n + i] * rb; /* race condition between diff. warps */ 168 | } 169 | } /* end k-loop */ 170 | } /* end j-loop */ 171 | clock_gettime(CLOCK_REALTIME, rt + 1); 172 | } 173 | break; 174 | case 4: 175 | /* 176 | * - jik-loop 177 | * - 2^9 threads per team and 2^15 teams 178 | * - 4x k-loop unrolling 179 | * 180 | * good: more work for one thread per iteration. 181 | * bad : one thread must read b 4 times in k-loop. 182 | * all threads in a team do the same read of b (waste of instructions). 183 | * tips: each thread reads the corresponding element in b and 184 | * saves it in shared memory.
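 * (Cross-reference added for clarity: this is what case 8 below implements with the ashm/bshm tiles in GPU shared memory.)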
185 | */ 186 | #pragma omp target data device(0) \ 187 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 188 | { 189 | clock_gettime(CLOCK_REALTIME, rt + 0); 190 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS9) \ 191 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 192 | default(none) shared(a, b, c, n) 193 | #pragma omp distribute parallel for num_threads(NTHRDS9) \ 194 | dist_schedule(static, NTHRDS9) collapse(2) \ 195 | default(none) shared(a, b, c, n) 196 | for (int j = 0; j < n; ++j) { 197 | for (int i = 0; i < n; ++i) { 198 | float rc; 199 | rc = c[j * n + i]; 200 | for (int k = 0; k < n; k += 4) { /* 4x unrolling */ 201 | rc += a[ k * n + i] * b[j * n + k ]; 202 | rc += a[(k + 1) * n + i] * b[j * n + k + 1]; 203 | rc += a[(k + 2) * n + i] * b[j * n + k + 2]; 204 | rc += a[(k + 3) * n + i] * b[j * n + k + 3]; 205 | } 206 | c[j * n + i] = rc; 207 | } /* end i-loop */ 208 | } /* end j-loop */ 209 | clock_gettime(CLOCK_REALTIME, rt + 1); 210 | } 211 | break; 212 | case 5: 213 | /* 214 | * - jik-loop 215 | * - 2^7 threads per team and 2^15 teams 216 | * - collapse(3) 217 | * - 4x i-loop unrolling (stride of 2^7 rows) 218 | * - 4x k-loop unrolling 219 | * - rb: 4x data re-use 220 | * 221 | * The integer calculation of matrix indices looks ugly. But considering the GPU 222 | * hardware architecture, e.g. many separate INT32 units, these calculations are 223 | * much faster than accessing GPU global memory and save the precious registers. 224 | * 225 | */ 226 | #pragma omp target data device(0) \ 227 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 228 | { 229 | clock_gettime(CLOCK_REALTIME, rt + 0); 230 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS7) \ 231 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 232 | default(none) shared(a, b, c, n) 233 | #pragma omp distribute parallel for num_threads(NTHRDS7) \ 234 | dist_schedule(static, NTHRDS7) collapse(3) \ 235 | default(none) shared(a, b, c, n) 236 | for (int j = 0; j < n; ++j) { 237 | for (int iblk = 0; iblk < n / BLKROW; ++iblk) { 238 | for (int i = 0; i < NTHRDS7; ++i) { /* 4x unrolling */ 239 | float rc0, rc1, rc2, rc3; 240 | rc0 = c[j * n + iblk * BLKROW + i ]; 241 | rc1 = c[j * n + iblk * BLKROW + i + NTHRDS7 ]; 242 | rc2 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 2]; 243 | rc3 = c[j * n + iblk * BLKROW + i + NTHRDS7 * 3]; 244 | for (int k = 0; k < n; k += 4) { /* 4x unrolling */ 245 | /* register for b: 4x k-loop */ 246 | float rb0, rb1, rb2, rb3; 247 | rb0 = b[j * n + k ]; 248 | rb1 = b[j * n + k + 1]; 249 | rb2 = b[j * n + k + 2]; 250 | rb3 = b[j * n + k + 3]; 251 | rc0 += a[ k * n + iblk * BLKROW + i ] * rb0; 252 | rc0 += a[(k + 1) * n + iblk * BLKROW + i ] * rb1; 253 | rc0 += a[(k + 2) * n + iblk * BLKROW + i ] * rb2; 254 | rc0 += a[(k + 3) * n + iblk * BLKROW + i ] * rb3; 255 | rc1 += a[ k * n + iblk * BLKROW + i + NTHRDS7 ] * rb0; 256 | rc1 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 ] * rb1; 257 | rc1 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 ] * rb2; 258 | rc1 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 ] * rb3; 259 | rc2 += a[ k * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb0; 260 | rc2 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb1; 261 | rc2 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb2; 262 | rc2 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2] * rb3; 263 | rc3 += a[ k * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb0; 264 | rc3 += a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb1; 265 | 
rc3 += a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb2; 266 | rc3 += a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3] * rb3; 267 | } 268 | c[j * n + iblk * BLKROW + i ] = rc0; 269 | c[j * n + iblk * BLKROW + i + NTHRDS7 ] = rc1; 270 | c[j * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc2; 271 | c[j * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc3; 272 | } /* end i-loop */ 273 | } /* end iblk-loop */ 274 | } /* end j-loop */ 275 | clock_gettime(CLOCK_REALTIME, rt + 1); 276 | } 277 | break; 278 | case 6: 279 | /* 280 | * - jik-loop 281 | * - 2^7 threads per team and 2^13 teams 282 | * - collapse(3) 283 | * - 4x j-loop unrolling (stride of 1 col ) 284 | * - 4x i-loop unrolling (stride of 2^7 rows) 285 | * - 4x k-loop unrolling 286 | * - rb: 4x data re-use 287 | * - ra: 4x data re-use 288 | * - register blocking 289 | */ 290 | #pragma omp target data device(0) \ 291 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 292 | { 293 | clock_gettime(CLOCK_REALTIME, rt + 0); 294 | #pragma omp target teams device(0) num_teams(LTEAMSD) thread_limit(NTHRDS7) \ 295 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 296 | default(none) shared(a, b, c, n) 297 | #pragma omp distribute parallel for num_threads(NTHRDS7) \ 298 | dist_schedule(static, NTHRDS7) collapse(3) \ 299 | default(none) shared(a, b, c, n) 300 | for (int j = 0; j < n; j += 4) { /* 4x unrolling */ 301 | for (int iblk = 0; iblk < n / BLKROW; ++iblk) { 302 | for (int i = 0; i < NTHRDS7; ++i) { /* 4x unrolling */ 303 | /* register for c: 4x j-loop * 4x i-loop */ 304 | float rc0, rc1, rc2, rc3, 305 | rc4, rc5, rc6, rc7, 306 | rc8, rc9, rca, rcb, 307 | rcc, rcd, rce, rcf; 308 | rc0 = c[ j * n + iblk * BLKROW + i ]; 309 | rc1 = c[ j * n + iblk * BLKROW + i + NTHRDS7 ]; 310 | rc2 = c[ j * n + iblk * BLKROW + i + NTHRDS7 * 2]; 311 | rc3 = c[ j * n + iblk * BLKROW + i + NTHRDS7 * 3]; 312 | rc4 = c[(j + 1) * n + iblk * BLKROW + i ]; 313 | rc5 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 ]; 314 | rc6 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 315 | rc7 = c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 316 | rc8 = c[(j + 2) * n + iblk * BLKROW + i ]; 317 | rc9 = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 ]; 318 | rca = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 319 | rcb = c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 320 | rcc = c[(j + 3) * n + iblk * BLKROW + i ]; 321 | rcd = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 ]; 322 | rce = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 323 | rcf = c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 324 | for (int k = 0; k < n; k += 4) { /* 4x unrolling */ 325 | /* register for b: 4x j-loop * 4x k-loop */ 326 | float rb0, rb1, rb2, rb3, 327 | rb4, rb5, rb6, rb7, 328 | rb8, rb9, rba, rbb, 329 | rbc, rbd, rbe, rbf; 330 | rb0 = b[ j * n + k ]; 331 | rb1 = b[ j * n + k + 1]; 332 | rb2 = b[ j * n + k + 2]; 333 | rb3 = b[ j * n + k + 3]; 334 | rb4 = b[(j + 1) * n + k ]; 335 | rb5 = b[(j + 1) * n + k + 1]; 336 | rb6 = b[(j + 1) * n + k + 2]; 337 | rb7 = b[(j + 1) * n + k + 3]; 338 | rb8 = b[(j + 2) * n + k ]; 339 | rb9 = b[(j + 2) * n + k + 1]; 340 | rba = b[(j + 2) * n + k + 2]; 341 | rbb = b[(j + 2) * n + k + 3]; 342 | rbc = b[(j + 3) * n + k ]; 343 | rbd = b[(j + 3) * n + k + 1]; 344 | rbe = b[(j + 3) * n + k + 2]; 345 | rbf = b[(j + 3) * n + k + 3]; 346 | /* register for a: 4x i-loop * 4x k-loop */ 347 | float ra0, ra1, ra2, ra3, 348 | ra4, ra5, ra6, ra7, 349 | ra8, ra9, raa, rab, 350 | rac, rad, rae, raf; 351 | ra0 = a[ k * n + iblk * BLKROW + i ]; 352 | ra1 = a[ k * 
n + iblk * BLKROW + i + NTHRDS7 ]; 353 | ra2 = a[ k * n + iblk * BLKROW + i + NTHRDS7 * 2]; 354 | ra3 = a[ k * n + iblk * BLKROW + i + NTHRDS7 * 3]; 355 | ra4 = a[(k + 1) * n + iblk * BLKROW + i ]; 356 | ra5 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 ]; 357 | ra6 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 358 | ra7 = a[(k + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 359 | ra8 = a[(k + 2) * n + iblk * BLKROW + i ]; 360 | ra9 = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 ]; 361 | raa = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 362 | rab = a[(k + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 363 | rac = a[(k + 3) * n + iblk * BLKROW + i ]; 364 | rad = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 ]; 365 | rae = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2]; 366 | raf = a[(k + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3]; 367 | /* 368 | * register blocking 369 | */ 370 | // col 1 of c: 371 | rc0 += ra0 * rb0; 372 | rc0 += ra4 * rb1; 373 | rc0 += ra8 * rb2; 374 | rc0 += rac * rb3; 375 | rc1 += ra1 * rb0; 376 | rc1 += ra5 * rb1; 377 | rc1 += ra9 * rb2; 378 | rc1 += rad * rb3; 379 | rc2 += ra2 * rb0; 380 | rc2 += ra6 * rb1; 381 | rc2 += raa * rb2; 382 | rc2 += rae * rb3; 383 | rc3 += ra3 * rb0; 384 | rc3 += ra7 * rb1; 385 | rc3 += rab * rb2; 386 | rc3 += raf * rb3; 387 | // col 2 of c: 388 | rc4 += ra0 * rb4; 389 | rc4 += ra4 * rb5; 390 | rc4 += ra8 * rb6; 391 | rc4 += rac * rb7; 392 | rc5 += ra1 * rb4; 393 | rc5 += ra5 * rb5; 394 | rc5 += ra9 * rb6; 395 | rc5 += rad * rb7; 396 | rc6 += ra2 * rb4; 397 | rc6 += ra6 * rb5; 398 | rc6 += raa * rb6; 399 | rc6 += rae * rb7; 400 | rc7 += ra3 * rb4; 401 | rc7 += ra7 * rb5; 402 | rc7 += rab * rb6; 403 | rc7 += raf * rb7; 404 | // col 3 of c: 405 | rc8 += ra0 * rb8; 406 | rc8 += ra4 * rb9; 407 | rc8 += ra8 * rba; 408 | rc8 += rac * rbb; 409 | rc9 += ra1 * rb8; 410 | rc9 += ra5 * rb9; 411 | rc9 += ra9 * rba; 412 | rc9 += rad * rbb; 413 | rca += ra2 * rb8; 414 | rca += ra6 * rb9; 415 | rca += raa * rba; 416 | rca += rae * rbb; 417 | rcb += ra3 * rb8; 418 | rcb += ra7 * rb9; 419 | rcb += rab * rba; 420 | rcb += raf * rbb; 421 | // col 4 of c: 422 | rcc += ra0 * rbc; 423 | rcc += ra4 * rbd; 424 | rcc += ra8 * rbe; 425 | rcc += rac * rbf; 426 | rcd += ra1 * rbc; 427 | rcd += ra5 * rbd; 428 | rcd += ra9 * rbe; 429 | rcd += rad * rbf; 430 | rce += ra2 * rbc; 431 | rce += ra6 * rbd; 432 | rce += raa * rbe; 433 | rce += rae * rbf; 434 | rcf += ra3 * rbc; 435 | rcf += ra7 * rbd; 436 | rcf += rab * rbe; 437 | rcf += raf * rbf; 438 | } 439 | c[ j * n + iblk * BLKROW + i ] = rc0; 440 | c[ j * n + iblk * BLKROW + i + NTHRDS7 ] = rc1; 441 | c[ j * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc2; 442 | c[ j * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc3; 443 | c[(j + 1) * n + iblk * BLKROW + i ] = rc4; 444 | c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 ] = rc5; 445 | c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rc6; 446 | c[(j + 1) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rc7; 447 | c[(j + 2) * n + iblk * BLKROW + i ] = rc8; 448 | c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 ] = rc9; 449 | c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rca; 450 | c[(j + 2) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcb; 451 | c[(j + 3) * n + iblk * BLKROW + i ] = rcc; 452 | c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 ] = rcd; 453 | c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 2] = rce; 454 | c[(j + 3) * n + iblk * BLKROW + i + NTHRDS7 * 3] = rcf; 455 | } /* end i-loop */ 456 | } /* end iblk-loop */ 457 | } /* end j-loop */ 458 | clock_gettime(CLOCK_REALTIME, rt + 1); 459 | 
} 460 | break; 461 | case 7: 462 | /* 463 | * - based on case 2 464 | * - jik-loop 465 | * - 2^8 threads per team and 2^16 teams 466 | * - collapse(2) 467 | * - no race condition 468 | */ 469 | #pragma omp target data device(0) \ 470 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 471 | { 472 | clock_gettime(CLOCK_REALTIME, rt + 0); 473 | #pragma omp target teams device(0) num_teams(LTEAMSG) thread_limit(NTHRDS8) \ 474 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 475 | default(none) shared(a, b, c, n) 476 | #pragma omp distribute parallel for num_threads(NTHRDS8) \ 477 | dist_schedule(static, NTHRDS8) collapse(2) \ 478 | default(none) shared(a, b, c, n) 479 | for (int j = 0; j < n; ++j) { /* parallel */ 480 | for (int i = 0; i < n; ++i) { /* parallel */ 481 | float rc; 482 | rc = c[j * n + i]; 483 | for (int k = 0; k < n; ++k) { /* sequential */ 484 | rc += a[k * n + i] * b[j * n + k]; 485 | } 486 | c[j * n + i] = rc; 487 | } /* end i-loop */ 488 | } /* end j-loop */ 489 | clock_gettime(CLOCK_REALTIME, rt + 1); 490 | } 491 | break; 492 | case 8: 493 | /* 494 | * - based on case 7 495 | * - jik-loop 496 | * - 2^8 threads per team and 2^16 teams 497 | * - collapse(2) 498 | * - GPU shared memory for data re-use 499 | * - 16x k-loop unrolling 500 | */ 501 | #pragma omp target data device(0) \ 502 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 503 | { 504 | clock_gettime(CLOCK_REALTIME, rt + 0); 505 | #pragma omp target teams device(0) num_teams(LTEAMSG) thread_limit(NTHRDS8) \ 506 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 507 | default(none) shared(a, b, c, n) 508 | { 509 | // GPU shared memory for each team 510 | /* 511 | * I have tested the bank conflict-free version, but it gives worse results, 512 | * e.g. ~ 290 GFLOPS (20 GFLOPS less than the bank conflict version). 513 | * I cannot explain ... 514 | * 515 | float ashm[BLKDIM][BLKDIM + 1], 516 | bshm[BLKDIM][BLKDIM + 1]; 517 | */ 518 | float ashm[BLKDIM][BLKDIM], 519 | bshm[BLKDIM][BLKDIM]; 520 | #pragma omp distribute dist_schedule(static, 1) collapse(2) 521 | for (int j = 0; j < n / BLKDIM; ++j) { 522 | for (int i = 0; i < n / BLKDIM; ++i) { 523 | #pragma omp parallel num_threads(NTHRDS8) \ 524 | default(none) shared(a, b, c, n, ashm, bshm, i, j) 525 | { 526 | /* 527 | * The code here resembles CUDA. 
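 * (Annotation added for clarity: an OpenMP team plays the role of a CUDA thread block, omp_get_thread_num() that of threadIdx, the ashm/bshm arrays that of __shared__ tiles, and the omp barrier that of __syncthreads().)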
528 | */ 529 | int td = omp_get_thread_num(); 530 | // de-linearize the thread number 531 | int it, // thread number along the row 532 | jt; // thread number along the col 533 | it = td % BLKDIM; 534 | jt = td / BLKDIM; 535 | int ib, // row at the beginning of block 536 | jb; // col at the beginning of block 537 | ib = i * BLKDIM; 538 | jb = j * BLKDIM; 539 | int ii, // the real row 540 | jj; // the real col 541 | ii = ib + it; 542 | jj = jb + jt; 543 | float rc = c[jj * n + ii]; // c in register 544 | /* 545 | * the k blocks 546 | */ 547 | for (int k = 0; k < n / BLKDIM; ++k) { 548 | // read the global data to shared memory 549 | ashm[jt][it] = a[(k * 16 + jt) * n + ii]; 550 | bshm[jt][it] = b[jj * n + (k * 16 + it)]; 551 | #pragma omp barrier 552 | // shared memory blocking and 16x k-loop unrolling 553 | rc += ashm[0x0][it] * bshm[jt][0x0]; 554 | rc += ashm[0x1][it] * bshm[jt][0x1]; 555 | rc += ashm[0x2][it] * bshm[jt][0x2]; 556 | rc += ashm[0x3][it] * bshm[jt][0x3]; 557 | rc += ashm[0x4][it] * bshm[jt][0x4]; 558 | rc += ashm[0x5][it] * bshm[jt][0x5]; 559 | rc += ashm[0x6][it] * bshm[jt][0x6]; 560 | rc += ashm[0x7][it] * bshm[jt][0x7]; 561 | rc += ashm[0x8][it] * bshm[jt][0x8]; 562 | rc += ashm[0x9][it] * bshm[jt][0x9]; 563 | rc += ashm[0xa][it] * bshm[jt][0xa]; 564 | rc += ashm[0xb][it] * bshm[jt][0xb]; 565 | rc += ashm[0xc][it] * bshm[jt][0xc]; 566 | rc += ashm[0xd][it] * bshm[jt][0xd]; 567 | rc += ashm[0xe][it] * bshm[jt][0xe]; 568 | rc += ashm[0xf][it] * bshm[jt][0xf]; 569 | #pragma omp barrier 570 | } /* end k-loop */ 571 | c[jj * n + ii] =rc; 572 | } /* end omp parallel */ 573 | } /* end i-loop */ 574 | } /* end j-loop */ 575 | } /* end omp target teams */ 576 | clock_gettime(CLOCK_REALTIME, rt + 1); 577 | } 578 | break; 579 | case 9: 580 | /* 581 | * - based on case 5 582 | * - only diffs are listed here: 583 | * * collapse(2) 584 | * * 4x i-loop unrolling (stride of n/4 rows) 585 | */ 586 | #pragma omp target data device(0) \ 587 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) 588 | { 589 | clock_gettime(CLOCK_REALTIME, rt + 0); 590 | #pragma omp target teams device(0) num_teams(LTEAMSF) thread_limit(NTHRDS7) \ 591 | map(to:n, a[0:n * n], b[0:n * n]) map(tofrom:c[0:n * n]) \ 592 | default(none) shared(a, b, c, n) 593 | #pragma omp distribute parallel for num_threads(NTHRDS7) \ 594 | dist_schedule(static, NTHRDS7) collapse(2) \ 595 | default(none) shared(a, b, c, n) 596 | for (int j = 0; j < n; ++j) { 597 | for (int i = 0; i < (n >> 2); ++i) { /* 4x unrolling */ 598 | float rc0, rc1, rc2, rc3; 599 | rc0 = c[j * n + i ]; 600 | rc1 = c[j * n + i + (n >> 2) ]; 601 | rc2 = c[j * n + i + (n >> 2) * 2]; 602 | rc3 = c[j * n + i + (n >> 2) * 3]; 603 | for (int k = 0; k < n; k += 4) { /* 4x unrolling */ 604 | /* register for b: 4x k-loop */ 605 | float rb0, rb1, rb2, rb3; 606 | rb0 = b[j * n + k ]; 607 | rb1 = b[j * n + k + 1]; 608 | rb2 = b[j * n + k + 2]; 609 | rb3 = b[j * n + k + 3]; 610 | rc0 += a[ k * n + i ] * rb0; 611 | rc0 += a[(k + 1) * n + i ] * rb1; 612 | rc0 += a[(k + 2) * n + i ] * rb2; 613 | rc0 += a[(k + 3) * n + i ] * rb3; 614 | rc1 += a[ k * n + i + (n >> 2) ] * rb0; 615 | rc1 += a[(k + 1) * n + i + (n >> 2) ] * rb1; 616 | rc1 += a[(k + 2) * n + i + (n >> 2) ] * rb2; 617 | rc1 += a[(k + 3) * n + i + (n >> 2) ] * rb3; 618 | rc2 += a[ k * n + i + (n >> 2) * 2] * rb0; 619 | rc2 += a[(k + 1) * n + i + (n >> 2) * 2] * rb1; 620 | rc2 += a[(k + 2) * n + i + (n >> 2) * 2] * rb2; 621 | rc2 += a[(k + 3) * n + i + (n >> 2) * 2] * rb3; 622 | rc3 += a[ k * n + i + (n >> 2) * 
3] * rb0; 623 | rc3 += a[(k + 1) * n + i + (n >> 2) * 3] * rb1; 624 | rc3 += a[(k + 2) * n + i + (n >> 2) * 3] * rb2; 625 | rc3 += a[(k + 3) * n + i + (n >> 2) * 3] * rb3; 626 | } 627 | c[j * n + i ] = rc0; 628 | c[j * n + i + (n >> 2) ] = rc1; 629 | c[j * n + i + (n >> 2) * 2] = rc2; 630 | c[j * n + i + (n >> 2) * 3] = rc3; 631 | } /* end i-loop */ 632 | } /* end j-loop */ 633 | clock_gettime(CLOCK_REALTIME, rt + 1); 634 | } 635 | break; 636 | default: 637 | /* 638 | * cublasSgemm in CUBLAS 639 | */ 640 | if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle)) { 641 | printf("error: initialization (CUBLAS)\n"); 642 | cublasDestroy(handle); 643 | exit(EXIT_FAILURE); 644 | } 645 | if (cudaSuccess != cudaMalloc((void **) &a_dev, sizeof(*a) * n * n) || 646 | cudaSuccess != cudaMalloc((void **) &b_dev, sizeof(*b) * n * n) || 647 | cudaSuccess != cudaMalloc((void **) &c_dev, sizeof(*c) * n * n)) { 648 | printf("error: memory allocation (CUDA)\n"); 649 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 650 | cublasDestroy(handle); 651 | exit(EXIT_FAILURE); 652 | } 653 | if (CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*a), a, n, a_dev, n) || 654 | CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*b), b, n, b_dev, n) || 655 | CUBLAS_STATUS_SUCCESS != cublasSetMatrix(n, n, sizeof(*c), c, n, c_dev, n)) { 656 | printf("error: host --> accl (CUBLAS)\n"); 657 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 658 | cublasDestroy(handle); 659 | exit(EXIT_FAILURE); 660 | } 661 | clock_gettime(CLOCK_REALTIME, rt + 0); 662 | if (CUBLAS_STATUS_SUCCESS != cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 663 | n, n, n, &alfa, a_dev, n, b_dev, n, &beta, c_dev, n)) { 664 | printf("error: cublasSgemm (CUBLAS)\n"); 665 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 666 | cublasDestroy(handle); 667 | exit(EXIT_FAILURE); 668 | } 669 | if (cudaSuccess != cudaDeviceSynchronize()) { 670 | printf("error: device synchronization (CUDA)\n"); 671 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 672 | cublasDestroy(handle); 673 | exit(EXIT_FAILURE); 674 | } 675 | clock_gettime(CLOCK_REALTIME, rt + 1); 676 | if (CUBLAS_STATUS_SUCCESS != cublasGetMatrix(n, n, sizeof(*c), c_dev, n, c, n)) { 677 | printf("error: accl --> host (CUBLAS)\n"); 678 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 679 | cublasDestroy(handle); 680 | exit(EXIT_FAILURE); 681 | } 682 | cudaFree(a_dev); cudaFree(b_dev); cudaFree(c_dev); 683 | cublasDestroy(handle); 684 | break; 685 | } /* end switch (ial) */ 686 | if (wtcalc >= 0.0) { 687 | wtcalc += (rt[1].tv_sec - rt[0].tv_sec) + 1.0e-9 * (rt[1].tv_nsec - rt[0].tv_nsec); 688 | } 689 | } 690 | --------------------------------------------------------------------------------
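matMulAB_accl() above accumulates its kernel walltime into the global wtcalc whenever wtcalc is non-negative. A hedged usage sketch follows: a hypothetical driver.c (not part of the repository) that seeds wtcalc, runs the shared-memory variant (ial == 8), and checks the result against a naive host-side reference. The file name driver.c and the chosen n are assumptions for illustration; only matMulAB_accl, wtcalc, and the BLKROW/BLKDIM size constraints come from the source file above.

/*
 * driver.c (hypothetical, not part of the repository): minimal
 * host-side sketch for matMulAB_accl(). It assumes the prototype from
 * matMulAB.h, the wtcalc accumulator defined in matMulAB.c,
 * column-major n x n matrices, and an n that is a multiple of
 * BLKROW (512), so that the unrolled kernels stay in bounds.
 * Link against cuBLAS/CUDA and libm.
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "matMulAB.h"

extern double wtcalc; /* walltime accumulator defined in matMulAB.c */

int main(void)
{
  int n = 1024; /* assumed size: multiple of BLKROW == 512 and BLKDIM == 16 */
  float *a = malloc(sizeof(*a) * n * n),
        *b = malloc(sizeof(*b) * n * n),
        *c = malloc(sizeof(*c) * n * n),
        *r = malloc(sizeof(*r) * n * n);
  if (NULL == a || NULL == b || NULL == c || NULL == r) {
    printf("error: memory allocation\n");
    return EXIT_FAILURE;
  }
  for (int i = 0; i < n * n; ++i) {
    a[i] = (float) rand() / RAND_MAX;
    b[i] = (float) rand() / RAND_MAX;
    c[i] = r[i] = 0.0f;
  }
  wtcalc = 0.0; /* enable walltime accumulation in matMulAB_accl */
  matMulAB_accl(a, b, c, n, 8); /* ial == 8: shared-memory version */
  /* naive column-major reference on the host: r := a * b + r */
  for (int j = 0; j < n; ++j)
    for (int k = 0; k < n; ++k)
      for (int i = 0; i < n; ++i)
        r[j * n + i] += a[k * n + i] * b[j * n + k];
  float err = 0.0f;
  for (int i = 0; i < n * n; ++i)
    err = fmaxf(err, fabsf(c[i] - r[i]));
  printf("max. abs. error: %e\n", err);
  printf("kernel walltime: %f s (%.1f GFLOPS)\n",
         wtcalc, 2.0 * n * n * n / (wtcalc * 1.0e9));
  free(a); free(b); free(c); free(r);
  return EXIT_SUCCESS;
}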