├── samples ├── .gitignore ├── multi-dgemm │ ├── .gitignore │ ├── results │ │ ├── Xeon_Phi-3120 │ │ │ ├── plot.png │ │ │ └── benchmark.txt │ │ └── Xeon_Phi-7120 │ │ │ ├── plot.png │ │ │ └── benchmark.txt │ ├── multi-dgemm.plt │ ├── benchmark.sh │ ├── plot.sh │ ├── multi-dgemm.sh │ ├── multi-dgemm-type.hpp │ ├── Makefile │ ├── multi-dgemm.cpp │ └── multi-dgemm-type.cpp ├── smm │ ├── requirements.txt │ ├── .gitignore │ ├── params │ │ └── README.md │ ├── kernels │ │ └── transpose.cl │ ├── README-backend.md │ ├── acc_bench.h │ ├── acc_libsmm.h │ ├── README.md │ ├── README-autotune.md │ ├── opencl_libsmm.h │ ├── README-bulktune.md │ ├── tune_multiply.sh │ └── Makefile ├── copy │ ├── results │ │ ├── Xeon_Phi-3120 │ │ │ ├── copy.pdf │ │ │ ├── copy.png │ │ │ ├── copyin.dat │ │ │ └── copyout.dat │ │ └── Xeon_Phi-7120 │ │ │ ├── copy.pdf │ │ │ ├── copy.png │ │ │ ├── copyin.dat │ │ │ └── copyout.dat │ ├── plot.sh │ ├── copy.sh │ ├── copy.plt │ ├── Makefile │ ├── copy.cpp │ └── Makefile.inc ├── entropy │ ├── entropy.sh │ ├── Makefile │ └── entropy.c └── test │ ├── test.hpp │ └── Makefile ├── documentation ├── cp2k.pdf ├── libxstream.pdf ├── libxstream.pptx └── cp2k.md ├── .gitignore ├── .travis.yml ├── LICENSE.md ├── README.sh ├── scripts ├── acc_getenv.sh ├── acc_triplets.sh └── acc_opencl.sh ├── include ├── common │ ├── opencl_common.h │ └── opencl_atomics.h └── acc.h ├── Makefile └── src └── acc_opencl_event.c /samples/.gitignore: -------------------------------------------------------------------------------- 1 | *.sln 2 | *.csv -------------------------------------------------------------------------------- /samples/multi-dgemm/.gitignore: -------------------------------------------------------------------------------- 1 | plot.txt -------------------------------------------------------------------------------- /samples/smm/requirements.txt: -------------------------------------------------------------------------------- 1 | wheel 2 | opentuner 3 | 
-------------------------------------------------------------------------------- /samples/smm/.gitignore: -------------------------------------------------------------------------------- 1 | opencl_kernels.h 2 | .with_gpu 3 | -------------------------------------------------------------------------------- /documentation/cp2k.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfp/libxstream/HEAD/documentation/cp2k.pdf -------------------------------------------------------------------------------- /documentation/libxstream.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfp/libxstream/HEAD/documentation/libxstream.pdf -------------------------------------------------------------------------------- /documentation/libxstream.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfp/libxstream/HEAD/documentation/libxstream.pptx -------------------------------------------------------------------------------- /samples/copy/results/Xeon_Phi-3120/copy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfp/libxstream/HEAD/samples/copy/results/Xeon_Phi-3120/copy.pdf -------------------------------------------------------------------------------- /samples/copy/results/Xeon_Phi-3120/copy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfp/libxstream/HEAD/samples/copy/results/Xeon_Phi-3120/copy.png -------------------------------------------------------------------------------- /samples/copy/results/Xeon_Phi-7120/copy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfp/libxstream/HEAD/samples/copy/results/Xeon_Phi-7120/copy.pdf 
-------------------------------------------------------------------------------- /samples/copy/results/Xeon_Phi-7120/copy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfp/libxstream/HEAD/samples/copy/results/Xeon_Phi-7120/copy.png -------------------------------------------------------------------------------- /samples/multi-dgemm/results/Xeon_Phi-3120/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfp/libxstream/HEAD/samples/multi-dgemm/results/Xeon_Phi-3120/plot.png -------------------------------------------------------------------------------- /samples/multi-dgemm/results/Xeon_Phi-7120/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hfp/libxstream/HEAD/samples/multi-dgemm/results/Xeon_Phi-7120/plot.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | My Amplifier* 2 | My Inspector* 3 | bin 4 | lib 5 | build 6 | tmp 7 | samples/*/*.dat 8 | samples/*/*.pdf 9 | samples/*/*.png 10 | *.stackdump 11 | *.opensdf 12 | *.user 13 | *.docx 14 | *.log 15 | *.exe 16 | *.dll 17 | *.ilk 18 | *.sdf 19 | *.pdb 20 | *.suo 21 | *.exp 22 | *.lib 23 | *.obj 24 | *.o -------------------------------------------------------------------------------- /samples/multi-dgemm/multi-dgemm.plt: -------------------------------------------------------------------------------- 1 | set terminal png 2 | set output "plot.png" 3 | set grid xtics lc "grey" 4 | set grid ytics lc "grey" 5 | set xlabel "batch size" 6 | set ylabel "GFLOP/s" 7 | set autoscale fix 8 | set mytics 2 9 | 10 | plot "plot.txt" using 2:3 notitle smooth sbezier lc "grey", \ 11 | "" using 2:3 notitle with points pt 7 12 | 
-------------------------------------------------------------------------------- /samples/multi-dgemm/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #TRY="echo" 4 | 5 | FILE="benchmark.txt" 6 | SIZE=250 7 | BSIZE=16 8 | STRIDE=1 9 | 10 | if [ "" != "$1" ] ; then 11 | SIZE=$1 12 | shift 13 | fi 14 | if [ "" != "$1" ] ; then 15 | BSIZE=$1 16 | shift 17 | fi 18 | if [ "" != "$1" ] ; then 19 | STRIDE=$1 20 | shift 21 | fi 22 | 23 | cat /dev/null > ${FILE} 24 | 25 | BATCH=${STRIDE} 26 | while [[ ${BATCH} -le ${BSIZE} ]] ; do 27 | env CHECK=1 ${TRY} \ 28 | ./multi-dgemm.sh ${SIZE} ${BATCH} $* >> ${FILE} 29 | BATCH=$((BATCH + STRIDE)) 30 | done 31 | -------------------------------------------------------------------------------- /samples/copy/plot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | HERE=$(cd $(dirname $0); pwd -P) 4 | 5 | FILE=copy.pdf 6 | if [[ "" != "$1" ]] ; then 7 | FILE=$1 8 | shift 9 | fi 10 | 11 | if [[ -f /cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot ]] ; then 12 | GNUPLOT=/cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot 13 | elif [[ -f /cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/wgnuplot ]] ; then 14 | GNUPLOT=/cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/wgnuplot 15 | else 16 | GNUPLOT=$(which gnuplot 2> /dev/null) 17 | fi 18 | 19 | if [[ "" != "${GNUPLOT}" ]] ; then 20 | env \ 21 | GDFONTPATH=/cygdrive/c/Windows/Fonts \ 22 | FILENAME=${FILE} \ 23 | "${GNUPLOT}" copy.plt 24 | fi 25 | 26 | -------------------------------------------------------------------------------- /samples/entropy/entropy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | HERE=$(cd $(dirname $0); pwd -P) 4 | 5 | export OFFLOAD_INIT=on_start 6 | export MIC_USE_2MB_BUFFERS=2m 7 | export MIC_ENV_PREFIX=MIC 8 | export MIC_KMP_AFFINITY=balanced,granularity=fine 9 | 10 | if [[ "" != "$(ldd 
${HERE}/entropy | grep libiomp5\.so)" ]] ; then 11 | export KMP_AFFINITY=scatter,granularity=fine,1 12 | else 13 | export OMP_PROC_BIND=TRUE 14 | fi 15 | 16 | if [[ "-test" == "$1" ]] ; then 17 | TESTS=( \ 18 | "16 1 2" \ 19 | "19 1 4" \ 20 | "25 1 13" \ 21 | "45 1 4" \ 22 | ) 23 | else 24 | TESTS=( "$*" ) 25 | fi 26 | 27 | for TEST in "${TESTS[@]}" ; do 28 | ${HERE}/entropy ${TEST} 29 | done 30 | 31 | -------------------------------------------------------------------------------- /samples/multi-dgemm/plot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILE="benchmark.txt" 4 | 5 | grep -A1 "Running " ${FILE} | tr "\n" " " | sed \ 6 | -e "s/Running //g" \ 7 | -e "s/ batche*s* of//g" \ 8 | -e "s/items*... Performance: //g" \ 9 | -e "s/ GFLOPS\/s//g" \ 10 | -e "s/ -- /\n/g" \ 11 | > plot.txt 12 | 13 | if [ "${OS}" != "Windows_NT" ] ; then 14 | gnuplot multi-dgemm.plt 15 | else 16 | export GDFONTPATH=/cygdrive/c/Windows/Fonts 17 | 18 | if [[ -f /cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot ]] ; then 19 | /cygdrive/c/Program\ Files/gnuplot/bin/wgnuplot multi-dgemm.plt 20 | else 21 | /cygdrive/c/Program\ Files\ \(x86\)/gnuplot/bin/wgnuplot multi-dgemm.plt 22 | fi 23 | fi 24 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | 3 | addons: 4 | apt: 5 | packages: 6 | - gfortran 7 | - libblas-dev 8 | - liblapack-dev 9 | 10 | compiler: 11 | - gcc 12 | 13 | script: 14 | - make -e realclean && 15 | make -e PEDANTIC=1 -j 16 | 17 | - cd ${TRAVIS_BUILD_DIR}/samples/copy && 18 | make -e realclean && 19 | make -e PEDANTIC=1 && 20 | ./copy.sh i 2 2 512 21 | 22 | - cd ${TRAVIS_BUILD_DIR}/samples/entropy && 23 | make -e realclean && 24 | make -e PEDANTIC=1 && 25 | ./entropy.sh -test 26 | 27 | - cd ${TRAVIS_BUILD_DIR}/samples/multi-dgemm && 28 | make -e realclean && 29 | make -e
PEDANTIC=1 30 | 31 | - cd ${TRAVIS_BUILD_DIR}/samples/test && 32 | make -e realclean && 33 | make -e PEDANTIC=1 34 | -------------------------------------------------------------------------------- /samples/multi-dgemm/multi-dgemm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | HERE=$(cd $(dirname $0); pwd -P) 4 | 5 | export OFFLOAD_INIT=on_start 6 | export MIC_USE_2MB_BUFFERS=2m 7 | export MIC_ENV_PREFIX=MIC 8 | export MIC_KMP_AFFINITY=balanced,granularity=fine 9 | 10 | if [[ "" != "$(ldd ${HERE}/multi-dgemm | grep libiomp5\.so)" ]] ; then 11 | export KMP_AFFINITY=scatter,granularity=fine,1 12 | else 13 | export OMP_PROC_BIND=TRUE 14 | fi 15 | 16 | if [[ "-test" == "$1" ]] ; then 17 | export CHECK=1 18 | TESTS=( \ 19 | "4 1 1" \ 20 | "10 2 2" \ 21 | "14 2 2" \ 22 | "20 1 2" \ 23 | "40 2 2" \ 24 | "40 1 2" \ 25 | "60 2 4" \ 26 | "128 2 2" \ 27 | ) 28 | else 29 | TESTS=( "$*" ) 30 | fi 31 | 32 | for TEST in "${TESTS[@]}" ; do 33 | ${HERE}/multi-dgemm ${TEST} 34 | done 35 | 36 | -------------------------------------------------------------------------------- /samples/smm/params/README.md: -------------------------------------------------------------------------------- 1 | # Tuned Parameters 2 | 3 | The OpenCL based implementation of LIBSMM supports default kernel-parameters, i.e., kernels can be successfully generated for every requested multiplication/matrix shape (M, N, K) within the definition of a "Small Matrix Multiplication" (maximum M, N, and K). 4 | 5 | Tuned parameters targeting different devices can co-exist and can be embedded into the same executable, i.e., the executable does not depend on a particular build-path or location of parameter-files. 6 | 7 | Parameters are selected by matching against a device-ID with fallback to the "best-matching" parameters.
The device-ID can be based on a vendor-specific function to identify a certain device or is generated from device's name as exposed by the OpenCL API. 8 | 9 | Parameters can be loaded from a CSV-file at runtime (`OPENCL_LIBSMM_SMM_PARAMS` environment variable) and thereby disable matching devices, i.e., parameters loaded this way will take precedence. 10 | -------------------------------------------------------------------------------- /samples/copy/copy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | HERE=$(cd $(dirname $0); pwd -P) 4 | NAME=$(basename ${HERE}) 5 | 6 | if [[ "-mic" != "$1" ]] ; then 7 | if [[ "$1" == "o"* ]] ; then 8 | FILE=copyout.dat 9 | else 10 | FILE=copyin.dat 11 | fi 12 | if [[ "" != "$(ldd ${HERE}/${NAME} | grep libiomp5\.so)" ]] ; then 13 | env OFFLOAD_INIT=on_start \ 14 | KMP_AFFINITY=scatter,granularity=fine,1 \ 15 | MIC_KMP_AFFINITY=scatter,granularity=fine \ 16 | MIC_ENV_PREFIX=MIC \ 17 | ${HERE}/${NAME} $* | \ 18 | tee ${FILE} 19 | else 20 | env \ 21 | OMP_PROC_BIND=TRUE \ 22 | ${HERE}/${NAME} $* | \ 23 | tee ${FILE} 24 | fi 25 | else 26 | shift 27 | if [[ "$1" == "o"* ]] ; then 28 | FILE=copyout.dat 29 | else 30 | FILE=copyin.dat 31 | fi 32 | env \ 33 | SINK_LD_LIBRARY_PATH=$MIC_LD_LIBRARY_PATH \ 34 | micnativeloadex \ 35 | ${HERE}/${NAME} -a "$*" \ 36 | -e "KMP_AFFINITY=scatter,granularity=fine" | \ 37 | tee ${FILE} 38 | fi 39 | 40 | -------------------------------------------------------------------------------- /samples/copy/results/Xeon_Phi-3120/copyin.dat: -------------------------------------------------------------------------------- 1 | 8 Byte x 4096: 0.6 MB/s 2 | 16 Byte x 4096: 1.3 MB/s 3 | 32 Byte x 4096: 2.7 MB/s 4 | 64 Byte x 2048: 5.6 MB/s 5 | 128 Byte x 2048: 11.1 MB/s 6 | 256 Byte x 2048: 22.2 MB/s 7 | 512 Byte x 1024: 44.3 MB/s 8 | 1024 Byte x 1024: 88.9 MB/s 9 | 2048 Byte x 1024: 177.5 MB/s 10 | 4096 Byte x 512: 354.5 MB/s 11 | 8192 Byte x 512: 685.1 MB/s 
12 | 16384 Byte x 512: 1313.8 MB/s 13 | 32768 Byte x 256: 2655.3 MB/s 14 | 65536 Byte x 256: 4137.7 MB/s 15 | 131072 Byte x 256: 5111.7 MB/s 16 | 262144 Byte x 128: 5594.5 MB/s 17 | 524288 Byte x 128: 6044.5 MB/s 18 | 1048576 Byte x 128: 6277.9 MB/s 19 | 2097152 Byte x 64: 6373.6 MB/s 20 | 4194304 Byte x 64: 6369.0 MB/s 21 | 8388608 Byte x 64: 6395.3 MB/s 22 | 16777216 Byte x 32: 6430.7 MB/s 23 | 33554432 Byte x 32: 6443.0 MB/s 24 | 67108864 Byte x 32: 6443.0 MB/s 25 | 134217728 Byte x 16: 6358.7 MB/s 26 | 268435456 Byte x 16: 6363.4 MB/s 27 | 536870912 Byte x 16: 6346.9 MB/s 28 | 1073741824 Byte x 8: 5802.0 MB/s 29 | 30 | Finished after 5 s 31 | max: 6443 MB/s 32 | rgm: 6075 MB/s 33 | avg: 3423 MB/s 34 | -------------------------------------------------------------------------------- /samples/copy/results/Xeon_Phi-3120/copyout.dat: -------------------------------------------------------------------------------- 1 | 8 Byte x 4096: 0.9 MB/s 2 | 16 Byte x 4096: 1.9 MB/s 3 | 32 Byte x 4096: 3.9 MB/s 4 | 64 Byte x 2048: 7.4 MB/s 5 | 128 Byte x 2048: 14.8 MB/s 6 | 256 Byte x 2048: 29.9 MB/s 7 | 512 Byte x 1024: 58.9 MB/s 8 | 1024 Byte x 1024: 117.6 MB/s 9 | 2048 Byte x 1024: 235.5 MB/s 10 | 4096 Byte x 512: 463.5 MB/s 11 | 8192 Byte x 512: 922.7 MB/s 12 | 16384 Byte x 512: 1732.7 MB/s 13 | 32768 Byte x 256: 3247.9 MB/s 14 | 65536 Byte x 256: 4080.6 MB/s 15 | 131072 Byte x 256: 4881.0 MB/s 16 | 262144 Byte x 128: 5408.3 MB/s 17 | 524288 Byte x 128: 5998.7 MB/s 18 | 1048576 Byte x 128: 6272.4 MB/s 19 | 2097152 Byte x 64: 6361.5 MB/s 20 | 4194304 Byte x 64: 6396.8 MB/s 21 | 8388608 Byte x 64: 6476.2 MB/s 22 | 16777216 Byte x 32: 6403.7 MB/s 23 | 33554432 Byte x 32: 6348.1 MB/s 24 | 67108864 Byte x 32: 6208.1 MB/s 25 | 134217728 Byte x 16: 6205.0 MB/s 26 | 268435456 Byte x 16: 5596.0 MB/s 27 | 536870912 Byte x 16: 5110.8 MB/s 28 | 1073741824 Byte x 8: 5235.0 MB/s 29 | 30 | Finished after 5 s 31 | max: 6476 MB/s 32 | rgm: 5363 MB/s 33 | avg: 3351 MB/s 34 | 
-------------------------------------------------------------------------------- /samples/copy/results/Xeon_Phi-7120/copyin.dat: -------------------------------------------------------------------------------- 1 | 8 Byte x 4096: 0.5 MB/s 2 | 16 Byte x 4096: 1.1 MB/s 3 | 32 Byte x 4096: 2.6 MB/s 4 | 64 Byte x 2048: 6.5 MB/s 5 | 128 Byte x 2048: 13.2 MB/s 6 | 256 Byte x 2048: 26.1 MB/s 7 | 512 Byte x 1024: 52.3 MB/s 8 | 1024 Byte x 1024: 104.0 MB/s 9 | 2048 Byte x 1024: 214.1 MB/s 10 | 4096 Byte x 512: 417.6 MB/s 11 | 8192 Byte x 512: 748.6 MB/s 12 | 16384 Byte x 512: 1254.1 MB/s 13 | 32768 Byte x 256: 2278.6 MB/s 14 | 65536 Byte x 256: 3625.5 MB/s 15 | 131072 Byte x 256: 4842.6 MB/s 16 | 262144 Byte x 128: 5371.9 MB/s 17 | 524288 Byte x 128: 5899.2 MB/s 18 | 1048576 Byte x 128: 6158.0 MB/s 19 | 2097152 Byte x 64: 6242.3 MB/s 20 | 4194304 Byte x 64: 6075.6 MB/s 21 | 8388608 Byte x 64: 6252.4 MB/s 22 | 16777216 Byte x 32: 6309.3 MB/s 23 | 33554432 Byte x 32: 6367.3 MB/s 24 | 67108864 Byte x 32: 6400.4 MB/s 25 | 134217728 Byte x 16: 6395.7 MB/s 26 | 268435456 Byte x 16: 6385.2 MB/s 27 | 536870912 Byte x 16: 6375.3 MB/s 28 | 1073741824 Byte x 8: 5477.8 MB/s 29 | 2147483648 Byte x 8: 5613.5 MB/s 30 | 31 | Finished after 8 s 32 | max: 6400 MB/s 33 | rgm: 5760 MB/s 34 | avg: 3411 MB/s 35 | -------------------------------------------------------------------------------- /samples/copy/results/Xeon_Phi-7120/copyout.dat: -------------------------------------------------------------------------------- 1 | 8 Byte x 4096: 0.7 MB/s 2 | 16 Byte x 4096: 1.3 MB/s 3 | 32 Byte x 4096: 2.6 MB/s 4 | 64 Byte x 2048: 6.9 MB/s 5 | 128 Byte x 2048: 13.4 MB/s 6 | 256 Byte x 2048: 26.8 MB/s 7 | 512 Byte x 1024: 52.3 MB/s 8 | 1024 Byte x 1024: 108.2 MB/s 9 | 2048 Byte x 1024: 213.4 MB/s 10 | 4096 Byte x 512: 428.9 MB/s 11 | 8192 Byte x 512: 823.5 MB/s 12 | 16384 Byte x 512: 1236.1 MB/s 13 | 32768 Byte x 256: 2491.4 MB/s 14 | 65536 Byte x 256: 4086.8 MB/s 15 | 131072 Byte x 256: 5029.7 MB/s 16 | 
262144 Byte x 128: 5593.3 MB/s 17 | 524288 Byte x 128: 6069.4 MB/s 18 | 1048576 Byte x 128: 6329.1 MB/s 19 | 2097152 Byte x 64: 6364.0 MB/s 20 | 4194304 Byte x 64: 6139.2 MB/s 21 | 8388608 Byte x 64: 6297.7 MB/s 22 | 16777216 Byte x 32: 6292.9 MB/s 23 | 33554432 Byte x 32: 6350.2 MB/s 24 | 67108864 Byte x 32: 6383.2 MB/s 25 | 134217728 Byte x 16: 6230.0 MB/s 26 | 268435456 Byte x 16: 5423.2 MB/s 27 | 536870912 Byte x 16: 4958.8 MB/s 28 | 1073741824 Byte x 8: 5153.0 MB/s 29 | 2147483648 Byte x 8: 5131.9 MB/s 30 | 31 | Finished after 9 s 32 | max: 6383 MB/s 33 | rgm: 5198 MB/s 34 | avg: 3353 MB/s 35 | -------------------------------------------------------------------------------- /samples/copy/copy.plt: -------------------------------------------------------------------------------- 1 | FILENAME = system("sh -c \"echo ${FILENAME}\"") 2 | if (FILENAME eq "") { 3 | FILENAME = "copy.pdf" 4 | } 5 | FILEEXT = system("sh -c \"echo ".FILENAME." | sed 's/.\\+\\.\\(.\\+\\)/\\1/'\"") 6 | FILTER = "sed -n 's/\\(.\\+\\) Byte x \\(.\\+\\): \\(.\\+\\) MB\\/s/\\1 \\2 \\3/p'" 7 | 8 | set output FILENAME 9 | set terminal FILEEXT 10 | set termoption enhanced 11 | #set termoption font "Times-Roman,7" 12 | save_encoding = GPVAL_ENCODING 13 | set encoding utf8 14 | 15 | set grid xtics lc "grey" 16 | set grid ytics lc "grey" 17 | set xlabel "data size [MB]" 18 | set ylabel "bandwidth [MB/s]" 19 | set autoscale fix 20 | set mytics 2 21 | set logscale x 22 | set format x "%g" 23 | 24 | plot "<(sh -c \"".FILTER." copyin.dat\")" using ($1/(1024*1024)):3 notitle smooth sbezier lc 1, \ 25 | (1/0) title "copy-in" with points pt 7 ps 1 lc 1, \ 26 | "" using ($1/(1024*1024)):3 notitle with points pt 7 ps 0.3 lc 1, \ 27 | "<(sh -c \"".FILTER." 
copyout.dat\")" using ($1/(1024*1024)):3 notitle smooth sbezier lc 2, \ 28 | (1/0) title "copy-out" with points pt 7 ps 1 lc 2, \ 29 | "" using ($1/(1024*1024)):3 notitle with points pt 7 ps 0.3 lc 2 30 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # BSD 3-Clause License 2 | 3 | Copyright (c) 2009-2025, Intel Corporation 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | HERE=$(cd $(dirname $0); pwd -P) 4 | 5 | # output directory 6 | if [[ "" != "$1" ]] ; then 7 | DOCDIR=$1 8 | shift 9 | else 10 | DOCDIR=documentation 11 | fi 12 | 13 | # temporary file 14 | TEMPLATE=$(mktemp --tmpdir=. --suffix=.tex) 15 | 16 | # dump pandoc template for latex 17 | pandoc -D latex > ${TEMPLATE} 18 | 19 | # adjust the template 20 | sed -i \ 21 | -e 's/\(\\documentclass\[.\+\]{.\+}\)/\1\n\\pagenumbering{gobble}\n\\RedeclareSectionCommands[beforeskip=-1pt,afterskip=1pt]{subsection,subsubsection}/' \ 22 | -e 's/\\usepackage{listings}/\\usepackage{listings}\\lstset{basicstyle=\\footnotesize\\ttfamily}/' \ 23 | ${TEMPLATE} 24 | 25 | # cleanup markup and pipe into pandoc using the template 26 | # LIBXSTREAM documentation 27 | sed \ 28 | -e 's/https:\/\/raw\.githubusercontent\.com\/hfp\/libxstream\/master\///' \ 29 | -e 's/\[\[.\+\](.\+)\]//' \ 30 | -e '/!\[.\+\](.\+)/{n;d}' \ 31 | README.md | tee >( \ 32 | pandoc \ 33 | --latex-engine=xelatex \ 34 | --template=${TEMPLATE} --listings \ 35 | -f markdown_github+implicit_figures \ 36 | -V documentclass=scrartcl \ 37 | -V title-meta="LIBXSTREAM Documentation" \ 38 | -V author-meta="Hans Pabst" \ 39 | -V classoption=DIV=45 \ 40 | -V linkcolor=black \ 41 | -V citecolor=black \ 42 
| -V urlcolor=black \ 43 | -o ${DOCDIR}/libxstream.pdf) | \ 44 | pandoc \ 45 | -f markdown_github+implicit_figures \ 46 | -o ${DOCDIR}/libxstream.docx 47 | 48 | # cleanup markup and pipe into pandoc using the template 49 | # CP2K recipe 50 | sed \ 51 | -e 's/https:\/\/raw\.githubusercontent\.com\/hfp\/libxstream\/master\///' \ 52 | -e 's/\[\[.\+\](.\+)\]//' \ 53 | -e '/!\[.\+\](.\+)/{n;d}' \ 54 | ${HERE}/documentation/cp2k.md | tee >( \ 55 | pandoc \ 56 | --latex-engine=xelatex \ 57 | --template=${TEMPLATE} --listings \ 58 | -f markdown_github+implicit_figures \ 59 | -V documentclass=scrartcl \ 60 | -V title-meta="CP2K with LIBXSTREAM" \ 61 | -V author-meta="Hans Pabst" \ 62 | -V classoption=DIV=45 \ 63 | -V linkcolor=black \ 64 | -V citecolor=black \ 65 | -V urlcolor=black \ 66 | -o ${DOCDIR}/cp2k.pdf) | \ 67 | pandoc \ 68 | -f markdown_github+implicit_figures \ 69 | -o ${DOCDIR}/cp2k.docx 70 | 71 | # remove temporary file 72 | rm ${TEMPLATE} 73 | -------------------------------------------------------------------------------- /scripts/acc_getenv.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #################################################################################################### 3 | # Copyright (C) by the DBCSR developers group - All rights reserved # 4 | # This file is part of the DBCSR library. # 5 | # # 6 | # For information on the license, see the LICENSE file. # 7 | # For further information please visit https://dbcsr.cp2k.org # 8 | # SPDX-License-Identifier: BSD-3-Clause # 9 | #################################################################################################### 10 | 11 | FIND=$(command -v find) 12 | SORT=$(command -v sort) 13 | SED=$(command -v gsed) 14 | 15 | # GNU sed is desired (macOS) 16 | if [ ! 
"${SED}" ]; then 17 | SED=$(command -v sed) 18 | fi 19 | 20 | HERE="$(cd "$(dirname "$0")" && pwd -P)" 21 | SRC="${HERE}" 22 | EXT="c" 23 | 24 | if [ "${FIND}" ] && [ "${SORT}" ] && [ "${SED}" ] && [ -d "${SRC}" ]; then 25 | export LC_ALL=C 26 | ENVARS="$(${FIND} "${SRC}" -type f -name "*.${EXT}" -exec \ 27 | "${SED}" "s/getenv[[:space:]]*([[:space:]]*\".[^\"]*/\n&/g" {} \; | \ 28 | "${SED}" -n "s/.*getenv[[:space:]]*([[:space:]]*\"\(.[^\"]*\)..*/\1/p" | \ 29 | ${SORT} -u)" 30 | OTHERS=$(echo "${ENVARS}" | ${SED} "/ACC_OPENCL_/d;/OPENCL_LIBSMM_/d") 31 | if [ "${OTHERS}" ]; then 32 | echo "====================================" 33 | echo "Other environment variables" 34 | echo "====================================" 35 | echo "${ENVARS}" | ${SED} "/ACC_OPENCL_/d;/OPENCL_LIBSMM_/d" 36 | fi 37 | echo "====================================" 38 | echo "OpenCL Backend environment variables" 39 | echo "====================================" 40 | echo "${ENVARS}" | ${SED} -n "/ACC_OPENCL_/p" 41 | echo "====================================" 42 | echo "OpenCL LIBSMM environment variables" 43 | echo "====================================" 44 | echo "${ENVARS}" | ${SED} -n "/OPENCL_LIBSMM_/p" 45 | else 46 | >&2 echo "Error: missing prerequisites!" 47 | exit 1 48 | fi 49 | -------------------------------------------------------------------------------- /include/common/opencl_common.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------------------------*/ 2 | /* Copyright (C) by the DBCSR developers group - All rights reserved */ 3 | /* This file is part of the DBCSR library. */ 4 | /* */ 5 | /* For information on the license, see the LICENSE file. 
*/ 6 | /* For further information please visit https://dbcsr.cp2k.org */ 7 | /* SPDX-License-Identifier: BSD-3-Clause */ 8 | /*------------------------------------------------------------------------------------------------*/ 9 | #ifndef OPENCL_COMMON_H 10 | #define OPENCL_COMMON_H 11 | 12 | #if !defined(ACC_OPENCL_C_VERSION) 13 | # define ACC_OPENCL_C_VERSION __OPENCL_C_VERSION__ 14 | #endif 15 | #if !defined(ACC_OPENCL_VERSION) 16 | # define ACC_OPENCL_VERSION __OPENCL_VERSION__ 17 | #endif 18 | 19 | #if (200 /*CL_VERSION_2_0*/ <= ACC_OPENCL_C_VERSION) || defined(__NV_CL_C_VERSION) 20 | # define UNROLL_FORCE(N) __attribute__((opencl_unroll_hint(N))) 21 | # define UNROLL_AUTO __attribute__((opencl_unroll_hint)) 22 | #else 23 | # define UNROLL_FORCE(N) 24 | # define UNROLL_AUTO 25 | #endif 26 | 27 | #if !defined(LU) || (-1 == LU) 28 | # define UNROLL_OUTER(N) 29 | # define UNROLL(N) 30 | #else /* (-2) full, (-1) no hints, (0) inner, (1) outer-dehint, (2) block-m */ 31 | # if (1 <= LU) /* outer-dehint */ 32 | # define UNROLL_OUTER(N) UNROLL_FORCE(1) 33 | # elif (-1 > LU) /* full */ 34 | # define UNROLL_OUTER(N) UNROLL_FORCE(N) 35 | # else /* inner */ 36 | # define UNROLL_OUTER(N) 37 | # endif 38 | # define UNROLL(N) UNROLL_FORCE(N) 39 | #endif 40 | 41 | #define BCST_NO(V, I) (V) 42 | #if defined(WG) && (0 < WG) && defined(GPU) && (200 <= ACC_OPENCL_VERSION) 43 | # define BCST_WG(V, I) work_group_broadcast(V, I) 44 | #endif 45 | #if defined(SG) && (0 < SG) && defined(GPU) && (200 <= ACC_OPENCL_VERSION) 46 | # define BCST_SG(V, I) sub_group_broadcast(V, I) 47 | #endif 48 | 49 | #if !defined(MIN) 50 | # define MIN(A, B) ((A) < (B) ? (A) : (B)) 51 | #endif 52 | #if !defined(MAX) 53 | # define MAX(A, B) ((A) < (B) ? 
(B) : (A)) 54 | #endif 55 | #if !defined(MAD) 56 | # define MAD fma 57 | #endif 58 | 59 | #define DIVUP(A, B) (((A) + (B) - 1) / (B)) 60 | #define NUP(N, UP) (DIVUP(N, UP) * (UP)) 61 | #define BLR(N, BN) (NUP(N, BN) - (N)) 62 | 63 | #define IDX(I, J, M, N) ((int)(I) * (N) + (J)) 64 | #define IDT(I, J, M, N) IDX(J, I, N, M) 65 | 66 | #endif /*OPENCL_COMMON_H*/ 67 | -------------------------------------------------------------------------------- /samples/smm/kernels/transpose.cl: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------------------------*/ 2 | /* Copyright (C) by the DBCSR developers group - All rights reserved */ 3 | /* This file is part of the DBCSR library. */ 4 | /* */ 5 | /* For information on the license, see the LICENSE file. */ 6 | /* For further information please visit https://dbcsr.cp2k.org */ 7 | /* SPDX-License-Identifier: BSD-3-Clause */ 8 | /*------------------------------------------------------------------------------------------------*/ 9 | 10 | __attribute__((reqd_work_group_size(WG, 1, 1))) kernel void FN( 11 | int trs_offset, CONSTANT const int* restrict trs_stack, global T* restrict matrix) { 12 | /* offset in the transpose-stack that this block ID should handle */ 13 | const int offset = trs_stack[trs_offset + get_group_id(0)]; 14 | /* matrix according to the index (transpose-stack) */ 15 | global T* const restrict mat = matrix + offset; 16 | const int idx = get_local_id(0); 17 | #if (SM != SN) || (0 == INPLACE) 18 | /* local memory buffer */ 19 | local T buf[SM][SN]; 20 | #endif 21 | #if (WG == SM) 22 | const int m = idx; 23 | # if (SM != SN) || (0 == INPLACE) 24 | /* copy matrix elements into local buffer */ 25 | for (int n = 0; n < SN; ++n) buf[m][n] = mat[SM * n + m]; 26 | barrier(CLK_LOCAL_MEM_FENCE); 27 | /* overwrite matrix elements (gather) */ 28 | for (int n = 0; n < SN; ++n) mat[SN * m + n] = buf[m][n]; 29 | # 
else 30 | for (int n = 0; n < m; ++n) { 31 | const int i = SM * n + m; 32 | const int j = SN * m + n; 33 | const T tmp = mat[i]; 34 | mat[i] = mat[j]; 35 | mat[j] = tmp; 36 | } 37 | # endif 38 | #else 39 | T prv[SN]; /* private buffer */ 40 | # if (SM != SN) || (0 == INPLACE) 41 | /* copy matrix elements into local buffer */ 42 | for (int m = idx; m < SM; m += WG) { 43 | for (int n = 0; n < SN; ++n) buf[m][n] = mat[SM * n + m]; 44 | } 45 | barrier(CLK_LOCAL_MEM_FENCE); 46 | # endif 47 | for (int m = idx; m < SM; m += WG) { 48 | # if (SM != SN) || (0 == INPLACE) 49 | for (int n = 0; n < SN; ++n) prv[n] = buf[m][n]; 50 | /* overwrite matrix elements (gather) */ 51 | for (int n = 0; n < SN; ++n) mat[SN * m + n] = prv[n]; 52 | # else 53 | for (int n = 0; n < SN; ++n) prv[n] = mat[SM * n + m]; 54 | for (int n = 0; n < m; ++n) { 55 | const int i = SM * n + m; 56 | const int j = SN * m + n; 57 | mat[i] = mat[j]; 58 | mat[j] = prv[n]; 59 | } 60 | # endif 61 | } 62 | #endif 63 | } 64 | -------------------------------------------------------------------------------- /samples/test/test.hpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Copyright (c) 2014-2016, Intel Corporation ** 3 | ** All rights reserved. ** 4 | ** ** 5 | ** Redistribution and use in source and binary forms, with or without ** 6 | ** modification, are permitted provided that the following conditions ** 7 | ** are met: ** 8 | ** 1. Redistributions of source code must retain the above copyright ** 9 | ** notice, this list of conditions and the following disclaimer. ** 10 | ** 2. Redistributions in binary form must reproduce the above copyright ** 11 | ** notice, this list of conditions and the following disclaimer in the ** 12 | ** documentation and/or other materials provided with the distribution. ** 13 | ** 3. 
Neither the name of the copyright holder nor the names of its ** 14 | ** contributors may be used to endorse or promote products derived ** 15 | ** from this software without specific prior written permission. ** 16 | ** ** 17 | ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ** 18 | ** "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ** 19 | ** LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ** 20 | ** A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ** 21 | ** HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ** 22 | ** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ** 23 | ** TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ** 24 | ** PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ** 25 | ** LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ** 26 | ** NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ** 27 | ** SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ** 28 | ******************************************************************************/ 29 | /* Hans Pabst (Intel Corp.) 
// Test helper owning a set of raw handles: one stream, one event, one host
// pointer, and two device pointers. Construction takes a device number; what the
// constructor and destructor do with the handles is implemented elsewhere
// (presumably acquire/release; confirm against the matching source file).
class test_type {
public:
  // device: device number handed to the implementation (valid range is not visible here).
  test_type(int device);
  ~test_type();

private:
  // Raw pointers; ownership and lifetime are handled by the out-of-line
  // constructor/destructor, which are not visible in this header.
  libxstream_stream* m_stream;
  libxstream_event* m_event;
  void* m_host_mem;
  void* m_dev_mem1;
  void* m_dev_mem2;
};
9 | * Auto-tuning kernels within an acceptable time limit along with handy scripts to retune parameters and to carry forward an existing set (new GPU). 10 | 11 | Runtime settings are made by means of environment variables. The OpenCL backend provides `acc_getenv.sh` to list all occurrences of `getenv` categorized into "OpenCL Backend environment variables" and "OpenCL LIBSMM environment variables". Common backend related settings are: 12 | 13 | * `ACC_OPENCL_DEVSPLIT`: integer enabling devices to be split into subdevices (non-zero/default: subdevices, zero: aggregated). 14 | * `ACC_OPENCL_DEVTYPE`: character string selecting "cpu", "gpu", "all" (unfiltered), or any other string (neither CPU nor GPU). 15 | * `ACC_OPENCL_DEVICE`: non-negative integer number to select a device from the (internally enumerated) list of devices. 16 | * `ACC_OPENCL_VENDOR`: character string matching the vendor of the OpenCL device in a case-insensitive fashion, e.g., "intel". 17 | * `ACC_OPENCL_VERBOSE`: verbosity level (integer) with console output on `stderr`. 18 | * `ACC_OPENCL_VERBOSE=1`: outputs the number of devices found and the name of the selected device. 19 | * `ACC_OPENCL_VERBOSE=2`: outputs the duration needed to generate a requested kernel. 20 | * `ACC_OPENCL_VERBOSE=3`: outputs device-side performance of kernels (every launch profiled). 21 | * `ACC_OPENCL_DUMP`: dump preprocessed kernel source code (1) or dump compiled OpenCL kernels (2). 22 | * `ACC_OPENCL_DUMP=1`: dump preprocessed kernel source code and use it for JIT compilation. Instantiates the original source code using preprocessor definitions (`-D`) and collapses the code accordingly. 23 | * `ACC_OPENCL_DUMP=2`: dump compiled OpenCL kernels (depends on OpenCL implementation), e.g., PTX code on Nvidia. 24 | 25 | The OpenCL backend enumerates and orders devices by kind, i.e., GPU, CPU, and "other" (primary criterion) and by memory capacity (secondary criterion). 
Device IDs are zero-based as defined by the ACC interface (and less than what is permitted by `acc_get_ndevices`). 26 | -------------------------------------------------------------------------------- /documentation/cp2k.md: -------------------------------------------------------------------------------- 1 | # CP2K Open Source Molecular Dynamics 2 | This document focuses on offloading CP2K's DBCSR small matrix multiplication to the Intel Xeon Phi Coprocessor, and this document is accompanying the recipe for building and running CP2K's "intel" branch as described in: 3 | > https://github.com/hfp/libxsmm/raw/master/documentation/cp2k.pdf. 4 | 5 | ## Getting and Building the Source Code 6 | Please read the above mentioned document entirely, and use one of the recommended compiler versions: 7 | * Intel Compiler 15.0.3.187 (Build 20150407) 8 | * Intel Compiler 16.0.0.109 (Build 20150815) 9 | 10 | For Intel MPI, usually any version is fine. 11 | 12 | ``` 13 | git clone --branch intel https://github.com/cp2k/cp2k.git cp2k.git 14 | ln -s cp2k.git/cp2k cp2k 15 | source /opt/intel/composer_xe_2015.3.187/bin/compilervars.sh intel64 16 | source /opt/intel/impi/5.1.0.069/intel64/bin/mpivars.sh 17 | cd cp2k/makefiles 18 | make ARCH=Linux-x86-64-intel VERSION=popt ACC=1 -j 19 | ``` 20 | 21 | It is recommended to rely on a non-SMP build ("popt", "sopt") where "popt" makes the most sense in order to partition the coprocessor according to the number of ranks on the host system. Please note that although the host may be only MPI-parallelized, the coprocessor uses OpenMP within each partition formed by a host-rank. 22 | 23 | ## Running the Application 24 | To improve scalability, the coprocessor can be partitioned by the MPI-ranks launched on the host system with each rank offloading work independently. This is solely achieved by setting a number of environment variables helping to place and pin the threads on the coprocessor. 
In order to ease this step, one may employ scripts as found at: 25 | > https://github.com/hfp/mpirun. 26 | 27 | Although the script can take a list of nodes, it may not be suitable for launching on a larger set of cluster nodes due to building an excessively large command line (rather than relying on the "host file"). 28 | 29 | In the example below, the argument "-p8" could be very suitable for a single socket of a 16-core system with a single 3-series coprocessor (57 cores) attached to the first socket. The number of cores (total number minus one core for the uOS) may be divisible by the number of ranks per host-socket. It is usually preferable to minimize the number of remaining cores on the coprocessor since the current implementation in the CP2K/intel branch is leaving the host processor(s) unutilized (aside from offloading and transferring the work). 30 | 31 | ``` 32 | wget https://raw.githubusercontent.com/hfp/mpirun/master/mpirun.sh 33 | wget https://raw.githubusercontent.com/hfp/mpirun/master/mpirun.py 34 | chmod +x mpirun.* 35 | mpirun.sh -p8 -x exe/Linux-x86-64-intel/cp2k.popt workload.inp 36 | ``` 37 | 38 | For an actual workload, one may try the following: 39 | 40 | ``` 41 | &GLOBAL 42 | PRINT_LEVEL MEDIUM 43 | PROGRAM_NAME TEST 44 | RUN_TYPE NONE 45 | &TIMINGS 46 | THRESHOLD 0.00001 47 | &END 48 | &END GLOBAL 49 | &TEST 50 | &CP_DBCSR 51 | K 6440 52 | M 6440 53 | N 6440 54 | TRANSA TRUE 55 | TRANSB FALSE 56 | N_LOOP 4 57 | ASPARSITY 0.0001 58 | BSPARSITY 0.0001 59 | CSPARSITY 0.0001 60 | bs_m 1 23 61 | bs_n 1 23 62 | bs_k 1 23 63 | KEEPSPARSE .FALSE. 
#ifndef DBCSR_ACC_BENCH_H
#define DBCSR_ACC_BENCH_H

#include <stdlib.h> /* rand, NULL */
#include <assert.h> /* assert */

#if !defined(MIN)
# define MIN(A, B) ((A) < (B) ? (A) : (B))
#endif
#if !defined(MAX)
# define MAX(A, B) ((B) < (A) ? (A) : (B))
#endif

/* Only provide INLINE if the user did not: a definition supplied up-front
 * (e.g., on the compiler's command line) is kept instead of being redefined.
 */
#if !defined(INLINE)
# if defined(__cplusplus) || (defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__) /*C99*/)
#  define INLINE inline
# else
#  define INLINE
# endif
#endif

#if !defined(MAX_KERNEL_DIM)
# define MAX_KERNEL_DIM 80
#endif

/**
 * Fills M*N elements of MAT (converted to ELEM_TYPE*) with
 * MAT[k] = (SCALE * SEED + SCALE) * (k + 1), where k = i * M + j
 * runs over all linear indexes (i in [0, N), j in [0, M)).
 */
#define INIT_MAT(ELEM_TYPE, SEED, MAT, M, N, SCALE) \
  do { \
    const double init_mat_seed1_ = (SCALE) * (SEED) + (SCALE); \
    int init_mat_i_, init_mat_j_; \
    for (init_mat_i_ = 0; init_mat_i_ < (N); ++init_mat_i_) { \
      for (init_mat_j_ = 0; init_mat_j_ < (M); ++init_mat_j_) { \
        const int init_mat_k_ = init_mat_i_ * (M) + init_mat_j_; \
        ((ELEM_TYPE*)(MAT))[init_mat_k_] = (ELEM_TYPE)(init_mat_seed1_ * (init_mat_k_ + 1)); \
      } \
    } \
  } while (0)


/**
 * Artificial stack-setup for DBCSR/ACC benchmarks.
 * The arguments rnd and rnd_size optionally allow
 * to supply an array of (pseudo-)random-numbers.
 *
 * stack:      receives three one-based indexes (A, B, C) per entry, i.e.,
 *             3 * stack_size integers are written.
 * stack_size: number of entries (triples) to generate.
 * rnd_size,
 * rnd:        optional random numbers (read modulo rnd_size); rand() is used
 *             if rnd is NULL or rnd_size is not positive.
 * mn, mk, kn: element counts used to scale the C-, A-, and B-index.
 * nc:         number of C-matrices; must satisfy 0 < nc <= stack_size.
 * na, nb:     number of A- and B-matrices (random values are taken modulo na/nb).
 */
static INLINE void init_stack(
  int* stack, int stack_size, int rnd_size, const int* rnd, int mn, int mk, int kn, int nc, int na, int nb) {
  int navg, nimb, i = 0, c = 0, ntop = 0;
  /* validate nc before it is used as divisor */
  assert(0 < nc && nc <= stack_size);
  /* navg matrix products are accumulated into a C-matrix */
  navg = stack_size / nc;
  nimb = MAX(1, navg - 4); /* imbalance */
  while (i < stack_size) {
    const int r = ((NULL == rnd || 0 >= rnd_size) ? rand() : rnd[i % rnd_size]), next = c + 1;
    ntop += navg + (r % (2 * nimb) - nimb);
    /* guarantee progress: a step of zero (or less) would repeat the same i
     * and hence the same rnd[i % rnd_size] forever (e.g., navg equal to one)
     */
    if (ntop <= i) ntop = i + 1;
    if (stack_size < ntop) ntop = stack_size;
    for (; i < ntop; ++i) { /* setup one-based indexes */
      int a, b;
      if (NULL != rnd && 0 < rnd_size) {
        a = rnd[(2 * i + 0) % rnd_size] % na;
        b = rnd[(2 * i + 1) % rnd_size] % nb;
      }
      else {
        a = rand() % na;
        b = rand() % nb;
      }
      *stack++ = a * mk + 1; /* A-index */
      *stack++ = b * kn + 1; /* B-index */
      *stack++ = c * mn + 1; /* C-index */
    }
    /* advance to the next C-matrix; the last one absorbs any remainder */
    if (next < nc) c = next;
  }
}

#endif /*DBCSR_ACC_BENCH_H*/
#ifndef DBCSR_ACC_H
#define DBCSR_ACC_H

#include <stddef.h> /* size_t */

/** two-level expansion: the argument is macro-expanded before it is stringified/pasted */
#define DBCSR_STRINGIFY_AUX(SYMBOL) #SYMBOL
#define DBCSR_STRINGIFY(SYMBOL) DBCSR_STRINGIFY_AUX(SYMBOL)
#define DBCSR_CONCATENATE2(A, B) A##B
#define DBCSR_CONCATENATE(A, B) DBCSR_CONCATENATE2(A, B)

/** used to mark variables used */
#define DBCSR_MARK_USED(x) (void)(x)

#if defined(__cplusplus)
extern "C" {
#endif

/**
 * C declarations of the ACC interface. All functions below return int;
 * NOTE(review): presumably zero denotes success - confirm with an implementation.
 */

/** types */
typedef int c_dbcsr_acc_bool_t;

/** initialization and finalization */
int c_dbcsr_acc_init(void);
int c_dbcsr_acc_finalize(void);
void c_dbcsr_acc_clear_errors(void);

/** devices */
int c_dbcsr_acc_get_ndevices(int* ndevices);
int c_dbcsr_acc_set_active_device(int device_id);
int c_dbcsr_acc_device_synchronize(void);

/** streams */
int c_dbcsr_acc_stream_priority_range(int* least, int* greatest);
int c_dbcsr_acc_stream_create(void** stream_p, const char* name,
  /** lower number is higher priority */
  int priority);
int c_dbcsr_acc_stream_destroy(void* stream);
int c_dbcsr_acc_stream_sync(void* stream);
int c_dbcsr_acc_stream_wait_event(void* stream, void* event);

/** events */
int c_dbcsr_acc_event_create(void** event_p);
int c_dbcsr_acc_event_destroy(void* event);
int c_dbcsr_acc_event_record(void* event, void* stream);
int c_dbcsr_acc_event_query(void* event, c_dbcsr_acc_bool_t* has_occurred);
int c_dbcsr_acc_event_synchronize(void* event);

/** memory */
int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes);
int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem);
int c_dbcsr_acc_dev_mem_set_ptr(void** dev_mem, void* other, size_t lb);
int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream);
int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream);
int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, void* stream);
int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, void* stream);
int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbytes, void* stream);
int c_dbcsr_acc_memset_zero(void* dev_mem, size_t offset, size_t nbytes, void* stream);
int c_dbcsr_acc_dev_mem_info(size_t* mem_free, size_t* mem_total);

/** timing hooks; routineN is passed as pointer to a string pointer along with a pointer to its length */
void c_dbcsr_timeset(const char** routineN, const int* routineN_len, int* handle);
void c_dbcsr_timestop(const int* handle);

#if defined(__cplusplus)
}
#endif

#endif /*DBCSR_ACC_H*/
#ifndef DBCSR_ACC_LIBSMM_H
#define DBCSR_ACC_LIBSMM_H

#include "acc.h"

/* Maps a C type name (double, float) to the matching libsmm_acc_data_t
 * enumerator by pasting the token onto the DBCSR_TYPE_ prefix.
 */
#define DBCSR_TYPE(T) DBCSR_CONCATENATE(DBCSR_TYPE_, T)
#define DBCSR_TYPE_double dbcsr_type_real_8
#define DBCSR_TYPE_float dbcsr_type_real_4

/* Accessors for the routine names defined at the end of this header:
 * *_STRPTR yields the address of the name pointer as const char** (the cast
 * goes through uintptr_t and drops the pointer's own const qualifier),
 * *_LENPTR the address of the name length, and *_STR the character array.
 * NOTE(review): uintptr_t requires <stdint.h>, which is not included by this
 * header - users of *_STRPTR have to ensure it is available.
 * NOTE(review): the shapes match the arguments of c_dbcsr_timeset (acc.h);
 * presumably that is the intended consumer - confirm at the call sites.
 */
#define LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_STRPTR ((const char**)((uintptr_t)&libsmm_acc_transpose_routine_name_ptr))
#define LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_LENPTR (&libsmm_acc_transpose_routine_name_len)
#define LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_STR (libsmm_acc_transpose_routine_name_str)

#define LIBSMM_ACC_PROCESS_ROUTINE_NAME_STRPTR ((const char**)((uintptr_t)&libsmm_acc_process_routine_name_ptr))
#define LIBSMM_ACC_PROCESS_ROUTINE_NAME_LENPTR (&libsmm_acc_process_routine_name_len)
#define LIBSMM_ACC_PROCESS_ROUTINE_NAME_STR (libsmm_acc_process_routine_name_str)


#if defined(__cplusplus)
extern "C" {
#endif

/* Element types; the numeric values are fixed (1, 3, 5, 7) and must not be renumbered. */
typedef enum libsmm_acc_data_t {
  dbcsr_type_real_4 = 1,
  dbcsr_type_real_8 = 3,
  dbcsr_type_complex_4 = 5,
  dbcsr_type_complex_8 = 7
} libsmm_acc_data_t;

int libsmm_acc_init(void);
int libsmm_acc_finalize(void);
c_dbcsr_acc_bool_t libsmm_acc_is_thread_safe(void);

/* Transposes matrices of dev_data as listed in dev_trs_stack (stack_size entries beginning at offset). */
int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, void* dev_data, libsmm_acc_data_t datatype, int m,
  int n, int max_kernel_dim, void* stream);

/* Processes a parameter stack, which is passed as host copy and as device copy. */
int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, int stack_size, libsmm_acc_data_t datatype,
  const void* dev_a_data, const void* dev_b_data, void* dev_c_data, int m_max, int n_max, int k_max, int max_kernel_dim,
  c_dbcsr_acc_bool_t def_mnk, void* stack_stream, void* c_stream);

int c_calculate_norms(const double* mat, int nblks, const int* offsets, const int* nelems, float* norms, void* stream_ptr);

/* Routine names: string, pointer to the string, and length (sizeof minus one excludes the terminating NUL). */
static const char libsmm_acc_transpose_routine_name_str[] = "jit_kernel_transpose";
static const char* const libsmm_acc_transpose_routine_name_ptr = libsmm_acc_transpose_routine_name_str;
static const int libsmm_acc_transpose_routine_name_len = (int)sizeof(libsmm_acc_transpose_routine_name_str) - 1;

static const char libsmm_acc_process_routine_name_str[] = "jit_kernel_multiply";
static const char* const libsmm_acc_process_routine_name_ptr = libsmm_acc_process_routine_name_str;
static const int libsmm_acc_process_routine_name_len = (int)sizeof(libsmm_acc_process_routine_name_str) - 1;

#if defined(__cplusplus)
}
#endif

#endif /*DBCSR_ACC_LIBSMM_H*/
The OpenCL backend provides `acc_getenv.sh` to list all occurrences of `getenv` categorized into "OpenCL Backend environment variables" and "OpenCL LIBSMM environment variables". 6 | 7 | There are two categories for the two domains in LIBSMM, i.e., matrix transpose (`OPENCL_LIBSMM_TRANS_*`) and matrix multiplication (`OPENCL_LIBSMM_SMM_*`). For transposing matrices, the settings are: 8 | 9 | * `OPENCL_LIBSMM_TRANS_BUILDOPTS`: character string with build options (compile and link) supplied to the OpenCL runtime compiler. 10 | * `OPENCL_LIBSMM_TRANS_INPLACE`: Boolean value (zero or non-zero integer) for in-place matrix transpose (no local memory needed). 11 | * `OPENCL_LIBSMM_TRANS_BM`: non-negative integer number (less/equal than the M-extent) denoting the blocksize in M-direction. 12 | 13 | The most common settings for multiplying matrices are: 14 | 15 | * `OPENCL_LIBSMM_SMM_BUILDOPTS`: character string with build options (compile and link) supplied to the OpenCL runtime compiler. 16 | * `OPENCL_LIBSMM_SMM_PARAMS`: Disable embedded/auto-tuned parameters (`0`), or load CSV-file (e.g., `path/to/tune_multiply.csv`). 17 | * `OPENCL_LIBSMM_SMM_BS`: non-negative integer number denoting the intra-kernel (mini-)batchsize mainly used to amortize atomic updates of data in global/main memory. The remainder with respect to the "stacksize" is handled by the kernel. 18 | * `OPENCL_LIBSMM_SMM_BM`: non-negative integer number (less/equal than the M-extent) denoting the blocksize in M-direction. 19 | * `OPENCL_LIBSMM_SMM_BN`: non-negative integer number (less/equal than the N-extent) denoting the blocksize in N-direction. 20 | * `OPENCL_LIBSMM_SMM_AP`: specifies access to array of parameters (batch or "stack"). 21 | * `OPENCL_LIBSMM_SMM_AA`: specifies access to array of A-matrices. 22 | * `OPENCL_LIBSMM_SMM_AB`: specifies access to array of B-matrices. 23 | * `OPENCL_LIBSMM_SMM_AC`: specifies access to array of C-matrices. 
24 | 25 | The full list of tunable parameters and some explanation can be received with `smm/tune_multiply.py --help`, i.e., short description, default settings, and accepted values. 26 | 27 | **NOTE**: LIBSMM's tunable runtime settings can be non-smooth like producing distinct code-paths, e.g., `OPENCL_LIBSMM_SMM_BS=1` vs. `OPENCL_LIBSMM_SMM_BS=2`. 28 | 29 | # Auto Tuning 30 | 31 | To tune and optimize a kernel and generating kernel parameters, please refer to the [Auto Tuning](https://cp2k.github.io/dbcsr/develop/page/3-developer-guide/3-programming/2-accelerator-backend/3-libsmm_ocl/1-autotune.html) guide. To update or retune an entire set of kernels (optimized parameters), please refer to the [Bulk Tuning](https://cp2k.github.io/dbcsr/develop/page/3-developer-guide/3-programming/2-accelerator-backend/3-libsmm_ocl/2-bulktune.html) guide. 32 | -------------------------------------------------------------------------------- /samples/multi-dgemm/multi-dgemm-type.hpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Copyright (c) 2014-2016, Intel Corporation ** 3 | ** All rights reserved. ** 4 | ** ** 5 | ** Redistribution and use in source and binary forms, with or without ** 6 | ** modification, are permitted provided that the following conditions ** 7 | ** are met: ** 8 | ** 1. Redistributions of source code must retain the above copyright ** 9 | ** notice, this list of conditions and the following disclaimer. ** 10 | ** 2. Redistributions in binary form must reproduce the above copyright ** 11 | ** notice, this list of conditions and the following disclaimer in the ** 12 | ** documentation and/or other materials provided with the distribution. ** 13 | ** 3. 
Neither the name of the copyright holder nor the names of its ** 14 | ** contributors may be used to endorse or promote products derived ** 15 | ** from this software without specific prior written permission. ** 16 | ** ** 17 | ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ** 18 | ** "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ** 19 | ** LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ** 20 | ** A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ** 21 | ** HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ** 22 | ** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ** 23 | ** TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ** 24 | ** PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ** 25 | ** LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ** 26 | ** NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ** 27 | ** SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ** 28 | ******************************************************************************/ 29 | /* Hans Pabst (Intel Corp.) 
// Per-stream state of the multi-dgemm sample: refers to a shared host-side data
// set (host_data_type) and keeps a stream, an event, and its own set of raw
// buffer pointers (presumably the device-side counterparts; confirm in
// multi-dgemm-type.cpp).
class multi_dgemm_type {
public:
  // Host-side data set: three arrays of doubles (a, b, c), an array of indexes,
  // and the function that is used to process the data.
  class host_data_type {
  public:
    // process: function stored and handed out by process().
    // size, split: problem description; how split[] is interpreted is defined
    // by the out-of-line constructor (not visible in this header).
    host_data_type(libxstream_function process, size_t size, const size_t split[]);
    ~host_data_type();
  public:
    // Trivial accessors; only cdata() grants mutable access to the data.
    libxstream_function process() { return m_process; }
    const double* adata() const { return m_adata; }
    const double* bdata() const { return m_bdata; }
    double* cdata() { return m_cdata; }
    const size_t* idata() const { return m_idata; }
    size_t size() const { return m_size; }
    size_t flops() const { return m_flops; }
    // Implemented out-of-line.
    size_t max_matrix_size() const;
    size_t bytes() const;
    bool ready() const;
  private:
    libxstream_function m_process;
    double *m_adata, *m_bdata, *m_cdata;
    size_t *m_idata, m_size, m_flops;
  };

public:
  multi_dgemm_type();
  ~multi_dgemm_type();

private:
  // Counterpart of init(); private, hence only callable from within the class.
  int deinit();

public:
  // Binds this object to host_data and a device; max_batch is presumably the
  // upper bound of the size passed to operator() - TODO confirm.
  int init(const char* name, host_data_type& host_data, int device, size_t max_batch);
  // Processes size items beginning at index; returns an int status code.
  int operator()(size_t index, size_t size);

  libxstream_stream* stream() { return m_stream; }
  libxstream_event* event();
  size_t bytes() const;
  bool ready() const;

private:
  host_data_type* m_host_data; // stored as pointer, init() receives a reference
  libxstream_stream* m_stream;
  libxstream_event* m_event;

  double *m_adata, *m_bdata, *m_cdata;
  size_t *m_idata, m_max_batch;
};
Introducing effective parameters is a prerequisite, and exploring the (potentially) high-dimensional parameter space in an efficient way is an art. It is desirable to have reasonable defaults even without auto-tuning the parameters. It would be even better to avoid auto-tuning if best performance was possible right away. 4 | 5 | For the OpenCL based LIBSMM, a variety of parameters are explored using [OpenTuner](http://opentuner.org/). The script [tune_multiply.py](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/tune_multiply.py) (or tune_multiply.sh) leverages the `acc_bench` by parsing console output (timing, data type, etc.). This way, the tuning is implemented without being intermingled with the subject being tuned. The "communication" between the tuner and the executable is solely based on environment variables. 6 | 7 | **NOTE**: If `tune_multiply.py` (or `tune_multiply.sh`) is called with an environment variable already set, the respective parameter (e.g., `OPENCL_LIBSMM_SMM_BM` or `OPENCL_LIBSMM_SMM_BN`) is considered fixed (and not tuned automatically). This way, the parameter space is reduced in size and effort can be directed more intensely towards the remaining parameters. 8 | 9 | To toggle the benchmarks between tuning single precision (SP) and double precision (DP), `make ELEM_TYPE=float` can be used when building the benchmark driver (`ELEM_TYPE` can be also directly edited in [acc_bench.c](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc_bench.c)). Auto-tuned parameters for SP and DP can be embedded into the same final application and are considered correctly at runtime. 
10 | 11 | To build the benchmarks in double precision (`ELEM_TYPE=double` is default): 12 | 13 | ```bash 14 | cd src/acc/opencl 15 | make 16 | ``` 17 | 18 | To build the benchmarks in single precision (SP): 19 | 20 | ```bash 21 | cd src/acc/opencl 22 | make ELEM_TYPE=float 23 | ``` 24 | 25 | To auto-tune, please install the Python `wheel` and `opentuner` packages: 26 | 27 | ```bash 28 | cd src/acc/opencl/smm 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | The OpenTuner script supports several command line arguments (`tune_multiply.py --help`). For example, `--stop-after=300` can be of interest to finish in five minutes (without limit, OpenTuner decides when the auto-tuning process is finished). A single kernel can be selected by M, N, and K parameters (GEMM), e.g., `M=13`, `N=5`, and `K=7`: 33 | 34 | ```bash 35 | ./tune_multiply.py 13x5x7 36 | ``` 37 | 38 | **NOTE**: If multiple different kernels are tuned using `tune_multiply.py`, it is advisable to delete the `opentuner.db` directory prior to tuning a different kernel since otherwise auto-tuning is potentially (mis-)guided by information which was collected for a different kernel (`tune_multiply.sh` does this automatically). 39 | 40 | The OpenTuner script implements multiple objectives ("cost"), primarily "accuracy" (maximized) and a secondary objective "size" (minimized). The former represents the achieved performance (GFLOPS/s) while the latter represents an artificial kernel requirement (just to prefer one parameter set over another in case of similar performance). 
The console output looks like ("accuracy" denotes performance in GFLOPS/s): 41 | 42 | ```text 43 | [ 15s] INFO opentuner.search.plugin.DisplayPlugin: tests=8, best {'BS': 32, 'BM': 6, 'BN': 1}, cost accuracy=28.80000000, size=1.0, found by UniformGreedyMutation 44 | [ 27s] INFO opentuner.search.plugin.DisplayPlugin: tests=19, best {'BS': 48, 'BM': 8, 'BN': 1}, cost accuracy=32.20000000, size=1.0, found by UniformGreedyMutation 45 | [ 40s] INFO opentuner.search.plugin.DisplayPlugin: tests=31, best {'BS': 48, 'BM': 8, 'BN': 1}, cost accuracy=32.20000000, size=1.0, found by UniformGreedyMutation 46 | [ 54s] INFO opentuner.search.plugin.DisplayPlugin: tests=43, best {'BS': 48, 'BM': 8, 'BN': 1}, cost accuracy=32.20000000, size=1.0, found by UniformGreedyMutation 47 | [ 67s] INFO opentuner.search.plugin.DisplayPlugin: tests=53, best {'BS': 48, 'BM': 8, 'BN': 1}, cost accuracy=32.20000000, size=1.0, found by UniformGreedyMutation 48 | ``` 49 | 50 | The script finally writes a JSON-file with a filename like `tune_multiply-float-12x12x12-s15-60gflops.json` which is encoding the benchmark ("multiply"), the precision ("float"), the kernel ("12x12x12"), the number of bits necessary to represent the size of the problem, i.e., log2 of the problem-size ("s15"), and the achieved performance ("60gflops"). The script handles SIGINT (like Ctrl-C), and output is still written despite of abnormally terminating (can be abused to tune interactively). Tuning starts from an internal default that is supposed to match LIBSMM's internal default parameters. However, tuning can be (re-)started with specific parameters (e.g., `-bs 64`, `-bm 13`, `-bn 1` for `OPENCL_LIBSMM_SMM_BS`, `OPENCL_LIBSMM_SMM_BM`, and `OPENCL_LIBSMM_SMM_BN` respectively), or partially fixed for a subset of parameters. 
51 | 52 | **NOTE**: The `acc_bench` executable is potentially started many times when auto-tuning parameters, therefore it is advisable to keep the state of the GPU driver stack persistent (if the setup would otherwise unload the driver configuration), e.g., `nvidia-smi -pm ENABLED`. This can happen in cases where the GPU is only for compute and not used for graphics (no X-Window system, e.g., in case of a "headless" system). Time needed for tuning parameters is not only impacted by accessing and readying the device, but also by the time needed to compile a kernel at runtime aka Just-In-Time (JIT). 53 | -------------------------------------------------------------------------------- /samples/test/Makefile: -------------------------------------------------------------------------------- 1 | # Export all variables to sub-make processes. 2 | #.EXPORT_ALL_VARIABLES: #export 3 | 4 | # Automatically disable parallel builds 5 | # depending on the version of GNU Make. 6 | # MAKE_PARALLEL=0: disable explicitly 7 | # MAKE_PARALLEL=1: enable explicitly 8 | ifeq (0,$(MAKE_PARALLEL)) 9 | .NOTPARALLEL: 10 | else ifeq (,$(MAKE_PARALLEL)) 11 | ifneq (3.82,$(firstword $(sort $(MAKE_VERSION) 3.82))) 12 | .NOTPARALLEL: 13 | endif 14 | endif 15 | 16 | # Linux cut has features we use that do not work elsewhere 17 | # Mac, etc. users should install GNU coreutils and use cut from there. 18 | # 19 | # For example, if you use Homebrew, run "brew install coreutils" once 20 | # and then invoke the LIBXSMM make command with 21 | # CUT=/usr/local/Cellar/coreutils/8.24/libexec/gnubin/cut 22 | CUT ?= cut 23 | 24 | ROOTDIR = $(abspath $(dir $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))) 25 | SRCDIR = $(ROOTDIR) 26 | INCDIR = $(ROOTDIR) 27 | BLDDIR = build 28 | DEPDIR = ../.. 29 | OUTDIR = . 
30 | 31 | CXXFLAGS = $(NULL) 32 | CFLAGS = $(NULL) 33 | DFLAGS = -D__extern_always_inline=inline 34 | IFLAGS = -I$(INCDIR) -I$(BLDDIR) -I$(DEPDIR)/include 35 | 36 | STATIC ?= 0 37 | BLAS ?= 0 38 | OMP ?= 1 39 | SYM ?= 0 40 | DBG ?= 0 41 | IPO ?= 0 42 | SSE ?= 0 43 | AVX ?= 0 44 | 45 | OFFLOAD ?= 1 46 | LIBNAME = $(DEPDIR)/lib/intel64/libxstream 47 | 48 | ifneq (,$(wildcard $(LIBNAME).a)) 49 | STATIC ?= 1 50 | LIBEXT = a 51 | else 52 | STATIC ?= 0 53 | LIBEXT = so 54 | endif 55 | 56 | OUTNAME = $(shell basename $(ROOTDIR)) 57 | HEADERS = $(shell ls -1 $(INCDIR)/*.h 2> /dev/null | tr "\n" " ") \ 58 | $(shell ls -1 $(SRCDIR)/*.hpp 2> /dev/null | tr "\n" " ") \ 59 | $(shell ls -1 $(SRCDIR)/*.hxx 2> /dev/null | tr "\n" " ") \ 60 | $(shell ls -1 $(SRCDIR)/*.hh 2> /dev/null | tr "\n" " ") 61 | CPPSRCS = $(shell ls -1 $(SRCDIR)/*.cpp 2> /dev/null | tr "\n" " ") 62 | CXXSRCS = $(shell ls -1 $(SRCDIR)/*.cxx 2> /dev/null | tr "\n" " ") 63 | CCXSRCS = $(shell ls -1 $(SRCDIR)/*.cc 2> /dev/null | tr "\n" " ") 64 | CSOURCS = $(shell ls -1 $(SRCDIR)/*.c 2> /dev/null | tr "\n" " ") 65 | FTNSRCS = $(shell ls -1 $(SRCDIR)/*.f 2> /dev/null | tr "\n" " ") 66 | F77SRCS = $(shell ls -1 $(SRCDIR)/*.F 2> /dev/null | tr "\n" " ") 67 | F90SRCS = $(shell ls -1 $(SRCDIR)/*.f90 2> /dev/null | tr "\n" " ") 68 | FTNINCS = $(shell ls -1 $(DEPDIR)/include/*.f 2> /dev/null | tr "\n" " ") 69 | F77INCS = $(shell ls -1 $(DEPDIR)/include/*.F 2> /dev/null | tr "\n" " ") 70 | F90INCS = $(shell ls -1 $(DEPDIR)/include/*.f90 2> /dev/null | tr "\n" " ") 71 | FTNMODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(FTNINCS:.f=-mod.o))) 72 | F77MODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F77INCS:.F=-mod77.o))) 73 | F90MODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F90INCS:.f90=-mod90.o))) 74 | MODULES = $(FTNMODS) $(F77MODS) $(F90MODS) 75 | SOURCES = $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) $(FTNSRCS) $(F77SRCS) $(F90SRCS) 76 | CPPOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) 77 | CXXOBJS = $(patsubst 
%,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) 78 | CCXOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) 79 | COBJCTS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) 80 | FTNOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(FTNSRCS:.f=-f.o))) 81 | F77OBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) 82 | F90OBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) 83 | OBJECTS = $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) $(FTNOBJS) $(F77OBJS) $(F90OBJS) 84 | 85 | # include common Makefile artifacts 86 | include $(ROOTDIR)/Makefile.inc 87 | 88 | .PHONY: all 89 | all: $(OUTDIR)/$(OUTNAME) 90 | 91 | $(OUTDIR)/$(OUTNAME): $(OBJECTS) $(LIBNAME).$(LIBEXT) 92 | @mkdir -p $(dir $@) 93 | $(LD) -o $@ $^ $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) 94 | 95 | $(BLDDIR)/%-mod.o: $(DEPDIR)/include/%.f $(ROOTDIR)/Makefile 96 | @mkdir -p $(dir $@) 97 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 98 | 99 | $(BLDDIR)/%-mod90.o: $(DEPDIR)/include/%.f90 $(ROOTDIR)/Makefile 100 | @mkdir -p $(dir $@) 101 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 102 | 103 | $(BLDDIR)/%-mod77.o: $(DEPDIR)/include/%.F $(ROOTDIR)/Makefile 104 | @mkdir -p $(dir $@) 105 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 106 | 107 | $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp $(HEADERS) $(ROOTDIR)/Makefile 108 | @mkdir -p $(dir $@) 109 | $(CXX) $(CXXFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 110 | 111 | $(BLDDIR)/%-c.o: $(SRCDIR)/%.c $(HEADERS) $(ROOTDIR)/Makefile 112 | @mkdir -p $(dir $@) 113 | $(CC) $(CFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 114 | 115 | $(BLDDIR)/%-f.o: $(SRCDIR)/%.f $(MODULES) $(ROOTDIR)/Makefile 116 | @mkdir -p $(dir $@) 117 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 118 | 119 | $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 $(MODULES) $(ROOTDIR)/Makefile 120 | @mkdir -p $(dir 
$@) 121 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 122 | 123 | $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F $(MODULES) $(ROOTDIR)/Makefile 124 | @mkdir -p $(dir $@) 125 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 126 | 127 | .PHONY: clean 128 | clean: 129 | ifneq ($(abspath $(BLDDIR)),$(ROOTDIR)) 130 | ifneq ($(abspath $(BLDDIR)),$(abspath .)) 131 | @rm -rf $(BLDDIR) *.mod 132 | else 133 | @rm -f $(OBJECTS) $(BLDDIR)/*.mod 134 | endif 135 | else 136 | @rm -f $(OBJECTS) $(BLDDIR)/*.mod 137 | endif 138 | @rm -f fit.log *.dat 139 | 140 | .PHONY: realclean 141 | realclean: clean 142 | ifneq ($(abspath $(OUTDIR)),$(ROOTDIR)) 143 | ifneq ($(abspath $(OUTDIR)),$(abspath .)) 144 | @rm -rf $(OUTDIR) 145 | else 146 | @rm -f $(OUTDIR)/$(OUTNAME) 147 | endif 148 | else 149 | @rm -f $(OUTDIR)/$(OUTNAME) 150 | endif 151 | @rm -f $(OUTDIR)/libxstream.so 152 | 153 | install: all clean 154 | @cp $(DEPDIR)/lib/intel64/libxstream.so $(OUTDIR) 2> /dev/null || true 155 | 156 | -------------------------------------------------------------------------------- /samples/smm/opencl_libsmm.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------------------------*/ 2 | /* Copyright (C) by the DBCSR developers group - All rights reserved */ 3 | /* This file is part of the DBCSR library. */ 4 | /* */ 5 | /* For information on the license, see the LICENSE file. 
*/ 6 | /* For further information please visit https://dbcsr.cp2k.org */ 7 | /* SPDX-License-Identifier: BSD-3-Clause */ 8 | /*------------------------------------------------------------------------------------------------*/ 9 | #ifndef OPENCL_LIBSMM_H 10 | #define OPENCL_LIBSMM_H 11 | 12 | #include "../../acc_libsmm.h" 13 | #include "../acc_opencl.h" 14 | 15 | /* Inplace-transpose by default (corresponding environment variable exists also) */ 16 | #if !defined(OPENCL_LIBSMM_TRANS_INPLACE) && 0 17 | # define OPENCL_LIBSMM_TRANS_INPLACE 18 | #endif 19 | #if !defined(OPENCL_LIBSMM_F32_OFF) && defined(__DBCSR_ACC) && 0 20 | # define OPENCL_LIBSMM_F32_OFF 21 | #endif 22 | #if !defined(OPENCL_LIBSMM_F32) && !defined(OPENCL_LIBSMM_F32_OFF) 23 | # define OPENCL_LIBSMM_F32 24 | #endif 25 | #if !defined(OPENCL_LIBSMM_F64) && !defined(OPENCL_LIBSMM_F64_OFF) 26 | # define OPENCL_LIBSMM_F64 27 | #endif 28 | #if !defined(OPENCL_LIBSMM_PFORMAT) && 1 29 | # define OPENCL_LIBSMM_PFORMAT 8 30 | #endif 31 | 32 | 33 | #if defined(__cplusplus) 34 | extern "C" { 35 | #endif 36 | 37 | /** Type for querying transpose kernel configuration. */ 38 | typedef struct opencl_libsmm_transkey_t { 39 | libsmm_acc_data_t type; /* must be the 1st data member */ 40 | int m, n; 41 | } opencl_libsmm_transkey_t; 42 | 43 | /** Type for transpose kernel configuration. */ 44 | typedef struct opencl_libsmm_trans_t { 45 | cl_kernel kernel; /* must be the 1st data member */ 46 | size_t wgsize; 47 | } opencl_libsmm_trans_t; 48 | 49 | /** Type for querying SMM-kernel configuration. */ 50 | typedef struct opencl_libsmm_smmkey_t { 51 | libsmm_acc_data_t type; /* must be the 1st data member */ 52 | int m, n, k; 53 | /* device matching configuration (parameters) */ 54 | unsigned int devuid; 55 | } opencl_libsmm_smmkey_t; 56 | 57 | /** Type for SMM-kernel configuration. 
*/ 58 | typedef struct opencl_libsmm_smm_t { 59 | cl_kernel kernel[2]; /* must be the 1st data member */ 60 | size_t wgsize[2]; 61 | double gflops; 62 | /* (pseudo-)parameters (either pretuned or determined) */ 63 | int s, bs, bm, bn, bk, ws, wg, lu, nz, al, tb, tc, ap, aa, ab, ac, flags; 64 | } opencl_libsmm_smm_t; 65 | 66 | /** Type to collect statistics about tuned SMM-kernels */ 67 | typedef struct opencl_libsmm_perfest_t { 68 | double gf_ai_sratio_max, gf_ai_sratio_sumlog, gf_ai_sratio_kahan; 69 | double gf_ai_dratio_max, gf_ai_dratio_sumlog, gf_ai_dratio_kahan; 70 | size_t scount, dcount; 71 | } opencl_libsmm_perfest_t; 72 | 73 | 74 | /** Returns environment variable's value for given domain and key. */ 75 | const char* opencl_libsmm_getenv(const char domain[], const char key[]); 76 | 77 | /** 78 | * TRANS-kernel: write key and tunables into a (file-)stream. 79 | * If config=NULL, key/parameter names are written. The arguments 80 | * delim, begin, and close are optional as well (can be NULL). 81 | * With only the key being written the config still controls 82 | * if values or names are written. 83 | * Returns the number of characters written (negative if error). 84 | */ 85 | int opencl_libsmm_write_trans_params(FILE* stream, int only_key, const opencl_libsmm_transkey_t* key, 86 | const opencl_libsmm_trans_t* config, const char* delim, const char* begin, const char* close); 87 | 88 | /** 89 | * SMM-kernel: write key and tunables into a (file-)stream. 90 | * The environment variable OPENCL_LIBSMM_SMM_PARAMS="" 91 | * reproduces a configuration. If config=NULL, key/parameter 92 | * names are written. The arguments delim, begin, and close 93 | * are optional as well (can be NULL). 94 | * With only the key being written the config still controls 95 | * if values or names are written. 96 | * Returns the number of characters written (negative if error). 
97 | */ 98 | int opencl_libsmm_write_smm_params(FILE* stream, int only_key, const opencl_libsmm_smmkey_t* key, const opencl_libsmm_smm_t* config, 99 | const char* delim, const char* begin, const char* close); 100 | 101 | /** Tokenize parambuf and initialize key/value pair. */ 102 | int opencl_libsmm_read_smm_params(char* parambuf, opencl_libsmm_smmkey_t* key, opencl_libsmm_smm_t* value, 103 | opencl_libsmm_perfest_t* perfest, char* device, int* key_ok); 104 | 105 | c_dbcsr_acc_bool_t libsmm_acc_process_suitable( 106 | c_dbcsr_acc_bool_t def_mnk, libsmm_acc_data_t datatype, int stack_size, int m_max, int n_max, int k_max, int max_kernel_dim); 107 | 108 | #if defined(OPENCL_LIBSMM_PFORMAT) && (0 < OPENCL_LIBSMM_PFORMAT) 109 | typedef int (*opencl_libsmm_acc_dbm_launch_fn_t)(void* stream, double alpha, int ntasks, int param_format, const int* params_host, 110 | const int* params, const double* pack_a_data, const double* pack_b_data, double* shard_c_data); 111 | /** Enables DBM-kernel for LIBSMM (reverse reuse). */ 112 | void opencl_libsmm_acc_set_dbm_launch_fn(opencl_libsmm_acc_dbm_launch_fn_t launch_fn); 113 | 114 | /** Backend-specific variant of libsmm_acc_process, which makes it easier to reuse LIBSMM kernels. */ 115 | int opencl_libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, int stack_size, libsmm_acc_data_t datatype, 116 | const void* dev_a_data, const void* dev_b_data, void* dev_c_data, int m_max, int n_max, int k_max, int max_kernel_dim, 117 | c_dbcsr_acc_bool_t def_mnk, void* stream, void* c_stream, int param_format, cl_event* event); 118 | #endif 119 | 120 | #if defined(__cplusplus) 121 | } 122 | #endif 123 | 124 | #endif /*OPENCL_LIBSMM_H*/ 125 | -------------------------------------------------------------------------------- /samples/entropy/Makefile: -------------------------------------------------------------------------------- 1 | # Export all variables to sub-make processes.
2 | #.EXPORT_ALL_VARIABLES: #export 3 | 4 | # Automatically disable parallel builds 5 | # depending on the version of GNU Make. 6 | # MAKE_PARALLEL=0: disable explicitly 7 | # MAKE_PARALLEL=1: enable explicitly 8 | ifeq (0,$(MAKE_PARALLEL)) 9 | .NOTPARALLEL: 10 | else ifeq (,$(MAKE_PARALLEL)) 11 | ifneq (3.82,$(firstword $(sort $(MAKE_VERSION) 3.82))) 12 | .NOTPARALLEL: 13 | endif 14 | endif 15 | 16 | # Linux cut has features we use that do not work elsewhere 17 | # Mac, etc. users should install GNU coreutils and use cut from there. 18 | # 19 | # For example, if you use Homebrew, run "brew install coreutils" once 20 | # and then invoke the LIBXSMM make command with 21 | # CUT=/usr/local/Cellar/coreutils/8.24/libexec/gnubin/cut 22 | CUT ?= cut 23 | 24 | ROOTDIR = $(abspath $(dir $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))) 25 | SRCDIR = $(ROOTDIR) 26 | INCDIR = $(ROOTDIR) 27 | BLDDIR = build 28 | DEPDIR = ../.. 29 | OUTDIR = . 30 | 31 | CXXFLAGS = $(NULL) 32 | CFLAGS = $(NULL) 33 | DFLAGS = -D__extern_always_inline=inline 34 | IFLAGS = -I$(INCDIR) -I$(BLDDIR) -I$(DEPDIR)/include 35 | 36 | STATIC ?= 0 37 | BLAS ?= 0 38 | OMP ?= 1 39 | SYM ?= 0 40 | DBG ?= 0 41 | IPO ?= 0 42 | SSE ?= 0 43 | AVX ?= 0 44 | 45 | OFFLOAD ?= 1 46 | LIBNAME = $(DEPDIR)/lib/intel64/libxstream 47 | 48 | ifneq (,$(wildcard $(LIBNAME).a)) 49 | STATIC ?= 1 50 | LIBEXT = a 51 | else 52 | STATIC ?= 0 53 | LIBEXT = so 54 | endif 55 | 56 | OUTNAME = $(shell basename $(ROOTDIR)) 57 | HEADERS = $(shell ls -1 $(INCDIR)/*.h 2> /dev/null | tr "\n" " ") \ 58 | $(shell ls -1 $(SRCDIR)/*.hpp 2> /dev/null | tr "\n" " ") \ 59 | $(shell ls -1 $(SRCDIR)/*.hxx 2> /dev/null | tr "\n" " ") \ 60 | $(shell ls -1 $(SRCDIR)/*.hh 2> /dev/null | tr "\n" " ") 61 | CPPSRCS = $(shell ls -1 $(SRCDIR)/*.cpp 2> /dev/null | tr "\n" " ") 62 | CXXSRCS = $(shell ls -1 $(SRCDIR)/*.cxx 2> /dev/null | tr "\n" " ") 63 | CCXSRCS = $(shell ls -1 $(SRCDIR)/*.cc 2> /dev/null | tr "\n" " ") 64 | CSOURCS = $(shell ls -1
$(SRCDIR)/*.c 2> /dev/null | tr "\n" " ") 65 | FTNSRCS = $(shell ls -1 $(SRCDIR)/*.f 2> /dev/null | tr "\n" " ") 66 | F77SRCS = $(shell ls -1 $(SRCDIR)/*.F 2> /dev/null | tr "\n" " ") 67 | F90SRCS = $(shell ls -1 $(SRCDIR)/*.f90 2> /dev/null | tr "\n" " ") 68 | FTNINCS = $(shell ls -1 $(DEPDIR)/include/*.f 2> /dev/null | tr "\n" " ") 69 | F77INCS = $(shell ls -1 $(DEPDIR)/include/*.F 2> /dev/null | tr "\n" " ") 70 | F90INCS = $(shell ls -1 $(DEPDIR)/include/*.f90 2> /dev/null | tr "\n" " ") 71 | FTNMODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(FTNINCS:.f=-mod.o))) 72 | F77MODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F77INCS:.F=-mod77.o))) 73 | F90MODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F90INCS:.f90=-mod90.o))) 74 | MODULES = $(FTNMODS) $(F77MODS) $(F90MODS) 75 | SOURCES = $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) $(FTNSRCS) $(F77SRCS) $(F90SRCS) 76 | CPPOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) 77 | CXXOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) 78 | CCXOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) 79 | COBJCTS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) 80 | FTNOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(FTNSRCS:.f=-f.o))) 81 | F77OBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) 82 | F90OBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) 83 | OBJECTS = $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) $(FTNOBJS) $(F77OBJS) $(F90OBJS) 84 | 85 | # include common Makefile artifacts 86 | include $(ROOTDIR)/Makefile.inc 87 | 88 | .PHONY: all 89 | all: $(OUTDIR)/$(OUTNAME) 90 | 91 | $(OUTDIR)/$(OUTNAME): $(OBJECTS) $(LIBNAME).$(LIBEXT) 92 | @mkdir -p $(dir $@) 93 | $(LD) -o $@ $^ $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) 94 | 95 | $(BLDDIR)/%-mod.o: $(DEPDIR)/include/%.f $(ROOTDIR)/Makefile 96 | @mkdir -p $(dir $@) 97 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 98 | 99 | $(BLDDIR)/%-mod90.o: $(DEPDIR)/include/%.f90 
$(ROOTDIR)/Makefile 100 | @mkdir -p $(dir $@) 101 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 102 | 103 | $(BLDDIR)/%-mod77.o: $(DEPDIR)/include/%.F $(ROOTDIR)/Makefile 104 | @mkdir -p $(dir $@) 105 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 106 | 107 | $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp $(HEADERS) $(ROOTDIR)/Makefile 108 | @mkdir -p $(dir $@) 109 | $(CXX) $(CXXFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 110 | 111 | $(BLDDIR)/%-c.o: $(SRCDIR)/%.c $(HEADERS) $(ROOTDIR)/Makefile 112 | @mkdir -p $(dir $@) 113 | $(CC) $(CFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 114 | 115 | $(BLDDIR)/%-f.o: $(SRCDIR)/%.f $(MODULES) $(ROOTDIR)/Makefile 116 | @mkdir -p $(dir $@) 117 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 118 | 119 | $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 $(MODULES) $(ROOTDIR)/Makefile 120 | @mkdir -p $(dir $@) 121 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 122 | 123 | $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F $(MODULES) $(ROOTDIR)/Makefile 124 | @mkdir -p $(dir $@) 125 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 126 | 127 | .PHONY: clean 128 | clean: 129 | ifneq ($(abspath $(BLDDIR)),$(ROOTDIR)) 130 | ifneq ($(abspath $(BLDDIR)),$(abspath .)) 131 | @rm -rf $(BLDDIR) *.mod 132 | else 133 | @rm -f $(OBJECTS) $(BLDDIR)/*.mod 134 | endif 135 | else 136 | @rm -f $(OBJECTS) $(BLDDIR)/*.mod 137 | endif 138 | @rm -f fit.log *.dat 139 | 140 | .PHONY: realclean 141 | realclean: clean 142 | ifneq ($(abspath $(OUTDIR)),$(ROOTDIR)) 143 | ifneq ($(abspath $(OUTDIR)),$(abspath .)) 144 | @rm -rf $(OUTDIR) 145 | else 146 | @rm -f $(OUTDIR)/$(OUTNAME) 147 | endif 148 | else 149 | @rm -f $(OUTDIR)/$(OUTNAME) 150 | endif 151 | @rm -f $(OUTDIR)/libxstream.so 152 | 153 | install: all clean 154 | @cp $(DEPDIR)/lib/intel64/libxstream.so $(OUTDIR) 2> /dev/null || true 155 | @cp 
$(ROOTDIR)/$(OUTNAME).sh $(OUTDIR) 2> /dev/null || true 156 | 157 | -------------------------------------------------------------------------------- /samples/smm/README-bulktune.md: -------------------------------------------------------------------------------- 1 | # Optimized Kernels 2 | 3 | Optimized kernel parameters are stored in JSON-files and are automatically summarized into a CSV-file. Further and beyond auto-tuning kernels, [tune_multiply.py](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/tune_multiply.py) can be used to perform basic operations on collected data: explicitly merging all JSON-files into a CSV-file (`tune_multiply.py -m`), and updating the device name in all JSON-files according to current driver version (`tune_multiply.py -u`). 4 | 5 | Collected or auto-tuned parameters achieved with single precision (SP), double precision (DP), or from different devices can be safely combined. Practically, `acc_opencl.sh` transforms the CSV-file into source code compiled into the final binary, which is independent of `OPENCL_LIBSMM_SMM_PARAMS` accepting a CSV-file (path/filename). However, `acc_opencl.sh` currently limits the origin of parameters to a single device. Care must still be taken to not summarize unrelated results, e.g., after (major) source code changes. The CSV-file is automatically incorporated into LIBSMM by the next clean (re-)build. The format of the CSV-file is assumed to contain column names in the first row (header). 6 | 7 | Different problem sizes (like "s15"; see above) are not represented individually, but are instead collected into a maximum value. In turn, this means tuning for a non-default problem-size must be manually kept pure since the result achieved with a larger problem may dominate (maximum value). 
8 | 9 | ```bash 10 | cd src/acc/opencl 11 | make realclean 12 | make 13 | ``` 14 | 15 | This way auto-tuned kernels just work and can of course be exercised using the aforementioned benchmark: 16 | 17 | ```bash 18 | cd src/acc 19 | ./acc_bench 5 30000 13 5 7 20 | ``` 21 | 22 | Tuned parameters can also be disabled at runtime like: 23 | 24 | ```bash 25 | cd src/acc 26 | OPENCL_LIBSMM_SMM_PARAMS=0 ./acc_bench 5 30000 13 5 7 27 | ``` 28 | 29 | By supplying a CSV-file at runtime, embedded parameters and defaults are overridden, and given parameters are applied even if the current device is different from what would match the given parameters: 30 | 31 | ```bash 32 | cd src/acc 33 | OPENCL_LIBSMM_SMM_PARAMS=opencl/smm/tune_multiply.csv ./acc_bench 5 30000 13 5 7 34 | ``` 35 | 36 | To tune multiple kernels in a convenient fashion, a triplet specification can be supplied to the [tune_multiply.sh](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/tune_multiply.sh) wrapper script. This script estimates the total runtime for auto-tuning kernels, cleans up intermediate results (`opentuner.db`), allows specifying triplets, and splits work to auto-tune in parallel. 37 | 38 | Triplets are used to conveniently describe multiple kernels. A triplet specification consists of comma-separated groups of (M,N,K)-extents, i.e., matrix shapes according to GEMM. For example: 39 | 40 | ```text 41 | 4 10 15, 6 7 8, 23 42 | ``` 43 | 44 | This triplet specification expands to 55 kernels using the Cartesian product within each group and concatenating the result of such expanded groups followed by removing duplicate triplets. Further, the wrapper script allows limiting the time spent for tuning a single kernel and partitioning the number of kernels to be tuned, e.g., among a cluster of eight systems (below the first partition out of eight would be processed with five minutes per kernel and about 35 minutes in total per partition).
45 | 46 | ```bash 47 | cd src/acc/opencl/smm 48 | ./tune_multiply.sh -t 300 -j 8 -i 1 4 10 15, 6 7 8, 23 49 | ``` 50 | 51 | The script `tune_multiply.sh` is tuning 1266 kernels by default (`./tune_multiply.sh -t 300 -j 8 -i 1` takes approximately 13 hours per part). If the process is interrupted earlier (per SIGINT or Ctrl-C), the execution terminates for all requested kernels (triplet specification) unless `--continue` is given (or `-c`, or an environment variable `CONTINUE=1`). 52 | 53 | For convenience, it is possible to "update" an existing set of JSON-files (path can be given with `-p`), i.e., to parse the (M,N,K)-triplet denoted by the JSON filename and to re-tune with an almost unconstrained tuning-level (`-a 1` by default) as well as a limited duration (160 seconds per kernel by default). 54 | 55 | ```bash 56 | cd src/acc/opencl 57 | make realclean 58 | echo "Rebuild and embed smm/params/tune_multiply_P100.csv" 59 | make WITH_GPU=P100 60 | 61 | echo "Retune original parameters" 62 | smm/tune_multiply.sh -p smm/params/p100 -u 63 | 64 | echo "Override original parameters" 65 | cp tune_multiply.csv smm/params/tune_multiply_P100.csv 66 | ``` 67 | 68 | Tuning kernels further is only sensible if the previously tuned parameters are embedded into the binary (such that the process does not start from scratch). Retuned parameters are captured with JSON-files as usual. 69 | 70 | # Advanced Tuning 71 | 72 | To utilize multiple devices per system and to accelerate tuning kernels, `tune_multiply.py` comes with built-in support for running under MPI (SPMD execution model). The basic assumption is to spawn one process per device usually with different kernels tuned per device (SPMD). Of course, tuning the same kernels in parallel on multiple devices is possible but it is a waste of resources. 
Tuning on multiple devices per system can also be more realistic given the common power budget of all devices and less room for an increased operating frequency per device (Turbo clock speed). 73 | 74 | For example, a single dual-socket system with two PVC cards (modules) per socket exposes eight GPU devices (two GPU stacks or tiles per card). Then 350 kernels can be tuned in less than 2 1/2 hours with a duration of 200 seconds for tuning each kernel. 75 | 76 | ```bash 77 | MAXTIME=200 NPARTS=8 UPDATE=1 JSONDIR=params/pvc mpirun \ 78 | ./tune_multiply.sh -i 1 : \ 79 | ./tune_multiply.sh -i 2 : \ 80 | ./tune_multiply.sh -i 3 : \ 81 | ./tune_multiply.sh -i 4 : \ 82 | ./tune_multiply.sh -i 5 : \ 83 | ./tune_multiply.sh -i 6 : \ 84 | ./tune_multiply.sh -i 7 : \ 85 | ./tune_multiply.sh -i 8 \ 86 | >out.log 2>&1 87 | ``` 88 | 89 | **NOTE**: The example shown above prefers environment variables over command-line options that would be common to the eight launches of `tune_multiply.sh`. 90 | -------------------------------------------------------------------------------- /samples/copy/Makefile: -------------------------------------------------------------------------------- 1 | # Export all variables to sub-make processes. 2 | #.EXPORT_ALL_VARIABLES: #export 3 | 4 | # Automatically disable parallel builds 5 | # depending on the version of GNU Make. 6 | # MAKE_PARALLEL=0: disable explicitly 7 | # MAKE_PARALLEL=1: enable explicitly 8 | ifeq (0,$(MAKE_PARALLEL)) 9 | .NOTPARALLEL: 10 | else ifeq (,$(MAKE_PARALLEL)) 11 | ifneq (3.82,$(firstword $(sort $(MAKE_VERSION) 3.82))) 12 | .NOTPARALLEL: 13 | endif 14 | endif 15 | 16 | # Linux cut has features we use that do not work elsewhere 17 | # Mac, etc. users should install GNU coreutils and use cut from there.
18 | # 19 | # For example, if you use Homebrew, run "brew install coreutils" once 20 | # and then invoke the LIBXSMM make command with 21 | # CUT=/usr/local/Cellar/coreutils/8.24/libexec/gnubin/cut 22 | CUT ?= cut 23 | 24 | ROOTDIR = $(abspath $(dir $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))) 25 | SRCDIR = $(ROOTDIR) 26 | INCDIR = $(ROOTDIR) 27 | BLDDIR = build 28 | DEPDIR = ../.. 29 | OUTDIR = . 30 | 31 | CXXFLAGS = $(NULL) 32 | CFLAGS = $(NULL) 33 | DFLAGS = -D__extern_always_inline=inline 34 | IFLAGS = -I$(INCDIR) -I$(BLDDIR) -I$(DEPDIR)/include 35 | 36 | STATIC ?= 0 37 | BLAS ?= 0 38 | OMP ?= 1 39 | SYM ?= 0 40 | DBG ?= 0 41 | IPO ?= 0 42 | SSE ?= 0 43 | AVX ?= 0 44 | 45 | OFFLOAD ?= 1 46 | LIBNAME = $(DEPDIR)/lib/intel64/libxstream 47 | 48 | ifneq (,$(wildcard $(LIBNAME).a)) 49 | STATIC ?= 1 50 | LIBEXT = a 51 | else 52 | STATIC ?= 0 53 | LIBEXT = so 54 | endif 55 | 56 | OUTNAME = $(shell basename $(ROOTDIR)) 57 | HEADERS = $(shell ls -1 $(INCDIR)/*.h 2> /dev/null | tr "\n" " ") \ 58 | $(shell ls -1 $(SRCDIR)/*.hpp 2> /dev/null | tr "\n" " ") \ 59 | $(shell ls -1 $(SRCDIR)/*.hxx 2> /dev/null | tr "\n" " ") \ 60 | $(shell ls -1 $(SRCDIR)/*.hh 2> /dev/null | tr "\n" " ") 61 | CPPSRCS = $(shell ls -1 $(SRCDIR)/*.cpp 2> /dev/null | tr "\n" " ") 62 | CXXSRCS = $(shell ls -1 $(SRCDIR)/*.cxx 2> /dev/null | tr "\n" " ") 63 | CCXSRCS = $(shell ls -1 $(SRCDIR)/*.cc 2> /dev/null | tr "\n" " ") 64 | CSOURCS = $(shell ls -1 $(SRCDIR)/*.c 2> /dev/null | tr "\n" " ") 65 | FTNSRCS = $(shell ls -1 $(SRCDIR)/*.f 2> /dev/null | tr "\n" " ") 66 | F77SRCS = $(shell ls -1 $(SRCDIR)/*.F 2> /dev/null | tr "\n" " ") 67 | F90SRCS = $(shell ls -1 $(SRCDIR)/*.f90 2> /dev/null | tr "\n" " ") 68 | FTNINCS = $(shell ls -1 $(DEPDIR)/include/*.f 2> /dev/null | tr "\n" " ") 69 | F77INCS = $(shell ls -1 $(DEPDIR)/include/*.F 2> /dev/null | tr "\n" " ") 70 | F90INCS = $(shell ls -1 $(DEPDIR)/include/*.f90 2> /dev/null | tr "\n" " ") 71 | FTNMODS = $(patsubst %,$(BLDDIR)/%,$(notdir 
$(FTNINCS:.f=-mod.o))) 72 | F77MODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F77INCS:.F=-mod77.o))) 73 | F90MODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F90INCS:.f90=-mod90.o))) 74 | MODULES = $(FTNMODS) $(F77MODS) $(F90MODS) 75 | SOURCES = $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) $(FTNSRCS) $(F77SRCS) $(F90SRCS) 76 | CPPOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) 77 | CXXOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) 78 | CCXOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) 79 | COBJCTS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) 80 | FTNOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(FTNSRCS:.f=-f.o))) 81 | F77OBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) 82 | F90OBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) 83 | OBJECTS = $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) $(FTNOBJS) $(F77OBJS) $(F90OBJS) 84 | 85 | # include common Makefile artifacts 86 | include $(ROOTDIR)/Makefile.inc 87 | 88 | .PHONY: all 89 | all: $(OUTDIR)/$(OUTNAME) 90 | 91 | $(OUTDIR)/$(OUTNAME): $(OBJECTS) $(LIBNAME).$(LIBEXT) 92 | @mkdir -p $(dir $@) 93 | $(LD) -o $@ $^ $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) 94 | 95 | $(BLDDIR)/%-mod.o: $(DEPDIR)/include/%.f $(ROOTDIR)/Makefile 96 | @mkdir -p $(dir $@) 97 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 98 | 99 | $(BLDDIR)/%-mod90.o: $(DEPDIR)/include/%.f90 $(ROOTDIR)/Makefile 100 | @mkdir -p $(dir $@) 101 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 102 | 103 | $(BLDDIR)/%-mod77.o: $(DEPDIR)/include/%.F $(ROOTDIR)/Makefile 104 | @mkdir -p $(dir $@) 105 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 106 | 107 | $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp $(HEADERS) $(ROOTDIR)/Makefile 108 | @mkdir -p $(dir $@) 109 | $(CXX) $(CXXFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 110 | 111 | $(BLDDIR)/%-c.o: 
$(SRCDIR)/%.c $(HEADERS) $(ROOTDIR)/Makefile 112 | @mkdir -p $(dir $@) 113 | $(CC) $(CFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 114 | 115 | $(BLDDIR)/%-f.o: $(SRCDIR)/%.f $(MODULES) $(ROOTDIR)/Makefile 116 | @mkdir -p $(dir $@) 117 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 118 | 119 | $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 $(MODULES) $(ROOTDIR)/Makefile 120 | @mkdir -p $(dir $@) 121 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 122 | 123 | $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F $(MODULES) $(ROOTDIR)/Makefile 124 | @mkdir -p $(dir $@) 125 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 126 | 127 | .PHONY: clean 128 | clean: 129 | ifneq ($(abspath $(BLDDIR)),$(ROOTDIR)) 130 | ifneq ($(abspath $(BLDDIR)),$(abspath .)) 131 | @rm -rf $(BLDDIR) *.mod 132 | else 133 | @rm -f $(OBJECTS) $(BLDDIR)/*.mod 134 | endif 135 | else 136 | @rm -f $(OBJECTS) $(BLDDIR)/*.mod 137 | endif 138 | @rm -f fit.log *.dat 139 | 140 | .PHONY: realclean 141 | realclean: clean 142 | ifneq ($(abspath $(OUTDIR)),$(ROOTDIR)) 143 | ifneq ($(abspath $(OUTDIR)),$(abspath .)) 144 | @rm -rf $(OUTDIR) 145 | else 146 | @rm -f $(OUTDIR)/$(OUTNAME) 147 | endif 148 | else 149 | @rm -f $(OUTDIR)/$(OUTNAME) 150 | endif 151 | @rm -f $(OUTDIR)/libxstream.so 152 | 153 | install: all clean 154 | @cp $(DEPDIR)/lib/intel64/libxstream.so $(OUTDIR) 2> /dev/null || true 155 | @cp $(ROOTDIR)/$(OUTNAME).sh $(OUTDIR) 2> /dev/null || true 156 | @cp $(ROOTDIR)/$(OUTNAME).plt $(OUTDIR) 2> /dev/null || true 157 | @cp $(ROOTDIR)/plot.sh $(OUTDIR) 2> /dev/null || true 158 | 159 | -------------------------------------------------------------------------------- /samples/multi-dgemm/Makefile: -------------------------------------------------------------------------------- 1 | # Export all variables to sub-make processes. 2 | #.EXPORT_ALL_VARIABLES: #export 3 | 4 | # Automatically disable parallel builds 5 | # depending on the version of GNU Make. 
6 | # MAKE_PARALLEL=0: disable explicitly 7 | # MAKE_PARALLEL=1: enable explicitly 8 | ifeq (0,$(MAKE_PARALLEL)) 9 | .NOTPARALLEL: 10 | else ifeq (,$(MAKE_PARALLEL)) 11 | ifneq (3.82,$(firstword $(sort $(MAKE_VERSION) 3.82))) 12 | .NOTPARALLEL: 13 | endif 14 | endif 15 | 16 | # Linux cut has features we use that do not work elsewhere 17 | # Mac, etc. users should install GNU coreutils and use cut from there. 18 | # 19 | # For example, if you use Homebrew, run "brew install coreutils" once 20 | # and then invoke the LIBXSMM make command with 21 | # CUT=/usr/local/Cellar/coreutils/8.24/libexec/gnubin/cut 22 | CUT ?= cut 23 | 24 | ROOTDIR = $(abspath $(dir $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))) 25 | SRCDIR = $(ROOTDIR) 26 | INCDIR = $(ROOTDIR) 27 | BLDDIR = build 28 | DEPDIR = ../.. 29 | OUTDIR = . 30 | 31 | CXXFLAGS = $(NULL) 32 | CFLAGS = $(NULL) 33 | DFLAGS = -D__extern_always_inline=inline 34 | IFLAGS = -I$(INCDIR) -I$(BLDDIR) -I$(DEPDIR)/include 35 | 36 | STATIC ?= 0 37 | BLAS ?= 2 38 | OMP ?= 1 39 | SYM ?= 0 40 | DBG ?= 0 41 | IPO ?= 0 42 | SSE ?= 0 43 | AVX ?= 0 44 | 45 | OFFLOAD ?= 1 46 | LIBNAME = $(DEPDIR)/lib/intel64/libxstream 47 | 48 | ifneq (,$(wildcard $(LIBNAME).a)) 49 | STATIC ?= 1 50 | LIBEXT = a 51 | else 52 | STATIC ?= 0 53 | LIBEXT = so 54 | endif 55 | 56 | OUTNAME = $(shell basename $(ROOTDIR)) 57 | HEADERS = $(shell ls -1 $(INCDIR)/*.h 2> /dev/null | tr "\n" " ") \ 58 | $(shell ls -1 $(SRCDIR)/*.hpp 2> /dev/null | tr "\n" " ") \ 59 | $(shell ls -1 $(SRCDIR)/*.hxx 2> /dev/null | tr "\n" " ") \ 60 | $(shell ls -1 $(SRCDIR)/*.hh 2> /dev/null | tr "\n" " ") 61 | CPPSRCS = $(shell ls -1 $(SRCDIR)/*.cpp 2> /dev/null | tr "\n" " ") 62 | CXXSRCS = $(shell ls -1 $(SRCDIR)/*.cxx 2> /dev/null | tr "\n" " ") 63 | CCXSRCS = $(shell ls -1 $(SRCDIR)/*.cc 2> /dev/null | tr "\n" " ") 64 | CSOURCS = $(shell ls -1 $(SRCDIR)/*.c 2> /dev/null | tr "\n" " ") 65 | FTNSRCS = $(shell ls -1 $(SRCDIR)/*.f 2> /dev/null | tr "\n" " ") 66 | F77SRCS = $(shell ls
-1 $(SRCDIR)/*.F 2> /dev/null | tr "\n" " ") 67 | F90SRCS = $(shell ls -1 $(SRCDIR)/*.f90 2> /dev/null | tr "\n" " ") 68 | FTNINCS = $(shell ls -1 $(DEPDIR)/include/*.f 2> /dev/null | tr "\n" " ") 69 | F77INCS = $(shell ls -1 $(DEPDIR)/include/*.F 2> /dev/null | tr "\n" " ") 70 | F90INCS = $(shell ls -1 $(DEPDIR)/include/*.f90 2> /dev/null | tr "\n" " ") 71 | FTNMODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(FTNINCS:.f=-mod.o))) 72 | F77MODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F77INCS:.F=-mod77.o))) 73 | F90MODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F90INCS:.f90=-mod90.o))) 74 | MODULES = $(FTNMODS) $(F77MODS) $(F90MODS) 75 | SOURCES = $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) $(FTNSRCS) $(F77SRCS) $(F90SRCS) 76 | CPPOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) 77 | CXXOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) 78 | CCXOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) 79 | COBJCTS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) 80 | FTNOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(FTNSRCS:.f=-f.o))) 81 | F77OBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) 82 | F90OBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) 83 | OBJECTS = $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) $(FTNOBJS) $(F77OBJS) $(F90OBJS) 84 | 85 | # include common Makefile artifacts 86 | include $(ROOTDIR)/Makefile.inc 87 | 88 | .PHONY: all 89 | all: $(OUTDIR)/$(OUTNAME) 90 | 91 | $(OUTDIR)/$(OUTNAME): $(OBJECTS) $(LIBNAME).$(LIBEXT) 92 | @mkdir -p $(dir $@) 93 | $(LD) -o $@ $^ $(SLDFLAGS) $(LDFLAGS) $(CLDFLAGS) 94 | 95 | $(BLDDIR)/%-mod.o: $(DEPDIR)/include/%.f $(ROOTDIR)/Makefile 96 | @mkdir -p $(dir $@) 97 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 98 | 99 | $(BLDDIR)/%-mod90.o: $(DEPDIR)/include/%.f90 $(ROOTDIR)/Makefile 100 | @mkdir -p $(dir $@) 101 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir 
$@) 102 | 103 | $(BLDDIR)/%-mod77.o: $(DEPDIR)/include/%.F $(ROOTDIR)/Makefile 104 | @mkdir -p $(dir $@) 105 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 106 | 107 | $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp $(HEADERS) $(ROOTDIR)/Makefile 108 | @mkdir -p $(dir $@) 109 | $(CXX) $(CXXFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 110 | 111 | $(BLDDIR)/%-c.o: $(SRCDIR)/%.c $(HEADERS) $(ROOTDIR)/Makefile 112 | @mkdir -p $(dir $@) 113 | $(CC) $(CFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 114 | 115 | $(BLDDIR)/%-f.o: $(SRCDIR)/%.f $(MODULES) $(ROOTDIR)/Makefile 116 | @mkdir -p $(dir $@) 117 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 118 | 119 | $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 $(MODULES) $(ROOTDIR)/Makefile 120 | @mkdir -p $(dir $@) 121 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 122 | 123 | $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F $(MODULES) $(ROOTDIR)/Makefile 124 | @mkdir -p $(dir $@) 125 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 126 | 127 | .PHONY: clean 128 | clean: 129 | ifneq ($(abspath $(BLDDIR)),$(ROOTDIR)) 130 | ifneq ($(abspath $(BLDDIR)),$(abspath .)) 131 | @rm -rf $(BLDDIR) *.mod 132 | else 133 | @rm -f $(OBJECTS) $(BLDDIR)/*.mod 134 | endif 135 | else 136 | @rm -f $(OBJECTS) $(BLDDIR)/*.mod 137 | endif 138 | @rm -f fit.log *.dat 139 | 140 | .PHONY: realclean 141 | realclean: clean 142 | ifneq ($(abspath $(OUTDIR)),$(ROOTDIR)) 143 | ifneq ($(abspath $(OUTDIR)),$(abspath .)) 144 | @rm -rf $(OUTDIR) 145 | else 146 | @rm -f $(OUTDIR)/$(OUTNAME) 147 | endif 148 | else 149 | @rm -f $(OUTDIR)/$(OUTNAME) 150 | endif 151 | @rm -f $(OUTDIR)/libxstream.so 152 | 153 | install: all clean 154 | @cp $(DEPDIR)/lib/intel64/libxstream.so $(OUTDIR) 2> /dev/null || true 155 | @cp $(ROOTDIR)/$(OUTNAME).sh $(OUTDIR) 2> /dev/null || true 156 | @cp $(ROOTDIR)/$(OUTNAME).plt $(OUTDIR) 2> /dev/null || true 157 | @cp $(ROOTDIR)/plot.sh 
$(OUTDIR) 2> /dev/null || true 158 | @cp $(ROOTDIR)/benchmark.sh $(OUTDIR) 2> /dev/null || true 159 | 160 | -------------------------------------------------------------------------------- /include/common/opencl_atomics.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------------------------*/ 2 | /* Copyright (C) by the DBCSR developers group - All rights reserved */ 3 | /* This file is part of the DBCSR library. */ 4 | /* */ 5 | /* For information on the license, see the LICENSE file. */ 6 | /* For further information please visit https://dbcsr.cp2k.org */ 7 | /* SPDX-License-Identifier: BSD-3-Clause */ 8 | /*------------------------------------------------------------------------------------------------*/ 9 | #ifndef OPENCL_ATOMICS_H 10 | #define OPENCL_ATOMICS_H 11 | 12 | #include "opencl_common.h" 13 | 14 | #if (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) 15 | # if !defined(T) 16 | # define T double 17 | # endif 18 | # define ZERO 0.0 19 | #elif (1 == TAN /*c_dbcsr_acc_opencl_atomic_fp_32*/) 20 | # if !defined(T) 21 | # define T float 22 | # endif 23 | # define ZERO 0.f 24 | #elif defined(T) /*c_dbcsr_acc_opencl_atomic_fp_no*/ 25 | # define ZERO 0 26 | #endif 27 | 28 | #define GLOBAL_VOLATILE(A) global volatile A 29 | 30 | #if defined(ATOMIC_PROTOTYPES) || defined(__opencl_c_ext_fp64_global_atomic_add) 31 | # if defined(__opencl_c_ext_fp64_global_atomic_add) 32 | # undef ATOMIC_ADD_GLOBAL 33 | # if defined(TF) 34 | # define ATOMIC_ADD_GLOBAL(A, B) \ 35 | atomic_fetch_add_explicit((GLOBAL_VOLATILE(TF)*)A, B, memory_order_relaxed, memory_scope_work_group) 36 | # else 37 | # define ATOMIC_ADD_GLOBAL(A, B) atomic_add(A, B) 38 | # endif 39 | # elif (2 < ATOMIC_PROTOTYPES) && defined(TF) 40 | # undef ATOMIC_ADD_GLOBAL 41 | # define ATOMIC_ADD_GLOBAL(A, B) \ 42 | __opencl_atomic_fetch_add((GLOBAL_VOLATILE(TF)*)A, B, memory_order_relaxed, 
memory_scope_work_group) 43 | # else 44 | # if defined(TF) && (!defined(ATOMIC_PROTOTYPES) || 1 < ATOMIC_PROTOTYPES) 45 | __attribute__((overloadable)) T atomic_fetch_add_explicit(GLOBAL_VOLATILE(TF) *, T, memory_order, memory_scope); 46 | # else 47 | __attribute__((overloadable)) T atomic_add(GLOBAL_VOLATILE(T) *, T); 48 | # endif 49 | # endif 50 | #endif 51 | 52 | #define ACCUMULATE(A, B) ATOMIC_ADD_GLOBAL(A, B) 53 | 54 | 55 | #if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) 56 | # if defined(ATOMIC32_ADD64) 57 | __attribute__((always_inline)) inline void atomic32_add64_global(GLOBAL_VOLATILE(double) * dst, double inc) { 58 | *dst += inc; /* TODO */ 59 | } 60 | # endif 61 | #endif 62 | 63 | 64 | #if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) 65 | # if defined(CMPXCHG) 66 | __attribute__((always_inline)) inline void atomic_add_global_cmpxchg(GLOBAL_VOLATILE(T) * dst, T inc) { 67 | # if !defined(ATOMIC32_ADD64) 68 | union { 69 | T f; 70 | TA a; 71 | } exp_val, try_val, cur_val = {.f = *dst}; 72 | do { 73 | exp_val.a = cur_val.a; 74 | try_val.f = exp_val.f + inc; 75 | # if defined(TA2) 76 | if (0 == atomic_compare_exchange_strong_explicit((GLOBAL_VOLATILE(TA2)*)dst, &cur_val.a, try_val.a, memory_order_relaxed, 77 | memory_order_relaxed, memory_scope_work_group)) 78 | continue; /* strong CAS cannot fail spuriously, hence cur_val == exp_val below implies the update was stored */ 79 | # else 80 | cur_val.a = CMPXCHG((GLOBAL_VOLATILE(TA)*)dst, exp_val.a, try_val.a); 81 | # endif 82 | } while (cur_val.a != exp_val.a); 83 | # else 84 | atomic32_add64_global(dst, inc); 85 | # endif 86 | } 87 | # endif 88 | #endif 89 | 90 | 91 | #if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) 92 | # if defined(ATOMIC_ADD2_GLOBAL) && (1 == TAN /*c_dbcsr_acc_opencl_atomic_fp_32*/) 93 | __attribute__((always_inline)) inline void atomic_add_global_cmpxchg2(GLOBAL_VOLATILE(float) * dst, float2 inc) { 94 | union { 95 | float2 f; 96 | long a; 97 | } exp_val,
try_val, cur_val = {.f = (float2)(dst[0], dst[1])}; 98 | do { 99 | exp_val.a = cur_val.a; 100 | try_val.f = exp_val.f + inc; 101 | # if defined(TA2) 102 | if (0 == atomic_compare_exchange_strong_explicit((GLOBAL_VOLATILE(atomic_long)*)dst, &cur_val.a, try_val.a, memory_order_relaxed, 103 | memory_order_relaxed, memory_scope_work_group)) 104 | continue; /* strong CAS cannot fail spuriously, hence cur_val == exp_val below implies the update was stored */ 105 | # else 106 | cur_val.a = atom_cmpxchg((GLOBAL_VOLATILE(long)*)dst, exp_val.a, try_val.a); 107 | # endif 108 | } while (cur_val.a != exp_val.a); 109 | } 110 | # endif 111 | #endif 112 | 113 | 114 | #if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) 115 | # if defined(XCHG) || (defined(__NV_CL_C_VERSION) && !defined(CMPXCHG) && !defined(ATOMIC_PROTOTYPES)) 116 | __attribute__((always_inline)) inline void atomic_add_global_xchg(GLOBAL_VOLATILE(T) * dst, T inc) { 117 | # if !defined(ATOMIC32_ADD64) 118 | # if (defined(__NV_CL_C_VERSION) && !defined(XCHG)) && (1 == TAN /*c_dbcsr_acc_opencl_atomic_fp_32*/) 119 | asm("{ .reg .f32 t; atom.global.add.f32 t, [%0], %1; }" ::"l"(dst), "f"(inc)); 120 | # elif (defined(__NV_CL_C_VERSION) && !defined(XCHG)) && (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) 121 | asm("{ .reg .f64 t; atom.global.add.f64 t, [%0], %1; }" ::"l"(dst), "d"(inc)); 122 | # else 123 | union { 124 | T f; 125 | TA a; 126 | } exp_val = {.f = inc}, try_val, cur_val = {/*.f = ZERO*/ .a = 0}; 127 | do { 128 | # if defined(TA2) 129 | try_val.a = atomic_exchange_explicit((GLOBAL_VOLATILE(TA2)*)dst, cur_val.a, memory_order_relaxed, memory_scope_work_group); 130 | # else 131 | try_val.a = XCHG((GLOBAL_VOLATILE(TA)*)dst, cur_val.a); 132 | # endif 133 | try_val.f += exp_val.f; 134 | # if defined(TA2) 135 | exp_val.a = atomic_exchange_explicit((GLOBAL_VOLATILE(TA2)*)dst, try_val.a, memory_order_relaxed, memory_scope_work_group); 136 | # else 137 | exp_val.a = XCHG((GLOBAL_VOLATILE(TA)*)dst, try_val.a); 138 | # endif 139 | } while (cur_val.a != exp_val.a); 140 | # endif 141 |
# else 142 | atomic32_add64_global(dst, inc); 143 | # endif 144 | } 145 | # endif 146 | #endif 147 | 148 | #endif /*OPENCL_ATOMICS_H*/ 149 | -------------------------------------------------------------------------------- /scripts/acc_triplets.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #################################################################################################### 3 | # Copyright (C) by the DBCSR developers group - All rights reserved # 4 | # This file is part of the DBCSR library. # 5 | # # 6 | # For information on the license, see the LICENSE file. # 7 | # For further information please visit https://dbcsr.cp2k.org # 8 | # SPDX-License-Identifier: BSD-3-Clause # 9 | #################################################################################################### 10 | XARGS=$(command -v xargs) 11 | SORT=$(command -v sort) 12 | HEAD=$(command -v head) 13 | SED=$(command -v gsed) 14 | CUT=$(command -v cut) 15 | 16 | # GNU sed is desired (macOS) 17 | if [ ! 
"${SED}" ]; then 18 | SED=$(command -v sed) 19 | fi 20 | 21 | if [ "${XARGS}" ] && [ "${SORT}" ] && [ "${HEAD}" ] && [ "${SED}" ] && [ "${CUT}" ]; then 22 | LINES=0 23 | while test $# -gt 0; do 24 | case "$1" in 25 | -h|--help) 26 | HELP=1 27 | shift $#;; 28 | -l|--lines) 29 | LINES=1 30 | shift;; 31 | -r|--bound) 32 | BOUNDL=$2 33 | BOUNDU=$3 34 | shift 3;; 35 | -m|--limit) 36 | MAXEXT=$2 37 | shift 2;; 38 | -n|--triplets) 39 | MAXNUM=$2 40 | shift 2;; 41 | -a|--amat) 42 | CUTSEL=-f1,3 43 | shift;; 44 | -b|--bmat) 45 | CUTSEL=-f3,2 46 | shift;; 47 | -c|--cmat) 48 | CUTSEL=-f1,2 49 | shift;; 50 | -k|--specid) 51 | case "$2" in 52 | 0) TRIPLETS="23, 6, 14 16 29, 5 16 13 24 26, 9 16 22, 32, 64, 78, 16 29 55";; 53 | 1) TRIPLETS="23, 6, 14 16 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55";; 54 | 2) TRIPLETS="23, 6, 14 16 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 12";; 55 | 3) TRIPLETS="23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 32 29 55, 12";; 56 | 4) TRIPLETS="23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 32 29 55, 12, 13 26 28 32 45";; 57 | 5) TRIPLETS="23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 32, 64, 78, 16 29 55, 32 29 55, 12, 13 26 28 32 45, 7 13 25 32";; 58 | 6) TRIPLETS="23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45";; 59 | 7) TRIPLETS="23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45, 4 10";; 60 | 8) TRIPLETS="23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45, 4 10 15";; 61 | 9) TRIPLETS="23, 6, 14 16 29, 14 32 29, 5 32 13 24 26, 9 32 22, 64, 78, 16 29 55, 32 29 55, 12, 4 5 7 9 13 25 26 28 32 45, 4 10 15, 6 7 8";; 62 | *) TRIPLETS=" \ 63 | 4 5 7 9 13 25 26 28 32 45, \ 64 | 13 14 25 26 32, \ 65 | 5 32 13 24 26, \ 66 | 14 16 29, \ 67 | 14 32 29, \ 68 | 16 29 55, \ 69 | 32 29 55, \ 70 | 9 32 22, \ 71 | 4 10 
15, \ 72 | 6 7 8, \ 73 | 23, \ 74 | 64, \ 75 | 78, \ 76 | 12, \ 77 | 6";; 78 | esac 79 | shift 2;; 80 | *) 81 | if [ "" = "$(echo "$*" | ${SED} -n "s/[0-9]*[[:space:]]*,*//gp")" ]; then 82 | TRIPLETS="$*" 83 | else 84 | >&2 echo "ERROR: invalid triplet specification!" 85 | fi 86 | break;; 87 | esac 88 | done 89 | if [[ "${TRIPLETS}" && (! "${HELP}" || "0" = "${HELP}") ]]; then 90 | for SPECS in $(echo "${TRIPLETS}" | ${SED} -e "s/[[:space:]][[:space:]]*/x/g" -e "s/,/ /g"); do 91 | SPEC=$(echo "${SPECS}" | ${SED} -e "s/^x//g" -e "s/x$//g" -e "s/x/,/g") 92 | if [ "${MAXEXT}" ] && [ "0" != "$((0]\"" 134 | eval "${ECHO} \" Options must precede triplet specification\"" 135 | eval "${ECHO} \" -l|--lines: lines instead of list of words\"" 136 | eval "${ECHO} \" -r|--bound L U: limit L**3 < MNK <= U**3\"" 137 | eval "${ECHO} \" -m|--limit N: limit any shape extent to N\"" 138 | eval "${ECHO} \" -n|--triplets N: limit number of triplet\"" 139 | eval "${ECHO} \" -a|--amat: select MxK instead of MxNxK\"" 140 | eval "${ECHO} \" -b|--bmat: select KxN instead of MxNxK\"" 141 | eval "${ECHO} \" -c|--cmat: select MxN instead of MxNxK\"" 142 | eval "${ECHO} \" -k|--specid N: predefined triplets\"" 143 | eval "${ECHO} \" 0-10: older to newer (larger), e.g.,\"" 144 | eval "${ECHO} \" -k 0: 201 kernels\"" 145 | eval "${ECHO} \" -k 10: 1266 kernels\"" 146 | eval "${ECHO} \" , e.g., 134 kernels\"" 147 | eval "${ECHO} \" 23, 5 32 13 24 26, 4 9\"" 148 | eval "${ECHO}" 149 | if [ "${HELP}" ] && [ "0" != "${HELP}" ]; then exit 0; fi # HELP=0 means help was not requested (same convention as the TRIPLETS check), so fall through to the error 150 | >&2 echo "ERROR: invalid or no given!" 151 | exit 1 152 | fi 153 | else 154 | >&2 echo "ERROR: missing prerequisites!" 155 | exit 1 156 | fi 157 | -------------------------------------------------------------------------------- /samples/multi-dgemm/results/Xeon_Phi-3120/benchmark.txt: -------------------------------------------------------------------------------- 1 | Initializing 1 device and host data... 4435.7 MB 2 | Initializing 2 streams per device...
64.0 MB 3 | Running 250 batches of 1 item... 4 | Performance: 51.5 GFLOPS/s 5 | Duration: 6.8 s 6 | Error: 9.9476e-14 7 | Finished 8 | Initializing 1 device and host data... 4435.7 MB 9 | Initializing 2 streams per device... 128.0 MB 10 | Running 125 batches of 2 items... 11 | Performance: 98.7 GFLOPS/s 12 | Duration: 3.5 s 13 | Error: 9.9476e-14 14 | Finished 15 | Initializing 1 device and host data... 4435.7 MB 16 | Initializing 2 streams per device... 192.0 MB 17 | Running 84 batches of 3 items... 18 | Performance: 111.7 GFLOPS/s 19 | Duration: 3.1 s 20 | Error: 9.9476e-14 21 | Finished 22 | Initializing 1 device and host data... 4435.7 MB 23 | Initializing 2 streams per device... 256.0 MB 24 | Running 63 batches of 4 items... 25 | Performance: 138.0 GFLOPS/s 26 | Duration: 2.5 s 27 | Error: 9.9476e-14 28 | Finished 29 | Initializing 1 device and host data... 4435.7 MB 30 | Initializing 2 streams per device... 320.0 MB 31 | Running 50 batches of 5 items... 32 | Performance: 130.4 GFLOPS/s 33 | Duration: 2.7 s 34 | Error: 9.9476e-14 35 | Finished 36 | Initializing 1 device and host data... 4435.7 MB 37 | Initializing 2 streams per device... 384.0 MB 38 | Running 42 batches of 6 items... 39 | Performance: 134.7 GFLOPS/s 40 | Duration: 2.6 s 41 | Error: 9.9476e-14 42 | Finished 43 | Initializing 1 device and host data... 4435.7 MB 44 | Initializing 2 streams per device... 448.0 MB 45 | Running 36 batches of 7 items... 46 | Performance: 144.6 GFLOPS/s 47 | Duration: 2.4 s 48 | Error: 9.9476e-14 49 | Finished 50 | Initializing 1 device and host data... 4435.7 MB 51 | Initializing 2 streams per device... 512.0 MB 52 | Running 32 batches of 8 items... 53 | Performance: 133.6 GFLOPS/s 54 | Duration: 2.6 s 55 | Error: 9.9476e-14 56 | Finished 57 | Initializing 1 device and host data... 4435.7 MB 58 | Initializing 2 streams per device... 576.0 MB 59 | Running 28 batches of 9 items... 
60 | Performance: 130.8 GFLOPS/s 61 | Duration: 2.7 s 62 | Error: 9.9476e-14 63 | Finished 64 | Initializing 1 device and host data... 4435.7 MB 65 | Initializing 2 streams per device... 640.0 MB 66 | Running 25 batches of 10 items... 67 | Performance: 136.8 GFLOPS/s 68 | Duration: 2.5 s 69 | Error: 9.9476e-14 70 | Finished 71 | Initializing 1 device and host data... 4435.7 MB 72 | Initializing 2 streams per device... 704.0 MB 73 | Running 23 batches of 11 items... 74 | Performance: 133.3 GFLOPS/s 75 | Duration: 2.6 s 76 | Error: 9.9476e-14 77 | Finished 78 | Initializing 1 device and host data... 4435.7 MB 79 | Initializing 2 streams per device... 768.0 MB 80 | Running 21 batches of 12 items... 81 | Performance: 133.5 GFLOPS/s 82 | Duration: 2.6 s 83 | Error: 9.9476e-14 84 | Finished 85 | Initializing 1 device and host data... 4435.7 MB 86 | Initializing 2 streams per device... 832.0 MB 87 | Running 20 batches of 13 items... 88 | Performance: 130.4 GFLOPS/s 89 | Duration: 2.7 s 90 | Error: 9.9476e-14 91 | Finished 92 | Initializing 1 device and host data... 4435.7 MB 93 | Initializing 2 streams per device... 896.0 MB 94 | Running 18 batches of 14 items... 95 | Performance: 133.0 GFLOPS/s 96 | Duration: 2.6 s 97 | Error: 9.9476e-14 98 | Finished 99 | Initializing 1 device and host data... 4435.7 MB 100 | Initializing 2 streams per device... 960.0 MB 101 | Running 17 batches of 15 items... 102 | Performance: 127.0 GFLOPS/s 103 | Duration: 2.7 s 104 | Error: 9.9476e-14 105 | Finished 106 | Initializing 1 device and host data... 4435.7 MB 107 | Initializing 2 streams per device... 1024.0 MB 108 | Running 16 batches of 16 items... 109 | Performance: 126.4 GFLOPS/s 110 | Duration: 2.8 s 111 | Error: 9.9476e-14 112 | Finished 113 | Initializing 1 device and host data... 4435.7 MB 114 | Initializing 2 streams per device... 1088.0 MB 115 | Running 15 batches of 17 items... 
116 | Performance: 119.6 GFLOPS/s 117 | Duration: 2.9 s 118 | Error: 9.9476e-14 119 | Finished 120 | Initializing 1 device and host data... 4435.7 MB 121 | Initializing 2 streams per device... 1152.0 MB 122 | Running 14 batches of 18 items... 123 | Performance: 124.6 GFLOPS/s 124 | Duration: 2.8 s 125 | Error: 9.9476e-14 126 | Finished 127 | Initializing 1 device and host data... 4435.7 MB 128 | Initializing 2 streams per device... 1216.0 MB 129 | Running 14 batches of 19 items... 130 | Performance: 120.6 GFLOPS/s 131 | Duration: 2.9 s 132 | Error: 9.9476e-14 133 | Finished 134 | Initializing 1 device and host data... 4435.7 MB 135 | Initializing 2 streams per device... 1280.0 MB 136 | Running 13 batches of 20 items... 137 | Performance: 125.1 GFLOPS/s 138 | Duration: 2.8 s 139 | Error: 9.9476e-14 140 | Finished 141 | Initializing 1 device and host data... 4435.7 MB 142 | Initializing 2 streams per device... 1344.0 MB 143 | Running 12 batches of 21 items... 144 | Performance: 120.6 GFLOPS/s 145 | Duration: 2.9 s 146 | Error: 9.9476e-14 147 | Finished 148 | Initializing 1 device and host data... 4435.7 MB 149 | Initializing 2 streams per device... 1408.0 MB 150 | Running 12 batches of 22 items... 151 | Performance: 117.8 GFLOPS/s 152 | Duration: 3.0 s 153 | Error: 9.9476e-14 154 | Finished 155 | Initializing 1 device and host data... 4435.7 MB 156 | Initializing 2 streams per device... 1472.0 MB 157 | Running 11 batches of 23 items... 158 | Performance: 100.9 GFLOPS/s 159 | Duration: 3.5 s 160 | Error: 9.9476e-14 161 | Finished 162 | Initializing 1 device and host data... 4435.7 MB 163 | Initializing 2 streams per device... 1536.0 MB 164 | Running 11 batches of 24 items... 165 | Performance: 111.6 GFLOPS/s 166 | Duration: 3.1 s 167 | Error: 9.9476e-14 168 | Finished 169 | Initializing 1 device and host data... 4435.7 MB 170 | Initializing 2 streams per device... 1600.0 MB 171 | Running 10 batches of 25 items... 
172 | Performance: 112.4 GFLOPS/s 173 | Duration: 3.1 s 174 | Error: 9.9476e-14 175 | Finished 176 | Initializing 1 device and host data... 4435.7 MB 177 | Initializing 2 streams per device... 1664.0 MB 178 | Running 10 batches of 26 items... 179 | Performance: 105.6 GFLOPS/s 180 | Duration: 3.3 s 181 | Error: 9.9476e-14 182 | Finished 183 | Initializing 1 device and host data... 4435.7 MB 184 | Initializing 2 streams per device... 1728.0 MB 185 | Running 10 batches of 27 items... 186 | Performance: 105.9 GFLOPS/s 187 | Duration: 3.3 s 188 | Error: 9.9476e-14 189 | Finished 190 | Initializing 1 device and host data... 4435.7 MB 191 | Initializing 2 streams per device... 1792.0 MB 192 | Running 9 batches of 28 items... 193 | Performance: 104.9 GFLOPS/s 194 | Duration: 3.3 s 195 | Error: 9.9476e-14 196 | Finished 197 | Initializing 1 device and host data... 4435.7 MB 198 | Initializing 2 streams per device... 1856.0 MB 199 | Running 9 batches of 29 items... 200 | Performance: 102.2 GFLOPS/s 201 | Duration: 3.4 s 202 | Error: 9.9476e-14 203 | Finished 204 | Initializing 1 device and host data... 4435.7 MB 205 | Initializing 2 streams per device... 1920.0 MB 206 | Running 9 batches of 30 items... 207 | Performance: 105.6 GFLOPS/s 208 | Duration: 3.3 s 209 | Error: 9.9476e-14 210 | Finished 211 | Initializing 1 device and host data... 4435.7 MB 212 | Initializing 2 streams per device... 1984.0 MB 213 | Running 9 batches of 31 items... 214 | Performance: 104.0 GFLOPS/s 215 | Duration: 3.3 s 216 | Error: 9.9476e-14 217 | Finished 218 | -------------------------------------------------------------------------------- /samples/multi-dgemm/results/Xeon_Phi-7120/benchmark.txt: -------------------------------------------------------------------------------- 1 | Initializing 1 device and host data... 4435.7 MB 2 | Initializing 2 streams per device... 64.0 MB 3 | Running 250 batches of 1 item... 
4 | Performance: 94.5 GFLOPS/s 5 | Duration: 3.7 s 6 | Error: 1.13687e-13 7 | Finished 8 | Initializing 1 device and host data... 4435.7 MB 9 | Initializing 2 streams per device... 128.0 MB 10 | Running 125 batches of 2 items... 11 | Performance: 147.0 GFLOPS/s 12 | Duration: 2.4 s 13 | Error: 1.13687e-13 14 | Finished 15 | Initializing 1 device and host data... 4435.7 MB 16 | Initializing 2 streams per device... 192.0 MB 17 | Running 84 batches of 3 items... 18 | Performance: 138.7 GFLOPS/s 19 | Duration: 2.5 s 20 | Error: 1.13687e-13 21 | Finished 22 | Initializing 1 device and host data... 4435.7 MB 23 | Initializing 2 streams per device... 256.0 MB 24 | Running 63 batches of 4 items... 25 | Performance: 151.9 GFLOPS/s 26 | Duration: 2.3 s 27 | Error: 1.13687e-13 28 | Finished 29 | Initializing 1 device and host data... 4435.7 MB 30 | Initializing 2 streams per device... 320.0 MB 31 | Running 50 batches of 5 items... 32 | Performance: 156.2 GFLOPS/s 33 | Duration: 2.2 s 34 | Error: 1.13687e-13 35 | Finished 36 | Initializing 1 device and host data... 4435.7 MB 37 | Initializing 2 streams per device... 384.0 MB 38 | Running 42 batches of 6 items... 39 | Performance: 157.4 GFLOPS/s 40 | Duration: 2.2 s 41 | Error: 1.13687e-13 42 | Finished 43 | Initializing 1 device and host data... 4435.7 MB 44 | Initializing 2 streams per device... 448.0 MB 45 | Running 36 batches of 7 items... 46 | Performance: 160.9 GFLOPS/s 47 | Duration: 2.2 s 48 | Error: 1.13687e-13 49 | Finished 50 | Initializing 1 device and host data... 4435.7 MB 51 | Initializing 2 streams per device... 512.0 MB 52 | Running 32 batches of 8 items... 53 | Performance: 151.6 GFLOPS/s 54 | Duration: 2.3 s 55 | Error: 1.13687e-13 56 | Finished 57 | Initializing 1 device and host data... 4435.7 MB 58 | Initializing 2 streams per device... 576.0 MB 59 | Running 28 batches of 9 items... 
60 | Performance: 152.5 GFLOPS/s 61 | Duration: 2.3 s 62 | Error: 1.13687e-13 63 | Finished 64 | Initializing 1 device and host data... 4435.7 MB 65 | Initializing 2 streams per device... 640.0 MB 66 | Running 25 batches of 10 items... 67 | Performance: 150.0 GFLOPS/s 68 | Duration: 2.3 s 69 | Error: 1.13687e-13 70 | Finished 71 | Initializing 1 device and host data... 4435.7 MB 72 | Initializing 2 streams per device... 704.0 MB 73 | Running 23 batches of 11 items... 74 | Performance: 146.9 GFLOPS/s 75 | Duration: 2.4 s 76 | Error: 1.13687e-13 77 | Finished 78 | Initializing 1 device and host data... 4435.7 MB 79 | Initializing 2 streams per device... 768.0 MB 80 | Running 21 batches of 12 items... 81 | Performance: 144.6 GFLOPS/s 82 | Duration: 2.4 s 83 | Error: 1.13687e-13 84 | Finished 85 | Initializing 1 device and host data... 4435.7 MB 86 | Initializing 2 streams per device... 832.0 MB 87 | Running 20 batches of 13 items... 88 | Performance: 138.9 GFLOPS/s 89 | Duration: 2.5 s 90 | Error: 1.13687e-13 91 | Finished 92 | Initializing 1 device and host data... 4435.7 MB 93 | Initializing 2 streams per device... 896.0 MB 94 | Running 18 batches of 14 items... 95 | Performance: 144.6 GFLOPS/s 96 | Duration: 2.4 s 97 | Error: 1.13687e-13 98 | Finished 99 | Initializing 1 device and host data... 4435.7 MB 100 | Initializing 2 streams per device... 960.0 MB 101 | Running 17 batches of 15 items... 102 | Performance: 138.3 GFLOPS/s 103 | Duration: 2.5 s 104 | Error: 1.13687e-13 105 | Finished 106 | Initializing 1 device and host data... 4435.7 MB 107 | Initializing 2 streams per device... 1024.0 MB 108 | Running 16 batches of 16 items... 109 | Performance: 144.6 GFLOPS/s 110 | Duration: 2.4 s 111 | Error: 1.13687e-13 112 | Finished 113 | Initializing 1 device and host data... 4435.7 MB 114 | Initializing 2 streams per device... 1088.0 MB 115 | Running 15 batches of 17 items... 
116 | Performance: 140.7 GFLOPS/s 117 | Duration: 2.5 s 118 | Error: 1.13687e-13 119 | Finished 120 | Initializing 1 device and host data... 4435.7 MB 121 | Initializing 2 streams per device... 1152.0 MB 122 | Running 14 batches of 18 items... 123 | Performance: 127.9 GFLOPS/s 124 | Duration: 2.7 s 125 | Error: 1.13687e-13 126 | Finished 127 | Initializing 1 device and host data... 4435.7 MB 128 | Initializing 2 streams per device... 1216.0 MB 129 | Running 14 batches of 19 items... 130 | Performance: 130.7 GFLOPS/s 131 | Duration: 2.7 s 132 | Error: 1.13687e-13 133 | Finished 134 | Initializing 1 device and host data... 4435.7 MB 135 | Initializing 2 streams per device... 1280.0 MB 136 | Running 13 batches of 20 items... 137 | Performance: 126.4 GFLOPS/s 138 | Duration: 2.8 s 139 | Error: 1.13687e-13 140 | Finished 141 | Initializing 1 device and host data... 4435.7 MB 142 | Initializing 2 streams per device... 1344.0 MB 143 | Running 12 batches of 21 items... 144 | Performance: 123.2 GFLOPS/s 145 | Duration: 2.8 s 146 | Error: 1.13687e-13 147 | Finished 148 | Initializing 1 device and host data... 4435.7 MB 149 | Initializing 2 streams per device... 1408.0 MB 150 | Running 12 batches of 22 items... 151 | Performance: 126.8 GFLOPS/s 152 | Duration: 2.7 s 153 | Error: 1.13687e-13 154 | Finished 155 | Initializing 1 device and host data... 4435.7 MB 156 | Initializing 2 streams per device... 1472.0 MB 157 | Running 11 batches of 23 items... 158 | Performance: 121.1 GFLOPS/s 159 | Duration: 2.9 s 160 | Error: 1.13687e-13 161 | Finished 162 | Initializing 1 device and host data... 4435.7 MB 163 | Initializing 2 streams per device... 1536.0 MB 164 | Running 11 batches of 24 items... 165 | Performance: 126.8 GFLOPS/s 166 | Duration: 2.7 s 167 | Error: 1.13687e-13 168 | Finished 169 | Initializing 1 device and host data... 4435.7 MB 170 | Initializing 2 streams per device... 1600.0 MB 171 | Running 10 batches of 25 items... 
172 | Performance: 117.6 GFLOPS/s 173 | Duration: 3.0 s 174 | Error: 1.13687e-13 175 | Finished 176 | Initializing 1 device and host data... 4435.7 MB 177 | Initializing 2 streams per device... 1664.0 MB 178 | Running 10 batches of 26 items... 179 | Performance: 119.3 GFLOPS/s 180 | Duration: 2.9 s 181 | Error: 1.13687e-13 182 | Finished 183 | Initializing 1 device and host data... 4435.7 MB 184 | Initializing 2 streams per device... 1728.0 MB 185 | Running 10 batches of 27 items... 186 | Performance: 113.5 GFLOPS/s 187 | Duration: 3.1 s 188 | Error: 1.13687e-13 189 | Finished 190 | Initializing 1 device and host data... 4435.7 MB 191 | Initializing 2 streams per device... 1792.0 MB 192 | Running 9 batches of 28 items... 193 | Performance: 113.0 GFLOPS/s 194 | Duration: 3.1 s 195 | Error: 1.13687e-13 196 | Finished 197 | Initializing 1 device and host data... 4435.7 MB 198 | Initializing 2 streams per device... 1856.0 MB 199 | Running 9 batches of 29 items... 200 | Performance: 115.5 GFLOPS/s 201 | Duration: 3.0 s 202 | Error: 1.13687e-13 203 | Finished 204 | Initializing 1 device and host data... 4435.7 MB 205 | Initializing 2 streams per device... 1920.0 MB 206 | Running 9 batches of 30 items... 207 | Performance: 111.5 GFLOPS/s 208 | Duration: 3.1 s 209 | Error: 1.13687e-13 210 | Finished 211 | Initializing 1 device and host data... 4435.7 MB 212 | Initializing 2 streams per device... 1984.0 MB 213 | Running 9 batches of 31 items... 214 | Performance: 114.5 GFLOPS/s 215 | Duration: 3.0 s 216 | Error: 1.13687e-13 217 | Finished 218 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Export all variables to sub-make processes. 2 | #.EXPORT_ALL_VARIABLES: #export 3 | 4 | # Automatically disable parallel builds 5 | # depending on the version of GNU Make. 
6 | # MAKE_PARALLEL=0: disable explicitly 7 | # MAKE_PARALLEL=1: enable explicitly 8 | ifeq (0,$(MAKE_PARALLEL)) 9 | .NOTPARALLEL: 10 | else ifeq (,$(MAKE_PARALLEL)) 11 | ifneq (3.82,$(firstword $(sort $(MAKE_VERSION) 3.82))) 12 | .NOTPARALLEL: 13 | endif 14 | endif 15 | 16 | # Linux cut has features we use that do not work elsewhere 17 | # Mac, etc. users should install GNU coreutils and use cut from there. 18 | # 19 | # For example, if you use Homebrew, run "brew install coreutils" once 20 | # and then invoke the LIBXSMM make command with 21 | # CUT=/usr/local/Cellar/coreutils/8.24/libexec/gnubin/cut 22 | CUT ?= cut 23 | 24 | ARCH = intel64 25 | 26 | ROOTDIR = $(abspath $(dir $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)))) 27 | INCDIR = $(ROOTDIR)/include 28 | SRCDIR = $(ROOTDIR)/src 29 | BLDDIR = build/$(ARCH) 30 | OUTDIR = lib/$(ARCH) 31 | DOCDIR = documentation 32 | 33 | CXXFLAGS = $(NULL) 34 | CFLAGS = $(NULL) 35 | DFLAGS = -D__extern_always_inline=inline -DLIBXSTREAM_EXPORTED 36 | IFLAGS = -I$(INCDIR) 37 | 38 | # Request strongest code conformance 39 | PEDANTIC ?= 0 40 | 41 | OFFLOAD ?= 1 42 | STATIC ?= 1 43 | OMP ?= 0 44 | SYM ?= 0 45 | DBG ?= 0 46 | IPO ?= 0 47 | EXP ?= 0 48 | SSE ?= 0 49 | AVX ?= 0 50 | 51 | OUTNAME = $(shell basename $(ROOTDIR)) 52 | HEADERS = $(shell ls -1 $(INCDIR)/*.h 2> /dev/null | tr "\n" " ") \ 53 | $(shell ls -1 $(SRCDIR)/*.hpp 2> /dev/null | tr "\n" " ") \ 54 | $(shell ls -1 $(SRCDIR)/*.hxx 2> /dev/null | tr "\n" " ") \ 55 | $(shell ls -1 $(SRCDIR)/*.hh 2> /dev/null | tr "\n" " ") 56 | CPPSRCS = $(shell ls -1 $(SRCDIR)/*.cpp 2> /dev/null | tr "\n" " ") 57 | CXXSRCS = $(shell ls -1 $(SRCDIR)/*.cxx 2> /dev/null | tr "\n" " ") 58 | CCXSRCS = $(shell ls -1 $(SRCDIR)/*.cc 2> /dev/null | tr "\n" " ") 59 | CSOURCS = $(shell ls -1 $(SRCDIR)/*.c 2> /dev/null | tr "\n" " ") 60 | FTNSRCS = $(shell ls -1 $(SRCDIR)/*.f 2> /dev/null | tr "\n" " ") 61 | F77SRCS = $(shell ls -1 $(SRCDIR)/*.F 2> /dev/null | tr "\n" " ") 62 | F90SRCS = $(shell
ls -1 $(SRCDIR)/*.f90 2> /dev/null | tr "\n" " ") 63 | FTNINCS = $(shell ls -1 $(DEPDIR)/include/*.f 2> /dev/null | tr "\n" " ") 64 | F77INCS = $(shell ls -1 $(DEPDIR)/include/*.F 2> /dev/null | tr "\n" " ") 65 | F90INCS = $(shell ls -1 $(DEPDIR)/include/*.f90 2> /dev/null | tr "\n" " ") 66 | FTNMODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(FTNINCS:.f=-mod.o))) 67 | F77MODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F77INCS:.F=-mod77.o))) 68 | F90MODS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F90INCS:.f90=-mod90.o))) 69 | MODULES = $(FTNMODS) $(F77MODS) $(F90MODS) 70 | SOURCES = $(CPPSRCS) $(CXXSRCS) $(CCXSRCS) $(CSOURCS) $(FTNSRCS) $(F77SRCS) $(F90SRCS) 71 | CPPOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CPPSRCS:.cpp=-cpp.o))) 72 | CXXOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CXXSRCS:.cxx=-cxx.o))) 73 | CCXOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CCXSRCS:.cc=-cc.o))) 74 | COBJCTS = $(patsubst %,$(BLDDIR)/%,$(notdir $(CSOURCS:.c=-c.o))) 75 | FTNOBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(FTNSRCS:.f=-f.o))) 76 | F77OBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F77SRCS:.F=-f77.o))) 77 | F90OBJS = $(patsubst %,$(BLDDIR)/%,$(notdir $(F90SRCS:.f90=-f90.o))) 78 | OBJECTS = $(CPPOBJS) $(CXXOBJS) $(CCXOBJS) $(COBJCTS) $(FTNOBJS) $(F77OBJS) $(F90OBJS) 79 | 80 | # include common Makefile artifacts 81 | include $(ROOTDIR)/Makefile.inc 82 | 83 | ifneq (0,$(STATIC)) 84 | LIBEXT = a 85 | else 86 | LIBEXT = so 87 | endif 88 | 89 | parent = $(subst ?, ,$(firstword $(subst /, ,$(subst $(NULL) ,?,$(patsubst ./%,%,$1))))) 90 | 91 | .PHONY: all 92 | all: $(OUTDIR)/$(OUTNAME).$(LIBEXT) 93 | 94 | $(OUTDIR)/$(OUTNAME).$(LIBEXT): $(OBJECTS) 95 | @mkdir -p $(dir $@) 96 | ifeq ($(STATIC),0) 97 | $(LD) -shared -o $@ $(LDFLAGS) $^ 98 | else 99 | $(AR) -rs $@ $^ 100 | endif 101 | 102 | $(BLDDIR)/%-mod.o: $(DEPDIR)/include/%.f $(ROOTDIR)/Makefile 103 | @mkdir -p $(dir $@) 104 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 105 | 106 | $(BLDDIR)/%-mod90.o: 
$(DEPDIR)/include/%.f90 $(ROOTDIR)/Makefile 107 | @mkdir -p $(dir $@) 108 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 109 | 110 | $(BLDDIR)/%-mod77.o: $(DEPDIR)/include/%.F $(ROOTDIR)/Makefile 111 | @mkdir -p $(dir $@) 112 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ $(FCMODDIRFLAG) $(dir $@) 113 | 114 | $(BLDDIR)/%-cpp.o: $(SRCDIR)/%.cpp $(HEADERS) $(ROOTDIR)/Makefile 115 | @mkdir -p $(dir $@) 116 | $(CXX) $(CXXFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 117 | 118 | $(BLDDIR)/%-c.o: $(SRCDIR)/%.c $(HEADERS) $(ROOTDIR)/Makefile 119 | @mkdir -p $(dir $@) 120 | $(CC) $(CFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 121 | 122 | $(BLDDIR)/%-f.o: $(SRCDIR)/%.f $(MODULES) $(ROOTDIR)/Makefile 123 | @mkdir -p $(dir $@) 124 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 125 | 126 | $(BLDDIR)/%-f90.o: $(SRCDIR)/%.f90 $(MODULES) $(ROOTDIR)/Makefile 127 | @mkdir -p $(dir $@) 128 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 129 | 130 | $(BLDDIR)/%-f77.o: $(SRCDIR)/%.F $(MODULES) $(ROOTDIR)/Makefile 131 | @mkdir -p $(dir $@) 132 | $(FC) $(FCFLAGS) $(FCMTFLAGS) $(DFLAGS) $(IFLAGS) $(TARGET) -c $< -o $@ 133 | 134 | $(DOCDIR)/libxstream.pdf: $(ROOTDIR)/README.md 135 | @mkdir -p $(dir $@) 136 | $(eval TEMPLATE := $(shell mktemp --tmpdir=. 
--suffix=.tex)) 137 | @pandoc -D latex > $(TEMPLATE) 138 | @TMPFILE=`mktemp` 139 | @sed -i ${TMPFILE} \ 140 | -e 's/\(\\documentclass\[.\+\]{.\+}\)/\1\n\\pagenumbering{gobble}\n\\RedeclareSectionCommands[beforeskip=-1pt,afterskip=1pt]{subsection,subsubsection}/' \ 141 | -e 's/\\usepackage{listings}/\\usepackage{listings}\\lstset{basicstyle=\\footnotesize\\ttfamily}/' \ 142 | $(TEMPLATE) 143 | @rm -f ${TMPFILE} 144 | @sed \ 145 | -e 's/https:\/\/raw\.githubusercontent\.com\/hfp\/libxstream\/master\///' \ 146 | -e 's/\[!\[.\+\](https:\/\/travis-ci.org\/hfp\/libxstream.svg?branch=.\+)\](.\+)//' \ 147 | -e 's/\[\[.\+\](.\+)\]//' \ 148 | -e '/!\[.\+\](.\+)/{n;d}' \ 149 | $(ROOTDIR)/README.md | \ 150 | pandoc \ 151 | --latex-engine=xelatex --template=$(TEMPLATE) --listings \ 152 | -f markdown_github+implicit_figures+all_symbols_escapable \ 153 | -V documentclass=scrartcl \ 154 | -V title-meta="LIBXSTREAM Documentation" \ 155 | -V author-meta="Hans Pabst" \ 156 | -V classoption=DIV=45 \ 157 | -V linkcolor=black \ 158 | -V citecolor=black \ 159 | -V urlcolor=black \ 160 | -o $@ 161 | @rm $(TEMPLATE) 162 | 163 | $(DOCDIR)/cp2k.pdf: $(ROOTDIR)/documentation/cp2k.md 164 | @mkdir -p $(dir $@) 165 | $(eval TEMPLATE := $(shell mktemp --tmpdir=. 
--suffix=.tex)) 166 | @pandoc -D latex > $(TEMPLATE) 167 | @TMPFILE=`mktemp` 168 | @sed -i ${TMPFILE} \ 169 | -e 's/\(\\documentclass\[.\+\]{.\+}\)/\1\n\\pagenumbering{gobble}\n\\RedeclareSectionCommands[beforeskip=-1pt,afterskip=1pt]{subsection,subsubsection}/' \ 170 | -e 's/\\usepackage{listings}/\\usepackage{listings}\\lstset{basicstyle=\\footnotesize\\ttfamily}/' \ 171 | $(TEMPLATE) 172 | @rm -f ${TMPFILE} 173 | @sed \ 174 | -e 's/https:\/\/raw\.githubusercontent\.com\/hfp\/libxstream\/master\///' \ 175 | -e 's/\[!\[.\+\](https:\/\/travis-ci.org\/hfp\/libxstream.svg?branch=.\+)\](.\+)//' \ 176 | -e 's/\[\[.\+\](.\+)\]//' \ 177 | -e '/!\[.\+\](.\+)/{n;d}' \ 178 | $(ROOTDIR)/documentation/cp2k.md | \ 179 | pandoc \ 180 | --latex-engine=xelatex --template=$(TEMPLATE) --listings \ 181 | -f markdown_github+implicit_figures+all_symbols_escapable \ 182 | -V documentclass=scrartcl \ 183 | -V title-meta="CP2K with LIBXSTREAM" \ 184 | -V author-meta="Hans Pabst" \ 185 | -V classoption=DIV=45 \ 186 | -V linkcolor=black \ 187 | -V citecolor=black \ 188 | -V urlcolor=black \ 189 | -o $@ 190 | @rm $(TEMPLATE) 191 | 192 | .PHONY: documentation 193 | documentation: $(DOCDIR)/libxstream.pdf $(DOCDIR)/cp2k.pdf 194 | 195 | .PHONY: clean 196 | clean: 197 | ifneq ($(abspath $(call parent,$(BLDDIR))),$(ROOTDIR)) 198 | ifneq ($(abspath $(call parent,$(BLDDIR))),$(abspath .)) 199 | @rm -rf $(call parent,$(BLDDIR)) *.mod 200 | else 201 | @rm -f $(OBJECTS) $(BLDDIR)/*.mod 202 | endif 203 | else 204 | @rm -f $(OBJECTS) $(BLDDIR)/*.mod 205 | endif 206 | 207 | .PHONY: realclean 208 | realclean: clean 209 | ifneq ($(abspath $(call parent,$(OUTDIR))),$(ROOTDIR)) 210 | ifneq ($(abspath $(call parent,$(OUTDIR))),$(abspath .)) 211 | @rm -rf $(call parent,$(OUTDIR)) 212 | else 213 | @rm -f $(OUTDIR)/$(OUTNAME) 214 | endif 215 | else 216 | @rm -f $(OUTDIR)/$(OUTNAME) 217 | endif 218 | 219 | install: all clean 220 | @cp -r $(INCDIR) . 
2> /dev/null || true 221 | 222 | -------------------------------------------------------------------------------- /samples/multi-dgemm/multi-dgemm.cpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Copyright (c) 2014-2016, Intel Corporation ** 3 | ** All rights reserved. ** 4 | ** ** 5 | ** Redistribution and use in source and binary forms, with or without ** 6 | ** modification, are permitted provided that the following conditions ** 7 | ** are met: ** 8 | ** 1. Redistributions of source code must retain the above copyright ** 9 | ** notice, this list of conditions and the following disclaimer. ** 10 | ** 2. Redistributions in binary form must reproduce the above copyright ** 11 | ** notice, this list of conditions and the following disclaimer in the ** 12 | ** documentation and/or other materials provided with the distribution. ** 13 | ** 3. Neither the name of the copyright holder nor the names of its ** 14 | ** contributors may be used to endorse or promote products derived ** 15 | ** from this software without specific prior written permission. ** 16 | ** ** 17 | ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ** 18 | ** "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ** 19 | ** LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ** 20 | ** A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ** 21 | ** HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ** 22 | ** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ** 23 | ** TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ** 24 | ** PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ** 25 | ** LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ** 26 | ** NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ** 27 | ** SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ** 28 | ******************************************************************************/ 29 | /* Hans Pabst (Intel Corp.) 30 | ******************************************************************************/ 31 | #include "multi-dgemm-type.hpp" 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #if defined(_OPENMP) 42 | # include 43 | #endif 44 | #include 45 | 46 | #define SYNCMETHOD 2 47 | //#define CHECK 48 | 49 | #define DGEMM dgemm_ 50 | 51 | 52 | LIBXSTREAM_EXTERN_C LIBXSTREAM_RETARGETABLE void DGEMM( 53 | const char*, const char*, const int*, const int*, const int*, 54 | const double*, const double*, const int*, const double*, const int*, 55 | const double*, double*, const int*); 56 | 57 | 58 | LIBXSTREAM_RETARGETABLE void process(LIBXSTREAM_INVAL(size_t) size, LIBXSTREAM_INVAL(size_t) nn, const size_t* idata, 59 | const double* adata, const double* bdata, double* cdata) 60 | { 61 | if (0 < LIBXSTREAM_GETVAL(size)) { 62 | static const double alpha = 1, beta = 1; 63 | static const char trans = 'N'; 64 | const int isize = static_cast(size); 65 | const size_t base = idata[0]; 66 | 67 | for (int i = 0; i < isize; ++i) { 68 | LIBXSTREAM_ASSERT(base <= idata[i]); 69 | const size_t i0 = idata[i], i1 = (i + 1) < isize ? 
idata[i+1] : (i0 + LIBXSTREAM_GETVAL(nn)), n2 = i1 - i0, offset = i0 - base; 70 | const int n = static_cast(std::sqrt(static_cast(n2)) + 0.5); 71 | DGEMM(&trans, &trans, &n, &n, &n, &alpha, adata + offset, &n, bdata + offset, &n, &beta, cdata + offset, &n); 72 | } 73 | } 74 | } 75 | 76 | 77 | int main(int argc, char* argv[]) 78 | { 79 | try { 80 | const size_t nitems = std::max(1 < argc ? std::atoi(argv[1]) : 60, 0); 81 | const size_t nbatch = std::max(2 < argc ? std::atoi(argv[2]) : 5, 1); 82 | const size_t mstreams = std::min(std::max(3 < argc ? std::atoi(argv[3]) : 2, 1), LIBXSTREAM_MAX_NSTREAMS); 83 | #if !defined(_OPENMP) 84 | LIBXSTREAM_PRINT0(1, "OpenMP support needed for performance results!"); 85 | #endif 86 | 87 | size_t ndevices = 0; 88 | if (LIBXSTREAM_ERROR_NONE != libxstream_get_ndevices(&ndevices) || 0 == ndevices) { 89 | LIBXSTREAM_PRINT0(2, "No device found or device not ready!"); 90 | } 91 | 92 | fprintf(stdout, "Initializing %i device%s and host data...", static_cast(ndevices), 1 == ndevices ? "" : "s"); 93 | const size_t split[] = { size_t(nitems * 18.0 / 250.0 + 0.5), size_t(nitems * 74.0 / 250.0 + 0.5) }; 94 | multi_dgemm_type::host_data_type host_data(reinterpret_cast(&process), nitems, split); 95 | fprintf(stdout, " %.1f MB\n", host_data.bytes() * 1E-6); 96 | 97 | fprintf(stdout, "Initializing %i stream%s per device...", static_cast(mstreams), 1 < mstreams ? "s" : ""); 98 | const size_t nstreams = LIBXSTREAM_MAX(mstreams, 1) * LIBXSTREAM_MAX(ndevices, 1); 99 | multi_dgemm_type multi_dgemm[LIBXSTREAM_MAX_NSTREAMS]; 100 | for (size_t i = 0; i < nstreams; ++i) { 101 | char name[128]; 102 | LIBXSTREAM_SNPRINTF(name, sizeof(name), "Stream %i", static_cast(i + 1)); 103 | LIBXSTREAM_CHECK_CALL_THROW(multi_dgemm[i].init(name, host_data, 0 < ndevices ? 
static_cast(i % ndevices) : -1, nbatch)); 104 | } 105 | if (0 < nstreams) { 106 | fprintf(stdout, " %.1f MB\n", mstreams * multi_dgemm[0].bytes() * 1E-6); 107 | } 108 | 109 | // start benchmark with no pending work 110 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_stream_wait(0)); 111 | 112 | const size_t nbatches = (nitems + nbatch - 1) / nbatch; 113 | fprintf(stdout, "Running %i batch%s of %i item%s...\n", static_cast(nbatches), 114 | 1 < nbatches ? "es" : "", static_cast(std::min(nbatch, nitems)), 115 | 1 < nbatch ? "s" : ""); 116 | 117 | const int end = static_cast(nitems), ninc = static_cast(nbatch * nstreams); 118 | #if defined(_OPENMP) 119 | const double start = omp_get_wtime(); 120 | #endif 121 | for (int i = 0; i < end; i += ninc) { 122 | const size_t n = std::min(nstreams, end - i); 123 | 124 | for (size_t j = 0; j < n; ++j) { // enqueue work into streams 125 | const size_t batch = j * nbatch, base = i + batch, size = base < nitems ? std::min(nbatch, nitems - base) : 0; 126 | multi_dgemm_type& call = multi_dgemm[j]; 127 | LIBXSTREAM_CHECK_CALL_ASSERT(call(base, size)); 128 | #if defined(SYNCMETHOD) && (2 <= SYNCMETHOD) // record event 129 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_record(call.event(), call.stream())); 130 | #endif 131 | } 132 | 133 | #if defined(SYNCMETHOD) 134 | for (size_t j = 0; j < n; ++j) { // synchronize streams 135 | const size_t k = n - j - 1; // j-reverse 136 | # if (3 <= (SYNCMETHOD)) 137 | // wait for an event within a stream 138 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait_event(multi_dgemm[k].stream(), multi_dgemm[(j+nstreams-1)%n].event())); 139 | # elif (2 <= (SYNCMETHOD)) 140 | // wait for an event on the host 141 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_wait(multi_dgemm[k].event())); 142 | # else 143 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(multi_dgemm[k].stream())); 144 | # endif 145 | } 146 | #endif 147 | } 148 | 149 | // wait for all streams to complete pending work 150 | 
LIBXSTREAM_CHECK_CALL_THROW(libxstream_stream_wait(0)); 151 | 152 | #if defined(_OPENMP) 153 | const double duration = omp_get_wtime() - start; 154 | if (0 < duration) { 155 | fprintf(stdout, "Performance: %.1f GFLOPS/s\n", host_data.flops() * 1E-9 / duration); 156 | } 157 | fprintf(stdout, "Duration: %.1f s\n", duration); 158 | #endif 159 | 160 | #if !defined(CHECK) 161 | const char *const check_env = getenv("CHECK"); 162 | if (check_env && *check_env && 0 != atoi(check_env)) 163 | #endif 164 | { 165 | std::vector expected(host_data.max_matrix_size()); 166 | const size_t testbatchsize = 1; 167 | double max_error = 0; 168 | size_t i0 = 0; 169 | for (size_t i = 0; i < nitems; ++i) { 170 | const size_t i1 = host_data.idata()[i+1]; 171 | const int nn = static_cast(i1 - i0); 172 | std::fill_n(&expected[0], nn, 0.0); 173 | process(LIBXSTREAM_SETVAL(testbatchsize), LIBXSTREAM_SETVAL(nn), host_data.idata() + i, host_data.adata() + i0, host_data.bdata() + i0, &expected[0]); 174 | for (int n = 0; n < nn; ++n) max_error = std::max(max_error, std::abs(expected[n] - host_data.cdata()[i0+n])); 175 | i0 = i1; 176 | } 177 | fprintf(stdout, "Error: %g\n", max_error); 178 | } 179 | fprintf(stdout, "Finished\n"); 180 | } 181 | catch(const std::exception& e) { 182 | fprintf(stderr, "Error: %s\n", e.what()); 183 | return EXIT_FAILURE; 184 | } 185 | catch(...) { 186 | fprintf(stderr, "Error: unknown exception caught!\n"); 187 | return EXIT_FAILURE; 188 | } 189 | 190 | return EXIT_SUCCESS; 191 | } 192 | -------------------------------------------------------------------------------- /src/acc_opencl_event.c: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------------------------*/ 2 | /* Copyright (C) by the DBCSR developers group - All rights reserved */ 3 | /* This file is part of the DBCSR library. */ 4 | /* */ 5 | /* For information on the license, see the LICENSE file. 
*/ 6 | /* For further information please visit https://dbcsr.cp2k.org */ 7 | /* SPDX-License-Identifier: BSD-3-Clause */ 8 | /*------------------------------------------------------------------------------------------------*/ 9 | #if defined(__OPENCL) 10 | # include "acc_opencl.h" 11 | 12 | 13 | # if defined(__cplusplus) 14 | extern "C" { 15 | # endif 16 | 17 | int c_dbcsr_acc_event_create(void** event_p) { 18 | int result = EXIT_SUCCESS; 19 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 20 | int routine_handle; 21 | if (0 != c_dbcsr_acc_opencl_config.profile) { 22 | static const char* routine_name_ptr = LIBXSMM_FUNCNAME + ACC_OPENCL_PROFILE_DBCSR; 23 | static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - (ACC_OPENCL_PROFILE_DBCSR + 1); 24 | c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); 25 | } 26 | # endif 27 | assert(NULL != c_dbcsr_acc_opencl_config.events && NULL != event_p); 28 | *event_p = c_dbcsr_acc_opencl_pmalloc( 29 | c_dbcsr_acc_opencl_config.lock_event, (void**)c_dbcsr_acc_opencl_config.events, &c_dbcsr_acc_opencl_config.nevents); 30 | if (NULL != *event_p) *(cl_event*)*event_p = NULL; 31 | else result = EXIT_FAILURE; 32 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 33 | if (0 != c_dbcsr_acc_opencl_config.profile) c_dbcsr_timestop(&routine_handle); 34 | # endif 35 | ACC_OPENCL_RETURN(result); 36 | } 37 | 38 | 39 | int c_dbcsr_acc_event_destroy(void* event) { 40 | int result = EXIT_SUCCESS; 41 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 42 | int routine_handle; 43 | if (0 != c_dbcsr_acc_opencl_config.profile) { 44 | static const char* routine_name_ptr = LIBXSMM_FUNCNAME + ACC_OPENCL_PROFILE_DBCSR; 45 | static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - (ACC_OPENCL_PROFILE_DBCSR + 1); 46 | c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); 47 | } 48 | # endif 49 | if (NULL != event) { 50 | const cl_event clevent = *ACC_OPENCL_EVENT(event); 51 | assert(NULL != 
c_dbcsr_acc_opencl_config.events); 52 | ACC_OPENCL_ACQUIRE(c_dbcsr_acc_opencl_config.lock_event); 53 | c_dbcsr_acc_opencl_pfree(event, (void**)c_dbcsr_acc_opencl_config.events, &c_dbcsr_acc_opencl_config.nevents); 54 | if (NULL != clevent) { 55 | result = clReleaseEvent(clevent); 56 | # if !defined(NDEBUG) 57 | *(cl_event*)event = NULL; 58 | # endif 59 | } 60 | ACC_OPENCL_RELEASE(c_dbcsr_acc_opencl_config.lock_event); 61 | } 62 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 63 | if (0 != c_dbcsr_acc_opencl_config.profile) c_dbcsr_timestop(&routine_handle); 64 | # endif 65 | ACC_OPENCL_RETURN(result); 66 | } 67 | 68 | 69 | int c_dbcsr_acc_stream_wait_event(void* stream, void* event) { /* wait for an event (device-side) */ 70 | int result = EXIT_SUCCESS; 71 | const c_dbcsr_acc_opencl_stream_t* str = NULL; 72 | cl_event clevent = NULL; 73 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 74 | int routine_handle; 75 | if (0 != c_dbcsr_acc_opencl_config.profile) { 76 | static const char* routine_name_ptr = LIBXSMM_FUNCNAME + ACC_OPENCL_PROFILE_DBCSR; 77 | static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - (ACC_OPENCL_PROFILE_DBCSR + 1); 78 | c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); 79 | } 80 | # endif 81 | str = (NULL != stream ? 
ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream_default()); 82 | assert(NULL != str && NULL != str->queue && NULL != event); 83 | clevent = *ACC_OPENCL_EVENT(event); 84 | if (NULL != clevent) { 85 | # if defined(CL_VERSION_1_2) 86 | result = clEnqueueBarrierWithWaitList(str->queue, 1, &clevent, NULL); 87 | # else 88 | result = clEnqueueWaitForEvents(str->queue, 1, &clevent); 89 | # endif 90 | if (EXIT_SUCCESS != result) { 91 | ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseEvent(clevent)); 92 | *(cl_event*)event = NULL; 93 | } 94 | } 95 | else if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { 96 | fprintf(stderr, "WARN ACC/OpenCL: c_dbcsr_acc_stream_wait_event discovered an empty event.\n"); 97 | } 98 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 99 | if (0 != c_dbcsr_acc_opencl_config.profile) c_dbcsr_timestop(&routine_handle); 100 | # endif 101 | ACC_OPENCL_RETURN(result); 102 | } 103 | 104 | 105 | int c_dbcsr_acc_event_record(void* event, void* stream) { 106 | int result = EXIT_SUCCESS; 107 | const c_dbcsr_acc_opencl_stream_t* str = NULL; 108 | cl_event clevent = NULL, clevent_result = NULL; 109 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 110 | int routine_handle; 111 | if (0 != c_dbcsr_acc_opencl_config.profile) { 112 | static const char* routine_name_ptr = LIBXSMM_FUNCNAME + ACC_OPENCL_PROFILE_DBCSR; 113 | static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - (ACC_OPENCL_PROFILE_DBCSR + 1); 114 | c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); 115 | } 116 | # endif 117 | str = (NULL != stream ? 
ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream_default()); 118 | assert(NULL != str && NULL != str->queue && NULL != event); 119 | clevent = *ACC_OPENCL_EVENT(event); 120 | # if defined(CL_VERSION_1_2) 121 | result = clEnqueueMarkerWithWaitList(str->queue, 0, NULL, &clevent_result); 122 | # else 123 | result = clEnqueueMarker(str->queue, &clevent_result); 124 | # endif 125 | if (NULL != clevent) { 126 | const int result_release = clReleaseEvent(clevent); 127 | if (EXIT_SUCCESS == result) result = result_release; 128 | } 129 | if (EXIT_SUCCESS == result) { 130 | assert(NULL != clevent_result); 131 | *(cl_event*)event = clevent_result; 132 | } 133 | else { 134 | if (NULL != clevent_result) ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseEvent(clevent_result)); 135 | *(cl_event*)event = NULL; 136 | } 137 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 138 | if (0 != c_dbcsr_acc_opencl_config.profile) c_dbcsr_timestop(&routine_handle); 139 | # endif 140 | ACC_OPENCL_RETURN(result); 141 | } 142 | 143 | 144 | int c_dbcsr_acc_event_query(void* event, c_dbcsr_acc_bool_t* has_occurred) { 145 | cl_int status = CL_COMPLETE; 146 | int result; 147 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 148 | int routine_handle; 149 | if (0 != c_dbcsr_acc_opencl_config.profile) { 150 | static const char* routine_name_ptr = LIBXSMM_FUNCNAME + ACC_OPENCL_PROFILE_DBCSR; 151 | static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - (ACC_OPENCL_PROFILE_DBCSR + 1); 152 | c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); 153 | } 154 | # endif 155 | assert(NULL != event && NULL != has_occurred); 156 | result = clGetEventInfo(*ACC_OPENCL_EVENT(event), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL); 157 | if (EXIT_SUCCESS == result && 0 <= status) *has_occurred = (CL_COMPLETE == status ? 
1 : 0); 158 | else { /* error state */ 159 | result = EXIT_SUCCESS; /* soft-error */ 160 | *has_occurred = 1; 161 | } 162 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 163 | if (0 != c_dbcsr_acc_opencl_config.profile) c_dbcsr_timestop(&routine_handle); 164 | # endif 165 | ACC_OPENCL_RETURN(result); 166 | } 167 | 168 | 169 | int c_dbcsr_acc_event_synchronize(void* event) { /* waits on the host-side */ 170 | int result = EXIT_SUCCESS; 171 | cl_event clevent; 172 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 173 | int routine_handle; 174 | if (0 != c_dbcsr_acc_opencl_config.profile) { 175 | static const char* routine_name_ptr = LIBXSMM_FUNCNAME + ACC_OPENCL_PROFILE_DBCSR; 176 | static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - (ACC_OPENCL_PROFILE_DBCSR + 1); 177 | c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); 178 | } 179 | # endif 180 | assert(NULL != event); 181 | clevent = *ACC_OPENCL_EVENT(event); 182 | if (NULL != clevent) { 183 | if (0 == (32 & c_dbcsr_acc_opencl_config.wa)) { 184 | cl_int status = CL_COMPLETE + 1; 185 | if (32 & c_dbcsr_acc_opencl_config.xhints) { 186 | result = clGetEventInfo(clevent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, NULL); 187 | assert(EXIT_SUCCESS == result || CL_COMPLETE != status); 188 | } 189 | if (CL_COMPLETE != status) result = clWaitForEvents(1, &clevent); 190 | } 191 | else { 192 | cl_command_queue queue = NULL; 193 | result = clGetEventInfo(clevent, CL_EVENT_COMMAND_QUEUE, sizeof(cl_command_queue), &queue, NULL); 194 | if (EXIT_SUCCESS == result) result = clFinish(queue); 195 | } 196 | } 197 | else if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { 198 | fprintf(stderr, "WARN ACC/OpenCL: c_dbcsr_acc_event_synchronize discovered an empty event.\n"); 199 | } 200 | # if defined(ACC_OPENCL_PROFILE_DBCSR) 201 | if (0 != c_dbcsr_acc_opencl_config.profile) c_dbcsr_timestop(&routine_handle); 202 | # endif 203 | 
ACC_OPENCL_RETURN(result); 204 | } 205 | 206 | # if defined(__cplusplus) 207 | } 208 | # endif 209 | 210 | #endif /*__OPENCL*/ 211 | -------------------------------------------------------------------------------- /scripts/acc_opencl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #################################################################################################### 3 | # Copyright (C) by the DBCSR developers group - All rights reserved # 4 | # This file is part of the DBCSR library. # 5 | # # 6 | # For information on the license, see the LICENSE file. # 7 | # For further information please visit https://dbcsr.cp2k.org # 8 | # SPDX-License-Identifier: BSD-3-Clause # 9 | #################################################################################################### 10 | # shellcheck disable=SC2048,SC2129 11 | 12 | BASENAME=$(command -v basename) 13 | DIRNAME=$(command -v dirname) 14 | HEAD=$(command -v head) 15 | SORT=$(command -v sort) 16 | SED=$(command -v gsed) 17 | CAT=$(command -v cat) 18 | CPP=$(command -v cpp) 19 | TR=$(command -v tr) 20 | RM=$(command -v rm) 21 | WC=$(command -v wc) 22 | 23 | # flags used to control preprocessor 24 | CPPBASEFLAGS="-dD -P -fpreprocessed" 25 | 26 | # delimiters allowed in CSV-file 27 | DELIMS=";,\t|/" 28 | 29 | # GNU sed is desired (macOS) 30 | if [ ! "${SED}" ]; then 31 | SED=$(command -v sed) 32 | fi 33 | 34 | trap_exit() { 35 | if [ "0" != "$?" 
] && [ "${HFILE}" ]; then ${RM} -f "${OFILE}"; fi 36 | }

# process_pre FILE [KEEP]
# Writes FILE to stdout with comments removed: by the C preprocessor if it
# accepts the file (CPPFLAGS are added if cpp accepts them as well), otherwise
# by a sed script stripping /* */ comments. Unless KEEP is given and not "0",
# lines mentioning the include guard (upper-cased file name, "." -> "_") are
# deleted together with a trailing #endif.
process_pre() {
  if [ "$1" ]; then
    if [ "${CPP}" ] && \
       [ "$(eval "${CPP} ${CPPBASEFLAGS} $1" 2>/dev/null >/dev/null && echo "YES")" ];
    then
      if [ "${CPPFLAGS}" ] && \
         [ "$(eval "${CPP} ${CPPFLAGS} ${CPPBASEFLAGS} $1" 2>/dev/null >/dev/null && echo "YES")" ];
      then
        eval "${CPP} ${CPPFLAGS} ${CPPBASEFLAGS} $1" 2>/dev/null
      else
        eval "${CPP} ${CPPBASEFLAGS} $1" 2>/dev/null
      fi
    else # fallback to sed
      ${SED} -r ':a;s%(.*)/\*.*\*/%\1%;ta;/\/\*/!b;N;ba' "$1"
    fi | \
    if [ ! "$2" ] || [ "0" = "$2" ]; then # strip include guards
      GUARD=$(${BASENAME} "$1" | ${TR} '[:lower:]' '[:upper:]' | ${TR} '.' '_')
      if [ "${GUARD}" ] && [ "$(${SED} -n "/${GUARD}/p" "$1")" ]; then
        ${SED} "/${GUARD}/d;\${/\s*\#\s*endif/d}"
      else
        ${CAT}
      fi
    else
      ${CAT}
    fi
  fi
}

# process FILE [KEEP]
# Reads source text from stdin and emits it as continuation lines of a C string
# literal (quotes and backslashes escaped, blank lines dropped). A line of the
# form  #include "x"  is replaced by the processed content of x, looked up
# relative to the directory of FILE; a missing header only yields a warning.
process() {
  IFS=$'\n'
  # "|| [ -n ... ]": also handle a final line that lacks the trailing newline
  while read -r LINE || [ -n "${LINE}" ]; do
    INCLUDE=$(${SED} -n "s/#[[:space:]]*include[[:space:]][[:space:]]*\"/\"/p" <<<"${LINE}")
    if [ "${INCLUDE}" ] && [ "$1" ] && [ -e "$1" ]; then
      CLINC=$(${SED} "s/\"//g" <<<"${INCLUDE}")
      CLPATH=$(${DIRNAME} "$1")
      FILE=${CLPATH}/${CLINC}
      if [ "${FILE}" ] && [ -e "${FILE}" ]; then
        process_pre "${FILE}" "$2" | process "${FILE}" "$2"
      else
        >&2 echo "WARNING: header file ${FILE} not found!"
        #exit 1
      fi
    else
      ${SED} <<<"${LINE}" \
        -e '/^[[:space:]]*$/d' -e 's/[[:space:]]*$//' \
        -e 's/[[:space:]]*\\/ \\\\/g' -e 's/"/\\"/g' \
        -e 's/^/ "/' -e 's/$/\\n" \\/'
    fi
  done
  unset IFS
}

# Main: the last argument is the output file, leading options are consumed, the
# remaining arguments are OpenCL/header inputs followed by optional CSV files.
if [ "${BASENAME}" ] && [ "${DIRNAME}" ] && [ "${HEAD}" ] && [ "${SORT}" ] && \
   [ "${SED}" ] && [ "${CAT}" ] && [ "${TR}" ] && [ "${RM}" ] && [ "${WC}" ];
then
  for OFILE in "$@"; do :; done
  while test $# -gt 0; do
    case "$1" in
    -h|--help)
      shift $#;;
    -k|--keep)
      KEEP=1
      shift;;
    -b|--banner)
      BANNER=$2
      shift 2;;
    -p|--params)
      # the tab marks the option as given even if the path is empty
      PARAMS="$2\t"
      shift 2;;
    -c|-d|--debug|--comments)
      CPPFLAGS+=" -C"
      shift;;
    -v|--verbose)
      VERBOSE=1
      shift;;
    *) break;;
    esac
  done
  HERE="$(cd "$(${DIRNAME} "$0")" && pwd -P)"
  PARAMDIR=${PARAMDIR:-${PARAMS}}
  PARAMDIR=${PARAMDIR:-${HERE}/smm/params}
  PARAMDIR=$(echo -e "${PARAMDIR}" | ${TR} -d '\t')
  if [ "$#" -gt 1 ]; then
    # allow for instance /dev/stdout
    if [ "${OFILE##*.}" = "h" ]; then
      if [ "${VERBOSE}" ] && [ "0" != "${VERBOSE}" ]; then
        echo "$0 $*" # stdout
      fi
      # create/empty the header with a plain redirection (no dependency on the
      # truncate utility, which is not part of the prerequisite check above)
      : >"${OFILE}"
      HFILE=${OFILE}
    elif [ "${OFILE##*.}" = "cl" ] || [ "${OFILE##*.}" = "csv" ]; then
      >&2 echo "ERROR: no output/header file given!"
      exit 1
    elif [ "${VERBOSE}" ] && [ "0" != "${VERBOSE}" ]; then
      if [[ ${OFILE} != /dev/stderr ]]; then
        >&2 echo "$0 $*"
      else # stdout
        echo "$0 $*"
      fi
    fi
    trap 'trap_exit' EXIT
    RNAME=$(${BASENAME} "$(cd "$(${DIRNAME} "$1")" && pwd -P)")
    ANAME=$(${TR} '[:lower:]' '[:upper:]' <<<"${RNAME}")
    NFILES_OCL=0
    # all arguments but the last; quoted so that paths are not split or globbed
    for CLFILE in "${@:1:${#@}-1}"; do
      if [ "${CLFILE##*.}" = "cl" ]; then
        CLEXT=".cl"
      elif [ "${CLFILE##*.}" = "h" ]; then
        CLEXT=".h"
      else
        CLEXT=""
      fi
      if [ "${CLEXT}" ]; then
        if [ -e "${CLFILE}" ]; then
          CNAME=$(${BASENAME} "${CLFILE}" "${CLEXT}" | ${SED} "s/${RNAME}_//;s/_opencl//")
          BNAME=$(${TR} '[:lower:]' '[:upper:]' <<<"${CNAME}")
          SNAME=OPENCL_${ANAME}_STRING_${BNAME}
          VNAME=opencl_${RNAME}_source_${CNAME}
          MNAME=OPENCL_${ANAME}_SOURCE_${BNAME}
          if [ "0" != "$((0<(NFILES_OCL)))" ]; then
            echo
          elif [ "${BANNER}" ] && [ "0" != "${BANNER}" ]; then
            ${HEAD} -n"${BANNER}" "${CLFILE}"
          fi
          echo "#define ${MNAME} ${VNAME}"
          echo "#define ${SNAME} \\"
          process_pre "${CLFILE}" "${KEEP}" | process "${CLFILE}" "${KEEP}"
          echo " \"\""
          echo "static const char ${VNAME}[] = ${SNAME};"
          NFILES_OCL=$((NFILES_OCL+1))
        else
          >&2 echo "ERROR: ${CLFILE} does not exist!"
          exit 1
        fi >>"${OFILE}"
      else
        # remaining inputs (excluding the output file) are taken as CSV files;
        # "$@" keeps one array element per argument ("$*" would join them)
        CSVFILES=("${@:NFILES_OCL+1:${#@}-NFILES_OCL-1}")
        break
      fi
    done
    if [ "0" = "${NFILES_OCL}" ]; then
      >&2 echo "ERROR: no OpenCL file was given!"
      exit 1
    fi
    NFILES_CSV=0
    for CSVFILE in "${CSVFILES[@]}"; do
      if [ "${CSVFILE##*.}" = "csv" ]; then
        if [ -f "${CSVFILE}" ]; then
          NFILES_CSV=$((NFILES_CSV+1))
        fi
      else
        >&2 echo "ERROR: ${CSVFILE} is not a CSV file!"
        exit 1
      fi
    done
    # no usable CSV file on the command line: take all CSV files of PARAMDIR
    if [ "0" = "${NFILES_CSV}" ] && [ "${PARAMDIR}" ] && [ -d "${PARAMDIR}" ]; then
      CSVFILES=("${PARAMDIR}"/*.csv)
      NFILES_CSV=${#CSVFILES[@]}
    fi
    # consistency check: every non-blank line must carry the same delimiters as
    # the header line of the file that determined the delimiter
    for CSVFILE in "${CSVFILES[@]}"; do
      ERRFILE="" # judge each file on its own (a stale value repeated the warning)
      if [ ! "${DELIM}" ]; then
        SEPAR=$(${SED} -n "1s/[^${DELIMS}]//gp" "${CSVFILE}" 2>/dev/null)
        DELIM=${SEPAR:0:1}
        MATCH=$(${SED} -n "1s/[^${DELIM}]//gp" "${CSVFILE}" 2>/dev/null)
      fi
      if [ "${DELIM}" ]; then
        CHECK=$(${SED} "/^[[:space:]]*$/d;s/[^${DELIM}]//g" "${CSVFILE}" | ${SORT} -u | ${SED} -n "0,/./p")
        if [ "0" != "$((${#MATCH}<${#CHECK}))" ]; then
          ERRFILE=${CSVFILES[0]}
        elif [ "${MATCH}" != "${CHECK}" ]; then
          ERRFILE=${CSVFILE}
        fi
      else
        ERRFILE=${CSVFILE}
      fi
      if [ "${ERRFILE}" ] && [ -f "${ERRFILE}" ]; then
        >&2 echo "WARNING: ${ERRFILE} is malformed and ignored!"
      fi
    done
    # first column of every data row names the device
    DEVPAT="s/${DELIM}..*//"
    DEVICES=$(for CSVFILE in "${CSVFILES[@]}"; do ${SED} "1d;/^[[:space:]]*$/d;${DEVPAT}" "${CSVFILE}"; done | ${SORT} -u)
    SNAME=OPENCL_${ANAME}_STRING_PARAMS_SMM
    VNAME=opencl_${RNAME}_params_smm
    DNAME=opencl_${RNAME}_devices
    MNAME=$(${TR} '[:lower:]' '[:upper:]' <<<"${VNAME}")
    NNAME=$(${TR} '[:lower:]' '[:upper:]' <<<"${DNAME}")
    if [ "${DEVICES}" ]; then
      echo
      echo "#define ${MNAME} ${VNAME}"
      echo "#define ${SNAME} \\"
      CSVLINES=$(for CSVFILE in "${CSVFILES[@]}"; do ${SED} "1d;/^[[:space:]]*$/d;s/[\r]*$/\\\n\" \\\/" "${CSVFILE}"; done)
      IFS=$'\n'
      for LINE in ${CSVLINES}; do
        # replace the device name by its index in the sorted device list
        I=0; IDEVICE=$(${SED} "${DEVPAT}" <<<"${LINE}")
        for DEVICE in ${DEVICES}; do
          if [ "${DEVICE}" = "${IDEVICE}" ]; then break; fi
          I=$((I+1));
        done
        ${SED} "s/[^${DELIM}]*/ \"${I}/" <<<"${LINE}"
      done
      echo " \"\""
      echo "static const char ${VNAME}[] = ${SNAME};"
      echo
      echo "#define ${NNAME} ${DNAME}"
      echo "static const char *const ${DNAME}[] = {"
      I=0; S=","; NDEVICES=$(${WC} -l <<<"${DEVICES}")
      for DEVICE in ${DEVICES}; do
        I=$((I+1)); if [ "0" != "$((NDEVICES==I))" ]; then S=""; fi
        echo " \"${DEVICE}\"${S}"
      done
      unset IFS
      echo "};"
    fi >>"${OFILE}"
  else
    echo "Usage: $0 infile.cl [infile2.cl .. infileN.cl] [infile.csv [.. infileN.csv]] outfile.h"
    echo " At least one OpenCL file and one header file must be supplied."
    echo " -k|--keep: do not strip include guards (stripped even if necessary)"
    echo " -b|--banner N: number of lines used as banner (default: 0)"
    echo " -p|--params P: directory-path to CSV-files (can be \"\")"
    echo " default: ${PARAMDIR}"
    echo " -c|-d|--debug|--comments: keep comments in source-code"
    echo " -v|--verbose: repeat command-line arguments"
  fi
else
  >&2 echo "ERROR: missing prerequisites!"
  exit 1
fi

-------------------------------------------------------------------------------- /samples/multi-dgemm/multi-dgemm-type.cpp: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | ** Copyright (c) 2014-2016, Intel Corporation ** 3 | ** All rights reserved. ** 4 | ** ** 5 | ** Redistribution and use in source and binary forms, with or without ** 6 | ** modification, are permitted provided that the following conditions ** 7 | ** are met: ** 8 | ** 1. Redistributions of source code must retain the above copyright ** 9 | ** notice, this list of conditions and the following disclaimer. ** 10 | ** 2. Redistributions in binary form must reproduce the above copyright ** 11 | ** notice, this list of conditions and the following disclaimer in the ** 12 | ** documentation and/or other materials provided with the distribution. ** 13 | ** 3.
Neither the name of the copyright holder nor the names of its ** 14 | ** contributors may be used to endorse or promote products derived ** 15 | ** from this software without specific prior written permission. ** 16 | ** ** 17 | ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ** 18 | ** "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ** 19 | ** LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ** 20 | ** A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ** 21 | ** HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ** 22 | ** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ** 23 | ** TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ** 24 | ** PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ** 25 | ** LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ** 26 | ** NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ** 27 | ** SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ** 28 | ******************************************************************************/ 29 | /* Hans Pabst (Intel Corp.) 
30 | ******************************************************************************/ 31 | #include "multi-dgemm-type.hpp" 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | 39 | multi_dgemm_type::host_data_type::host_data_type(libxstream_function process, size_t size, const size_t split[]) 40 | : m_process(process) 41 | , m_adata(0), m_bdata(0), m_cdata(0), m_idata(0) 42 | , m_size(size), m_flops(0) 43 | { 44 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(-1, reinterpret_cast(&m_idata), sizeof(size_t) * (size + 1), 0)); 45 | 46 | size_t isize = split[0]; 47 | size_t msize = 0, n = 100, nn = n * n; 48 | for (size_t i = 0; i < isize; ++i) { 49 | m_flops += nn * (2 * n + 1); 50 | m_idata[i] = msize; 51 | msize += nn; 52 | } 53 | isize += split[1]; 54 | n = 600, nn = n * n; 55 | for (size_t i = split[0]; i < isize; ++i) { 56 | m_flops += nn * (2 * n + 1); 57 | m_idata[i] = msize; 58 | msize += nn; 59 | } 60 | n = 1000, nn = n * n; 61 | for (size_t i = isize; i < size; ++i) { 62 | m_flops += nn * (2 * n + 1); 63 | m_idata[i] = msize; 64 | msize += nn; 65 | } 66 | m_idata[size] = msize; 67 | 68 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(-1, reinterpret_cast(&m_adata), sizeof(double) * msize, 0)); 69 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(-1, reinterpret_cast(&m_bdata), sizeof(double) * msize, 0)); 70 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(-1, reinterpret_cast(&m_cdata), sizeof(double) * msize, 0)); 71 | 72 | static const double scale = 1.0 / RAND_MAX; 73 | for (size_t i = 0; i < msize; ++i) { 74 | m_adata[i] = scale * (2 * std::rand() - RAND_MAX); 75 | m_bdata[i] = scale * (2 * std::rand() - RAND_MAX); 76 | m_cdata[i] = 0; 77 | } 78 | } 79 | 80 | 81 | multi_dgemm_type::host_data_type::~host_data_type() 82 | { 83 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1, m_adata)); 84 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1, m_bdata)); 85 | 
LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1, m_cdata)); 86 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1, m_idata)); 87 | } 88 | 89 | 90 | size_t multi_dgemm_type::host_data_type::max_matrix_size() const 91 | { 92 | LIBXSTREAM_ASSERT(0 == m_size || 0 == m_idata[0]); 93 | size_t result = 0, i0 = 0; 94 | for (size_t i = 0; i < m_size; ++i) { 95 | const size_t i1 = m_idata[i+1]; 96 | result = std::max(result, i1 - i0); 97 | i0 = i1; 98 | } 99 | return result; 100 | } 101 | 102 | 103 | size_t multi_dgemm_type::host_data_type::bytes() const 104 | { 105 | return sizeof(double) * m_idata[m_size] * 3 + sizeof(size_t) * m_size; 106 | } 107 | 108 | 109 | bool multi_dgemm_type::host_data_type::ready() const 110 | { 111 | LIBXSTREAM_ASSERT(0 == m_process || (m_adata && m_bdata && m_cdata && m_idata)); 112 | return 0 != m_process; 113 | } 114 | 115 | 116 | multi_dgemm_type::multi_dgemm_type() 117 | : m_host_data(0), m_stream(0), m_event(0) 118 | , m_adata(0), m_bdata(0), m_cdata(0) 119 | , m_idata(0), m_max_batch(0) 120 | {} 121 | 122 | 123 | multi_dgemm_type::~multi_dgemm_type() 124 | { 125 | LIBXSTREAM_CHECK_CALL_ASSERT(deinit()); 126 | } 127 | 128 | 129 | int multi_dgemm_type::deinit() 130 | { 131 | if (m_host_data) { 132 | int device = -1; 133 | LIBXSTREAM_CHECK_CALL(libxstream_stream_device(m_stream, &device)); 134 | LIBXSTREAM_CHECK_CALL(libxstream_stream_destroy(m_stream)); 135 | LIBXSTREAM_CHECK_CALL(libxstream_event_destroy(m_event)); 136 | LIBXSTREAM_CHECK_CALL(libxstream_mem_deallocate(device, m_adata)); 137 | LIBXSTREAM_CHECK_CALL(libxstream_mem_deallocate(device, m_bdata)); 138 | LIBXSTREAM_CHECK_CALL(libxstream_mem_deallocate(device, m_cdata)); 139 | LIBXSTREAM_CHECK_CALL(libxstream_mem_deallocate(device, m_idata)); 140 | m_host_data = 0; 141 | m_max_batch = 0; 142 | m_stream = 0; 143 | m_event = 0; 144 | m_adata = 0; 145 | m_bdata = 0; 146 | m_cdata = 0; 147 | m_idata = 0; 148 | } 149 | 150 | return LIBXSTREAM_ERROR_NONE; 151 | } 152 | 
153 | 154 | int multi_dgemm_type::init(const char* name, host_data_type& host_data, int device, size_t max_batch) 155 | { 156 | LIBXSTREAM_CHECK_CALL(deinit()); 157 | m_host_data = &host_data; 158 | m_max_batch = max_batch; 159 | 160 | const size_t max_msize = m_max_batch * m_host_data->max_matrix_size(); 161 | LIBXSTREAM_CHECK_CALL(libxstream_stream_create(&m_stream, device, 0, name)); 162 | LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast(&m_adata), sizeof(double) * max_msize, 0)); 163 | LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast(&m_bdata), sizeof(double) * max_msize, 0)); 164 | LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast(&m_cdata), sizeof(double) * max_msize, 0)); 165 | LIBXSTREAM_CHECK_CALL(libxstream_mem_allocate(device, reinterpret_cast(&m_idata), sizeof(size_t) * max_batch, 0)); 166 | 167 | return LIBXSTREAM_ERROR_NONE; 168 | } 169 | 170 | 171 | int multi_dgemm_type::operator()(size_t index, size_t size) 172 | { 173 | if (0 < size) { 174 | LIBXSTREAM_CHECK_CONDITION(ready() && (index + size) <= m_host_data->size()); 175 | 176 | const size_t i0 = m_host_data->idata()[index], i1 = m_host_data->idata()[index+size]; 177 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->adata() + i0, m_adata, sizeof(double) * (i1 - i0), m_stream)); 178 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->bdata() + i0, m_bdata, sizeof(double) * (i1 - i0), m_stream)); 179 | // transferring cdata is part of the benchmark; since it is all zeros we could do better with libxstream_memset_zero 180 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->cdata() + i0, m_cdata, sizeof(double) * (i1 - i0), m_stream)); 181 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(m_host_data->idata() + index, m_idata, sizeof(size_t) * size, m_stream)); 182 | 183 | libxstream_argument* signature = 0; 184 | const size_t max_msize = m_max_batch * m_host_data->max_matrix_size(), nn = i1 
- m_host_data->idata()[index+size-1]; 185 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_signature(&signature)); 186 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input (signature, 0, &size, libxstream_map_to_type(size), 0, 0)); 187 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input (signature, 1, &nn, libxstream_map_to_type(nn ), 0, 0)); 188 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input (signature, 2, m_idata, libxstream_map_to_type(m_idata), 1, &max_msize)); 189 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input (signature, 3, m_adata, libxstream_map_to_type(m_adata), 1, &max_msize)); 190 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input (signature, 4, m_bdata, libxstream_map_to_type(m_bdata), 1, &max_msize)); 191 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_output(signature, 5, m_cdata, libxstream_map_to_type(m_cdata), 1, &max_msize)); 192 | 193 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_call(m_host_data->process(), signature, m_stream, LIBXSTREAM_CALL_DEFAULT)); 194 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_d2h(m_cdata, m_host_data->cdata() + i0, sizeof(double) * (i1 - i0), m_stream)); 195 | } 196 | 197 | return LIBXSTREAM_ERROR_NONE; 198 | } 199 | 200 | 201 | libxstream_event* multi_dgemm_type::event() 202 | { 203 | if (0 == m_event) { 204 | libxstream_event_create(&m_event); 205 | LIBXSTREAM_ASSERT(0 != m_event); 206 | } 207 | return m_event; 208 | } 209 | 210 | 211 | bool multi_dgemm_type::ready() const 212 | { 213 | LIBXSTREAM_ASSERT(0 == m_host_data || (m_stream && m_adata && m_bdata && m_cdata && m_idata)); 214 | return 0 != m_host_data; 215 | } 216 | 217 | 218 | size_t multi_dgemm_type::bytes() const 219 | { 220 | LIBXSTREAM_ASSERT(ready()); 221 | return m_max_batch * m_host_data->max_matrix_size() * (3 * sizeof(double) + sizeof(size_t)); 222 | } 223 | -------------------------------------------------------------------------------- /samples/copy/copy.cpp: -------------------------------------------------------------------------------- 1 | 
/****************************************************************************** 2 | ** Copyright (c) 2014-2016, Intel Corporation ** 3 | ** All rights reserved. ** 4 | ** ** 5 | ** Redistribution and use in source and binary forms, with or without ** 6 | ** modification, are permitted provided that the following conditions ** 7 | ** are met: ** 8 | ** 1. Redistributions of source code must retain the above copyright ** 9 | ** notice, this list of conditions and the following disclaimer. ** 10 | ** 2. Redistributions in binary form must reproduce the above copyright ** 11 | ** notice, this list of conditions and the following disclaimer in the ** 12 | ** documentation and/or other materials provided with the distribution. ** 13 | ** 3. Neither the name of the copyright holder nor the names of its ** 14 | ** contributors may be used to endorse or promote products derived ** 15 | ** from this software without specific prior written permission. ** 16 | ** ** 17 | ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ** 18 | ** "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ** 19 | ** LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ** 20 | ** A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ** 21 | ** HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ** 22 | ** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED ** 23 | ** TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ** 24 | ** PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ** 25 | ** LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ** 26 | ** NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ** 27 | ** SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ** 28 | ******************************************************************************/ 29 | /* Hans Pabst (Intel Corp.) 
30 | ******************************************************************************/ 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #if defined(_OPENMP) 38 | # include 39 | #endif 40 | #include 41 | 42 | //#define COPY_NO_SYNC 43 | 44 | 45 | /** 46 | * This program is copying data to/from a copprocessor device (copy-in or copy-out). 47 | * A series of different sizes (up to a given / automatically selected amount) is 48 | * exercised in order to measure the bandwidth of the data transfers. The program 49 | * is multi-threaded (using only one thread by default) mainly to exercise the 50 | * thread-safety of LIBXSTREAM. There is no performance gain expected from using 51 | * multiple threads, remember that the number of streams determines the available 52 | * parallelism. The program is intentionally spartan (command line interface, etc.) 53 | * in order to keep it within bounds for an introductory code sample. The mechanism 54 | * selecting the stream to enqueue into as well as selecting the stream to be 55 | * synchronized shows the essence of the stream programing model. 56 | */ 57 | int main(int argc, char* argv[]) 58 | { 59 | try { 60 | size_t ndevices = 0; 61 | if (LIBXSTREAM_ERROR_NONE != libxstream_get_ndevices(&ndevices) || 0 == ndevices) { 62 | LIBXSTREAM_PRINT0(2, "No device found or device not ready!"); 63 | } 64 | const int device = static_cast(ndevices) - 1; 65 | const size_t reserved = 512; 66 | size_t allocatable = 0; 67 | libxstream_get_meminfo(device, &allocatable, 0); 68 | allocatable >>= 20; // MB 69 | allocatable = std::max(allocatable, reserved) - reserved; 70 | 71 | const bool copyin = 1 < argc ? ('o' != *argv[1]) : true; 72 | const size_t nstreams = std::min(std::max(2 < argc ? std::atoi(argv[2]) : 2, 1), LIBXSTREAM_MAX_NSTREAMS); 73 | #if defined(_OPENMP) 74 | const int nthreads = std::min(std::max(3 < argc ? 
std::atoi(argv[3]) : 1, 1), omp_get_max_threads()); 75 | #else 76 | LIBXSTREAM_PRINT0(1, "OpenMP support needed for performance results!"); 77 | #endif 78 | const size_t minsize = 8, maxsize = static_cast(std::max(4 < argc ? std::atoi(argv[4]) : std::min(static_cast(allocatable / nstreams), 2048), 1)) * (1 << 20); 79 | const int minrepeat = std::min(std::max(5 < argc ? std::atoi(argv[5]) : 8, 4), 2048); 80 | const int maxrepeat = std::min(std::max(6 < argc ? std::atoi(argv[6]) : 4096, minrepeat), 32768); 81 | 82 | int steps_repeat = 0, steps_size = 0; 83 | for (int nrepeat = minrepeat; nrepeat <= maxrepeat; nrepeat <<= 1) ++steps_repeat; 84 | for (size_t size = minsize; size <= maxsize; size <<= 1) ++steps_size; 85 | const int stride = 0 < steps_repeat ? (steps_size / steps_repeat + 1) : 1; 86 | 87 | struct { 88 | libxstream_stream* stream; 89 | void *mem_hst, *mem_dev; 90 | } copy[LIBXSTREAM_MAX_NSTREAMS]; 91 | memset(copy, 0, sizeof(copy)); // some initialization (avoid false positives with tools) 92 | 93 | for (size_t i = 0; i < nstreams; ++i) { 94 | char name[128]; 95 | LIBXSTREAM_SNPRINTF(name, sizeof(name), "Stream %i", static_cast(i + 1)); 96 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_stream_create(©[i].stream, device, 0, name)); 97 | } 98 | if (copyin) { 99 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(-1/*host*/, ©[0].mem_hst, maxsize, 0)); 100 | memset(copy[0].mem_hst, -1, maxsize); // some initialization (avoid false positives with tools) 101 | for (size_t i = 1; i < nstreams; ++i) { 102 | copy[i].mem_hst = copy[0].mem_hst; 103 | } 104 | for (size_t i = 0; i < nstreams; ++i) { 105 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(device, ©[i].mem_dev, maxsize, 0)); 106 | } 107 | } 108 | else { // copy-out 109 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(device, ©[0].mem_dev, maxsize, 0)); 110 | libxstream_memset_zero(copy[0].mem_dev, maxsize, copy[0].stream); 111 | for (size_t i = 1; i < nstreams; ++i) { 112 | copy[i].mem_dev = 
copy[0].mem_dev; 113 | } 114 | for (size_t i = 0; i < nstreams; ++i) { 115 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_allocate(-1/*host*/, ©[i].mem_hst, maxsize, 0)); 116 | } 117 | } 118 | // start benchmark with no pending work 119 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_stream_wait(0)); 120 | 121 | int n = 0, nrepeat = maxrepeat; 122 | double totalsize = 0, maxval = 0, runlns = 0; 123 | #if defined(_OPENMP) 124 | const double mega = 1.0 / (1ul << 20); 125 | double duration = -omp_get_wtime(); 126 | #else 127 | const double duration = 0; 128 | #endif 129 | for (size_t size = minsize; size <= maxsize; size <<= 1, ++n) { 130 | if (0 < n && 0 == (n % stride)) { 131 | nrepeat >>= 1; 132 | } 133 | 134 | #if defined(_OPENMP) 135 | fprintf(stdout, "%lu Byte x %i: ", static_cast(size), nrepeat); 136 | fflush(stdout); // make sure to show progress 137 | double iduration = -omp_get_wtime(); 138 | # pragma omp parallel for num_threads(nthreads) schedule(dynamic) 139 | #endif 140 | for (int j = 0; j < nrepeat; ++j) { // enqueue work into streams 141 | const size_t k = j % nstreams; 142 | 143 | if (copyin) { 144 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(copy[k].mem_hst, copy[k].mem_dev, size, copy[k].stream)); 145 | } 146 | else { // copy-out 147 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_d2h(copy[k].mem_dev, copy[k].mem_hst, size, copy[k].stream)); 148 | } 149 | } 150 | 151 | #if !defined(COPY_NO_SYNC) 152 | for (int j = 0; j < nrepeat; ++j) { // synchronize streams 153 | const size_t k = j % nstreams; 154 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(copy[k].stream)); 155 | } 156 | #endif 157 | 158 | #if defined(_OPENMP) 159 | iduration += omp_get_wtime(); 160 | if (0 < iduration) { 161 | const double isize = mega * size * nrepeat, bandwidth = isize / iduration; 162 | fprintf(stdout, "%.1f MB/s\n", bandwidth); 163 | maxval = std::max(maxval, bandwidth); 164 | runlns = (runlns + std::log(bandwidth)) * (0 < n ? 
0.5 : 1.0); 165 | totalsize += isize; 166 | } 167 | #endif 168 | } 169 | 170 | #if defined(COPY_NO_SYNC) 171 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_stream_wait(0)); // wait for pending work 172 | #endif 173 | #if defined(_OPENMP) 174 | duration += omp_get_wtime(); 175 | #endif 176 | 177 | if (copyin) { 178 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_deallocate(-1/*host*/, copy[0].mem_hst)); 179 | for (size_t i = 0; i < nstreams; ++i) { 180 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_deallocate(device, copy[i].mem_dev)); 181 | } 182 | } 183 | else { // copy-out 184 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_deallocate(device, copy[0].mem_dev)); 185 | for (size_t i = 0; i < nstreams; ++i) { 186 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_mem_deallocate(-1/*host*/, copy[i].mem_hst)); 187 | } 188 | } 189 | for (size_t i = 0; i < nstreams; ++i) { 190 | LIBXSTREAM_CHECK_CALL_THROW(libxstream_stream_destroy(copy[i].stream)); 191 | } 192 | 193 | fprintf(stdout, "\n"); 194 | if (0 < duration) { 195 | fprintf(stdout, "Finished after %.0f s\n", duration); 196 | fprintf(stdout, "max: %.0f MB/s\n", maxval); 197 | fprintf(stdout, "rgm: %.0f MB/s\n", std::exp(runlns) - 1.0); 198 | fprintf(stdout, "avg: %.0f MB/s\n", totalsize / duration); 199 | } 200 | else { 201 | fprintf(stdout, "Finished\n"); 202 | } 203 | } 204 | catch(const std::exception& e) { 205 | fprintf(stderr, "Error: %s\n", e.what()); 206 | return EXIT_FAILURE; 207 | } 208 | catch(...) 
{ 209 | fprintf(stderr, "Error: unknown exception caught!\n"); 210 | return EXIT_FAILURE; 211 | } 212 | 213 | return EXIT_SUCCESS; 214 | } 215 | -------------------------------------------------------------------------------- /samples/entropy/entropy.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #if defined(_OPENMP) 7 | # include 8 | #endif 9 | #include 10 | 11 | #define SYNCMETHOD 1 12 | /*implementation variant*/ 13 | #define HISTOGRAM 2 14 | 15 | 16 | LIBXSTREAM_RETARGETABLE void histogram1(const char* data, size_t size, size_t* histogram) 17 | { 18 | static const size_t maxint = (size_t)(((unsigned int)-1) >> 1/*sign bit*/); 19 | int i, j, m = (int)((size + maxint - 1) / maxint); 20 | 21 | for (i = 0; i < m; ++i) { /*OpenMP 2: size is broken down to integer space*/ 22 | const size_t base = i * maxint; 23 | const int n = (int)LIBXSTREAM_MIN(size - base, maxint); 24 | #if defined(_OPENMP) 25 | # pragma omp parallel for /*default(none)*/ private(j) shared(data,histogram) 26 | #endif 27 | for (j = 0; j < n; ++j) { 28 | const int k = (unsigned char)data[base+j]; 29 | #if defined(_OPENMP) 30 | # pragma omp atomic 31 | #endif 32 | ++histogram[k]; 33 | } 34 | } 35 | } 36 | 37 | 38 | LIBXSTREAM_RETARGETABLE void histogram2(const char* data, size_t size, size_t* histogram) 39 | { 40 | static const size_t maxint = (size_t)(((unsigned int)-1) >> 1/*sign bit*/); 41 | int i, j, m = (int)((size + maxint - 1) / maxint); 42 | 43 | #if defined(_OPENMP) 44 | # pragma omp parallel /*default(none)*/ private(i,j) shared(data,histogram) 45 | #endif 46 | for (i = 0; i < m; ++i) { /*OpenMP 2: size is broken down to integer space*/ 47 | LIBXSTREAM_ALIGNED(size_t local[256], LIBXSTREAM_MAX_SIMD); 48 | const size_t base = i * maxint; 49 | const int n = (int)LIBXSTREAM_MIN(size - base, maxint); 50 | for (j = 0; j < 256; ++j) local[j] = 0; 51 | #if defined(_OPENMP) 52 | # pragma omp 
for nowait 53 | #endif 54 | for (j = 0; j < n; ++j) { 55 | const int k = (unsigned char)data[base+j]; 56 | ++local[k]; 57 | } 58 | #if defined(_OPENMP) 59 | # pragma omp critical(histogram2) 60 | #endif 61 | for (j = 0; j < 256; ++j) histogram[j] += local[j]; 62 | } 63 | } 64 | 65 | 66 | LIBXSTREAM_RETARGETABLE void makehist(const char* data, size_t* histogram) 67 | { 68 | size_t size; 69 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_get_shape(0/*current context*/, 0/*data*/, &size)); 70 | LIBXSTREAM_CONCATENATE(histogram,HISTOGRAM)(data, size, histogram); 71 | } 72 | 73 | 74 | FILE* fileopen(const char* name, const char* mode, size_t* size) 75 | { 76 | FILE *const file = (name && *name) ? fopen(name, mode) : 0; 77 | long lsize = -1; 78 | 79 | if (0 != file) { 80 | if (0 == fseek(file, 0L, SEEK_END)) { 81 | lsize = ftell(file); 82 | rewind(file); 83 | } 84 | } 85 | 86 | if (0 != size && 0 <= lsize) { 87 | *size = lsize; 88 | } 89 | 90 | return 0 <= lsize ? file : 0; 91 | } 92 | 93 | 94 | int main(int argc, char* argv[]) 95 | { 96 | size_t ndevices = 0; 97 | if (LIBXSTREAM_ERROR_NONE != libxstream_get_ndevices(&ndevices) || 0 == ndevices) { 98 | LIBXSTREAM_PRINT0(2, "No device found or device not ready!"); 99 | } 100 | 101 | size_t filesize = 0; 102 | FILE *const file = 1 < argc ? fileopen(argv[1], "rb", &filesize) : 0; 103 | const size_t nitems = (1 < argc && 0 == filesize && 0 < atoi(argv[1])) ? (atoi(argv[1]) * (1ULL << 20)/*MB*/) : (0 < filesize ? filesize : (512 << 20)); 104 | const size_t mbatch = LIBXSTREAM_MIN(2 < argc ? strtoul(argv[2], 0, 10) : 0/*auto*/, nitems >> 20) << 20; 105 | const size_t mstreams = LIBXSTREAM_MIN(LIBXSTREAM_MAX(3 < argc ? atoi(argv[3]) : 2, 0), LIBXSTREAM_MAX_NSTREAMS); 106 | #if !defined(_OPENMP) 107 | LIBXSTREAM_PRINT0(1, "OpenMP support needed for performance results!"); 108 | #endif 109 | const size_t nstreams = LIBXSTREAM_MAX(mstreams, 1) * LIBXSTREAM_MAX(ndevices, 1), nbatch = (0 == mbatch) ? 
LIBXSTREAM_MIN(nitems / nstreams, 128 << 20) : mbatch, hsize = 256; 110 | size_t histogram[256/*hsize*/]; 111 | memset(histogram, 0, sizeof(histogram)); 112 | 113 | char* data; 114 | { /*allocate and initialize host memory*/ 115 | size_t i; 116 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(-1/*host*/, (void**)&data, nitems, 0)); 117 | if (0 == filesize || nitems > fread(data, 1, filesize, file)) { 118 | for (i = 0; i < nitems; ++i) data[i] = (char)LIBXSTREAM_MOD2(rand(), hsize/*POT*/); 119 | } 120 | } 121 | 122 | struct { 123 | libxstream_stream* handle; 124 | #if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD) 125 | libxstream_event* event; 126 | #endif 127 | size_t* histogram; 128 | char* data; 129 | } stream[(LIBXSTREAM_MAX_NDEVICES)*(LIBXSTREAM_MAX_NSTREAMS)]; 130 | 131 | { /*allocate and initialize streams and device memory*/ 132 | size_t i; 133 | for (i = 0; i < nstreams; ++i) { 134 | #if defined(NDEBUG) /*no name*/ 135 | const char *const name = 0; 136 | #else 137 | char name[128]; 138 | LIBXSTREAM_SNPRINTF(name, sizeof(name), "stream %i", (int)(i + 1)); 139 | #endif 140 | const int device = (0 < ndevices) ? ((int)(i % ndevices)) : -1; 141 | stream[i].handle = 0; 142 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_create(0 < mstreams ? 
&stream[i].handle : 0, device, 0, name)); 143 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(device, (void**)&stream[i].data, nbatch, 0)); 144 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(device, (void**)&stream[i].histogram, hsize * sizeof(size_t), 0)); 145 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memset_zero(stream[i].histogram, hsize * sizeof(size_t), stream[i].handle)); 146 | #if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD) 147 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_create(&stream[i].event)); 148 | #endif 149 | } 150 | 151 | /*start benchmark with no pending work*/ 152 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(0)); 153 | } 154 | 155 | #if defined(_OPENMP) 156 | /*if (0 == ndevices) omp_set_nested(1);*/ 157 | double duration = -omp_get_wtime(); 158 | #endif 159 | { /*process data in chunks of size nbatch*/ 160 | const size_t nstep = nbatch * nstreams; 161 | const int end = (int)((nitems + nstep - 1) / nstep); 162 | int i; 163 | libxstream_type sizetype = LIBXSTREAM_TYPE_U32; 164 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_get_autotype(sizeof(size_t), sizetype, &sizetype)); 165 | 166 | for (i = 0; i < end; ++i) { 167 | const size_t ibase = i * nstep, n = LIBXSTREAM_MIN(nstreams, nitems - ibase); 168 | libxstream_argument* signature; 169 | size_t j; 170 | 171 | for (j = 0; j < n; ++j) { /*enqueue work into streams*/ 172 | const size_t base = ibase + j * nbatch, size = base < nitems ? 
LIBXSTREAM_MIN(nbatch, nitems - base) : 0; 173 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(data + base, stream[j].data, size, stream[j].handle)); 174 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_signature(&signature)); 175 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(signature, 0, stream[j].data, LIBXSTREAM_TYPE_CHAR, 1, &size)); 176 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_output(signature, 1, stream[j].histogram, sizetype, 1, &hsize)); 177 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_call((libxstream_function)makehist, signature, stream[j].handle, LIBXSTREAM_CALL_DEFAULT)); 178 | #if defined(SYNCMETHOD) && (2 <= SYNCMETHOD) /*record event*/ 179 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_record(stream[j].event, stream[j].handle)); 180 | #endif 181 | } 182 | 183 | #if defined(SYNCMETHOD) 184 | for (j = 0; j < n; ++j) { /*synchronize streams*/ 185 | # if (3 <= (SYNCMETHOD)) 186 | /*wait for an event within a stream*/ 187 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait_event(stream[j].handle, stream[(j+nstreams-1)%n].event)); 188 | # elif (2 <= (SYNCMETHOD)) 189 | /*wait for an event on the host*/ 190 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_wait(stream[j].event)); 191 | # else 192 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(stream[j].handle)); 193 | # endif 194 | } 195 | #endif 196 | } 197 | } 198 | 199 | { /*reduce stream-local histograms*/ 200 | LIBXSTREAM_ALIGNED(size_t local[256/*hsize*/], LIBXSTREAM_MAX_SIMD); 201 | size_t i, j; 202 | for (j = 0; j < nstreams; ++j) { 203 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_d2h(stream[j].histogram, local, sizeof(local), stream[j].handle)); 204 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(stream[j].handle)); /*wait for pending work*/ 205 | for (i = 0; i < hsize; ++i) histogram[i] += local[i]; 206 | } 207 | } 208 | 209 | #if defined(_OPENMP) 210 | duration += omp_get_wtime(); 211 | #endif 212 | 213 | const double kilo = 1.0 / (1 << 10), mega = 1.0 / (1 << 20); 
214 | double entropy = 0; 215 | { /*calculate entropy*/ 216 | const double log2_nitems = log2((double)nitems); 217 | size_t i; 218 | for (i = 0; i < hsize; ++i) { 219 | const double h = (double)histogram[i], log2h = 0 < h ? log2(h) : log2_nitems; 220 | entropy -= h * LIBXSTREAM_MIN(log2h - log2_nitems, 0); 221 | } 222 | entropy /= nitems; 223 | } 224 | 225 | if (0 <= entropy && 8 >= entropy) { 226 | if ((1 << 20) <= nitems) { /*mega*/ 227 | fprintf(stdout, "Compression %gx: %.1f -> %.1f MB", 0 != entropy ? (8.0 / entropy) : 1.0, mega * nitems, mega * entropy * nitems / 8.0); 228 | } 229 | else if ((1 << 10) <= nitems) { /*kilo*/ 230 | fprintf(stdout, "Compression %gx: %.1f -> %.1f KB", 0 != entropy ? (8.0 / entropy) : 1.0, kilo * nitems, kilo * entropy * nitems / 8.0); 231 | } 232 | else { 233 | fprintf(stdout, "Compression %gx: %.0f -> %0.f B", 0 != entropy ? (8.0 / entropy) : 1.0, 1.0 * nitems, entropy * nitems / 8.0); 234 | } 235 | fprintf(stdout, " (redundancy %0.f%%, entropy %.0f bit)\n", 100.0 - 12.5 * entropy, entropy); 236 | } 237 | 238 | #if defined(_OPENMP) 239 | if (0 < duration) { 240 | fprintf(stdout, "Finished after %.1f s", duration); 241 | } 242 | else { 243 | fprintf(stdout, "Finished"); 244 | } 245 | #endif 246 | 247 | { /*validate result*/ 248 | size_t check = 0, i; 249 | for (i = 0; i < hsize; ++i) check += histogram[i]; 250 | if (nitems != check) { 251 | size_t expected[256/*hsize*/]; 252 | memset(expected, 0, sizeof(expected)); 253 | LIBXSTREAM_CONCATENATE(histogram,HISTOGRAM)(data, nitems, expected); check = 0; 254 | for (i = 0; i < hsize; ++i) check += expected[i] == histogram[i] ? 0 : 1; 255 | fprintf(stdout, " with %llu error%s\n", (unsigned long long)check, 1 != check ? 
"s" : ""); 256 | } 257 | else { 258 | fprintf(stdout, "\n"); 259 | } 260 | } 261 | 262 | { /*release resources*/ 263 | size_t i; 264 | for (i = 0; i < nstreams; ++i) { 265 | int device = -1; 266 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_device(stream[i].handle, &device)); 267 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(device, stream[i].histogram)); 268 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(device, stream[i].data)); 269 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_destroy(stream[i].handle)); 270 | #if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD) 271 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_destroy(stream[i].event)); 272 | #endif 273 | } 274 | LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1/*host*/, data)); 275 | } 276 | 277 | return EXIT_SUCCESS; 278 | } 279 | -------------------------------------------------------------------------------- /samples/smm/tune_multiply.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #################################################################################################### 3 | # Copyright (C) by the DBCSR developers group - All rights reserved # 4 | # This file is part of the DBCSR library. # 5 | # # 6 | # For information on the license, see the LICENSE file. # 7 | # For further information please visit https://dbcsr.cp2k.org # 8 | # SPDX-License-Identifier: BSD-3-Clause # 9 | #################################################################################################### 10 | 11 | XARGS=$(command -v xargs) 12 | SORT=$(command -v sort) 13 | HEAD=$(command -v head) 14 | SED=$(command -v gsed) 15 | CUT=$(command -v cut) 16 | LS=$(command -v ls) 17 | RM=$(command -v rm) 18 | WC=$(command -v wc) 19 | 20 | # initial delay before auto-tuning (interactive) 21 | WAIT_DEFAULT=12 22 | 23 | # GNU sed is desired (macOS) 24 | if [ ! 
"${SED}" ]; then 25 | SED=$(command -v sed) 26 | fi 27 | 28 | if [ "${XARGS}" ] && [ "${SORT}" ] && [ "${HEAD}" ] && [ "${SED}" ] && \ 29 | [ "${LS}" ] && [ "${RM}" ] && [ "${WC}" ]; 30 | then 31 | EXTRA="" 32 | while test $# -gt 0; do 33 | case "$1" in 34 | -h|--help) 35 | HELP=1 36 | shift $#;; 37 | -c|--continue) 38 | CONTINUE=1 39 | shift 1;; 40 | -w|--wait) 41 | WAIT=$2 42 | shift 2;; 43 | -u|--update) 44 | UPDATE=1 45 | shift 1;; 46 | -d|--delete) 47 | DELETE=1 48 | shift 1;; 49 | -a|--tuning-level) 50 | TLEVEL=$2 51 | shift 2;; 52 | -b|--backwards) 53 | REVERSE=1 54 | shift 1;; 55 | -t|--maxtime) 56 | MAXTIME=$2 57 | shift 2;; 58 | -p|--jsondir) 59 | JSONDIR=$2 60 | shift 2;; 61 | -k|--specid) 62 | SPECID=$2 63 | shift 2;; 64 | -m|--limit) 65 | MAXEXT=$2 66 | shift 2;; 67 | -n|--triplets) 68 | MAXNUM=$2 69 | shift 2;; 70 | -r|--bound) 71 | BOUNDL=$2 72 | BOUNDU=$3 73 | shift 3;; 74 | -i|--part) 75 | PART=$2 76 | shift 2;; 77 | -j|--nparts) 78 | NPARTS=$2 79 | shift 2;; 80 | -s|--batchsize) 81 | BATCHSIZE=$2 82 | shift 2;; 83 | *) 84 | if [ "-" != "${1:0:1}" ]; then 85 | break 86 | else 87 | EXTRA+=" $1" 88 | shift 89 | fi;; 90 | esac 91 | done 92 | # default/basic settings 93 | if [ ! "${BATCHSIZE}" ]; then BATCHSIZE=0; fi 94 | if [ ! "${JSONDIR}" ]; then JSONDIR=.; fi 95 | if [ ! "${TLEVEL}" ]; then TLEVEL=-1; fi 96 | if [ ! "${NPARTS}" ]; then NPARTS=${PMI_SIZE:-${OMPI_COMM_WORLD_SIZE:-1}}; fi 97 | if [ ! "${PART}" ]; then 98 | PART0=${PMI_RANK:-${OMPI_COMM_WORLD_RANK:-0}} 99 | PART=$(((PART0+1)%NPARTS+1)) 100 | fi 101 | if [ ! "${WAIT}" ] && [ "1" != "${NPARTS}" ]; then WAIT=0; fi 102 | # sanity checks 103 | if [ "0" != "$((NPARTS&2 echo "ERROR: part-number ${PART} is larger than the requested ${NPARTS} parts!" 105 | exit 1 106 | elif [ "0" != "$((1>PART))" ]; then 107 | >&2 echo "ERROR: part-number must be 1-based!" 108 | exit 1 109 | fi 110 | if [ "${SPECID}" ] && [ "$1" ]; then 111 | >&2 echo "ERROR: --specid and are mutual exclusive!" 
112 | exit 1 113 | fi 114 | # how to print standard vs error messages 115 | if [ ! "${HELP}" ] || [ "0" = "${HELP}" ]; then 116 | JSONS=$(${LS} -1 ${JSONDIR}/tune_multiply-*-*x*x*-*gflops.json 2>/dev/null) 117 | HERE=$(cd "$(dirname "$0")" && pwd -P) 118 | ECHO=">&2 echo" 119 | if [ "${UPDATE}" ] && [ "0" != "${UPDATE}" ]; then 120 | MNKS=$(${SED} -n "s/.*tune_multiply-..*-\(..*x..*x.[^-]*\)-..*gflops\.json/\1/p" <<<"${JSONS}" \ 121 | | ${SORT} -u -n -tx -k1,1 -k2,2 -k3,3) 122 | elif [ "${SPECID}" ]; then 123 | MNKS=$(eval "${HERE}/../../acc_triplets.sh -k ${SPECID} 2>/dev/null") 124 | else 125 | if [[ "$*" != *"x"* ]]; then 126 | MNKS=$(eval "${HERE}/../../acc_triplets.sh $* 2>/dev/null") 127 | else 128 | MNKS="$*" 129 | fi 130 | fi 131 | else 132 | ECHO="echo" 133 | fi 134 | if [ ! "${WAIT}" ] || [[ ("${HELP}" && "0" != "${HELP}") ]]; then 135 | eval "${ECHO} \"Usage: $0 [options] []\"" 136 | eval "${ECHO} \" Options must precede triplet specification\"" 137 | eval "${ECHO} \" -w|--wait N: initial delay before auto-tuning (default: ${WAIT_DEFAULT} s)\"" 138 | eval "${ECHO} \" -c|--continue: proceed with plan if tuning is interrupted\"" 139 | eval "${ECHO} \" -u|--update: retune all JSONs found in directory (see -p)\"" 140 | eval "${ECHO} \" -s|--batchsize N: Number of batched SMMs (a.k.a. 
stacksize)\"" 141 | eval "${ECHO} \" -a|--tuning-level N=0..3: all, most, some, least tunables\"" 142 | eval "${ECHO} \" -b|--backwards: tune in descending order of triplets\"" 143 | eval "${ECHO} \" -t|--maxtime N: number of seconds spent per kernel\"" 144 | eval "${ECHO} \" -p|--jsondir P: path to JSON-files (tuned params)\"" 145 | eval "${ECHO} \" -i|--part N (1-based): Nth session out of nparts\"" 146 | eval "${ECHO} \" -j|--nparts N: number of total sessions (see -i)\"" 147 | eval "${ECHO} \" -r|--bound L U: limit L**3 < MNK <= U**3\"" 148 | eval "${ECHO} \" -m|--limit N: limit any shape extent to N\"" 149 | eval "${ECHO} \" -n|--triplets N: limit number of triplet\"" 150 | eval "${ECHO} \" -k|--specid N: predefined triplets\"" 151 | eval "${ECHO} \" 0-10: older to newer (larger), e.g.,\"" 152 | eval "${ECHO} \" 0: 201 kernels\"" 153 | eval "${ECHO} \" 10: 1266 kernels\"" 154 | eval "${ECHO} \" , e.g., 134 kernels \\\"23, 5 32 13 24 26, 4 9\\\"\"" 155 | eval "${ECHO} \" MxNxK's can be also given directly, e.g.,\"" 156 | eval "${ECHO} \" 1x1x1 2x2x2 2x2x3 2x3x2 2x3x3 3x2x2 3x2x3 3x3x2 3x3x3\"" 157 | eval "${ECHO} \" (which is equivalent to \\\"1, 2 3\\\")\"" 158 | eval "${ECHO}" 159 | if [ "${HELP}" ] && [ "0" != "${HELP}" ]; then exit 0; fi 160 | fi 161 | if [ "${MNKS}" ]; then 162 | if [ "${BOUNDL}" ] || [ "${BOUNDU}" ]; then 163 | if [ ! "${BOUNDL}" ]; then BOUNDL=0; elif [ ! "${BOUNDU}" ]; then BOUNDU=0; fi 164 | if [ "0" != "$((0<=BOUNDL))" ]; then 165 | for MNK in $(${SED} "s/x/*/g" <<<"${MNKS}"); do 166 | S=$((MNK)) 167 | if [ "0" != "$((BOUNDL&2 echo "ERROR: invalid or no given!" 201 | exit 1 202 | fi 203 | if [ ! "${WAIT}" ] || [ "0" != "${WAIT}" ]; then 204 | if [ "0" = "$((NPARTS<=NTRIPLETS))" ]; then 205 | >&2 echo "WARNING: problem is over-decomposed!" 206 | fi 207 | echo "Session ${PART} of ${NPARTS} part(s)." 208 | fi 209 | if [ ! "${MAXTIME}" ] && [[ (! 
"${CONTINUE}" || \ 210 | "${CONTINUE}" = "false" || \ 211 | "${CONTINUE}" = "no" || \ 212 | "${CONTINUE}" = "0") ]]; 213 | then 214 | MAXTIME=160 215 | fi 216 | PARTLOSZ=$((NPARTS&2 echo "Already found ${NJSONS} (unrelated?) JSON-files." 237 | fi 238 | elif [ -e tune_multiply.csv ]; then 239 | >&2 echo "No JSON file found but (unrelated?) tune_multiply.csv exists." 240 | fi 241 | if [ ! "${WAIT}" ]; then WAIT=${WAIT_DEFAULT}; fi 242 | if [ "0" != "$((0&2 echo "ERROR: missing prerequisites!" 278 | exit 1 279 | fi 280 | -------------------------------------------------------------------------------- /samples/smm/Makefile: -------------------------------------------------------------------------------- 1 | MAKDIR := $(subst //,,$(dir $(firstword $(MAKEFILE_LIST)))/) 2 | ACCDIR := $(MAKDIR)/.. 3 | INCACC := $(wildcard $(MAKDIR)/*.h*) $(ACCDIR)/acc.h 4 | SRCACC := $(wildcard $(MAKDIR)/*.c) 5 | OBJACC := $(SRCACC:.c=.o) 6 | 7 | INCSMM := $(wildcard $(MAKDIR)/smm/*.h*) \ 8 | $(MAKDIR)/smm/opencl_kernels.h \ 9 | $(ACCDIR)/acc_libsmm.h \ 10 | $(ACCDIR)/acc_bench.h \ 11 | $(NULL) 12 | SRCSMM := $(wildcard $(MAKDIR)/smm/*.c) 13 | OBJSMM := $(SRCSMM:.c=.o) 14 | KERNEL := $(wildcard $(MAKDIR)/smm/kernels/*.cl) 15 | 16 | INCALL := $(INCACC) $(INCSMM) 17 | 18 | LIBXSMMROOT := $(wildcard $(ACCDIR)/../../../../../libxsmm) 19 | ifeq (,$(LIBXSMMROOT)) 20 | LIBXSMMROOT := $(wildcard $(HOME)/libxsmm) 21 | endif 22 | UNAME := $(shell uname) 23 | HEADERONLY ?= 0 24 | STATIC ?= 1 25 | DEV ?= 0 26 | 27 | # Kind of Clang/GCC based analysis: 28 | # leak, address, undefined, thread 29 | SANITIZE ?= $(NULL) 30 | 31 | # Intel Compiler 32 | ICX := $(shell which icx 2>/dev/null) 33 | INTEL ?= $(if $(ICX),$(if $(filter-out 0,$(GNU)),0,2),0) 34 | 35 | # select from set of predefined triplet specifications 36 | SPECID ?= 0 37 | # limit shape in tests (zero or negative for unlimited) 38 | MAXEXT ?= 48 39 | # number of tests (zero or negative for unlimited) 40 | NSMMS ?= 10 41 | 42 | COMMAND := 
$(shell which command 2>/dev/null) 43 | ifneq (,$(COMMAND)) 44 | which = $(shell $(COMMAND) -v $1) 45 | else 46 | which = $(shell which $(firstword $1) 2>/dev/null) 47 | endif 48 | 49 | WITH_GPU := $(if $(WITH_GPU),$(WITH_GPU),$(GPUVER)) 50 | PARAMS_WITHGPU := $(MAKDIR)/smm/params/tune_multiply_$(WITH_GPU).csv 51 | PARAMS_DEFAULT := $(MAKDIR)/smm/tune_multiply.csv 52 | PARAMS := $(if $(wildcard $(PARAMS_WITHGPU)),$(PARAMS_WITHGPU),$(wildcard $(PARAMS_DEFAULT))) 53 | 54 | #PARAMDIR ?= $(MAKDIR)/smm/params 55 | ifeq (,$(PARAMS)) 56 | ifneq (,$(wildcard $(PARAMDIR))) 57 | WITH_GPUS := $(shell ls -1 $(PARAMDIR)/*.csv | cut -d. -f1 | rev | cut -d_ -f1 | rev) 58 | endif 59 | endif 60 | 61 | CFLAGS := -fPIC \ 62 | -Wall -Wextra -Wcast-qual \ 63 | -Wno-overlength-strings \ 64 | -Wno-variadic-macros \ 65 | -Wno-unused-function \ 66 | -Wno-long-long \ 67 | -D__OPENCL \ 68 | $(NULL) 69 | 70 | ifneq (,$(ELEM_TYPE)) 71 | CFLAGS += -DELEM_TYPE=$(ELEM_TYPE) 72 | endif 73 | 74 | ifneq (0,$(INTEL)) 75 | ifneq (1,$(INTEL)) 76 | CXX := icpx 77 | CC := icx 78 | else 79 | CXX := icpc 80 | CC := icc 81 | endif 82 | AR := $(if $(call which,xiar),xiar,ar) 83 | else 84 | CXX := g++ 85 | CC := gcc 86 | ifneq (Darwin,$(UNAME)) 87 | AR := gcc-ar 88 | else 89 | AR := ar 90 | endif 91 | endif 92 | 93 | ifneq (0,$(DEV)) 94 | ifeq (1,$(DEV)) 95 | CFLAGS += -std=c89 96 | CFLAGS += -Wno-unused-parameter 97 | else 98 | # DEV=2 (and higher): linking is not intended 99 | CFLAGS += -D__DBCSR_ACC 100 | CFLAGS += -Wno-gnu-zero-variadic-macro-arguments 101 | CFLAGS += -Wno-deprecated 102 | CFLAGS += -Werror 103 | ifneq (2,$(DEV)) 104 | ifneq (,$(findstring clang,$(CC) $(CXX))) 105 | override CC := clang++ --analyze 106 | else 107 | override CC := $(CXX) -xc++ 108 | endif 109 | else 110 | override CC := $(CXX) -xc++ 111 | endif 112 | $(info CC: $(shell $(CC) --version | head -n1)) 113 | OMP := 0 114 | endif 115 | CFLAGS += -pedantic 116 | #else 117 | #CFLAGS += -std=c99 118 | endif 119 | 120 | ifneq 
(,$(SANITIZE)) 121 | CXXFLAGS += -fsanitize=$(SANITIZE) 122 | CFLAGS += -fsanitize=$(SANITIZE) 123 | FCFLAGS += -fsanitize=$(SANITIZE) 124 | LDFLAGS += -fsanitize=$(SANITIZE) 125 | SYM = 1 126 | endif 127 | 128 | ifneq (0,$(DBG)) 129 | CPP_OPENCL_FLAGS += -C 130 | ifeq (,$(DBG)) 131 | CFLAGS += -O2 -DNDEBUG 132 | else 133 | ifneq (1,$(DBG)) 134 | CFLAGS += -D_DEBUG 135 | endif 136 | CFLAGS += -O0 137 | endif 138 | else 139 | CFLAGS += -O2 -DNDEBUG 140 | SYM ?= 0 141 | endif 142 | ifneq (0,$(SYM)) 143 | CFLAGS += -g 144 | endif 145 | 146 | ifneq (0,$(OMP)) 147 | ifneq (0,$(INTEL)) 148 | CFLAGS += -qopenmp 149 | LDFLAGS += -qopenmp 150 | else ifneq (Darwin,$(UNAME)) 151 | CFLAGS += -fopenmp 152 | LDFLAGS += -fopenmp 153 | else # macOS 154 | CFLAGS += -Xpreprocessor -fopenmp 155 | LDFLAGS += -lomp 156 | endif 157 | endif 158 | 159 | ifneq (,$(LIBXSMMROOT)) 160 | ifneq (0,$(STATIC)) 161 | ifeq (0,$(HEADERONLY)) 162 | ifneq (0,$(OMP)) 163 | LDFLAGS += $(LIBXSMMROOT)/lib/libxsmmext.a 164 | endif 165 | LDFLAGS += $(LIBXSMMROOT)/lib/libxsmm.a 166 | else 167 | CFLAGS_XSMM += -DLIBXSMM_DEFAULT_CONFIG 168 | endif 169 | LDFLAGS += $(LIBXSMMROOT)/lib/libxsmmnoblas.a 170 | else 171 | LDFLAGS += -L$(LIBXSMMROOT)/lib 172 | ifneq (Darwin,$(UNAME)) 173 | LDFLAGS += -Wl,-rpath=$(LIBXSMMROOT)/lib 174 | endif 175 | ifneq (0,$(OMP)) 176 | LDFLAGS += -lxsmmext 177 | endif 178 | LDFLAGS += -lxsmm -lxsmmnoblas 179 | endif 180 | CFLAGS_XSMM += -pthread -D__LIBXSMM -I$(LIBXSMMROOT)/include 181 | LDFLAGS += -pthread -ldl -lm 182 | endif 183 | 184 | ifeq (Darwin,$(UNAME)) 185 | LDFLAGS += -framework OpenCL 186 | else 187 | OPENCL_LIB := $(shell ldconfig -p 2>/dev/null | grep -m1 OpenCL | rev | cut -d' ' -f1 | rev) 188 | ifeq (,$(OPENCL_LIB)) 189 | OPENCL_LIB := $(wildcard /usr/lib/x86_64-linux-gnu/libOpenCL.so.1) 190 | endif 191 | ifeq (,$(CUDATOOLKIT_HOME)) 192 | CUDATOOLKIT_HOME := $(NVSDKCOMPUTE_ROOT) 193 | endif 194 | ifeq (,$(CUDATOOLKIT_HOME)) 195 | NVCC := $(call which,nvcc) 196 | 
CUDATOOLKIT_HOME := $(if $(NVCC),$(abspath $(dir $(NVCC))/..)) 197 | endif 198 | ifneq (,$(CUDATOOLKIT_HOME)) 199 | CUDA_LIBDIR := $(if $(wildcard $(CUDATOOLKIT_HOME)/lib64),lib64,lib) 200 | ifeq (,$(wildcard $(OPENCL_INC))) 201 | CLINC := $(lastword $(sort $(wildcard $(CUDATOOLKIT_HOME)/../cuda/*/targets/x86_64-linux/include/CL/cl.h))) 202 | OPENCL_INC := $(if $(CLINC),$(abspath $(dir $(CLINC))/..),$(CUDATOOLKIT_HOME)/include) 203 | endif 204 | ifeq (,$(wildcard $(OPENCL_LIB))) 205 | LDFLAGS += -L$(CUDATOOLKIT_HOME)/$(CUDA_LIBDIR) 206 | LDFLAGS += -Wl,-rpath=$(CUDATOOLKIT_HOME)/$(CUDA_LIBDIR) 207 | endif 208 | else ifeq (,$(OPENCL_INC)) 209 | ifneq (,$(wildcard $(OPENCL_ROOT)/include/CL/cl.h)) 210 | LDFLAGS += -L$(OPENCL_ROOT)/$(if $(wildcard $(OPENCL_ROOT)/lib64),lib64,lib) 211 | OPENCL_INC := $(OPENCL_ROOT)/include 212 | else ifneq (,$(ICX)) 213 | OPENCL_ROOT := $(abspath $(dir $(ICX))/..) 214 | CLINC := $(wildcard $(OPENCL_ROOT)/include/sycl/CL/cl.h $(OPENCL_ROOT)/include/CL/cl.h) 215 | ifneq (,$(CLINC)) 216 | LDFLAGS += -L$(OPENCL_ROOT)/$(if $(wildcard $(OPENCL_ROOT)/lib64),lib64,lib) 217 | LDFLAGS += -L$(OPENCL_ROOT)/compiler/lib/intel64 -lintlc 218 | OPENCL_INC := $(abspath $(dir $(firstword $(CLINC)))/..) 219 | endif 220 | endif 221 | endif 222 | # OPENCL_INC: directory containing CL/cl.h. 
223 | ifneq (,$(wildcard $(OPENCL_INC))) 224 | CFLAGS += -I$(OPENCL_INC) 225 | endif 226 | # OPENCL_LIB: file/library to be linked 227 | ifneq (,$(wildcard $(OPENCL_LIB))) 228 | LDFLAGS += $(OPENCL_LIB) 229 | else 230 | LDFLAGS += -l:libOpenCL.so.1 231 | endif 232 | endif 233 | 234 | # Collect all paths in LD_LIBRARY_PATH and LD_LIBRARY_PATH/stubs, and append to LDFLAGS 235 | LD_LIBRARY_DIRS := $(wildcard $(subst :, ,$(LD_LIBRARY_PATH))) 236 | LD_LIBSTUB_PATH := $(wildcard $(patsubst %,%/stubs,$(LD_LIBRARY_DIRS))) 237 | LIBPATHS := $(foreach DIR,$(LD_LIBRARY_DIRS),$(if $(filter -L$(DIR),$(LDFLAGS)),$(NULL),-L$(DIR))) 238 | LIBSTUBS := $(foreach DIR,$(LD_LIBSTUB_PATH),$(if $(filter -L$(DIR),$(LDFLAGS)),$(NULL),-L$(DIR))) 239 | LDFLAGS += $(LIBPATHS) $(LIBSTUBS) 240 | 241 | .PHONY: bench 242 | bench: $(ACCDIR)/acc_bench 243 | 244 | .PHONY: all 245 | all: bench $(ACCDIR)/dbcsr_acc_test 246 | 247 | .PHONY: test 248 | test: test-interface test-smm 249 | 250 | .PHONY: test-interface 251 | test-interface: $(ACCDIR)/dbcsr_acc_test 252 | @echo "--- DBCSR Backend Interface" 253 | $(ACCDIR)/dbcsr_acc_test 254 | 255 | $(MAKDIR)/test-smm.log: bench 256 | $(eval SHAPES = $(shell $(ACCDIR)/acc_triplets.sh -k $(SPECID) -m $(MAXEXT) -n $(NSMMS))) 257 | $(eval DEVICE = "$(shell LIBXSMM_VERBOSE=0 ACC_OPENCL_VERBOSE=1 CHECK=0 $(ACCDIR)/acc_bench 1 1 1 2>&1 >/dev/null)") 258 | $(eval WITH_GPU = $(firstword $(foreach GPU,$(WITH_GPUS),$(findstring $(GPU),$(DEVICE))))) 259 | $(eval PARAMS = $(firstword $(wildcard $(PARAMDIR)/tune_multiply_$(WITH_GPU).csv))) 260 | $(eval GPUENV = $(if $(OPENCL_LIBSMM_SMM_PARAMS),$(NULL),$(if $(PARAMS),OPENCL_LIBSMM_SMM_PARAMS=$(PARAMS)))) 261 | @echo "--- DBCSR OpenCL SMMs ($(words $(SHAPES)))" 262 | @echo "$(DEVICE)" 263 | @if [ "$(GPUENV)" ]; then echo "$(GPUENV)"; fi 264 | @echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" 265 | ifneq (,$(LD_PRELOAD)) 266 | @echo "LD_PRELOAD=${LD_PRELOAD}" 267 | endif 268 | @echo "CC: $$($(CC) --version | head -n1)" 269 | @echo 
"runtime libraries:" 270 | @ldd $(ACCDIR)/acc_bench 271 | @echo "hostname: $$(hostname)" 272 | @echo 273 | @echo "$(SHAPES)" | xargs -n1 | ($(GPUENV) CHECK=$(if $(CHECK),$(CHECK),1) stdbuf --output=L \ 274 | $(ACCDIR)/acc_bench /dev/stdin 2>$(MAKDIR)/test-smm.err && rm $(MAKDIR)/test-smm.err) | tee $@ 275 | 276 | .PHONY: test-smm 277 | test-smm: $(MAKDIR)/test-smm.log 278 | ifneq (,$(call which,datamash)) 279 | ifeq (,$(shell datamash geomean 2>&1 | grep invalid)) 280 | @echo "geomean: $$(sed -n "/device:/p" $< 2>/dev/null | datamash -W -R 1 geomean 4) GFLOPS/s" 281 | endif 282 | @echo "median: $$(sed -n "/device:/p" $< 2>/dev/null | datamash -W -R 1 median 4) GFLOPS/s" 283 | @echo "mean: $$(sed -n "/device:/p" $< 2>/dev/null | datamash -W -R 1 mean 4) GFLOPS/s" 284 | endif 285 | @if [ -s $(MAKDIR)/test-smm.err ]; then \ 286 | echo && cat $(MAKDIR)/test-smm.err; \ 287 | if [ "0" != "$(if $(CHECK),$(CHECK),1)" ]; then exit 1; fi; \ 288 | fi 289 | 290 | $(MAKDIR)/smm/opencl_kernels.h: $(MAKDIR)/acc_opencl.sh $(KERNEL) $(PARAMS) 291 | CPPFLAGS=$(CPP_OPENCL_FLAGS) $(MAKDIR)/acc_opencl.sh $(KERNEL) $(PARAMS) $@ 292 | 293 | .PHONY: backend 294 | backend: $(ACCDIR)/dbcsr_acc.a 295 | $(ACCDIR)/dbcsr_acc.a: $(OBJACC) 296 | $(AR) -rs $@ $^ 297 | 298 | .PHONY: libsmm 299 | libsmm: $(ACCDIR)/dbcsr_acc_smm.a 300 | $(ACCDIR)/dbcsr_acc_smm.a: $(OBJSMM) 301 | $(AR) -rs $@ $^ 302 | 303 | %.o: %.c $(INCALL) $(MAKDIR)/Makefile 304 | $(CC) $(CFLAGS) $(CFLAGS_XSMM) -c $< -o $@ 305 | 306 | $(MAKDIR)/acc_bench.o: $(ACCDIR)/acc_bench.c $(MAKDIR)/Makefile 307 | ifneq (0,$(LIBXSMM)) 308 | $(CC) $(CFLAGS) $(CFLAGS_XSMM) -c $< -o $@ 309 | else 310 | $(CC) $(CFLAGS) -c $< -o $@ 311 | endif 312 | 313 | $(ACCDIR)/acc_bench: $(MAKDIR)/acc_bench.o $(ACCDIR)/dbcsr_acc_smm.a $(ACCDIR)/dbcsr_acc.a 314 | ifneq (,$(filter 0 1,$(DEV))) 315 | $(CC) $^ $(LDFLAGS) -o $@ 316 | else 317 | .PHONY: $(ACCDIR)/acc_bench 318 | endif 319 | 320 | $(MAKDIR)/dbcsr_acc_test.o: $(ACCDIR)/../../tests/dbcsr_acc_test.c 
$(MAKDIR)/Makefile 321 | $(CC) $(CFLAGS) -I$(ACCDIR)/.. -c $< -o $@ 322 | 323 | $(ACCDIR)/dbcsr_acc_test: $(MAKDIR)/dbcsr_acc_test.o $(ACCDIR)/dbcsr_acc.a 324 | ifneq (,$(filter 0 1,$(DEV))) 325 | $(CC) $^ $(LDFLAGS) -o $@ 326 | else 327 | .PHONY: $(ACCDIR)/dbcsr_acc_test 328 | endif 329 | 330 | .PHONY: clean 331 | clean: 332 | @rm -f $(OBJACC) $(OBJSMM) 333 | @rm -f $(MAKDIR)/dbcsr_acc_test.o 334 | @rm -f $(MAKDIR)/acc_bench.o 335 | @rm -f $(MAKDIR)/smm/opencl_kernels.h 336 | @rm -f $(MAKDIR)/test-smm.err 337 | 338 | .PHONY: realclean 339 | realclean: clean 340 | @rm -f $(ACCDIR)/dbcsr_acc.a $(ACCDIR)/dbcsr_acc_smm.a 341 | @rm -f $(ACCDIR)/acc_bench 342 | @rm -f $(ACCDIR)/dbcsr_acc_test 343 | @rm -f $(MAKDIR)/test-smm.log 344 | -------------------------------------------------------------------------------- /samples/copy/Makefile.inc: -------------------------------------------------------------------------------- 1 | ICPC = $(notdir $(shell which icpc 2> /dev/null)) 2 | ICC = $(notdir $(shell which icc 2> /dev/null)) 3 | IFORT = $(notdir $(shell which ifort 2> /dev/null)) 4 | GPP = $(notdir $(shell which g++ 2> /dev/null)) 5 | GCC = $(notdir $(shell which gcc 2> /dev/null)) 6 | GFC = $(notdir $(shell which gfortran 2> /dev/null)) 7 | 8 | CXX_CHECK = $(notdir $(shell which $(CXX) 2> /dev/null)) 9 | CC_CHECK = $(notdir $(shell which $(CC) 2> /dev/null)) 10 | FC_CHECK = $(notdir $(shell which $(FC) 2> /dev/null)) 11 | 12 | # prefer Intel Compiler (if available) 13 | CXX = $(ICPC) 14 | FC = $(IFORT) 15 | CC = $(ICC) 16 | 17 | INTEL = $(shell echo $$((3==$(words $(filter icc icpc ifort,$(CC) $(CXX) $(FC)))))) 18 | 19 | ifneq (0,$(INTEL)) 20 | AR = xiar 21 | CXXFLAGS += -fPIC -Wall -diag-disable 3415 22 | CFLAGS += -fPIC -Wall -diag-disable 3415 23 | FCFLAGS += -fPIC 24 | LDFLAGS += -fPIC -lrt 25 | FCMTFLAGS += -threads 26 | ifeq (1,$(PEDANTIC)) 27 | CFLAGS += -std=c89 -Wcheck 28 | FCFLAGS += -free 29 | FMFLAGS += -e03 30 | else ifneq (0,$(PEDANTIC)) 31 | CFLAGS += 
-std=c89 -Wcheck -Wremarks -diag-disable 177,2547 32 | FCFLAGS += -e03 33 | else 34 | FCFLAGS += -free 35 | endif 36 | # flag specifying output directory must be last 37 | FMFLAGS += -module 38 | ifeq (0,$(DBG)) 39 | CXXFLAGS += -fno-alias -ansi-alias -O2 40 | CFLAGS += -fno-alias -ansi-alias -O2 41 | FCFLAGS += -O2 42 | DFLAGS += -DNDEBUG 43 | ifneq (0,$(IPO)) 44 | CXXFLAGS += -ipo 45 | CFLAGS += -ipo 46 | FCFLAGS += -ipo 47 | endif 48 | else 49 | CXXFLAGS += -O0 50 | CFLAGS += -O0 51 | FCFLAGS += -O0 52 | SYM = $(DBG) 53 | endif 54 | ifeq (1,$(shell echo $$((3 > $(DBG))))) 55 | ifeq (1,$(AVX)) 56 | TARGET = -xAVX 57 | else ifeq (2,$(AVX)) 58 | TARGET = -xCORE-AVX2 59 | else ifeq (3,$(AVX)) 60 | ifeq (0,$(MIC)) 61 | TARGET = -xCOMMON-AVX512 62 | else 63 | TARGET = -xMIC-AVX512 64 | endif 65 | else ifeq (1,$(shell echo $$((2 <= $(SSE))))) 66 | TARGET = -xSSE$(SSE) 67 | else ifeq (1,$(SSE)) 68 | TARGET = -xSSE3 69 | else 70 | TARGET = -xHost 71 | endif 72 | endif 73 | ifneq (0,$(SYM)) 74 | ifeq (1,$(SYM)) 75 | CXXFLAGS := -g $(CXXFLAGS) 76 | CFLAGS := -g $(CFLAGS) 77 | FCFLAGS := -g -check -traceback $(FCFLAGS) 78 | else 79 | CXXFLAGS := -g3 -gdwarf-2 -debug inline-debug-info $(CXXFLAGS) 80 | CFLAGS := -g3 -gdwarf-2 -debug inline-debug-info $(CFLAGS) 81 | FCFLAGS := -g -check -traceback $(FCFLAGS) 82 | endif 83 | endif 84 | ifeq (0,$(EXP)) 85 | CXXFLAGS += -fno-exceptions 86 | endif 87 | ifneq (0,$(OMP)) 88 | CXXFLAGS += -openmp 89 | CFLAGS += -openmp 90 | FCFLAGS += -openmp 91 | LDFLAGS += -openmp 92 | endif 93 | ifeq (0,$(OFFLOAD)) 94 | CXXFLAGS += -no-offload 95 | CFLAGS += -no-offload 96 | FCFLAGS += -no-offload 97 | endif 98 | ifeq (1,$(STATIC)) 99 | SLDFLAGS += -no-intel-extensions -static-intel -static-libgcc -static-libstdc++ 100 | DFLAGS += -D__STATIC 101 | else ifneq (0,$(STATIC)) 102 | SLDFLAGS += -static 103 | DFLAGS += -D__STATIC 104 | endif 105 | ifneq (,$(R8)) 106 | ifneq (0,$(R8)) 107 | FCFLAGS += -autodouble 108 | endif 109 | endif 110 | else # GCC 
assumed 111 | ifeq (,$(CXX_CHECK)) 112 | CXX = $(GPP) 113 | endif 114 | ifeq (,$(CC_CHECK)) 115 | CC = $(GCC) 116 | endif 117 | ifeq (,$(FC_CHECK)) 118 | FC = $(GFC) 119 | endif 120 | GCC_VERSION_STRING = $(shell $(CC) --version | grep "gcc (.\+) " | sed "s/gcc (.\+) \([0-9]\+\.[0-9]\+\.[0-9]\+\).*$$/\1/") 121 | GCC_VERSION_MAJOR = $(shell echo "$(GCC_VERSION_STRING)" | $(CUT) -d"." -f1) 122 | GCC_VERSION_MINOR = $(shell echo "$(GCC_VERSION_STRING)" | $(CUT) -d"." -f2) 123 | GCC_VERSION_PATCH = $(shell echo "$(GCC_VERSION_STRING)" | $(CUT) -d"." -f3) 124 | ifeq (3,$(words $(GCC_VERSION_MAJOR) $(GCC_VERSION_MINOR) $(GCC_VERSION_PATCH))) 125 | GCC_VERSION = $(shell echo "$$(($(GCC_VERSION_MAJOR) * 10000 + $(GCC_VERSION_MINOR) * 100 + $(GCC_VERSION_PATCH)))") 126 | else 127 | GCC_VERSION = 0 128 | endif 129 | MIC = 0 130 | CXXFLAGS += -Wall -Wno-unused-function -Wno-attributes 131 | CFLAGS += -Wall -Wno-unused-function -Wno-attributes 132 | LDFLAGS += -lrt 133 | ifneq (Windows_NT,$(OS)) 134 | CXXFLAGS += -fPIC 135 | CFLAGS += -fPIC 136 | FCFLAGS += -fPIC 137 | LDFLAGS += -fPIC 138 | endif 139 | ifeq (1,$(PEDANTIC)) 140 | CFLAGS += -std=c89 -pedantic -Wno-variadic-macros -Wno-long-long -Wno-overlength-strings 141 | FCFLAGS += -ffree-form 142 | FMFLAGS += -std=f2003 -pedantic -Wunused-variable 143 | ifneq (0,$(shell echo "$$((50000 <= $(GCC_VERSION)))")) 144 | FMFLAGS += -Wuse-without-only 145 | endif 146 | else ifneq (0,$(PEDANTIC)) 147 | CFLAGS += -std=c89 -pedantic -Wno-variadic-macros -Wno-long-long -Wno-overlength-strings 148 | FCFLAGS += -std=f2003 -pedantic -Wunused-variable 149 | ifneq (0,$(shell echo "$$((50000 <= $(GCC_VERSION)))")) 150 | FMFLAGS += -Wuse-without-only 151 | endif 152 | else 153 | FCFLAGS += -ffree-form 154 | endif 155 | # flag specifying output directory must be last 156 | FMFLAGS += -J 157 | ifeq (0,$(DBG)) 158 | CXXFLAGS += -O2 -ftree-vectorize -ffast-math -funroll-loops 159 | CFLAGS += -O2 -ftree-vectorize -ffast-math -funroll-loops 160 | 
FCFLAGS += -O2 -ftree-vectorize -ffast-math -funroll-loops 161 | DFLAGS += -DNDEBUG 162 | ifneq (0,$(IPO)) 163 | CXXFLAGS += -flto -ffat-lto-objects 164 | CFLAGS += -flto -ffat-lto-objects 165 | FCFLAGS += -flto -ffat-lto-objects 166 | LDFLAGS += -flto 167 | endif 168 | else 169 | CXXFLAGS += -O0 170 | CFLAGS += -O0 171 | FCFLAGS += -O0 172 | SYM = $(DBG) 173 | endif 174 | ifeq (1,$(shell echo $$((3 > $(DBG))))) 175 | ifeq (1,$(AVX)) 176 | TARGET = -mavx 177 | else ifeq (2,$(AVX)) 178 | TARGET = -mavx2 179 | else ifeq (3,$(AVX)) 180 | TARGET = -mavx512f -mavx512cd 181 | ifneq (0,$(MIC)) 182 | TARGET += -mavx512er -mavx512pf 183 | endif 184 | else ifeq (1,$(shell echo $$((2 <= $(SSE))))) 185 | TARGET = -msse$(SSE) 186 | else ifeq (1,$(SSE)) 187 | TARGET = -msse3 188 | else 189 | TARGET = -march=native 190 | endif 191 | endif 192 | ifneq (0,$(SYM)) 193 | ifeq (1,$(SYM)) 194 | CXXFLAGS := -g $(CXXFLAGS) 195 | CFLAGS := -g $(CFLAGS) 196 | FCFLAGS := -g $(FCFLAGS) 197 | else ifeq (2,$(SYM)) 198 | CXXFLAGS := -g $(CXXFLAGS) -fsanitize=thread -fno-omit-frame-pointer 199 | CFLAGS := -g $(CFLAGS) -fsanitize=thread -fno-omit-frame-pointer 200 | FCFLAGS := -g $(FCFLAGS) -fsanitize=thread -fno-omit-frame-pointer 201 | LDFLAGS := -g $(LDFLAGS) -fsanitize=thread -fno-omit-frame-pointer 202 | ELDFLAGS := -pie 203 | else 204 | CXXFLAGS := -g3 -gdwarf-2 -debug inline-debug-info $(CXXFLAGS) 205 | CFLAGS := -g3 -gdwarf-2 -debug inline-debug-info $(CFLAGS) 206 | FCFLAGS := -g -check -traceback $(FCFLAGS) 207 | endif 208 | endif 209 | ifeq (0,$(EXP)) 210 | CXXFLAGS += -fno-exceptions 211 | endif 212 | ifneq (0,$(OMP)) 213 | CXXFLAGS += -fopenmp 214 | CFLAGS += -fopenmp 215 | FCFLAGS += -fopenmp 216 | LDFLAGS += -fopenmp 217 | endif 218 | ifneq (0,$(STATIC)) 219 | SLDFLAGS += -static 220 | DFLAGS += -D__STATIC 221 | endif 222 | ifneq (,$(R8)) 223 | ifneq (0,$(R8)) 224 | FCFLAGS += -fdefault-real-8 -fdefault-double-8 225 | endif 226 | endif 227 | endif 228 | 229 | ifneq (,$(CXX)) 230 | 
LD = $(CXX) 231 | endif 232 | ifeq (,$(LD)) 233 | LD = $(CC) 234 | endif 235 | ifeq (,$(LD)) 236 | LD = $(FC) 237 | endif 238 | 239 | ifeq (,$(CXXFLAGS)) 240 | CXXFLAGS = $(CFLAGS) 241 | endif 242 | ifeq (,$(CFLAGS)) 243 | CFLAGS = $(CXXFLAGS) 244 | endif 245 | ifeq (,$(FCFLAGS)) 246 | FCFLAGS = $(CFLAGS) 247 | endif 248 | ifeq (,$(LDFLAGS)) 249 | LDFLAGS = $(CFLAGS) 250 | endif 251 | 252 | MAKE_ILP64 = 0 253 | ifneq (,$(ILP64)) 254 | ifneq (0,$(ILP64)) 255 | MAKE_ILP64 = $(ILP64) 256 | endif 257 | endif 258 | ifneq (0,$(MAKE_ILP64)) 259 | BLAS_BITS = 64 260 | MKL_BITS = ilp64 261 | else 262 | MKL_BITS = lp64 263 | endif 264 | 265 | ifneq (,$(MKLROOT)) 266 | MKL ?= $(BLAS) 267 | MKL_DIRECT ?= 0 268 | ifneq (0,$(MKL_DIRECT)) 269 | MKL_STATIC ?= 1 270 | else ifneq (0,$(STATIC)) 271 | MKL_STATIC ?= 1 272 | else 273 | MKL_STATIC ?= 0 274 | endif 275 | else 276 | MKL = 0 277 | endif 278 | 279 | ifeq (1,$(MKL_DIRECT)) 280 | MKL_STATIC = 1 281 | DFLAGS += -DMKL_DIRECT_CALL_SEQ 282 | endif 283 | 284 | ifeq (1,$(MKL)) # sequential 285 | DFLAGS += -D__MKL 286 | IFLAGS +=-I$(MKLROOT)/include 287 | ifeq (0,$(MKL_STATIC)) # shared 288 | LDFLAGS += -L$(MKLROOT)/lib/intel64 -lmkl_intel_$(MKL_BITS) -lmkl_core -lmkl_sequential 289 | ifneq (0,$(OFFLOAD)) 290 | LDFLAGS += -offload-option,mic,ld,"-L$(MKLROOT)/lib/mic -lmkl_intel_$(MKL_BITS) -lmkl_core -lmkl_sequential" 291 | endif 292 | else # static 293 | LDFLAGS += -Wl,--start-group \ 294 | $(MKLROOT)/lib/intel64/libmkl_intel_$(MKL_BITS).a \ 295 | $(MKLROOT)/lib/intel64/libmkl_core.a \ 296 | $(MKLROOT)/lib/intel64/libmkl_sequential.a \ 297 | -Wl,--end-group 298 | ifneq (0,$(OFFLOAD)) 299 | LDFLAGS += -offload-option,mic,ld,"--start-group \ 300 | $(MKLROOT)/lib/mic/libmkl_intel_$(MKL_BITS).a \ 301 | $(MKLROOT)/lib/mic/libmkl_core.a \ 302 | $(MKLROOT)/lib/mic/libmkl_sequential.a \ 303 | --end-group" 304 | endif 305 | endif 306 | LDFLAGS += -lpthread -lm 307 | ifneq (0,$(OFFLOAD)) 308 | LDFLAGS += -offload-option,mic,ld,"-lm" 309 | 
endif 310 | else ifneq (0,$(MKL)) # multi-threaded 311 | DFLAGS += -D__MKL 312 | IFLAGS +=-I$(MKLROOT)/include 313 | ifeq (0,$(MKL_STATIC)) # shared 314 | ifneq (0,$(INTEL)) 315 | LDFLAGS += -L$(MKLROOT)/lib/intel64 -lmkl_intel_$(MKL_BITS) -lmkl_core -lmkl_intel_thread 316 | ifneq (0,$(OFFLOAD)) 317 | LDFLAGS += -offload-option,mic,ld,"-L$(MKLROOT)/lib/mic -lmkl_intel_$(MKL_BITS) -lmkl_core -lmkl_intel_thread" 318 | endif 319 | else # assuming GNU toolchain 320 | LDFLAGS += -L$(MKLROOT)/lib/intel64 -lmkl_intel_$(MKL_BITS) -lmkl_core -lmkl_gnu_thread 321 | endif 322 | else # static 323 | ifneq (0,$(INTEL)) 324 | LDFLAGS += -Wl,--start-group \ 325 | $(MKLROOT)/lib/intel64/libmkl_intel_$(MKL_BITS).a \ 326 | $(MKLROOT)/lib/intel64/libmkl_core.a \ 327 | $(MKLROOT)/lib/intel64/libmkl_intel_thread.a \ 328 | -Wl,--end-group 329 | ifneq (0,$(OFFLOAD)) 330 | LDFLAGS += -offload-option,mic,ld,"--start-group \ 331 | $(MKLROOT)/lib/mic/libmkl_intel_$(MKL_BITS).a \ 332 | $(MKLROOT)/lib/mic/libmkl_core.a \ 333 | $(MKLROOT)/lib/mic/libmkl_intel_thread.a \ 334 | --end-group" 335 | endif 336 | else # assuming GNU toolchain 337 | LDFLAGS += -Wl,--start-group \ 338 | $(MKLROOT)/lib/intel64/libmkl_intel_$(MKL_BITS).a \ 339 | $(MKLROOT)/lib/intel64/libmkl_core.a \ 340 | $(MKLROOT)/lib/intel64/libmkl_gnu_thread.a \ 341 | -Wl,--end-group -ldl 342 | endif 343 | endif 344 | ifneq (0,$(INTEL)) 345 | ifeq (0,$(OMP)) 346 | LDFLAGS += -liomp5 347 | ifneq (0,$(OFFLOAD)) 348 | LDFLAGS += -offload-option,mic,ld,"-liomp5" 349 | endif 350 | endif 351 | ifneq (0,$(OFFLOAD)) 352 | LDFLAGS += -offload-option,mic,ld,"-lpthread -lm" 353 | endif 354 | endif 355 | LDFLAGS += -lpthread -lm 356 | else ifneq (0,$(BLAS)) # generic 357 | ifneq (1,$(BLAS)) 358 | BLAS_THREADS = o 359 | endif 360 | OPENBLAS = $(shell $(CC) -lopenblas$(BLAS_THREADS)$(BLAS_BITS) 2>&1 | grep openblas$(BLAS_THREADS)$(BLAS_BITS) > /dev/null ; echo $$?) 
361 | ifeq (1,$(OPENBLAS)) 362 | LDFLAGS += -lopenblas$(BLAS_THREADS)$(BLAS_BITS) 363 | else 364 | ifneq (Windows_NT,$(OS)) 365 | LDFLAGS += -lblas$(BLAS_BITS) 366 | else 367 | LDFLAGS += -lblas$(BLAS_BITS).dll 368 | endif 369 | CLDFLAGS += -lgfortran 370 | ifeq (,$(GCC_VERSION_STRING)) 371 | LDFLAGS += -lquadmath 372 | else ifneq (0,$(shell echo "$$((40500 <= $(GCC_VERSION)))")) 373 | CLDFLAGS += -lquadmath 374 | endif 375 | endif 376 | endif 377 | 378 | --------------------------------------------------------------------------------