├── .ci ├── check.sh ├── clang.Dockerfile ├── cpplint.Dockerfile ├── cuda.Dockerfile ├── cuda8.Dockerfile └── makefile ├── .clang-format ├── .drone.script ├── .drone.yml ├── .gitignore ├── BUILD ├── CMakeLists.txt ├── README.md ├── WORKSPACE ├── cmake └── CudaHelper.cmake ├── cuda.bzl ├── gtest.BUILD ├── include ├── cuda_benchmark.h ├── cuda_index.h ├── cuda_utils.h ├── multiply │ └── multiply.h └── test │ └── multiply.h ├── src ├── benchmark-multiply.cu.cc ├── deprecated_examples.cu_old ├── multiply.cc ├── multiply │ ├── multiply.cc │ ├── multiply_cpu.cc │ └── multiply_gpu.cu.cc ├── sharedmemory.cu.cc └── tune.cu.cc └── test ├── test_multiply.cc └── test_multiply.cu.cc /.ci/check.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Patrick Wieschollek 3 | 4 | RETURN=0 5 | FILES=`find . -type f -name "*" | grep -E "\.(cc|h|cu)$"` 6 | for FILE in $FILES; do 7 | echo -ne "check file ${FILE}" 8 | clang-format-6.0 $FILE -style=file | cmp $FILE >/dev/null 9 | if [ $? -ne 0 ]; then 10 | echo " ... failed" 11 | echo "[!] INCORRECT FORMATTING! $FILE" >&2 12 | echo $FILE 13 | diff -u <(cat $FILE) <(clang-format-6.0 ${FILE} -style=file) 14 | # diff -u < (cat ${FILE}) < (clang-format ${FILE}) 15 | RETURN=1 16 | else 17 | echo " ... ok" 18 | fi 19 | done 20 | exit $RETURN -------------------------------------------------------------------------------- /.ci/clang.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | RUN apt-get update && apt-get install clang-format-6.0 -y -------------------------------------------------------------------------------- /.ci/cpplint.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:alpine 2 | RUN pip3 install cpplint -------------------------------------------------------------------------------- /.ci/cuda.Dockerfile: -------------------------------------------------------------------------------- 1 | ARG VERCUDA 2 | FROM nvidia/cuda:${VERCUDA}-runtime 3 | ARG VERCUDA 4 | RUN apt-get update && apt-get install -y --no-install-recommends \ 5 | cuda-libraries-dev-$CUDA_PKG_VERSION \ 6 | cuda-nvml-dev-$CUDA_PKG_VERSION \ 7 | cuda-minimal-build-$CUDA_PKG_VERSION \ 8 | cuda-command-line-tools-$CUDA_PKG_VERSION \ 9 | cmake \ 10 | libnccl-dev=$NCCL_VERSION-1+cuda${VERCUDA} \ 11 | xz-utils \ 12 | build-essential \ 13 | libgtest-dev \ 14 | curl \ 15 | unzip 16 | 17 | RUN mkdir /google && cd /google && \ 18 | curl https://github.com/google/googletest/archive/master.zip -O -J -L && \ 19 | unzip googletest-master.zip && \ 20 | mv googletest-master src && \ 21 | rm googletest-master.zip && \ 22 | mkdir build && \ 23 | mkdir dist && \ 24 | cd build && \ 25 | cmake ../src -DCMAKE_INSTALL_PREFIX=/google/dist && \ 26 | make install 27 | 28 | ENV GTEST_ROOT /google/dist 29 | ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs 30 | -------------------------------------------------------------------------------- /.ci/cuda8.Dockerfile: -------------------------------------------------------------------------------- 1 | # adapted from https://gitlab.com/nvidia/cuda/blob/ubuntu16.04/8.0/runtime/Dockerfile 2 | ARG VERCUDA 3 | FROM ubuntu:16.04 4 | ARG VERCUDA 5 | 6 | RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ 7 | rm -rf /var/lib/apt/lists/* && \ 8 | NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ 9 | 
NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ 10 | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ 11 | apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ 12 | echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \ 13 | echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list 14 | 15 | ENV CUDA_VERSION 8.0.61 16 | 17 | RUN apt-get update && apt-get install -y --no-install-recommends \ 18 | cmake \ 19 | xz-utils \ 20 | build-essential \ 21 | libgtest-dev \ 22 | curl \ 23 | unzip 24 | 25 | ENV CUDA_PKG_VERSION 8-0=$CUDA_VERSION-1 26 | RUN apt-get update && apt-get install -y --no-install-recommends \ 27 | cuda-nvrtc-$CUDA_PKG_VERSION \ 28 | cuda-nvgraph-$CUDA_PKG_VERSION \ 29 | cuda-cusolver-$CUDA_PKG_VERSION \ 30 | cuda-cublas-8-0=8.0.61.2-1 \ 31 | cuda-cufft-$CUDA_PKG_VERSION \ 32 | cuda-curand-$CUDA_PKG_VERSION \ 33 | cuda-cusparse-$CUDA_PKG_VERSION \ 34 | cuda-npp-$CUDA_PKG_VERSION \ 35 | cuda-cudart-$CUDA_PKG_VERSION && \ 36 | ln -s cuda-8.0 /usr/local/cuda 37 | 38 | RUN apt-get update && apt-get install -y --no-install-recommends \ 39 | cuda-core-$CUDA_PKG_VERSION \ 40 | cuda-misc-headers-$CUDA_PKG_VERSION \ 41 | cuda-command-line-tools-$CUDA_PKG_VERSION \ 42 | cuda-nvrtc-dev-$CUDA_PKG_VERSION \ 43 | cuda-nvml-dev-$CUDA_PKG_VERSION \ 44 | cuda-nvgraph-dev-$CUDA_PKG_VERSION \ 45 | cuda-cusolver-dev-$CUDA_PKG_VERSION \ 46 | cuda-cublas-dev-8-0=8.0.61.2-1 \ 47 | cuda-cufft-dev-$CUDA_PKG_VERSION \ 48 | cuda-curand-dev-$CUDA_PKG_VERSION \ 49 | cuda-cusparse-dev-$CUDA_PKG_VERSION \ 50 | cuda-npp-dev-$CUDA_PKG_VERSION \ 51 | cuda-cudart-dev-$CUDA_PKG_VERSION \ 52 | cuda-driver-dev-$CUDA_PKG_VERSION && \ 53 | rm -rf /var/lib/apt/lists/* 54 | 55 | 56 | # nvidia-docker 1.0 57 | LABEL com.nvidia.volumes.needed="nvidia_driver" 58 | LABEL com.nvidia.cuda.version="${CUDA_VERSION}" 59 | 60 | RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \ 61 | echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf 62 | 63 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} 64 | ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 65 | 66 | # nvidia-container-runtime 67 | ENV NVIDIA_VISIBLE_DEVICES all 68 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility 69 | ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0" 70 | 71 | 72 | RUN mkdir /google && cd /google && \ 73 | curl https://github.com/google/googletest/archive/master.zip -O -J -L && \ 74 | unzip googletest-master.zip && \ 75 | mv googletest-master src && \ 76 | rm googletest-master.zip && \ 77 | mkdir build && \ 78 | mkdir dist && \ 79 | cd build && \ 80 | cmake ../src -DCMAKE_INSTALL_PREFIX=/google/dist && \ 81 | make install 82 | 83 | ENV GTEST_ROOT /google/dist 84 | ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs 85 | -------------------------------------------------------------------------------- /.ci/makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build push 2 | 3 | build: 4 | docker build --build-arg VERCUDA=10.1 -t patwie/cuda:10.1 -f cuda.Dockerfile . 5 | docker build --build-arg VERCUDA=10.0 -t patwie/cuda:10.0 -f cuda.Dockerfile . 6 | docker build --build-arg VERCUDA=9.2 -t patwie/cuda:9.2 -f cuda.Dockerfile . 7 | docker build --build-arg VERCUDA=9.1 -t patwie/cuda:9.1 -f cuda.Dockerfile . 
8 | docker build --build-arg VERCUDA=9.0 -t patwie/cuda:9.0 -f cuda.Dockerfile . 9 | docker build --build-arg VERCUDA=8.0 -t patwie/cuda:8.0 -f cuda8.Dockerfile . 10 | docker build -t patwie/cpplint -f cpplint.Dockerfile . 11 | docker build -t patwie/clang-format -f clang.Dockerfile . 12 | 13 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | -------------------------------------------------------------------------------- /.drone.script: -------------------------------------------------------------------------------- 1 | def main(): 2 | return [ 3 | quality_pipeline(), 4 | build_pipeline("8.0"), 5 | build_pipeline("9.0"), 6 | build_pipeline("9.1"), 7 | build_pipeline("9.2"), 8 | build_pipeline("10.0"), 9 | build_pipeline("10.1"), 10 | ] 11 | 12 | 13 | def quality_pipeline(): 14 | return { 15 | 'kind': 'pipeline', 16 | 'name': 'quality', 17 | 'platform': { 18 | 'os': "linux", 19 | 'arch': 'amd64', 20 | }, 21 | 'steps': [ 22 | { 23 | 'name': 'format', 24 | 'pull': 'never', 25 | 'image': 'patwie/clang-format:latest', 26 | 'commands': [ 27 | './.ci/check.sh', 28 | ], 29 | }, 30 | { 31 | 'name': 'lint', 32 | 'pull': 'never', 33 | 'image': 'patwie/cpplint:latest', 34 | 'commands': [ 35 | 'cpplint --recursive .', 36 | ], 37 | }, 38 | ], 39 | } 40 | 41 | 42 | def build_pipeline(cuda_version): 43 | return { 44 | 'kind': 'pipeline', 45 | 'name': 'CUDA %s' % cuda_version, 46 | 'platform': { 47 | 'os': "linux", 48 | 'arch': 'amd64', 49 | }, 50 | 'steps': [ 51 | { 52 | 'name': 'build', 53 | 'pull': 'never', 54 | 'image': 'patwie/cuda:%s' % cuda_version, 55 | 'commands': [ 56 | 'mkdir build', 57 | 'cd build', 58 | 'cmake ..', 59 | 'make', 60 | ], 61 | }, 62 | { 63 | 'name': 'test', 64 | 'pull': 'never', 65 | 'image': 'patwie/cuda:%s' % cuda_version, 66 | 'commands': [ 67 | './build/test_cpu', 68 | ], 69 | }, 70 | ], 71 | 'depends_on': ['quality'] 72 | } 73 | -------------------------------------------------------------------------------- /.drone.yml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: pipeline 3 | name: quality 4 | 5 | platform: 6 | os: linux 7 | arch: amd64 8 | 9 | steps: 10 | - name: format 11 | pull: never 12 | image: patwie/clang-format:latest 13 | commands: 14 | - ./.ci/check.sh 15 | 16 | - name: lint 17 | pull: never 18 | image: patwie/cpplint:latest 19 | commands: 20 | - cpplint --recursive . 21 | 22 | --- 23 | kind: pipeline 24 | name: CUDA 8.0 25 | 26 | platform: 27 | os: linux 28 | arch: amd64 29 | 30 | steps: 31 | - name: build 32 | pull: never 33 | image: patwie/cuda:8.0 34 | commands: 35 | - mkdir build 36 | - cd build 37 | - cmake .. 38 | - make 39 | 40 | - name: test 41 | pull: never 42 | image: patwie/cuda:8.0 43 | commands: 44 | - ./build/test_cpu 45 | 46 | depends_on: 47 | - quality 48 | 49 | --- 50 | kind: pipeline 51 | name: CUDA 9.0 52 | 53 | platform: 54 | os: linux 55 | arch: amd64 56 | 57 | steps: 58 | - name: build 59 | pull: never 60 | image: patwie/cuda:9.0 61 | commands: 62 | - mkdir build 63 | - cd build 64 | - cmake .. 
65 | - make 66 | 67 | - name: test 68 | pull: never 69 | image: patwie/cuda:9.0 70 | commands: 71 | - ./build/test_cpu 72 | 73 | depends_on: 74 | - quality 75 | 76 | --- 77 | kind: pipeline 78 | name: CUDA 9.1 79 | 80 | platform: 81 | os: linux 82 | arch: amd64 83 | 84 | steps: 85 | - name: build 86 | pull: never 87 | image: patwie/cuda:9.1 88 | commands: 89 | - mkdir build 90 | - cd build 91 | - cmake .. 92 | - make 93 | 94 | - name: test 95 | pull: never 96 | image: patwie/cuda:9.1 97 | commands: 98 | - ./build/test_cpu 99 | 100 | depends_on: 101 | - quality 102 | 103 | --- 104 | kind: pipeline 105 | name: CUDA 9.2 106 | 107 | platform: 108 | os: linux 109 | arch: amd64 110 | 111 | steps: 112 | - name: build 113 | pull: never 114 | image: patwie/cuda:9.2 115 | commands: 116 | - mkdir build 117 | - cd build 118 | - cmake .. 119 | - make 120 | 121 | - name: test 122 | pull: never 123 | image: patwie/cuda:9.2 124 | commands: 125 | - ./build/test_cpu 126 | 127 | depends_on: 128 | - quality 129 | 130 | --- 131 | kind: pipeline 132 | name: CUDA 10.0 133 | 134 | platform: 135 | os: linux 136 | arch: amd64 137 | 138 | steps: 139 | - name: build 140 | pull: never 141 | image: patwie/cuda:10.0 142 | commands: 143 | - mkdir build 144 | - cd build 145 | - cmake .. 146 | - make 147 | 148 | - name: test 149 | pull: never 150 | image: patwie/cuda:10.0 151 | commands: 152 | - ./build/test_cpu 153 | 154 | depends_on: 155 | - quality 156 | 157 | --- 158 | kind: pipeline 159 | name: CUDA 10.1 160 | 161 | platform: 162 | os: linux 163 | arch: amd64 164 | 165 | steps: 166 | - name: build 167 | pull: never 168 | image: patwie/cuda:10.1 169 | commands: 170 | - mkdir build 171 | - cd build 172 | - cmake .. 173 | - make 174 | 175 | - name: test 176 | pull: never 177 | image: patwie/cuda:10.1 178 | commands: 179 | - ./build/test_cpu 180 | 181 | depends_on: 182 | - quality 183 | 184 | --- 185 | kind: signature 186 | hmac: 7d2643e4c55153be2ec8b9abff2231bd4b10f0be12de2308e98025346504d8c6 187 | 188 | ... 189 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .local/ 2 | 3 | ### C++ ### 4 | # Prerequisites 5 | *.d 6 | 7 | # Compiled Object files 8 | *.slo 9 | *.lo 10 | *.o 11 | *.obj 12 | 13 | # Precompiled Headers 14 | *.gch 15 | *.pch 16 | 17 | # Compiled Dynamic libraries 18 | *.so 19 | *.dylib 20 | *.dll 21 | 22 | # Fortran module files 23 | *.mod 24 | *.smod 25 | 26 | # Compiled Static libraries 27 | *.lai 28 | *.la 29 | *.a 30 | *.lib 31 | 32 | # Executables 33 | *.exe 34 | *.out 35 | *.app 36 | 37 | ### CMake ### 38 | CMakeLists.txt.user 39 | CMakeCache.txt 40 | CMakeFiles 41 | CMakeScripts 42 | Testing 43 | Makefile 44 | cmake_install.cmake 45 | install_manifest.txt 46 | compile_commands.json 47 | CTestTestfile.cmake 48 | _deps 49 | 50 | ### CMake Patch ### 51 | # External projects 52 | *-prefix/ 53 | 54 | 55 | build/ 56 | ### Bazel ### 57 | # gitignore template for Bazel build system 58 | # website: https://bazel.build/ 59 | 60 | # Ignore all bazel-* symlinks. There is no full list since this can change 61 | # based on the name of the directory bazel is cloned into. 
62 | /bazel-* 63 | -------------------------------------------------------------------------------- /BUILD: -------------------------------------------------------------------------------- 1 | package(default_visibility=["//visibility:public"]) 2 | load("//:cuda.bzl", "cuda_binary") 3 | 4 | cc_library( 5 | name="multiply", 6 | srcs=[ 7 | "src/multiply/multiply.cc", 8 | "src/multiply/multiply_cpu.cc" 9 | ], 10 | hdrs=[ 11 | "include/multiply/multiply.h", 12 | "include/cuda_utils.h" 13 | ], 14 | ) 15 | 16 | # TODO(patwie): typed tests fail 17 | # cc_test( 18 | # name="multiply-test", 19 | # srcs=[ 20 | # "test/test_multiply_impl.h", 21 | # "test/test_multiply.cc", 22 | # ], 23 | # copts=[ 24 | # "-Iexternal/gtest/include", 25 | # "-Iexternal/gmock/include" 26 | # ], 27 | # deps=[ 28 | # ":multiply", 29 | # "@gtest//:gtest", 30 | # ] 31 | # ) 32 | 33 | cc_binary( 34 | name="multiply-example", 35 | srcs=[ 36 | "src/multiply.cc" 37 | ], 38 | deps=[ 39 | ":multiply", 40 | ], 41 | ) 42 | 43 | cuda_binary( 44 | name="sharedmemory-example", 45 | includes=[ 46 | "/usr/local/cuda/include", 47 | "." 48 | ], 49 | hdrs=[ 50 | "include/cuda_utils.h", 51 | "include/cuda_index.h", 52 | "include/test/multiply.h", 53 | ], 54 | flags="-std=c++11", 55 | srcs=[ 56 | "src/sharedmemory.cu.cc" 57 | ], 58 | ) 59 | 60 | cuda_binary( 61 | name="tune-example", 62 | includes=[ 63 | "/usr/local/cuda/include", 64 | "." 65 | ], 66 | hdrs=[ 67 | "include/cuda_utils.h", 68 | "include/test/multiply.h", 69 | ], 70 | flags="-std=c++11", 71 | srcs=[ 72 | "src/tune.cu.cc" 73 | ], 74 | 75 | ) 76 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED( VERSION 2.8 ) 2 | project(example) 3 | 4 | set(CMAKE_CXX_STANDARD 11) 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Ofast -Wall -Wextra ") 7 | enable_testing() 8 | 9 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) 10 | 11 | find_package(GTest REQUIRED) 12 | include_directories(${GTEST_INCLUDE_DIRS}) 13 | 14 | find_package(CUDA) 15 | include(CudaHelper) 16 | 17 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}) 18 | 19 | add_library(multiply_cpu SHARED src/multiply/multiply_cpu.cc src/multiply/multiply.cc ) 20 | list(APPEND LIBRARIES "multiply_cpu") 21 | 22 | if(CUDA_FOUND) 23 | cuda_add_library(multiply_gpu SHARED src/multiply/multiply_gpu.cu.cc src/multiply/multiply.cc ) 24 | list(APPEND LIBRARIES "multiply_gpu") 25 | 26 | cuda_add_executable(sharedmemory src/sharedmemory.cu.cc ) 27 | cuda_add_executable(tune src/tune.cu.cc ) 28 | endif(CUDA_FOUND) 29 | 30 | add_executable(multiply src/multiply.cc ) 31 | target_link_libraries(multiply LINK_PUBLIC ${LIBRARIES}) 32 | 33 | # Benchmark 34 | if(CUDA_FOUND) 35 | cuda_add_executable(benchmark 36 | src/benchmark-multiply.cu.cc 37 | src/multiply/multiply_gpu.cu.cc 38 | src/multiply/multiply.cc 39 | ) 40 | endif(CUDA_FOUND) 41 | 42 | # TESTS 43 | # TODO(): find a more robust way to link gmock, see docker file for setup 44 | # This currently assumes gmock.a is next to gtest.a 45 | link_directories($ENV{GTEST_ROOT}/lib) 46 | add_executable(test_cpu test/test_multiply.cc) 47 | target_link_libraries(test_cpu ${GTEST_LIBRARIES} gmock ${LIBRARIES} pthread) 48 | 49 | add_test(TestCpu test_cpu) 50 | 51 | if(TEST_CUDA) 52 | cuda_add_executable(test_gpu test/test_multiply.cu.cc) 53 | target_link_libraries(test_gpu ${GTEST_LIBRARIES} gmock 
${LIBRARIES} pthread)
54 | add_test(TestGpu test_gpu)
55 | endif(TEST_CUDA)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CUDA Design Patterns
2 | 
3 | Some best practices I have collected over the years of writing CUDA kernels. These helpers
4 | do not dictate how to use CUDA; they just simplify your workflow. I am not a big fan of libraries that merely rename things via wrappers. All code below adds a genuine benefit to CUDA programming.
5 | 
6 | ## CUDA Boilerplate Code
7 | 
8 | [EXAMPLE](./src/multiply/multiply_gpu.cu.cc)
9 | 
10 | **Description:**
11 | Avoid plain CUDA kernel functions; instead, pack each kernel into a struct.
12 | 
13 | 
14 | ```cpp
15 | template <typename ValueT>
16 | struct MyKernel : public cuda::Kernel {
17 |   void Launch(cudaStream_t stream = 0) {
18 |     cuda::Run<<<1, 1, 0, stream>>>(*this);
19 |   }
20 |   __device__ __forceinline__ void operator()() const override {
21 |     printf("hi from device code with value %f\n", val);
22 |   }
23 | 
24 |   ValueT val;
25 | };
26 | 
27 | MyKernel<float> kernel;
28 | kernel.val = 42.f;
29 | kernel.Launch();
30 | ```
31 | 
32 | **Reasons:**
33 | 
34 | - This allows a much better organization of the used parameters. We recommend
35 |   writing them at the end of the struct, so that they are always visible
36 |   while writing the CUDA kernel itself.
37 | - These structs can contain or compute the launch configuration (grid, block, shm size) depending on the parameters.
38 | - Multiple kernel launches require less code, as we do not need to type out all parameters again for a second or third launch.
39 | 
40 | 
41 | ## Functors
42 | 
43 | [EXAMPLE](./src/multiply.cc)
44 | 
45 | **Description:**
46 | Use templated `structs` to switch seamlessly between CPU and GPU code:
47 | 
48 | ```cpp
49 | Multiply<CpuDevice, float>::Apply(A, B, 2, 2, C);  // run on CPU
50 | Multiply<GpuDevice, float>::Apply(A, B, 2, 2, C);  // run on GPU
51 | Multiply<AnyDevice, float>::Apply(A, B, 2, 2, C);  // run on GPU if available, else on CPU
52 | ```
53 | 
54 | **Reasons:**
55 | 
56 | - Switching between different devices is straightforward.
57 | - Unit tests which compare and verify the outputs become easier to understand.
58 | 
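The pattern behind this needs very little code. The following is a minimal sketch: the device tags `CpuDevice`, `GpuDevice`, and `AnyDevice` are the ones defined in `include/cuda_utils.h`, while the loop body is illustrative only and not the repository's exact implementation.

```cpp
// Primary template: declares the interface only.
template <typename Device, typename ValueT>
struct Multiply {
  static void Apply(const ValueT* A, const ValueT* B, const int H,
                    const int W, ValueT* C);
};

// CPU specialization, defined in a plain .cc file. For brevity this
// sketch assumes square H-by-W inputs, like the unit tests use.
template <typename ValueT>
struct Multiply<CpuDevice, ValueT> {
  static void Apply(const ValueT* A, const ValueT* B, const int H,
                    const int W, ValueT* C) {
    for (int h = 0; h < H; ++h) {
      for (int w = 0; w < W; ++w) {
        ValueT sum = 0;
        for (int k = 0; k < W; ++k) sum += A[h * W + k] * B[k * W + w];
        C[h * W + w] = sum;
      }
    }
  }
};
```

The `GpuDevice` specialization lives in a `.cu.cc` file and launches the CUDA kernel; `AnyDevice` forwards to the GPU version when compiled with CUDA and to the CPU version otherwise.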
59 | ## Shared Memory
60 | 
61 | [EXAMPLE](./src/sharedmemory.cu.cc)
62 | 
63 | Use
64 | 
65 | ```cpp
66 | cuda::SharedMemory shm;
67 | float* floats_5 = shm.ref<float>(5);
68 | int* ints_3 = shm.ref<int>(3);
69 | ```
70 | 
71 | instead of
72 | 
73 | ```cpp
74 | extern __shared__ char shm[];
75 | float* val1 = reinterpret_cast<float*>(&shm[0]);                  // 5 floats
76 | int* val2 = reinterpret_cast<int*>(&shm[5 * sizeof(float)]);      // 3 ints
77 | ```
78 | 
79 | 
80 | **Reasons:**
81 | 
82 | - The number of values to read for a specific data type stays on the same line as the declaration. This way, adding further shared memory becomes easier during development.
83 | 
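The same struct also runs on the host to compute the shared-memory byte count for the launch, as documented in `include/cuda_utils.h`. A short sketch (the kernel name and launch configuration are illustrative):

```cpp
// Host side, before the launch: request the same regions in the same
// order as the device-side ref<T>() calls.
cuda::SharedMemory shm;
shm.add<float>(5);  // matches shm.ref<float>(5) inside the kernel
shm.add<int>(3);    // matches shm.ref<int>(3) inside the kernel
my_kernel<<<grid, block, shm.bytes, stream>>>(/* args */);
```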
84 | ## CUDA Kernel Dispatcher
85 | 
86 | [EXAMPLE](./src/tune.cu.cc)
87 | 
88 | Like in the *CUDA Boilerplate Code* example, we pack our kernels into structs and use template specialization for different hyper-parameters.
89 | 
90 | Given a generic CUDA kernel and a specialization
91 | 
92 | ```cpp
93 | template <typename ValueT, int TLength>
94 | struct MyKernel : public cuda::Kernel {};
95 | 
96 | template <typename ValueT>
97 | struct MyKernel<ValueT, 3> : public cuda::Kernel {};
98 | ```
99 | 
100 | we use the kernel dispatcher
101 | 
102 | ```cpp
103 | MyKernel<float, 3> kernelA;
104 | MyKernel<float, 6> kernelB;
105 | 
106 | cuda::KernelDispatcher<int> dispatcher(true);
107 | dispatcher.Register<MyKernel<float, 3>>(3);  // for length up to 3 (inclusive) start MyKernel<float, 3>
108 | dispatcher.Register<MyKernel<float, 6>>(6);  // for length up to 6 (inclusive) start MyKernel<float, 6>
109 | // as `dispatcher(true)` this kernel will handle all
110 | // larger values as well
111 | int i = 4;          // a runtime value
112 | dispatcher.Run(i);  // triggers `kernelB`
113 | ```
114 | 
115 | The dispatcher can also handle multi-dimensional bounds and an initializer
116 | 
117 | ```cpp
118 | struct Initializer {
119 |   template <typename T>
120 |   void operator()(T* el) {
121 |     el->val = 42.f;
122 |   }
123 | };
124 | Initializer init;
125 | cuda::KernelDispatcher<std::tuple<int, int>> disp(true);
126 | disp.Register<MyKernel<float, 4>>(std::make_tuple(4, 3), init);
127 | disp.Register<MyKernel<float, 9>>(std::make_tuple(9, 4), init);
128 | ```
129 | 
130 | **Reasons:**
131 | 
132 | - Changing the block dimensions has a performance impact. A templated CUDA kernel can provide specialized implementations for different hyper-parameters.
133 | - A switch-statement dispatching run-time values to templated instantiations requires code duplication, which the dispatcher avoids.
134 | 
135 | ## CUDA Index Calculation
136 | 
137 | [EXAMPLE](./src/deprecated_examples.cu_old)
138 | 
139 | Where appropriate, do not compute indices by hand; use
140 | 
141 | ```cpp
142 | // or even ...
143 | // Used 8 registers, 368 bytes cmem[0]
144 | __global__ void readme_alternative2(float *src, float *dst,
145 |                                     int B, int H, int W, int C,
146 |                                     int b, int h, int w, int c) {
147 |   auto src_T = NdArray<float, 4>(src, B, H, W, C);
148 |   auto dst_T = NdArray<float, 4>(dst, B, H, W, C);
149 |   dst_T(b, h, w, c + 1) = src_T(b, h, w, c);
150 | 
151 |   // Unflatten the index.
152 |   auto index = NdIndex<4>(B, H, W, C);
153 |   size_t flattened_index = index(b, h, w, c);
154 | 
155 |   size_t b_ = 0, h_ = 0, w_ = 0, c_ = 0;
156 |   index.unflatten(flattened_index, b_, h_, w_, c_);
157 | }
158 | ```
159 | 
160 | instead of
161 | 
162 | ```cpp
163 | // spot the bug
164 | // Used 6 registers, 368 bytes cmem[0]
165 | __global__ void readme_normal(float *src, float *dst,
166 |                               int B, int H, int W, int C,
167 |                               int b, int h, int w, int c) {
168 |   const int pos1 = b * (H * W * C) + h * (W * c) + w * (C) + c;
169 |   const int pos2 = b * (H * W * C) + h * (W * C) + w * (C) + (c + 1);
170 |   dst[pos2] = src[pos1];
171 | }
172 | ```
173 | 
174 | **Reasons**:
175 | 
176 | - It is time-consuming and not worthwhile to concern yourself with index calculations. When writing CUDA code, you usually have many more important things to ponder.
177 | - Each additional hand-typed character increases the chance of a bug!
178 | - **I'm sick and tired of manually typing the indices.**
179 | - NdArray can have a positive impact on the number of used registers.
180 | 
181 | **Cons:**
182 | 
183 | - The compiler might not be able to optimize the `NdArray` overhead "away".
184 | - NdArray can have a negative impact on the number of used registers.
185 | 
186 | ## CMake Setup
187 | 
188 | **Description:**
189 | Use CMake to configure which targets should be built. By default, `TEST_CUDA=ON` and `WITH_CUDA=OFF` are set.
190 | The workflow (for this repository) is:
191 | 
192 | ```bash
193 | mkdir build && cd build
194 | cmake -DCMAKE_BUILD_TYPE=Release ..
195 | # or, more specifically,
196 | cmake -DCMAKE_BUILD_TYPE=Release -DTEST_CUDA=ON -DCUDA_ARCH="52 60" ..
197 | make
198 | make test
199 | ```
200 | 
201 | **Reasons:**
202 | 
203 | - Most CI machines do not have a CUDA runtime installed. Whenever `WITH_CUDA=ON` is set, the test code for CUDA is built as well.
204 | - FindCUDA is likely more robust than a custom makefile.
205 | 
206 | ## Benchmark Kernels
207 | 
208 | [EXAMPLE](./src/benchmark-multiply.cu.cc)
209 | 
210 | **Description:**
211 | Like in the *CUDA Boilerplate Code* example, we pack our kernels into structs. We might want to benchmark different template arguments.
212 | 
213 | ```cpp
214 | cuda::KernelBenchmark<float> bench;
215 | bench.Case<multiply_kernels::Multiply<float, 2>>(init);
216 | bench.Case<multiply_kernels::Multiply<float, 4>>(init);
217 | bench.Case<multiply_kernels::Multiply<float, 8>>(init);
218 | bench.Case<multiply_kernels::Multiply<float, 16>>(init);
219 | bench.Case<multiply_kernels::Multiply<float, 32>>(init);
220 | bench.Start();
221 | ```
222 | 
223 | will give the output:
224 | 
225 | ```
226 | Using Device Number: 0
227 |   Device name: GeForce GTX 970
228 |   Memory Clock Rate (KHz): 3505000
229 |   Memory Bus Width (bits): 256
230 |   Peak Memory Bandwidth (GB/s): 224.320000
231 | 
232 | time 500.000000 - 1000.000000, iters: 5 - 100
233 |  - multiply_kernels::Multiply<float, 2>  took     2.826743 ms stats(iters: 100, var:     0.067757, stddev:     0.260302)
234 |  - multiply_kernels::Multiply<float, 4>  took     1.245100 ms stats(iters: 100, var:     0.019352, stddev:     0.139112)
235 |  - multiply_kernels::Multiply<float, 8>  took     0.574468 ms stats(iters: 100, var:     0.000003, stddev:     0.001616)
236 |  - multiply_kernels::Multiply<float, 16> took     0.502195 ms stats(iters: 100, var:     0.000002, stddev:     0.001380)
237 |  - multiply_kernels::Multiply<float, 32> took     0.510635 ms stats(iters: 100, var:     0.000001, stddev:     0.001121)
238 | 
239 | ```
240 | 
241 | ## Tools
242 | - [online CUDA calculator](http://cuda.patwie.com/) instead of the NVIDIA Excel sheet
243 | - [nvprof2json](https://github.com/PatWie/nvprof2json) to visualize NVIDIA profiling outputs in the Google Chrome browser (no dependencies, unlike NVIDIA nvvp)
--------------------------------------------------------------------------------
/WORKSPACE:
--------------------------------------------------------------------------------
1 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
2 | 
3 | http_archive(
4 |     name = "gtest",
5 |     url = "https://github.com/google/googletest/archive/release-1.8.1.zip",
6 |     sha256 = "927827c183d01734cc5cfef85e0ff3f5a92ffe6188e0d18e909c5efebf28a0c7",
7 |     build_file = "gtest.BUILD",
8 |     strip_prefix = "googletest-release-1.8.1",
9 | )
--------------------------------------------------------------------------------
/cmake/CudaHelper.cmake:
--------------------------------------------------------------------------------
1 | OPTION(TEST_CUDA "Build Tests for CUDA" ON)
2 | OPTION(SHOW_PTXAS "Show ptxas register usage" ON)
3 | set(CUDA_ARCH "" CACHE STRING "Target CUDA architectures; multiple are allowed")
4 | 
5 | # We use *.cu.cc as the default, as most tools do not understand .cu as CUDA.
6 | file(GLOB_RECURSE source_list "*.cu.cc")
7 | foreach(child ${source_list})
8 |   set_source_files_properties(${child} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
9 | endforeach()
10 | 
11 | 
12 | # Configure depending on whether CUDA is available.
13 | if(CUDA_FOUND)
14 | 
15 |   # We can only build CUDA tests if building CUDA is enabled.
16 | message(STATUS "Build with CUDA support") 17 | 18 | if(TEST_CUDA) 19 | message(STATUS "Build tests for CUDA") 20 | endif(TEST_CUDA) 21 | 22 | 23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWITH_CUDA ") 24 | include_directories(${CUDA_INCLUDE_DIRS}) 25 | 26 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 --expt-relaxed-constexpr -DWITH_CUDA ") 27 | 28 | # Xptxas dumps register usage 29 | if(SHOW_PTXAS) 30 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xptxas=-v") 31 | endif(SHOW_PTXAS) 32 | 33 | if(CMAKE_BUILD_TYPE STREQUAL "Release") 34 | message(STATUS "Build CUDA in ${CMAKE_BUILD_TYPE} mode") 35 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3 -Ofast") 36 | endif(CMAKE_BUILD_TYPE STREQUAL "Release") 37 | 38 | if(CUDA_ARCH STREQUAL "") 39 | # good defaults for CUDA Toolkit 8.x 40 | if(CUDA_VERSION_MAJOR MATCHES 8) 41 | set(CUDA_ARCH "35 37 52 60") 42 | endif(CUDA_VERSION_MAJOR MATCHES 8) 43 | 44 | # good defaults for CUDA Toolkit 9.x 45 | if(CUDA_VERSION_MAJOR MATCHES 9) 46 | set(CUDA_ARCH "35 52 60 70") 47 | endif(CUDA_VERSION_MAJOR MATCHES 9) 48 | 49 | # good defaults for CUDA Toolkit 10.x 50 | if(CUDA_VERSION_MAJOR MATCHES 10) 51 | set(CUDA_ARCH "35 52 60 70") 52 | endif(CUDA_VERSION_MAJOR MATCHES 10) 53 | endif(CUDA_ARCH STREQUAL "") 54 | 55 | # str replace ' ' with ; 56 | STRING(REGEX REPLACE " " ";" CUDA_ARCH ${CUDA_ARCH}) 57 | 58 | # set the compiler flags for each NV target 59 | foreach(target ${CUDA_ARCH}) 60 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_${target},code=\\\"sm_${target},compute_${target}\\\") 61 | endforeach(target ${CUDA_ARCH}) 62 | 63 | else(CUDA_FOUND) 64 | 65 | message(STATUS "Build CUDA and tests for CUDA are disabled") 66 | set(TEST_CUDA OFF) 67 | 68 | endif(CUDA_FOUND) -------------------------------------------------------------------------------- /cuda.bzl: -------------------------------------------------------------------------------- 1 | def _cuda_binary(ctx): 2 | default_flags = ctx.attr.flags + \ 3 | " -x=cu -Xcompiler \"-O3 -Ofast -Wall -Wextra -DWITH_CUDA\"" 4 | 5 | cmd = "/usr/local/cuda/bin/nvcc -D__CUDACC__ " 6 | cmd += " " + default_flags + " " 7 | 8 | for src in ctx.files.srcs: 9 | cmd += src.path 10 | 11 | executable = ctx.actions.declare_file(ctx.attr.name) 12 | cmd += " -o " + executable.path 13 | 14 | for include in ctx.attr.includes: 15 | cmd += " -I" + include 16 | 17 | ctx.actions.run_shell( 18 | outputs=[ctx.actions.declare_file(ctx.label.name)], 19 | inputs=ctx.files.srcs + ctx.files.hdrs, 20 | command=cmd, 21 | mnemonic="CudaCompile", 22 | progress_message="compile cuda", 23 | use_default_shell_env=True, 24 | ) 25 | 26 | return [DefaultInfo( 27 | files=depset([executable]), 28 | executable=executable, 29 | )] 30 | 31 | 32 | cuda_binary = rule( 33 | implementation=_cuda_binary, 34 | executable=True, 35 | attrs={ 36 | "flags": attr.string(default=""), 37 | "srcs": attr.label_list(default=[], allow_files=[".cc"]), 38 | "hdrs": attr.label_list(default=[], allow_files=[".h"]), 39 | "includes": attr.string_list(default=[]), 40 | "out": attr.output(mandatory=False), 41 | }, 42 | ) 43 | -------------------------------------------------------------------------------- /gtest.BUILD: -------------------------------------------------------------------------------- 1 | cc_library( 2 | visibility=["//visibility:public"], 3 | name="gtest", 4 | srcs=glob( 5 | include=[ 6 | "googletest/src/*.cc", 7 | "googletest/src/*.h", 8 | "googletest/include/gtest/**/*.h", 9 | "googlemock/src/*.cc", 10 | "googlemock/include/gmock/**/*.h", 11 | 
], 12 | exclude=[ 13 | "googletest/src/gtest-all.cc", 14 | "googletest/src/gtest_main.cc", 15 | "googlemock/src/gmock-all.cc", 16 | "googlemock/src/gmock_main.cc", 17 | ], 18 | ), 19 | hdrs=glob([ 20 | "googletest/include/gtest/*.h", 21 | "googlemock/include/gmock/*.h", 22 | ]), 23 | copts=select({ 24 | "//conditions:default": ["-pthread -DGTEST_HAS_TYPED_TEST_P"], 25 | }), 26 | defines=select({ 27 | "//conditions:default": [], 28 | }), 29 | includes=[ 30 | "googlemock", 31 | "googlemock/include", 32 | "googletest", 33 | "googletest/include", 34 | ], 35 | linkopts=select({ 36 | "//conditions:default": ["-pthread"], 37 | }), 38 | deps=select({ 39 | "//conditions:default": [], 40 | }), 41 | features=select({ 42 | "//conditions:default": [], 43 | }) 44 | ) 45 | -------------------------------------------------------------------------------- /include/cuda_benchmark.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Authors. All Rights Reserved. 2 | 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | * 15 | * Author: Patrick Wieschollek, , 2019 16 | * 17 | */ 18 | 19 | #ifndef INCLUDE_CUDA_BENCHMARK_H_ 20 | #define INCLUDE_CUDA_BENCHMARK_H_ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #ifdef __GNUG__ 34 | #include 35 | #include 36 | #endif 37 | #include "include/cuda_utils.h" 38 | 39 | namespace cuda { 40 | 41 | namespace internal { 42 | // taken from https://stackoverflow.com/a/4541470/7443104 43 | #ifdef __GNUG__ 44 | 45 | template 46 | std::string demangle(const char* name) { 47 | int status = -4; // some arbitrary value to eliminate the compiler warning 48 | 49 | // enable c++11 by passing the flag -std=c++11 to g++ 50 | std::unique_ptr res{ 51 | abi::__cxa_demangle(name, NULL, NULL, &status), std::free}; 52 | 53 | return (status == 0) ? res.get() : name; 54 | } 55 | 56 | #else 57 | 58 | // does nothing if not g++ 59 | template 60 | std::string demangle(const char* name) { 61 | return name; 62 | } 63 | 64 | #endif 65 | } // namespace internal 66 | 67 | /** 68 | * Benchmarks several templated kernels. 69 | * 70 | * cuda::KernelBenchmark bench; 71 | * bench.Case>(init); 72 | * bench.Case>(init); 73 | * bench.Case>(init); 74 | * bench.Case>(init); 75 | * bench.Case>(init); 76 | * bench.Start(); 77 | */ 78 | template 79 | class KernelBenchmark { 80 | using TLauncherFunc = std::function; 81 | using ValueT = std::tuple; 82 | 83 | // we test at most 1 second 84 | const float max_time_ms = 1000; 85 | // we test at least 0.5 second 86 | const float min_time_ms = 500; 87 | // we test at most 100 times 88 | const int min_iterations = 5; 89 | const int max_iterations = 100; 90 | const int device_id = 0; 91 | 92 | public: 93 | // Register a kernel. 
94 | // 95 | // Example 96 | // cuda::KernelDispatcher dispatcher; 97 | // dispatcher.Case>(); 98 | template 99 | void Case() { 100 | static_assert(internal::HasLaunchMethod::value, 101 | "The kernel struct needs to have a 'Launch()' method! " 102 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 103 | // NOTE: std::shared_ptr, std::unique_ptr does not work here 104 | // eg. std::shared_ptr kernel(new T()); 105 | // so we delete these objects by collecting them 106 | T* kernel = new T(); // needs to be on heap 107 | deleter_.push_back([&]() { delete kernel; }); 108 | Place([&kernel]() { kernel->Launch(); }); 109 | } 110 | 111 | // Register and intialize a kernel. 112 | // 113 | // Example 114 | // cuda::KernelDispatcher dispatcher; 115 | // initializer init; 116 | // dispatcher.Case>(init); 117 | template 118 | void Case(Initializer initializer) { 119 | static_assert(internal::HasLaunchMethod::value, 120 | "The kernel struct needs to have a 'Launch()' method! " 121 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 122 | // NOTE: std::shared_ptr, std::unique_ptr does not work here 123 | // eg. std::shared_ptr kernel(new T()); 124 | // so we delete these objects by collecting them 125 | T* kernel = new T(); // needs to be on heap 126 | deleter_.push_back([&]() { delete kernel; }); 127 | initializer(kernel); 128 | Place([&kernel]() { kernel->Launch(); }); 129 | } 130 | 131 | KernelBenchmark() = default; 132 | KernelBenchmark(float min_time_ms, float max_time_ms, int min_iterations, 133 | int max_iterations) 134 | : min_time_ms(min_time_ms), 135 | max_time_ms(max_time_ms), 136 | min_iterations(min_iterations), 137 | max_iterations(max_iterations) {} 138 | 139 | virtual ~KernelBenchmark() { 140 | for (auto del : deleter_) { 141 | del(); 142 | } 143 | } 144 | 145 | void DeviceInfo() { 146 | ASSERT_CUDA(cudaSetDevice(device_id)); 147 | cudaDeviceProp prop; 148 | ASSERT_CUDA(cudaGetDeviceProperties(&prop, device_id)); 149 | printf("Using Device Number: %d\n", device_id); 150 | printf(" Device name: %s\n", prop.name); 151 | printf(" Memory Clock Rate (KHz): %d\n", prop.memoryClockRate); 152 | printf(" Memory Bus Width (bits): %d\n", prop.memoryBusWidth); 153 | printf(" Peak Memory Bandwidth (GB/s): %f\n\n", 154 | 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6); 155 | } 156 | 157 | void Start() { 158 | #if __CUDACC__ 159 | cudaEvent_t start, stop; 160 | cudaEventCreate(&start); 161 | cudaEventCreate(&stop); 162 | 163 | int longest_name_len = 0; 164 | for (auto&& kernel : kernels_) { 165 | int len = std::get<1>(kernel).length(); 166 | if (len > longest_name_len) { 167 | longest_name_len = len; 168 | } 169 | } 170 | DeviceInfo(); 171 | printf("time %f - %f, iters: %d - %d\n", min_time_ms, max_time_ms, 172 | min_iterations, max_iterations); 173 | 174 | for (auto&& kernel : kernels_) { 175 | const std::string name = std::get<1>(kernel); 176 | printf(" - %-*s ", longest_name_len + 1, name.c_str()); 177 | 178 | // burn in 179 | for (int i = 0; i < 5; ++i) { 180 | cudaEventRecord(start); 181 | std::get<0>(kernel)(); 182 | cudaEventRecord(stop); 183 | } 184 | 185 | // real measurement 186 | float total_milliseconds = 0; 187 | int used_iterations = 0; 188 | 189 | float old_mean_time = 0; 190 | float cur_mean_time = 0; 191 | float old_var_time = 0.0; 192 | float cur_var_time = 0.0; 193 | 194 | for (int counter = 0; counter < max_iterations; 195 | counter++, used_iterations++) { 196 | // measure kernel 197 | float milliseconds = 0; 198 | cudaEventRecord(start); 199 | std::get<0>(kernel)(); 200 | cudaEventRecord(stop); 201 
| ASSERT_CUDA(cudaPeekAtLastError()); 202 | ASSERT_CUDA(cudaDeviceSynchronize()); 203 | cudaEventSynchronize(stop); 204 | cudaEventElapsedTime(&milliseconds, start, stop); 205 | total_milliseconds += milliseconds; 206 | 207 | // Estimate if the result is stable enough to be reported. 208 | // We want to run at least two runs (variance needs this). 209 | if (counter > 0) { 210 | // Update running statistics. 211 | cur_mean_time = 212 | old_mean_time + (milliseconds - old_mean_time) / (counter + 1); 213 | cur_var_time = old_var_time + (milliseconds - old_mean_time) * 214 | (milliseconds - cur_mean_time); 215 | 216 | old_var_time = cur_var_time; 217 | old_mean_time = cur_mean_time; 218 | 219 | if (total_milliseconds <= min_time_ms) { 220 | continue; 221 | } 222 | 223 | // We can stop if it took already too long. 224 | if (total_milliseconds > max_time_ms) { 225 | break; 226 | } 227 | 228 | // We want at least some iterations. 229 | if (counter >= min_iterations) { 230 | // Is std-dev small enough? 231 | float real_stdev = sqrt(cur_var_time / (used_iterations - 1)); 232 | if (real_stdev < 0.01 * cur_mean_time) { 233 | break; 234 | } 235 | } 236 | 237 | } else { 238 | old_mean_time = milliseconds; 239 | cur_mean_time = milliseconds; 240 | } 241 | } 242 | 243 | printf(" took %12f ms stats(iters: %3d, var: %12f, stddev: %12f)\n", 244 | total_milliseconds / used_iterations, used_iterations, 245 | cur_var_time / (used_iterations - 1), 246 | sqrt(cur_var_time / (used_iterations - 1))); 247 | } 248 | cudaEventDestroy(start); 249 | cudaEventDestroy(stop); 250 | 251 | #endif // __CUDACC__ 252 | } 253 | 254 | private: 255 | template 256 | void Place(TLauncherFunc&& launch_func) { 257 | kernels_.push_back( 258 | std::move(std::make_tuple(std::forward(launch_func), 259 | internal::demangle<0>(typeid(T).name())))); 260 | } 261 | 262 | std::vector deleter_; 263 | std::vector kernels_; 264 | bool extend = true; // if true kernel with largest bound will act as default 265 | }; 266 | } // namespace cuda 267 | 268 | #endif // INCLUDE_CUDA_BENCHMARK_H_ 269 | -------------------------------------------------------------------------------- /include/cuda_index.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Authors. All Rights Reserved. 2 | 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | * 15 | * Author: Patrick Wieschollek, , 2018 16 | * 17 | */ 18 | 19 | #ifndef INCLUDE_CUDA_INDEX_H_ 20 | #define INCLUDE_CUDA_INDEX_H_ 21 | 22 | #if __CUDACC__ 23 | 24 | #define cuda_inline __device__ __host__ __forceinline__ 25 | 26 | #include 27 | #include 28 | 29 | namespace cuda { 30 | 31 | namespace internal { 32 | 33 | template 34 | struct pitch_helper { 35 | constexpr size_t call(const size_t dimensions_[TRank]) const { 36 | return pitch_helper().call( 37 | dimensions_); 38 | } 39 | }; 40 | 41 | template 42 | struct pitch_helper { 43 | constexpr size_t call(const size_t dimensions_[TRank]) const { 44 | return dimensions_[TPos] * 45 | pitch_helper().call(dimensions_); 46 | } 47 | }; 48 | 49 | template 50 | struct pitch_helper { 51 | constexpr size_t call(const size_t dimensions_[TRank]) const { return 1; } 52 | }; 53 | 54 | template 55 | struct position_helper { 56 | constexpr size_t call(const size_t dimensions_[TRank], T v, Ts... is) const { 57 | return v * pitch_helper().call( 58 | dimensions_) + 59 | position_helper().call(dimensions_, 60 | is...); 61 | } 62 | }; 63 | 64 | template 65 | struct position_helper { 66 | constexpr size_t call(const size_t dimensions_[TRank], T v) const { 67 | return v; 68 | } 69 | }; 70 | 71 | template 72 | struct unflatten_helper { 73 | template 74 | static constexpr void call(const size_t dimensions_[TRank], 75 | size_t flattenedIndex, size_t& index, 76 | Ts&... indices) noexcept { 77 | const size_t pitch = 78 | pitch_helper().call(dimensions_); 79 | index = flattenedIndex / pitch; 80 | unflatten_helper::call( 81 | dimensions_, flattenedIndex % pitch, indices...); 82 | } 83 | }; 84 | 85 | template 86 | struct unflatten_helper { 87 | template 88 | static constexpr void call(const size_t dimensions_[TRank], 89 | size_t flattenedIndex, size_t& index, 90 | Ts&... indices) noexcept { 91 | index = flattenedIndex; 92 | } 93 | }; 94 | 95 | }; // namespace internal 96 | 97 | template 98 | struct BaseNdIndex { 99 | protected: 100 | size_t dimensions_[TRank]; 101 | 102 | public: 103 | template 104 | explicit constexpr cuda_inline BaseNdIndex(size_t i0, Ts... is) noexcept 105 | : dimensions_{i0, is...} {} 106 | 107 | /** 108 | * Check whether given coordinate is in range. 109 | */ 110 | template 111 | constexpr cuda_inline bool valid(size_t i0, Ts... is) const noexcept { 112 | static_assert(size_t(1) + sizeof...(Ts) == TRank, 113 | "Number of dimensions does not match rank! " 114 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 115 | return valid_impl<0, Ts...>(i0, is...); 116 | } 117 | 118 | /** 119 | * Return the number of axes. 120 | * @return number of axes 121 | */ 122 | constexpr cuda_inline size_t rank() const noexcept { return TRank; } 123 | 124 | /** 125 | * Return the dimension for a given axis. 126 | * 127 | * const size_t D = my_nd_array.template dim<1>(); 128 | * 129 | * @return dimension for given axis 130 | */ 131 | template 132 | constexpr cuda_inline size_t dim() const noexcept { 133 | static_assert(TAxis < TRank, "axis < rank failed"); 134 | return dimensions_[TAxis]; 135 | } 136 | 137 | /** 138 | * Unflatten a flattened index and retrieve the corresponding 139 | * indices for each dimension. 140 | * 141 | * size_t i=0, j=0, k=0; 142 | * idx.unflatten(flattenedIndex, i, j, k); 143 | * 144 | * @param flattenedIndex the flattened index to unflatten 145 | * @param indices references to variables to store the indices 146 | */ 147 | template 148 | constexpr cuda_inline void unflatten(size_t flattenedIndex, 149 | Ts&... 
indices) const noexcept { 150 | static_assert(sizeof...(Ts) == TRank, 151 | "Number of indices does not match rank! " 152 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 153 | internal::unflatten_helper::call( 154 | dimensions_, flattenedIndex, indices...); 155 | } 156 | 157 | private: 158 | template 159 | constexpr cuda_inline bool valid_impl(size_t i0, Ts... is) const { 160 | return (i0 < dimensions_[TNum]) && valid_impl(is...); 161 | } 162 | 163 | template 164 | constexpr cuda_inline bool valid_impl(T i0) const { 165 | return (i0 < dimensions_[TRank - 1]); 166 | } 167 | 168 | protected: 169 | template 170 | constexpr cuda_inline size_t index_(size_t i0, Ts... is) const { 171 | return internal::position_helper().call( 172 | dimensions_, i0, is...); 173 | } 174 | }; 175 | 176 | /** 177 | * Create an index object. 178 | * 179 | * The index object can handle various dimensions. 180 | * 181 | * auto idx = NdIndex<4>(B, H, W, C); 182 | * auto TPos = idx(b, h, w, c); 183 | * 184 | * @param rank in each dimensions. 185 | */ 186 | template 187 | struct NdIndex : public BaseNdIndex { 188 | public: 189 | template 190 | explicit constexpr cuda_inline NdIndex(size_t i0, Ts... is) noexcept 191 | : BaseNdIndex(i0, is...) { 192 | static_assert(size_t(1) + sizeof...(Ts) == TRank, 193 | "Number of dimensions does not match rank! " 194 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 195 | } 196 | 197 | /** 198 | * Get flattened index for a given position. 199 | * 200 | * auto idx = NdIndex<4>(10, 20, 30, 40); 201 | * size_t actual = idx(1, 2, 3, 4); 202 | * size_t expected = 1 * (20 * 30 * 40) + 2 * (30 * 40) + 3 * (40) + 4; 203 | */ 204 | template 205 | size_t cuda_inline operator()(size_t i0, Ts... is) const { 206 | static_assert(size_t(1) + sizeof...(Ts) == TRank, 207 | "Number of dimensions does not match rank! " 208 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 209 | return index_(i0, is...); 210 | } 211 | 212 | /** 213 | * Get dimension for a given axis. 214 | * 215 | * auto idx = NdIndex<4>(10, 20, 30, 40); 216 | * size_t actual = idx[1]; // is 20 217 | */ 218 | template 219 | size_t cuda_inline operator[](size_t i0) const { 220 | return BaseNdIndex::dimensions_[i0]; 221 | } 222 | }; 223 | 224 | template 225 | struct NdArray : public BaseNdIndex { 226 | T* data_; 227 | 228 | public: 229 | template 230 | explicit constexpr cuda_inline NdArray(T* data, size_t i0, Ts... is) noexcept 231 | : BaseNdIndex(i0, is...), data_(data) {} 232 | 233 | /** 234 | * Returns value from given position if valid, else 0; 235 | * 236 | * auto T = make_ndarray(data, A, B, C); 237 | * auto val = T.safe_value(a, b, c); 238 | * 239 | * is equal 240 | * 241 | * auto T = make_ndarray(data, A, B, C); 242 | * auto val = T.valid(a, b, c) ? T(a, b, c) : 0; 243 | */ 244 | template 245 | T cuda_inline safe_value(size_t i0, Ts... is) const { 246 | return valid(i0, is...) ? data_[index(i0, is...)] : 0; 247 | } 248 | 249 | /** 250 | * Returns value from given position if valid, else 0; 251 | * 252 | * auto T = make_ndarray(data, A, B, C); 253 | * auto val = T(a, b, c); 254 | */ 255 | template 256 | T cuda_inline operator()(size_t i0, Ts... is) const { 257 | return data_[index(i0, is...)]; 258 | } 259 | 260 | /** 261 | * Write value at given position. 262 | * 263 | * auto T = make_ndarray(data, A, B, C); 264 | * T(a, b, c) = 42; 265 | */ 266 | template 267 | T& __device__ __host__ operator()(size_t i0, Ts... 
is) { 268 | return data_[index(i0, is...)]; 269 | } 270 | 271 | /** 272 | * Wrap c-array read access 273 | */ 274 | template 275 | T cuda_inline operator[](size_t i0) const { 276 | return data_[i0]; 277 | } 278 | 279 | /** 280 | * Wrap c-array write access 281 | */ 282 | template 283 | T& __device__ __host__ operator[](size_t i0) { 284 | return data_[i0]; 285 | } 286 | 287 | /** 288 | * Returns index from given position. 289 | * auto T = make_ndarray(data, A, B, C); 290 | * size_t TPos = T.index(a, b, c); 291 | */ 292 | template 293 | constexpr cuda_inline TT index(TT i0, Ts... is) const { 294 | static_assert(size_t(1) + sizeof...(Ts) == TRank, 295 | "Number of dimensions does not match rank! " 296 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 297 | return index_(i0, is...); 298 | } 299 | 300 | T* flat() { return data_; } 301 | }; 302 | 303 | /** 304 | * Create a multi-dim. array object but ensures rank. 305 | * 306 | * The multi-dim. array object is a combination of a flat array and nd-index. 307 | * 308 | * const float* M = ...; 309 | * auto Mt = make_ndarray(M, B, H, W, C); 310 | * float val = Mt(b, h, w, c); 311 | * 312 | * @param rank in each dimensions. 313 | */ 314 | template 315 | cuda_inline auto make_ndarray(T* arr, size_t N0, Ts... Ns) 316 | -> NdArray { 317 | static_assert(size_t(1) + sizeof...(Ts) == TRank, 318 | "Number of dimensions does not match rank! " 319 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 320 | return NdArray(arr, N0, Ns...); 321 | } 322 | 323 | }; // namespace cuda 324 | 325 | #undef cuda_inline 326 | 327 | #endif // __CUDACC__ 328 | 329 | #endif // INCLUDE_CUDA_INDEX_H_ 330 | -------------------------------------------------------------------------------- /include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Authors. All Rights Reserved. 2 | 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | * 15 | * Author: Patrick Wieschollek, , 2019 16 | * Fabian Groh, , 2019 17 | * 18 | */ 19 | 20 | #ifndef INCLUDE_CUDA_UTILS_H_ 21 | #define INCLUDE_CUDA_UTILS_H_ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | // Template parameter for compile-time cuda drop-in replacements of cpu 34 | // functions. 35 | struct CpuDevice { 36 | static const int device_id = 1; 37 | }; 38 | 39 | struct GpuDevice { 40 | static const int device_id = 2; 41 | }; 42 | 43 | struct AnyDevice { 44 | static const int device_id = 0; 45 | }; 46 | 47 | #if __CUDACC__ 48 | // __CUDACC__ is defined by nvcc on device and host 49 | // __CUDA_ARCH__ is defined by nvcc on device 50 | 51 | /** 52 | * This is the default way of testing whether executing the CUDA kernel has been 53 | * successfull. 
54 | * 55 | * Example: 56 | * Mykernel kernel; 57 | * kernel.Launch(); 58 | * ASSERT_CUDA(cudaDeviceSynchronize()); 59 | * 60 | * @param ans is a function that returns a cudaError_t 61 | * taken from: https://stackoverflow.com/a/14038590 62 | */ 63 | // #if NDEBUG 64 | // // disable assert in production code 65 | // #define ASSERT_CUDA(ans) ((void)ans) 66 | // #else // NDEBUG 67 | #define ASSERT_CUDA(ans) \ 68 | { gpuAssert((ans), __FILE__, __LINE__); } 69 | inline void gpuAssert(cudaError_t code, const char* file, int line, 70 | bool abort = true) { 71 | if (code != cudaSuccess) { 72 | fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, 73 | line); 74 | if (abort) exit(code); 75 | } 76 | } 77 | // #endif // NDEBUG 78 | 79 | namespace cuda { 80 | 81 | /** 82 | * Compute the number of blocks for a given number of threads and a workload. 83 | * @param N number of workload instance 84 | * @param num_threads number of threads per block 85 | * @return number of required blocks 86 | */ 87 | __host__ __device__ __forceinline__ int divUp(int N, int num_threads) { 88 | return (N + num_threads - 1) / num_threads; 89 | } 90 | 91 | // Kernel is an abstract CUDA kernel, which can have attached values to avoid 92 | // lengthly function signatures. 93 | struct Kernel { 94 | /** 95 | * Launch contains the computation of all kernel parameters and executes 96 | * the CUDA call. 97 | * 98 | * This should include the computation of the kernel configuration like 99 | * gridDim, blockDim, shared_memory size. We enforce to use a cuda stream. 100 | * 101 | * Example: 102 | * void Launch(cudaStream_t stream = 0){ 103 | * cuda::Run<<<1, 1, 0, stream>>>(*this); 104 | * } 105 | * 106 | * @param stream used cuda stream 107 | */ 108 | virtual void Launch(cudaStream_t stream = 0) = 0; 109 | virtual ~Kernel() = default; 110 | 111 | /** 112 | * This operation contains the code, which will be executed on-chip. 113 | */ 114 | virtual __device__ __forceinline__ void operator()() const = 0; 115 | }; 116 | 117 | // Run a cuda kernel encapsulated in a struct. 118 | // The kernel should have the following format 119 | // 120 | // struct Kernel { 121 | // void Launch(cudaStream_t stream = 0); 122 | // __device__ __forceinline__ void operator()(); 123 | // }; 124 | // 125 | template 126 | __global__ void Run(const T kernel) { 127 | kernel(); 128 | } 129 | 130 | /** 131 | * Proxy for shared memory when used in templates to avoid double extern. 132 | * 133 | * run_kernel<<>>(...) 134 | * 135 | * T* s_shm = MixedSharedMemory(); 136 | * T* s_el1 = (T*)&s_shm[0]; 137 | * T* s_el2 = (T*)&s_shm[10]; // or use cuda::SharedMemory 138 | * 139 | * @param rank in each dimensions. 140 | */ 141 | template 142 | __device__ T* MixedSharedMemory() { 143 | extern __shared__ __align__(sizeof(T)) unsigned char s_shm[]; 144 | return reinterpret_cast(s_shm); 145 | } 146 | 147 | /** 148 | * Extracting multiple values from shared memory of different types. 
149 | * 150 | * Example: 151 | * SharedMemory shm; 152 | * shm.add(5); 153 | * shm.add(3); 154 | * shm.add(2); 155 | * 156 | * kernel<<<...,...,shm.bytes>>>(); 157 | * 158 | * and inside the CUDA kernel 159 | * 160 | * SharedMemory shm; 161 | * float* shm_1 = shm.ref(5); 162 | * int* shm_2 = shm.ref(3); 163 | * float* shm_3 = shm.ref(2); 164 | */ 165 | struct SharedMemory { 166 | int bytes = 0; 167 | unsigned char* shm_anchor; 168 | 169 | __host__ __device__ SharedMemory() { 170 | #if __CUDA_ARCH__ 171 | // inside device code we can declare shared memory and refer to it 172 | extern __shared__ unsigned char shm[]; 173 | shm_anchor = shm; 174 | #endif // __CUDA_ARCH__ 175 | } 176 | 177 | template 178 | __device__ T* ref(int num) { 179 | T* ptr = reinterpret_cast(&shm_anchor[bytes]); 180 | bytes += num * sizeof(T); 181 | return ptr; 182 | } 183 | 184 | template 185 | __host__ void add(int num) { 186 | bytes += num * sizeof(T); 187 | } 188 | }; 189 | 190 | }; // namespace cuda 191 | 192 | #endif // __CUDACC__ 193 | 194 | namespace cuda { 195 | 196 | namespace internal { 197 | template 198 | class HasLaunchMethod { 199 | private: 200 | typedef char yes[1]; 201 | typedef char no[2]; 202 | 203 | template 204 | static yes& verify(decltype(&C::Launch)); 205 | template 206 | static no& verify(...); 207 | 208 | public: 209 | enum { value = sizeof(verify(0)) == sizeof(yes) }; 210 | }; 211 | 212 | }; // namespace internal 213 | 214 | /** 215 | * Dispatch template kernels according to a hyper parameter. 216 | * 217 | * ExpertKernel kernelA; 218 | * ExpertKernel kernelB; 219 | * cuda::KernelDispatcher disp(false); 220 | * 221 | * disp.Register(3, kernelA); // for length up to 3 (inclusive) start kernelA 222 | * disp.Register(6, kernelB); // for length up to 6 (inclusive) start kernelB 223 | * 224 | * int i = 6; // runtime variable 225 | * disp.Run(i - 1); // launches kernelA 226 | * disp.Run(i); // launches kernelB 227 | * disp.Run(i + 1); // triggers runtime exeception because of 228 | * // `disp(false)` 229 | */ 230 | template > 231 | class KernelDispatcher { 232 | using TLauncherFunc = std::function; 233 | using TLauncherFuncMap = std::map; 234 | 235 | public: 236 | explicit KernelDispatcher(bool extend = true) : extend(extend) {} 237 | 238 | // Register a instantiated kernel. 239 | // 240 | // Example 241 | // cuda::KernelDispatcher dispatcher; 242 | // kernel instance; 243 | // dispatcher.Register(y, &instance); 244 | template 245 | void Register(KeyT bound, T* kernel) { 246 | static_assert(internal::HasLaunchMethod::value, 247 | "The kernel struct needs to have a 'Launch()' method! " 248 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 249 | Place(bound, [&]() { kernel->Launch(); }); 250 | } 251 | 252 | // Register and intialize a instantiated kernel. 253 | // 254 | // Example 255 | // cuda::KernelDispatcher dispatcher; 256 | // kernel instance; 257 | // initializer init; 258 | // dispatcher.Register(y, &instance, init); 259 | template 260 | void Register(KeyT bound, T* kernel, Initializer initializer) { 261 | static_assert(internal::HasLaunchMethod::value, 262 | "The kernel struct needs to have a 'Launch()' method! " 263 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 264 | initializer(kernel); 265 | Place(bound, [&]() { kernel->Launch(); }); 266 | } 267 | 268 | // Register a kernel. 
269 | // 270 | // Example 271 | // cuda::KernelDispatcher dispatcher; 272 | // dispatcher.Register>(y); 273 | template 274 | void Register(KeyT bound) { 275 | static_assert(internal::HasLaunchMethod::value, 276 | "The kernel struct needs to have a 'Launch()' method! " 277 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 278 | T kernel; 279 | Place(bound, [&]() { kernel->Launch(); }); 280 | } 281 | 282 | // Register and intialize a kernel. 283 | // 284 | // Example 285 | // cuda::KernelDispatcher dispatcher; 286 | // initializer init; 287 | // dispatcher.Register>(y, init); 288 | template 289 | void Register(KeyT bound, Initializer initializer) { 290 | static_assert(internal::HasLaunchMethod::value, 291 | "The kernel struct needs to have a 'Launch()' method! " 292 | "YOU_MADE_A_PROGAMMING_MISTAKE"); 293 | T kernel; 294 | initializer(&kernel); 295 | Place(bound, [&]() { kernel.Launch(); }); 296 | } 297 | 298 | // // would require C++14 to use 299 | // // 300 | // // auto init = [&](auto& T){T.val = 42;}; 301 | // // disp.Register(3, kernelA, init); 302 | // template 303 | // void Register(KeyT bound, T& kernel, std::function init) { 304 | // init(kernel); 305 | // Register(bound, [&]() { 306 | // kernel.Launch(); 307 | // }); 308 | // } 309 | 310 | void Run(KeyT hyper) { 311 | typename TLauncherFuncMap::iterator detected_kernel = 312 | kernels_.lower_bound(hyper); 313 | if (detected_kernel == kernels_.end()) { 314 | if (extend) { 315 | // Assume kernel with largest bound is the generic version. 316 | kernels_.rbegin()->second(); 317 | } else { 318 | // const KeyT upper_bound = kernels_.rbegin()->first; 319 | throw std::runtime_error( 320 | "KernelDispatcher has no kernels registered for the parameter " 321 | "requested by the runtime. Use 'KernelDispatcher(true)' to extend" 322 | " the range of the last registered kernel."); 323 | } 324 | } else { 325 | // Found registered kernel and will launch it. 326 | detected_kernel->second(); 327 | } 328 | } 329 | 330 | private: 331 | template 332 | void Place(KeyT bound, TLauncherFunc&& launch_func) { 333 | kernels_[bound] = std::forward(launch_func); 334 | } 335 | 336 | TLauncherFuncMap kernels_; 337 | bool extend = true; // if true kernel with largest bound will act as default 338 | }; 339 | }; // namespace cuda 340 | 341 | #endif // INCLUDE_CUDA_UTILS_H_ 342 | -------------------------------------------------------------------------------- /include/multiply/multiply.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Authors. All Rights Reserved. 2 | 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
--------------------------------------------------------------------------------
/include/multiply/multiply.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Authors. All Rights Reserved.
2 | 
3 |  * Licensed under the Apache License, Version 2.0 (the "License");
4 |  * you may not use this file except in compliance with the License.
5 |  * You may obtain a copy of the License at
6 |  *
7 |  *    http://www.apache.org/licenses/LICENSE-2.0
8 |  *
9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, , 2019
16 |  *
17 |  */
18 | 
19 | #ifndef INCLUDE_MULTIPLY_MULTIPLY_H_
20 | #define INCLUDE_MULTIPLY_MULTIPLY_H_
21 | 
22 | #include "include/cuda_index.h"
23 | #include "include/cuda_utils.h"
24 | 
25 | #if __CUDACC__
26 | namespace multiply_kernels {
27 | 
28 | using cuda::make_ndarray;
29 | 
30 | // We follow the NVIDIA cub library style for template parameters.
31 | // BLOCK_DIM_X is the number of threads in a block along dimension x.
32 | template <typename ValueT, int BLOCK_DIM_X>
33 | struct Multiply : public cuda::Kernel {
34 |   // Use an enum for compile-time configuration values.
35 |   // enum { PER_THREAD = 1 };
36 | 
37 |   void Launch(cudaStream_t stream = 0) override {
38 |     dim3 block(BLOCK_DIM_X, BLOCK_DIM_X);
39 |     dim3 grid(cuda::divUp(W, BLOCK_DIM_X), cuda::divUp(H, BLOCK_DIM_X));
40 | 
41 |     cuda::Run<<<grid, block, 0, stream>>>(*this);
42 |   }
43 | 
44 |   __device__ __forceinline__ void operator()() const override {
45 |     __shared__ ValueT ds_M[BLOCK_DIM_X][BLOCK_DIM_X];
46 |     __shared__ ValueT ds_N[BLOCK_DIM_X][BLOCK_DIM_X];
47 | 
48 |     const int tx = threadIdx.x;
49 |     const int ty = threadIdx.y;
50 |     const int Ch = blockIdx.y * BLOCK_DIM_X + ty;
51 |     const int Cw = blockIdx.x * BLOCK_DIM_X + tx;
52 | 
53 |     ValueT Cval = 0;
54 | 
55 |     const auto At = make_ndarray(A, H, W);
56 |     const auto Bt = make_ndarray(B, H, W);
57 |     auto Ct = make_ndarray(C, H, W);
58 | 
59 |     for (int m = 0; m < (W - 1) / BLOCK_DIM_X + 1; ++m) {
60 |       if (At.valid(Ch, m * BLOCK_DIM_X + tx)) {
61 |         ds_M[ty][tx] = At(Ch, m * BLOCK_DIM_X + tx);
62 |       } else {
63 |         ds_M[ty][tx] = 0;
64 |       }
65 |       if (Bt.valid(m * BLOCK_DIM_X + ty, Cw)) {
66 |         ds_N[ty][tx] = Bt(m * BLOCK_DIM_X + ty, Cw);
67 |       } else {
68 |         ds_N[ty][tx] = 0;
69 |       }
70 |       __syncthreads();
71 | 
72 |       for (int k = 0; k < BLOCK_DIM_X; ++k) Cval += ds_M[ty][k] * ds_N[k][tx];
73 |       __syncthreads();
74 |     }
75 |     if (Ct.valid(Ch, Cw)) Ct(Ch, Cw) = Cval;
76 |   }
77 | 
78 |   int W;
79 |   int H;
80 |   const ValueT* A;
81 |   const ValueT* B;
82 |   ValueT* C;
83 | };
84 | 
85 | }  // namespace multiply_kernels
86 | #endif  // __CUDACC__
87 | 
88 | template <typename ValueT, typename Device>
89 | struct Multiply {
90 |   static void Apply(const ValueT* A, const ValueT* B, const int H, const int W,
91 |                     ValueT* C);
92 | };
93 | 
94 | #endif  // INCLUDE_MULTIPLY_MULTIPLY_H_
95 | 
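// How the tiled kernel above is sized: each thread block computes one
// BLOCK_DIM_X x BLOCK_DIM_X tile of C, so the grid covers the output extent,
// rounding up via cuda::divUp. A sketch of driving it by hand, assuming
// device pointers d_A, d_B, d_C of H * W floats are already populated (this
// mirrors what the host-side Apply in src/multiply/multiply_gpu.cu.cc does):
//
//   multiply_kernels::Multiply<float, 32> kernel;
//   kernel.H = H;
//   kernel.W = W;
//   kernel.A = d_A;
//   kernel.B = d_B;
//   kernel.C = d_C;
//   kernel.Launch();  // grid = (divUp(W, 32), divUp(H, 32)), block = (32, 32)
//   ASSERT_CUDA(cudaDeviceSynchronize());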
--------------------------------------------------------------------------------
/include/test/multiply.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Authors. All Rights Reserved.
2 | 
3 |  * Licensed under the Apache License, Version 2.0 (the "License");
4 |  * you may not use this file except in compliance with the License.
5 |  * You may obtain a copy of the License at
6 |  *
7 |  *    http://www.apache.org/licenses/LICENSE-2.0
8 |  *
9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, , 2019
16 |  *
17 |  */
18 | 
19 | #ifndef INCLUDE_TEST_MULTIPLY_H_
20 | #define INCLUDE_TEST_MULTIPLY_H_
21 | 
22 | #include "gmock/gmock.h"
23 | #include "gtest/gtest.h"
24 | 
25 | #if GTEST_HAS_TYPED_TEST_P
26 | 
27 | #include "include/cuda_utils.h"
28 | #include "include/multiply/multiply.h"
29 | 
30 | template <typename Device>
31 | class MultiplyTest : public testing::Test {};
32 | 
33 | TYPED_TEST_SUITE_P(MultiplyTest);
34 | 
35 | TYPED_TEST_P(MultiplyTest, TestIdentity) {
36 |   float *A = new float[2 * 2];
37 |   float *B = new float[2 * 2];
38 |   float *expected = new float[2 * 2];
39 | 
40 |   for (int i = 0; i < 2 * 2; ++i) {
41 |     A[i] = i;
42 |     B[i] = 0;
43 |     expected[i] = i;
44 |   }
45 |   B[0] = 1;
46 |   B[3] = 1;
47 | 
48 |   float *actual = new float[2 * 2];
49 | 
50 |   Multiply<float, TypeParam>::Apply(A, B, 2, 2, actual);
51 | 
52 |   for (int i = 0; i < 2 * 2; ++i) {
53 |     EXPECT_EQ(expected[i], actual[i]);
54 |   }
55 | }
56 | 
57 | TYPED_TEST_P(MultiplyTest, TestSquare) {
58 |   float *A = new float[3 * 3];
59 |   float *B = new float[3 * 3];
60 |   float *expected = new float[3 * 3];
61 | 
62 |   for (int i = 0; i < 3 * 3; ++i) {
63 |     A[i] = i;
64 |     B[i] = i;
65 |   }
66 |   expected[0] = 15;
67 |   expected[1] = 18;
68 |   expected[2] = 21;
69 |   expected[3] = 42;
70 |   expected[4] = 54;
71 |   expected[5] = 66;
72 |   expected[6] = 69;
73 |   expected[7] = 90;
74 |   expected[8] = 111;
75 | 
76 |   float *actual = new float[3 * 3];
77 | 
78 |   Multiply<float, TypeParam>::Apply(A, B, 3, 3, actual);
79 | 
80 |   for (int i = 0; i < 3 * 3; ++i) {
81 |     EXPECT_EQ(expected[i], actual[i]);
82 |   }
83 | }
84 | 
85 | REGISTER_TYPED_TEST_SUITE_P(MultiplyTest,  //
86 |                             TestIdentity, TestSquare);
87 | 
88 | #endif  // GTEST_HAS_TYPED_TEST_P
89 | 
90 | #endif  // INCLUDE_TEST_MULTIPLY_H_
91 | 
--------------------------------------------------------------------------------
/src/benchmark-multiply.cu.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Authors. All Rights Reserved.
2 | 
3 |  * Licensed under the Apache License, Version 2.0 (the "License");
4 |  * you may not use this file except in compliance with the License.
5 |  * You may obtain a copy of the License at
6 |  *
7 |  *    http://www.apache.org/licenses/LICENSE-2.0
8 |  *
9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, , 2019
16 |  *
17 |  */
18 | 
19 | #include "include/cuda_benchmark.h"
20 | #include "include/cuda_utils.h"
21 | #include "include/multiply/multiply.h"
22 | 
23 | struct Initializer {
24 |   explicit Initializer(float *d_A, float *d_B, float *d_C, int height,
25 |                        int width)
26 |       : d_A(d_A), d_B(d_B), d_C(d_C), height(height), width(width) {}
27 | 
28 |   template <typename TKernel>
29 |   void operator()(TKernel *kernel) {
30 |     kernel->H = height;
31 |     kernel->W = width;
32 |     kernel->A = d_A;
33 |     kernel->B = d_B;
34 |     kernel->C = d_C;
35 |   }
36 | 
37 |   float *d_A;
38 |   float *d_B;
39 |   float *d_C;
40 |   int height;
41 |   int width;
42 | };
43 | 
44 | // Simple way to benchmark the template parameters.
45 | void run_for(int height, int width) { 46 | std::cout << std::endl 47 | << "Benchmark for " << height << " " << width 48 | << " -------------------------------- " << std::endl; 49 | 50 | float *A = new float[height * width]; 51 | float *B = new float[height * width]; 52 | float *C = new float[height * width]; 53 | for (int i = 0; i < height * width; ++i) { 54 | A[i] = i; 55 | B[i] = i * 10; 56 | C[i] = 0; 57 | } 58 | 59 | float *d_A; 60 | float *d_B; 61 | float *d_C; 62 | 63 | const int num_bytes = height * width * sizeof(float); 64 | 65 | ASSERT_CUDA(cudaMalloc(reinterpret_cast(&d_A), num_bytes)); 66 | ASSERT_CUDA(cudaMalloc(reinterpret_cast(&d_B), num_bytes)); 67 | ASSERT_CUDA(cudaMalloc(reinterpret_cast(&d_C), num_bytes)); 68 | 69 | ASSERT_CUDA(cudaMemcpy(d_A, A, num_bytes, cudaMemcpyHostToDevice)); 70 | ASSERT_CUDA(cudaMemcpy(d_B, B, num_bytes, cudaMemcpyHostToDevice)); 71 | 72 | Initializer init(d_A, d_B, d_C, height, width); 73 | 74 | // Test different options. 75 | cuda::KernelBenchmark bench; 76 | bench.Case>(init); 77 | bench.Case>(init); 78 | bench.Case>(init); 79 | bench.Case>(init); 80 | bench.Case>(init); 81 | bench.Case>(init); 82 | bench.Case>(init); 83 | bench.Case>(init); 84 | bench.Case>(init); 85 | bench.Start(); 86 | 87 | delete[] A; 88 | delete[] B; 89 | delete[] C; 90 | ASSERT_CUDA(cudaFree(d_A)); 91 | ASSERT_CUDA(cudaFree(d_B)); 92 | ASSERT_CUDA(cudaFree(d_C)); 93 | } 94 | 95 | int main() { 96 | run_for(256, 256); 97 | run_for(512, 512); 98 | run_for(1024, 1024); 99 | return 0; 100 | } 101 | 102 | // clang-format off 103 | /* 104 | Benchmark for 256 256 -------------------------------- 105 | Using Device Number: 0 106 | Device name: GeForce GTX 970 107 | Memory Clock Rate (KHz): 3505000 108 | Memory Bus Width (bits): 256 109 | Peak Memory Bandwidth (GB/s): 224.320000 110 | 111 | time 500.000000 - 1000.000000, iters: 5 - 100 112 | - multiply_kernels::Multiply took 3.047784 ms stats(iters: 100, var: 0.082136, stddev: 0.286594) 113 | - multiply_kernels::Multiply took 0.847197 ms stats(iters: 100, var: 0.002289, stddev: 0.047846) 114 | - multiply_kernels::Multiply took 0.337858 ms stats(iters: 100, var: 0.000039, stddev: 0.006252) 115 | - multiply_kernels::Multiply took 0.162206 ms stats(iters: 100, var: 0.000004, stddev: 0.001925) 116 | - multiply_kernels::Multiply took 0.081275 ms stats(iters: 100, var: 0.000000, stddev: 0.000677) 117 | - multiply_kernels::Multiply took 0.100844 ms stats(iters: 100, var: 0.000000, stddev: 0.000486) 118 | - multiply_kernels::Multiply took 0.072184 ms stats(iters: 100, var: 0.000001, stddev: 0.000723) 119 | - multiply_kernels::Multiply took 0.082570 ms stats(iters: 100, var: 0.000004, stddev: 0.001894) 120 | - multiply_kernels::Multiply took 0.070467 ms stats(iters: 100, var: 0.000008, stddev: 0.002803) 121 | 122 | Benchmark for 512 512 -------------------------------- 123 | Using Device Number: 0 124 | Device name: GeForce GTX 970 125 | Memory Clock Rate (KHz): 3505000 126 | Memory Bus Width (bits): 256 127 | Peak Memory Bandwidth (GB/s): 224.320000 128 | 129 | time 500.000000 - 1000.000000, iters: 5 - 100 130 | - multiply_kernels::Multiply took 20.967186 ms stats(iters: 48, var: 1.166002, stddev: 1.079816) 131 | - multiply_kernels::Multiply took 6.682436 ms stats(iters: 100, var: 0.122818, stddev: 0.350454) 132 | - multiply_kernels::Multiply took 2.826743 ms stats(iters: 100, var: 0.067757, stddev: 0.260302) 133 | - multiply_kernels::Multiply took 1.245100 ms stats(iters: 100, var: 0.019352, stddev: 0.139112) 134 | - 
multiply_kernels::Multiply took 0.574468 ms stats(iters: 100, var: 0.000003, stddev: 0.001616) 135 | - multiply_kernels::Multiply took 0.713191 ms stats(iters: 100, var: 0.000003, stddev: 0.001810) 136 | - multiply_kernels::Multiply took 0.502195 ms stats(iters: 100, var: 0.000002, stddev: 0.001380) 137 | - multiply_kernels::Multiply took 0.560309 ms stats(iters: 100, var: 0.000006, stddev: 0.002414) 138 | - multiply_kernels::Multiply took 0.510635 ms stats(iters: 100, var: 0.000001, stddev: 0.001121) 139 | 140 | Benchmark for 1024 1024 -------------------------------- 141 | Using Device Number: 0 142 | Device name: GeForce GTX 970 143 | Memory Clock Rate (KHz): 3505000 144 | Memory Bus Width (bits): 256 145 | Peak Memory Bandwidth (GB/s): 224.320000 146 | 147 | time 500.000000 - 1000.000000, iters: 5 - 100 148 | - multiply_kernels::Multiply took 287.646912 ms stats(iters: 4, var: 126.933113, stddev: 11.266459) 149 | - multiply_kernels::Multiply took 78.918053 ms stats(iters: 13, var: 1.950417, stddev: 1.396573) 150 | - multiply_kernels::Multiply took 33.681572 ms stats(iters: 15, var: 0.029435, stddev: 0.171566) 151 | - multiply_kernels::Multiply took 12.483257 ms stats(iters: 41, var: 0.002221, stddev: 0.047123) 152 | - multiply_kernels::Multiply took 5.562872 ms stats(iters: 100, var: 0.034724, stddev: 0.186343) 153 | - multiply_kernels::Multiply took 6.286970 ms stats(iters: 100, var: 0.010179, stddev: 0.100893) 154 | - multiply_kernels::Multiply took 4.158412 ms stats(iters: 100, var: 0.043726, stddev: 0.209108) 155 | - multiply_kernels::Multiply took 4.711136 ms stats(iters: 100, var: 0.064436, stddev: 0.253843) 156 | - multiply_kernels::Multiply took 4.059203 ms stats(iters: 100, var: 0.044180, stddev: 0.210191) 157 | 158 | */ 159 | // clang-format on 160 | -------------------------------------------------------------------------------- /src/deprecated_examples.cu_old: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Authors. All Rights Reserved. 2 | 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
14 | * 15 | * Author: Patrick Wieschollek, , 2019 16 | * Fabian Groh, , 2019 17 | * 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "include/cuda_utils.h" 25 | 26 | /* 27 | nvcc examples.cu --expt-relaxed-constexpr -Xptxas="-v" -std=c++11 -o test 28 | */ 29 | 30 | //////////////////////////////////////////////////////////////////////////////// 31 | 32 | using cuda_utils::make_ndarray; 33 | using cuda_utils::NdArray; 34 | using cuda_utils::NdIndex; 35 | 36 | #define check_cuda_call(ans) \ 37 | { gpuAssert((ans), __FILE__, __LINE__); } 38 | inline void gpuAssert(cudaError_t code, const char *file, int line, 39 | bool abort = true) { 40 | if (code != cudaSuccess) { 41 | fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, 42 | line); 43 | if (abort) exit(code); 44 | } 45 | } 46 | 47 | template 48 | __global__ void matrixMultiply____________normal__________(T *C, const T *A, 49 | const T *B, int H, 50 | int W) { 51 | __shared__ T ds_M[num_threads][num_threads]; 52 | __shared__ T ds_N[num_threads][num_threads]; 53 | 54 | int tx = threadIdx.x; 55 | int ty = threadIdx.y; 56 | int Ch = blockIdx.y * num_threads + ty; 57 | int Cw = blockIdx.x * num_threads + tx; 58 | 59 | T Cval = 0; 60 | 61 | for (int m = 0; m < (W - 1) / num_threads + 1; ++m) { 62 | if (Ch < H && m * num_threads + tx < W) 63 | ds_M[ty][tx] = A[Ch * W + m * num_threads + tx]; 64 | else 65 | ds_M[ty][tx] = 0; 66 | if (Cw < W && m * num_threads + ty < H) 67 | ds_N[ty][tx] = B[(m * num_threads + ty) * W + Cw]; 68 | else 69 | ds_N[ty][tx] = 0; 70 | __syncthreads(); 71 | 72 | for (int k = 0; k < num_threads; ++k) Cval += ds_M[ty][k] * ds_N[k][tx]; 73 | __syncthreads(); 74 | } 75 | if (Ch < H && Cw < W) C[Ch * W + Cw] = Cval; 76 | } 77 | 78 | template 79 | __global__ void matrixMultiply____________tensor__________(T *C, const T *A, 80 | const T *B, int H, 81 | int W) { 82 | __shared__ T ds_M[num_threads][num_threads]; 83 | __shared__ T ds_N[num_threads][num_threads]; 84 | 85 | const int tx = threadIdx.x; 86 | const int ty = threadIdx.y; 87 | const int Ch = blockIdx.y * num_threads + ty; 88 | const int Cw = blockIdx.x * num_threads + tx; 89 | 90 | T Cval = 0; 91 | 92 | auto At = make_ndarray(A, H, W); 93 | auto Bt = make_ndarray(B, H, W); 94 | auto Ct = make_ndarray(C, H, W); 95 | 96 | for (int m = 0; m < (W - 1) / num_threads + 1; ++m) { 97 | ds_M[ty][tx] = At.safe_value(Ch, m * num_threads + tx); 98 | // ds_N[ty][tx] = Bt.safe_value(m * num_threads + ty, Cw); 99 | if (Bt.valid(m * num_threads + ty, Cw)) { 100 | ds_N[ty][tx] = Bt(m * num_threads + ty, Cw); 101 | } else { 102 | ds_N[ty][tx] = 0; 103 | } 104 | __syncthreads(); 105 | 106 | for (int k = 0; k < num_threads; ++k) Cval += ds_M[ty][k] * ds_N[k][tx]; 107 | __syncthreads(); 108 | } 109 | if (Ct.valid(Ch, Cw)) Ct(Ch, Cw) = Cval; 110 | } 111 | 112 | template 113 | __global__ void matrixMultiply____________tensor2__________( 114 | NdArray Ct, NdArray At, NdArray Bt) { 115 | __shared__ T ds_M[num_threads][num_threads]; 116 | __shared__ T ds_N[num_threads][num_threads]; 117 | 118 | const int tx = threadIdx.x; 119 | const int ty = threadIdx.y; 120 | const int Ch = blockIdx.y * num_threads + ty; 121 | const int Cw = blockIdx.x * num_threads + tx; 122 | const size_t W = Bt.template dim<1>(); 123 | 124 | T Cval = 0; 125 | 126 | for (int m = 0; m < (W - 1) / num_threads + 1; ++m) { 127 | ds_M[ty][tx] = At.safe_value(Ch, m * num_threads + tx); 128 | ds_N[ty][tx] = Bt.safe_value(m * num_threads + ty, Cw); 129 | __syncthreads(); 130 | 131 | for 
(int k = 0; k < num_threads; ++k) Cval += ds_M[ty][k] * ds_N[k][tx]; 132 | __syncthreads(); 133 | } 134 | if (Ct.valid(Ch, Cw)) Ct(Ch, Cw) = Cval; 135 | } 136 | 137 | /************* INDEX SIMPLE ***************************************************/ 138 | 139 | __global__ void index____________normal__________(int A, int B, int C, int a, 140 | int b, int c) { 141 | const int idx = a * (B * C) + b * C + c; 142 | printf("value is %i\n", idx); 143 | } 144 | 145 | __global__ void index____________tensor__________(int A, int B, int C, int a, 146 | int b, int c) { 147 | auto idx = NdIndex<3>(A, B, C); 148 | printf("value is %i\n", idx(a, b, c)); 149 | } 150 | 151 | template 152 | __device__ __forceinline__ const T NAIVE_IDX(const T A, const T B, const T C, 153 | T a, T b, T c) { 154 | return a * B * C + b * C + c; 155 | } 156 | 157 | __global__ void index____________naive__________(int A, int B, int C, int a, 158 | int b, int c) { 159 | const int idx = NAIVE_IDX(A, B, C, a, b, c); 160 | printf("value is %i\n", idx); 161 | } 162 | 163 | /************* README EXAMPLE *************************************************/ 164 | 165 | __global__ void readme____________normal__________(float *src, float *dst, 166 | int B, int H, int W, int C, 167 | int b, int h, int w, int c) { 168 | const int pos1 = b * (H * W * C) + h * (W * C) + w * (C) + c; 169 | const int pos2 = b * (H * W * C) + h * (W * C) + w * (C) + (c + 1); 170 | dst[pos2] = src[pos1]; 171 | } 172 | 173 | __global__ void readme____________tensor__________(float *src, float *dst, 174 | int B, int H, int W, int C, 175 | int b, int h, int w, int c) { 176 | auto idx = NdIndex<4>(B, H, W, C); 177 | src[idx(b, h, w, c)] = dst[idx(b, h, w, c)]; 178 | // auto src_t = Tensor(src, B, H, W, C); 179 | // auto dst_t = Tensor(dst, B, H, W, C); 180 | // src_t(b, h, w, c) = dst_t(b, h, w, c); 181 | } 182 | /************* FLEX-DECONV ***************************************************/ 183 | // Used 42 registers, 392 bytes cmem[0] 184 | // taken from https://github.com/cgtuebingen/Flex-Convolution 185 | template 186 | __global__ void flex_deconv_simple(const int B, const int N, const int K, 187 | const int Dp, const int Din, const int Dout, 188 | const Dtype *positions, 189 | const Dtype *features, 190 | const int *neighborhood, const Dtype *theta, 191 | const Dtype *bias, Dtype *output) { 192 | const int b = blockIdx.z; 193 | 194 | for (int n = blockIdx.y * blockDim.y + threadIdx.y; n < N; 195 | n += blockDim.y * gridDim.y) { 196 | const int self_k = neighborhood[b * K * N + 0 * N + n]; 197 | 198 | for (int k_ = 0; k_ < K; ++k_) { 199 | const int other_k = neighborhood[b * K * N + k_ * N + n]; 200 | 201 | for (int dout = blockIdx.x * blockDim.x + threadIdx.x; dout < Dout; 202 | dout += blockDim.x * gridDim.x) { 203 | for (int din = 0; din < Din; ++din) { 204 | const Dtype v = features[b * Din * N + din * N + self_k]; 205 | Dtype W = bias[din * Dout + dout]; 206 | 207 | for (int dp = 0; dp < Dp; ++dp) { 208 | Dtype delta = positions[b * Dp * N + dp * N + other_k] - 209 | positions[b * Dp * N + dp * N + self_k]; 210 | W += theta[dp * Din * Dout + din * Dout + dout] * delta; 211 | } 212 | 213 | Dtype Wv = W * v; 214 | // this has been an atomic add 215 | output[b * Dout * N + dout * N + other_k] += Wv; 216 | } 217 | } 218 | } 219 | } 220 | } 221 | 222 | // Used 48 registers, 392 bytes cmem[0] 223 | template 224 | __global__ void flex_deconv_tensor(const int B, const int N, const int K, 225 | const int Dp, const int Din, const int Dout, 226 | const T *positions, 
const T *features, 227 | const int *neighborhood, const T *theta, 228 | const T *bias, T *output) { 229 | auto pos_t = make_ndarray(positions, B, Dp, N); 230 | auto feat_t = make_ndarray(features, B, Din, N); 231 | auto theta_t = make_ndarray(theta, Dp, Din, Dout); 232 | auto bias_t = make_ndarray(bias, Din, Dout); 233 | auto neighborhood_t = make_ndarray(neighborhood, B, K, N); 234 | auto output_t = make_ndarray(output, B, Dout, N); 235 | 236 | const int b = blockIdx.z; 237 | 238 | for (int n = blockIdx.y * blockDim.y + threadIdx.y; n < N; 239 | n += blockDim.y * gridDim.y) { 240 | const int self_k = neighborhood_t(b, 0, n); 241 | 242 | for (int k_ = 0; k_ < K; ++k_) { 243 | const int other_k = neighborhood_t(b, k_, n); 244 | 245 | for (int dout = blockIdx.x * blockDim.x + threadIdx.x; dout < Dout; 246 | dout += blockDim.x * gridDim.x) { 247 | for (int din = 0; din < Din; ++din) { 248 | const T v = feat_t(b, din, self_k); 249 | T W = bias_t(din, dout); 250 | 251 | for (int dp = 0; dp < Dp; ++dp) { 252 | T delta = pos_t(b, dp, other_k) - pos_t(b, dp, self_k); 253 | W += theta_t(dp, din, dout) * delta; 254 | } 255 | 256 | T Wv = W * v; 257 | output_t(b, dout, other_k) += Wv; 258 | } 259 | } 260 | } 261 | } 262 | } 263 | 264 | int up2(int len, int th) { return (len - 1) / th + 1; } 265 | void run_flex_deconv() { 266 | // this will fail, but during compilation, we will see register usage 267 | int B = 8; 268 | int N = 1024; 269 | int K = 8; 270 | int Dp = 3; 271 | int Din = 64; 272 | int Dout = 64; 273 | 274 | float *positions_; 275 | float *features_; 276 | int *neighborhood_; 277 | float *theta_; 278 | float *bias_; 279 | float *output_; 280 | 281 | const int threads = 32; 282 | dim3 block(threads, threads, 1); 283 | dim3 grid(up2(Dout, threads), up2(N, threads), B); 284 | 285 | flex_deconv_simple<<>>(B, N, K, Dp, Din, Dout, positions_, 286 | features_, neighborhood_, theta_, 287 | bias_, output_); 288 | flex_deconv_tensor<<>>(B, N, K, Dp, Din, Dout, positions_, 289 | features_, neighborhood_, theta_, 290 | bias_, output_); 291 | } 292 | 293 | void run_readme() { 294 | int B = 4; 295 | int H = 17; 296 | int W = 32; 297 | int C = 32; 298 | float *d_src; 299 | float *d_dst; 300 | check_cuda_call(cudaMalloc(&d_src, sizeof(float) * B * H * W * C)); 301 | check_cuda_call(cudaMalloc(&d_dst, sizeof(float) * B * H * W * C)); 302 | 303 | int b = 1; 304 | int h = 3; 305 | int w = 3; 306 | int c = 8; 307 | dim3 grid1(1); 308 | dim3 block1(1); 309 | readme____________normal__________<<>>(d_src, d_dst, B, H, W, 310 | C, b, h, w, c); 311 | readme____________tensor__________<<>>(d_src, d_dst, B, H, W, 312 | C, b, h, w, c); 313 | } 314 | 315 | void run_simple() { 316 | int A = 4; 317 | int B = 17; 318 | int C = 32; 319 | 320 | int a = 1; 321 | int b = 3; 322 | int c = 8; 323 | dim3 grid1(1); 324 | dim3 block1(1); 325 | index____________normal__________<<>>(A, B, C, a, b, c); 326 | index____________tensor__________<<>>(A, B, C, a, b, c); 327 | index____________naive__________<<>>(A, B, C, a, b, c); 328 | } 329 | 330 | void run_matmul() { 331 | int H = 4; 332 | int W = 5; 333 | float *matA = new float[H * W]; 334 | float *matB = new float[H * W]; 335 | float *matC1 = new float[H * W]; 336 | float *matC2 = new float[H * W]; 337 | float *matC3 = new float[H * W]; 338 | 339 | for (int i = 0; i < H * W; ++i) { 340 | matA[i] = rand_r() / static_cast(RAND_MAX); 341 | matB[i] = rand_r() / static_cast(RAND_MAX); 342 | matC1[i] = rand_r() / static_cast(RAND_MAX); 343 | matC2[i] = rand_r() / static_cast(RAND_MAX); 344 
| matC3[i] = rand_r() / static_cast(RAND_MAX); 345 | } 346 | 347 | float *d_matA; 348 | float *d_matB; 349 | float *d_matC1; 350 | float *d_matC2; 351 | float *d_matC3; 352 | 353 | check_cuda_call(cudaMalloc(&d_matA, sizeof(float) * H * W)); 354 | check_cuda_call(cudaMalloc(&d_matB, sizeof(float) * H * W)); 355 | check_cuda_call(cudaMalloc(&d_matC1, sizeof(float) * H * W)); 356 | check_cuda_call(cudaMalloc(&d_matC2, sizeof(float) * H * W)); 357 | check_cuda_call(cudaMalloc(&d_matC3, sizeof(float) * H * W)); 358 | 359 | check_cuda_call( 360 | cudaMemcpy(d_matA, matA, sizeof(float) * H * W, cudaMemcpyHostToDevice)); 361 | check_cuda_call( 362 | cudaMemcpy(d_matB, matB, sizeof(float) * H * W, cudaMemcpyHostToDevice)); 363 | check_cuda_call(cudaMemcpy(d_matC1, matC1, sizeof(float) * H * W, 364 | cudaMemcpyHostToDevice)); 365 | check_cuda_call(cudaMemcpy(d_matC2, matC2, sizeof(float) * H * W, 366 | cudaMemcpyHostToDevice)); 367 | check_cuda_call(cudaMemcpy(d_matC3, matC3, sizeof(float) * H * W, 368 | cudaMemcpyHostToDevice)); 369 | 370 | const int num_threads = 32; 371 | dim3 threads(num_threads, num_threads); 372 | dim3 grid((W + 1) / num_threads + 1, (W + 1) / num_threads + 1); 373 | 374 | matrixMultiply____________normal__________<<>>( 375 | d_matC1, d_matA, d_matB, H, W); 376 | 377 | check_cuda_call(cudaPeekAtLastError()); 378 | check_cuda_call(cudaGetLastError()); 379 | check_cuda_call(cudaDeviceSynchronize()); 380 | 381 | matrixMultiply____________tensor__________<<>>( 382 | d_matC2, d_matA, d_matB, H, W); 383 | 384 | check_cuda_call(cudaPeekAtLastError()); 385 | check_cuda_call(cudaGetLastError()); 386 | check_cuda_call(cudaDeviceSynchronize()); 387 | 388 | auto Ct = make_ndarray(d_matC3, H, W); 389 | auto At = make_ndarray(d_matA, H, W); 390 | auto Bt = make_ndarray(d_matB, H, W); 391 | 392 | matrixMultiply____________tensor2__________<<>>( 393 | Ct, At, Bt); 394 | 395 | check_cuda_call(cudaPeekAtLastError()); 396 | check_cuda_call(cudaGetLastError()); 397 | check_cuda_call(cudaDeviceSynchronize()); 398 | 399 | check_cuda_call(cudaMemcpy(matC1, d_matC1, H * W * sizeof(float), 400 | cudaMemcpyDeviceToHost)); 401 | check_cuda_call(cudaMemcpy(matC2, d_matC2, H * W * sizeof(float), 402 | cudaMemcpyDeviceToHost)); 403 | check_cuda_call(cudaMemcpy(matC3, d_matC3, H * W * sizeof(float), 404 | cudaMemcpyDeviceToHost)); 405 | 406 | // verify 407 | bool good = true; 408 | printf("\n"); 409 | for (int i = 0; i < H * W; ++i) { 410 | if (fabs(matC1[i] - matC2[i]) > 1e-8) { 411 | printf("%i %f %f %f ", i, matC1[i], matC2[i], matA[i]); 412 | good = false; 413 | } 414 | if (fabs(matC1[i] - matC3[i]) > 1e-8) { 415 | printf("%i %f %f %f ", i, matC1[i], matC3[i], matA[i]); 416 | good = false; 417 | } 418 | } 419 | printf("\n"); 420 | if (good) 421 | printf("good\n"); 422 | else 423 | printf("bad\n"); 424 | } 425 | 426 | /******************************************************************************/ 427 | 428 | int main() { 429 | run_matmul(); 430 | // run_readme(); 431 | // run_simple(); 432 | // run_flex_deconv(); 433 | return 0; 434 | } 435 | -------------------------------------------------------------------------------- /src/multiply.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Authors. All Rights Reserved. 2 | 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 
5 |  * You may obtain a copy of the License at
6 |  *
7 |  *    http://www.apache.org/licenses/LICENSE-2.0
8 |  *
9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, , 2019
16 |  *
17 |  */
18 | #include "include/multiply/multiply.h"
19 | 
20 | #include <cstdio>
21 | 
22 | #include "include/cuda_utils.h"
23 | 
24 | void print_mat(float *A, int H, int W) {
25 |   for (int h = 0; h < H; ++h) {
26 |     for (int w = 0; w < W; ++w) {
27 |       printf("%2.2f ", A[h * W + w]);
28 |     }
29 |     printf("\n");
30 |   }
31 |   printf("\n");
32 | }
33 | 
34 | int main() {
35 |   float *A = new float[2 * 2];
36 |   float *B = new float[2 * 2];
37 |   float *C = new float[2 * 2];
38 |   for (int i = 0; i < 2 * 2; ++i) {
39 |     A[i] = i;
40 |     B[i] = i * 10;
41 |     C[i] = 0;
42 |   }
43 | 
44 |   print_mat(A, 2, 2);
45 |   print_mat(B, 2, 2);
46 | 
47 |   // ...........................................................................
48 |   printf("Cpu output\n");
49 |   // run on the CPU
50 |   Multiply<float, CpuDevice>::Apply(A, B, 2, 2, C);
51 |   print_mat(C, 2, 2);
52 | 
53 |   // ...........................................................................
54 | #if WITH_CUDA
55 |   printf("Gpu output\n");
56 |   for (int i = 0; i < 2 * 2; ++i) {
57 |     C[i] = 0;
58 |   }
59 | 
60 |   // run on the GPU
61 |   Multiply<float, GpuDevice>::Apply(A, B, 2, 2, C);
62 | 
63 |   print_mat(C, 2, 2);
64 | #endif
65 | 
66 |   // ...........................................................................
67 |   printf("auto output\n");
68 |   for (int i = 0; i < 2 * 2; ++i) {
69 |     C[i] = 0;
70 |   }
71 |   // run on the GPU if available, otherwise fall back to the CPU
72 |   Multiply<float, AnyDevice>::Apply(A, B, 2, 2, C);
73 | 
74 |   print_mat(C, 2, 2);
75 | 
76 |   return 0;
77 | }
78 | 
--------------------------------------------------------------------------------
/src/multiply/multiply.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Authors. All Rights Reserved.
2 | 
3 |  * Licensed under the Apache License, Version 2.0 (the "License");
4 |  * you may not use this file except in compliance with the License.
5 |  * You may obtain a copy of the License at
6 |  *
7 |  *    http://www.apache.org/licenses/LICENSE-2.0
8 |  *
9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, , 2019
16 |  *
17 |  */
18 | 
19 | #include "include/multiply/multiply.h"
20 | #include "include/cuda_utils.h"
21 | 
22 | template <typename ValueT>
23 | struct Multiply<ValueT, AnyDevice> {
24 |   static void Apply(const ValueT* A, const ValueT* B, const int H, const int W,
25 |                     ValueT* C) {
26 | #if WITH_CUDA
27 |     Multiply<ValueT, GpuDevice>::Apply(A, B, H, W, C);
28 | #else   // WITH_CUDA
29 |     Multiply<ValueT, CpuDevice>::Apply(A, B, H, W, C);
30 | #endif  // WITH_CUDA
31 |   }
32 | };
33 | 
34 | template struct Multiply<float, AnyDevice>;
35 | template struct Multiply<double, AnyDevice>;
36 | template struct Multiply<int, AnyDevice>;
37 | 
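// The AnyDevice specialization above dispatches at compile time: WITH_CUDA
// decides which concrete specialization the call forwards to, so there is no
// runtime branching and no virtual dispatch. A sketch of the caller's view
// (a 2x2 multiply by the identity; the device tag names here follow the
// reconstructed usage in src/multiply.cc and are assumptions, not verbatim
// source):
//
//   float A[4] = {0, 1, 2, 3};
//   float B[4] = {1, 0, 0, 1};
//   float C[4] = {0, 0, 0, 0};
//   Multiply<float, AnyDevice>::Apply(A, B, 2, 2, C);
//   // C now equals A; the GPU path ran if the binary was built WITH_CUDA,
//   // the CPU path otherwise.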
--------------------------------------------------------------------------------
/src/multiply/multiply_cpu.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Authors. All Rights Reserved.
2 | 
3 |  * Licensed under the Apache License, Version 2.0 (the "License");
4 |  * you may not use this file except in compliance with the License.
5 |  * You may obtain a copy of the License at
6 |  *
7 |  *    http://www.apache.org/licenses/LICENSE-2.0
8 |  *
9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, , 2019
16 |  *
17 |  */
18 | 
19 | #include "include/multiply/multiply.h"
20 | 
21 | #include "include/cuda_utils.h"
22 | 
23 | template <typename ValueT>
24 | struct Multiply<ValueT, CpuDevice> {
25 |   static void Apply(const ValueT* A, const ValueT* B, const int H, const int W,
26 |                     ValueT* C) {
27 |     for (int h = 0; h < H; ++h) {
28 |       for (int w = 0; w < W; ++w) {
29 |         C[h * W + w] = 0;
30 |         for (int k = 0; k < W; ++k) {
31 |           C[h * W + w] += A[h * W + k] * B[k * W + w];
32 |         }
33 |       }
34 |     }
35 |   }
36 | };
37 | 
38 | template struct Multiply<float, CpuDevice>;
39 | template struct Multiply<double, CpuDevice>;
40 | template struct Multiply<int, CpuDevice>;
41 | 
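// A worked 2x2 example of the row-major indexing above, useful as a sanity
// check: with A = [[0, 1], [2, 3]] and B = [[1, 0], [0, 1]] stored row-major,
// C[h * W + w] accumulates A[h * W + k] * B[k * W + w]:
//
//   C(0,0) = A(0,0)*B(0,0) + A(0,1)*B(1,0) = 0*1 + 1*0 = 0
//   C(0,1) = A(0,0)*B(0,1) + A(0,1)*B(1,1) = 0*0 + 1*1 = 1
//   C(1,0) = A(1,0)*B(0,0) + A(1,1)*B(1,0) = 2*1 + 3*0 = 2
//   C(1,1) = A(1,0)*B(0,1) + A(1,1)*B(1,1) = 2*0 + 3*1 = 3
//
// i.e. multiplying by the identity reproduces A, which is exactly what
// TestIdentity in include/test/multiply.h asserts.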
--------------------------------------------------------------------------------
/src/multiply/multiply_gpu.cu.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Authors. All Rights Reserved.
2 | 
3 |  * Licensed under the Apache License, Version 2.0 (the "License");
4 |  * you may not use this file except in compliance with the License.
5 |  * You may obtain a copy of the License at
6 |  *
7 |  *    http://www.apache.org/licenses/LICENSE-2.0
8 |  *
9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, , 2019
16 |  *
17 |  */
18 | 
19 | #if __CUDACC__
20 | 
21 | #include "include/multiply/multiply.h"
22 | 
23 | #include "include/cuda_utils.h"
24 | 
25 | template <typename ValueT>
26 | struct Multiply<ValueT, GpuDevice> {
27 |   static void Apply(const ValueT* A, const ValueT* B, const int H, const int W,
28 |                     ValueT* C) {
29 |     const int num_bytes = H * W * sizeof(ValueT);
30 | 
31 |     ValueT* d_A;
32 |     ValueT* d_B;
33 |     ValueT* d_C;
34 | 
35 |     ASSERT_CUDA(cudaMalloc(reinterpret_cast<void**>(&d_A), num_bytes));
36 |     ASSERT_CUDA(cudaMalloc(reinterpret_cast<void**>(&d_B), num_bytes));
37 |     ASSERT_CUDA(cudaMalloc(reinterpret_cast<void**>(&d_C), num_bytes));
38 | 
39 |     ASSERT_CUDA(cudaMemcpy(d_A, A, num_bytes, cudaMemcpyHostToDevice));
40 |     ASSERT_CUDA(cudaMemcpy(d_B, B, num_bytes, cudaMemcpyHostToDevice));
41 | 
42 |     multiply_kernels::Multiply<ValueT, 32> kernel;
43 |     kernel.H = H;
44 |     kernel.W = W;
45 |     kernel.A = d_A;
46 |     kernel.B = d_B;
47 |     kernel.C = d_C;
48 |     kernel.Launch();
49 |     // Wait for the CUDA kernel to finish before reading back the output.
50 |     ASSERT_CUDA(cudaDeviceSynchronize());
51 | 
52 |     ASSERT_CUDA(cudaMemcpy(C, d_C, num_bytes, cudaMemcpyDeviceToHost));
53 | 
54 |     ASSERT_CUDA(cudaFree(d_A));
55 |     ASSERT_CUDA(cudaFree(d_B));
56 |     ASSERT_CUDA(cudaFree(d_C));
57 | 
58 |     // std::cout << cuda::Benchmark(&kernel) << std::endl;
59 |   }
60 | };
61 | 
62 | template struct Multiply<float, GpuDevice>;
63 | template struct Multiply<double, GpuDevice>;
64 | template struct Multiply<int, GpuDevice>;
65 | 
66 | #endif  // __CUDACC__
67 | 
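// Kernel launches themselves return void, so launch errors surface only
// through the CUDA error state. A sketch of a slightly stricter variant of
// Apply's tail, using only calls that already appear in this repository plus
// cudaGetLastError (a standard CUDA runtime call):
//
//   kernel.Launch();
//   ASSERT_CUDA(cudaGetLastError());        // catches bad launch configs
//   ASSERT_CUDA(cudaDeviceSynchronize());   // catches faults inside the kernel
//   ASSERT_CUDA(cudaMemcpy(C, d_C, num_bytes, cudaMemcpyDeviceToHost));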
--------------------------------------------------------------------------------
/src/sharedmemory.cu.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Authors. All Rights Reserved.
2 | 
3 |  * Licensed under the Apache License, Version 2.0 (the "License");
4 |  * you may not use this file except in compliance with the License.
5 |  * You may obtain a copy of the License at
6 |  *
7 |  *    http://www.apache.org/licenses/LICENSE-2.0
8 |  *
9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, , 2019
16 |  *
17 |  */
18 | 
19 | #if __CUDACC__
20 | 
21 | #include <cstdio>
22 | 
23 | #include "include/cuda_index.h"
24 | #include "include/cuda_utils.h"
25 | 
26 | namespace {
27 | 
28 | struct AddSharedMemoryCUDAKernel : public cuda::Kernel {
29 |   void Launch(cudaStream_t stream = 0) override {
30 |     dim3 block(2);
31 |     dim3 grid(1);
32 | 
33 |     cuda::SharedMemory shm;
34 |     shm.add<float>(5);
35 |     shm.add<int>(3);
36 | 
37 |     cuda::Run<<<grid, block, shm.bytes, stream>>>(*this);
38 |   }
39 | 
40 |   __device__ __forceinline__ void operator()() const override {
41 |     cuda::SharedMemory shm;
42 |     float* floats_5 = shm.ref<float>(5);
43 |     int* ints_3 = shm.ref<int>(3);
44 | 
45 |     if (threadIdx.x == 0) {
46 |       floats_5[0] = 1.f;
47 |       floats_5[1] = 2.f;
48 |       floats_5[2] = 3.f;
49 |       floats_5[3] = 4.f;
50 |       floats_5[4] = 5.f;
51 | 
52 |       ints_3[0] = 11;
53 |       ints_3[1] = 22;
54 |       ints_3[2] = 33;
55 |     }
56 |     __syncthreads();
57 |     if (threadIdx.x == 1) {
58 |       float float_sum = 0;
59 |       for (int i = 0; i < 5; ++i) {
60 |         float_sum += floats_5[i];
61 |         floats_5[i] = 0;
62 |       }
63 |       int int_sum = 0;
64 |       for (int i = 0; i < 3; ++i) {
65 |         int_sum += ints_3[i];
66 |         ints_3[i] = 0;
67 |       }
68 | 
69 |       printf("float sum: %f\n", float_sum);
70 |       printf("int sum: %d\n", int_sum);
71 |     }
72 |   }
73 | };
74 | }  // namespace
75 | 
76 | int main() {
77 |   AddSharedMemoryCUDAKernel kernel;
78 |   kernel.Launch();
79 |   ASSERT_CUDA(cudaDeviceSynchronize());
80 |   return 0;
81 | }
82 | 
83 | #endif  // __CUDACC__
84 | 
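// One caveat worth knowing about the SharedMemory helper used above: ref<T>()
// simply bumps a byte offset by num * sizeof(T) and never aligns it. Mixing
// float (4-byte) and int (4-byte) partitions as in this example is safe, but
// requesting, say, a double partition right after 3 ints (offset 12) would
// yield a misaligned pointer. A defensive ordering is to carve out the most
// strictly aligned type first; a sketch of the device-side pattern:
//
//   cuda::SharedMemory shm;
//   double* d = shm.ref<double>(4);  // 8-byte alignment first
//   float* f = shm.ref<float>(5);
//   int* i = shm.ref<int>(3);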
--------------------------------------------------------------------------------
/src/tune.cu.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Authors. All Rights Reserved.
2 | 
3 |  * Licensed under the Apache License, Version 2.0 (the "License");
4 |  * you may not use this file except in compliance with the License.
5 |  * You may obtain a copy of the License at
6 |  *
7 |  *    http://www.apache.org/licenses/LICENSE-2.0
8 |  *
9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, , 2019
16 |  *
17 |  */
18 | 
19 | #include "include/cuda_utils.h"
20 | 
21 | namespace {
22 | 
23 | // Example for a generic 1D kernel.
24 | template <typename ValueT, int BLOCK_DIM_X>
25 | struct ExpertKernel1D : public cuda::Kernel {
26 |   void Launch(cudaStream_t stream = 0) override {
27 |     dim3 block(BLOCK_DIM_X);
28 |     dim3 grid(1);
29 |     cuda::Run<<<grid, block, 0, stream>>>(*this);
30 |   }
31 | 
32 |   __device__ __forceinline__ void operator()() const override {
33 |     printf("thread %d here from the expert-kernel %d [val=%f]\n", threadIdx.x,
34 |            BLOCK_DIM_X, val);
35 |   }
36 | 
37 |   ValueT val = 0;
38 | };
39 | 
40 | // Example for a specialized 1D kernel (4 threads).
41 | template <typename ValueT>
42 | struct ExpertKernel1D<ValueT, 4> : public cuda::Kernel {
43 |   void Launch(cudaStream_t stream = 0) override {
44 |     dim3 block(4);
45 |     dim3 grid(1);
46 |     cuda::Run<<<grid, block, 0, stream>>>(*this);
47 |   }
48 | 
49 |   __device__ __forceinline__ void operator()() const override {
50 |     printf("thread %d here from the special expert-kernel of 4 [val=%f]\n",
51 |            threadIdx.x, val);
52 |   }
53 |   ValueT val = 0;
54 | };
55 | 
56 | // Example for a generic 2D kernel.
57 | template <typename ValueT, int BLOCK_DIM_X, int BLOCK_DIM_Y>
58 | struct ExpertKernel2D : public cuda::Kernel {
59 |   void Launch(cudaStream_t stream = 0) override {
60 |     dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
61 |     dim3 grid(1, 1);
62 |     cuda::Run<<<grid, block, 0, stream>>>(*this);
63 |   }
64 | 
65 |   __device__ __forceinline__ void operator()() const override {
66 |     const int tid = threadIdx.x * BLOCK_DIM_Y + threadIdx.y;
67 |     if (!tid)
68 |       printf("thread %d here from the expert-kernel %d x %d [val=%f]\n", tid,
69 |              BLOCK_DIM_X, BLOCK_DIM_Y, val);
70 |   }
71 |   ValueT val = 0;
72 | };
73 | 
74 | // Example for a specialized 2D kernel (4 x 3 threads).
75 | template <typename ValueT>
76 | struct ExpertKernel2D<ValueT, 4, 3> : public cuda::Kernel {
77 |   void Launch(cudaStream_t stream = 0) override {
78 |     dim3 block(4, 3);
79 |     dim3 grid(1, 1);
80 |     cuda::Run<<<grid, block, 0, stream>>>(*this);
81 |   }
82 | 
83 |   __device__ __forceinline__ void operator()() const override {
84 |     const int tid = threadIdx.x * 3 + threadIdx.y;
85 |     if (!tid)
86 |       printf(
87 |           "thread %d here from the special expert-kernel of 4 x 3 [val=%f]\n",
88 |           tid, val);
89 |   }
90 |   ValueT val = 0;
91 | };
92 | }  // namespace
93 | 
94 | // Workaround to initialize all kernels for the dispatcher.
95 | struct Initializer {
96 |   explicit Initializer(float val) : val(val) {}
97 | 
98 |   template <typename TKernel>
99 |   void operator()(TKernel* kernel) {
100 |     kernel->val = val;
101 |   }
102 | 
103 |   float val;
104 | };
105 | 
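// A sketch of the C++14 alternative to the Initializer functor above (not
// used here, since the project sticks to C++11): a generic lambda would do
// the same job inline, e.g.
//
//   auto init = [](auto* kernel) { kernel->val = 42.f; };
//   disp.Register<ExpertKernel1D<float, 4>>(3, init);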
106 | int main() {
107 |   // We initialize these kernels using the Initializer functor.
108 |   // From C++14 on, we could use a generic lambda instead (see the sketch
109 |   // above), but for now we need this workaround.
110 |   Initializer init(42.f);
111 | 
112 |   // Simple hyper-parameter:
113 |   cuda::KernelDispatcher<int> disp(true);
114 |   disp.Register<ExpertKernel1D<float, 4>>(3, init);
115 |   disp.Register<ExpertKernel1D<float, 8>>(6, init);
116 | 
117 |   for (int i = 0; i < 9; ++i) {
118 |     printf("%d : \n", i);
119 |     disp.Run(i);
120 |     ASSERT_CUDA(cudaDeviceSynchronize());
121 |   }
122 | 
123 |   // Multi-dimensional hyper-parameters:
124 |   cuda::KernelDispatcher<std::tuple<int, int>> disp2(true);
125 |   disp2.Register<ExpertKernel2D<float, 4, 3>>(std::make_tuple(4, 3), init);
126 |   disp2.Register<ExpertKernel2D<float, 8, 4>>(std::make_tuple(9, 4), init);
127 | 
128 |   for (int i = 0; i < 10; ++i) {
129 |     for (int j = 0; j < 5; ++j) {
130 |       printf("i: %d j %d\n", i, j);
131 |       disp2.Run(std::make_tuple(i, j));
132 |       ASSERT_CUDA(cudaDeviceSynchronize());
133 |     }
134 |   }
135 | 
136 |   return 0;
137 | }
138 | 
139 | /*
140 | 0 :
141 | thread 0 here from the special expert-kernel of 4 [val=42.000000]
142 | thread 1 here from the special expert-kernel of 4 [val=42.000000]
143 | thread 2 here from the special expert-kernel of 4 [val=42.000000]
144 | thread 3 here from the special expert-kernel of 4 [val=42.000000]
145 | 1 :
146 | thread 0 here from the special expert-kernel of 4 [val=42.000000]
147 | thread 1 here from the special expert-kernel of 4 [val=42.000000]
148 | thread 2 here from the special expert-kernel of 4 [val=42.000000]
149 | thread 3 here from the special expert-kernel of 4 [val=42.000000]
150 | 2 :
151 | thread 0 here from the special expert-kernel of 4 [val=42.000000]
152 | thread 1 here from the special expert-kernel of 4 [val=42.000000]
153 | thread 2 here from the special expert-kernel of 4 [val=42.000000]
154 | thread 3 here from the special expert-kernel of 4 [val=42.000000]
155 | 3 :
156 | thread 0 here from the special expert-kernel of 4 [val=42.000000]
157 | thread 1 here from the special expert-kernel of 4 [val=42.000000]
158 | thread 2 here from the special expert-kernel of 4 [val=42.000000]
159 | thread 3 here from the special expert-kernel of 4 [val=42.000000]
160 | 4 :
161 | thread 0 here from the expert-kernel 8 [val=42.000000]
162 | thread 1 here from the expert-kernel 8 [val=42.000000]
163 | thread 2 here from the expert-kernel 8 [val=42.000000]
164 | thread 3 here from the expert-kernel 8 [val=42.000000]
165 | thread 4 here from the expert-kernel 8 [val=42.000000]
166 | thread 5 here from the expert-kernel 8 [val=42.000000]
167 | thread 6 here from the expert-kernel 8 [val=42.000000]
168 | thread 7 here from the expert-kernel 8 [val=42.000000]
169 | 5 :
170 | thread 0 here from the expert-kernel 8 [val=42.000000]
171 | thread 1 here from the expert-kernel 8 [val=42.000000]
172 | thread 2 here from the expert-kernel 8 [val=42.000000]
173 | thread 3 here from the expert-kernel 8 [val=42.000000]
174 | thread 4 here from the expert-kernel 8 [val=42.000000]
175 | thread 5 here from the expert-kernel 8 [val=42.000000]
176 | thread 6 here from the expert-kernel 8 [val=42.000000]
177 | thread 7 here from the expert-kernel 8 [val=42.000000]
178 | 6 :
179 | thread 0 here from the expert-kernel 8 [val=42.000000]
180 | thread 1 here from the expert-kernel 8 [val=42.000000]
181 | thread 2 here from the expert-kernel 8 [val=42.000000]
182 | thread 3 here from the expert-kernel 8 [val=42.000000]
183 | thread 4 here from the expert-kernel 8 [val=42.000000]
184 | thread 5 here from the expert-kernel 8 [val=42.000000]
185 | thread 6 here from the expert-kernel 8 [val=42.000000]
186 | thread 7 here from the expert-kernel 8 [val=42.000000]
187 | 7 :
188 | thread 0 here from the expert-kernel 8 [val=42.000000]
189 | thread 1 here from
the expert-kernel 8 [val=42.000000] 190 | thread 2 here from the expert-kernel 8 [val=42.000000] 191 | thread 3 here from the expert-kernel 8 [val=42.000000] 192 | thread 4 here from the expert-kernel 8 [val=42.000000] 193 | thread 5 here from the expert-kernel 8 [val=42.000000] 194 | thread 6 here from the expert-kernel 8 [val=42.000000] 195 | thread 7 here from the expert-kernel 8 [val=42.000000] 196 | 8 : 197 | thread 0 here from the expert-kernel 8 [val=42.000000] 198 | thread 1 here from the expert-kernel 8 [val=42.000000] 199 | thread 2 here from the expert-kernel 8 [val=42.000000] 200 | thread 3 here from the expert-kernel 8 [val=42.000000] 201 | thread 4 here from the expert-kernel 8 [val=42.000000] 202 | thread 5 here from the expert-kernel 8 [val=42.000000] 203 | thread 6 here from the expert-kernel 8 [val=42.000000] 204 | thread 7 here from the expert-kernel 8 [val=42.000000] 205 | i: 0 j 0 206 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 207 | i: 0 j 1 208 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 209 | i: 0 j 2 210 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 211 | i: 0 j 3 212 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 213 | i: 0 j 4 214 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 215 | i: 1 j 0 216 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 217 | i: 1 j 1 218 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 219 | i: 1 j 2 220 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 221 | i: 1 j 3 222 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 223 | i: 1 j 4 224 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 225 | i: 2 j 0 226 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 227 | i: 2 j 1 228 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 229 | i: 2 j 2 230 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 231 | i: 2 j 3 232 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 233 | i: 2 j 4 234 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 235 | i: 3 j 0 236 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 237 | i: 3 j 1 238 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 239 | i: 3 j 2 240 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 241 | i: 3 j 3 242 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 243 | i: 3 j 4 244 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 245 | i: 4 j 0 246 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 247 | i: 4 j 1 248 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 249 | i: 4 j 2 250 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 251 | i: 4 j 3 252 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000] 253 | i: 4 j 4 254 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 255 | i: 5 j 0 256 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 257 | i: 5 j 1 258 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 259 | i: 5 j 2 260 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 261 | i: 5 j 3 262 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 263 | i: 5 j 4 264 | thread 0 here from the expert-kernel 8 x 4 
[val=42.000000] 265 | i: 6 j 0 266 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 267 | i: 6 j 1 268 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 269 | i: 6 j 2 270 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 271 | i: 6 j 3 272 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 273 | i: 6 j 4 274 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 275 | i: 7 j 0 276 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 277 | i: 7 j 1 278 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 279 | i: 7 j 2 280 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 281 | i: 7 j 3 282 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 283 | i: 7 j 4 284 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 285 | i: 8 j 0 286 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 287 | i: 8 j 1 288 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 289 | i: 8 j 2 290 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 291 | i: 8 j 3 292 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 293 | i: 8 j 4 294 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 295 | i: 9 j 0 296 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 297 | i: 9 j 1 298 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 299 | i: 9 j 2 300 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 301 | i: 9 j 3 302 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 303 | i: 9 j 4 304 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000] 305 | */ 306 | -------------------------------------------------------------------------------- /test/test_multiply.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Authors. All Rights Reserved. 2 | 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 14 | * 15 | * Author: Patrick Wieschollek, , 2019 16 | * 17 | */ 18 | 19 | #include "include/test/multiply.h" 20 | 21 | #include "gmock/gmock.h" 22 | #include "gtest/gtest.h" 23 | #include "include/cuda_utils.h" 24 | #include "include/multiply/multiply.h" 25 | 26 | namespace { 27 | 28 | TEST(MultiplyTest, ExtraCpuTest) { EXPECT_TRUE(true); } 29 | 30 | using Devices = ::testing::Types; 31 | INSTANTIATE_TYPED_TEST_SUITE_P(Example, MultiplyTest, Devices); 32 | 33 | } // namespace 34 | 35 | int main(int argc, char **argv) { 36 | ::testing::InitGoogleMock(&argc, argv); 37 | return RUN_ALL_TESTS(); 38 | } 39 | -------------------------------------------------------------------------------- /test/test_multiply.cu.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Authors. All Rights Reserved. 2 | 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 
5 |  * You may obtain a copy of the License at
6 |  *
7 |  *    http://www.apache.org/licenses/LICENSE-2.0
8 |  *
9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, , 2019
16 |  *
17 |  */
18 | 
19 | #include "include/test/multiply.h"
20 | 
21 | #include "gmock/gmock.h"
22 | #include "gtest/gtest.h"
23 | #include "include/cuda_utils.h"
24 | #include "include/multiply/multiply.h"
25 | 
26 | namespace {
27 | 
28 | TEST(MultiplyTest, ExtraGpuTest) { EXPECT_TRUE(true); }
29 | 
30 | TEST(MultiplyTest, GpuMatchCpu) {
31 |   constexpr int M = 50;
32 |   float *A = new float[M * M];
33 |   float *B = new float[M * M];
34 |   float *expected = new float[M * M];
35 |   float *actual = new float[M * M];
36 | 
37 |   for (int i = 0; i < M * M; ++i) {
38 |     A[i] = i;
39 |     B[i] = i - 5;
40 |     expected[i] = 0;
41 |     actual[i] = 0;
42 |   }
43 | 
44 |   Multiply<float, CpuDevice>::Apply(A, B, M, M, expected);
45 |   Multiply<float, GpuDevice>::Apply(A, B, M, M, actual);
46 | 
47 |   for (int i = 0; i < M * M; ++i) {
48 |     EXPECT_NEAR(expected[i], actual[i], 1e-8);
49 |   }
50 | }
51 | 
52 | using Devices = ::testing::Types<GpuDevice>;
53 | INSTANTIATE_TYPED_TEST_SUITE_P(Example, MultiplyTest, Devices);
54 | 
55 | }  // namespace
56 | 
57 | int main(int argc, char **argv) {
58 |   ::testing::InitGoogleMock(&argc, argv);
59 |   return RUN_ALL_TESTS();
60 | }
61 | 
--------------------------------------------------------------------------------