├── .ci
    ├── check.sh
    ├── clang.Dockerfile
    ├── cpplint.Dockerfile
    ├── cuda.Dockerfile
    ├── cuda8.Dockerfile
    └── makefile
├── .clang-format
├── .drone.script
├── .drone.yml
├── .gitignore
├── BUILD
├── CMakeLists.txt
├── README.md
├── WORKSPACE
├── cmake
    └── CudaHelper.cmake
├── cuda.bzl
├── gtest.BUILD
├── include
    ├── cuda_benchmark.h
    ├── cuda_index.h
    ├── cuda_utils.h
    ├── multiply
    │   └── multiply.h
    └── test
    │   └── multiply.h
├── src
    ├── benchmark-multiply.cu.cc
    ├── deprecated_examples.cu_old
    ├── multiply.cc
    ├── multiply
    │   ├── multiply.cc
    │   ├── multiply_cpu.cc
    │   └── multiply_gpu.cu.cc
    ├── sharedmemory.cu.cc
    └── tune.cu.cc
└── test
    ├── test_multiply.cc
    └── test_multiply.cu.cc


/.ci/check.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Patrick Wieschollek <mail@patwie.com>
 3 | 
 4 | RETURN=0
 5 | FILES=`find . -type f -name "*" | grep -E "\.(cc|h|cu)$"`
 6 | for FILE in $FILES; do
 7 |     echo -ne "check file ${FILE}"
 8 |     clang-format-6.0 $FILE -style=file | cmp $FILE >/dev/null
 9 |     if [ $? -ne 0 ]; then
10 |       echo " ... failed"
11 |       echo "[!] INCORRECT FORMATTING! $FILE" >&2
12 |       echo $FILE
13 |       diff -u <(cat $FILE) <(clang-format-6.0 ${FILE} -style=file)
14 |       # diff -u < (cat ${FILE}) < (clang-format ${FILE})
15 |       RETURN=1
16 |     else
17 |       echo " ... ok"
18 |     fi
19 | done
20 | exit $RETURN


--------------------------------------------------------------------------------
/.ci/clang.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 | RUN apt-get update && apt-get install clang-format-6.0 -y


--------------------------------------------------------------------------------
/.ci/cpplint.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:alpine
2 | RUN pip3 install cpplint


--------------------------------------------------------------------------------
/.ci/cuda.Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG VERCUDA
 2 | FROM nvidia/cuda:${VERCUDA}-runtime
 3 | ARG VERCUDA
 4 | RUN apt-get update && apt-get install -y --no-install-recommends \
 5 |   cuda-libraries-dev-$CUDA_PKG_VERSION \
 6 |   cuda-nvml-dev-$CUDA_PKG_VERSION \
 7 |   cuda-minimal-build-$CUDA_PKG_VERSION \
 8 |   cuda-command-line-tools-$CUDA_PKG_VERSION \
 9 |   cmake \
10 |   libnccl-dev=$NCCL_VERSION-1+cuda${VERCUDA} \
11 |   xz-utils \
12 |   build-essential \
13 |   libgtest-dev \
14 |   curl \
15 |   unzip
16 | 
17 | RUN mkdir /google && cd /google && \
18 |   curl https://github.com/google/googletest/archive/master.zip -O -J -L && \
19 |   unzip googletest-master.zip  && \
20 |   mv googletest-master src  && \
21 |   rm googletest-master.zip  && \
22 |   mkdir build && \
23 |   mkdir dist && \
24 |   cd build && \
25 |   cmake ../src -DCMAKE_INSTALL_PREFIX=/google/dist && \
26 |   make install
27 | 
28 | ENV GTEST_ROOT /google/dist
29 | ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
30 | 


--------------------------------------------------------------------------------
/.ci/cuda8.Dockerfile:
--------------------------------------------------------------------------------
 1 | # adapted from https://gitlab.com/nvidia/cuda/blob/ubuntu16.04/8.0/runtime/Dockerfile
 2 | ARG VERCUDA
 3 | FROM ubuntu:16.04
 4 | ARG VERCUDA
 5 | 
 6 | RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
 7 |     rm -rf /var/lib/apt/lists/* && \
 8 |     NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
 9 |     NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \
10 |     apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
11 |     apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \
12 |     echo "$NVIDIA_GPGKEY_SUM  cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
13 |     echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list
14 | 
15 | ENV CUDA_VERSION 8.0.61
16 | 
17 | RUN apt-get update && apt-get install -y --no-install-recommends \
18 |   cmake \
19 |   xz-utils \
20 |   build-essential \
21 |   libgtest-dev \
22 |   curl \
23 |   unzip
24 | 
25 | ENV CUDA_PKG_VERSION 8-0=$CUDA_VERSION-1
26 | RUN apt-get update && apt-get install -y --no-install-recommends \
27 |         cuda-nvrtc-$CUDA_PKG_VERSION \
28 |         cuda-nvgraph-$CUDA_PKG_VERSION \
29 |         cuda-cusolver-$CUDA_PKG_VERSION \
30 |         cuda-cublas-8-0=8.0.61.2-1 \
31 |         cuda-cufft-$CUDA_PKG_VERSION \
32 |         cuda-curand-$CUDA_PKG_VERSION \
33 |         cuda-cusparse-$CUDA_PKG_VERSION \
34 |         cuda-npp-$CUDA_PKG_VERSION \
35 |         cuda-cudart-$CUDA_PKG_VERSION && \
36 |     ln -s cuda-8.0 /usr/local/cuda
37 | 
38 | RUN apt-get update && apt-get install -y --no-install-recommends \
39 |         cuda-core-$CUDA_PKG_VERSION \
40 |         cuda-misc-headers-$CUDA_PKG_VERSION \
41 |         cuda-command-line-tools-$CUDA_PKG_VERSION \
42 |         cuda-nvrtc-dev-$CUDA_PKG_VERSION \
43 |         cuda-nvml-dev-$CUDA_PKG_VERSION \
44 |         cuda-nvgraph-dev-$CUDA_PKG_VERSION \
45 |         cuda-cusolver-dev-$CUDA_PKG_VERSION \
46 |         cuda-cublas-dev-8-0=8.0.61.2-1 \
47 |         cuda-cufft-dev-$CUDA_PKG_VERSION \
48 |         cuda-curand-dev-$CUDA_PKG_VERSION \
49 |         cuda-cusparse-dev-$CUDA_PKG_VERSION \
50 |         cuda-npp-dev-$CUDA_PKG_VERSION \
51 |         cuda-cudart-dev-$CUDA_PKG_VERSION \
52 |         cuda-driver-dev-$CUDA_PKG_VERSION && \
53 |     rm -rf /var/lib/apt/lists/*
54 | 
55 | 
56 | # nvidia-docker 1.0
57 | LABEL com.nvidia.volumes.needed="nvidia_driver"
58 | LABEL com.nvidia.cuda.version="${CUDA_VERSION}"
59 | 
60 | RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
61 |     echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
62 | 
63 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
64 | ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
65 | 
66 | # nvidia-container-runtime
67 | ENV NVIDIA_VISIBLE_DEVICES all
68 | ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
69 | ENV NVIDIA_REQUIRE_CUDA "cuda>=8.0"
70 | 
71 | 
72 | RUN mkdir /google && cd /google && \
73 |   curl https://github.com/google/googletest/archive/master.zip -O -J -L && \
74 |   unzip googletest-master.zip  && \
75 |   mv googletest-master src  && \
76 |   rm googletest-master.zip  && \
77 |   mkdir build && \
78 |   mkdir dist && \
79 |   cd build && \
80 |   cmake ../src -DCMAKE_INSTALL_PREFIX=/google/dist && \
81 |   make install
82 | 
83 | ENV GTEST_ROOT /google/dist
84 | ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
85 | 


--------------------------------------------------------------------------------
/.ci/makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: build push
 2 | 
 3 | build:
 4 | 	docker build --build-arg VERCUDA=10.1 -t patwie/cuda:10.1 -f cuda.Dockerfile .
 5 | 	docker build --build-arg VERCUDA=10.0 -t patwie/cuda:10.0 -f cuda.Dockerfile .
 6 | 	docker build --build-arg VERCUDA=9.2 -t patwie/cuda:9.2 -f cuda.Dockerfile .
 7 | 	docker build --build-arg VERCUDA=9.1 -t patwie/cuda:9.1 -f cuda.Dockerfile .
 8 | 	docker build --build-arg VERCUDA=9.0 -t patwie/cuda:9.0 -f cuda.Dockerfile .
 9 | 	docker build --build-arg VERCUDA=8.0 -t patwie/cuda:8.0 -f cuda8.Dockerfile .
10 | 	docker build -t patwie/cpplint -f cpplint.Dockerfile .
11 | 	docker build -t patwie/clang-format -f clang.Dockerfile .
12 | 
13 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
1 | ﻿---
2 | BasedOnStyle: Google
3 | 


--------------------------------------------------------------------------------
/.drone.script:
--------------------------------------------------------------------------------
 1 | def main():
 2 |   return [
 3 |       quality_pipeline(),
 4 |       build_pipeline("8.0"),
 5 |       build_pipeline("9.0"),
 6 |       build_pipeline("9.1"),
 7 |       build_pipeline("9.2"),
 8 |       build_pipeline("10.0"),
 9 |       build_pipeline("10.1"),
10 |   ]
11 | 
12 | 
13 | def quality_pipeline():
14 |   return {
15 |       'kind': 'pipeline',
16 |       'name': 'quality',
17 |       'platform': {
18 |           'os': "linux",
19 |           'arch': 'amd64',
20 |       },
21 |       'steps': [
22 |           {
23 |               'name': 'format',
24 |               'pull': 'never',
25 |               'image': 'patwie/clang-format:latest',
26 |               'commands': [
27 |                   './.ci/check.sh',
28 |               ],
29 |           },
30 |           {
31 |               'name': 'lint',
32 |               'pull': 'never',
33 |               'image': 'patwie/cpplint:latest',
34 |               'commands': [
35 |                   'cpplint --recursive .',
36 |               ],
37 |           },
38 |       ],
39 |   }
40 | 
41 | 
42 | def build_pipeline(cuda_version):
43 |   return {
44 |       'kind': 'pipeline',
45 |       'name': 'CUDA %s' % cuda_version,
46 |       'platform': {
47 |           'os': "linux",
48 |           'arch': 'amd64',
49 |       },
50 |       'steps': [
51 |           {
52 |               'name': 'build',
53 |               'pull': 'never',
54 |               'image': 'patwie/cuda:%s' % cuda_version,
55 |               'commands': [
56 |                   'mkdir build',
57 |                   'cd build',
58 |                   'cmake ..',
59 |                   'make',
60 |               ],
61 |           },
62 |           {
63 |               'name': 'test',
64 |               'pull': 'never',
65 |               'image': 'patwie/cuda:%s' % cuda_version,
66 |               'commands': [
67 |                   './build/test_cpu',
68 |               ],
69 |           },
70 |       ],
71 |       'depends_on': ['quality']
72 |   }
73 | 


--------------------------------------------------------------------------------
/.drone.yml:
--------------------------------------------------------------------------------
  1 | ---
  2 | kind: pipeline
  3 | name: quality
  4 | 
  5 | platform:
  6 |   os: linux
  7 |   arch: amd64
  8 | 
  9 | steps:
 10 | - name: format
 11 |   pull: never
 12 |   image: patwie/clang-format:latest
 13 |   commands:
 14 |   - ./.ci/check.sh
 15 | 
 16 | - name: lint
 17 |   pull: never
 18 |   image: patwie/cpplint:latest
 19 |   commands:
 20 |   - cpplint --recursive .
 21 | 
 22 | ---
 23 | kind: pipeline
 24 | name: CUDA 8.0
 25 | 
 26 | platform:
 27 |   os: linux
 28 |   arch: amd64
 29 | 
 30 | steps:
 31 | - name: build
 32 |   pull: never
 33 |   image: patwie/cuda:8.0
 34 |   commands:
 35 |   - mkdir build
 36 |   - cd build
 37 |   - cmake ..
 38 |   - make
 39 | 
 40 | - name: test
 41 |   pull: never
 42 |   image: patwie/cuda:8.0
 43 |   commands:
 44 |   - ./build/test_cpu
 45 | 
 46 | depends_on:
 47 | - quality
 48 | 
 49 | ---
 50 | kind: pipeline
 51 | name: CUDA 9.0
 52 | 
 53 | platform:
 54 |   os: linux
 55 |   arch: amd64
 56 | 
 57 | steps:
 58 | - name: build
 59 |   pull: never
 60 |   image: patwie/cuda:9.0
 61 |   commands:
 62 |   - mkdir build
 63 |   - cd build
 64 |   - cmake ..
 65 |   - make
 66 | 
 67 | - name: test
 68 |   pull: never
 69 |   image: patwie/cuda:9.0
 70 |   commands:
 71 |   - ./build/test_cpu
 72 | 
 73 | depends_on:
 74 | - quality
 75 | 
 76 | ---
 77 | kind: pipeline
 78 | name: CUDA 9.1
 79 | 
 80 | platform:
 81 |   os: linux
 82 |   arch: amd64
 83 | 
 84 | steps:
 85 | - name: build
 86 |   pull: never
 87 |   image: patwie/cuda:9.1
 88 |   commands:
 89 |   - mkdir build
 90 |   - cd build
 91 |   - cmake ..
 92 |   - make
 93 | 
 94 | - name: test
 95 |   pull: never
 96 |   image: patwie/cuda:9.1
 97 |   commands:
 98 |   - ./build/test_cpu
 99 | 
100 | depends_on:
101 | - quality
102 | 
103 | ---
104 | kind: pipeline
105 | name: CUDA 9.2
106 | 
107 | platform:
108 |   os: linux
109 |   arch: amd64
110 | 
111 | steps:
112 | - name: build
113 |   pull: never
114 |   image: patwie/cuda:9.2
115 |   commands:
116 |   - mkdir build
117 |   - cd build
118 |   - cmake ..
119 |   - make
120 | 
121 | - name: test
122 |   pull: never
123 |   image: patwie/cuda:9.2
124 |   commands:
125 |   - ./build/test_cpu
126 | 
127 | depends_on:
128 | - quality
129 | 
130 | ---
131 | kind: pipeline
132 | name: CUDA 10.0
133 | 
134 | platform:
135 |   os: linux
136 |   arch: amd64
137 | 
138 | steps:
139 | - name: build
140 |   pull: never
141 |   image: patwie/cuda:10.0
142 |   commands:
143 |   - mkdir build
144 |   - cd build
145 |   - cmake ..
146 |   - make
147 | 
148 | - name: test
149 |   pull: never
150 |   image: patwie/cuda:10.0
151 |   commands:
152 |   - ./build/test_cpu
153 | 
154 | depends_on:
155 | - quality
156 | 
157 | ---
158 | kind: pipeline
159 | name: CUDA 10.1
160 | 
161 | platform:
162 |   os: linux
163 |   arch: amd64
164 | 
165 | steps:
166 | - name: build
167 |   pull: never
168 |   image: patwie/cuda:10.1
169 |   commands:
170 |   - mkdir build
171 |   - cd build
172 |   - cmake ..
173 |   - make
174 | 
175 | - name: test
176 |   pull: never
177 |   image: patwie/cuda:10.1
178 |   commands:
179 |   - ./build/test_cpu
180 | 
181 | depends_on:
182 | - quality
183 | 
184 | ---
185 | kind: signature
186 | hmac: 7d2643e4c55153be2ec8b9abff2231bd4b10f0be12de2308e98025346504d8c6
187 | 
188 | ...
189 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .local/
 2 | 
 3 | ### C++ ###
 4 | # Prerequisites
 5 | *.d
 6 | 
 7 | # Compiled Object files
 8 | *.slo
 9 | *.lo
10 | *.o
11 | *.obj
12 | 
13 | # Precompiled Headers
14 | *.gch
15 | *.pch
16 | 
17 | # Compiled Dynamic libraries
18 | *.so
19 | *.dylib
20 | *.dll
21 | 
22 | # Fortran module files
23 | *.mod
24 | *.smod
25 | 
26 | # Compiled Static libraries
27 | *.lai
28 | *.la
29 | *.a
30 | *.lib
31 | 
32 | # Executables
33 | *.exe
34 | *.out
35 | *.app
36 | 
37 | ### CMake ###
38 | CMakeLists.txt.user
39 | CMakeCache.txt
40 | CMakeFiles
41 | CMakeScripts
42 | Testing
43 | Makefile
44 | cmake_install.cmake
45 | install_manifest.txt
46 | compile_commands.json
47 | CTestTestfile.cmake
48 | _deps
49 | 
50 | ### CMake Patch ###
51 | # External projects
52 | *-prefix/
53 | 
54 | 
55 | build/
56 | ### Bazel ###
57 | # gitignore template for Bazel build system
58 | # website: https://bazel.build/
59 | 
60 | # Ignore all bazel-* symlinks. There is no full list since this can change
61 | # based on the name of the directory bazel is cloned into.
62 | /bazel-*
63 | 


--------------------------------------------------------------------------------
/BUILD:
--------------------------------------------------------------------------------
 1 | package(default_visibility=["//visibility:public"])
 2 | load("//:cuda.bzl", "cuda_binary")
 3 | 
 4 | cc_library(
 5 |     name="multiply",
 6 |     srcs=[
 7 |         "src/multiply/multiply.cc",
 8 |         "src/multiply/multiply_cpu.cc"
 9 |     ],
10 |     hdrs=[
11 |         "include/multiply/multiply.h",
12 |         "include/cuda_utils.h"
13 |     ],
14 | )
15 | 
16 | # TODO(patwie): typed tests fail
17 | # cc_test(
18 | #     name="multiply-test",
19 | #     srcs=[
20 | #         "test/test_multiply_impl.h",
21 | #         "test/test_multiply.cc",
22 | #     ],
23 | #     copts=[
24 | #         "-Iexternal/gtest/include",
25 | #         "-Iexternal/gmock/include"
26 | #     ],
27 | #     deps=[
28 | #         ":multiply",
29 | #         "@gtest//:gtest",
30 | #     ]
31 | # )
32 | 
33 | cc_binary(
34 |     name="multiply-example",
35 |     srcs=[
36 |         "src/multiply.cc"
37 |     ],
38 |     deps=[
39 |         ":multiply",
40 |     ],
41 | )
42 | 
43 | cuda_binary(
44 |     name="sharedmemory-example",
45 |     includes=[
46 |         "/usr/local/cuda/include",
47 |         "."
48 |     ],
49 |     hdrs=[
50 |         "include/cuda_utils.h",
51 |         "include/cuda_index.h",
52 |         "include/test/multiply.h",
53 |     ],
54 |     flags="-std=c++11",
55 |     srcs=[
56 |         "src/sharedmemory.cu.cc"
57 |     ],
58 | )
59 | 
60 | cuda_binary(
61 |     name="tune-example",
62 |     includes=[
63 |         "/usr/local/cuda/include",
64 |         "."
65 |     ],
66 |     hdrs=[
67 |         "include/cuda_utils.h",
68 |         "include/test/multiply.h",
69 |     ],
70 |     flags="-std=c++11",
71 |     srcs=[
72 |         "src/tune.cu.cc"
73 |     ],
74 | 
75 | )
76 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | CMAKE_MINIMUM_REQUIRED( VERSION 2.8 )
 2 | project(example)
 3 | 
 4 | set(CMAKE_CXX_STANDARD 11)
 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
 6 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Ofast -Wall -Wextra ")
 7 | enable_testing()
 8 | 
 9 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
10 | 
11 | find_package(GTest REQUIRED)
12 | include_directories(${GTEST_INCLUDE_DIRS})
13 | 
14 | find_package(CUDA)
15 | include(CudaHelper)
16 | 
17 | include_directories(${CMAKE_CURRENT_SOURCE_DIR})
18 | 
19 | add_library(multiply_cpu SHARED src/multiply/multiply_cpu.cc src/multiply/multiply.cc )
20 | list(APPEND LIBRARIES "multiply_cpu")
21 | 
22 | if(CUDA_FOUND)
23 |   cuda_add_library(multiply_gpu SHARED src/multiply/multiply_gpu.cu.cc src/multiply/multiply.cc )
24 |   list(APPEND LIBRARIES "multiply_gpu")
25 | 
26 |   cuda_add_executable(sharedmemory src/sharedmemory.cu.cc )
27 |   cuda_add_executable(tune src/tune.cu.cc )
28 | endif(CUDA_FOUND)
29 | 
30 | add_executable(multiply src/multiply.cc )
31 | target_link_libraries(multiply LINK_PUBLIC ${LIBRARIES})
32 | 
33 | # Benchmark
34 | if(CUDA_FOUND)
35 |   cuda_add_executable(benchmark
36 |     src/benchmark-multiply.cu.cc
37 |     src/multiply/multiply_gpu.cu.cc
38 |     src/multiply/multiply.cc
39 |   )
40 | endif(CUDA_FOUND)
41 | 
42 | # TESTS
43 | # TODO(): find a more robust way to link gmock, see docker file for setup
44 | # This currently assumes gmock.a is next to gtest.a
45 | link_directories($ENV{GTEST_ROOT}/lib)
46 | add_executable(test_cpu test/test_multiply.cc)
47 | target_link_libraries(test_cpu ${GTEST_LIBRARIES} gmock ${LIBRARIES} pthread)
48 | 
49 | add_test(TestCpu test_cpu)
50 | 
51 | if(TEST_CUDA)
52 |   cuda_add_executable(test_gpu test/test_multiply.cu.cc)
53 |   target_link_libraries(test_gpu ${GTEST_LIBRARIES} gmock ${LIBRARIES} pthread)
54 |   add_test(TestGpu test_gpu)
55 | endif(TEST_CUDA)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # CUDA Design Patterns
  2 | 
  3 | Some best practises I collected over the last years when writing CUDA kernels. These functions
  4 | do not dictate how to use CUDA, these just simplify your workflow. I am not a big fan of libraries which rename things via wrappers. All code below does add additional benefits in CUDA programming.
  5 | 
  6 | ## CUDA Boilerplate Code
  7 | 
  8 | [EXAMPLE](./src/multiply/multiply_gpu.cu.cc)
  9 | 
 10 | **Description:**
 11 | Avoid plain a CUDA kernel functions and instead pack them into a struct.
 12 | 
 13 | 
 14 | ```cpp
 15 | template <typename ValueT>
 16 | struct MyKernel : public cuda::Kernel {
 17 |   void Launch(cudaStream_t stream = 0) {
 18 |     cuda::Run<<<1, 1, 0, stream>>>(*this);
 19 |   }
 20 |   __device__ __forceinline__ void operator()() const override {
 21 |     printf("hi from device code with value %f\n", val);
 22 |   }
 23 | 
 24 |   ValueT val;
 25 | };
 26 | 
 27 | MyKernel<float, 32> kernel;
 28 | kernel.val = 42.f;
 29 | kernel.Launch();
 30 | ```
 31 | 
 32 | **Reasons:**
 33 | 
 34 | - This allows much better organization of used parameters. We recommend
 35 | to write them at the end of the struct, such that when writing the CUDA kernel itself
 36 | they are always visible.
 37 | - These structs can contain or compute the launch configuration (grid, block, shm size) depending on the parameters.
 38 | - Multiple kernel launches require less code, as we do not need to type out all parameters over and over again for a second or third launch.
 39 | 
 40 | 
 41 | ## Functors
 42 | 
 43 | [EXAMPLE](./src/multiply.cc)
 44 | 
 45 | **Description:**
 46 | Use templated `structs` to switch seemlessly between CPU and GPU code:
 47 | 
 48 | ```cpp
 49 | Multiply<float, CpuDevice>::Apply(A, B, 2, 2, C); // run CPU
 50 | Multiply<float, GpuDevice>::Apply(A, B, 2, 2, C); // run GPU
 51 | Multiply<float>::Apply(A, B, 2, 2, C); // run GPU if available else on CPU
 52 | ```
 53 | 
 54 | **Reasons:**
 55 | 
 56 | - Switching between different devices is straight-forward.
 57 | - Understanding unit-tests which compare and verify the output becomes more easy.
 58 | 
 59 | ## Shared Memory
 60 | 
 61 | [EXAMPLE](./src/sharedmemory.cu.cc)
 62 | 
 63 | Use
 64 | 
 65 | ```cpp
 66 | cuda::SharedMemory shm;
 67 | float* floats_5 = shm.ref<float>(5);
 68 | int* ints_3 = shm.ref<int>(3);
 69 | ```
 70 | 
 71 | instead of
 72 | 
 73 | ```cpp
 74 | extern __shared__ char* shm[];
 75 | float* val1 = reinterpret_cast<float*>(&shm[0]); // 5 floats
 76 | int* val2 = reinterpret_cast<int*>(&shm[5]); // 3 ints
 77 | ```
 78 | 
 79 | 
 80 | **Reasons:**
 81 | 
 82 | - The number of values of specific data types to read should be on the same line as the declaration. This way adding additional shared memory becomes easier during development.
 83 | 
 84 | ## CUDA Kernel Dispatcher
 85 | 
 86 | [EXAMPLE](./src/tune.cu.cc)
 87 | 
 88 | Like in the *CUDA Boilerplate Code* example we pack our kernels into structs. For different hyper-parameters we use template specialization.
 89 | 
 90 | Given a generic CUDA kernel and a specialization
 91 | 
 92 | ```cpp
 93 | template <typename ValueT, int BLOCK_DIM_X>
 94 | struct MyKernel : public cuda::Kernel {}
 95 | 
 96 | template <typename ValueT>
 97 | struct MyKernel<ValueT, 4> : public cuda::Kernel {}
 98 | ```
 99 | 
100 | we use the kernel dispatcher
101 | 
102 | ```cpp
103 | MyKernel<float, 4> kernelA;
104 | MyKernel<float, 8> kernelB;
105 | 
106 | cuda::KernelDispatcher<int> dispatcher(true);
107 | dispatcher.Register<MyKernel<float, 4>>(3); // for length up to 3 (inclusive) start MyKernel<float, 4>
108 | dispatcher.Register<MyKernel<float, 8>>(6); // for length up to 6 (inclusive) start MyKernel<float, 8>
109 |                                             // as `dispatcher(true)` this kernel will handle all
110 |                                             // larger values as well
111 | int i = 4;         // a runtime value
112 | dispatcher.Run(i); // triggers `kernelB`
113 | ```
114 | 
115 | The dispatcher can also handle multi-dim values and a initializer
116 | 
117 | ```cpp
118 | struct Initializer {
119 |   template <typename T>
120 |   void operator()(T* el) {
121 |     el->val = 42.f;
122 |   }
123 | };
124 | Initializer init;
125 | cuda::KernelDispatcher<std::tuple<int, int>> disp(true);
126 | disp.Register<ExpertKernel2D<float, 4, 3>>(std::make_tuple(4, 3), init);
127 | disp.Register<ExpertKernel2D<float, 8, 4>>(std::make_tuple(9, 4), init);
128 | ```
129 | 
130 | **Reasons:**
131 | 
132 | - Changing the block-dims will have performance impact. A templated CUDA kernel can execute special implementations for different hyper-parameters.
133 | - A switch-statement dispatching run-time variables into a templated instantiation requires code-duplication, which can be avoid by the dispatcher.
134 | 
135 | ## CUDA Index Calculation
136 | 
137 | [EXAMPLE](./src/deprecated_examples.cu_old)
138 | 
139 | Do not compute indicies by hand when appropriate and use
140 | 
141 | ```cpp
142 | // or even ...
143 | // Used 8 registers, 368 bytes cmem[0]
144 | __global__ void readme_alternative2(float *src, float *dst,
145 |                                     int B, int H, int W, int C,
146 |                                     int b, int h, int w, int c) {
147 |   auto src_T = NdArray(src, B, H, W, C);
148 |   auto dst_T = NdArray(dst, B, H, W, C);
149 |   dst_T(b, h, w, c + 1) = src_T(b, h, w, c);
150 | 
151 |   // Unflatten the index.
152 |   auto index = NdIndex<4>(B, H, W, C);
153 |   size_t flattened_index = index(b, h, w, c);
154 | 
155 |   int b_=0, h_=0, w_=0, c_=0;
156 |   index.unflatten(flattened_index, b_, h_, w_, c_);
157 | }
158 | ```
159 | 
160 | instead of
161 | 
162 | ```cpp
163 | // spot the bug
164 | // Used 6 registers, 368 bytes cmem[0]
165 | __global__ void readme_normal(float *src, float *dst,
166 |                               int B, int H, int W, int C,
167 |                               int b, int h, int w, int c) {
168 |   const int pos1 = b * (H * W * C) + h * (W * c) + w * (C) + c;
169 |   const int pos2 = b * (H * W * C) + h * (W * C) + w * (C) + (c + 1);
170 |   dst[pos2] = src[pos1];
171 | }
172 | ```
173 | 
174 | **Reasons**:
175 | 
176 | - It is time-consuming and not worthwhile to concern yourself with index calculations. When writing CUDA code, you usually have many other vital things to ponder.
177 | - Each additional character increases the hit rate for a bug!
178 | - **I'm sick and tired of manually typing the indices.**
179 | - NdArray can have a positive impact on the number of used registers.
180 | 
181 | **Cons:**
182 | 
183 | - The compiler might not be able to optimize the `NdArray` overhead "away".
184 | - NdArray can have a negative impact on the number of used registers.
185 | 
186 | ## CMake Setup
187 | 
188 | **Description:**
189 | Use CMake to configure which targets should be build. By default set `TEST_CUDA=ON` and `WITH_CUDA=OFF`.
190 | The workflow (for this repository) is:
191 | 
192 | ```bash
193 | mkdir build && cd build
194 | cmake -DCMAKE_BUILD_TYPE=Release ..
195 | # or more specific
196 | cmake -DCMAKE_BUILD_TYPE=Release -DTEST_CUDA=ON -DCUDA_ARCH="52 60" ..
197 | make
198 | make test
199 | ```
200 | 
201 | **Reasons:**
202 | 
203 | -  Most CIs do not have a CUDA runtime installed. Whenever, `WITH_CUDA=ON` is activated the test code for CUDA will be also build.
204 | -  FindCuda might be more robust than a custom makefile.
205 | 
206 | ## Benchmark Kernels
207 | 
208 | [EXAMPLE](./src/benchmark-multiply.cu.cc)
209 | 
210 | **Description:**
211 | Like in the *CUDA Boilerplate Code* example we pack our kernels into structs. We might want th benchmark different template arguments.
212 | 
213 | ```cpp
214 | cuda::KernelBenchmark<int> bench;
215 | bench.Case<multiply_kernels::Multiply<float, 4>>(init);
216 | bench.Case<multiply_kernels::Multiply<float, 6>>(init);
217 | bench.Case<multiply_kernels::Multiply<float, 8>>(init);
218 | bench.Case<multiply_kernels::Multiply<float, 16>>(init);
219 | bench.Case<multiply_kernels::Multiply<float, 32>>(init);
220 | bench.Start();
221 | ```
222 | 
223 | will give the output:
224 | 
225 | ```
226 | Using Device Number: 0
227 |   Device name: GeForce GTX 970
228 |   Memory Clock Rate (KHz): 3505000
229 |   Memory Bus Width (bits): 256
230 |   Peak Memory Bandwidth (GB/s): 224.320000
231 | 
232 | time 500.000000 - 1000.000000, iters: 5 - 100
233 |  - multiply_kernels::Multiply<float, 4>    took     2.826743 ms stats(iters: 100, var:     0.067757, stddev:     0.260302)
234 |  - multiply_kernels::Multiply<float, 6>    took     1.245100 ms stats(iters: 100, var:     0.019352, stddev:     0.139112)
235 |  - multiply_kernels::Multiply<float, 8>    took     0.574468 ms stats(iters: 100, var:     0.000003, stddev:     0.001616)
236 |  - multiply_kernels::Multiply<float, 16>   took     0.502195 ms stats(iters: 100, var:     0.000002, stddev:     0.001380)
237 |  - multiply_kernels::Multiply<float, 32>   took     0.510635 ms stats(iters: 100, var:     0.000001, stddev:     0.001121)
238 | 
239 | ```
240 | 
241 | ## Tools
242 | - [online CUDA calculator](http://cuda.patwie.com/) instead of the NVIDIA Excel-sheet
243 | - [nvprof2json](https://github.com/PatWie/nvprof2json) to visualize NVIDIA profiling outputs in Google Chrome Browser (no dependencies compared to NVIDIA nvvp)
244 | 


--------------------------------------------------------------------------------
/WORKSPACE:
--------------------------------------------------------------------------------
 1 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 2 | 
 3 | http_archive(
 4 |     name = "gtest",
 5 |     url = "https://github.com/google/googletest/archive/release-1.8.1.zip",
 6 |     sha256 = "927827c183d01734cc5cfef85e0ff3f5a92ffe6188e0d18e909c5efebf28a0c7",
 7 |     build_file = "gtest.BUILD",
 8 |     strip_prefix = "googletest-release-1.8.1",
 9 | )
10 | 


--------------------------------------------------------------------------------
/cmake/CudaHelper.cmake:
--------------------------------------------------------------------------------
 1 | OPTION(TEST_CUDA "Build Tests for CUDA" ON)
 2 | OPTION(SHOW_PTXAS "Build Tests for CUDA" ON)
 3 | set(CUDA_ARCH "" CACHE STRING "Target CUDA Architectures multiple are allowed")
 4 | 
 5 | # We use *.cu.cc as the default as most tool do not understand cu as CUDA.
 6 | file(GLOB_RECURSE source_list "*.cu.cc")
 7 | foreach(child ${source_list})
 8 |   set_source_files_properties(${child} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
 9 | endforeach()
10 | 
11 | 
12 | # CUDA not available
13 | if(CUDA_FOUND)
14 | 
15 |   # We can only build cuda tests if building cuda is enabled.
16 |   message(STATUS "Build with CUDA support")
17 | 
18 |   if(TEST_CUDA)
19 |     message(STATUS "Build tests for CUDA")
20 |   endif(TEST_CUDA)
21 | 
22 | 
23 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWITH_CUDA ")
24 |   include_directories(${CUDA_INCLUDE_DIRS})
25 | 
26 |   set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 --expt-relaxed-constexpr -DWITH_CUDA ")
27 | 
28 |   # Xptxas dumps register usage
29 |   if(SHOW_PTXAS)
30 |     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}  -Xptxas=-v")
31 |   endif(SHOW_PTXAS)
32 | 
33 |   if(CMAKE_BUILD_TYPE STREQUAL "Release")
34 |     message(STATUS "Build CUDA in ${CMAKE_BUILD_TYPE} mode")
35 |     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}  -O3 -Ofast")
36 |   endif(CMAKE_BUILD_TYPE STREQUAL "Release")
37 | 
38 |   if(CUDA_ARCH STREQUAL "")
39 |     # good defaults for CUDA Toolkit 8.x
40 |     if(CUDA_VERSION_MAJOR MATCHES 8)
41 |       set(CUDA_ARCH "35 37 52 60")
42 |     endif(CUDA_VERSION_MAJOR MATCHES 8)
43 | 
44 |     # good defaults for CUDA Toolkit 9.x
45 |     if(CUDA_VERSION_MAJOR MATCHES 9)
46 |       set(CUDA_ARCH "35 52 60 70")
47 |     endif(CUDA_VERSION_MAJOR MATCHES 9)
48 | 
49 |     # good defaults for CUDA Toolkit 10.x
50 |     if(CUDA_VERSION_MAJOR MATCHES 10)
51 |       set(CUDA_ARCH "35 52 60 70")
52 |     endif(CUDA_VERSION_MAJOR MATCHES 10)
53 |   endif(CUDA_ARCH STREQUAL "")
54 | 
55 |   # str replace ' ' with ;
56 |   STRING(REGEX REPLACE " " ";" CUDA_ARCH ${CUDA_ARCH})
57 | 
58 |   # set the compiler flags for each NV target
59 |   foreach(target ${CUDA_ARCH})
60 |     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_${target},code=\\\"sm_${target},compute_${target}\\\")
61 |   endforeach(target ${CUDA_ARCH})
62 | 
63 | else(CUDA_FOUND)
64 | 
65 |   message(STATUS "Build CUDA and tests for CUDA are disabled")
66 |   set(TEST_CUDA OFF)
67 | 
68 | endif(CUDA_FOUND)


--------------------------------------------------------------------------------
/cuda.bzl:
--------------------------------------------------------------------------------
 1 | def _cuda_binary(ctx):
 2 |   default_flags = ctx.attr.flags + \
 3 |       " -x=cu -Xcompiler \"-O3 -Ofast -Wall -Wextra -DWITH_CUDA\""
 4 | 
 5 |   cmd = "/usr/local/cuda/bin/nvcc -D__CUDACC__ "
 6 |   cmd += " " + default_flags + " "
 7 | 
 8 |   for src in ctx.files.srcs:
 9 |     cmd += src.path
10 | 
11 |   executable = ctx.actions.declare_file(ctx.attr.name)
12 |   cmd += " -o " + executable.path
13 | 
14 |   for include in ctx.attr.includes:
15 |     cmd += " -I" + include
16 | 
17 |   ctx.actions.run_shell(
18 |       outputs=[ctx.actions.declare_file(ctx.label.name)],
19 |       inputs=ctx.files.srcs + ctx.files.hdrs,
20 |       command=cmd,
21 |       mnemonic="CudaCompile",
22 |       progress_message="compile cuda",
23 |       use_default_shell_env=True,
24 |   )
25 | 
26 |   return [DefaultInfo(
27 |       files=depset([executable]),
28 |       executable=executable,
29 |   )]
30 | 
31 | 
32 | cuda_binary = rule(
33 |     implementation=_cuda_binary,
34 |     executable=True,
35 |     attrs={
36 |         "flags": attr.string(default=""),
37 |         "srcs": attr.label_list(default=[], allow_files=[".cc"]),
38 |         "hdrs": attr.label_list(default=[], allow_files=[".h"]),
39 |         "includes": attr.string_list(default=[]),
40 |         "out": attr.output(mandatory=False),
41 |     },
42 | )
43 | 


--------------------------------------------------------------------------------
/gtest.BUILD:
--------------------------------------------------------------------------------
 1 | cc_library(
 2 |     visibility=["//visibility:public"],
 3 |     name="gtest",
 4 |     srcs=glob(
 5 |         include=[
 6 |             "googletest/src/*.cc",
 7 |             "googletest/src/*.h",
 8 |             "googletest/include/gtest/**/*.h",
 9 |             "googlemock/src/*.cc",
10 |             "googlemock/include/gmock/**/*.h",
11 |         ],
12 |         exclude=[
13 |             "googletest/src/gtest-all.cc",
14 |             "googletest/src/gtest_main.cc",
15 |             "googlemock/src/gmock-all.cc",
16 |             "googlemock/src/gmock_main.cc",
17 |         ],
18 |     ),
19 |     hdrs=glob([
20 |         "googletest/include/gtest/*.h",
21 |         "googlemock/include/gmock/*.h",
22 |     ]),
23 |     copts=select({
24 |         "//conditions:default": ["-pthread -DGTEST_HAS_TYPED_TEST_P"],
25 |     }),
26 |     defines=select({
27 |         "//conditions:default": [],
28 |     }),
29 |     includes=[
30 |         "googlemock",
31 |         "googlemock/include",
32 |         "googletest",
33 |         "googletest/include",
34 |     ],
35 |     linkopts=select({
36 |         "//conditions:default": ["-pthread"],
37 |     }),
38 |     deps=select({
39 |         "//conditions:default": [],
40 |     }),
41 |     features=select({
42 |         "//conditions:default": [],
43 |     })
44 | )
45 | 


--------------------------------------------------------------------------------
/include/cuda_benchmark.h:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Authors. All Rights Reserved.
  2 | 
  3 |  * Licensed under the Apache License, Version 2.0 (the "License");
  4 |  * you may not use this file except in compliance with the License.
  5 |  * You may obtain a copy of the License at
  6 |  *
  7 |  *     http://www.apache.org/licenses/LICENSE-2.0
  8 |  *
  9 |  * Unless required by applicable law or agreed to in writing, software
 10 |  * distributed under the License is distributed on an "AS IS" BASIS,
 11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 |  * See the License for the specific language governing permissions and
 13 |  * limitations under the License.
 14 |  *
 15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
 16 |  *
 17 |  */
 18 | 
 19 | #ifndef INCLUDE_CUDA_BENCHMARK_H_
 20 | #define INCLUDE_CUDA_BENCHMARK_H_
 21 | 
 22 | #include <stdio.h>
 23 | #include <functional>
 24 | #include <iomanip>
 25 | #include <iostream>
 26 | #include <memory>
 27 | #include <string>
 28 | #include <tuple>
 29 | #include <typeinfo>
 30 | #include <utility>
 31 | #include <vector>
 32 | 
 33 | #ifdef __GNUG__
 34 | #include <cxxabi.h>
 35 | #include <cstdlib>
 36 | #endif
 37 | #include "include/cuda_utils.h"
 38 | 
 39 | namespace cuda {
 40 | 
 41 | namespace internal {
 42 | // taken from https://stackoverflow.com/a/4541470/7443104
 43 | #ifdef __GNUG__
 44 | 
 45 | template <int DummyToBeInHeaderfile>
 46 | std::string demangle(const char* name) {
 47 |   int status = -4;  // some arbitrary value to eliminate the compiler warning
 48 | 
 49 |   // enable c++11 by passing the flag -std=c++11 to g++
 50 |   std::unique_ptr<char, void (*)(void*)> res{
 51 |       abi::__cxa_demangle(name, NULL, NULL, &status), std::free};
 52 | 
 53 |   return (status == 0) ? res.get() : name;
 54 | }
 55 | 
 56 | #else
 57 | 
 58 | // does nothing if not g++
 59 | template <int DummyToBeInHeaderfile>
 60 | std::string demangle(const char* name) {
 61 |   return name;
 62 | }
 63 | 
 64 | #endif
 65 | }  // namespace internal
 66 | 
 67 | /**
 68 |  * Benchmarks several templated kernels.
 69 |  *
 70 |  *   cuda::KernelBenchmark<int> bench;
 71 |  *   bench.Case<multiply_kernels::Multiply<float, 2>>(init);
 72 |  *   bench.Case<multiply_kernels::Multiply<float, 4>>(init);
 73 |  *   bench.Case<multiply_kernels::Multiply<float, 8>>(init);
 74 |  *   bench.Case<multiply_kernels::Multiply<float, 16>>(init);
 75 |  *   bench.Case<multiply_kernels::Multiply<float, 32>>(init);
 76 |  *   bench.Start();
 77 |  */
 78 | template <typename KeyT = int>
 79 | class KernelBenchmark {
 80 |   using TLauncherFunc = std::function<void()>;
 81 |   using ValueT = std::tuple<TLauncherFunc, std::string>;
 82 | 
 83 |   // we test at most 1 second
 84 |   const float max_time_ms = 1000;
 85 |   // we test at least 0.5 second
 86 |   const float min_time_ms = 500;
 87 |   // we test at most 100 times
 88 |   const int min_iterations = 5;
 89 |   const int max_iterations = 100;
 90 |   const int device_id = 0;
 91 | 
 92 |  public:
 93 |   // Register a kernel.
 94 |   //
 95 |   // Example
 96 |   //    cuda::KernelDispatcher<int> dispatcher;
 97 |   //    dispatcher.Case<kernel<float, X>>();
 98 |   template <typename T>
 99 |   void Case() {
100 |     static_assert(internal::HasLaunchMethod<T>::value,
101 |                   "The kernel struct needs to have a 'Launch()' method! "
102 |                   "YOU_MADE_A_PROGAMMING_MISTAKE");
103 |     // NOTE: std::shared_ptr<T>, std::unique_ptr<T> does not work here
104 |     // eg. std::shared_ptr<T> kernel(new T());
105 |     // so we delete these objects by collecting them
106 |     T* kernel = new T();  // needs to be on heap
107 |     deleter_.push_back([&]() { delete kernel; });
108 |     Place<T>([&kernel]() { kernel->Launch(); });
109 |   }
110 | 
111 |   // Register and intialize a kernel.
112 |   //
113 |   // Example
114 |   //    cuda::KernelDispatcher<int> dispatcher;
115 |   //    initializer init;
116 |   //    dispatcher.Case<kernel<float, X>>(init);
117 |   template <typename T, typename Initializer>
118 |   void Case(Initializer initializer) {
119 |     static_assert(internal::HasLaunchMethod<T>::value,
120 |                   "The kernel struct needs to have a 'Launch()' method! "
121 |                   "YOU_MADE_A_PROGAMMING_MISTAKE");
122 |     // NOTE: std::shared_ptr<T>, std::unique_ptr<T> does not work here
123 |     // eg. std::shared_ptr<T> kernel(new T());
124 |     // so we delete these objects by collecting them
125 |     T* kernel = new T();  // needs to be on heap
126 |     deleter_.push_back([&]() { delete kernel; });
127 |     initializer(kernel);
128 |     Place<T>([&kernel]() { kernel->Launch(); });
129 |   }
130 | 
131 |   KernelBenchmark() = default;
132 |   KernelBenchmark(float min_time_ms, float max_time_ms, int min_iterations,
133 |                   int max_iterations)
134 |       : min_time_ms(min_time_ms),
135 |         max_time_ms(max_time_ms),
136 |         min_iterations(min_iterations),
137 |         max_iterations(max_iterations) {}
138 | 
139 |   virtual ~KernelBenchmark() {
140 |     for (auto del : deleter_) {
141 |       del();
142 |     }
143 |   }
144 | 
145 |   void DeviceInfo() {
146 |     ASSERT_CUDA(cudaSetDevice(device_id));
147 |     cudaDeviceProp prop;
148 |     ASSERT_CUDA(cudaGetDeviceProperties(&prop, device_id));
149 |     printf("Using Device Number: %d\n", device_id);
150 |     printf("  Device name: %s\n", prop.name);
151 |     printf("  Memory Clock Rate (KHz): %d\n", prop.memoryClockRate);
152 |     printf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);
153 |     printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
154 |            2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
155 |   }
156 | 
157 |   void Start() {
158 | #if __CUDACC__
159 |     cudaEvent_t start, stop;
160 |     cudaEventCreate(&start);
161 |     cudaEventCreate(&stop);
162 | 
163 |     int longest_name_len = 0;
164 |     for (auto&& kernel : kernels_) {
165 |       int len = std::get<1>(kernel).length();
166 |       if (len > longest_name_len) {
167 |         longest_name_len = len;
168 |       }
169 |     }
170 |     DeviceInfo();
171 |     printf("time %f - %f, iters: %d - %d\n", min_time_ms, max_time_ms,
172 |            min_iterations, max_iterations);
173 | 
174 |     for (auto&& kernel : kernels_) {
175 |       const std::string name = std::get<1>(kernel);
176 |       printf(" - %-*s ", longest_name_len + 1, name.c_str());
177 | 
178 |       // burn in
179 |       for (int i = 0; i < 5; ++i) {
180 |         cudaEventRecord(start);
181 |         std::get<0>(kernel)();
182 |         cudaEventRecord(stop);
183 |       }
184 | 
185 |       // real measurement
186 |       float total_milliseconds = 0;
187 |       int used_iterations = 0;
188 | 
189 |       float old_mean_time = 0;
190 |       float cur_mean_time = 0;
191 |       float old_var_time = 0.0;
192 |       float cur_var_time = 0.0;
193 | 
194 |       for (int counter = 0; counter < max_iterations;
195 |            counter++, used_iterations++) {
196 |         // measure kernel
197 |         float milliseconds = 0;
198 |         cudaEventRecord(start);
199 |         std::get<0>(kernel)();
200 |         cudaEventRecord(stop);
201 |         ASSERT_CUDA(cudaPeekAtLastError());
202 |         ASSERT_CUDA(cudaDeviceSynchronize());
203 |         cudaEventSynchronize(stop);
204 |         cudaEventElapsedTime(&milliseconds, start, stop);
205 |         total_milliseconds += milliseconds;
206 | 
207 |         // Estimate if the result is stable enough to be reported.
208 |         // We want to run at least two runs (variance needs this).
209 |         if (counter > 0) {
210 |           // Update running statistics.
211 |           cur_mean_time =
212 |               old_mean_time + (milliseconds - old_mean_time) / (counter + 1);
213 |           cur_var_time = old_var_time + (milliseconds - old_mean_time) *
214 |                                             (milliseconds - cur_mean_time);
215 | 
216 |           old_var_time = cur_var_time;
217 |           old_mean_time = cur_mean_time;
218 | 
219 |           if (total_milliseconds <= min_time_ms) {
220 |             continue;
221 |           }
222 | 
223 |           // We can stop if it took already too long.
224 |           if (total_milliseconds > max_time_ms) {
225 |             break;
226 |           }
227 | 
228 |           // We want at least some iterations.
229 |           if (counter >= min_iterations) {
230 |             // Is std-dev small enough?
231 |             float real_stdev = sqrt(cur_var_time / (used_iterations - 1));
232 |             if (real_stdev < 0.01 * cur_mean_time) {
233 |               break;
234 |             }
235 |           }
236 | 
237 |         } else {
238 |           old_mean_time = milliseconds;
239 |           cur_mean_time = milliseconds;
240 |         }
241 |       }
242 | 
243 |       printf(" took %12f ms stats(iters: %3d, var: %12f, stddev: %12f)\n",
244 |              total_milliseconds / used_iterations, used_iterations,
245 |              cur_var_time / (used_iterations - 1),
246 |              sqrt(cur_var_time / (used_iterations - 1)));
247 |     }
248 |     cudaEventDestroy(start);
249 |     cudaEventDestroy(stop);
250 | 
251 | #endif  // __CUDACC__
252 |   }
253 | 
254 |  private:
255 |   template <typename T>
256 |   void Place(TLauncherFunc&& launch_func) {
257 |     kernels_.push_back(
258 |         std::move(std::make_tuple(std::forward<TLauncherFunc>(launch_func),
259 |                                   internal::demangle<0>(typeid(T).name()))));
260 |   }
261 | 
262 |   std::vector<TLauncherFunc> deleter_;
263 |   std::vector<ValueT> kernels_;
264 |   bool extend = true;  // if true kernel with largest bound will act as default
265 | };
266 | }  // namespace cuda
267 | 
268 | #endif  // INCLUDE_CUDA_BENCHMARK_H_
269 | 


--------------------------------------------------------------------------------
/include/cuda_index.h:
--------------------------------------------------------------------------------
  1 | /* Copyright 2018 Authors. All Rights Reserved.
  2 | 
  3 |  * Licensed under the Apache License, Version 2.0 (the "License");
  4 |  * you may not use this file except in compliance with the License.
  5 |  * You may obtain a copy of the License at
  6 |  *
  7 |  *     http://www.apache.org/licenses/LICENSE-2.0
  8 |  *
  9 |  * Unless required by applicable law or agreed to in writing, software
 10 |  * distributed under the License is distributed on an "AS IS" BASIS,
 11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 |  * See the License for the specific language governing permissions and
 13 |  * limitations under the License.
 14 |  *
 15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2018
 16 |  *
 17 |  */
 18 | 
 19 | #ifndef INCLUDE_CUDA_INDEX_H_
 20 | #define INCLUDE_CUDA_INDEX_H_
 21 | 
 22 | #if __CUDACC__
 23 | 
 24 | #define cuda_inline __device__ __host__ __forceinline__
 25 | 
 26 | #include <assert.h>
 27 | #include <cuda_runtime.h>
 28 | 
 29 | namespace cuda {
 30 | 
 31 | namespace internal {
 32 | 
 33 | template <size_t TRank, size_t TSkip, size_t TPos, size_t TRemaining>
 34 | struct pitch_helper {
 35 |   constexpr size_t call(const size_t dimensions_[TRank]) const {
 36 |     return pitch_helper<TRank, TSkip - 1, TPos + 1, TRank - TPos - 1>().call(
 37 |         dimensions_);
 38 |   }
 39 | };
 40 | 
 41 | template <size_t TRank, size_t TPos, size_t TRemaining>
 42 | struct pitch_helper<TRank, 0, TPos, TRemaining> {
 43 |   constexpr size_t call(const size_t dimensions_[TRank]) const {
 44 |     return dimensions_[TPos] *
 45 |            pitch_helper<TRank, 0, TPos + 1, TRemaining - 1>().call(dimensions_);
 46 |   }
 47 | };
 48 | 
 49 | template <size_t TRank, size_t TPos>
 50 | struct pitch_helper<TRank, 0, TPos, 0> {
 51 |   constexpr size_t call(const size_t dimensions_[TRank]) const { return 1; }
 52 | };
 53 | 
 54 | template <size_t TRank, size_t TRemaining, class T, class... Ts>
 55 | struct position_helper {
 56 |   constexpr size_t call(const size_t dimensions_[TRank], T v, Ts... is) const {
 57 |     return v * pitch_helper<TRank, TRank - TRemaining + 1, 0, TRank>().call(
 58 |                    dimensions_) +
 59 |            position_helper<TRank, TRemaining - 1, Ts...>().call(dimensions_,
 60 |                                                                 is...);
 61 |   }
 62 | };
 63 | 
 64 | template <size_t TRank, size_t TRemaining, class T>
 65 | struct position_helper<TRank, TRemaining, T> {
 66 |   constexpr size_t call(const size_t dimensions_[TRank], T v) const {
 67 |     return v;
 68 |   }
 69 | };
 70 | 
 71 | template <size_t TRank, size_t TPos, size_t TRemaining>
 72 | struct unflatten_helper {
 73 |   template <class... Ts>
 74 |   static constexpr void call(const size_t dimensions_[TRank],
 75 |                              size_t flattenedIndex, size_t& index,
 76 |                              Ts&... indices) noexcept {
 77 |     const size_t pitch =
 78 |         pitch_helper<TRank, 1, TPos, TRank - 1>().call(dimensions_);
 79 |     index = flattenedIndex / pitch;
 80 |     unflatten_helper<TRank, TPos + 1, TRemaining - 1>::call(
 81 |         dimensions_, flattenedIndex % pitch, indices...);
 82 |   }
 83 | };
 84 | 
 85 | template <size_t TRank, size_t TPos>
 86 | struct unflatten_helper<TRank, TPos, 1> {
 87 |   template <class... Ts>
 88 |   static constexpr void call(const size_t dimensions_[TRank],
 89 |                              size_t flattenedIndex, size_t& index,
 90 |                              Ts&... indices) noexcept {
 91 |     index = flattenedIndex;
 92 |   }
 93 | };
 94 | 
 95 | };  // namespace internal
 96 | 
 97 | template <size_t TRank>
 98 | struct BaseNdIndex {
 99 |  protected:
100 |   size_t dimensions_[TRank];
101 | 
102 |  public:
103 |   template <class... Ts>
104 |   explicit constexpr cuda_inline BaseNdIndex(size_t i0, Ts... is) noexcept
105 |       : dimensions_{i0, is...} {}
106 | 
107 |   /**
108 |    * Check whether given coordinate is in range.
109 |    */
110 |   template <class... Ts>
111 |   constexpr cuda_inline bool valid(size_t i0, Ts... is) const noexcept {
112 |     static_assert(size_t(1) + sizeof...(Ts) == TRank,
113 |                   "Number of dimensions does not match rank! "
114 |                   "YOU_MADE_A_PROGAMMING_MISTAKE");
115 |     return valid_impl<0, Ts...>(i0, is...);
116 |   }
117 | 
118 |   /**
119 |    * Return the number of axes.
120 |    * @return number of axes
121 |    */
122 |   constexpr cuda_inline size_t rank() const noexcept { return TRank; }
123 | 
124 |   /**
125 |    * Return the dimension for a given axis.
126 |    *
127 |    *     const size_t D = my_nd_array.template dim<1>();
128 |    *
129 |    * @return dimension for given axis
130 |    */
131 |   template <size_t TAxis>
132 |   constexpr cuda_inline size_t dim() const noexcept {
133 |     static_assert(TAxis < TRank, "axis < rank failed");
134 |     return dimensions_[TAxis];
135 |   }
136 | 
137 |   /**
138 |    * Unflatten a flattened index and retrieve the corresponding
139 |    * indices for each dimension.
140 |    *
141 |    *     size_t i=0, j=0, k=0;
142 |    *     idx.unflatten(flattenedIndex, i, j, k);
143 |    *
144 |    * @param flattenedIndex the flattened index to unflatten
145 |    * @param indices references to variables to store the indices
146 |    */
147 |   template <class... Ts>
148 |   constexpr cuda_inline void unflatten(size_t flattenedIndex,
149 |                                        Ts&... indices) const noexcept {
150 |     static_assert(sizeof...(Ts) == TRank,
151 |                   "Number of indices does not match rank! "
152 |                   "YOU_MADE_A_PROGAMMING_MISTAKE");
153 |     internal::unflatten_helper<TRank, 0, TRank>::call(
154 |         dimensions_, flattenedIndex, indices...);
155 |   }
156 | 
157 |  private:
158 |   template <size_t TNum, class... Ts>
159 |   constexpr cuda_inline bool valid_impl(size_t i0, Ts... is) const {
160 |     return (i0 < dimensions_[TNum]) && valid_impl<TNum + 1, Ts...>(is...);
161 |   }
162 | 
163 |   template <size_t TNum, typename T>
164 |   constexpr cuda_inline bool valid_impl(T i0) const {
165 |     return (i0 < dimensions_[TRank - 1]);
166 |   }
167 | 
168 |  protected:
169 |   template <class... Ts>
170 |   constexpr cuda_inline size_t index_(size_t i0, Ts... is) const {
171 |     return internal::position_helper<TRank, TRank, size_t, Ts...>().call(
172 |         dimensions_, i0, is...);
173 |   }
174 | };
175 | 
176 | /**
177 |  * Create an index object.
178 |  *
179 |  * The index object can handle various dimensions.
180 |  *
181 |  *     auto idx = NdIndex<4>(B, H, W, C);
182 |  *     auto TPos = idx(b, h, w, c);
183 |  *
184 |  * @param rank in each dimensions.
185 |  */
186 | template <size_t TRank>
187 | struct NdIndex : public BaseNdIndex<TRank> {
188 |  public:
189 |   template <class... Ts>
190 |   explicit constexpr cuda_inline NdIndex(size_t i0, Ts... is) noexcept
191 |       : BaseNdIndex<TRank>(i0, is...) {
192 |     static_assert(size_t(1) + sizeof...(Ts) == TRank,
193 |                   "Number of dimensions does not match rank! "
194 |                   "YOU_MADE_A_PROGAMMING_MISTAKE");
195 |   }
196 | 
197 |   /**
198 |    * Get flattened index for a given position.
199 |    *
200 |    *     auto idx = NdIndex<4>(10, 20, 30, 40);
201 |    *     size_t actual = idx(1, 2, 3, 4);
202 |    *     size_t expected = 1 * (20 * 30 * 40) + 2 * (30 * 40) + 3 * (40) + 4;
203 |    */
204 |   template <class... Ts>
205 |   size_t cuda_inline operator()(size_t i0, Ts... is) const {
206 |     static_assert(size_t(1) + sizeof...(Ts) == TRank,
207 |                   "Number of dimensions does not match rank! "
208 |                   "YOU_MADE_A_PROGAMMING_MISTAKE");
209 |     return index_(i0, is...);
210 |   }
211 | 
212 |   /**
213 |    * Get dimension for a given axis.
214 |    *
215 |    *     auto idx = NdIndex<4>(10, 20, 30, 40);
216 |    *     size_t actual = idx[1]; // is 20
217 |    */
218 |   template <class... Ts>
219 |   size_t cuda_inline operator[](size_t i0) const {
220 |     return BaseNdIndex<TRank>::dimensions_[i0];
221 |   }
222 | };
223 | 
224 | template <class T, size_t TRank>
225 | struct NdArray : public BaseNdIndex<TRank> {
226 |   T* data_;
227 | 
228 |  public:
229 |   template <class... Ts>
230 |   explicit constexpr cuda_inline NdArray(T* data, size_t i0, Ts... is) noexcept
231 |       : BaseNdIndex<TRank>(i0, is...), data_(data) {}
232 | 
233 |   /**
234 |    * Returns value from given position if valid, else 0;
235 |    *
236 |    *    auto T = make_ndarray(data, A, B, C);
237 |    *    auto val = T.safe_value(a, b, c);
238 |    *
239 |    * is equal
240 |    *
241 |    *    auto T = make_ndarray(data, A, B, C);
242 |    *    auto val = T.valid(a, b, c) ? T(a, b, c) : 0;
243 |    */
244 |   template <class... Ts>
245 |   T cuda_inline safe_value(size_t i0, Ts... is) const {
246 |     return valid(i0, is...) ? data_[index(i0, is...)] : 0;
247 |   }
248 | 
249 |   /**
250 |    * Returns value from given position if valid, else 0;
251 |    *
252 |    *    auto T = make_ndarray(data, A, B, C);
253 |    *    auto val = T(a, b, c);
254 |    */
255 |   template <class... Ts>
256 |   T cuda_inline operator()(size_t i0, Ts... is) const {
257 |     return data_[index(i0, is...)];
258 |   }
259 | 
260 |   /**
261 |    * Write value at given position.
262 |    *
263 |    *    auto T = make_ndarray(data, A, B, C);
264 |    *    T(a, b, c) = 42;
265 |    */
266 |   template <class... Ts>
267 |   T& __device__ __host__ operator()(size_t i0, Ts... is) {
268 |     return data_[index(i0, is...)];
269 |   }
270 | 
271 |   /**
272 |    * Wrap c-array read access
273 |    */
274 |   template <class... Ts>
275 |   T cuda_inline operator[](size_t i0) const {
276 |     return data_[i0];
277 |   }
278 | 
279 |   /**
280 |    * Wrap c-array write access
281 |    */
282 |   template <class... Ts>
283 |   T& __device__ __host__ operator[](size_t i0) {
284 |     return data_[i0];
285 |   }
286 | 
287 |   /**
288 |    * Returns index from given position.
289 |    *    auto T = make_ndarray(data, A, B, C);
290 |    *    size_t TPos = T.index(a, b, c);
291 |    */
292 |   template <class TT, class... Ts>
293 |   constexpr cuda_inline TT index(TT i0, Ts... is) const {
294 |     static_assert(size_t(1) + sizeof...(Ts) == TRank,
295 |                   "Number of dimensions does not match rank! "
296 |                   "YOU_MADE_A_PROGAMMING_MISTAKE");
297 |     return index_(i0, is...);
298 |   }
299 | 
300 |   T* flat() { return data_; }
301 | };
302 | 
303 | /**
304 |  * Create a multi-dim. array object but ensures rank.
305 |  *
306 |  * The multi-dim. array object is a combination of a flat array and nd-index.
307 |  *
308 |  *     const float* M = ...;
309 |  *     auto Mt = make_ndarray<float, 4>(M, B, H, W, C);
310 |  *     float val = Mt(b, h, w, c);
311 |  *
312 |  * @param rank in each dimensions.
313 |  */
314 | template <typename T, size_t TRank, class... Ts>
315 | cuda_inline auto make_ndarray(T* arr, size_t N0, Ts... Ns)
316 |     -> NdArray<T, TRank> {
317 |   static_assert(size_t(1) + sizeof...(Ts) == TRank,
318 |                 "Number of dimensions does not match rank! "
319 |                 "YOU_MADE_A_PROGAMMING_MISTAKE");
320 |   return NdArray<T, TRank>(arr, N0, Ns...);
321 | }
322 | 
323 | };  // namespace cuda
324 | 
325 | #undef cuda_inline
326 | 
327 | #endif  // __CUDACC__
328 | 
329 | #endif  // INCLUDE_CUDA_INDEX_H_
330 | 


--------------------------------------------------------------------------------
/include/cuda_utils.h:
--------------------------------------------------------------------------------
  1 | /* Copyright 2018 Authors. All Rights Reserved.
  2 | 
  3 |  * Licensed under the Apache License, Version 2.0 (the "License");
  4 |  * you may not use this file except in compliance with the License.
  5 |  * You may obtain a copy of the License at
  6 |  *
  7 |  *     http://www.apache.org/licenses/LICENSE-2.0
  8 |  *
  9 |  * Unless required by applicable law or agreed to in writing, software
 10 |  * distributed under the License is distributed on an "AS IS" BASIS,
 11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 |  * See the License for the specific language governing permissions and
 13 |  * limitations under the License.
 14 |  *
 15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
 16 |  *         Fabian Groh, <fabian.groh@uni-tuebingen.de>, 2019
 17 |  *
 18 |  */
 19 | 
 20 | #ifndef INCLUDE_CUDA_UTILS_H_
 21 | #define INCLUDE_CUDA_UTILS_H_
 22 | 
 23 | #include <stdio.h>
 24 | #include <functional>
 25 | #include <iostream>
 26 | #include <map>
 27 | #include <memory>
 28 | #include <string>
 29 | #include <tuple>
 30 | #include <typeinfo>
 31 | #include <utility>
 32 | 
 33 | // Template parameter for compile-time cuda drop-in replacements of cpu
 34 | // functions.
 35 | struct CpuDevice {
 36 |   static const int device_id = 1;
 37 | };
 38 | 
 39 | struct GpuDevice {
 40 |   static const int device_id = 2;
 41 | };
 42 | 
 43 | struct AnyDevice {
 44 |   static const int device_id = 0;
 45 | };
 46 | 
 47 | #if __CUDACC__
 48 | // __CUDACC__ is defined by nvcc on device and host
 49 | // __CUDA_ARCH__ is defined by nvcc on device
 50 | 
 51 | /**
 52 |  * This is the default way of testing whether executing the CUDA kernel has been
 53 |  * successfull.
 54 |  *
 55 |  * Example:
 56 |  *    Mykernel kernel;
 57 |  *    kernel.Launch();
 58 |  *    ASSERT_CUDA(cudaDeviceSynchronize());
 59 |  *
 60 |  * @param  ans is a function that returns a cudaError_t
 61 |  * taken from: https://stackoverflow.com/a/14038590
 62 |  */
 63 | // #if NDEBUG
 64 | // // disable assert in production code
 65 | // #define ASSERT_CUDA(ans) ((void)ans)
 66 | // #else  // NDEBUG
 67 | #define ASSERT_CUDA(ans) \
 68 |   { gpuAssert((ans), __FILE__, __LINE__); }
 69 | inline void gpuAssert(cudaError_t code, const char* file, int line,
 70 |                       bool abort = true) {
 71 |   if (code != cudaSuccess) {
 72 |     fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
 73 |             line);
 74 |     if (abort) exit(code);
 75 |   }
 76 | }
 77 | // #endif  // NDEBUG
 78 | 
 79 | namespace cuda {
 80 | 
 81 | /**
 82 |  * Compute the number of blocks for a given number of threads and a workload.
 83 |  * @param  N           number of workload instance
 84 |  * @param  num_threads number of threads per block
 85 |  * @return             number of required blocks
 86 |  */
 87 | __host__ __device__ __forceinline__ int divUp(int N, int num_threads) {
 88 |   return (N + num_threads - 1) / num_threads;
 89 | }
 90 | 
 91 | // Kernel is an abstract CUDA kernel, which can have attached values to avoid
 92 | // lengthly function signatures.
 93 | struct Kernel {
 94 |   /**
 95 |    * Launch contains the computation of all kernel parameters and executes
 96 |    * the CUDA call.
 97 |    *
 98 |    * This should include the computation of the kernel configuration like
 99 |    * gridDim, blockDim, shared_memory size. We enforce to use a cuda stream.
100 |    *
101 |    * Example:
102 |    *   void Launch(cudaStream_t stream = 0){
103 |    *     cuda::Run<<<1, 1, 0, stream>>>(*this);
104 |    *   }
105 |    *
106 |    * @param stream used cuda stream
107 |    */
108 |   virtual void Launch(cudaStream_t stream = 0) = 0;
109 |   virtual ~Kernel() = default;
110 | 
111 |   /**
112 |    * This operation contains the code, which will be executed on-chip.
113 |    */
114 |   virtual __device__ __forceinline__ void operator()() const = 0;
115 | };
116 | 
117 | // Run a cuda kernel encapsulated in a struct.
118 | // The kernel should have the following format
119 | //
120 | // struct Kernel {
121 | //    void Launch(cudaStream_t stream = 0);
122 | //    __device__ __forceinline__ void operator()();
123 | // };
124 | //
125 | template <typename T>
126 | __global__ void Run(const T kernel) {
127 |   kernel();
128 | }
129 | 
130 | /**
131 |  * Proxy for shared memory when used in templates to avoid double extern.
132 |  *
133 |  *     run_kernel<<<grid, block, shm_size>>>(...)
134 |  *
135 |  *     T* s_shm = MixedSharedMemory<T>();
136 |  *     T* s_el1 = (T*)&s_shm[0];
137 |  *     T* s_el2 = (T*)&s_shm[10]; // or use cuda::SharedMemory
138 |  *
139 |  * @param rank in each dimensions.
140 |  */
141 | template <typename T>
142 | __device__ T* MixedSharedMemory() {
143 |   extern __shared__ __align__(sizeof(T)) unsigned char s_shm[];
144 |   return reinterpret_cast<T*>(s_shm);
145 | }
146 | 
147 | /**
148 |  * Extracting multiple values from shared memory of different types.
149 |  *
150 |  * Example:
151 |  *    SharedMemory shm;
152 |  *    shm.add<float>(5);
153 |  *    shm.add<int>(3);
154 |  *    shm.add<float>(2);
155 |  *
156 |  *    kernel<<<...,...,shm.bytes>>>();
157 |  *
158 |  * and inside the CUDA kernel
159 |  *
160 |  *    SharedMemory shm;
161 |  *    float* shm_1 = shm.ref<float>(5);
162 |  *    int* shm_2 = shm.ref<int>(3);
163 |  *    float* shm_3 = shm.ref<float>(2);
164 |  */
165 | struct SharedMemory {
166 |   int bytes = 0;
167 |   unsigned char* shm_anchor;
168 | 
169 |   __host__ __device__ SharedMemory() {
170 | #if __CUDA_ARCH__
171 |     // inside device code we can declare shared memory and refer to it
172 |     extern __shared__ unsigned char shm[];
173 |     shm_anchor = shm;
174 | #endif  // __CUDA_ARCH__
175 |   }
176 | 
177 |   template <typename T>
178 |   __device__ T* ref(int num) {
179 |     T* ptr = reinterpret_cast<T*>(&shm_anchor[bytes]);
180 |     bytes += num * sizeof(T);
181 |     return ptr;
182 |   }
183 | 
184 |   template <typename T>
185 |   __host__ void add(int num) {
186 |     bytes += num * sizeof(T);
187 |   }
188 | };
189 | 
190 | };  // namespace cuda
191 | 
192 | #endif  // __CUDACC__
193 | 
194 | namespace cuda {
195 | 
196 | namespace internal {
197 | template <typename T>
198 | class HasLaunchMethod {
199 |  private:
200 |   typedef char yes[1];
201 |   typedef char no[2];
202 | 
203 |   template <typename C>
204 |   static yes& verify(decltype(&C::Launch));
205 |   template <typename C>
206 |   static no& verify(...);
207 | 
208 |  public:
209 |   enum { value = sizeof(verify<T>(0)) == sizeof(yes) };
210 | };
211 | 
212 | };  // namespace internal
213 | 
214 | /**
215 |  * Dispatch template kernels according to a hyper parameter.
216 |  *
217 |  *   ExpertKernel<float, 4> kernelA;
218 |  *   ExpertKernel<float, 8> kernelB;
219 |  *   cuda::KernelDispatcher disp(false);
220 |  *
221 |  *   disp.Register(3, kernelA); // for length up to 3 (inclusive) start kernelA
222 |  *   disp.Register(6, kernelB); // for length up to 6 (inclusive) start kernelB
223 |  *
224 |  *   int i = 6;       // runtime variable
225 |  *   disp.Run(i - 1); // launches kernelA
226 |  *   disp.Run(i);     // launches kernelB
227 |  *   disp.Run(i + 1); // triggers runtime exeception because of
228 |  *                    // `disp(false)`
229 |  */
230 | template <typename KeyT = int, typename TComparator = std::less<KeyT>>
231 | class KernelDispatcher {
232 |   using TLauncherFunc = std::function<void()>;
233 |   using TLauncherFuncMap = std::map<KeyT, TLauncherFunc, TComparator>;
234 | 
235 |  public:
236 |   explicit KernelDispatcher(bool extend = true) : extend(extend) {}
237 | 
238 |   // Register a instantiated kernel.
239 |   //
240 |   // Example
241 |   //    cuda::KernelDispatcher<int> dispatcher;
242 |   //    kernel<float, X> instance;
243 |   //    dispatcher.Register(y, &instance);
244 |   template <typename T>
245 |   void Register(KeyT bound, T* kernel) {
246 |     static_assert(internal::HasLaunchMethod<T>::value,
247 |                   "The kernel struct needs to have a 'Launch()' method! "
248 |                   "YOU_MADE_A_PROGAMMING_MISTAKE");
249 |     Place<T>(bound, [&]() { kernel->Launch(); });
250 |   }
251 | 
252 |   // Register and intialize a instantiated kernel.
253 |   //
254 |   // Example
255 |   //    cuda::KernelDispatcher<int> dispatcher;
256 |   //    kernel<float, X> instance;
257 |   //    initializer init;
258 |   //    dispatcher.Register(y, &instance, init);
259 |   template <typename T, typename Initializer>
260 |   void Register(KeyT bound, T* kernel, Initializer initializer) {
261 |     static_assert(internal::HasLaunchMethod<T>::value,
262 |                   "The kernel struct needs to have a 'Launch()' method! "
263 |                   "YOU_MADE_A_PROGAMMING_MISTAKE");
264 |     initializer(kernel);
265 |     Place<T>(bound, [&]() { kernel->Launch(); });
266 |   }
267 | 
268 |   // Register a kernel.
269 |   //
270 |   // Example
271 |   //    cuda::KernelDispatcher<int> dispatcher;
272 |   //    dispatcher.Register<kernel<float, X>>(y);
273 |   template <typename T>
274 |   void Register(KeyT bound) {
275 |     static_assert(internal::HasLaunchMethod<T>::value,
276 |                   "The kernel struct needs to have a 'Launch()' method! "
277 |                   "YOU_MADE_A_PROGAMMING_MISTAKE");
278 |     T kernel;
279 |     Place<T>(bound, [&]() { kernel->Launch(); });
280 |   }
281 | 
282 |   // Register and intialize a kernel.
283 |   //
284 |   // Example
285 |   //    cuda::KernelDispatcher<int> dispatcher;
286 |   //    initializer init;
287 |   //    dispatcher.Register<kernel<float, X>>(y, init);
288 |   template <typename T, typename Initializer>
289 |   void Register(KeyT bound, Initializer initializer) {
290 |     static_assert(internal::HasLaunchMethod<T>::value,
291 |                   "The kernel struct needs to have a 'Launch()' method! "
292 |                   "YOU_MADE_A_PROGAMMING_MISTAKE");
293 |     T kernel;
294 |     initializer(&kernel);
295 |     Place<T>(bound, [&]() { kernel.Launch(); });
296 |   }
297 | 
298 |   // // would require C++14 to use
299 |   // //
300 |   // // auto init = [&](auto& T){T.val = 42;};
301 |   // // disp.Register(3, kernelA, init);
302 |   // template <typename T>
303 |   // void Register(KeyT bound, T& kernel, std::function<void(T&)> init) {
304 |   //   init(kernel);
305 |   //   Register(bound, [&]() {
306 |   //     kernel.Launch();
307 |   //   });
308 |   // }
309 | 
310 |   void Run(KeyT hyper) {
311 |     typename TLauncherFuncMap::iterator detected_kernel =
312 |         kernels_.lower_bound(hyper);
313 |     if (detected_kernel == kernels_.end()) {
314 |       if (extend) {
315 |         // Assume kernel with largest bound is the generic version.
316 |         kernels_.rbegin()->second();
317 |       } else {
318 |         // const KeyT upper_bound = kernels_.rbegin()->first;
319 |         throw std::runtime_error(
320 |             "KernelDispatcher has no kernels registered for the parameter "
321 |             "requested by the runtime. Use 'KernelDispatcher(true)' to extend"
322 |             " the range of the last registered kernel.");
323 |       }
324 |     } else {
325 |       // Found registered kernel and will launch it.
326 |       detected_kernel->second();
327 |     }
328 |   }
329 | 
330 |  private:
331 |   template <typename T>
332 |   void Place(KeyT bound, TLauncherFunc&& launch_func) {
333 |     kernels_[bound] = std::forward<TLauncherFunc>(launch_func);
334 |   }
335 | 
336 |   TLauncherFuncMap kernels_;
337 |   bool extend = true;  // if true kernel with largest bound will act as default
338 | };
339 | };  // namespace cuda
340 | 
341 | #endif  // INCLUDE_CUDA_UTILS_H_
342 | 


--------------------------------------------------------------------------------
/include/multiply/multiply.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Authors. All Rights Reserved.
 2 | 
 3 |  * Licensed under the Apache License, Version 2.0 (the "License");
 4 |  * you may not use this file except in compliance with the License.
 5 |  * You may obtain a copy of the License at
 6 |  *
 7 |  *     http://www.apache.org/licenses/LICENSE-2.0
 8 |  *
 9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
16 |  *
17 |  */
18 | 
19 | #ifndef INCLUDE_MULTIPLY_MULTIPLY_H_
20 | #define INCLUDE_MULTIPLY_MULTIPLY_H_
21 | 
22 | #include "include/cuda_index.h"
23 | #include "include/cuda_utils.h"
24 | 
25 | #if __CUDACC__
26 | namespace multiply_kernels {
27 | 
28 | using cuda::make_ndarray;
29 | 
30 | // We follow the NVIDIA cub library style for template parameters.
31 | // BLOCK_DIM_X is the number of threads in a block along dimension x.
32 | template <typename ValueT, int BLOCK_DIM_X>
33 | struct Multiply : public cuda::Kernel {
34 |   // use enum for compiletime config
35 |   // enum { PER_THREAD = 1 };
36 | 
37 |   void Launch(cudaStream_t stream = 0) override {
38 |     dim3 block(BLOCK_DIM_X, BLOCK_DIM_X);
39 |     dim3 grid(cuda::divUp(W, BLOCK_DIM_X), cuda::divUp(H, BLOCK_DIM_X));
40 | 
41 |     cuda::Run<<<grid, block, 0, stream>>>(*this);
42 |   }
43 | 
44 |   __device__ __forceinline__ void operator()() const override {
45 |     __shared__ ValueT ds_M[BLOCK_DIM_X][BLOCK_DIM_X];
46 |     __shared__ ValueT ds_N[BLOCK_DIM_X][BLOCK_DIM_X];
47 | 
48 |     const int tx = threadIdx.x;
49 |     const int ty = threadIdx.y;
50 |     const int Ch = blockIdx.y * BLOCK_DIM_X + ty;
51 |     const int Cw = blockIdx.x * BLOCK_DIM_X + tx;
52 | 
53 |     ValueT Cval = 0;
54 | 
55 |     const auto At = make_ndarray<const ValueT, 2>(A, H, W);
56 |     const auto Bt = make_ndarray<const ValueT, 2>(B, H, W);
57 |     auto Ct = make_ndarray<ValueT, 2>(C, H, W);
58 | 
59 |     for (int m = 0; m < (W - 1) / BLOCK_DIM_X + 1; ++m) {
60 |       if (At.valid(Ch, m * BLOCK_DIM_X + tx)) {
61 |         ds_M[ty][tx] = At(Ch, m * BLOCK_DIM_X + tx);
62 |       } else {
63 |         ds_N[ty][tx] = 0;
64 |       }
65 |       if (Bt.valid(m * BLOCK_DIM_X + ty, Cw)) {
66 |         ds_N[ty][tx] = Bt(m * BLOCK_DIM_X + ty, Cw);
67 |       } else {
68 |         ds_N[ty][tx] = 0;
69 |       }
70 |       __syncthreads();
71 | 
72 |       for (int k = 0; k < BLOCK_DIM_X; ++k) Cval += ds_M[ty][k] * ds_N[k][tx];
73 |       __syncthreads();
74 |     }
75 |     if (Ct.valid(Ch, Cw)) Ct(Ch, Cw) = Cval;
76 |   }
77 | 
78 |   int W;
79 |   int H;
80 |   const ValueT* A;
81 |   const ValueT* B;
82 |   ValueT* C;
83 | };
84 | 
85 | }  // namespace multiply_kernels
86 | #endif  // __CUDACC__
87 | 
88 | template <typename ValueT, typename Device = AnyDevice>
89 | struct Multiply {
90 |   static void Apply(const ValueT* A, const ValueT* B, const int H, const int W,
91 |                     ValueT* C);
92 | };
93 | 
94 | #endif  // INCLUDE_MULTIPLY_MULTIPLY_H_
95 | 


--------------------------------------------------------------------------------
/include/test/multiply.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Authors. All Rights Reserved.
 2 | 
 3 |  * Licensed under the Apache License, Version 2.0 (the "License");
 4 |  * you may not use this file except in compliance with the License.
 5 |  * You may obtain a copy of the License at
 6 |  *
 7 |  *     http://www.apache.org/licenses/LICENSE-2.0
 8 |  *
 9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
16 |  *
17 |  */
18 | 
19 | #ifndef INCLUDE_TEST_MULTIPLY_H_
20 | #define INCLUDE_TEST_MULTIPLY_H_
21 | 
22 | #include "gmock/gmock.h"
23 | #include "gtest/gtest.h"
24 | 
25 | #if GTEST_HAS_TYPED_TEST_P
26 | 
27 | #include "include/cuda_utils.h"
28 | #include "include/multiply/multiply.h"
29 | 
30 | template <class T>
31 | class MultiplyTest : public testing::Test {};
32 | 
33 | TYPED_TEST_SUITE_P(MultiplyTest);
34 | 
35 | TYPED_TEST_P(MultiplyTest, TestIdentity) {
36 |   float *A = new float[2 * 2];
37 |   float *B = new float[2 * 2];
38 |   float *expected = new float[2 * 2];
39 | 
40 |   for (int i = 0; i < 2 * 2; ++i) {
41 |     A[i] = i;
42 |     B[i] = 0;
43 |     expected[i] = i;
44 |   }
45 |   B[0] = 1;
46 |   B[3] = 1;
47 | 
48 |   float *actual = new float[2 * 2];
49 | 
50 |   Multiply<float, TypeParam>::Apply(A, B, 2, 2, actual);
51 | 
52 |   for (int i = 0; i < 2 * 2; ++i) {
53 |     EXPECT_EQ(expected[i], actual[i]);
54 |   }
55 | }
56 | 
57 | TYPED_TEST_P(MultiplyTest, TestSquare) {
58 |   float *A = new float[3 * 3];
59 |   float *B = new float[3 * 3];
60 |   float *expected = new float[3 * 3];
61 | 
62 |   for (int i = 0; i < 3 * 3; ++i) {
63 |     A[i] = i;
64 |     B[i] = i;
65 |   }
66 |   expected[0] = 15;
67 |   expected[1] = 18;
68 |   expected[2] = 21;
69 |   expected[3] = 42;
70 |   expected[4] = 54;
71 |   expected[5] = 66;
72 |   expected[6] = 69;
73 |   expected[7] = 90;
74 |   expected[8] = 111;
75 | 
76 |   float *actual = new float[3 * 3];
77 | 
78 |   Multiply<float, TypeParam>::Apply(A, B, 3, 3, actual);
79 | 
80 |   for (int i = 0; i < 3 * 3; ++i) {
81 |     EXPECT_EQ(expected[i], actual[i]);
82 |   }
83 | }
84 | 
85 | REGISTER_TYPED_TEST_SUITE_P(MultiplyTest,  //
86 |                             TestIdentity, TestSquare);
87 | 
88 | #endif  // GTEST_HAS_TYPED_TEST_P
89 | 
90 | #endif  // INCLUDE_TEST_MULTIPLY_H_
91 | 


--------------------------------------------------------------------------------
/src/benchmark-multiply.cu.cc:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Authors. All Rights Reserved.
  2 | 
  3 |  * Licensed under the Apache License, Version 2.0 (the "License");
  4 |  * you may not use this file except in compliance with the License.
  5 |  * You may obtain a copy of the License at
  6 |  *
  7 |  *     http://www.apache.org/licenses/LICENSE-2.0
  8 |  *
  9 |  * Unless required by applicable law or agreed to in writing, software
 10 |  * distributed under the License is distributed on an "AS IS" BASIS,
 11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 |  * See the License for the specific language governing permissions and
 13 |  * limitations under the License.
 14 |  *
 15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
 16 |  *
 17 |  */
 18 | 
 19 | #include "include/cuda_benchmark.h"
 20 | #include "include/cuda_utils.h"
 21 | #include "include/multiply/multiply.h"
 22 | 
 23 | struct Initializer {
 24 |   explicit Initializer(float *d_A, float *d_B, float *d_C, int height,
 25 |                        int width)
 26 |       : d_A(d_A), d_B(d_B), d_C(d_C), height(height), width(width) {}
 27 | 
 28 |   template <typename TKernel>
 29 |   void operator()(TKernel *kernel) {
 30 |     kernel->H = height;
 31 |     kernel->W = width;
 32 |     kernel->A = d_A;
 33 |     kernel->B = d_B;
 34 |     kernel->C = d_C;
 35 |   }
 36 | 
 37 |   float *d_A;
 38 |   float *d_B;
 39 |   float *d_C;
 40 |   int height;
 41 |   int width;
 42 | };
 43 | 
 44 | // Simple way to benchmark the template parameters.
 45 | void run_for(int height, int width) {
 46 |   std::cout << std::endl
 47 |             << "Benchmark for " << height << " " << width
 48 |             << " -------------------------------- " << std::endl;
 49 | 
 50 |   float *A = new float[height * width];
 51 |   float *B = new float[height * width];
 52 |   float *C = new float[height * width];
 53 |   for (int i = 0; i < height * width; ++i) {
 54 |     A[i] = i;
 55 |     B[i] = i * 10;
 56 |     C[i] = 0;
 57 |   }
 58 | 
 59 |   float *d_A;
 60 |   float *d_B;
 61 |   float *d_C;
 62 | 
 63 |   const int num_bytes = height * width * sizeof(float);
 64 | 
 65 |   ASSERT_CUDA(cudaMalloc(reinterpret_cast<void **>(&d_A), num_bytes));
 66 |   ASSERT_CUDA(cudaMalloc(reinterpret_cast<void **>(&d_B), num_bytes));
 67 |   ASSERT_CUDA(cudaMalloc(reinterpret_cast<void **>(&d_C), num_bytes));
 68 | 
 69 |   ASSERT_CUDA(cudaMemcpy(d_A, A, num_bytes, cudaMemcpyHostToDevice));
 70 |   ASSERT_CUDA(cudaMemcpy(d_B, B, num_bytes, cudaMemcpyHostToDevice));
 71 | 
 72 |   Initializer init(d_A, d_B, d_C, height, width);
 73 | 
 74 |   // Test different options.
 75 |   cuda::KernelBenchmark<int> bench;
 76 |   bench.Case<multiply_kernels::Multiply<float, 2>>(init);
 77 |   bench.Case<multiply_kernels::Multiply<float, 3>>(init);
 78 |   bench.Case<multiply_kernels::Multiply<float, 4>>(init);
 79 |   bench.Case<multiply_kernels::Multiply<float, 6>>(init);
 80 |   bench.Case<multiply_kernels::Multiply<float, 8>>(init);
 81 |   bench.Case<multiply_kernels::Multiply<float, 10>>(init);
 82 |   bench.Case<multiply_kernels::Multiply<float, 16>>(init);
 83 |   bench.Case<multiply_kernels::Multiply<float, 20>>(init);
 84 |   bench.Case<multiply_kernels::Multiply<float, 32>>(init);
 85 |   bench.Start();
 86 | 
 87 |   delete[] A;
 88 |   delete[] B;
 89 |   delete[] C;
 90 |   ASSERT_CUDA(cudaFree(d_A));
 91 |   ASSERT_CUDA(cudaFree(d_B));
 92 |   ASSERT_CUDA(cudaFree(d_C));
 93 | }
 94 | 
 95 | int main() {
 96 |   run_for(256, 256);
 97 |   run_for(512, 512);
 98 |   run_for(1024, 1024);
 99 |   return 0;
100 | }
101 | 
102 | // clang-format off
103 | /*
104 | Benchmark for 256 256 --------------------------------
105 | Using Device Number: 0
106 |   Device name: GeForce GTX 970
107 |   Memory Clock Rate (KHz): 3505000
108 |   Memory Bus Width (bits): 256
109 |   Peak Memory Bandwidth (GB/s): 224.320000
110 | 
111 | time 500.000000 - 1000.000000, iters: 5 - 100
112 |  - multiply_kernels::Multiply<float, 2>    took     3.047784 ms stats(iters: 100, var:     0.082136, stddev:     0.286594)
113 |  - multiply_kernels::Multiply<float, 3>    took     0.847197 ms stats(iters: 100, var:     0.002289, stddev:     0.047846)
114 |  - multiply_kernels::Multiply<float, 4>    took     0.337858 ms stats(iters: 100, var:     0.000039, stddev:     0.006252)
115 |  - multiply_kernels::Multiply<float, 6>    took     0.162206 ms stats(iters: 100, var:     0.000004, stddev:     0.001925)
116 |  - multiply_kernels::Multiply<float, 8>    took     0.081275 ms stats(iters: 100, var:     0.000000, stddev:     0.000677)
117 |  - multiply_kernels::Multiply<float, 10>   took     0.100844 ms stats(iters: 100, var:     0.000000, stddev:     0.000486)
118 |  - multiply_kernels::Multiply<float, 16>   took     0.072184 ms stats(iters: 100, var:     0.000001, stddev:     0.000723)
119 |  - multiply_kernels::Multiply<float, 20>   took     0.082570 ms stats(iters: 100, var:     0.000004, stddev:     0.001894)
120 |  - multiply_kernels::Multiply<float, 32>   took     0.070467 ms stats(iters: 100, var:     0.000008, stddev:     0.002803)
121 | 
122 | Benchmark for 512 512 --------------------------------
123 | Using Device Number: 0
124 |   Device name: GeForce GTX 970
125 |   Memory Clock Rate (KHz): 3505000
126 |   Memory Bus Width (bits): 256
127 |   Peak Memory Bandwidth (GB/s): 224.320000
128 | 
129 | time 500.000000 - 1000.000000, iters: 5 - 100
130 |  - multiply_kernels::Multiply<float, 2>    took    20.967186 ms stats(iters:  48, var:     1.166002, stddev:     1.079816)
131 |  - multiply_kernels::Multiply<float, 3>    took     6.682436 ms stats(iters: 100, var:     0.122818, stddev:     0.350454)
132 |  - multiply_kernels::Multiply<float, 4>    took     2.826743 ms stats(iters: 100, var:     0.067757, stddev:     0.260302)
133 |  - multiply_kernels::Multiply<float, 6>    took     1.245100 ms stats(iters: 100, var:     0.019352, stddev:     0.139112)
134 |  - multiply_kernels::Multiply<float, 8>    took     0.574468 ms stats(iters: 100, var:     0.000003, stddev:     0.001616)
135 |  - multiply_kernels::Multiply<float, 10>   took     0.713191 ms stats(iters: 100, var:     0.000003, stddev:     0.001810)
136 |  - multiply_kernels::Multiply<float, 16>   took     0.502195 ms stats(iters: 100, var:     0.000002, stddev:     0.001380)
137 |  - multiply_kernels::Multiply<float, 20>   took     0.560309 ms stats(iters: 100, var:     0.000006, stddev:     0.002414)
138 |  - multiply_kernels::Multiply<float, 32>   took     0.510635 ms stats(iters: 100, var:     0.000001, stddev:     0.001121)
139 | 
140 | Benchmark for 1024 1024 --------------------------------
141 | Using Device Number: 0
142 |   Device name: GeForce GTX 970
143 |   Memory Clock Rate (KHz): 3505000
144 |   Memory Bus Width (bits): 256
145 |   Peak Memory Bandwidth (GB/s): 224.320000
146 | 
147 | time 500.000000 - 1000.000000, iters: 5 - 100
148 |  - multiply_kernels::Multiply<float, 2>    took   287.646912 ms stats(iters:   4, var:   126.933113, stddev:    11.266459)
149 |  - multiply_kernels::Multiply<float, 3>    took    78.918053 ms stats(iters:  13, var:     1.950417, stddev:     1.396573)
150 |  - multiply_kernels::Multiply<float, 4>    took    33.681572 ms stats(iters:  15, var:     0.029435, stddev:     0.171566)
151 |  - multiply_kernels::Multiply<float, 6>    took    12.483257 ms stats(iters:  41, var:     0.002221, stddev:     0.047123)
152 |  - multiply_kernels::Multiply<float, 8>    took     5.562872 ms stats(iters: 100, var:     0.034724, stddev:     0.186343)
153 |  - multiply_kernels::Multiply<float, 10>   took     6.286970 ms stats(iters: 100, var:     0.010179, stddev:     0.100893)
154 |  - multiply_kernels::Multiply<float, 16>   took     4.158412 ms stats(iters: 100, var:     0.043726, stddev:     0.209108)
155 |  - multiply_kernels::Multiply<float, 20>   took     4.711136 ms stats(iters: 100, var:     0.064436, stddev:     0.253843)
156 |  - multiply_kernels::Multiply<float, 32>   took     4.059203 ms stats(iters: 100, var:     0.044180, stddev:     0.210191)
157 | 
158 | */
159 | // clang-format on
160 | 


--------------------------------------------------------------------------------
/src/deprecated_examples.cu_old:
--------------------------------------------------------------------------------
  1 | /* Copyright 2018 Authors. All Rights Reserved.
  2 | 
  3 |  * Licensed under the Apache License, Version 2.0 (the "License");
  4 |  * you may not use this file except in compliance with the License.
  5 |  * You may obtain a copy of the License at
  6 |  *
  7 |  *     http://www.apache.org/licenses/LICENSE-2.0
  8 |  *
  9 |  * Unless required by applicable law or agreed to in writing, software
 10 |  * distributed under the License is distributed on an "AS IS" BASIS,
 11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 |  * See the License for the specific language governing permissions and
 13 |  * limitations under the License.
 14 |  *
 15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
 16 |  *         Fabian Groh, <fabian.groh@uni-tuebingen.de>, 2019
 17 |  *
 18 |  */
 19 | 
 20 | #include <stdlib.h>
 21 | #include <algorithm>
 22 | #include <iostream>
 23 | 
 24 | #include "include/cuda_utils.h"
 25 | 
 26 | /*
 27 | nvcc examples.cu --expt-relaxed-constexpr -Xptxas="-v" -std=c++11 -o test
 28 | */
 29 | 
 30 | ////////////////////////////////////////////////////////////////////////////////
 31 | 
 32 | using cuda_utils::make_ndarray;
 33 | using cuda_utils::NdArray;
 34 | using cuda_utils::NdIndex;
 35 | 
 36 | #define check_cuda_call(ans) \
 37 |   { gpuAssert((ans), __FILE__, __LINE__); }
 38 | inline void gpuAssert(cudaError_t code, const char *file, int line,
 39 |                       bool abort = true) {
 40 |   if (code != cudaSuccess) {
 41 |     fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
 42 |             line);
 43 |     if (abort) exit(code);
 44 |   }
 45 | }
 46 | 
 47 | template <typename T, int num_threads>
 48 | __global__ void matrixMultiply____________normal__________(T *C, const T *A,
 49 |                                                            const T *B, int H,
 50 |                                                            int W) {
 51 |   __shared__ T ds_M[num_threads][num_threads];
 52 |   __shared__ T ds_N[num_threads][num_threads];
 53 | 
 54 |   int tx = threadIdx.x;
 55 |   int ty = threadIdx.y;
 56 |   int Ch = blockIdx.y * num_threads + ty;
 57 |   int Cw = blockIdx.x * num_threads + tx;
 58 | 
 59 |   T Cval = 0;
 60 | 
 61 |   for (int m = 0; m < (W - 1) / num_threads + 1; ++m) {
 62 |     if (Ch < H && m * num_threads + tx < W)
 63 |       ds_M[ty][tx] = A[Ch * W + m * num_threads + tx];
 64 |     else
 65 |       ds_M[ty][tx] = 0;
 66 |     if (Cw < W && m * num_threads + ty < H)
 67 |       ds_N[ty][tx] = B[(m * num_threads + ty) * W + Cw];
 68 |     else
 69 |       ds_N[ty][tx] = 0;
 70 |     __syncthreads();
 71 | 
 72 |     for (int k = 0; k < num_threads; ++k) Cval += ds_M[ty][k] * ds_N[k][tx];
 73 |     __syncthreads();
 74 |   }
 75 |   if (Ch < H && Cw < W) C[Ch * W + Cw] = Cval;
 76 | }
 77 | 
 78 | template <typename T, int num_threads>
 79 | __global__ void matrixMultiply____________tensor__________(T *C, const T *A,
 80 |                                                            const T *B, int H,
 81 |                                                            int W) {
 82 |   __shared__ T ds_M[num_threads][num_threads];
 83 |   __shared__ T ds_N[num_threads][num_threads];
 84 | 
 85 |   const int tx = threadIdx.x;
 86 |   const int ty = threadIdx.y;
 87 |   const int Ch = blockIdx.y * num_threads + ty;
 88 |   const int Cw = blockIdx.x * num_threads + tx;
 89 | 
 90 |   T Cval = 0;
 91 | 
 92 |   auto At = make_ndarray<const T, 2>(A, H, W);
 93 |   auto Bt = make_ndarray<const T, 2>(B, H, W);
 94 |   auto Ct = make_ndarray<T, 2>(C, H, W);
 95 | 
 96 |   for (int m = 0; m < (W - 1) / num_threads + 1; ++m) {
 97 |     ds_M[ty][tx] = At.safe_value(Ch, m * num_threads + tx);
 98 |     // ds_N[ty][tx] = Bt.safe_value(m * num_threads + ty, Cw);
 99 |     if (Bt.valid(m * num_threads + ty, Cw)) {
100 |       ds_N[ty][tx] = Bt(m * num_threads + ty, Cw);
101 |     } else {
102 |       ds_N[ty][tx] = 0;
103 |     }
104 |     __syncthreads();
105 | 
106 |     for (int k = 0; k < num_threads; ++k) Cval += ds_M[ty][k] * ds_N[k][tx];
107 |     __syncthreads();
108 |   }
109 |   if (Ct.valid(Ch, Cw)) Ct(Ch, Cw) = Cval;
110 | }
111 | 
112 | template <typename T, int num_threads>
113 | __global__ void matrixMultiply____________tensor2__________(
114 |     NdArray<T, 2> Ct, NdArray<const T, 2> At, NdArray<const T, 2> Bt) {
115 |   __shared__ T ds_M[num_threads][num_threads];
116 |   __shared__ T ds_N[num_threads][num_threads];
117 | 
118 |   const int tx = threadIdx.x;
119 |   const int ty = threadIdx.y;
120 |   const int Ch = blockIdx.y * num_threads + ty;
121 |   const int Cw = blockIdx.x * num_threads + tx;
122 |   const size_t W = Bt.template dim<1>();
123 | 
124 |   T Cval = 0;
125 | 
126 |   for (int m = 0; m < (W - 1) / num_threads + 1; ++m) {
127 |     ds_M[ty][tx] = At.safe_value(Ch, m * num_threads + tx);
128 |     ds_N[ty][tx] = Bt.safe_value(m * num_threads + ty, Cw);
129 |     __syncthreads();
130 | 
131 |     for (int k = 0; k < num_threads; ++k) Cval += ds_M[ty][k] * ds_N[k][tx];
132 |     __syncthreads();
133 |   }
134 |   if (Ct.valid(Ch, Cw)) Ct(Ch, Cw) = Cval;
135 | }
136 | 
137 | /************* INDEX SIMPLE ***************************************************/
138 | 
139 | __global__ void index____________normal__________(int A, int B, int C, int a,
140 |                                                   int b, int c) {
141 |   const int idx = a * (B * C) + b * C + c;
142 |   printf("value is %i\n", idx);
143 | }
144 | 
145 | __global__ void index____________tensor__________(int A, int B, int C, int a,
146 |                                                   int b, int c) {
147 |   auto idx = NdIndex<3>(A, B, C);
148 |   printf("value is %i\n", idx(a, b, c));
149 | }
150 | 
151 | template <typename T>
152 | __device__ __forceinline__ const T NAIVE_IDX(const T A, const T B, const T C,
153 |                                              T a, T b, T c) {
154 |   return a * B * C + b * C + c;
155 | }
156 | 
157 | __global__ void index____________naive__________(int A, int B, int C, int a,
158 |                                                  int b, int c) {
159 |   const int idx = NAIVE_IDX(A, B, C, a, b, c);
160 |   printf("value is %i\n", idx);
161 | }
162 | 
163 | /************* README EXAMPLE *************************************************/
164 | 
165 | __global__ void readme____________normal__________(float *src, float *dst,
166 |                                                    int B, int H, int W, int C,
167 |                                                    int b, int h, int w, int c) {
168 |   const int pos1 = b * (H * W * C) + h * (W * C) + w * (C) + c;
169 |   const int pos2 = b * (H * W * C) + h * (W * C) + w * (C) + (c + 1);
170 |   dst[pos2] = src[pos1];
171 | }
172 | 
173 | __global__ void readme____________tensor__________(float *src, float *dst,
174 |                                                    int B, int H, int W, int C,
175 |                                                    int b, int h, int w, int c) {
176 |   auto idx = NdIndex<4>(B, H, W, C);
177 |   src[idx(b, h, w, c)] = dst[idx(b, h, w, c)];
178 |   // auto src_t = Tensor(src, B, H, W, C);
179 |   // auto dst_t = Tensor(dst, B, H, W, C);
180 |   // src_t(b, h, w, c) = dst_t(b, h, w, c);
181 | }
182 | /************* FLEX-DECONV ***************************************************/
183 | // Used 42 registers, 392 bytes cmem[0]
184 | // taken from https://github.com/cgtuebingen/Flex-Convolution
185 | template <typename Dtype>
186 | __global__ void flex_deconv_simple(const int B, const int N, const int K,
187 |                                    const int Dp, const int Din, const int Dout,
188 |                                    const Dtype *positions,
189 |                                    const Dtype *features,
190 |                                    const int *neighborhood, const Dtype *theta,
191 |                                    const Dtype *bias, Dtype *output) {
192 |   const int b = blockIdx.z;
193 | 
194 |   for (int n = blockIdx.y * blockDim.y + threadIdx.y; n < N;
195 |        n += blockDim.y * gridDim.y) {
196 |     const int self_k = neighborhood[b * K * N + 0 * N + n];
197 | 
198 |     for (int k_ = 0; k_ < K; ++k_) {
199 |       const int other_k = neighborhood[b * K * N + k_ * N + n];
200 | 
201 |       for (int dout = blockIdx.x * blockDim.x + threadIdx.x; dout < Dout;
202 |            dout += blockDim.x * gridDim.x) {
203 |         for (int din = 0; din < Din; ++din) {
204 |           const Dtype v = features[b * Din * N + din * N + self_k];
205 |           Dtype W = bias[din * Dout + dout];
206 | 
207 |           for (int dp = 0; dp < Dp; ++dp) {
208 |             Dtype delta = positions[b * Dp * N + dp * N + other_k] -
209 |                           positions[b * Dp * N + dp * N + self_k];
210 |             W += theta[dp * Din * Dout + din * Dout + dout] * delta;
211 |           }
212 | 
213 |           Dtype Wv = W * v;
214 |           // this has been an atomic add
215 |           output[b * Dout * N + dout * N + other_k] += Wv;
216 |         }
217 |       }
218 |     }
219 |   }
220 | }
221 | 
222 | // Used 48 registers, 392 bytes cmem[0]
223 | template <typename T>
224 | __global__ void flex_deconv_tensor(const int B, const int N, const int K,
225 |                                    const int Dp, const int Din, const int Dout,
226 |                                    const T *positions, const T *features,
227 |                                    const int *neighborhood, const T *theta,
228 |                                    const T *bias, T *output) {
229 |   auto pos_t = make_ndarray<const T, 3>(positions, B, Dp, N);
230 |   auto feat_t = make_ndarray<const T, 3>(features, B, Din, N);
231 |   auto theta_t = make_ndarray<const T, 3>(theta, Dp, Din, Dout);
232 |   auto bias_t = make_ndarray<const T, 2>(bias, Din, Dout);
233 |   auto neighborhood_t = make_ndarray<const int, 3>(neighborhood, B, K, N);
234 |   auto output_t = make_ndarray<T, 3>(output, B, Dout, N);
235 | 
236 |   const int b = blockIdx.z;
237 | 
238 |   for (int n = blockIdx.y * blockDim.y + threadIdx.y; n < N;
239 |        n += blockDim.y * gridDim.y) {
240 |     const int self_k = neighborhood_t(b, 0, n);
241 | 
242 |     for (int k_ = 0; k_ < K; ++k_) {
243 |       const int other_k = neighborhood_t(b, k_, n);
244 | 
245 |       for (int dout = blockIdx.x * blockDim.x + threadIdx.x; dout < Dout;
246 |            dout += blockDim.x * gridDim.x) {
247 |         for (int din = 0; din < Din; ++din) {
248 |           const T v = feat_t(b, din, self_k);
249 |           T W = bias_t(din, dout);
250 | 
251 |           for (int dp = 0; dp < Dp; ++dp) {
252 |             T delta = pos_t(b, dp, other_k) - pos_t(b, dp, self_k);
253 |             W += theta_t(dp, din, dout) * delta;
254 |           }
255 | 
256 |           T Wv = W * v;
257 |           output_t(b, dout, other_k) += Wv;
258 |         }
259 |       }
260 |     }
261 |   }
262 | }
263 | 
264 | int up2(int len, int th) { return (len - 1) / th + 1; }
265 | void run_flex_deconv() {
266 |   // this will fail, but during compilation, we will see register usage
267 |   int B = 8;
268 |   int N = 1024;
269 |   int K = 8;
270 |   int Dp = 3;
271 |   int Din = 64;
272 |   int Dout = 64;
273 | 
274 |   float *positions_;
275 |   float *features_;
276 |   int *neighborhood_;
277 |   float *theta_;
278 |   float *bias_;
279 |   float *output_;
280 | 
281 |   const int threads = 32;
282 |   dim3 block(threads, threads, 1);
283 |   dim3 grid(up2(Dout, threads), up2(N, threads), B);
284 | 
285 |   flex_deconv_simple<float><<<grid, block>>>(B, N, K, Dp, Din, Dout, positions_,
286 |                                              features_, neighborhood_, theta_,
287 |                                              bias_, output_);
288 |   flex_deconv_tensor<float><<<grid, block>>>(B, N, K, Dp, Din, Dout, positions_,
289 |                                              features_, neighborhood_, theta_,
290 |                                              bias_, output_);
291 | }
292 | 
293 | void run_readme() {
294 |   int B = 4;
295 |   int H = 17;
296 |   int W = 32;
297 |   int C = 32;
298 |   float *d_src;
299 |   float *d_dst;
300 |   check_cuda_call(cudaMalloc(&d_src, sizeof(float) * B * H * W * C));
301 |   check_cuda_call(cudaMalloc(&d_dst, sizeof(float) * B * H * W * C));
302 | 
303 |   int b = 1;
304 |   int h = 3;
305 |   int w = 3;
306 |   int c = 8;
307 |   dim3 grid1(1);
308 |   dim3 block1(1);
309 |   readme____________normal__________<<<grid1, block1>>>(d_src, d_dst, B, H, W,
310 |                                                         C, b, h, w, c);
311 |   readme____________tensor__________<<<grid1, block1>>>(d_src, d_dst, B, H, W,
312 |                                                         C, b, h, w, c);
313 | }
314 | 
315 | void run_simple() {
316 |   int A = 4;
317 |   int B = 17;
318 |   int C = 32;
319 | 
320 |   int a = 1;
321 |   int b = 3;
322 |   int c = 8;
323 |   dim3 grid1(1);
324 |   dim3 block1(1);
325 |   index____________normal__________<<<grid1, block1>>>(A, B, C, a, b, c);
326 |   index____________tensor__________<<<grid1, block1>>>(A, B, C, a, b, c);
327 |   index____________naive__________<<<grid1, block1>>>(A, B, C, a, b, c);
328 | }
329 | 
330 | void run_matmul() {
331 |   int H = 4;
332 |   int W = 5;
333 |   float *matA = new float[H * W];
334 |   float *matB = new float[H * W];
335 |   float *matC1 = new float[H * W];
336 |   float *matC2 = new float[H * W];
337 |   float *matC3 = new float[H * W];
338 | 
339 |   for (int i = 0; i < H * W; ++i) {
340 |     matA[i] = rand_r() / static_cast<float>(RAND_MAX);
341 |     matB[i] = rand_r() / static_cast<float>(RAND_MAX);
342 |     matC1[i] = rand_r() / static_cast<float>(RAND_MAX);
343 |     matC2[i] = rand_r() / static_cast<float>(RAND_MAX);
344 |     matC3[i] = rand_r() / static_cast<float>(RAND_MAX);
345 |   }
346 | 
347 |   float *d_matA;
348 |   float *d_matB;
349 |   float *d_matC1;
350 |   float *d_matC2;
351 |   float *d_matC3;
352 | 
353 |   check_cuda_call(cudaMalloc(&d_matA, sizeof(float) * H * W));
354 |   check_cuda_call(cudaMalloc(&d_matB, sizeof(float) * H * W));
355 |   check_cuda_call(cudaMalloc(&d_matC1, sizeof(float) * H * W));
356 |   check_cuda_call(cudaMalloc(&d_matC2, sizeof(float) * H * W));
357 |   check_cuda_call(cudaMalloc(&d_matC3, sizeof(float) * H * W));
358 | 
359 |   check_cuda_call(
360 |       cudaMemcpy(d_matA, matA, sizeof(float) * H * W, cudaMemcpyHostToDevice));
361 |   check_cuda_call(
362 |       cudaMemcpy(d_matB, matB, sizeof(float) * H * W, cudaMemcpyHostToDevice));
363 |   check_cuda_call(cudaMemcpy(d_matC1, matC1, sizeof(float) * H * W,
364 |                              cudaMemcpyHostToDevice));
365 |   check_cuda_call(cudaMemcpy(d_matC2, matC2, sizeof(float) * H * W,
366 |                              cudaMemcpyHostToDevice));
367 |   check_cuda_call(cudaMemcpy(d_matC3, matC3, sizeof(float) * H * W,
368 |                              cudaMemcpyHostToDevice));
369 | 
370 |   const int num_threads = 32;
371 |   dim3 threads(num_threads, num_threads);
372 |   dim3 grid((W + 1) / num_threads + 1, (W + 1) / num_threads + 1);
373 | 
374 |   matrixMultiply____________normal__________<float, 32><<<grid, threads>>>(
375 |       d_matC1, d_matA, d_matB, H, W);
376 | 
377 |   check_cuda_call(cudaPeekAtLastError());
378 |   check_cuda_call(cudaGetLastError());
379 |   check_cuda_call(cudaDeviceSynchronize());
380 | 
381 |   matrixMultiply____________tensor__________<float, 32><<<grid, threads>>>(
382 |       d_matC2, d_matA, d_matB, H, W);
383 | 
384 |   check_cuda_call(cudaPeekAtLastError());
385 |   check_cuda_call(cudaGetLastError());
386 |   check_cuda_call(cudaDeviceSynchronize());
387 | 
388 |   auto Ct = make_ndarray<float, 2>(d_matC3, H, W);
389 |   auto At = make_ndarray<const float, 2>(d_matA, H, W);
390 |   auto Bt = make_ndarray<const float, 2>(d_matB, H, W);
391 | 
392 |   matrixMultiply____________tensor2__________<float, 32><<<grid, threads>>>(
393 |       Ct, At, Bt);
394 | 
395 |   check_cuda_call(cudaPeekAtLastError());
396 |   check_cuda_call(cudaGetLastError());
397 |   check_cuda_call(cudaDeviceSynchronize());
398 | 
399 |   check_cuda_call(cudaMemcpy(matC1, d_matC1, H * W * sizeof(float),
400 |                              cudaMemcpyDeviceToHost));
401 |   check_cuda_call(cudaMemcpy(matC2, d_matC2, H * W * sizeof(float),
402 |                              cudaMemcpyDeviceToHost));
403 |   check_cuda_call(cudaMemcpy(matC3, d_matC3, H * W * sizeof(float),
404 |                              cudaMemcpyDeviceToHost));
405 | 
406 |   // verify
407 |   bool good = true;
408 |   printf("\n");
409 |   for (int i = 0; i < H * W; ++i) {
410 |     if (fabs(matC1[i] - matC2[i]) > 1e-8) {
411 |       printf("%i %f %f %f ", i, matC1[i], matC2[i], matA[i]);
412 |       good = false;
413 |     }
414 |     if (fabs(matC1[i] - matC3[i]) > 1e-8) {
415 |       printf("%i %f %f %f ", i, matC1[i], matC3[i], matA[i]);
416 |       good = false;
417 |     }
418 |   }
419 |   printf("\n");
420 |   if (good)
421 |     printf("good\n");
422 |   else
423 |     printf("bad\n");
424 | }
425 | 
426 | /******************************************************************************/
427 | 
428 | int main() {
429 |   run_matmul();
430 |   // run_readme();
431 |   // run_simple();
432 |   // run_flex_deconv();
433 |   return 0;
434 | }
435 | 


--------------------------------------------------------------------------------
/src/multiply.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Authors. All Rights Reserved.
 2 | 
 3 |  * Licensed under the Apache License, Version 2.0 (the "License");
 4 |  * you may not use this file except in compliance with the License.
 5 |  * You may obtain a copy of the License at
 6 |  *
 7 |  *     http://www.apache.org/licenses/LICENSE-2.0
 8 |  *
 9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
16 |  *
17 |  */
18 | #include "include/multiply/multiply.h"
19 | 
20 | #include <stdio.h>
21 | 
22 | #include "include/cuda_utils.h"
23 | 
24 | void print_mat(float *A, int H, int W) {
25 |   for (int h = 0; h < H; ++h) {
26 |     for (int w = 0; w < W; ++w) {
27 |       printf("%2.2f ", A[h * W + w]);
28 |     }
29 |     printf("\n");
30 |   }
31 |   printf("\n");
32 | }
33 | 
34 | int main() {
35 |   float *A = new float[2 * 2];
36 |   float *B = new float[2 * 2];
37 |   float *C = new float[2 * 2];
38 |   for (int i = 0; i < 2 * 2; ++i) {
39 |     A[i] = i;
40 |     B[i] = i * 10;
41 |     C[i] = 0;
42 |   }
43 | 
44 |   print_mat(A, 2, 2);
45 |   print_mat(B, 2, 2);
46 | 
47 |   // ...........................................................................
48 |   printf("Cpu output\n");
49 |   // run on CPU
50 |   Multiply<float, CpuDevice>::Apply(A, B, 2, 2, C);
51 |   print_mat(C, 2, 2);
52 | 
53 | // ...........................................................................
54 | #if WITH_CUDA
55 |   printf("Gpu output\n");
56 |   for (int i = 0; i < 2 * 2; ++i) {
57 |     C[i] = 0;
58 |   }
59 | 
60 |   // run on GPU
61 |   Multiply<float, GpuDevice>::Apply(A, B, 2, 2, C);
62 | 
63 |   print_mat(C, 2, 2);
64 | #endif
65 | 
66 |   // ...........................................................................
67 |   printf("auto output\n");
68 |   for (int i = 0; i < 2 * 2; ++i) {
69 |     C[i] = 0;
70 |   }
71 |   // run on GPU if available else run on CPU
72 |   Multiply<float>::Apply(A, B, 2, 2, C);
73 | 
74 |   print_mat(C, 2, 2);
75 | 
76 |   return 0;
77 | }
78 | 


--------------------------------------------------------------------------------
/src/multiply/multiply.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Authors. All Rights Reserved.
 2 | 
 3 |  * Licensed under the Apache License, Version 2.0 (the "License");
 4 |  * you may not use this file except in compliance with the License.
 5 |  * You may obtain a copy of the License at
 6 |  *
 7 |  *     http://www.apache.org/licenses/LICENSE-2.0
 8 |  *
 9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
16 |  *
17 |  */
18 | 
19 | #include "include/multiply/multiply.h"
20 | #include "include/cuda_utils.h"
21 | 
22 | template <typename ValueT>
23 | struct Multiply<ValueT> {
24 |   static void Apply(const ValueT* A, const ValueT* B, const int H, const int W,
25 |                     ValueT* C) {
26 | #if WITH_CUDA
27 |     Multiply<ValueT, GpuDevice>::Apply(A, B, H, W, C);
28 | #else   // WITH_CUDA
29 |     Multiply<ValueT, CpuDevice>::Apply(A, B, H, W, C);
30 | #endif  // WITH_CUDA
31 |   }
32 | };
33 | 
34 | template struct Multiply<double>;
35 | template struct Multiply<float>;
36 | template struct Multiply<int>;
37 | 


--------------------------------------------------------------------------------
/src/multiply/multiply_cpu.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Authors. All Rights Reserved.
 2 | 
 3 |  * Licensed under the Apache License, Version 2.0 (the "License");
 4 |  * you may not use this file except in compliance with the License.
 5 |  * You may obtain a copy of the License at
 6 |  *
 7 |  *     http://www.apache.org/licenses/LICENSE-2.0
 8 |  *
 9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
16 |  *
17 |  */
18 | 
19 | #include "include/multiply/multiply.h"
20 | 
21 | #include "include/cuda_utils.h"
22 | 
23 | template <typename ValueT>
24 | struct Multiply<ValueT, CpuDevice> {
25 |   static void Apply(const ValueT* A, const ValueT* B, const int H, const int W,
26 |                     ValueT* C) {
27 |     for (int h = 0; h < H; ++h) {
28 |       for (int w = 0; w < W; ++w) {
29 |         C[h * W + w] = 0;
30 |         for (int k = 0; k < W; ++k) {
31 |           C[h * W + w] += A[h * W + k] * B[k * H + w];
32 |         }
33 |       }
34 |     }
35 |   }
36 | };
37 | 
38 | template struct Multiply<double, CpuDevice>;
39 | template struct Multiply<float, CpuDevice>;
40 | template struct Multiply<int, CpuDevice>;
41 | 


--------------------------------------------------------------------------------
/src/multiply/multiply_gpu.cu.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Authors. All Rights Reserved.
 2 | 
 3 |  * Licensed under the Apache License, Version 2.0 (the "License");
 4 |  * you may not use this file except in compliance with the License.
 5 |  * You may obtain a copy of the License at
 6 |  *
 7 |  *     http://www.apache.org/licenses/LICENSE-2.0
 8 |  *
 9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
16 |  *
17 |  */
18 | 
19 | #if __CUDACC__
20 | 
21 | #include "include/multiply/multiply.h"
22 | 
23 | #include "include/cuda_utils.h"
24 | 
25 | template <typename ValueT>
26 | struct Multiply<ValueT, GpuDevice> {
27 |   static void Apply(const ValueT* A, const ValueT* B, const int H, const int W,
28 |                     ValueT* C) {
29 |     const int num_bytes = H * W * sizeof(ValueT);
30 | 
31 |     ValueT* d_A;
32 |     ValueT* d_B;
33 |     ValueT* d_C;
34 | 
35 |     ASSERT_CUDA(cudaMalloc(reinterpret_cast<void**>(&d_A), num_bytes));
36 |     ASSERT_CUDA(cudaMalloc(reinterpret_cast<void**>(&d_B), num_bytes));
37 |     ASSERT_CUDA(cudaMalloc(reinterpret_cast<void**>(&d_C), num_bytes));
38 | 
39 |     ASSERT_CUDA(cudaMemcpy(d_A, A, num_bytes, cudaMemcpyHostToDevice));
40 |     ASSERT_CUDA(cudaMemcpy(d_B, B, num_bytes, cudaMemcpyHostToDevice));
41 | 
42 |     multiply_kernels::Multiply<ValueT, 32> kernel;
43 |     kernel.H = H;
44 |     kernel.W = W;
45 |     kernel.A = d_A;
46 |     kernel.B = d_B;
47 |     kernel.C = d_C;
48 |     kernel.Launch();
49 |     ASSERT_CUDA(cudaDeviceSynchronize());
50 | 
51 |     ASSERT_CUDA(cudaMemcpy(C, d_C, num_bytes, cudaMemcpyDeviceToHost));
52 | 
53 |     // needed to wait for CUDA kernel output
54 |     // std::cout << cuda::Benchmark(&kernel) << std::endl;
55 |   }
56 | };
57 | 
58 | template struct Multiply<double, GpuDevice>;
59 | template struct Multiply<float, GpuDevice>;
60 | template struct Multiply<int, GpuDevice>;
61 | 
62 | #endif  // __CUDACC__
63 | 


--------------------------------------------------------------------------------
/src/sharedmemory.cu.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Authors. All Rights Reserved.
 2 | 
 3 |  * Licensed under the Apache License, Version 2.0 (the "License");
 4 |  * you may not use this file except in compliance with the License.
 5 |  * You may obtain a copy of the License at
 6 |  *
 7 |  *     http://www.apache.org/licenses/LICENSE-2.0
 8 |  *
 9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
16 |  *
17 |  */
18 | 
19 | #if __CUDACC__
20 | 
21 | #include <iostream>
22 | 
23 | #include "include/cuda_index.h"
24 | #include "include/cuda_utils.h"
25 | 
26 | namespace {
27 | 
28 | struct AddSharedMemoryCUDAKernel : public cuda::Kernel {
29 |   void Launch(cudaStream_t stream = 0) override {
30 |     dim3 block(2);
31 |     dim3 grid(1);
32 | 
33 |     cuda::SharedMemory shm;
34 |     shm.add<float>(5);
35 |     shm.add<int>(3);
36 | 
37 |     cuda::Run<<<grid, block, shm.bytes, stream>>>(*this);
38 |   }
39 | 
40 |   __device__ __forceinline__ void operator()() const override {
41 |     cuda::SharedMemory shm;
42 |     float* floats_5 = shm.ref<float>(5);
43 |     int* ints_3 = shm.ref<int>(3);
44 | 
45 |     if (threadIdx.x == 0) {
46 |       floats_5[0] = 1.f;
47 |       floats_5[1] = 2.f;
48 |       floats_5[2] = 3.f;
49 |       floats_5[3] = 4.f;
50 |       floats_5[4] = 5.f;
51 | 
52 |       ints_3[0] = 11;
53 |       ints_3[1] = 22;
54 |       ints_3[2] = 33;
55 |     }
56 |     __syncthreads();
57 |     if (threadIdx.x == 1) {
58 |       float float_sum = 0;
59 |       for (int i = 0; i < 5; ++i) {
60 |         float_sum += floats_5[i];
61 |         floats_5[i] = 0;
62 |       }
63 |       int int_sum = 0;
64 |       for (int i = 0; i < 3; ++i) {
65 |         int_sum += ints_3[i];
66 |         ints_3[i] = 0;
67 |       }
68 | 
69 |       printf("float sum: %f\n", float_sum);
70 |       printf("int sum: %d\n", int_sum);
71 |     }
72 |   }
73 | };
74 | }  // namespace
75 | 
76 | int main() {
77 |   AddSharedMemoryCUDAKernel kernel;
78 |   kernel.Launch();
79 |   ASSERT_CUDA(cudaDeviceSynchronize());
80 |   return 0;
81 | }
82 | 
83 | #endif  // __CUDACC__
84 | 


--------------------------------------------------------------------------------
/src/tune.cu.cc:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Authors. All Rights Reserved.
  2 | 
  3 |  * Licensed under the Apache License, Version 2.0 (the "License");
  4 |  * you may not use this file except in compliance with the License.
  5 |  * You may obtain a copy of the License at
  6 |  *
  7 |  *     http://www.apache.org/licenses/LICENSE-2.0
  8 |  *
  9 |  * Unless required by applicable law or agreed to in writing, software
 10 |  * distributed under the License is distributed on an "AS IS" BASIS,
 11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 |  * See the License for the specific language governing permissions and
 13 |  * limitations under the License.
 14 |  *
 15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
 16 |  *
 17 |  */
 18 | 
 19 | #include "include/cuda_utils.h"
 20 | 
 21 | namespace {
 22 | 
 23 | // Example for 1D generic
 24 | template <typename ValueT, int BLOCK_DIM_X>
 25 | struct ExpertKernel1D : public cuda::Kernel {
 26 |   void Launch(cudaStream_t stream = 0) override {
 27 |     dim3 block(BLOCK_DIM_X);
 28 |     dim3 grid(1);
 29 |     cuda::Run<<<grid, block, 0, stream>>>(*this);
 30 |   }
 31 | 
 32 |   __device__ __forceinline__ void operator()() const override {
 33 |     printf("thread %d here from the expert-kernel %d [val=%f]\n", threadIdx.x,
 34 |            BLOCK_DIM_X, val);
 35 |   }
 36 | 
 37 |   ValueT val = 0;
 38 | };
 39 | 
 40 | // Example for 1D special
 41 | template <typename ValueT>
 42 | struct ExpertKernel1D<ValueT, 4> : public cuda::Kernel {
 43 |   void Launch(cudaStream_t stream = 0) override {
 44 |     dim3 block(4);
 45 |     dim3 grid(1);
 46 |     cuda::Run<<<grid, block, 0, stream>>>(*this);
 47 |   }
 48 | 
 49 |   __device__ __forceinline__ void operator()() const override {
 50 |     printf("thread %d here from the special expert-kernel of 4 [val=%f]\n",
 51 |            threadIdx.x, val);
 52 |   }
 53 |   ValueT val = 0;
 54 | };
 55 | 
 56 | // Example for 1D generic
 57 | template <typename ValueT, int BLOCK_DIM_X, int BLOCK_DIM_Y>
 58 | struct ExpertKernel2D : public cuda::Kernel {
 59 |   void Launch(cudaStream_t stream = 0) override {
 60 |     dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
 61 |     dim3 grid(1, 1);
 62 |     cuda::Run<<<grid, block, 0, stream>>>(*this);
 63 |   }
 64 | 
 65 |   __device__ __forceinline__ void operator()() const override {
 66 |     const int tid = threadIdx.x * BLOCK_DIM_Y + threadIdx.y;
 67 |     if (!tid)
 68 |       printf("thread %d here from the expert-kernel %d x %d [val=%f]\n", tid,
 69 |              BLOCK_DIM_X, BLOCK_DIM_Y, val);
 70 |   }
 71 |   ValueT val = 0;
 72 | };
 73 | 
 74 | // Example for 1D special
 75 | template <typename ValueT>
 76 | struct ExpertKernel2D<ValueT, 4, 3> : public cuda::Kernel {
 77 |   void Launch(cudaStream_t stream = 0) override {
 78 |     dim3 block(4, 3);
 79 |     dim3 grid(1, 1);
 80 |     cuda::Run<<<grid, block, 0, stream>>>(*this);
 81 |   }
 82 | 
 83 |   __device__ __forceinline__ void operator()() const override {
 84 |     const int tid = threadIdx.x * 3 + threadIdx.y;
 85 |     if (!tid)
 86 |       printf(
 87 |           "thread %d here from the special expert-kernel of 4 x 3 [val=%f]\n",
 88 |           tid, val);
 89 |   }
 90 |   ValueT val = 0;
 91 | };
 92 | }  // namespace
 93 | 
 94 | // Workaround to initialize all kernels for the dispatcher.
 95 | struct Initializer {
 96 |   explicit Initializer(float val) : val(val) {}
 97 | 
 98 |   template <typename TKernel>
 99 |   void operator()(TKernel* kernel) {
100 |     kernel->val = val;
101 |   }
102 | 
103 |   float val;
104 | };
105 | 
106 | int main() {
107 |   // We initialize these kernels using Initializer.
108 |   // From c++14 on, we could use a lambda function.
109 |   // But for now, we need this workaround.
110 |   Initializer init(42.f);
111 | 
112 |   // Simple hyper-parameter:
113 |   cuda::KernelDispatcher<int> disp(true);
114 |   disp.Register<ExpertKernel1D<float, 4>>(3, init);
115 |   disp.Register<ExpertKernel1D<float, 8>>(6, init);
116 | 
117 |   for (int i = 0; i < 9; ++i) {
118 |     printf("%d : \n", i);
119 |     disp.Run(i);
120 |     ASSERT_CUDA(cudaDeviceSynchronize());
121 |   }
122 | 
123 |   // Multi-dimensional hyper-parameters:
124 |   cuda::KernelDispatcher<std::tuple<int, int>> disp2(true);
125 |   disp2.Register<ExpertKernel2D<float, 4, 3>>(std::make_tuple(4, 3), init);
126 |   disp2.Register<ExpertKernel2D<float, 8, 4>>(std::make_tuple(9, 4), init);
127 | 
128 |   for (int i = 0; i < 10; ++i) {
129 |     for (int j = 0; j < 5; ++j) {
130 |       printf("i: %d j %d\n", i, j);
131 |       disp2.Run(std::make_tuple(i, j));
132 |       ASSERT_CUDA(cudaDeviceSynchronize());
133 |     }
134 |   }
135 | 
136 |   return 0;
137 | }
138 | 
139 | /*
140 | 0 :
141 | thread 0 here from the special expert-kernel of 4 [val=42.000000]
142 | thread 1 here from the special expert-kernel of 4 [val=42.000000]
143 | thread 2 here from the special expert-kernel of 4 [val=42.000000]
144 | thread 3 here from the special expert-kernel of 4 [val=42.000000]
145 | 1 :
146 | thread 0 here from the special expert-kernel of 4 [val=42.000000]
147 | thread 1 here from the special expert-kernel of 4 [val=42.000000]
148 | thread 2 here from the special expert-kernel of 4 [val=42.000000]
149 | thread 3 here from the special expert-kernel of 4 [val=42.000000]
150 | 2 :
151 | thread 0 here from the special expert-kernel of 4 [val=42.000000]
152 | thread 1 here from the special expert-kernel of 4 [val=42.000000]
153 | thread 2 here from the special expert-kernel of 4 [val=42.000000]
154 | thread 3 here from the special expert-kernel of 4 [val=42.000000]
155 | 3 :
156 | thread 0 here from the special expert-kernel of 4 [val=42.000000]
157 | thread 1 here from the special expert-kernel of 4 [val=42.000000]
158 | thread 2 here from the special expert-kernel of 4 [val=42.000000]
159 | thread 3 here from the special expert-kernel of 4 [val=42.000000]
160 | 4 :
161 | thread 0 here from the expert-kernel 8 [val=42.000000]
162 | thread 1 here from the expert-kernel 8 [val=42.000000]
163 | thread 2 here from the expert-kernel 8 [val=42.000000]
164 | thread 3 here from the expert-kernel 8 [val=42.000000]
165 | thread 4 here from the expert-kernel 8 [val=42.000000]
166 | thread 5 here from the expert-kernel 8 [val=42.000000]
167 | thread 6 here from the expert-kernel 8 [val=42.000000]
168 | thread 7 here from the expert-kernel 8 [val=42.000000]
169 | 5 :
170 | thread 0 here from the expert-kernel 8 [val=42.000000]
171 | thread 1 here from the expert-kernel 8 [val=42.000000]
172 | thread 2 here from the expert-kernel 8 [val=42.000000]
173 | thread 3 here from the expert-kernel 8 [val=42.000000]
174 | thread 4 here from the expert-kernel 8 [val=42.000000]
175 | thread 5 here from the expert-kernel 8 [val=42.000000]
176 | thread 6 here from the expert-kernel 8 [val=42.000000]
177 | thread 7 here from the expert-kernel 8 [val=42.000000]
178 | 6 :
179 | thread 0 here from the expert-kernel 8 [val=42.000000]
180 | thread 1 here from the expert-kernel 8 [val=42.000000]
181 | thread 2 here from the expert-kernel 8 [val=42.000000]
182 | thread 3 here from the expert-kernel 8 [val=42.000000]
183 | thread 4 here from the expert-kernel 8 [val=42.000000]
184 | thread 5 here from the expert-kernel 8 [val=42.000000]
185 | thread 6 here from the expert-kernel 8 [val=42.000000]
186 | thread 7 here from the expert-kernel 8 [val=42.000000]
187 | 7 :
188 | thread 0 here from the expert-kernel 8 [val=42.000000]
189 | thread 1 here from the expert-kernel 8 [val=42.000000]
190 | thread 2 here from the expert-kernel 8 [val=42.000000]
191 | thread 3 here from the expert-kernel 8 [val=42.000000]
192 | thread 4 here from the expert-kernel 8 [val=42.000000]
193 | thread 5 here from the expert-kernel 8 [val=42.000000]
194 | thread 6 here from the expert-kernel 8 [val=42.000000]
195 | thread 7 here from the expert-kernel 8 [val=42.000000]
196 | 8 :
197 | thread 0 here from the expert-kernel 8 [val=42.000000]
198 | thread 1 here from the expert-kernel 8 [val=42.000000]
199 | thread 2 here from the expert-kernel 8 [val=42.000000]
200 | thread 3 here from the expert-kernel 8 [val=42.000000]
201 | thread 4 here from the expert-kernel 8 [val=42.000000]
202 | thread 5 here from the expert-kernel 8 [val=42.000000]
203 | thread 6 here from the expert-kernel 8 [val=42.000000]
204 | thread 7 here from the expert-kernel 8 [val=42.000000]
205 | i: 0 j 0
206 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
207 | i: 0 j 1
208 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
209 | i: 0 j 2
210 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
211 | i: 0 j 3
212 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
213 | i: 0 j 4
214 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
215 | i: 1 j 0
216 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
217 | i: 1 j 1
218 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
219 | i: 1 j 2
220 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
221 | i: 1 j 3
222 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
223 | i: 1 j 4
224 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
225 | i: 2 j 0
226 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
227 | i: 2 j 1
228 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
229 | i: 2 j 2
230 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
231 | i: 2 j 3
232 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
233 | i: 2 j 4
234 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
235 | i: 3 j 0
236 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
237 | i: 3 j 1
238 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
239 | i: 3 j 2
240 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
241 | i: 3 j 3
242 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
243 | i: 3 j 4
244 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
245 | i: 4 j 0
246 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
247 | i: 4 j 1
248 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
249 | i: 4 j 2
250 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
251 | i: 4 j 3
252 | thread 0 here from the special expert-kernel of 4 x 3 [val=42.000000]
253 | i: 4 j 4
254 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
255 | i: 5 j 0
256 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
257 | i: 5 j 1
258 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
259 | i: 5 j 2
260 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
261 | i: 5 j 3
262 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
263 | i: 5 j 4
264 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
265 | i: 6 j 0
266 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
267 | i: 6 j 1
268 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
269 | i: 6 j 2
270 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
271 | i: 6 j 3
272 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
273 | i: 6 j 4
274 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
275 | i: 7 j 0
276 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
277 | i: 7 j 1
278 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
279 | i: 7 j 2
280 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
281 | i: 7 j 3
282 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
283 | i: 7 j 4
284 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
285 | i: 8 j 0
286 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
287 | i: 8 j 1
288 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
289 | i: 8 j 2
290 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
291 | i: 8 j 3
292 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
293 | i: 8 j 4
294 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
295 | i: 9 j 0
296 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
297 | i: 9 j 1
298 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
299 | i: 9 j 2
300 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
301 | i: 9 j 3
302 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
303 | i: 9 j 4
304 | thread 0 here from the expert-kernel 8 x 4 [val=42.000000]
305 |  */
306 | 


--------------------------------------------------------------------------------
/test/test_multiply.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Authors. All Rights Reserved.
 2 | 
 3 |  * Licensed under the Apache License, Version 2.0 (the "License");
 4 |  * you may not use this file except in compliance with the License.
 5 |  * You may obtain a copy of the License at
 6 |  *
 7 |  *     http://www.apache.org/licenses/LICENSE-2.0
 8 |  *
 9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
16 |  *
17 |  */
18 | 
19 | #include "include/test/multiply.h"
20 | 
21 | #include "gmock/gmock.h"
22 | #include "gtest/gtest.h"
23 | #include "include/cuda_utils.h"
24 | #include "include/multiply/multiply.h"
25 | 
26 | namespace {
27 | 
28 | TEST(MultiplyTest, ExtraCpuTest) { EXPECT_TRUE(true); }
29 | 
30 | using Devices = ::testing::Types<CpuDevice>;
31 | INSTANTIATE_TYPED_TEST_SUITE_P(Example, MultiplyTest, Devices);
32 | 
33 | }  // namespace
34 | 
35 | int main(int argc, char **argv) {
36 |   ::testing::InitGoogleMock(&argc, argv);
37 |   return RUN_ALL_TESTS();
38 | }
39 | 


--------------------------------------------------------------------------------
/test/test_multiply.cu.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Authors. All Rights Reserved.
 2 | 
 3 |  * Licensed under the Apache License, Version 2.0 (the "License");
 4 |  * you may not use this file except in compliance with the License.
 5 |  * You may obtain a copy of the License at
 6 |  *
 7 |  *     http://www.apache.org/licenses/LICENSE-2.0
 8 |  *
 9 |  * Unless required by applicable law or agreed to in writing, software
10 |  * distributed under the License is distributed on an "AS IS" BASIS,
11 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |  * See the License for the specific language governing permissions and
13 |  * limitations under the License.
14 |  *
15 |  * Author: Patrick Wieschollek, <mail@patwie.com>, 2019
16 |  *
17 |  */
18 | 
19 | #include "include/test/multiply.h"
20 | 
21 | #include "gmock/gmock.h"
22 | #include "gtest/gtest.h"
23 | #include "include/cuda_utils.h"
24 | #include "include/multiply/multiply.h"
25 | 
26 | namespace {
27 | 
28 | TEST(MultiplyTest, ExtraGpuTest) { EXPECT_TRUE(true); }
29 | 
30 | TEST(MultiplyTest, GpuMatchCpu) {
31 |   constexpr int M = 50;
32 |   float *A = new float[M * M];
33 |   float *B = new float[M * M];
34 |   float *expected = new float[M * M];
35 |   float *actual = new float[M * M];
36 | 
37 |   for (int i = 0; i < 2 * 2; ++i) {
38 |     A[i] = i;
39 |     B[i] = i - 5;
40 |     expected[i] = 0;
41 |     expected[i] = 0;
42 |   }
43 | 
44 |   Multiply<float, CpuDevice>::Apply(A, B, M, M, expected);
45 |   Multiply<float, GpuDevice>::Apply(A, B, M, M, actual);
46 | 
47 |   for (int i = 0; i < M * M; ++i) {
48 |     EXPECT_NEAR(expected[i], actual[i], 1e-8);
49 |   }
50 | }
51 | 
52 | using Devices = ::testing::Types<GpuDevice>;
53 | INSTANTIATE_TYPED_TEST_SUITE_P(Example, MultiplyTest, Devices);
54 | 
55 | }  // namespace
56 | 
57 | int main(int argc, char **argv) {
58 |   ::testing::InitGoogleMock(&argc, argv);
59 |   return RUN_ALL_TESTS();
60 | }
61 | 


--------------------------------------------------------------------------------