├── .gitignore
├── LICENSE
├── README.md
├── cudnn_samples_v6
    ├── RNN
    │   ├── Makefile
    │   ├── RNN_example.cu
    │   ├── compare.py
    │   ├── golden_1.txt
    │   ├── golden_2.txt
    │   ├── golden_3.txt
    │   ├── golden_4.txt
    │   └── result.txt
    └── mnistCUDNN
    │   ├── FreeImage
    │       ├── freeimage-license.txt
    │       └── include
    │       │   └── FreeImage.h
    │   ├── Makefile
    │   ├── data
    │       ├── conv1.bias.bin
    │       ├── conv1.bin
    │       ├── conv2.bias.bin
    │       ├── conv2.bin
    │       ├── five_28x28.pgm
    │       ├── ip1.bias.bin
    │       ├── ip1.bin
    │       ├── ip2.bias.bin
    │       ├── ip2.bin
    │       ├── one_28x28.pgm
    │       └── three_28x28.pgm
    │   ├── error_util.h
    │   ├── fp16_dev.cu
    │   ├── fp16_dev.h
    │   ├── fp16_emu.cpp
    │   ├── fp16_emu.h
    │   ├── gemv.h
    │   ├── mnistCUDNN.cpp
    │   └── readme.txt
├── cudnn_samples_v7
    ├── RNN
    │   ├── Makefile
    │   ├── RNN_example.cu
    │   ├── compare.py
    │   ├── golden_1.txt
    │   ├── golden_2.txt
    │   ├── golden_3.txt
    │   └── golden_4.txt
    ├── conv_sample
    │   ├── Makefile
    │   ├── config_fermi_islip.icnt
    │   ├── conv_sample.cpp
    │   ├── error_util.h
    │   ├── fp16_dev.cu
    │   ├── fp16_dev.h
    │   ├── fp16_emu.cpp
    │   ├── fp16_emu.h
    │   └── gpgpusim.config
    └── mnistCUDNN
    │   ├── FreeImage
    │       ├── freeimage-license.txt
    │       └── include
    │       │   └── FreeImage.h
    │   ├── Makefile
    │   ├── data
    │       ├── conv1.bias.bin
    │       ├── conv1.bin
    │       ├── conv2.bias.bin
    │       ├── conv2.bin
    │       ├── five_28x28.pgm
    │       ├── ip1.bias.bin
    │       ├── ip1.bin
    │       ├── ip2.bias.bin
    │       ├── ip2.bin
    │       ├── one_28x28.pgm
    │       └── three_28x28.pgm
    │   ├── error_util.h
    │   ├── fp16_dev.cu
    │   ├── fp16_dev.h
    │   ├── fp16_emu.cpp
    │   ├── fp16_emu.h
    │   ├── gemv.h
    │   ├── mnistCUDNN
    │   ├── mnistCUDNN.cpp
    │   └── readme.txt
└── home-made
    ├── common.hpp
    ├── config_fermi_islip.icnt
    ├── gpgpusim.config
    └── helloworld.cpp


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Zheng Liang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # cuDNN-sample
 2 | 
 3 | cuDNN sample code provided by NVIDIA, together with some home-made examples.
 4 | 
 5 | ## cuDNN static linking
 6 | 
 7 | There is no official guide on how to link cuDNN statically. However, I found an official guide on how to [link cuBLAS statically](https://docs.nvidia.com/cuda/cublas/index.html). The key point is that NVIDIA ships the static version as a separate library with a different name, and the same applies to cuDNN.
 8 | 
 9 | So, to link cuDNN statically, use the following command:
10 | 
11 | ```bash
12 | nvcc <source> -lcudnn_static -o <target>
13 | ```


--------------------------------------------------------------------------------
/cudnn_samples_v6/RNN/Makefile:
--------------------------------------------------------------------------------
  1 | # Location of the CUDA Toolkit (?= lets the environment or command line override it)
  2 | CUDA_PATH ?= /usr/local/cuda
  3 | 
  4 | # Architecture: build for the host (uname -m) unless TARGET_ARCH is supplied
  5 | HOST_ARCH   := $(shell uname -m)
  6 | TARGET_ARCH ?= $(HOST_ARCH)
  7 | # Adjust this for ARMv7 with a 32-bit filesystem. grep prints the matching line
  8 | # (never "1"), so test for non-empty output; -L follows a symlinked /sbin/init.
  9 | ifeq ($(TARGET_ARCH), aarch64)
 10 |     ifneq (,$(shell file -L /sbin/init | grep 32-bit))
 11 |         TARGET_ARCH=armv7l
 12 |     endif
 13 | endif
 14 | # Validate TARGET_ARCH and derive TARGET_SIZE (used below as nvcc's -m<size> flag)
 15 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
 16 |     ifneq ($(TARGET_ARCH),$(HOST_ARCH))  # cross build: size follows from the target name
 17 |         ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
 18 |             TARGET_SIZE := 64
 19 |         else ifneq (,$(filter $(TARGET_ARCH),armv7l))
 20 |             TARGET_SIZE := 32
 21 |         endif
 22 |     else  # native build: ask the host for its long size
 23 |         TARGET_SIZE := $(shell getconf LONG_BIT)
 24 |     endif
 25 | else
 26 |     $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
 27 | endif
 28 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))  # only the host-target pairs listed below may be cross-compiled
 29 |     ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
 30 |         $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
 31 |     endif
 32 | endif
 33 | 
 34 | # Operating system: HOST_OS is the lower-cased output of `uname -s`; TARGET_OS defaults to it
 35 | HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
 36 | TARGET_OS ?= $(HOST_OS)
 37 | 
 38 | ifeq ($(TARGET_OS),QNX)  # accept the upper-case spelling and normalise it to "qnx"
 39 | TARGET_OS := qnx
 40 | endif
 41 | 
 42 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx QNX android))  # abort on any other value
 43 |     $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
 44 | endif
 45 | 
 46 | # Host compiler: pick a default HOST_COMPILER (every assignment is ?=, so a user-supplied value wins); clang++ on darwin when Xcode's major version is >= 5, a cross toolchain for cross builds, g++ otherwise
 47 | ifeq ($(TARGET_OS),darwin)
 48 |     ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
 49 |         HOST_COMPILER ?= clang++
 50 |     endif
 51 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH))  # cross build: choose by host-target pair and TARGET_OS
 52 |     ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
 53 |         ifeq ($(TARGET_OS),linux)
 54 |             HOST_COMPILER ?= arm-linux-gnueabihf-g++
 55 |         else ifeq ($(TARGET_OS),qnx)  # QNX needs both QNX_HOST and QNX_TARGET; they are exported for the toolchain
 56 |             ifeq ($(QNX_HOST),)
 57 |                 $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
 58 |             endif
 59 |             ifeq ($(QNX_TARGET),)
 60 |                 $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
 61 |             endif
 62 |             export QNX_HOST
 63 |             export QNX_TARGET
 64 |             HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
 65 |         else ifeq ($(TARGET_OS),android)
 66 |             HOST_COMPILER ?= arm-linux-androideabi-g++
 67 |         endif
 68 |     else ifeq ($(TARGET_ARCH),aarch64)
 69 |         ifeq ($(TARGET_OS), linux)
 70 |             HOST_COMPILER ?= aarch64-linux-gnu-g++
 71 |         else ifeq ($(TARGET_OS), android)
 72 |             HOST_COMPILER ?= aarch64-linux-android-g++
 73 |         endif
 74 |     else ifeq ($(TARGET_ARCH),ppc64le)
 75 |         HOST_COMPILER ?= powerpc64le-linux-gnu-g++
 76 |     endif
 77 | endif
 78 | HOST_COMPILER ?= g++
 79 | NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
 80 | 
 81 | # Internal flags: NVCCFLAGS go to nvcc as-is; CCFLAGS / LDFLAGS start empty and are later forwarded with -Xcompiler / -Xlinker
 82 | NVCCFLAGS   := -m${TARGET_SIZE}
 83 | CCFLAGS     :=
 84 | LDFLAGS     :=
 85 | 
 86 | # Build flags that depend on the target OS or on the x86_64 -> armv7l Linux cross build
 87 | ifeq ($(TARGET_OS),darwin)
 88 |     LDFLAGS += -rpath $(CUDA_PATH)/lib
 89 |     CCFLAGS += -arch $(HOST_ARCH)
 90 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
 91 |     LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
 92 |     CCFLAGS += -mfloat-abi=hard
 93 | else ifeq ($(TARGET_OS),android)
 94 |     LDFLAGS += -pie
 95 |     CCFLAGS += -fpie -fpic -fexceptions
 96 | endif
 97 | # Cross build for armv7l Linux: when TARGET_FS is given, use it as sysroot and as the linker's rpath-link search root
 98 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 99 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
100 |         ifneq ($(TARGET_FS),)  # version test is numeric on major/minor; expr compared "10" and "4.6" as strings
101 |             GCCVERSIONLTEQ46 := $(shell $(HOST_COMPILER) -dumpversion | awk -F. '{ print (($$1 < 4 || ($$1 == 4 && $$2 <= 6)) ? 1 : 0) }')
102 |             ifeq ($(GCCVERSIONLTEQ46),1)  # only GCC <= 4.6 also gets --sysroot on the compile line
103 |                 CCFLAGS += --sysroot=$(TARGET_FS)
104 |             endif
105 |             LDFLAGS += --sysroot=$(TARGET_FS)
106 |             LDFLAGS += -rpath-link=$(TARGET_FS)/lib
107 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
108 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
109 |         endif
110 |     endif
111 | endif
112 | 
113 | # Debug build flags: `make dbg=1` appends -g -G to NVCCFLAGS (BUILD_TYPE is set here but not referenced elsewhere in this Makefile)
114 | ifeq ($(dbg),1)
115 |       NVCCFLAGS += -g -G
116 |       BUILD_TYPE := debug
117 | else
118 |       BUILD_TYPE := release
119 | endif
120 | # Compile flags: nvcc flags first, then host-compiler flags, each prefixed with -Xcompiler
121 | ALL_CCFLAGS :=
122 | ALL_CCFLAGS += $(NVCCFLAGS)
123 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
124 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
125 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
126 | # Set to 0 further down when no SM architectures are specified
127 | SAMPLE_ENABLED := 1
128 | # Link flags: all compile flags plus linker options, each prefixed with -Xlinker
129 | ALL_LDFLAGS :=
130 | ALL_LDFLAGS += $(ALL_CCFLAGS)
131 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
132 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
133 | 
134 | # Common includes and paths for CUDA (ppc64le uses the targets/ppc64le-linux include directory)
135 | ifneq ($(TARGET_ARCH), ppc64le)
136 | INCLUDES := -I$(CUDA_PATH)/include
137 | else
138 | INCLUDES := -I$(CUDA_PATH)/targets/ppc64le-linux/include
139 | endif
140 | LIBRARIES :=
141 | 
142 | ################################################################################
143 | 
144 | # Gencode arguments: SM architectures to build for (override with SMS="..."; a pre-set GENCODE_FLAGS skips the generation below)
145 | SMS ?= 30 35 50 53
146 | 
147 | ifeq ($(SMS),)  # empty SMS: warn and mark the sample as waived
148 | $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
149 | SAMPLE_ENABLED := 0
150 | endif
151 | 
152 | ifeq ($(GENCODE_FLAGS),)
153 | # Generate SASS code for each SM architecture listed in $(SMS)
154 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
155 | 
156 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility (NOTE(review): sort is lexical, so a three-digit SM would not sort last - confirm if SMS ever grows)
157 | HIGHEST_SM := $(lastword $(sort $(SMS)))
158 | ifneq ($(HIGHEST_SM),)
159 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
160 | endif
161 | endif
162 | # Sample-specific include path and link libraries
163 | INCLUDES += -I.
164 | LIBRARIES += -L. -lcublas -lcudnn -lcudart -lstdc++ -lm
165 | 
166 | ifeq ($(SAMPLE_ENABLED),0)  # waived sample: EXEC turns each recipe command into an echo
167 | EXEC ?= @echo "[@]"
168 | endif
169 | 
170 | ################################################################################
171 | 
172 | # Target rules (all, build, check.deps, run, clean and clobber are commands, not files, hence .PHONY)
173 | all: build
174 | .PHONY: all build check.deps run clean clobber
175 | build: RNN
176 | 
177 | check.deps:
178 | ifeq ($(SAMPLE_ENABLED),0)
179 | 	@echo "Sample will be waived due to the above missing dependencies"
180 | else
181 | 	@echo "Sample is ready - all dependencies have been met"
182 | endif
183 | 
184 | OBJ = RNN_example.o
185 | # Link the RNN executable from the object files
186 | RNN: $(OBJ)
187 | 	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
188 | # Compile each .cu source into an object file
189 | %.o: %.cu
190 | 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
191 | # Arguments are <seqLength> <numLayers> <hiddenSize> <miniBatch> <mode>; mode 2 selects LSTM
192 | run: build
193 | 	$(EXEC) ./RNN 100 4 512 64 2
194 | # Remove build products only: "*.o" (not "*o", which matches every name ending in "o") and the RNN binary
195 | clean:
196 | 	rm -f *.o
197 | 	rm -f RNN
198 | 
199 | clobber: clean
200 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/RNN/RNN_example.cu:
--------------------------------------------------------------------------------
  1 | /**
  2 | * Copyright 2016 NVIDIA Corporation.  All rights reserved.
  3 | *
  4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 | * with this source code for terms and conditions that govern your use of
  6 | * this software. Any use, reproduction, disclosure, or distribution of
  7 | * this software and related documentation outside the terms of the EULA
  8 | * is strictly prohibited.
  9 | *
 10 | */
 11 | 
 12 | #include <cudnn.h>
 13 | #include <cuda.h>
 14 | #include <stdio.h>
 15 | 
 16 | 
 17 | // Reference outputs (calculated on an M40 GPU)
 18 | // > ./RNN 20 2 512 64 0
 19 | // Forward: 1299 GFLOPs
 20 | // Backward: 2171 GFLOPs, (1564 GFLOPs), (3549 GFLOPs)
 21 | // i checksum 1.315793E+06     h checksum 1.315212E+05
 22 | // di checksum 6.676003E+01    dh checksum 6.425067E+01
 23 | // dw checksum 1.453750E+09
 24 | //
 25 | // > ./RNN 20 2 512 64 1
 26 | // Forward: 1296 GFLOPs
 27 | // Backward: 2235 GFLOPs, (1567 GFLOPs), (3896 GFLOPs)
 28 | // i checksum 6.319591E+05     h checksum 6.319605E+04
 29 | // di checksum 4.501830E+00    dh checksum 4.489546E+00
 30 | // dw checksum 5.012598E+07
 31 | //
 32 | // > ./RNN 20 2 512 64 2
 33 | // Forward: 2635 GFLOPs
 34 | // Backward: 2757 GFLOPs, (2001 GFLOPs), (4433 GFLOPs)
 35 | // i checksum 5.749536E+05     c checksum 4.365091E+05     h checksum 5.774818E+04
 36 | // di checksum 3.842206E+02    dc checksum 9.323785E+03    dh checksum 1.182566E+01
 37 | // dw checksum 4.313461E+08
 38 | //
 39 | // > ./RNN 20 2 512 64 3
 40 | // Forward: 2428 GFLOPs
 41 | // Backward: 2645 GFLOPs, (1915 GFLOPs), (4270 GFLOPs)
 42 | // i checksum 6.358978E+05     h checksum 6.281680E+04
 43 | // di checksum 6.296622E+00    dh checksum 2.289960E+05
 44 | // dw checksum 5.397419E+07
 45 | 
 46 | 
 47 | 
 48 | // Error-checking helper for CUDA runtime calls: cudaErrCheck(call) reports a failure together with the call site.
 49 | #define cudaErrCheck(stat) { cudaErrCheck_((stat), __FILE__, __LINE__); }
 50 | void cudaErrCheck_(cudaError_t stat, const char *file, int line) {
 51 |    if (stat == cudaSuccess) return;  // nothing to report
 52 |    // Report only: the message goes to stderr and execution continues in the caller.
 53 |    fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(stat), file, line);
 54 | }
 55 | // Error-checking helper for cuDNN calls: cudnnErrCheck(call) prints the cuDNN error string with file and line.
 56 | #define cudnnErrCheck(stat) { cudnnErrCheck_((stat), __FILE__, __LINE__); }
 57 | void cudnnErrCheck_(cudnnStatus_t stat, const char *file, int line) {
 58 |    if (stat == CUDNN_STATUS_SUCCESS) return;  // success: stay silent
 59 |    // Failure path: log to stderr, then fall through to the caller (no abort).
 60 |    fprintf(stderr, "cuDNN Error: %s %s %d\n", cudnnGetErrorString(stat), file, line);
 61 | }
 62 | // Kernel: the thread with global index tid writes `value` to data[tid]; threads with tid >= numElements do nothing.
 63 | __global__ void initGPUData_ker(float *data, int numElements, float value) {
 64 |    const int tid = blockIdx.x * blockDim.x + threadIdx.x;  // global 1-D thread index
 65 |    // Bounds guard: threads whose index falls past the end of the array exit early.
 66 |    if (tid >= numElements) return;
 67 |    data[tid] = value;
 68 | }
 69 | // Host helper: launches initGPUData_ker over data[0..numElements) with 1024 threads per block; does not synchronize.
 70 | void initGPUData(float *data, int numElements, float value) {
 71 |    if (numElements <= 0) return;  // nothing to fill; avoids launching a grid with zero blocks
 72 |    dim3 blockDim(1024);
 73 |    dim3 gridDim((numElements + blockDim.x - 1) / blockDim.x);  // blocks needed, rounded up
 74 |    
 75 |    initGPUData_ker <<< gridDim, blockDim >>> (data, numElements, value);
 76 |    // A kernel launch returns no status itself, so query it and report like every other CUDA call in this file.
 77 |    cudaErrCheck(cudaGetLastError());
 78 | }
 79 | 
 80 |   
 81 | int main(int argc, char* argv[]) {
 82 | 
 83 |    int seqLength;
 84 |    int numLayers;
 85 |    int hiddenSize;
 86 |    int inputSize;
 87 |    int miniBatch;
 88 |    float dropout;
 89 |    bool bidirectional;
 90 |    int mode;
 91 | 
 92 |    FILE *fp;
 93 |    fp=fopen("result.txt","w");
 94 | 
 95 |    if (argc == 6) {
 96 |       seqLength = atoi(argv[1]);
 97 |       numLayers = atoi(argv[2]);
 98 |       hiddenSize = atoi(argv[3]);
 99 |       inputSize = hiddenSize;
100 |       miniBatch = atoi(argv[4]);
101 |       dropout = 0;
102 |       bidirectional = 0;
103 |       mode = atoi(argv[5]);
104 |    }
105 |    else {
106 |       printf("Usage:\n");
107 |       printf("./RNN <seqLength> <numLayers> <hiddenSize> <miniBatch> <mode>\n");
108 |       printf("Modes: 0 = RNN_RELU, 1 = RNN_TANH, 2 = LSTM, 3 = GRU\n");
109 |       return 1;
110 |    }
111 | 
112 |    // -------------------------   
113 |    // Create cudnn context
114 |    // -------------------------  
115 |    cudnnHandle_t cudnnHandle;   
116 |    cudnnErrCheck(cudnnCreate(&cudnnHandle));
117 | 
118 |    
119 |    // -------------------------   
120 |    // Set up inputs and outputs
121 |    // -------------------------
122 |    void *x;
123 |    void *hx = NULL;
124 |    void *cx = NULL;
125 |    
126 |    void *dx;
127 |    void *dhx = NULL;
128 |    void *dcx = NULL;
129 |   
130 |    void *y;
131 |    void *hy = NULL;
132 |    void *cy = NULL;
133 |    
134 |    void *dy;
135 |    void *dhy = NULL;
136 |    void *dcy = NULL;
137 |    
138 |    // Memory allocation. hx, cx, dhx, dcx, hy, cy, dhy and dcy can be NULL.
139 |    cudaErrCheck(cudaMalloc((void**)&x, seqLength * inputSize * miniBatch * sizeof(float)));
140 |    cudaErrCheck(cudaMalloc((void**)&hx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float)));
141 |    cudaErrCheck(cudaMalloc((void**)&cx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float)));
142 |    
143 |    cudaErrCheck(cudaMalloc((void**)&dx, seqLength * inputSize * miniBatch * sizeof(float)));
144 |    cudaErrCheck(cudaMalloc((void**)&dhx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float)));
145 |    cudaErrCheck(cudaMalloc((void**)&dcx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float)));
146 |    
147 |    cudaErrCheck(cudaMalloc((void**)&y, seqLength * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float)));
148 |    cudaErrCheck(cudaMalloc((void**)&hy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float)));
149 |    cudaErrCheck(cudaMalloc((void**)&cy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float)));
150 |    
151 |    cudaErrCheck(cudaMalloc((void**)&dy, seqLength * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float)));
152 |    cudaErrCheck(cudaMalloc((void**)&dhy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float)));
153 |    cudaErrCheck(cudaMalloc((void**)&dcy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float)));
154 |       
155 |    // Set up tensor descriptors. x/y/dx/dy are arrays, one per time step.
156 |    cudnnTensorDescriptor_t *xDesc, *yDesc, *dxDesc, *dyDesc;
157 |    cudnnTensorDescriptor_t hxDesc, cxDesc;
158 |    cudnnTensorDescriptor_t hyDesc, cyDesc;
159 |    cudnnTensorDescriptor_t dhxDesc, dcxDesc;
160 |    cudnnTensorDescriptor_t dhyDesc, dcyDesc;
161 |    
162 |    xDesc = (cudnnTensorDescriptor_t*)malloc(seqLength * sizeof(cudnnTensorDescriptor_t));
163 |    yDesc = (cudnnTensorDescriptor_t*)malloc(seqLength * sizeof(cudnnTensorDescriptor_t));
164 |    dxDesc = (cudnnTensorDescriptor_t*)malloc(seqLength * sizeof(cudnnTensorDescriptor_t));
165 |    dyDesc = (cudnnTensorDescriptor_t*)malloc(seqLength * sizeof(cudnnTensorDescriptor_t));
166 |    
167 |    int dimA[3];
168 |    int strideA[3];
169 | 
170 |    // In this example dimA[1] is constant across the whole sequence
171 |    // This isn't required, all that is required is that it does not increase.
172 |    for (int i = 0; i < seqLength; i++) {
173 |       cudnnErrCheck(cudnnCreateTensorDescriptor(&xDesc[i]));
174 |       cudnnErrCheck(cudnnCreateTensorDescriptor(&yDesc[i]));
175 |       cudnnErrCheck(cudnnCreateTensorDescriptor(&dxDesc[i]));
176 |       cudnnErrCheck(cudnnCreateTensorDescriptor(&dyDesc[i]));
177 |    
178 |       dimA[0] = miniBatch;
179 |       dimA[1] = inputSize;
180 |       dimA[2] = 1;
181 |      
182 |       strideA[0] = dimA[2] * dimA[1];
183 |       strideA[1] = dimA[2];
184 |       strideA[2] = 1;
185 | 
186 |       cudnnErrCheck(cudnnSetTensorNdDescriptor(xDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA));
187 |       cudnnErrCheck(cudnnSetTensorNdDescriptor(dxDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA));
188 |       
189 |       dimA[0] = miniBatch;
190 |       dimA[1] = bidirectional ? hiddenSize * 2 : hiddenSize;
191 |       dimA[2] = 1;
192 | 
193 |       strideA[0] = dimA[2] * dimA[1];
194 |       strideA[1] = dimA[2];
195 |       strideA[2] = 1;
196 |       
197 |       cudnnErrCheck(cudnnSetTensorNdDescriptor(yDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA));
198 |       cudnnErrCheck(cudnnSetTensorNdDescriptor(dyDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA));
199 |    }
200 |    
201 |    
202 |    dimA[0] = numLayers * (bidirectional ? 2 : 1);
203 |    dimA[1] = miniBatch;
204 |    dimA[2] = hiddenSize;
205 |    
206 |    strideA[0] = dimA[2] * dimA[1];
207 |    strideA[1] = dimA[2];
208 |    strideA[2] = 1;
209 |    
210 |    cudnnErrCheck(cudnnCreateTensorDescriptor(&hxDesc));
211 |    cudnnErrCheck(cudnnCreateTensorDescriptor(&cxDesc));
212 |    cudnnErrCheck(cudnnCreateTensorDescriptor(&hyDesc));
213 |    cudnnErrCheck(cudnnCreateTensorDescriptor(&cyDesc));
214 |    cudnnErrCheck(cudnnCreateTensorDescriptor(&dhxDesc));
215 |    cudnnErrCheck(cudnnCreateTensorDescriptor(&dcxDesc));
216 |    cudnnErrCheck(cudnnCreateTensorDescriptor(&dhyDesc));
217 |    cudnnErrCheck(cudnnCreateTensorDescriptor(&dcyDesc));
218 |    
219 |    cudnnErrCheck(cudnnSetTensorNdDescriptor(hxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
220 |    cudnnErrCheck(cudnnSetTensorNdDescriptor(cxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
221 |    cudnnErrCheck(cudnnSetTensorNdDescriptor(hyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
222 |    cudnnErrCheck(cudnnSetTensorNdDescriptor(cyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
223 |    cudnnErrCheck(cudnnSetTensorNdDescriptor(dhxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
224 |    cudnnErrCheck(cudnnSetTensorNdDescriptor(dcxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
225 |    cudnnErrCheck(cudnnSetTensorNdDescriptor(dhyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
226 |    cudnnErrCheck(cudnnSetTensorNdDescriptor(dcyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA));
227 |   
228 |   
229 |    // -------------------------
230 |    // Set up the dropout descriptor (needed for the RNN descriptor)
231 |    // -------------------------
232 |    unsigned long long seed = 1337ull; // Pick a seed.
233 |    
234 |    cudnnDropoutDescriptor_t dropoutDesc;
235 |    cudnnErrCheck(cudnnCreateDropoutDescriptor(&dropoutDesc));
236 |    
237 |    // How much memory does dropout need for states?
238 |    // These states are used to generate random numbers internally
239 |    // and should not be freed until the RNN descriptor is no longer used
240 |    size_t stateSize;
241 |    void *states;
242 |    cudnnErrCheck(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize));
243 |    
244 |    cudaErrCheck(cudaMalloc(&states, stateSize));
245 |    
246 |    cudnnErrCheck(cudnnSetDropoutDescriptor(dropoutDesc, 
247 |                              cudnnHandle,
248 |                              dropout, 
249 |                              states, 
250 |                              stateSize, 
251 |                              seed));
252 |                              
253 |    // -------------------------   
254 |    // Set up the RNN descriptor
255 |    // -------------------------
256 |    cudnnRNNDescriptor_t rnnDesc;
257 |    cudnnRNNMode_t RNNMode;
258 |    
259 |    cudnnErrCheck(cudnnCreateRNNDescriptor(&rnnDesc));
260 |    
261 |    if      (mode == 0) RNNMode = CUDNN_RNN_RELU;
262 |    else if (mode == 1) RNNMode = CUDNN_RNN_TANH;
263 |    else if (mode == 2) RNNMode = CUDNN_LSTM;
264 |    else if (mode == 3) RNNMode = CUDNN_GRU;
265 |       
266 |    cudnnErrCheck(cudnnSetRNNDescriptor_v6(cudnnHandle,
267 |                                        rnnDesc,
268 |                                        hiddenSize, 
269 |                                        numLayers, 
270 |                                        dropoutDesc,
271 |                                        CUDNN_LINEAR_INPUT, // We can also skip the input matrix transformation
272 |                                        bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, 
273 |                                        RNNMode, 
274 |                                        CUDNN_RNN_ALGO_STANDARD, // Can be changed to use persistent RNNs on Pascal+ GPUs.
275 |                                        CUDNN_DATA_FLOAT));
276 |    
277 |    
278 |    // -------------------------
279 |    // Set up parameters
280 |    // -------------------------
281 |    // This needs to be done after the rnn descriptor is set as otherwise
282 |    // we don't know how many parameters we have to allocate
283 |    void *w;   
284 |    void *dw;   
285 | 
286 |    cudnnFilterDescriptor_t wDesc, dwDesc;
287 |    
288 |    cudnnErrCheck(cudnnCreateFilterDescriptor(&wDesc));
289 |    cudnnErrCheck(cudnnCreateFilterDescriptor(&dwDesc));
290 |    
291 |    size_t weightsSize;
292 |    cudnnErrCheck(cudnnGetRNNParamsSize(cudnnHandle, rnnDesc, xDesc[0], &weightsSize, CUDNN_DATA_FLOAT));
293 |    
294 |    int dimW[3];   
295 |    dimW[0] =  weightsSize / sizeof(float);
296 |    dimW[1] = 1;
297 |    dimW[2] = 1;
298 |       
299 |    cudnnErrCheck(cudnnSetFilterNdDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dimW));   
300 |    cudnnErrCheck(cudnnSetFilterNdDescriptor(dwDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dimW));   
301 |    
302 |    cudaErrCheck(cudaMalloc((void**)&w,  weightsSize));
303 |    cudaErrCheck(cudaMalloc((void**)&dw, weightsSize));
304 |    
305 |    
306 |    // -------------------------
307 |    // Set up work space and reserved memory
308 |    // -------------------------   
309 |    void *workspace;
310 |    void *reserveSpace;   
311 |    
312 |    size_t workSize;
313 |    size_t reserveSize;
314 | 
315 |    // Need for every pass
316 |    cudnnErrCheck(cudnnGetRNNWorkspaceSize(cudnnHandle, rnnDesc, seqLength, xDesc, &workSize));
317 |    // Only needed in training, shouldn't be touched between passes.
318 |    cudnnErrCheck(cudnnGetRNNTrainingReserveSize(cudnnHandle, rnnDesc, seqLength, xDesc, &reserveSize));
319 |     
320 |    cudaErrCheck(cudaMalloc((void**)&workspace, workSize));
321 |    cudaErrCheck(cudaMalloc((void**)&reserveSpace, reserveSize));
322 |    
323 |    // *********************************************************************************************************
324 |    // Initialise weights and inputs
325 |    // *********************************************************************************************************
326 |    // We initialise to something simple.
327 |    // Matrices are initialised to 1 / matrixSize, biases to 1, data is 1.
328 |    initGPUData((float*)x, seqLength * inputSize * miniBatch, 1.f);
329 |    if (hx != NULL) initGPUData((float*)hx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1), 1.f);
330 |    if (cx != NULL) initGPUData((float*)cx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1), 1.f);
331 |    
332 |    initGPUData((float*)dy, seqLength * hiddenSize * miniBatch * (bidirectional ? 2 : 1), 1.f);
333 |    if (dhy != NULL) initGPUData((float*)dhy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1), 1.f);
334 |    if (dcy != NULL) initGPUData((float*)dcy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1), 1.f);
335 |       
336 |    
337 |    // Weights
338 |    int numLinearLayers = 0;
339 |    if (RNNMode == CUDNN_RNN_RELU || RNNMode == CUDNN_RNN_TANH) {
340 |       numLinearLayers = 2;
341 |    }
342 |    else if (RNNMode == CUDNN_LSTM) {
343 |       numLinearLayers = 8;
344 |    }
345 |    else if (RNNMode == CUDNN_GRU) {
346 |       numLinearLayers = 6;
347 |    }
348 |    
349 |    for (int layer = 0; layer < numLayers * (bidirectional ? 2 : 1); layer++) {
350 |       for (int linLayerID = 0; linLayerID < numLinearLayers; linLayerID++) {
351 |          cudnnFilterDescriptor_t linLayerMatDesc;
352 |          cudnnErrCheck(cudnnCreateFilterDescriptor(&linLayerMatDesc));
353 |          float *linLayerMat;
354 |          
355 |          cudnnErrCheck(cudnnGetRNNLinLayerMatrixParams( cudnnHandle,
356 |                                                         rnnDesc,  
357 |                                                         layer,
358 |                                                         xDesc[0], 
359 |                                                         wDesc, 
360 |                                                         w,
361 |                                                         linLayerID,  
362 |                                                         linLayerMatDesc, 
363 |                                                         (void**)&linLayerMat));
364 |          
365 |          cudnnDataType_t dataType;
366 |          cudnnTensorFormat_t format;
367 |          int nbDims;
368 |          int filterDimA[3];
369 |          cudnnErrCheck(cudnnGetFilterNdDescriptor(linLayerMatDesc,
370 |                                                   3,
371 |                                                   &dataType,
372 |                                                   &format,
373 |                                                   &nbDims,
374 |                                                   filterDimA));
375 |                                                   
376 |          initGPUData(linLayerMat, filterDimA[0] * filterDimA[1] * filterDimA[2], 1.f / (float)(filterDimA[0] * filterDimA[1] * filterDimA[2]));                                                 
377 | 
378 |          cudnnErrCheck(cudnnDestroyFilterDescriptor(linLayerMatDesc));         
379 |          
380 |          cudnnFilterDescriptor_t linLayerBiasDesc;
381 |          cudnnErrCheck(cudnnCreateFilterDescriptor(&linLayerBiasDesc));
382 |          float *linLayerBias;
383 |          
384 |          cudnnErrCheck(cudnnGetRNNLinLayerBiasParams( cudnnHandle,
385 |                                                         rnnDesc,  
386 |                                                         layer,
387 |                                                         xDesc[0], 
388 |                                                         wDesc, 
389 |                                                         w,
390 |                                                         linLayerID,  
391 |                                                         linLayerBiasDesc, 
392 |                                                         (void**)&linLayerBias));
393 |          
394 |          cudnnErrCheck(cudnnGetFilterNdDescriptor(linLayerBiasDesc,
395 |                                                   3,
396 |                                                   &dataType,
397 |                                                   &format,
398 |                                                   &nbDims,
399 |                                                   filterDimA));
400 |                                                   
401 |          initGPUData(linLayerBias, filterDimA[0] * filterDimA[1] * filterDimA[2], 1.f);
402 |                                                   
403 |          cudnnErrCheck(cudnnDestroyFilterDescriptor(linLayerBiasDesc));
404 |       }
405 |    }
406 |    
407 |    // *********************************************************************************************************
408 |    // At this point all of the setup is done. We now need to pass through the RNN.
409 |    // *********************************************************************************************************
410 |    
411 |   
412 |    
413 |    cudaErrCheck(cudaDeviceSynchronize());
414 |    
415 |    cudaEvent_t start, stop;
416 |    float timeForward, timeBackward1, timeBackward2;
417 |    cudaErrCheck(cudaEventCreate(&start));
418 |    cudaErrCheck(cudaEventCreate(&stop));
419 |    
420 |    cudaErrCheck(cudaEventRecord(start));   
421 | 
422 |    // If we're not training we use this instead
423 |    // cudnnErrCheck(cudnnRNNForwardInference(cudnnHandle, 
424 |                                          // rnnDesc, 
425 |                                          // xDesc, 
426 |                                          // x, 
427 |                                          // hxDesc,
428 |                                          // hx, 
429 |                                          // cxDesc, 
430 |                                          // cx, 
431 |                                          // wDesc, 
432 |                                          // w, 
433 |                                          // yDesc,  
434 |                                          // y, 
435 |                                          // hyDesc, 
436 |                                          // hy, 
437 |                                          // cyDesc, 
438 |                                          // cy, 
439 |                                          // workspace, 
440 |                                          // workSize));
441 | 
442 |    cudnnErrCheck(cudnnRNNForwardTraining(cudnnHandle, 
443 |                                          rnnDesc, 
444 |                                          seqLength,                                          
445 |                                          xDesc, 
446 |                                          x, 
447 |                                          hxDesc,
448 |                                          hx, 
449 |                                          cxDesc, 
450 |                                          cx, 
451 |                                          wDesc, 
452 |                                          w, 
453 |                                          yDesc,  
454 |                                          y, 
455 |                                          hyDesc, 
456 |                                          hy, 
457 |                                          cyDesc, 
458 |                                          cy, 
459 |                                          workspace, 
460 |                                          workSize,
461 |                                          reserveSpace, 
462 |                                          reserveSize));
463 |                 
464 |    cudaErrCheck(cudaEventRecord(stop));   
465 |    cudaErrCheck(cudaEventSynchronize(stop));
466 |    cudaErrCheck(cudaEventElapsedTime(&timeForward, start, stop));
467 |    
468 |    cudaErrCheck(cudaEventRecord(start));
469 |    
470 |    cudnnErrCheck(cudnnRNNBackwardData(cudnnHandle, 
471 |                                rnnDesc, 
472 |                                seqLength,                                
473 |                                yDesc, 
474 |                                y,
475 |                                dyDesc, 
476 |                                dy, 
477 |                                dhyDesc, 
478 |                                dhy, 
479 |                                dcyDesc, 
480 |                                dcy, 
481 |                                wDesc, 
482 |                                w, 
483 |                                hxDesc, 
484 |                                hx,
485 |                                cxDesc, 
486 |                                cx,
487 |                                dxDesc, 
488 |                                dx, 
489 |                                dhxDesc,
490 |                                dhx,
491 |                                dcxDesc,
492 |                                dcx,
493 |                                workspace,
494 |                                workSize,
495 |                                reserveSpace, 
496 |                                reserveSize ));
497 |    
498 |    cudaErrCheck(cudaEventRecord(stop));   
499 |    cudaErrCheck(cudaEventSynchronize(stop));
500 |    cudaErrCheck(cudaEventElapsedTime(&timeBackward1, start, stop));
501 |    
502 |    cudaErrCheck(cudaEventRecord(start));
503 |    
504 |    // cudnnRNNBackwardWeights adds to the data in dw.
505 |    cudaErrCheck(cudaMemset(dw, 0, weightsSize));
506 |    
507 |    cudnnErrCheck(cudnnRNNBackwardWeights( cudnnHandle, 
508 |                                     rnnDesc, 
509 |                                     seqLength, 
510 |                                     xDesc, 
511 |                                     x, 
512 |                                     hxDesc, 
513 |                                     hx,                                                   
514 |                                     yDesc, 
515 |                                     y,
516 |                                     workspace, 
517 |                                     workSize, 
518 |                                     dwDesc, 
519 |                                     dw,
520 |                                     reserveSpace, 
521 |                                     reserveSize ));
522 |                      
523 |                      
524 | 
525 |    cudaErrCheck(cudaEventRecord(stop));   
526 | 
527 |    cudaErrCheck(cudaEventSynchronize(stop));
528 |    cudaErrCheck(cudaEventElapsedTime(&timeBackward2, start, stop));
529 | 
530 |    
531 |    int numMats = 0;
532 |    
533 |    if (RNNMode == CUDNN_RNN_RELU || RNNMode == CUDNN_RNN_TANH) {
534 |       numMats = 2;
535 |    }
536 |    else if (RNNMode == CUDNN_LSTM) {
537 |       numMats = 8;
538 |    }
539 |    else if (RNNMode == CUDNN_GRU) {
540 |       numMats = 6;
541 |    }
542 |    
543 |    // Calculate FLOPS
544 |    printf("Forward: %3.0f GFLOPS\n", numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeForward));
545 |    printf("Backward: %3.0f GFLOPS, ", numMats * 4ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * (timeBackward1 + timeBackward2)));
546 |    printf("(%3.0f GFLOPS), ", numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeBackward1));
547 |    printf("(%3.0f GFLOPS)\n", numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeBackward2));
548 | 
549 |    // Calculate FLOPS
550 |    fprintf(fp,"Forward: %3.0f GFLOPS\n", numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeForward));
551 |    fprintf(fp,"Backward: %3.0f GFLOPS, ", numMats * 4ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * (timeBackward1 + timeBackward2)));
552 |    fprintf(fp,"(%3.0f GFLOPS), ", numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeBackward1));
553 |    fprintf(fp,"(%3.0f GFLOPS)\n", numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeBackward2));
554 | 
555 |    // Make double-sure everything is finished before we copy for result checking.
556 |    cudaDeviceSynchronize();
557 |    
558 |    // *********************************************************************************************************
559 |    // Print checksums.
560 |    // *********************************************************************************************************
561 |    if (true) {
562 |       float* testOutputi;
563 |       float* testOutputh;
564 |       float* testOutputc;
565 |       
566 |       int biDirScale = (bidirectional ? 2 : 1);
567 |       
568 |       testOutputi = (float*)malloc(hiddenSize * seqLength * miniBatch * biDirScale * sizeof(float));
569 |       testOutputh = (float*)malloc(hiddenSize * miniBatch * numLayers * biDirScale * sizeof(float));
570 |       testOutputc = (float*)malloc(hiddenSize * miniBatch * numLayers * biDirScale * sizeof(float));
571 |  
572 |       cudaErrCheck(cudaMemcpy(testOutputi, y, hiddenSize * seqLength * miniBatch * biDirScale * sizeof(float), cudaMemcpyDeviceToHost));
573 |       if (hy != NULL) cudaErrCheck(cudaMemcpy(testOutputh, hy, numLayers * hiddenSize * miniBatch * biDirScale * sizeof(float), cudaMemcpyDeviceToHost));
574 |       if (cy != NULL && RNNMode == CUDNN_LSTM) cudaErrCheck(cudaMemcpy(testOutputc, cy, numLayers * hiddenSize * miniBatch * biDirScale * sizeof(float), cudaMemcpyDeviceToHost));
575 |       
576 |       double checksumi = 0.f;
577 |       double checksumh = 0.f;
578 |       double checksumc = 0.f;
579 |       
580 |       for (int m = 0; m < miniBatch; m++) {
581 |          double localSumi = 0;
582 |          double localSumh = 0;
583 |          double localSumc = 0;
584 |          
585 |          for (int j = 0; j < seqLength; j++) {
586 |             for (int i = 0; i < hiddenSize * biDirScale; i++) {   
587 |                localSumi += testOutputi[j * miniBatch * hiddenSize * biDirScale + m * hiddenSize * biDirScale + i];
588 |             }
589 |          }
590 |          for (int j = 0; j < numLayers * biDirScale; j++) {
591 |             for (int i = 0; i < hiddenSize; i++) {         
592 |                if (hy != NULL) localSumh += testOutputh[j * hiddenSize * miniBatch + m * hiddenSize + i];
593 |                if (cy != NULL) if (RNNMode == CUDNN_LSTM) localSumc += testOutputc[j * hiddenSize * miniBatch + m * hiddenSize + i];
594 |             }
595 |          }
596 |                   
597 |          checksumi += localSumi;
598 |          checksumh += localSumh;
599 |          checksumc += localSumc;
600 |       }
601 |       
602 |       printf("i checksum %E     ", checksumi);
603 |       fprintf(fp,"i checksum %E     ", checksumi);
604 |       if (RNNMode == CUDNN_LSTM) { printf("c checksum %E     ", checksumc); fprintf(fp,"c checksum %E     ", checksumc); }
605 |       printf("h checksum %E\n", checksumh);
606 |       fprintf(fp,"h checksum %E\n", checksumh);
607 |       
608 |       free(testOutputi);
609 |       free(testOutputc);
610 |       free(testOutputh);
611 |    }   
612 |    
613 |    if (true) {
614 |       float* testOutputdi;
615 |       float* testOutputdh;
616 |       float* testOutputdc;
617 | 
618 |       int biDirScale = (bidirectional ? 2 : 1);
619 |       
620 |       testOutputdi = (float*)malloc(inputSize * seqLength * miniBatch * sizeof(float));
621 |       testOutputdh = (float*)malloc(hiddenSize * miniBatch * numLayers * biDirScale * sizeof(float));
622 |       testOutputdc = (float*)malloc(hiddenSize * miniBatch * numLayers * biDirScale * sizeof(float));
623 |       cudaErrCheck(cudaMemcpy(testOutputdi, dx, seqLength * miniBatch * inputSize * sizeof(float), cudaMemcpyDeviceToHost));
624 |       if (dhx != NULL) cudaErrCheck(cudaMemcpy(testOutputdh, dhx, numLayers * hiddenSize * miniBatch * biDirScale * sizeof(float), cudaMemcpyDeviceToHost));
625 |       if (dcx != NULL) if (RNNMode == CUDNN_LSTM) cudaErrCheck(cudaMemcpy(testOutputdc, dcx, numLayers * hiddenSize * miniBatch * biDirScale * sizeof(float), cudaMemcpyDeviceToHost));
626 |       
627 |       float checksumdi = 0.f;
628 |       float checksumdh = 0.f;
629 |       float checksumdc = 0.f;
630 |       
631 |       for (int m = 0; m < miniBatch; m++) {
632 |          double localSumdi = 0;
633 |          double localSumdh = 0;
634 |          double localSumdc = 0;
635 | 
636 |          for (int j = 0; j < seqLength; j++) {
637 |             for (int i = 0; i < inputSize; i++) {
638 |                localSumdi += testOutputdi[j * miniBatch * inputSize + m * inputSize + i];
639 |             }
640 |          }
641 | 
642 |          for (int j = 0; j < numLayers * biDirScale; j++) {
643 |             for (int i = 0; i < hiddenSize; i++) {         
644 |                localSumdh += testOutputdh[j * hiddenSize * miniBatch + m * hiddenSize + i];
645 |                if (RNNMode == CUDNN_LSTM) localSumdc += testOutputdc[j * hiddenSize * miniBatch + m * hiddenSize + i];
646 |             }
647 |          }         
648 | 
649 |          checksumdi += localSumdi;
650 |          checksumdh += localSumdh;
651 |          checksumdc += localSumdc;
652 |          
653 |       }
654 |       
655 |       printf("di checksum %E    ", checksumdi);
656 |       fprintf(fp,"di checksum %E    ", checksumdi);
657 |       if (RNNMode == CUDNN_LSTM) { printf("dc checksum %E    ", checksumdc); fprintf(fp,"dc checksum %E    ", checksumdc); }
658 |       printf("dh checksum %E\n", checksumdh);
659 |       fprintf(fp,"dh checksum %E\n", checksumdh);
660 |       
661 |       free(testOutputdi);
662 |       free(testOutputdh);
663 |       free(testOutputdc);
664 |    }
665 | 
666 |    if (true) {
667 |       float* testOutputdw;
668 |       testOutputdw = (float*)malloc(weightsSize);
669 |  
670 |       cudaErrCheck(cudaMemcpy(testOutputdw, dw, weightsSize, cudaMemcpyDeviceToHost));
671 |       
672 |       double checksumdw = 0.;
673 |             
674 |       for (int i = 0; i < weightsSize / sizeof(float); i++) {
675 |          checksumdw += testOutputdw[i];
676 |       }
677 |       
678 |       printf("dw checksum %E\n", checksumdw);
679 |       fprintf(fp,"dw checksum %E\n", checksumdw);
680 |       
681 |       free(testOutputdw);
682 |    }   
683 |   
684 |    cudaFree(x);
685 |    cudaFree(hx);
686 |    cudaFree(cx);
687 |    cudaFree(y);
688 |    cudaFree(hy);
689 |    cudaFree(cy);
690 |    cudaFree(dx);
691 |    cudaFree(dhx);
692 |    cudaFree(dcx);
693 |    cudaFree(dy);
694 |    cudaFree(dhy);
695 |    cudaFree(dcy);
696 |    cudaFree(workspace);
697 |    cudaFree(reserveSpace);
698 |    cudaFree(w);
699 |    cudaFree(dw);
700 |    
701 |    cudnnDestroy(cudnnHandle);
702 |    fclose(fp);
703 |    return 0;
704 | }
705 | 
706 | 
707 | 
708 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/RNN/compare.py:
--------------------------------------------------------------------------------
  1 | # This script compares a result file with a golden file and reports the status: passed or failed.
  2 | # Usage: python compare.py result.txt golden_1.txt
  3 | import os, sys, re
  4 | 
  5 | patterns = ['{key1}\s+checksum\s+([.eE+0-9]+)\s+{key2}\s+checksum\s+([.eE+0-9]+)\s+{key3}\s+checksum\s+([.eE+0-9]+)', #3 similar keys as below each line
  6 |             '{key1}\s+checksum\s+([.eE+0-9]+)\s+{key2}\s+checksum\s+([.eE+0-9]+)', #2 similar keys as below each line
  7 |             '{key}\s+checksum\s+([.eE+0-9]+)',   #one key each line: di checksum 6.676003E+01
  8 |             '{key}[: ]+([0-9]+)\s+GFLOPS[, ]+\\(([0-9]+)\s+GFLOPS\\)[, ]+\\(([0-9]+)\s+GFLOPS\\)', #1 key each line with more returns
  9 |             '{key}[: ]+([0-9]+)\s+GFLOPS']       #one key each line: Forward: 673 GFLOPS
 10 | #keys = [('i', 'c', 'h'), ('di', 'dc', 'dh'), ('i', 'h'), ('di', 'dh'), 'dw', 'Backward', 'Forward']
 11 | keys = [('i', 'c', 'h'), ('di', 'dc', 'dh'), ('i', 'h'), ('di', 'dh'), 'dw'] # skip the last 2 targets
 12 | pats = [0,0,1,1,2,3,4]
 13 | datnum = [len(k) if isinstance(k, tuple) else (3 if k == 'Backward' else 1) for k in keys]
 14 | #tol = 1.0e-3
 15 | def compare_results(ftarget, fgolden):
 16 |     assert ftarget and fgolden, 'No enough input files given!'
 17 |     print ftarget, fgolden
 18 |     targ, _ = get_results_from_file(ftarget)
 19 |     golden, tol = get_results_from_file(fgolden, golden=True)
 20 | 
 21 |     ret = 0
 22 |     assert targ and golden, 'targets or golen results not generated!'
 23 |     for k, vals in golden.iteritems():
 24 |         if not isinstance(vals, list):
 25 |             vals = [vals]
 26 |             targ[k] = [targ[k]]
 27 |         for idx, v in enumerate(vals):
 28 |             tval = float(targ[k][idx])
 29 |             gval = float(v)
 30 |             err = None
 31 |             if tol[k]['type'] == 'rel':
 32 |                 err = abs((tval-gval)/max(gval,tval)) # clamp rel_err <= 1
 33 |             elif tol[k]['type'] == 'abs':
 34 |                 err = abs(tval-gval)
 35 |             assert err is not None, 'Error is Empty!'
 36 |             tol_i = tol[k]['val']
 37 |             #print 'k,t,g,err',k,tval, gval, err
 38 |             if err > tol_i:
 39 |                 print 'FAILED %s=%s Error: %.2e vs. golden (%s) with tol (%.2e)'%(k, targ[k][idx], err, v, tol_i)
 40 |                 ret = 1
 41 |             else:
 42 |                 print 'PASSED %s=%s Error: %.2e vs. golden (%s) with tol (%.2e)'%(k, targ[k][idx], err, v, tol_i)
 43 |     if ret == 0:
 44 |         print 'ALL PASSED'
 45 |     return ret
 46 | 
 47 | def _get_tolerance_line(line):
 48 |     """get a data item for a tolerance line with format (each line only one item):
 49 |     i: type=rel, 1e-3
 50 |     """
 51 |     assert line, 'Empty line!'
 52 |     line = line.strip().replace(' ','')
 53 |     stmp = line.split(':')
 54 |     key = stmp[0]
 55 |     _type, _val = stmp[1].split(',')
 56 |     _type = _type.split('=')[-1]
 57 |     tol={key:{'type':_type, 'val':float(_val)}}
 58 |     return tol
 59 | 
 60 | def get_results_from_file(fname, golden=False):
 61 |     assert fname, 'No file name given!'
 62 |     ret = {}
 63 |     tol = {}
 64 |     is_tolerance = False
 65 |     with open(fname, 'r') as fin:
 66 |         lines = fin.readlines()
 67 |     if len(lines) == 1:
 68 |         lines = lines[0].split('\r')
 69 |     for idx, line in enumerate(lines):
 70 |         line = line.strip()
 71 |         if not line:
 72 |             continue
 73 |         val = get_valpat_line(line)
 74 |         if val:
 75 |             ret = dict(ret, **val)
 76 |         if golden:
 77 |             if 'TOLERANCE' in line: # the next line is the tol value
 78 |                 is_tolerance = True
 79 |             elif is_tolerance:
 80 |                 _tol = _get_tolerance_line(line)
 81 |                 tol = dict(tol, **_tol)
 82 | 
 83 |     return ret, tol
 84 | 
 85 | def get_valpat_line(line):
 86 |     for idx, key in enumerate(keys):
 87 |         Ndat = datnum[idx]
 88 |         if isinstance(key, tuple):
 89 |             format_expr = {}
 90 |             for j in range(Ndat):
 91 |                 format_expr['key%d'%(j+1)] = keys[idx][j]
 92 |             ret = re.search(patterns[pats[idx]].format(**format_expr), line)
 93 |             if ret:
 94 |                 vals = {}
 95 |                 for j in range(Ndat):
 96 |                     vals[key[j]] = ret.group(j+1)
 97 |                 return vals
 98 |         else:
 99 |             ret = re.search(patterns[pats[idx]].format(key=key), line)
100 |             if ret:
101 |                 if Ndat >1:
102 |                     #print Ndat, key, datnum, idx
103 |                     return {key:[ret.group(j+1) for j in range(Ndat)]}
104 |                 else:
105 |                     return {key:ret.group(1)}
106 |     return None
107 | 
def str_test():
    """Manual smoke test: print what get_valpat_line() extracts from sample lines.

    Nothing is asserted; the parsed result of each sample is printed for
    inspection.
    """
    s='Forward: 673 GFLOPS'
    s1='Backward: 835 GFLOPS, (654 GFLOPS), (1155 GFLOPS)'
    s2='i checksum 1.315793E+06 h checksum 1.315212E+05'
    s3='di checksum 6.676003E+01 dh checksum 6.425050E+01'
    s4='dw checksum 1.453750E+09'
    # Single-argument print(...) prints the same on Python 2 and Python 3,
    # whereas the print statement is a SyntaxError on Python 3.
    for sample in (s1, s, s2, s3, s4):
        print(get_valpat_line(sample))
if __name__ == '__main__':
    #str_test()
    #print get_results_from_file('results.txt')
    #print get_results_from_file('golden.txt', golden=True)
    # Both file names are required; explain the usage instead of failing with
    # a bare IndexError when one is missing.
    if len(sys.argv) < 3:
        sys.stderr.write('Usage: python %s <result_file> <golden_file>\n' % sys.argv[0])
        sys.exit(2)
    # The exit status is 0 when every value passed and 1 otherwise.
    sys.exit(compare_results(sys.argv[1], sys.argv[2]))
124 | 
125 | 
126 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/RNN/golden_1.txt:
--------------------------------------------------------------------------------
 1 | ------------GOLDEN------------
 2 | Forward: 1250 GFLOPS
 3 | Backward: 1896 GFLOPS, (1299 GFLOPS), (3511 GFLOPS)
 4 | i checksum 1.315793E+06     h checksum 1.315212E+05
 5 | di checksum 6.676003E+01    dh checksum 6.425050E+01
 6 | dw checksum 1.453750E+09
 7 | -----------TOLERANCE-----------
 8 | Forward: type=rel, 1 
 9 | Backward: type=rel, 1
10 | i: type=rel, 1e-3
11 | h: type=rel, 1e-3
12 | di: type=rel, 1e-3
13 | dh: type=rel, 1e-3
14 | dw: type=rel, 1e-3
15 | 
16 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/RNN/golden_2.txt:
--------------------------------------------------------------------------------
 1 | ------------GOLDEN------------
 2 | Forward: 1225 GFLOPS
 3 | Backward: 1910 GFLOPS, (1299 GFLOPS), (3601 GFLOPS)
 4 | i checksum 6.319591E+05     h checksum 6.319605E+04
 5 | di checksum 4.501830E+00    dh checksum 4.489543E+00
 6 | dw checksum 5.012598E+07
 7 | -----------TOLERANCE-----------
 8 | Forward: type=rel, 1 
 9 | Backward: type=rel, 1
10 | i: type=rel, 1e-3
11 | h: type=rel, 1e-3
12 | di: type=rel, 1e-3
13 | dh: type=rel, 1e-3
14 | dw: type=rel, 1e-3
15 | 
16 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/RNN/golden_3.txt:
--------------------------------------------------------------------------------
 1 | ------------GOLDEN------------
 2 | Forward: 2569 GFLOPS
 3 | Backward: 2654 GFLOPS, (2071 GFLOPS), (3694 GFLOPS)
 4 | i checksum 5.749536E+05     c checksum 4.365091E+05     h checksum 5.774818E+04
 5 | di checksum 3.842206E+02    dc checksum 9.323785E+03    dh checksum 1.182562E+01
 6 | dw checksum 4.313461E+08
 7 | -----------TOLERANCE-----------
 8 | Forward: type=rel, 1 
 9 | Backward: type=rel, 1
10 | i: type=rel, 1e-3
11 | h: type=rel, 1e-3
12 | c: type=rel, 1e-3
13 | dc: type=rel, 1e-3
14 | di: type=rel, 1e-3
15 | dh: type=rel, 1e-3
16 | dw: type=rel, 1e-3
17 | 
18 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/RNN/golden_4.txt:
--------------------------------------------------------------------------------
 1 | ------------GOLDEN------------
 2 | Forward: 2310 GFLOPS
 3 | Backward: 2536 GFLOPS, (1955 GFLOPS), (3606 GFLOPS)
 4 | i checksum 6.358978E+05     h checksum 6.281680E+04
 5 | di checksum 6.296622E+00    dh checksum 2.289960E+05
 6 | dw checksum 5.397419E+07
 7 | -----------TOLERANCE-----------
 8 | Forward: type=rel, 1 
 9 | Backward: type=rel, 1
10 | i: type=rel, 1e-3
11 | h: type=rel, 1e-3
12 | di: type=rel, 1e-3
13 | dh: type=rel, 1e-3
14 | dw: type=rel, 1e-3
15 | 
16 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/RNN/result.txt:
--------------------------------------------------------------------------------
1 | Forward: 413 GFLOPS
2 | Backward: 666 GFLOPS, (410 GFLOPS), (1762 GFLOPS)
3 | i checksum 4.210712E+06     h checksum 6.576062E+04
4 | di checksum 4.015642E+01    dh checksum 3.212526E+01
5 | dw checksum 4.379117E+09
6 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/FreeImage/freeimage-license.txt:
--------------------------------------------------------------------------------
  1 | FreeImage Public License - Version 1.0
  2 | ---------------------------------------------
  3 | 
  4 | 1. Definitions.
  5 | 
  6 | 1.1. "Contributor" means each entity that creates or contributes to the creation of Modifications.
  7 | 
  8 | 1.2. "Contributor Version" means the combination of the Original Code, prior Modifications used by a Contributor, and the Modifications made by that particular Contributor.
  9 | 
 10 | 1.3. "Covered Code" means the Original Code or Modifications or the combination of the Original Code and Modifications, in each case including portions thereof.
 11 | 
 12 | 1.4. "Electronic Distribution Mechanism" means a mechanism generally accepted in the software development community for the electronic transfer of data.
 13 | 
 14 | 1.5. "Executable" means Covered Code in any form other than Source Code.
 15 | 
 16 | 1.6. "Initial Developer" means the individual or entity identified as the Initial Developer in the Source Code notice required by Exhibit A.
 17 | 
 18 | 1.7. "Larger Work" means a work which combines Covered Code or portions thereof with code not governed by the terms of this License.
 19 | 
 20 | 1.8. "License" means this document.
 21 | 
 22 | 1.9. "Modifications" means any addition to or deletion from the substance or structure of either the Original Code or any previous Modifications. When Covered Code is released as a series of files, a
 23 | Modification is:
 24 | 
 25 | A. Any addition to or deletion from the contents of a file containing Original Code or previous Modifications.
 26 | 
 27 | B. Any new file that contains any part of the Original Code or previous Modifications.
 28 | 
 29 | 1.10. "Original Code" means Source Code of computer software code which is described in the Source Code notice required by Exhibit A as Original Code, and which, at the time of its release under this License is not already Covered Code governed by this License.
 30 | 
 31 | 1.11. "Source Code" means the preferred form of the Covered Code for making modifications to it, including all modules it contains, plus any associated interface definition files, scripts used to control
 32 | compilation and installation of an Executable, or a list of source code differential comparisons against either the Original Code or another well known, available Covered Code of the Contributor's choice. The Source Code can be in a compressed or archival form, provided the appropriate decompression or de-archiving software is widely available for no charge.
 33 | 
 34 | 1.12. "You" means an individual or a legal entity exercising rights under, and complying with all of the terms of, this License or a future version of this License issued under Section 6.1. For legal entities, "You" includes any entity which controls, is controlled by, or is under common control with You. For purposes of this definition, "control" means (a) the power, direct or indirect, to cause the
 35 | direction or management of such entity, whether by contract or otherwise, or (b) ownership of fifty percent (50%) or more of the outstanding shares or beneficial ownership of such entity.
 36 | 
 37 | 2. Source Code License.
 38 | 
 39 | 2.1. The Initial Developer Grant.
 40 | The Initial Developer hereby grants You a world-wide, royalty-free, non-exclusive license, subject to third party intellectual property claims:
 41 | 
 42 | (a) to use, reproduce, modify, display, perform, sublicense and distribute the Original Code (or portions thereof) with or without Modifications, or as part of a Larger Work; and
 43 | 
 44 | (b) under patents now or hereafter owned or controlled by Initial Developer, to make, have made, use and sell ("Utilize") the Original Code (or portions thereof), but solely to the extent that
 45 | any such patent is reasonably necessary to enable You to Utilize the Original Code (or portions thereof) and not to any greater extent that may be necessary to Utilize further Modifications or
 46 | combinations.
 47 | 
 48 | 2.2. Contributor Grant.
 49 | Each Contributor hereby grants You a world-wide, royalty-free, non-exclusive license, subject to third party intellectual property claims:
 50 | 
 51 | (a) to use, reproduce, modify, display, perform, sublicense and distribute the Modifications created by such Contributor (or portions thereof) either on an unmodified basis, with other Modifications, as Covered Code or as part of a Larger Work; and
 52 | 
 53 | (b) under patents now or hereafter owned or controlled by Contributor, to Utilize the Contributor Version (or portions thereof), but solely to the extent that any such patent is reasonably necessary to enable You to Utilize the Contributor Version (or portions thereof), and not to any greater extent that
 54 | may be necessary to Utilize further Modifications or combinations.
 55 | 
 56 | 3. Distribution Obligations.
 57 | 
 58 | 3.1. Application of License.
 59 | The Modifications which You create or to which You contribute are governed by the terms of this License, including without limitation Section 2.2. The Source Code version of Covered Code may be distributed only under the terms of this License or a future version of this License released under Section 6.1, and You must include a copy of this License with every copy of the Source Code You distribute. You may not offer or impose any terms on any Source Code version that alters or
 60 | restricts the applicable version of this License or the recipients' rights hereunder. However, You may include an additional document offering the additional rights described in Section 3.5.
 61 | 
 62 | 3.2. Availability of Source Code.
 63 | Any Modification which You create or to which You contribute must be made available in Source Code form under the terms of this License either on the same media as an Executable version or via an accepted Electronic Distribution Mechanism to anyone to whom you made an Executable version available; and if made available via Electronic Distribution Mechanism, must remain available for at least twelve (12) months after the date it initially became available, or at least six (6) months after a subsequent version of that particular Modification has been made available to such recipients. You are responsible for ensuring that the Source Code version remains available even if the Electronic Distribution Mechanism is maintained by a third party.
 64 | 
 65 | 3.3. Description of Modifications.
 66 | You must cause all Covered Code to which you contribute to contain a file documenting the changes You made to create that Covered Code and the date of any change. You must include a prominent statement that the Modification is derived, directly or indirectly, from Original Code provided by the Initial Developer and including the name of the Initial Developer in (a) the Source Code, and (b) in any notice in an Executable version or related documentation in which You describe the origin or ownership of the Covered Code.
 67 | 
 68 | 3.4. Intellectual Property Matters
 69 | 
 70 | (a) Third Party Claims.
 71 | If You have knowledge that a party claims an intellectual property right in particular functionality or code (or its utilization under this License), you must include a text file with the source code distribution titled "LEGAL" which describes the claim and the party making the claim in sufficient detail that a recipient will know whom to contact. If you obtain such knowledge after You make Your Modification available as described in Section 3.2, You shall promptly modify the LEGAL file in all copies You make
 72 | available thereafter and shall take other steps (such as notifying appropriate mailing lists or newsgroups) reasonably calculated to inform those who received the Covered Code that new knowledge has been obtained.
 73 | 
 74 | (b) Contributor APIs.
 75 | If Your Modification is an application programming interface and You own or control patents which are reasonably necessary to implement that API, you must also include this information in the LEGAL file.
 76 | 
 77 | 3.5. Required Notices.
 78 | You must duplicate the notice in Exhibit A in each file of the Source Code, and this License in any documentation for the Source Code, where You describe recipients' rights relating to Covered Code. If You created one or more Modification(s), You may add your name as a Contributor to the notice described in Exhibit A. If it is not possible to put such notice in a particular Source Code file due to its
structure, then you must include such notice in a location (such as a relevant directory file) where a user would be likely to look for such a notice. You may choose to offer, and to charge a fee for, warranty, support, indemnity or liability obligations to one or more recipients of Covered Code. However, You may do so only on Your own behalf, and not on behalf of the Initial Developer or any Contributor. You must make it absolutely clear that any such warranty, support, indemnity or
 80 | liability obligation is offered by You alone, and You hereby agree to indemnify the Initial Developer and every Contributor for any liability incurred by the Initial Developer or such Contributor as a result of
 81 | warranty, support, indemnity or liability terms You offer.
 82 | 
 83 | 3.6. Distribution of Executable Versions.
 84 | You may distribute Covered Code in Executable form only if the requirements of Section 3.1-3.5 have been met for that Covered Code, and if You include a notice stating that the Source Code version of the Covered Code is available under the terms of this License, including a description of how and where You have fulfilled the obligations of Section 3.2. The notice must be conspicuously included in any notice in an Executable version, related documentation or collateral in which You
 85 | describe recipients' rights relating to the Covered Code. You may distribute the Executable version of Covered Code under a license of Your choice, which may contain terms different from this License,
 86 | provided that You are in compliance with the terms of this License and that the license for the Executable version does not attempt to limit or alter the recipient's rights in the Source Code version from the rights set forth in this License. If You distribute the Executable version under a different license You must make it absolutely clear that any terms which differ from this License are offered by You alone, not by the Initial Developer or any Contributor. You hereby agree to indemnify the Initial Developer and every Contributor for any liability incurred by the Initial Developer or such Contributor as a result of any such terms You offer.
 87 | 
 88 | 3.7. Larger Works.
 89 | You may create a Larger Work by combining Covered Code with other code not governed by the terms of this License and distribute the Larger Work as a single product. In such a case, You must make sure the requirements of this License are fulfilled for the Covered Code.
 90 | 
 91 | 4. Inability to Comply Due to Statute or Regulation.
 92 | 
 93 | If it is impossible for You to comply with any of the terms of this License with respect to some or all of the Covered Code due to statute or regulation then You must: (a) comply with the terms of this License to the maximum extent possible; and (b) describe the limitations and the code they affect. Such description must be included in the LEGAL file described in Section 3.4 and must be included with all distributions of the Source Code. Except to the extent prohibited by statute or regulation, such description must be sufficiently detailed for a recipient of ordinary skill to be able to understand it.
 94 | 
 95 | 5. Application of this License.
 96 | 
 97 | This License applies to code to which the Initial Developer has attached the notice in Exhibit A, and to related Covered Code.
 98 | 
 99 | 6. Versions of the License.
100 | 
101 | 6.1. New Versions.
102 | Floris van den Berg may publish revised and/or new versions of the License from time to time. Each version will be given a distinguishing version number.
103 | 
104 | 6.2. Effect of New Versions.
Once Covered Code has been published under a particular version of the License, You may always continue to use it under the terms of that version. You may also choose to use such Covered Code under the terms of any subsequent version of the License published by Floris van den Berg.
106 | No one other than Floris van den Berg has the right to modify the terms applicable to Covered Code created under this License.
107 | 
108 | 6.3. Derivative Works.
If you create or use a modified version of this License (which you may only do in order to apply it to code which is not already Covered Code governed by this License), you must (a) rename Your license so that the phrases "FreeImage", "FreeImage Public License", "FIPL", or any confusingly similar phrase do not appear anywhere in your license and (b) otherwise make it clear that your version of the license contains terms which differ from the FreeImage Public License. (Filling in the name of the Initial Developer, Original Code or Contributor in the notice described in Exhibit A shall not of themselves be deemed to be modifications of this License.)
110 | 
111 | 7. DISCLAIMER OF WARRANTY.
112 | 
113 | COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER.
114 | 
115 | 8. TERMINATION.
116 | 
117 | This License and the rights granted hereunder will terminate automatically if You fail to comply with terms herein and fail to cure such breach within 30 days of becoming aware of the breach. All sublicenses to the Covered Code which are properly granted shall survive any termination of this License. Provisions which, by their nature, must remain in effect beyond the termination of this License shall survive.
118 | 
119 | 9. LIMITATION OF LIABILITY.
120 | 
121 | UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL THE INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO YOU OR ANY OTHER PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE
122 | EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THAT EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU.
123 | 
124 | 10. U.S. GOVERNMENT END USERS.
125 | 
126 | The Covered Code is a "commercial item," as that term is defined in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer software" and "commercial computer software documentation," as such terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all U.S. Government End Users acquire Covered Code with only those rights set forth herein.
127 | 
128 | 11. MISCELLANEOUS.
129 | 
This License represents the complete agreement concerning subject matter hereof. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. This License shall be governed by Dutch law provisions (except to the extent applicable law, if any, provides otherwise), excluding its conflict-of-law provisions. With respect to disputes in which at least one party is a citizen of, or an entity chartered or registered to do business in, The Netherlands: (a) unless otherwise agreed in writing, all disputes relating to this License (excepting any dispute relating to intellectual property rights) shall be subject to final and binding arbitration, with the losing party paying all costs of arbitration; (b) any arbitration relating to this Agreement shall be held in Almelo, The Netherlands; and (c) any litigation relating to this Agreement shall be subject to the jurisdiction of the court of Almelo, The Netherlands with the losing party responsible for costs, including without limitation, court costs and reasonable attorneys' fees and expenses. Any law or regulation which provides that the language of a contract shall be construed against the drafter shall not apply to this License.
131 | 
132 | 12. RESPONSIBILITY FOR CLAIMS.
133 | 
134 | Except in cases where another Contributor has failed to comply with Section 3.4, You are responsible for damages arising, directly or indirectly, out of Your utilization of rights under this License, based
135 | on the number of copies of Covered Code you made available, the revenues you received from utilizing such rights, and other relevant factors. You agree to work with affected parties to distribute
136 | responsibility on an equitable basis.
137 | 
138 | EXHIBIT A.
139 | 
140 | "The contents of this file are subject to the FreeImage Public License Version 1.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://home.wxs.nl/~flvdberg/freeimage-license.txt
141 | 
142 | Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific language governing rights and limitations under the License. 


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/FreeImage/include/FreeImage.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/FreeImage/include/FreeImage.h


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/Makefile:
--------------------------------------------------------------------------------
  1 | # Location of the CUDA Toolkit
  2 | CUDA_PATH ?= /usr/local/cuda
  3 | 
  4 | # architecture
  5 | HOST_ARCH   := $(shell uname -m)
  6 | TARGET_ARCH ?= $(HOST_ARCH)
  7 | 
  8 | # Adjust this for ARMv7 with a 32-bit filesystem
  9 | ifeq ($(TARGET_ARCH), aarch64)
 10 |     ifeq ($(shell file /sbin/init | grep 32-bit), 1)
 11 |         TARGET_ARCH=armv7l
 12 |     endif
 13 | endif
 14 |  
 15 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
 16 |     ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 17 |         ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
 18 |             TARGET_SIZE := 64
 19 |         else ifneq (,$(filter $(TARGET_ARCH),armv7l))
 20 |             TARGET_SIZE := 32
 21 |         endif
 22 |     else
 23 |         TARGET_SIZE := $(shell getconf LONG_BIT)
 24 |     endif
 25 | else
 26 |     $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
 27 | endif
 28 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 29 |     ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
 30 |         $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
 31 |     endif
 32 | endif
 33 | 
 34 | # operating system
 35 | HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
 36 | TARGET_OS ?= $(HOST_OS)
 37 | 
 38 | ifeq ($(TARGET_OS),QNX)
 39 | override TARGET_OS := qnx
 40 | endif
 41 | 
 42 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
 43 |     $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
 44 | endif
 45 | 
# host compiler
# Every assignment below uses ?=, so a HOST_COMPILER supplied by the caller
# always wins; the branches only provide a default.
ifeq ($(TARGET_OS),darwin)
    # Default to clang++ when the major version reported by xcodebuild is >= 5.
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    # Cross build: choose the cross g++ matching the host/target pair and OS.
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            # The QNX default compiler path is built from QNX_HOST, and both
            # variables are exported to the environment of the recipes.
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-g++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
# Fallback when no branch above set a default.
HOST_COMPILER ?= g++
# nvcc is always invoked with the selected host compiler via -ccbin.
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
 80 | 
 81 | # internal flags
 82 | NVCCFLAGS   := -m${TARGET_SIZE}
 83 | CCFLAGS     :=
 84 | LDFLAGS     :=
 85 | 
 86 | # build flags
 87 | ifeq ($(TARGET_OS),darwin)
 88 |     LDFLAGS += -rpath $(CUDA_PATH)/lib
 89 |     CCFLAGS += -arch $(HOST_ARCH)
 90 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
 91 |     LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
 92 |     CCFLAGS += -mfloat-abi=hard
 93 | else ifeq ($(TARGET_OS),android)
 94 |     LDFLAGS += -pie
 95 |     CCFLAGS += -fpie -fpic -fexceptions
 96 | endif
 97 | 
 98 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 99 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
100 |         ifneq ($(TARGET_FS),)
101 |             GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
102 |             ifeq ($(GCCVERSIONLTEQ46),1)
103 |                 CCFLAGS += --sysroot=$(TARGET_FS)
104 |             endif
105 |             LDFLAGS += --sysroot=$(TARGET_FS)
106 |             LDFLAGS += -rpath-link=$(TARGET_FS)/lib
107 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
108 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
109 |         endif
110 |     endif
111 | endif
112 | 
# Debug build flags
# `make dbg=1` adds host and device debug information; any other value is a
# release build.
BUILD_TYPE := release
ifeq ($(dbg),1)
    BUILD_TYPE := debug
    NVCCFLAGS  += -g -G
endif

# Flags handed to nvcc when compiling: nvcc's own flags first, then every host
# compiler flag forwarded through -Xcompiler.
ALL_CCFLAGS := $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) \
               $(addprefix -Xcompiler ,$(CCFLAGS)) \
               $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

# The sample is buildable until a later check clears this.
SAMPLE_ENABLED := 1

# Flags handed to nvcc when linking: the compile flags plus every linker flag
# forwarded through -Xlinker.
ALL_LDFLAGS := $(ALL_CCFLAGS) \
               $(addprefix -Xlinker ,$(LDFLAGS)) \
               $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
133 | 
# Common includes and paths for CUDA
# ppc64le takes its headers from the per-target directory of the toolkit;
# every other architecture uses the top-level include directory.
ifeq ($(TARGET_ARCH),ppc64le)
INCLUDES := -I$(CUDA_PATH)/targets/ppc64le-linux/include
else
INCLUDES := -I$(CUDA_PATH)/include
endif
LIBRARIES :=
141 | 
142 | ################################################################################
143 | 
# Gencode arguments
# SMS lists the SM architectures to build for; override on the command line.
SMS ?= 30 35 50 53

# With no architectures there is nothing to build, so the sample is waived.
ifeq ($(SMS),)
    SAMPLE_ENABLED := 0
    $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
endif

# Derive GENCODE_FLAGS from SMS unless the caller already provided them.
ifeq ($(GENCODE_FLAGS),)
    # Generate SASS code for each SM architecture listed in $(SMS)
    GENCODE_FLAGS := $(foreach sm,$(SMS),-gencode arch=compute_$(sm),code=sm_$(sm))

    # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
    HIGHEST_SM := $(lastword $(sort $(SMS)))
    ifneq ($(HIGHEST_SM),)
        GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
    endif
endif
162 | 
# FreeImage headers and libraries bundled next to this Makefile, followed by
# the CUDA, cuBLAS and cuDNN libraries the sample links against.
INCLUDES += -IFreeImage/include
LIBRARIES += -LFreeImage/lib/$(TARGET_OS)/$(TARGET_ARCH) -LFreeImage/lib/$(TARGET_OS) -lcudart -lcublas -lcudnn -lfreeimage -lstdc++ -lm

# Attempt to compile a minimal application linked against FreeImage. If a.out exists, FreeImage is properly set up.
# NOTE: this probe runs while the Makefile is parsed, i.e. on every invocation,
# and writes test.c and a.out into the current directory before removing them.
# The link line uses the full LIBRARIES list, so a failure to link any of
# those libraries also leaves no a.out behind.
$(shell echo "#include \"FreeImage.h\"" > test.c; echo "int main() { return 0; }" >> test.c ; $(NVCC) $(ALL_CCFLAGS) $(INCLUDES) $(LIBRARIES) -l freeimage test.c)
# Non-empty only when the probe produced a.out.
FREEIMAGE := $(shell find a.out 2>/dev/null)
# Remove the probe's scratch files.
$(shell rm a.out test.c 2>/dev/null)

# A failed probe waives the sample instead of aborting make.
ifeq ("$(FREEIMAGE)","")
$(info >>> WARNING - FreeImage is not set up correctly. Please ensure FreeImage is set up correctly. <<<)
SAMPLE_ENABLED := 0
endif

# When the sample is waived, EXEC turns every recipe command that is prefixed
# with it into an echo of that command, so nothing is actually built or run.
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
179 | 
180 | ################################################################################
181 | 
# Target rules
# These names are commands, not files. Declaring them phony keeps a stray file
# called e.g. "clean" or "build" from silently disabling the rule.
.PHONY: all build check.deps run clean clobber

all: build

build: mnistCUDNN

# Report whether the dependency checks made while parsing succeeded.
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif

OBJ = fp16_dev.o fp16_emu.o mnistCUDNN.o

# Link step: nvcc drives the link so the CUDA runtime is pulled in.
mnistCUDNN: $(OBJ)
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

# Host-only sources are compiled directly with the host compiler.
%.o: %.cpp
	$(EXEC) $(HOST_COMPILER) $(INCLUDES) $(CCFLAGS) $(EXTRA_CCFLAGS) -o $@ -c $<

# Device sources go through nvcc.
%.o: %.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

run: build
	$(EXEC) ./mnistCUDNN

# Remove only build products. The pattern is *.o rather than *o so that other
# files or directories whose names merely end in "o" are left alone.
clean:
	rm -f *.o
	rm -f mnistCUDNN

clobber: clean
213 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/data/conv1.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/conv1.bias.bin


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/data/conv1.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/conv1.bin


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/data/conv2.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/conv2.bias.bin


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/data/conv2.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/conv2.bin


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/data/five_28x28.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/five_28x28.pgm


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/data/ip1.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/ip1.bias.bin


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/data/ip1.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/ip1.bin


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/data/ip2.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/ip2.bias.bin


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/data/ip2.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/ip2.bin


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/data/one_28x28.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/one_28x28.pgm


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/data/three_28x28.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/three_28x28.pgm


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/error_util.h:
--------------------------------------------------------------------------------
  1 | /**
  2 | * Copyright 2014 NVIDIA Corporation.  All rights reserved.
  3 | *
  4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 | * with this source code for terms and conditions that govern your use of
  6 | * this software. Any use, reproduction, disclosure, or distribution of
  7 | * this software and related documentation outside the terms of the EULA
  8 | * is strictly prohibited.
  9 | *
 10 | */
 11 | 
 12 | #if !defined(_ERROR_UTIL_H_)
 13 | #define _ERROR_UTIL_H_
 14 | 
 15 | #include <sstream>
 16 | #include <stdlib.h>
 17 | #include <stdio.h>
 18 | #include <iostream>
 19 | 
 20 | #define TOSTR_(s)   #s
 21 | #define TOSTR(s)    TOSTR_(s)
 22 | #if defined(__GNUC__)
 23 | #define COMPILER_NAME "GCC"
 24 | #define COMPILER_VER  TOSTR(__GNUC__) "." TOSTR(__GNUC_MINOR__) "." TOSTR(__GNUC_PATCHLEVEL__)
 25 | #elif defined(_MSC_VER)
 26 | #if _MSC_VER < 1500
 27 | #define COMPILER_NAME "MSVC_2005"
 28 | #elif _MSC_VER < 1600
 29 | #define COMPILER_NAME "MSVC_2008"
 30 | #elif _MSC_VER < 1700
 31 | #define COMPILER_NAME "MSVC_2010"
 32 | #elif _MSC_VER < 1800
 33 | #define COMPILER_NAME "MSVC_2012"
 34 | #elif _MSC_VER < 1900
 35 | #define COMPILER_NAME "MSVC_2013"
 36 | #elif _MSC_VER < 2000
 37 | #define COMPILER_NAME "MSVC_2014"
 38 | #else
 39 | #define COMPILER_NAME "MSVC"
 40 | #endif
 41 | #define COMPILER_VER  TOSTR(_MSC_FULL_VER) "." TOSTR(_MSC_BUILD)
 42 | #elif defined(__clang_major__)
 43 | #define COMPILER_NAME "CLANG"
 44 | #define COMPILER_VER  TOSTR(__clang_major__ ) "." TOSTR(__clang_minor__) "." TOSTR(__clang_patchlevel__)
 45 | #elif defined(__INTEL_COMPILER)
 46 | #define COMPILER_NAME "ICC"
 47 | #define COMPILER_VER TOSTR(__INTEL_COMPILER) "." TOSTR(__INTEL_COMPILER_BUILD_DATE)
 48 | #else
 49 | #define COMPILER_NAME "unknown"
 50 | #define COMPILER_VER  "???"
 51 | #endif
 52 | 
 53 | #define CUDNN_VERSION_STR  TOSTR(CUDNN_MAJOR) "." TOSTR (CUDNN_MINOR) "." TOSTR(CUDNN_PATCHLEVEL)
 54 | 
 55 | #define FatalError(s) {                                                \
 56 |     std::stringstream _where, _message;                                \
 57 |     _where << __FILE__ << ':' << __LINE__;                             \
 58 |     _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__;\
 59 |     std::cerr << _message.str() << "\nAborting...\n";                  \
 60 |     cudaDeviceReset();                                                 \
 61 |     exit(EXIT_FAILURE);                                                \
 62 | }
 63 | 
 64 | #define checkCUDNN(status) {                                           \
 65 |     std::stringstream _error;                                          \
 66 |     if (status != CUDNN_STATUS_SUCCESS) {                              \
 67 |       _error << "CUDNN failure\nError: " << cudnnGetErrorString(status); \
 68 |       FatalError(_error.str());                                        \
 69 |     }                                                                  \
 70 | }
 71 | 
 72 | #define checkCudaErrors(status) {                                      \
 73 |     std::stringstream _error;                                          \
 74 |     if (status != 0) {                                                 \
 75 |       _error << "Cuda failure\nError: " << cudaGetErrorString(status); \
 76 |       FatalError(_error.str());                                        \
 77 |     }                                                                  \
 78 | }
 79 | 
 80 | #define checkCublasErrors(status) {                                    \
 81 |     std::stringstream _error;                                          \
 82 |     if (status != 0) {                                                 \
 83 |       _error << "Cublas failure\nError code " << status;        \
 84 |       FatalError(_error.str());                                        \
 85 |     }                                                                  \
 86 | }
 87 | 
 88 | // CUDA Utility Helper Functions
 89 | 
 90 | static void  showDevices( void )
 91 | {
 92 |     int totalDevices;
 93 |     checkCudaErrors(cudaGetDeviceCount( &totalDevices ));
 94 |     printf("\nThere are %d CUDA capable devices on your machine :\n", totalDevices);
 95 |     for (int i=0; i< totalDevices; i++) {
 96 |         struct cudaDeviceProp prop;
 97 |         checkCudaErrors(cudaGetDeviceProperties( &prop, i ));
 98 |         printf( "device %d : sms %2d  Capabilities %d.%d, SmClock %.1f Mhz, MemSize (Mb) %d, MemClock %.1f Mhz, Ecc=%d, boardGroupID=%d\n",
 99 |                     i, prop.multiProcessorCount, prop.major, prop.minor,
100 |                     (float)prop.clockRate*1e-3,
101 |                     (int)(prop.totalGlobalMem/(1024*1024)),
102 |                     (float)prop.memoryClockRate*1e-3,
103 |                     prop.ECCEnabled,
104 |                     prop.multiGpuBoardGroupID);
105 |     }
106 | } 
107 | 
// Portable name for the length-limited, case-insensitive string compare:
// _strnicmp on Windows, strncasecmp elsewhere. A definition supplied before
// this header is included is left untouched.
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#  if !defined(_CRT_SECURE_NO_DEPRECATE)
#    define _CRT_SECURE_NO_DEPRECATE
#  endif
#  if !defined(STRNCASECMP)
#    define STRNCASECMP _strnicmp
#  endif
#else // Linux Includes
#  include <string.h>
#  include <strings.h>
#  if !defined(STRNCASECMP)
#    define STRNCASECMP strncasecmp
#  endif
#endif
// Return the number of leading `delimiter` characters in `string`.
// The result is 0 when that count reaches the index of the string's last
// character, i.e. when at most one character would remain after the
// delimiters; this includes the empty string and strings made up only of
// delimiters.
inline int stringRemoveDelimiter(char delimiter, const char *string)
{
    const char *cursor = string;

    // Walk past the run of delimiters at the front.
    while (*cursor == delimiter)
    {
        ++cursor;
    }

    const int skipped    = (int)(cursor - string);
    const int last_index = (int)strlen(string)-1;

    return (skipped < last_index) ? skipped : 0;
}
138 | 
139 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
140 | {
141 |     bool bFound = false;
142 | 
143 |     if (argc >= 1)
144 |     {
145 |         for (int i=1; i < argc; i++)
146 |         {
147 |             int string_start = stringRemoveDelimiter('-', argv[i]);
148 |             const char *string_argv = &argv[i][string_start];
149 | 
150 |             const char *equal_pos = strchr(string_argv, '=');
151 |             int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
152 | 
153 |             int length = (int)strlen(string_ref);
154 | 
155 |             if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
156 |             {
157 |                 bFound = true;
158 |                 continue;
159 |             }
160 |         }
161 |     }
162 | 
163 |     return bFound;
164 | }
165 | 
166 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
167 | {
168 |     bool bFound = false;
169 |     int value = -1;
170 | 
171 |     if (argc >= 1)
172 |     {
173 |         for (int i=1; i < argc; i++)
174 |         {
175 |             int string_start = stringRemoveDelimiter('-', argv[i]);
176 |             const char *string_argv = &argv[i][string_start];
177 |             int length = (int)strlen(string_ref);
178 | 
179 |             if (!STRNCASECMP(string_argv, string_ref, length))
180 |             {
181 |                 if (length+1 <= (int)strlen(string_argv))
182 |                 {
183 |                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
184 |                     value = atoi(&string_argv[length + auto_inc]);
185 |                 }
186 |                 else
187 |                 {
188 |                     value = 0;
189 |                 }
190 | 
191 |                 bFound = true;
192 |                 continue;
193 |             }
194 |         }
195 |     }
196 | 
197 |     if (bFound)
198 |     {
199 |         return value;
200 |     }
201 |     else
202 |     {
203 |         printf("Not found int\n");
204 |         return 0;
205 |     }
206 | }
207 | 
208 | inline bool getCmdLineArgumentString(const int argc, const char **argv,
209 |                                      const char *string_ref, char **string_retval)
210 | {
211 |     bool bFound = false;
212 | 
213 |     if (argc >= 1)
214 |     {
215 |         for (int i=1; i < argc; i++)
216 |         {
217 |             int string_start = stringRemoveDelimiter('-', argv[i]);
218 |             char *string_argv = (char *)&argv[i][string_start];
219 |             int length = (int)strlen(string_ref);
220 | 
221 |             if (!STRNCASECMP(string_argv, string_ref, length))
222 |             {
223 |                 *string_retval = &string_argv[length+1];
224 |                 bFound = true;
225 |                 continue;
226 |             }
227 |         }
228 |     }
229 | 
230 |     if (!bFound)
231 |     {
232 |         *string_retval = NULL;
233 |     }
234 | 
235 |     return bFound;
236 | }
237 | 
238 | #endif // _ERROR_UTIL_H_


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/fp16_dev.cu:
--------------------------------------------------------------------------------
 1 | /**
 2 | * Copyright 2014 NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
 5 | * with this source code for terms and conditions that govern your use of
 6 | * this software. Any use, reproduction, disclosure, or distribution of
 7 | * this software and related documentation outside the terms of the EULA
 8 | * is strictly prohibited.
 9 | *
10 | */
11 | #include "error_util.h"
12 | 
13 | #include "fp16_dev.h"
14 | 
// Threads per block used by the conversion kernel and its launcher.
#define BLOCK_SIZE 128

// Convert buffIn[0..size) to half precision, one element per thread.
// Each value is cast to float, passed through __float2half_rn, and the result
// is stored in half1::x. The element index is BLOCK_SIZE*blockIdx.x +
// threadIdx.x, so the kernel must be launched with BLOCK_SIZE threads per
// block; threads whose index is >= size do nothing.
template <class value_type>
__global__ void float2half_rn_kernel(int size, const value_type *buffIn, 
                                      half1 *buffOut)
{
  const int idx = BLOCK_SIZE*blockIdx.x+threadIdx.x;
  if (idx < size)
  {
    half1 converted;
    converted.x = __float2half_rn(float(buffIn[idx]));
    buffOut[idx] = converted;
  }
}
26 | 
// Host-side launcher for float2half_rn_kernel.
//
// Converts `size` elements of buffIn to FP16 storage in buffOut, launching
// ceil(size / BLOCK_SIZE) blocks of BLOCK_SIZE threads, and waits for the
// device to finish before returning.  A non-positive size is a no-op.
// NOTE(review): buffIn/buffOut are presumably device pointers - confirm
// against the callers.
template <class value_type>
void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut)
{
  // A grid with zero (or a negative number of) blocks is not a valid launch
  // configuration, so there is nothing to do for an empty buffer.
  if (size <= 0)
  {
    return;
  }

  int grid_size = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
  float2half_rn_kernel<value_type><<<grid_size, BLOCK_SIZE>>>
                    (size, buffIn, buffOut);

  // Launch failures are reported through the last-error state.  Fetch it
  // once into a local so the checking macro never re-evaluates the call.
  cudaError_t launch_status = cudaGetLastError();
  checkCudaErrors(launch_status);
  checkCudaErrors(cudaDeviceSynchronize());
}

// Explicit instantiations for the two supported source element types.
template void gpu_float2half_rn<float> (int, const float*, half1*);
template void gpu_float2half_rn<double> (int, const double*, half1*);


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/fp16_dev.h:
--------------------------------------------------------------------------------
 1 | /**
 2 | * Copyright 2014 NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
 5 | * with this source code for terms and conditions that govern your use of
 6 | * this software. Any use, reproduction, disclosure, or distribution of
 7 | * this software and related documentation outside the terms of the EULA
 8 | * is strictly prohibited.
 9 | *
10 | */
11 | #if !defined(_FP16_DEV_H_)
12 | #define _FP16_DEV_H_
13 | 
14 | #include<driver_types.h>
15 | 
16 | typedef struct __align__(2) {
17 |    unsigned short x;
18 | } half1;
19 | 
20 | template <class value_type>
21 | void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut);
22 | 
23 | #endif // _FP16_DEV_H_


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/fp16_emu.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NOTICE TO LICENSEE:
  5 |  *
  6 |  * This source code and/or documentation ("Licensed Deliverables") are
  7 |  * subject to NVIDIA intellectual property rights under U.S. and
  8 |  * international Copyright laws.
  9 |  *
 10 |  * These Licensed Deliverables contained herein is PROPRIETARY and
 11 |  * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 12 |  * conditions of a form of NVIDIA software license agreement by and
 13 |  * between NVIDIA and Licensee ("License Agreement") or electronically
 14 |  * accepted by Licensee.  Notwithstanding any terms or conditions to
 15 |  * the contrary in the License Agreement, reproduction or disclosure
 16 |  * of the Licensed Deliverables to any third party without the express
 17 |  * written consent of NVIDIA is prohibited.
 18 |  *
 19 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 20 |  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 21 |  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 22 |  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 23 |  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 24 |  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 25 |  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 26 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 27 |  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 28 |  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 29 |  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 30 |  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 31 |  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 32 |  * OF THESE LICENSED DELIVERABLES.
 33 |  *
 34 |  * U.S. Government End Users.  These Licensed Deliverables are a
 35 |  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 36 |  * 1995), consisting of "commercial computer software" and "commercial
 37 |  * computer software documentation" as such terms are used in 48
 38 |  * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 39 |  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 40 |  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 41 |  * U.S. Government End Users acquire the Licensed Deliverables with
 42 |  * only those rights set forth herein.
 43 |  *
 44 |  * Any use of the Licensed Deliverables in individual and commercial
 45 |  * software must include, in the user documentation and internal
 46 |  * comments to the code, the above Disclaimer and U.S. Government End
 47 |  * Users Notice.
 48 |  */
 49 |  
#include "fp16_emu.h"

#include <cstring>
 51 | 
 52 | // Host functions for converting between FP32 and FP16 formats
 53 | // Paulius Micikevicius (pauliusm@nvidia.com)
 54 | 
 55 | half1 cpu_float2half_rn(float f)
 56 | {
 57 |     half1 ret;
 58 | 
 59 |     unsigned x = *((int*)(void*)(&f));
 60 |     unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
 61 |     unsigned sign, exponent, mantissa;
 62 | 
 63 |     // Get rid of +NaN/-NaN case first.
 64 |     if (u > 0x7f800000) {
 65 |         ret.x = 0x7fffU;
 66 |         return ret;
 67 |     }
 68 |   
 69 |     sign = ((x >> 16) & 0x8000);
 70 |   
 71 |     // Get rid of +Inf/-Inf, +0/-0.
 72 |     if (u > 0x477fefff) {
 73 |         ret.x = sign | 0x7c00U;
 74 |         return ret;
 75 |     }
 76 |     if (u < 0x33000001) {
 77 |         ret.x = (sign | 0x0000);
 78 |         return ret;
 79 |     }
 80 | 
 81 |     exponent = ((u >> 23) & 0xff);
 82 |     mantissa = (u & 0x7fffff);
 83 | 
 84 |     if (exponent > 0x70) {
 85 |         shift = 13;
 86 |         exponent -= 0x70;
 87 |     } else {
 88 |         shift = 0x7e - exponent;
 89 |         exponent = 0;
 90 |         mantissa |= 0x800000;
 91 |     }
 92 |     lsb = (1 << shift);
 93 |     lsb_s1 = (lsb >> 1);
 94 |     lsb_m1 = (lsb - 1);
 95 |   
 96 |     // Round to nearest even.
 97 |     remainder = (mantissa & lsb_m1);
 98 |     mantissa >>= shift;
 99 |     if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
100 |         ++mantissa;
101 |         if (!(mantissa & 0x3ff)) {
102 |             ++exponent;
103 |             mantissa = 0;
104 |         }
105 |     }  
106 | 
107 |     ret.x = (sign | (exponent << 10) | mantissa);  
108 | 
109 |     return ret;
110 | }
111 | 
112 | 
113 | float cpu_half2float(half1 h)
114 | {
115 |     unsigned sign = ((h.x >> 15) & 1);
116 |     unsigned exponent = ((h.x >> 10) & 0x1f);
117 |     unsigned mantissa = ((h.x & 0x3ff) << 13);
118 | 
119 |     if (exponent == 0x1f) {  /* NaN or Inf */
120 |         mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
121 |         exponent = 0xff;
122 |     } else if (!exponent) {  /* Denorm or Zero */
123 |         if (mantissa) {
124 |             unsigned int msb;
125 |             exponent = 0x71;
126 |             do {
127 |                 msb = (mantissa & 0x400000);
128 |                 mantissa <<= 1;  /* normalize */
129 |                 --exponent;
130 |             } while (!msb);
131 |             mantissa &= 0x7fffff;  /* 1.mantissa is implicit */
132 |         }
133 |     } else {
134 |         exponent += 0x70;
135 |     }
136 | 
137 |     int temp = ((sign << 31) | (exponent << 23) | mantissa);
138 | 
139 |     return *((float*)((void*)&temp));
140 | }
141 | 
142 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/fp16_emu.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NOTICE TO LICENSEE:
  5 |  *
  6 |  * This source code and/or documentation ("Licensed Deliverables") are
  7 |  * subject to NVIDIA intellectual property rights under U.S. and
  8 |  * international Copyright laws.
  9 |  *
 10 |  * These Licensed Deliverables contained herein is PROPRIETARY and
 11 |  * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 12 |  * conditions of a form of NVIDIA software license agreement by and
 13 |  * between NVIDIA and Licensee ("License Agreement") or electronically
 14 |  * accepted by Licensee.  Notwithstanding any terms or conditions to
 15 |  * the contrary in the License Agreement, reproduction or disclosure
 16 |  * of the Licensed Deliverables to any third party without the express
 17 |  * written consent of NVIDIA is prohibited.
 18 |  *
 19 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 20 |  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 21 |  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 22 |  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 23 |  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 24 |  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 25 |  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 26 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 27 |  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 28 |  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 29 |  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 30 |  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 31 |  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 32 |  * OF THESE LICENSED DELIVERABLES.
 33 |  *
 34 |  * U.S. Government End Users.  These Licensed Deliverables are a
 35 |  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 36 |  * 1995), consisting of "commercial computer software" and "commercial
 37 |  * computer software documentation" as such terms are used in 48
 38 |  * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 39 |  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 40 |  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 41 |  * U.S. Government End Users acquire the Licensed Deliverables with
 42 |  * only those rights set forth herein.
 43 |  *
 44 |  * Any use of the Licensed Deliverables in individual and commercial
 45 |  * software must include, in the user documentation and internal
 46 |  * comments to the code, the above Disclaimer and U.S. Government End
 47 |  * Users Notice.
 48 |  */
 49 | 
 50 | // Conversion from/to 16-bit floating point (half-precision).
 51 | 
 52 | #if !defined(_FP16_EMU_H_)
 53 | #define _FP16_EMU_H_
 54 | 
 55 | #include "fp16_dev.h"
 56 | 
 57 | #define HLF_EPSILON 4.887581E-04
 58 | #define HLF_MIN     6.103516E-05
 59 | #define HLF_MAX     6.550400E+04
 60 | 
 61 | half1 cpu_float2half_rn(float f);
 62 | 
 63 | float cpu_half2float(half1 h);
 64 | 
 65 | static __inline__ __device__ __host__ half1 habs(half1 h)
 66 | {
 67 |     h.x &= 0x7fffU;
 68 |     return h;
 69 | }
 70 | 
 71 | static __inline__ __device__ __host__ half1 hneg(half1 h)
 72 | {
 73 |     h.x ^= 0x8000U;
 74 |     return h;
 75 | }
 76 | 
 77 | static __inline__ __device__ __host__ int ishnan(half1 h)
 78 | {
 79 |     // When input is NaN, exponent is all ones and mantissa is non-zero.
 80 |     return (h.x & 0x7c00U) == 0x7c00U && (h.x & 0x03ffU) != 0;
 81 | }
 82 | 
 83 | static __inline__ __device__ __host__ int ishinf(half1 h)
 84 | {
 85 |     // When input is +/- inf, exponent is all ones and mantissa is zero.
 86 |     return (h.x & 0x7c00U) == 0x7c00U && (h.x & 0x03ffU) == 0;
 87 | }
 88 | 
 89 | static __inline__ __device__ __host__ int ishequ(half1 x, half1 y)
 90 | {
 91 |     return ishnan(x) == 0 && ishnan(y) == 0 && x.x == y.x;
 92 | }
 93 | 
 94 | // Returns 0.0000 in FP16 binary form
 95 | static __inline__ __device__ __host__ half1 hzero()
 96 | {
 97 |     half1 ret;
 98 |     ret.x = 0x0000U;
 99 |     return ret;
100 | }
101 | 
102 | // Returns 1.0000 in FP16 binary form
103 | static __inline__ __device__ __host__ half1 hone()
104 | {
105 |     half1 ret;
106 |     ret.x = 0x3c00U;
107 |     return ret;
108 | }
109 | 
110 | // Returns quiet NaN, the most significant fraction bit #9 is set
111 | static __inline__ __device__ __host__ half1 hnan()
112 | {
113 |     half1 ret;
114 |     ret.x = 0x7e00U;
115 |     return ret;
116 | }
117 | 
118 | // Largest positive FP16 value, corresponds to 6.5504e+04
119 | static __inline__ __device__ __host__ half1 hmax()
120 | {
121 |     half1 ret;
122 |     // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff)
123 |     ret.x = 0x7bffU;
124 |     return ret;
125 | }
126 | 
127 | // Smallest positive (normalized) FP16 value, corresponds to 6.1035e-05
128 | static __inline__ __device__ __host__ half1 hmin()
129 | {
130 |     half1 ret;
131 |     // Exponent is 0x01 (5 bits), mantissa is all zeros (10 bits)
132 |     ret.x = 0x0400U;
133 |     return ret;
134 | }
135 | 
136 | #endif  // _FP16_EMU_H_
137 | 
138 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/gemv.h:
--------------------------------------------------------------------------------
  1 | /**
  2 | * Copyright 2014 NVIDIA Corporation.  All rights reserved.
  3 | *
  4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 | * with this source code for terms and conditions that govern your use of
  6 | * this software. Any use, reproduction, disclosure, or distribution of
  7 | * this software and related documentation outside the terms of the EULA
  8 | * is strictly prohibited.
  9 | *
 10 | */
 11 | 
 12 | #if !defined(_GEMV_H_)
 13 | #define _GEMV_H_
 14 | 
 15 | #include <cuda.h> // CUDA_VERSION
 16 | #include <cublas_v2.h>
 17 | #include "error_util.h"
 18 | 
 19 | //#define DISABLE_GEMV
 20 | 
 21 | void gemv(cublasHandle_t cublasHandle, int m, int n, double alpha, 
 22 |             const double *A, const double *x,
 23 |                                double beta, double *y)
 24 | {
 25 | #ifdef DISABLE_GEMV
 26 |     checkCublasErrors( cublasDgemm (cublasHandle, 
 27 |                       CUBLAS_OP_T,
 28 |                       CUBLAS_OP_N,
 29 |                       n,
 30 |                       1,
 31 |                       m,
 32 |                       &alpha, 
 33 |                       A, 
 34 |                       m,
 35 |                       x,
 36 |                       m, 
 37 |                       &beta, 
 38 |                       y,
 39 |                       m) );
 40 | #else
 41 |     checkCublasErrors( cublasDgemv(cublasHandle, CUBLAS_OP_T,
 42 |                                   m, n,
 43 |                                   &alpha,
 44 |                                   A, m,
 45 |                                   x, 1,
 46 |                                   &beta,
 47 |                                   y, 1) );    
 48 | #endif
 49 | };
 50 | 
 51 | void gemv(cublasHandle_t cublasHandle, int m, int n, float alpha, 
 52 |             const float *A, const float *x,
 53 |                                float beta, float *y)
 54 | {
 55 | #ifdef DISABLE_GEMV
 56 |     checkCublasErrors( cublasSgemm (cublasHandle, 
 57 |                       CUBLAS_OP_T,
 58 |                       CUBLAS_OP_N,
 59 |                       n,
 60 |                       1,
 61 |                       m,
 62 |                       &alpha, 
 63 |                       A, 
 64 |                       m,
 65 |                       x,
 66 |                       m, 
 67 |                       &beta, 
 68 |                       y,
 69 |                       m) );
 70 | #else
 71 |     checkCublasErrors( cublasSgemv(cublasHandle, CUBLAS_OP_T,
 72 |                                   m, n,
 73 |                                   &alpha,
 74 |                                   A, m,
 75 |                                   x, 1,
 76 |                                   &beta,
 77 |                                   y, 1) );    
 78 | #endif
 79 | };
 80 | 
 81 | #if defined(CUDA_VERSION) && (CUDA_VERSION > 7000)
 82 | 
 83 | #if (CUDA_VERSION < 8000)
 84 | #define  CUDA_R_16F CUBLAS_DATA_HALF
 85 | #endif
 86 | void gemv(cublasHandle_t cublasHandle, int m, int n, float alpha, 
 87 |             const half1 *A, const half1 *x,
 88 |                                float beta, half1 *y)
 89 | {
 90 |     checkCublasErrors( cublasSgemmEx  ( cublasHandle, 
 91 |                                       CUBLAS_OP_T,
 92 |                                       CUBLAS_OP_N, 
 93 |                                       n,
 94 |                                       1,
 95 |                                       m,
 96 |                                       &alpha, 
 97 |                                       A,  
 98 |                                       CUDA_R_16F,
 99 |                                       m,
100 |                                       x,
101 |                                       CUDA_R_16F,
102 |                                       m, 
103 |                                       &beta, 
104 |                                       y,
105 |                                       CUDA_R_16F,
106 |                                       m) );
107 | };
108 | #endif
109 | 
110 | #endif  // _GEMV_H_
111 | 


--------------------------------------------------------------------------------
/cudnn_samples_v6/mnistCUDNN/readme.txt:
--------------------------------------------------------------------------------
 1 | This sample demonstrates how to use the cuDNN library to implement the forward
 2 | pass of a trained network.
 3 | 
 4 | The sample is based on the "Training LeNet on MNIST with Caffe" tutorial, located
 5 | at http://caffe.berkeleyvision.org/. The network is identical except for the
 6 | addition of an LRN layer. All the network weights were obtained and exported
 7 | using Caffe.
 8 | 
 9 | Network layer topology:
10 | 
11 | 1. Convolution
12 | 2. Pooling
13 | 3. Convolution
14 | 4. Pooling
15 | 5. Fully connected
16 | 6. Relu
17 | 7. LRN
18 | 8. Fully Connected
19 | 9. SoftMax
20 | 
21 | By default, the sample will classify three images, located in "data" directory
22 | using precomputed network weights:
23 | 1) Two convolution layers and their bias: conv1.bias.bin conv1.bin conv2.bias.bin conv2.bin
24 | 2) Two fully connected layers and their bias: ip1.bias.bin ip1.bin ip2.bias.bin ip2.bin
25 | 
26 | Supported platforms: identical to cuDNN
27 | 
28 | How to run:
29 | 
30 | mnistCUDNN {<options>}
31 | help                   : display this help
32 | device=<int>           : set the device to run the sample
33 | image=<name>           : classify specific image
34 | 
35 | New in version 3 release
36 | fp16 (three ways of conversion: on host, on device using cuDNN, on device using CUDA)
37 | Local Response Normalization (LRN)
38 | Find fastest config (cudnnFindConvolutionForwardAlgorithm)
39 | FFT convolution
40 | Demonstrate Nd API (first available in cuDNN v2)
41 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/RNN/Makefile:
--------------------------------------------------------------------------------
  1 | # Location of the CUDA Toolkit
  2 | CUDA_PATH ?= /usr/local/cuda
  3 | 
  4 | # architecture
  5 | HOST_ARCH   := $(shell uname -m)
  6 | TARGET_ARCH ?= $(HOST_ARCH)
  7 | 
  8 | # Adjust this for ARMv7 with a 32-bit filesystem
  9 | ifeq ($(TARGET_ARCH), aarch64)
 10 |     ifeq ($(shell file /sbin/init | grep 32-bit), 1)
 11 |         TARGET_ARCH=armv7l
 12 |     endif
 13 | endif
 14 |  
 15 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
 16 |     ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 17 |         ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
 18 |             TARGET_SIZE := 64
 19 |         else ifneq (,$(filter $(TARGET_ARCH),armv7l))
 20 |             TARGET_SIZE := 32
 21 |         endif
 22 |     else
 23 |         TARGET_SIZE := $(shell getconf LONG_BIT)
 24 |     endif
 25 | else
 26 |     $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
 27 | endif
 28 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 29 |     ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
 30 |         $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
 31 |     endif
 32 | endif
 33 | 
 34 | # operating system
 35 | HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
 36 | TARGET_OS ?= $(HOST_OS)
 37 | 
 38 | ifeq ($(TARGET_OS),QNX)
 39 | TARGET_OS := qnx
 40 | endif
 41 | 
 42 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx QNX android))
 43 |     $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
 44 | endif
 45 | 
 46 | # host compiler
 47 | ifeq ($(TARGET_OS),darwin)
 48 |     ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
 49 |         HOST_COMPILER ?= clang++
 50 |     endif
 51 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 52 |     ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
 53 |         ifeq ($(TARGET_OS),linux)
 54 |             HOST_COMPILER ?= arm-linux-gnueabihf-g++
 55 |         else ifeq ($(TARGET_OS),qnx)
 56 |             ifeq ($(QNX_HOST),)
 57 |                 $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
 58 |             endif
 59 |             ifeq ($(QNX_TARGET),)
 60 |                 $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
 61 |             endif
 62 |             export QNX_HOST
 63 |             export QNX_TARGET
 64 |             HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
 65 |         else ifeq ($(TARGET_OS),android)
 66 |             HOST_COMPILER ?= arm-linux-androideabi-g++
 67 |         endif
 68 |     else ifeq ($(TARGET_ARCH),aarch64)
 69 |         ifeq ($(TARGET_OS), linux)
 70 |             HOST_COMPILER ?= aarch64-linux-gnu-g++
 71 |         else ifeq ($(TARGET_OS), android)
 72 |             HOST_COMPILER ?= aarch64-linux-android-g++
 73 |         endif
 74 |     else ifeq ($(TARGET_ARCH),ppc64le)
 75 |         HOST_COMPILER ?= powerpc64le-linux-gnu-g++
 76 |     endif
 77 | endif
 78 | HOST_COMPILER ?= g++
 79 | NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
 80 | 
 81 | # internal flags
 82 | NVCCFLAGS   := -m${TARGET_SIZE}
 83 | CCFLAGS     :=
 84 | LDFLAGS     :=
 85 | 
 86 | # build flags
 87 | ifeq ($(TARGET_OS),darwin)
 88 |     LDFLAGS += -rpath $(CUDA_PATH)/lib
 89 |     CCFLAGS += -arch $(HOST_ARCH)
 90 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
 91 |     LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
 92 |     CCFLAGS += -mfloat-abi=hard
 93 | else ifeq ($(TARGET_OS),android)
 94 |     LDFLAGS += -pie
 95 |     CCFLAGS += -fpie -fpic -fexceptions
 96 | endif
 97 | 
 98 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 99 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
100 |         ifneq ($(TARGET_FS),)
101 |             GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
102 |             ifeq ($(GCCVERSIONLTEQ46),1)
103 |                 CCFLAGS += --sysroot=$(TARGET_FS)
104 |             endif
105 |             LDFLAGS += --sysroot=$(TARGET_FS)
106 |             LDFLAGS += -rpath-link=$(TARGET_FS)/lib
107 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
108 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
109 |         endif
110 |     endif
111 | endif
112 | 
113 | # Debug build flags
114 | ifeq ($(dbg),1)
115 |       NVCCFLAGS += -g -G
116 |       BUILD_TYPE := debug
117 | else
118 |       BUILD_TYPE := release
119 | endif
120 | 
121 | ALL_CCFLAGS :=
122 | ALL_CCFLAGS += $(NVCCFLAGS)
123 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
124 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
125 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
126 | 
127 | SAMPLE_ENABLED := 1
128 | 
129 | ALL_LDFLAGS :=
130 | ALL_LDFLAGS += $(ALL_CCFLAGS)
131 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
132 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
133 | 
134 | # Common includes and paths for CUDA
135 | ifneq ($(TARGET_ARCH), ppc64le)
136 | INCLUDES := -I$(CUDA_PATH)/include
137 | else
138 | INCLUDES := -I$(CUDA_PATH)/targets/ppc64le-linux/include
139 | endif
140 | LIBRARIES :=
141 | 
142 | ################################################################################
143 | 
144 | # Gencode arguments
145 | SMS ?= 30 35 50 53
146 | 
147 | ifeq ($(SMS),)
148 | $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
149 | SAMPLE_ENABLED := 0
150 | endif
151 | 
152 | ifeq ($(GENCODE_FLAGS),)
153 | # Generate SASS code for each SM architecture listed in $(SMS)
154 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
155 | 
156 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
157 | HIGHEST_SM := $(lastword $(sort $(SMS)))
158 | ifneq ($(HIGHEST_SM),)
159 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
160 | endif
161 | endif
162 | 
163 | INCLUDES += -I.
164 | LIBRARIES += -L. -lcublas -lcudnn -lcudart -lstdc++ -lm
165 | 
166 | ifeq ($(SAMPLE_ENABLED),0)
167 | EXEC ?= @echo "[@]"
168 | endif
169 | 
170 | ################################################################################
171 | 
172 | # Target rules
173 | all: build
174 | 
175 | build: RNN
176 | 
177 | check.deps:
178 | ifeq ($(SAMPLE_ENABLED),0)
179 | 	@echo "Sample will be waived due to the above missing dependencies"
180 | else
181 | 	@echo "Sample is ready - all dependencies have been met"
182 | endif
183 | 
184 | OBJ = RNN_example.o
185 | 
186 | RNN: $(OBJ)
187 | 	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
188 | 
189 | %.o: %.cu
190 | 	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
191 | 
192 | run: build
193 | 	$(EXEC) ./RNN 100 4 512 64 2
194 | 
195 | clean:
196 | 	rm -rf *o
197 | 	rm -rf RNN
198 | 
199 | clobber: clean
200 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/RNN/compare.py:
--------------------------------------------------------------------------------
  1 | #This script can compare the result files with the golden files and report the status: pass or failed\
  2 | #Usage: python compare_result.py results.txt golden.txt
  3 | import os, sys, re
  4 | 
  5 | patterns = ['{key1}\s+checksum\s+([.eE+0-9]+)\s+{key2}\s+checksum\s+([.eE+0-9]+)\s+{key3}\s+checksum\s+([.eE+0-9]+)', #3 similar keys as below each line
  6 |             '{key1}\s+checksum\s+([.eE+0-9]+)\s+{key2}\s+checksum\s+([.eE+0-9]+)', #2 similar keys as below each line
  7 |             '{key}\s+checksum\s+([.eE+0-9]+)',   #one key each line: di checksum 6.676003E+01
  8 |             '{key}[: ]+([0-9]+)\s+GFLOPS[, ]+\\(([0-9]+)\s+GFLOPS\\)[, ]+\\(([0-9]+)\s+GFLOPS\\)', #1 key each line with more returns
  9 |             '{key}[: ]+([0-9]+)\s+GFLOPS']       #one key each line: Forward: 673 GFLOPS
 10 | #keys = [('i', 'c', 'h'), ('di', 'dc', 'dh'), ('i', 'h'), ('di', 'dh'), 'dw', 'Backward', 'Forward']
 11 | keys = [('i', 'c', 'h'), ('di', 'dc', 'dh'), ('i', 'h'), ('di', 'dh'), 'dw'] # skip the last 2 targets
 12 | pats = [0,0,1,1,2,3,4]
 13 | datnum = [len(k) if isinstance(k, tuple) else (3 if k == 'Backward' else 1) for k in keys]
 14 | #tol = 1.0e-3
 15 | def compare_results(ftarget, fgolden):
 16 |     assert ftarget and fgolden, 'No enough input files given!'
 17 |     print ftarget, fgolden
 18 |     targ, _ = get_results_from_file(ftarget)
 19 |     golden, tol = get_results_from_file(fgolden, golden=True)
 20 | 
 21 |     ret = 0
 22 |     assert targ and golden, 'targets or golen results not generated!'
 23 |     for k, vals in golden.iteritems():
 24 |         if not isinstance(vals, list):
 25 |             vals = [vals]
 26 |             targ[k] = [targ[k]]
 27 |         for idx, v in enumerate(vals):
 28 |             tval = float(targ[k][idx])
 29 |             gval = float(v)
 30 |             err = None
 31 |             if tol[k]['type'] == 'rel':
 32 |                 err = abs((tval-gval)/max(gval,tval)) # clamp rel_err <= 1
 33 |             elif tol[k]['type'] == 'abs':
 34 |                 err = abs(tval-gval)
 35 |             assert err is not None, 'Error is Empty!'
 36 |             tol_i = tol[k]['val']
 37 |             #print 'k,t,g,err',k,tval, gval, err
 38 |             if err > tol_i:
 39 |                 print 'FAILED %s=%s Error: %.2e vs. golden (%s) with tol (%.2e)'%(k, targ[k][idx], err, v, tol_i)
 40 |                 ret = 1
 41 |             else:
 42 |                 print 'PASSED %s=%s Error: %.2e vs. golden (%s) with tol (%.2e)'%(k, targ[k][idx], err, v, tol_i)
 43 |     if ret == 0:
 44 |         print 'ALL PASSED'
 45 |     return ret
 46 | 
 47 | def _get_tolerance_line(line):
 48 |     """get a data item for a tolerance line with format (each line only one item):
 49 |     i: type=rel, 1e-3
 50 |     """
 51 |     assert line, 'Empty line!'
 52 |     line = line.strip().replace(' ','')
 53 |     stmp = line.split(':')
 54 |     key = stmp[0]
 55 |     _type, _val = stmp[1].split(',')
 56 |     _type = _type.split('=')[-1]
 57 |     tol={key:{'type':_type, 'val':float(_val)}}
 58 |     return tol
 59 | 
 60 | def get_results_from_file(fname, golden=False):
 61 |     assert fname, 'No file name given!'
 62 |     ret = {}
 63 |     tol = {}
 64 |     is_tolerance = False
 65 |     with open(fname, 'r') as fin:
 66 |         lines = fin.readlines()
 67 |     if len(lines) == 1:
 68 |         lines = lines[0].split('\r')
 69 |     for idx, line in enumerate(lines):
 70 |         line = line.strip()
 71 |         if not line:
 72 |             continue
 73 |         val = get_valpat_line(line)
 74 |         if val:
 75 |             ret = dict(ret, **val)
 76 |         if golden:
 77 |             if 'TOLERANCE' in line: # the next line is the tol value
 78 |                 is_tolerance = True
 79 |             elif is_tolerance:
 80 |                 _tol = _get_tolerance_line(line)
 81 |                 tol = dict(tol, **_tol)
 82 | 
 83 |     return ret, tol
 84 | 
 85 | def get_valpat_line(line):
 86 |     for idx, key in enumerate(keys):
 87 |         Ndat = datnum[idx]
 88 |         if isinstance(key, tuple):
 89 |             format_expr = {}
 90 |             for j in range(Ndat):
 91 |                 format_expr['key%d'%(j+1)] = keys[idx][j]
 92 |             ret = re.search(patterns[pats[idx]].format(**format_expr), line)
 93 |             if ret:
 94 |                 vals = {}
 95 |                 for j in range(Ndat):
 96 |                     vals[key[j]] = ret.group(j+1)
 97 |                 return vals
 98 |         else:
 99 |             ret = re.search(patterns[pats[idx]].format(key=key), line)
100 |             if ret:
101 |                 if Ndat >1:
102 |                     #print Ndat, key, datnum, idx
103 |                     return {key:[ret.group(j+1) for j in range(Ndat)]}
104 |                 else:
105 |                     return {key:ret.group(1)}
106 |     return None
107 | 
def str_test():
    """Manual smoke test: print what get_valpat_line() extracts from samples.

    Each print call takes a single parenthesised argument, so the output is
    the same under Python 2 and the function also runs under Python 3.
    """
    s='Forward: 673 GFLOPS'
    s1='Backward: 835 GFLOPS, (654 GFLOPS), (1155 GFLOPS)'
    s2='i checksum 1.315793E+06 h checksum 1.315212E+05'
    s3='di checksum 6.676003E+01 dh checksum 6.425050E+01'
    s4='dw checksum 1.453750E+09'
    for sample in (s1, s, s2, s3, s4):
        print(get_valpat_line(sample))
if __name__ == '__main__':
    # Two file paths are required.  They are passed to compare_results() in
    # the order given, and its return value becomes the process exit status.
    if len(sys.argv) < 3:
        # Report the missing arguments instead of dying with an IndexError.
        sys.stderr.write('usage: %s FILE1 FILE2\n' % sys.argv[0])
        sys.exit(2)
    sys.exit(compare_results(sys.argv[1], sys.argv[2]))
124 | 
125 | 
126 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/RNN/golden_1.txt:
--------------------------------------------------------------------------------
 1 | ------------GOLDEN------------
 2 | Forward: 1250 GFLOPS
 3 | Backward: 1896 GFLOPS, (1299 GFLOPS), (3511 GFLOPS)
 4 | i checksum 1.315793E+06     h checksum 1.315212E+05
 5 | di checksum 6.676003E+01    dh checksum 6.425050E+01
 6 | dw checksum 1.453750E+09
 7 | -----------TOLERANCE-----------
 8 | Forward: type=rel, 1 
 9 | Backward: type=rel, 1
10 | i: type=rel, 1e-3
11 | h: type=rel, 1e-3
12 | di: type=rel, 1e-3
13 | dh: type=rel, 1e-3
14 | dw: type=rel, 1e-3
15 | 
16 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/RNN/golden_2.txt:
--------------------------------------------------------------------------------
 1 | ------------GOLDEN------------
 2 | Forward: 1225 GFLOPS
 3 | Backward: 1910 GFLOPS, (1299 GFLOPS), (3601 GFLOPS)
 4 | i checksum 6.319591E+05     h checksum 6.319605E+04
 5 | di checksum 4.501830E+00    dh checksum 4.489543E+00
 6 | dw checksum 5.012598E+07
 7 | -----------TOLERANCE-----------
 8 | Forward: type=rel, 1 
 9 | Backward: type=rel, 1
10 | i: type=rel, 1e-3
11 | h: type=rel, 1e-3
12 | di: type=rel, 1e-3
13 | dh: type=rel, 1e-3
14 | dw: type=rel, 1e-3
15 | 
16 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/RNN/golden_3.txt:
--------------------------------------------------------------------------------
 1 | ------------GOLDEN------------
 2 | Forward: 2569 GFLOPS
 3 | Backward: 2654 GFLOPS, (2071 GFLOPS), (3694 GFLOPS)
 4 | i checksum 5.749536E+05     c checksum 4.365091E+05     h checksum 5.774818E+04
 5 | di checksum 3.842206E+02    dc checksum 9.323785E+03    dh checksum 1.182562E+01
 6 | dw checksum 4.313461E+08
 7 | -----------TOLERANCE-----------
 8 | Forward: type=rel, 1 
 9 | Backward: type=rel, 1
10 | i: type=rel, 1e-3
11 | h: type=rel, 1e-3
12 | c: type=rel, 1e-3
13 | dc: type=rel, 1e-3
14 | di: type=rel, 1e-3
15 | dh: type=rel, 1e-3
16 | dw: type=rel, 1e-3
17 | 
18 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/RNN/golden_4.txt:
--------------------------------------------------------------------------------
 1 | ------------GOLDEN------------
 2 | Forward: 2310 GFLOPS
 3 | Backward: 2536 GFLOPS, (1955 GFLOPS), (3606 GFLOPS)
 4 | i checksum 6.358978E+05     h checksum 6.281680E+04
 5 | di checksum 6.296622E+00    dh checksum 2.289960E+05
 6 | dw checksum 5.397419E+07
 7 | -----------TOLERANCE-----------
 8 | Forward: type=rel, 1 
 9 | Backward: type=rel, 1
10 | i: type=rel, 1e-3
11 | h: type=rel, 1e-3
12 | di: type=rel, 1e-3
13 | dh: type=rel, 1e-3
14 | dw: type=rel, 1e-3
15 | 
16 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/conv_sample/Makefile:
--------------------------------------------------------------------------------
  1 | # Location of the CUDA Toolkit
  2 | CUDA_PATH ?= /usr/local/cuda
  3 | 
  4 | # architecture
  5 | HOST_ARCH   := $(shell uname -m)
  6 | TARGET_ARCH ?= $(HOST_ARCH)
  7 | 
  8 | # Adjust this for ARMv7 with a 32-bit filesystem
  9 | ifeq ($(TARGET_ARCH), aarch64)
 10 |     ifeq ($(shell file /sbin/init | grep 32-bit), 1)
 11 |         TARGET_ARCH=armv7l
 12 |     endif
 13 | endif
 14 |  
 15 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
 16 |     ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 17 |         ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
 18 |             TARGET_SIZE := 64
 19 |         else ifneq (,$(filter $(TARGET_ARCH),armv7l))
 20 |             TARGET_SIZE := 32
 21 |         endif
 22 |     else
 23 |         TARGET_SIZE := $(shell getconf LONG_BIT)
 24 |     endif
 25 | else
 26 |     $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
 27 | endif
 28 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 29 |     ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
 30 |         $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
 31 |     endif
 32 | endif
 33 | 
 34 | # operating system
 35 | HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
 36 | TARGET_OS ?= $(HOST_OS)
 37 | 
 38 | ifeq ($(TARGET_OS),QNX)
 39 | override TARGET_OS := qnx
 40 | endif
 41 | 
 42 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
 43 |     $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
 44 | endif
 45 | 
# host compiler
# Every assignment to HOST_COMPILER below uses ?=, so a value supplied by the
# caller is kept and these are only defaults.
ifeq ($(TARGET_OS),darwin)
    # Use clang++ when the major version printed by `xcodebuild -version` is >= 5.
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    # Cross compilation: pick the cross g++ matching the host/target pair.
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            # The QNX toolchain location must be provided by the caller.
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-g++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
# Fallback when none of the branches above selected a compiler.
HOST_COMPILER ?= g++
# nvcc is always invoked with the chosen host compiler as -ccbin.
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
 80 | 
# internal flags
# NVCCFLAGS go to nvcc directly; CCFLAGS and LDFLAGS are later forwarded to the
# host compiler and linker via -Xcompiler / -Xlinker.
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

# build flags
ifeq ($(TARGET_OS),darwin)
    LDFLAGS += -rpath $(CUDA_PATH)/lib
    CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
    LDFLAGS += -pie
    CCFLAGS += -fpie -fpic -fexceptions
endif

# When cross compiling for armv7l Linux with a target filesystem (TARGET_FS),
# point the compiler and linker at that sysroot.
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        ifneq ($(TARGET_FS),)
            # NOTE(review): expr compares non-integer operands as strings, so a
            # version such as "10.2" would presumably test as <= 4.6 - confirm.
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
    endif
endif

# Debug build flags
# `make dbg=1` adds host (-g) and device (-G) debug information.
ifeq ($(dbg),1)
      NVCCFLAGS += -g -G
      BUILD_TYPE := debug
else
      BUILD_TYPE := release
endif
120 | 
# Flags for every nvcc compile step: nvcc's own flags first, then each host
# compiler flag prefixed with -Xcompiler.
ALL_CCFLAGS := $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) \
               $(addprefix -Xcompiler ,$(CCFLAGS) $(EXTRA_CCFLAGS))

SAMPLE_ENABLED := 1

# Link flags: all compile flags, then each linker flag prefixed with -Xlinker.
ALL_LDFLAGS := $(ALL_CCFLAGS) \
               $(addprefix -Xlinker ,$(LDFLAGS) $(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
ifeq ($(TARGET_ARCH),ppc64le)
INCLUDES := -I$(CUDA_PATH)/targets/ppc64le-linux/include
else
INCLUDES := -I$(CUDA_PATH)/include
endif
LIBRARIES :=
141 | 
################################################################################

# Gencode arguments

# CUDA_VERSION is the third field of the `#define CUDA_VERSION` line in cuda.h.
# $(strip ...) keeps stray whitespace out of the value, so the comparisons
# below do not depend on both sides carrying the same trailing blank, and a
# CUDA_VERSION given on the command line compares correctly too.
ifneq ($(TARGET_ARCH), ppc64le)
CUDA_VERSION := $(strip $(shell grep "define CUDA_VERSION" $(CUDA_PATH)/include/cuda.h | awk '{print $$3}'))
else
CUDA_VERSION := $(strip $(shell grep "define CUDA_VERSION" $(CUDA_PATH)/targets/ppc64le-linux/include/cuda.h | awk '{print $$3}'))
endif

# SMS_VOLTA: extra SM versions appended to the default list.  It is forced
# empty for CUDA_VERSION 8000; otherwise a caller-provided value is kept (?=).
ifeq ($(strip $(CUDA_VERSION)),8000)
SMS_VOLTA =
else
ifneq ($(TARGET_ARCH), ppc64le)
ifeq ($(strip $(CUDA_VERSION)),9000)
SMS_VOLTA ?= 70
else
SMS_VOLTA ?= 70 72
endif
else
SMS_VOLTA ?= 70
endif
endif

SMS ?= 30 35 50 53 60 61 $(SMS_VOLTA)
$(warning "print CUDA version  $(CUDA_VERSION)")
$(warning "print sms new  $(SMS)")


ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
# ($(sort) orders words lexically; that equals numeric order only while every
# entry in SMS has the same number of digits.)
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
189 | 
#INCLUDES += -IFreeImage/include
# Libraries passed to the link step of conv_sample.
LIBRARIES += -lcudart -lcublas -lcudnn -lstdc++ -lm

# When the sample is waived, EXEC turns every recipe command into an echo of
# that command instead of running it (unless the caller already set EXEC).
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
196 | 
################################################################################

# Target rules
# None of these names is a file produced by the build.  Declaring them phony
# keeps a stray file called e.g. "clean" or "build" from disabling the target.
.PHONY: all build check.deps run clean clobber

all: build

build: conv_sample

# Report whether the sample was enabled by the configuration above.
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif

OBJ = fp16_dev.o fp16_emu.o conv_sample.o

# Link step: nvcc drives the host linker.
conv_sample: $(OBJ)
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

# Host-only sources are compiled directly with the host compiler.
%.o: %.cpp
	$(EXEC) $(HOST_COMPILER) $(INCLUDES) $(CCFLAGS) $(EXTRA_CCFLAGS) -o $@ -c $<

# CUDA sources go through nvcc.
%.o: %.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

run: build
	$(EXEC) ./conv_sample

# Remove build products only.  The glob is "*.o": the previous "*o" matched
# every file or directory whose name merely ends in the letter o.
clean:
	rm -f *.o
	rm -f conv_sample

clobber: clean
230 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/conv_sample/config_fermi_islip.icnt:
--------------------------------------------------------------------------------
 1 | //21*1 fly with 32 flits per packet under gpgpusim injection mode
 2 | use_map = 0;
 3 | flit_size = 40; 
 4 | 
 5 | // currently we do not use this, see subnets below
 6 | network_count = 2;
 7 | 
 8 | // Topology
 9 | topology = fly;
10 | k = 52;
11 | n = 1;
12 | 
13 | // Routing
14 | 
15 | routing_function = dest_tag;
16 | 
17 | // Flow control
18 | 
19 | num_vcs     = 1;
20 | vc_buf_size = 64;
21 | input_buffer_size = 256;
22 | ejection_buffer_size = 64;
23 | boundary_buffer_size = 64;
24 | 
25 | wait_for_tail_credit = 0;
26 | 
27 | // Router architecture
28 | 
29 | vc_allocator = islip; //separable_input_first;
30 | sw_allocator = islip; //separable_input_first;
31 | alloc_iters  = 1;
32 | 
33 | credit_delay   = 0;
34 | routing_delay  = 0;
35 | vc_alloc_delay = 1;
36 | sw_alloc_delay = 1;
37 | 
38 | input_speedup     = 1;
39 | output_speedup    = 1;
40 | internal_speedup  = 2.0;
41 | 
42 | // Traffic, GPGPU-Sim does not use this
43 | 
44 | traffic                = uniform;
45 | packet_size ={{1,2,3,4},{10,20}};
46 | packet_size_rate={{1,1,1,1},{2,1}};
47 | 
48 | // Simulation - Don't change
49 | 
50 | sim_type       = gpgpusim;
51 | //sim_type = latency;
52 | injection_rate = 0.1;
53 | 
54 | subnets = 2;
55 | 
56 | // Always use read and write no matter following line
57 | //use_read_write = 1;
58 | 
59 | 
60 | read_request_subnet = 0;
61 | read_reply_subnet = 1;
62 | write_request_subnet = 0;
63 | write_reply_subnet = 1;
64 | 
65 | read_request_begin_vc = 0;
66 | read_request_end_vc = 0;
67 | write_request_begin_vc = 0;
68 | write_request_end_vc = 0;
69 | read_reply_begin_vc = 0;
70 | read_reply_end_vc = 0;
71 | write_reply_begin_vc = 0;
72 | write_reply_end_vc = 0;
73 | 
74 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/conv_sample/error_util.h:
--------------------------------------------------------------------------------
  1 | /**
  2 | * Copyright 2014 NVIDIA Corporation.  All rights reserved.
  3 | *
  4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 | * with this source code for terms and conditions that govern your use of
  6 | * this software. Any use, reproduction, disclosure, or distribution of
  7 | * this software and related documentation outside the terms of the EULA
  8 | * is strictly prohibited.
  9 | *
 10 | */
 11 | 
 12 | #if !defined(_ERROR_UTIL_H_)
 13 | #define _ERROR_UTIL_H_
 14 | 
 15 | #include <sstream>
 16 | #include <stdlib.h>
 17 | #include <stdio.h>
 18 | #include <iostream>
 19 | 
 20 | #define TOSTR_(s)   #s
 21 | #define TOSTR(s)    TOSTR_(s)
 22 | #if defined(__GNUC__)
 23 | #define COMPILER_NAME "GCC"
 24 | #define COMPILER_VER  TOSTR(__GNUC__) "." TOSTR(__GNUC_MINOR__) "." TOSTR(__GNUC_PATCHLEVEL__)
 25 | #elif defined(_MSC_VER)
 26 | #if _MSC_VER < 1500
 27 | #define COMPILER_NAME "MSVC_2005"
 28 | #elif _MSC_VER < 1600
 29 | #define COMPILER_NAME "MSVC_2008"
 30 | #elif _MSC_VER < 1700
 31 | #define COMPILER_NAME "MSVC_2010"
 32 | #elif _MSC_VER < 1800
 33 | #define COMPILER_NAME "MSVC_2012"
 34 | #elif _MSC_VER < 1900
 35 | #define COMPILER_NAME "MSVC_2013"
 36 | #elif _MSC_VER < 2000
 37 | #define COMPILER_NAME "MSVC_2014"
 38 | #else
 39 | #define COMPILER_NAME "MSVC"
 40 | #endif
 41 | #define COMPILER_VER  TOSTR(_MSC_FULL_VER) "." TOSTR(_MSC_BUILD)
 42 | #elif defined(__clang_major__)
 43 | #define COMPILER_NAME "CLANG"
 44 | #define COMPILER_VER  TOSTR(__clang_major__ ) "." TOSTR(__clang_minor__) "." TOSTR(__clang_patchlevel__)
 45 | #elif defined(__INTEL_COMPILER)
 46 | #define COMPILER_NAME "ICC"
 47 | #define COMPILER_VER TOSTR(__INTEL_COMPILER) "." TOSTR(__INTEL_COMPILER_BUILD_DATE)
 48 | #else
 49 | #define COMPILER_NAME "unknown"
 50 | #define COMPILER_VER  "???"
 51 | #endif
 52 | 
 53 | #define CUDNN_VERSION_STR  TOSTR(CUDNN_MAJOR) "." TOSTR (CUDNN_MINOR) "." TOSTR(CUDNN_PATCHLEVEL)
 54 | 
 55 | #define FatalError(s) {                                                \
 56 |     std::stringstream _where, _message;                                \
 57 |     _where << __FILE__ << ':' << __LINE__;                             \
 58 |     _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__;\
 59 |     std::cerr << _message.str() << "\nAborting...\n";                  \
 60 |     cudaDeviceReset();                                                 \
 61 |     exit(EXIT_FAILURE);                                                \
 62 | }
 63 | 
 64 | #define checkCUDNN(status) {                                           \
 65 |     std::stringstream _error;                                          \
 66 |     if (status != CUDNN_STATUS_SUCCESS) {                              \
 67 |       _error << "CUDNN failure\nError: " << cudnnGetErrorString(status); \
 68 |       FatalError(_error.str());                                        \
 69 |     }                                                                  \
 70 | }
 71 | 
 72 | #define checkCudaErrors(status) {                                      \
 73 |     std::stringstream _error;                                          \
 74 |     if (status != 0) {                                                 \
 75 |       _error << "Cuda failure\nError: " << cudaGetErrorString(status); \
 76 |       FatalError(_error.str());                                        \
 77 |     }                                                                  \
 78 | }
 79 | 
 80 | #define checkCublasErrors(status) {                                    \
 81 |     std::stringstream _error;                                          \
 82 |     if (status != 0) {                                                 \
 83 |       _error << "Cublas failure\nError code " << status;        \
 84 |       FatalError(_error.str());                                        \
 85 |     }                                                                  \
 86 | }
 87 | 
 88 | // CUDA Utility Helper Functions
 89 | 
 90 | static void  showDevices( void )
 91 | {
 92 |     int totalDevices;
 93 |     checkCudaErrors(cudaGetDeviceCount( &totalDevices ));
 94 |     printf("\nThere are %d CUDA capable devices on your machine :\n", totalDevices);
 95 |     for (int i=0; i< totalDevices; i++) {
 96 |         struct cudaDeviceProp prop;
 97 |         checkCudaErrors(cudaGetDeviceProperties( &prop, i ));
 98 |         printf( "device %d : sms %2d  Capabilities %d.%d, SmClock %.1f Mhz, MemSize (Mb) %d, MemClock %.1f Mhz, Ecc=%d, boardGroupID=%d\n",
 99 |                     i, prop.multiProcessorCount, prop.major, prop.minor,
100 |                     (float)prop.clockRate*1e-3,
101 |                     (int)(prop.totalGlobalMem/(1024*1024)),
102 |                     (float)prop.memoryClockRate*1e-3,
103 |                     prop.ECCEnabled,
104 |                     prop.multiGpuBoardGroupID);
105 |     }
106 | } 
107 | 
// STRNCASECMP: case-insensitive, length-limited string comparison under one
// name on every platform.
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#ifndef _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_DEPRECATE
#endif
// <string.h> is needed here as well: the helpers below call strlen() and
// strchr(), and it declares _strnicmp().
#include <string.h>
#ifndef STRNCASECMP
#define STRNCASECMP _strnicmp
#endif
#else // Linux Includes
#include <string.h>
#include <strings.h>
#ifndef STRNCASECMP
#define STRNCASECMP strncasecmp
#endif
#endif
// Returns the index of the first character of `string` that is not
// `delimiter`.  Returns 0 instead when at most one character would remain
// after the leading delimiters (this includes the empty string).
inline int stringRemoveDelimiter(char delimiter, const char *string)
{
    int skipped = 0;
    for (; string[skipped] == delimiter; ++skipped)
    {
        // counting leading delimiters only
    }

    const int lastIndex = (int)strlen(string)-1;
    return (skipped >= lastIndex) ? 0 : skipped;
}
138 | 
139 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
140 | {
141 |     bool bFound = false;
142 | 
143 |     if (argc >= 1)
144 |     {
145 |         for (int i=1; i < argc; i++)
146 |         {
147 |             int string_start = stringRemoveDelimiter('-', argv[i]);
148 |             const char *string_argv = &argv[i][string_start];
149 | 
150 |             const char *equal_pos = strchr(string_argv, '=');
151 |             int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
152 | 
153 |             int length = (int)strlen(string_ref);
154 | 
155 |             if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
156 |             {
157 |                 bFound = true;
158 |                 continue;
159 |             }
160 |         }
161 |     }
162 | 
163 |     return bFound;
164 | }
165 | 
166 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
167 | {
168 |     bool bFound = false;
169 |     int value = -1;
170 | 
171 |     if (argc >= 1)
172 |     {
173 |         for (int i=1; i < argc; i++)
174 |         {
175 |             int string_start = stringRemoveDelimiter('-', argv[i]);
176 |             const char *string_argv = &argv[i][string_start];
177 |             int length = (int)strlen(string_ref);
178 | 
179 |             if (!STRNCASECMP(string_argv, string_ref, length))
180 |             {
181 |                 if (length+1 <= (int)strlen(string_argv))
182 |                 {
183 |                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
184 |                     value = atoi(&string_argv[length + auto_inc]);
185 |                 }
186 |                 else
187 |                 {
188 |                     value = 0;
189 |                 }
190 | 
191 |                 bFound = true;
192 |                 continue;
193 |             }
194 |         }
195 |     }
196 | 
197 |     if (bFound)
198 |     {
199 |         return value;
200 |     }
201 |     else
202 |     {
203 |         printf("Not found int\n");
204 |         return 0;
205 |     }
206 | }
207 | 
208 | inline bool getCmdLineArgumentString(const int argc, const char **argv,
209 |                                      const char *string_ref, char **string_retval)
210 | {
211 |     bool bFound = false;
212 | 
213 |     if (argc >= 1)
214 |     {
215 |         for (int i=1; i < argc; i++)
216 |         {
217 |             int string_start = stringRemoveDelimiter('-', argv[i]);
218 |             char *string_argv = (char *)&argv[i][string_start];
219 |             int length = (int)strlen(string_ref);
220 | 
221 |             if (!STRNCASECMP(string_argv, string_ref, length))
222 |             {
223 |                 *string_retval = &string_argv[length+1];
224 |                 bFound = true;
225 |                 continue;
226 |             }
227 |         }
228 |     }
229 | 
230 |     if (!bFound)
231 |     {
232 |         *string_retval = NULL;
233 |     }
234 | 
235 |     return bFound;
236 | }
237 | 
238 | #endif // _ERROR_UTIL_H_


--------------------------------------------------------------------------------
/cudnn_samples_v7/conv_sample/fp16_dev.cu:
--------------------------------------------------------------------------------
 1 | /**
 2 | * Copyright 2014 NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
 5 | * with this source code for terms and conditions that govern your use of
 6 | * this software. Any use, reproduction, disclosure, or distribution of
 7 | * this software and related documentation outside the terms of the EULA
 8 | * is strictly prohibited.
 9 | *
10 | */
11 | 
12 | #include "error_util.h"
13 | #include "fp16_dev.h"
14 | 
#define BLOCK_SIZE 128
// Element-wise conversion kernel: each thread converts one buffIn element to
// float and then to half with __float2half_rn, storing it in buffOut.
// The element index is computed with BLOCK_SIZE, so the launch must use
// BLOCK_SIZE threads per block.  Threads whose index is >= size do nothing.
template <class value_type>
__global__ void float2half_rn_kernel(int size, const value_type *buffIn, half1 *buffOut)
{
    const int element = BLOCK_SIZE*blockIdx.x+threadIdx.x;
    if (element < size) {
#if CUDART_VERSION < 9000
        // Before CUDA 9 the converted bits are stored through the .x member.
        half1 converted;
        converted.x = __float2half_rn(float(buffIn[element]));
#else
        half1 converted = __float2half_rn(float(buffIn[element]));
#endif
        buffOut[element] = converted;
    }
}
31 | 
// Converts `size` elements of buffIn to half precision in buffOut by
// launching float2half_rn_kernel, then waits for the kernel to finish.
// Both pointers are handed to the kernel unchanged.  Does nothing when
// size <= 0.  Any CUDA error aborts the process through checkCudaErrors.
template <class value_type>
void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut)
{
    if (size <= 0) {
        // Nothing to convert; a grid of zero blocks is not a valid launch.
        return;
    }

    const int grid_size = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
    float2half_rn_kernel<value_type><<<grid_size, BLOCK_SIZE>>> (size, buffIn, buffOut);
    // Errors detected at launch time are reported by cudaGetLastError();
    // errors raised while the kernel runs surface at the synchronize.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
}

// Explicit instantiations for the input types declared in fp16_dev.h.
template void gpu_float2half_rn<float> (int, const float*, half1*);
template void gpu_float2half_rn<double> (int, const double*, half1*);
42 | 
43 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/conv_sample/fp16_dev.h:
--------------------------------------------------------------------------------
 1 | /**
 2 | * Copyright 2014 NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
 5 | * with this source code for terms and conditions that govern your use of
 6 | * this software. Any use, reproduction, disclosure, or distribution of
 7 | * this software and related documentation outside the terms of the EULA
 8 | * is strictly prohibited.
 9 | *
10 | */
11 | 
12 | #if !defined(_FP16_DEV_H_)
13 | #define _FP16_DEV_H_
14 | 
15 | #include "fp16_emu.h"
16 | 
// Converts `size` elements of buffIn to half precision, writing them to
// buffOut.  Defined and explicitly instantiated in fp16_dev.cu for
// value_type = float and double.
template <class value_type>
void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut);
19 | 
20 | #endif // _FP16_DEV_H_
21 | 
22 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/conv_sample/fp16_emu.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NOTICE TO LICENSEE:
  5 |  *
  6 |  * This source code and/or documentation ("Licensed Deliverables") are
  7 |  * subject to NVIDIA intellectual property rights under U.S. and
  8 |  * international Copyright laws.
  9 |  *
 10 |  * These Licensed Deliverables contained herein is PROPRIETARY and
 11 |  * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 12 |  * conditions of a form of NVIDIA software license agreement by and
 13 |  * between NVIDIA and Licensee ("License Agreement") or electronically
 14 |  * accepted by Licensee.  Notwithstanding any terms or conditions to
 15 |  * the contrary in the License Agreement, reproduction or disclosure
 16 |  * of the Licensed Deliverables to any third party without the express
 17 |  * written consent of NVIDIA is prohibited.
 18 |  *
 19 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 20 |  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 21 |  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 22 |  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 23 |  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 24 |  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 25 |  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 26 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 27 |  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 28 |  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 29 |  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 30 |  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 31 |  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 32 |  * OF THESE LICENSED DELIVERABLES.
 33 |  *
 34 |  * U.S. Government End Users.  These Licensed Deliverables are a
 35 |  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 36 |  * 1995), consisting of "commercial computer software" and "commercial
 37 |  * computer software documentation" as such terms are used in 48
 38 |  * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 39 |  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 40 |  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 41 |  * U.S. Government End Users acquire the Licensed Deliverables with
 42 |  * only those rights set forth herein.
 43 |  *
 44 |  * Any use of the Licensed Deliverables in individual and commercial
 45 |  * software must include, in the user documentation and internal
 46 |  * comments to the code, the above Disclaimer and U.S. Government End
 47 |  * Users Notice.
 48 |  */
 49 |  
 50 | #include "fp16_emu.h" 
 51 | 
// Compile-time check: the array typedef gets size -1, which is ill-formed,
// when `cond` is false.
#define STATIC_ASSERT(cond) do { typedef char compile_time_assert[(cond) ? 1 : -1]; } while (0)

// Host functions for converting between FP32 and FP16 formats
// Paulius Micikevicius (pauliusm@nvidia.com)

// Converts a float to a 16-bit half bit pattern on the host, rounding to
// nearest even, and returns it as half1.
half1 cpu_float2half_rn(float f)
{
    // x: raw bits of f; u: the same bits with the sign bit cleared.
    unsigned x = *((int*)(void*)(&f));
    unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
    unsigned sign, exponent, mantissa;

    __half_raw hr;

    // Get rid of +NaN/-NaN case first.
    if (u > 0x7f800000) {
        hr.x = 0x7fffU;
        return reinterpret_cast<half1&>(hr);
    }
  
    // Move the float sign bit (bit 31) to the half sign position (bit 15).
    sign = ((x >> 16) & 0x8000);
  
    // Get rid of +Inf/-Inf, +0/-0.
    // Magnitudes above 0x477fefff become signed infinity (0x7c00) ...
    if (u > 0x477fefff) {
        hr.x = sign | 0x7c00U;
        return reinterpret_cast<half1&>(hr);
    }
    // ... and magnitudes below 0x33000001 become signed zero.
    if (u < 0x33000001) {
        hr.x = sign | 0x0000U;
        return reinterpret_cast<half1&>(hr);
    }

    exponent = ((u >> 23) & 0xff);
    mantissa = (u & 0x7fffff);

    if (exponent > 0x70) {
        // Result has a non-zero exponent field: drop 13 mantissa bits and
        // rebias the exponent by 0x70.
        shift = 13;
        exponent -= 0x70;
    } else {
        // Result has a zero exponent field: the shift grows as the exponent
        // shrinks and the implicit leading 1 is made explicit.
        shift = 0x7e - exponent;
        exponent = 0;
        mantissa |= 0x800000;
    }
    // lsb: weight of the last kept bit; lsb_s1: half of it; lsb_m1: mask of
    // the bits that are shifted out.
    lsb = (1 << shift);
    lsb_s1 = (lsb >> 1);
    lsb_m1 = (lsb - 1);
  
    // Round to nearest even.
    remainder = (mantissa & lsb_m1);
    mantissa >>= shift;
    if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
        ++mantissa;
        // The low 10 bits wrapped to zero: carry into the exponent.
        if (!(mantissa & 0x3ff)) {
            ++exponent;
            mantissa = 0;
        }
    }  

    // Assemble sign (bit 15), exponent (from bit 10) and mantissa.
    hr.x = (sign | (exponent << 10) | mantissa);  

    return reinterpret_cast<half1&>(hr);
}
113 | 
114 | 
// Converts a half1 value to float on the host by expanding its 16-bit
// pattern into the corresponding 32-bit pattern.
float cpu_half2float(half1 h)
{
    // The result is built in an int and reinterpreted as float below.
    STATIC_ASSERT(sizeof(int) == sizeof(float));

    __half_raw hr = reinterpret_cast<__half_raw&>(h);

    // Split the half into sign (bit 15), exponent (bits 10-14) and mantissa
    // (bits 0-9); the mantissa is moved up 13 bits to the float position.
    unsigned sign     = ((hr.x >> 15) & 1);
    unsigned exponent = ((hr.x >> 10) & 0x1f);
    unsigned mantissa = ((hr.x & 0x3ff) << 13);

    if (exponent == 0x1f) {  /* NaN or Inf */
        // NaN: the sign is cleared and every mantissa bit is set.
        // Inf: the mantissa stays 0 and the sign is kept.
        mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
        exponent = 0xff;
    } else if (!exponent) {  /* Denorm or Zero */
        if (mantissa) {
            unsigned int msb;
            exponent = 0x71;
            // Shift left until the bit tested at 0x400000 has been shifted
            // out, lowering the exponent by one per step.
            do {
                msb = (mantissa & 0x400000);
                mantissa <<= 1;  /* normalize */
                --exponent;
            } while (!msb);
            mantissa &= 0x7fffff;  /* 1.mantissa is implicit */
        }
    } else {
        // Non-zero exponent field: rebias by 0x70.
        exponent += 0x70;
    }

    int temp = ((sign << 31) | (exponent << 23) | mantissa);

    return reinterpret_cast<float&>(temp);
}
147 | 
148 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/conv_sample/fp16_emu.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NOTICE TO LICENSEE:
  5 |  *
  6 |  * This source code and/or documentation ("Licensed Deliverables") are
  7 |  * subject to NVIDIA intellectual property rights under U.S. and
  8 |  * international Copyright laws.
  9 |  *
 10 |  * These Licensed Deliverables contained herein is PROPRIETARY and
 11 |  * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 12 |  * conditions of a form of NVIDIA software license agreement by and
 13 |  * between NVIDIA and Licensee ("License Agreement") or electronically
 14 |  * accepted by Licensee.  Notwithstanding any terms or conditions to
 15 |  * the contrary in the License Agreement, reproduction or disclosure
 16 |  * of the Licensed Deliverables to any third party without the express
 17 |  * written consent of NVIDIA is prohibited.
 18 |  *
 19 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 20 |  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 21 |  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 22 |  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 23 |  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 24 |  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 25 |  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 26 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 27 |  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 28 |  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 29 |  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 30 |  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 31 |  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 32 |  * OF THESE LICENSED DELIVERABLES.
 33 |  *
 34 |  * U.S. Government End Users.  These Licensed Deliverables are a
 35 |  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 36 |  * 1995), consisting of "commercial computer software" and "commercial
 37 |  * computer software documentation" as such terms are used in 48
 38 |  * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 39 |  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 40 |  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 41 |  * U.S. Government End Users acquire the Licensed Deliverables with
 42 |  * only those rights set forth herein.
 43 |  *
 44 |  * Any use of the Licensed Deliverables in individual and commercial
 45 |  * software must include, in the user documentation and internal
 46 |  * comments to the code, the above Disclaimer and U.S. Government End
 47 |  * Users Notice.
 48 |  */
 49 | 
 50 | // Conversion from/to 16-bit floating point (half-precision).
 51 | 
 52 | #if !defined(_FP16_EMU_H_)
 53 | #define _FP16_EMU_H_
 54 | 
 55 | #include <driver_types.h>
 56 | #include <cuda_fp16.h>
 57 | 
 58 | // Necessary to ensure visibility of CUDART_VERSION macro
 59 | #include <cuda_runtime_api.h>
 60 | 
 61 | // Definition of '__half_raw' was not provided before CUDA 9.0.
 62 | // '__half_raw' is our type where the unsigned 16-bit integer 
 63 | // data member 'x' can be accessed in both CUDA 9.0 and 8.0.
 64 | #if CUDART_VERSION < 9000 
 65 | typedef __half __half_raw;
 66 | #endif
 67 | 
 68 | // Internally, in CUDNN we use half1 struct as the FP16 type.
 69 | typedef __half half1;
 70 | 
 71 | #define HLF_EPSILON 4.887581E-04
 72 | #define HLF_MIN     6.103516E-05
 73 | #define HLF_MAX     6.550400E+04
 74 | 
 75 | half1 cpu_float2half_rn(float f);
 76 | 
 77 | float cpu_half2float(half1 h);
 78 | 
 79 | static __inline__ __device__ __host__ half1 habs(half1 h)
 80 | {
 81 |     __half_raw hr = reinterpret_cast<__half_raw&>(h);
 82 |     hr.x &= 0x7fffU;
 83 |     return reinterpret_cast<half1&>(hr);
 84 | }
 85 | 
 86 | static __inline__ __device__ __host__ half1 hneg(half1 h)
 87 | {
 88 |     __half_raw hr = reinterpret_cast<__half_raw&>(h);
 89 |     hr.x ^= 0x8000U;
 90 |     return reinterpret_cast<half1&>(hr);
 91 | }
 92 | 
 93 | static __inline__ __device__ __host__ int ishnan(half1 h)
 94 | {
 95 |     // When input is NaN, exponent is all ones and mantissa is non-zero.
 96 |     __half_raw hr = reinterpret_cast<__half_raw&>(h);
 97 |     return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) != 0;
 98 | }
 99 | 
100 | static __inline__ __device__ __host__ int ishinf(half1 h)
101 | {
102 |     // When input is +/- inf, exponent is all ones and mantissa is zero.
103 |     __half_raw hr = reinterpret_cast<__half_raw&>(h);
104 |     return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) == 0;
105 | }
106 | 
107 | static __inline__ __device__ __host__ int ishequ(half1 x, half1 y)
108 | {
109 |     __half_raw xr = reinterpret_cast<__half_raw&>(x);
110 |     __half_raw yr = reinterpret_cast<__half_raw&>(y);
111 |     return ishnan(x) == 0 && ishnan(y) == 0 && xr.x == yr.x;
112 | }
113 | 
114 | // Returns 0.0000 in FP16 binary form
115 | static __inline__ __device__ __host__ half1 hzero()
116 | {
117 |     __half_raw hr;
118 |     hr.x = 0x0000U;
119 |     return reinterpret_cast<half1&>(hr);
120 | }
121 | 
122 | // Returns 1.0000 in FP16 binary form
123 | static __inline__ __device__ __host__ half1 hone()
124 | {
125 |     __half_raw hr;
126 |     hr.x = 0x3c00U;
127 |     return reinterpret_cast<half1&>(hr);
128 | }
129 | 
130 | // Returns quiet NaN, the most significant fraction bit #9 is set
131 | static __inline__ __device__ __host__ half1 hnan()
132 | {
133 |     __half_raw hr;
134 |     hr.x = 0x7e00U;
135 |     return reinterpret_cast<half1&>(hr);
136 | }
137 | 
138 | // Largest positive FP16 value, corresponds to 6.5504e+04
139 | static __inline__ __device__ __host__ half1 hmax()
140 | {
141 |     // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff)
142 |     __half_raw hr;
143 |     hr.x = 0x7bffU;
144 |     return reinterpret_cast<half1&>(hr);
145 | }
146 | 
147 | // Smallest positive (normalized) FP16 value, corresponds to 6.1035e-05
148 | static __inline__ __device__ __host__ half1 hmin()
149 | {
150 |     // Exponent is 0x01 (5 bits), mantissa is all zeros (10 bits)
151 |     __half_raw hr;
152 |     hr.x = 0x0400U;
153 |     return reinterpret_cast<half1&>(hr);
154 | }
155 | 
156 | #endif  // _FP16_EMU_H_
157 | 
158 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/conv_sample/gpgpusim.config:
--------------------------------------------------------------------------------
 1 | //21*1 fly with 32 flits per packet under gpgpusim injection mode
 2 | use_map = 0;
 3 | flit_size = 40; 
 4 | 
 5 | // currently we do not use this, see subnets below
 6 | network_count = 2;
 7 | 
 8 | // Topology
 9 | topology = fly;
10 | k = 52;
11 | n = 1;
12 | 
13 | // Routing
14 | 
15 | routing_function = dest_tag;
16 | 
17 | // Flow control
18 | 
19 | num_vcs     = 1;
20 | vc_buf_size = 64;
21 | input_buffer_size = 256;
22 | ejection_buffer_size = 64;
23 | boundary_buffer_size = 64;
24 | 
25 | wait_for_tail_credit = 0;
26 | 
27 | // Router architecture
28 | 
29 | vc_allocator = islip; //separable_input_first;
30 | sw_allocator = islip; //separable_input_first;
31 | alloc_iters  = 1;
32 | 
33 | credit_delay   = 0;
34 | routing_delay  = 0;
35 | vc_alloc_delay = 1;
36 | sw_alloc_delay = 1;
37 | 
38 | input_speedup     = 1;
39 | output_speedup    = 1;
40 | internal_speedup  = 2.0;
41 | 
42 | // Traffic, GPGPU-Sim does not use this
43 | 
44 | traffic                = uniform;
45 | packet_size ={{1,2,3,4},{10,20}};
46 | packet_size_rate={{1,1,1,1},{2,1}};
47 | 
48 | // Simulation - Don't change
49 | 
50 | sim_type       = gpgpusim;
51 | //sim_type = latency;
52 | injection_rate = 0.1;
53 | 
54 | subnets = 2;
55 | 
56 | // Read and write are always used, regardless of the following line
57 | //use_read_write = 1;
58 | 
59 | 
60 | read_request_subnet = 0;
61 | read_reply_subnet = 1;
62 | write_request_subnet = 0;
63 | write_reply_subnet = 1;
64 | 
65 | read_request_begin_vc = 0;
66 | read_request_end_vc = 0;
67 | write_request_begin_vc = 0;
68 | write_request_end_vc = 0;
69 | read_reply_begin_vc = 0;
70 | read_reply_end_vc = 0;
71 | write_reply_begin_vc = 0;
72 | write_reply_end_vc = 0;
73 | 
74 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/FreeImage/freeimage-license.txt:
--------------------------------------------------------------------------------
  1 | FreeImage Public License - Version 1.0
  2 | ---------------------------------------------
  3 | 
  4 | 1. Definitions.
  5 | 
  6 | 1.1. "Contributor" means each entity that creates or contributes to the creation of Modifications.
  7 | 
  8 | 1.2. "Contributor Version" means the combination of the Original Code, prior Modifications used by a Contributor, and the Modifications made by that particular Contributor.
  9 | 
 10 | 1.3. "Covered Code" means the Original Code or Modifications or the combination of the Original Code and Modifications, in each case including portions thereof.
 11 | 
 12 | 1.4. "Electronic Distribution Mechanism" means a mechanism generally accepted in the software development community for the electronic transfer of data.
 13 | 
 14 | 1.5. "Executable" means Covered Code in any form other than Source Code.
 15 | 
 16 | 1.6. "Initial Developer" means the individual or entity identified as the Initial Developer in the Source Code notice required by Exhibit A.
 17 | 
 18 | 1.7. "Larger Work" means a work which combines Covered Code or portions thereof with code not governed by the terms of this License.
 19 | 
 20 | 1.8. "License" means this document.
 21 | 
 22 | 1.9. "Modifications" means any addition to or deletion from the substance or structure of either the Original Code or any previous Modifications. When Covered Code is released as a series of files, a
 23 | Modification is:
 24 | 
 25 | A. Any addition to or deletion from the contents of a file containing Original Code or previous Modifications.
 26 | 
 27 | B. Any new file that contains any part of the Original Code or previous Modifications.
 28 | 
 29 | 1.10. "Original Code" means Source Code of computer software code which is described in the Source Code notice required by Exhibit A as Original Code, and which, at the time of its release under this License is not already Covered Code governed by this License.
 30 | 
 31 | 1.11. "Source Code" means the preferred form of the Covered Code for making modifications to it, including all modules it contains, plus any associated interface definition files, scripts used to control
 32 | compilation and installation of an Executable, or a list of source code differential comparisons against either the Original Code or another well known, available Covered Code of the Contributor's choice. The Source Code can be in a compressed or archival form, provided the appropriate decompression or de-archiving software is widely available for no charge.
 33 | 
 34 | 1.12. "You" means an individual or a legal entity exercising rights under, and complying with all of the terms of, this License or a future version of this License issued under Section 6.1. For legal entities, "You" includes any entity which controls, is controlled by, or is under common control with You. For purposes of this definition, "control" means (a) the power, direct or indirect, to cause the
 35 | direction or management of such entity, whether by contract or otherwise, or (b) ownership of fifty percent (50%) or more of the outstanding shares or beneficial ownership of such entity.
 36 | 
 37 | 2. Source Code License.
 38 | 
 39 | 2.1. The Initial Developer Grant.
 40 | The Initial Developer hereby grants You a world-wide, royalty-free, non-exclusive license, subject to third party intellectual property claims:
 41 | 
 42 | (a) to use, reproduce, modify, display, perform, sublicense and distribute the Original Code (or portions thereof) with or without Modifications, or as part of a Larger Work; and
 43 | 
 44 | (b) under patents now or hereafter owned or controlled by Initial Developer, to make, have made, use and sell ("Utilize") the Original Code (or portions thereof), but solely to the extent that
 45 | any such patent is reasonably necessary to enable You to Utilize the Original Code (or portions thereof) and not to any greater extent that may be necessary to Utilize further Modifications or
 46 | combinations.
 47 | 
 48 | 2.2. Contributor Grant.
 49 | Each Contributor hereby grants You a world-wide, royalty-free, non-exclusive license, subject to third party intellectual property claims:
 50 | 
 51 | (a) to use, reproduce, modify, display, perform, sublicense and distribute the Modifications created by such Contributor (or portions thereof) either on an unmodified basis, with other Modifications, as Covered Code or as part of a Larger Work; and
 52 | 
 53 | (b) under patents now or hereafter owned or controlled by Contributor, to Utilize the Contributor Version (or portions thereof), but solely to the extent that any such patent is reasonably necessary to enable You to Utilize the Contributor Version (or portions thereof), and not to any greater extent that
 54 | may be necessary to Utilize further Modifications or combinations.
 55 | 
 56 | 3. Distribution Obligations.
 57 | 
 58 | 3.1. Application of License.
 59 | The Modifications which You create or to which You contribute are governed by the terms of this License, including without limitation Section 2.2. The Source Code version of Covered Code may be distributed only under the terms of this License or a future version of this License released under Section 6.1, and You must include a copy of this License with every copy of the Source Code You distribute. You may not offer or impose any terms on any Source Code version that alters or
 60 | restricts the applicable version of this License or the recipients' rights hereunder. However, You may include an additional document offering the additional rights described in Section 3.5.
 61 | 
 62 | 3.2. Availability of Source Code.
 63 | Any Modification which You create or to which You contribute must be made available in Source Code form under the terms of this License either on the same media as an Executable version or via an accepted Electronic Distribution Mechanism to anyone to whom you made an Executable version available; and if made available via Electronic Distribution Mechanism, must remain available for at least twelve (12) months after the date it initially became available, or at least six (6) months after a subsequent version of that particular Modification has been made available to such recipients. You are responsible for ensuring that the Source Code version remains available even if the Electronic Distribution Mechanism is maintained by a third party.
 64 | 
 65 | 3.3. Description of Modifications.
 66 | You must cause all Covered Code to which you contribute to contain a file documenting the changes You made to create that Covered Code and the date of any change. You must include a prominent statement that the Modification is derived, directly or indirectly, from Original Code provided by the Initial Developer and including the name of the Initial Developer in (a) the Source Code, and (b) in any notice in an Executable version or related documentation in which You describe the origin or ownership of the Covered Code.
 67 | 
 68 | 3.4. Intellectual Property Matters
 69 | 
 70 | (a) Third Party Claims.
 71 | If You have knowledge that a party claims an intellectual property right in particular functionality or code (or its utilization under this License), you must include a text file with the source code distribution titled "LEGAL" which describes the claim and the party making the claim in sufficient detail that a recipient will know whom to contact. If you obtain such knowledge after You make Your Modification available as described in Section 3.2, You shall promptly modify the LEGAL file in all copies You make
 72 | available thereafter and shall take other steps (such as notifying appropriate mailing lists or newsgroups) reasonably calculated to inform those who received the Covered Code that new knowledge has been obtained.
 73 | 
 74 | (b) Contributor APIs.
 75 | If Your Modification is an application programming interface and You own or control patents which are reasonably necessary to implement that API, you must also include this information in the LEGAL file.
 76 | 
 77 | 3.5. Required Notices.
 78 | You must duplicate the notice in Exhibit A in each file of the Source Code, and this License in any documentation for the Source Code, where You describe recipients' rights relating to Covered Code. If You created one or more Modification(s), You may add your name as a Contributor to the notice described in Exhibit A. If it is not possible to put such notice in a particular Source Code file due to its
 79 | structure, then you must include such notice in a location (such as a relevant directory file) where a user would be likely to look for such a notice. You may choose to offer, and to charge a fee for, warranty, support, indemnity or liability obligations to one or more recipients of Covered Code. However, You may do so only on Your own behalf, and not on behalf of the Initial Developer or any Contributor. You must make it absolutely clear than any such warranty, support, indemnity or
 80 | liability obligation is offered by You alone, and You hereby agree to indemnify the Initial Developer and every Contributor for any liability incurred by the Initial Developer or such Contributor as a result of
 81 | warranty, support, indemnity or liability terms You offer.
 82 | 
 83 | 3.6. Distribution of Executable Versions.
 84 | You may distribute Covered Code in Executable form only if the requirements of Section 3.1-3.5 have been met for that Covered Code, and if You include a notice stating that the Source Code version of the Covered Code is available under the terms of this License, including a description of how and where You have fulfilled the obligations of Section 3.2. The notice must be conspicuously included in any notice in an Executable version, related documentation or collateral in which You
 85 | describe recipients' rights relating to the Covered Code. You may distribute the Executable version of Covered Code under a license of Your choice, which may contain terms different from this License,
 86 | provided that You are in compliance with the terms of this License and that the license for the Executable version does not attempt to limit or alter the recipient's rights in the Source Code version from the rights set forth in this License. If You distribute the Executable version under a different license You must make it absolutely clear that any terms which differ from this License are offered by You alone, not by the Initial Developer or any Contributor. You hereby agree to indemnify the Initial Developer and every Contributor for any liability incurred by the Initial Developer or such Contributor as a result of any such terms You offer.
 87 | 
 88 | 3.7. Larger Works.
 89 | You may create a Larger Work by combining Covered Code with other code not governed by the terms of this License and distribute the Larger Work as a single product. In such a case, You must make sure the requirements of this License are fulfilled for the Covered Code.
 90 | 
 91 | 4. Inability to Comply Due to Statute or Regulation.
 92 | 
 93 | If it is impossible for You to comply with any of the terms of this License with respect to some or all of the Covered Code due to statute or regulation then You must: (a) comply with the terms of this License to the maximum extent possible; and (b) describe the limitations and the code they affect. Such description must be included in the LEGAL file described in Section 3.4 and must be included with all distributions of the Source Code. Except to the extent prohibited by statute or regulation, such description must be sufficiently detailed for a recipient of ordinary skill to be able to understand it.
 94 | 
 95 | 5. Application of this License.
 96 | 
 97 | This License applies to code to which the Initial Developer has attached the notice in Exhibit A, and to related Covered Code.
 98 | 
 99 | 6. Versions of the License.
100 | 
101 | 6.1. New Versions.
102 | Floris van den Berg may publish revised and/or new versions of the License from time to time. Each version will be given a distinguishing version number.
103 | 
104 | 6.2. Effect of New Versions.
105 | Once Covered Code has been published under a particular version of the License, You may always continue to use it under the terms of that version. You may also choose to use such Covered Code under the terms of any subsequent version of the License published by Floris van den Berg
106 | No one other than Floris van den Berg has the right to modify the terms applicable to Covered Code created under this License.
107 | 
108 | 6.3. Derivative Works.
109 | If you create or use a modified version of this License (which you may only do in order to apply it to code which is not already Covered Code governed by this License), you must (a) rename Your license so that the phrases "FreeImage", `FreeImage Public License", "FIPL", or any confusingly similar phrase do not appear anywhere in your license and (b) otherwise make it clear that your version of the license contains terms which differ from the FreeImage Public License. (Filling in the name of the Initial Developer, Original Code or Contributor in the notice described in Exhibit A shall not of themselves be deemed to be modifications of this License.)
110 | 
111 | 7. DISCLAIMER OF WARRANTY.
112 | 
113 | COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER.
114 | 
115 | 8. TERMINATION.
116 | 
117 | This License and the rights granted hereunder will terminate automatically if You fail to comply with terms herein and fail to cure such breach within 30 days of becoming aware of the breach. All sublicenses to the Covered Code which are properly granted shall survive any termination of this License. Provisions which, by their nature, must remain in effect beyond the termination of this License shall survive.
118 | 
119 | 9. LIMITATION OF LIABILITY.
120 | 
121 | UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL THE INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO YOU OR ANY OTHER PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE
122 | EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THAT EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU.
123 | 
124 | 10. U.S. GOVERNMENT END USERS.
125 | 
126 | The Covered Code is a "commercial item," as that term is defined in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer software" and "commercial computer software documentation," as such terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all U.S. Government End Users acquire Covered Code with only those rights set forth herein.
127 | 
128 | 11. MISCELLANEOUS.
129 | 
130 | This License represents the complete agreement concerning subject matter hereof. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. This License shall be governed by Dutch law provisions (except to the extent applicable law, if any, provides otherwise), excluding its conflict-of-law provisions. With respect to disputes in which at least one party is a citizen of, or an entity chartered or registered to do business in, the The Netherlands: (a) unless otherwise agreed in writing, all disputes relating to this License (excepting any dispute relating to intellectual property rights) shall be subject to final and binding arbitration, with the losing party paying all costs of arbitration; (b) any arbitration relating to this Agreement shall be held in Almelo, The Netherlands; and (c) any litigation relating to this Agreement shall be subject to the jurisdiction of the court of Almelo, The Netherlands with the losing party responsible for costs, including without limitation, court costs and reasonable attorneys fees and expenses. Any law or regulation which provides that the language of a contract shall be construed against the drafter shall not apply to this License.
131 | 
132 | 12. RESPONSIBILITY FOR CLAIMS.
133 | 
134 | Except in cases where another Contributor has failed to comply with Section 3.4, You are responsible for damages arising, directly or indirectly, out of Your utilization of rights under this License, based
135 | on the number of copies of Covered Code you made available, the revenues you received from utilizing such rights, and other relevant factors. You agree to work with affected parties to distribute
136 | responsibility on an equitable basis.
137 | 
138 | EXHIBIT A.
139 | 
140 | "The contents of this file are subject to the FreeImage Public License Version 1.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://home.wxs.nl/~flvdberg/freeimage-license.txt
141 | 
142 | Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific language governing rights and limitations under the License. 


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/FreeImage/include/FreeImage.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/FreeImage/include/FreeImage.h


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/Makefile:
--------------------------------------------------------------------------------
  1 | # Location of the CUDA Toolkit
  2 | CUDA_PATH ?= /usr/local/cuda
  3 | 
  4 | # architecture
  5 | HOST_ARCH   := $(shell uname -m)
  6 | TARGET_ARCH ?= $(HOST_ARCH)
  7 | 
  8 | # Adjust this for ARMv7 with a 32-bit filesystem
  9 | ifeq ($(TARGET_ARCH), aarch64)
 10 |     ifeq ($(shell file /sbin/init | grep 32-bit), 1)
 11 |         TARGET_ARCH=armv7l
 12 |     endif
 13 | endif
 14 |  
 15 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
 16 |     ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 17 |         ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
 18 |             TARGET_SIZE := 64
 19 |         else ifneq (,$(filter $(TARGET_ARCH),armv7l))
 20 |             TARGET_SIZE := 32
 21 |         endif
 22 |     else
 23 |         TARGET_SIZE := $(shell getconf LONG_BIT)
 24 |     endif
 25 | else
 26 |     $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
 27 | endif
 28 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 29 |     ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
 30 |         $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
 31 |     endif
 32 | endif
 33 | 
 34 | # operating system
 35 | HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
 36 | TARGET_OS ?= $(HOST_OS)
 37 | 
 38 | ifeq ($(TARGET_OS),QNX)
 39 | override TARGET_OS := qnx
 40 | endif
 41 | 
 42 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
 43 |     $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
 44 | endif
 45 | 
# host compiler
# Every assignment below uses `?=`, so a HOST_COMPILER supplied by the caller
# always wins; otherwise the first matching case sets the default.
ifeq ($(TARGET_OS),darwin)
    # On macOS, default to clang++ when the major Xcode version is 5 or newer
    # (`expr ... \>= 5` prints 1 when the comparison holds).
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    # Cross-compilation: pick the cross g++ for the host/target/OS combination.
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            # The QNX toolchain is located through QNX_HOST and QNX_TARGET;
            # both must be non-empty and are exported to the commands make runs.
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-g++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
# Fallback when none of the cases above chose a compiler.
HOST_COMPILER ?= g++
# nvcc is taken from the toolkit and told (via -ccbin) which host compiler to use.
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
 80 | 
 81 | # internal flags
 82 | NVCCFLAGS   := -m${TARGET_SIZE}
 83 | CCFLAGS     :=
 84 | LDFLAGS     :=
 85 | 
 86 | # build flags
 87 | ifeq ($(TARGET_OS),darwin)
 88 |     LDFLAGS += -rpath $(CUDA_PATH)/lib
 89 |     CCFLAGS += -arch $(HOST_ARCH)
 90 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
 91 |     LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
 92 |     CCFLAGS += -mfloat-abi=hard
 93 | else ifeq ($(TARGET_OS),android)
 94 |     LDFLAGS += -pie
 95 |     CCFLAGS += -fpie -fpic -fexceptions
 96 | endif
 97 | 
 98 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 99 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
100 |         ifneq ($(TARGET_FS),)
101 |             GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
102 |             ifeq ($(GCCVERSIONLTEQ46),1)
103 |                 CCFLAGS += --sysroot=$(TARGET_FS)
104 |             endif
105 |             LDFLAGS += --sysroot=$(TARGET_FS)
106 |             LDFLAGS += -rpath-link=$(TARGET_FS)/lib
107 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
108 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
109 |         endif
110 |     endif
111 | endif
112 | 
# Build type: release by default; `dbg=1` switches to a debug build and adds
# host (-g) and device (-G) debug information to the nvcc flags.
BUILD_TYPE := release
ifeq ($(dbg),1)
    BUILD_TYPE := debug
    NVCCFLAGS += -g -G
endif
120 | 
# Complete compile flags for nvcc: its own flags first, then every host
# compiler flag forwarded individually through -Xcompiler.
ALL_CCFLAGS := $(NVCCFLAGS) $(EXTRA_NVCCFLAGS) \
               $(addprefix -Xcompiler ,$(CCFLAGS)) \
               $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

# The sample is assumed buildable until a dependency check says otherwise.
SAMPLE_ENABLED := 1

# Complete link flags: the compile flags plus every linker flag forwarded
# individually through -Xlinker.
ALL_LDFLAGS := $(ALL_CCFLAGS) \
               $(addprefix -Xlinker ,$(LDFLAGS)) \
               $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
133 | 
# CUDA header location: ppc64le uses the per-target include directory of the
# toolkit, every other architecture uses the top-level one.
ifeq ($(TARGET_ARCH),ppc64le)
    INCLUDES := -I$(CUDA_PATH)/targets/ppc64le-linux/include
else
    INCLUDES := -I$(CUDA_PATH)/include
endif
# Library flags start empty and are appended to below.
LIBRARIES :=
141 | 
142 | ################################################################################
143 | 
# Gencode arguments
# SM architectures to build for; may be overridden from the command line or
# the environment because of the conditional assignment.
SMS ?= 30 35 50 53

# An explicitly empty SMS disables the sample instead of failing the build.
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif

# GENCODE_FLAGS is only derived from SMS when the caller has not supplied it.
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
# NOTE(review): $(sort) presumably orders words lexically, so "highest" is only
# reliable while every SM value has the same number of digits - confirm.
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
162 | 
# Sample-specific header path and the libraries the executable links against.
INCLUDES += -IFreeImage/include
LIBRARIES += -LFreeImage/lib/$(TARGET_OS)/$(TARGET_ARCH) -LFreeImage/lib/$(TARGET_OS) -lcudart -lcublas -lcudnn -lfreeimage -lstdc++ -lm

# Attempt to compile a minimal application linked against FreeImage. If a.out exists, FreeImage is properly set up.
# The probe runs while the makefile is being parsed (on every invocation) and
# creates test.c and a.out in the current directory before removing them again.
$(shell echo "#include \"FreeImage.h\"" > test.c; echo "int main() { return 0; }" >> test.c ; $(NVCC) $(ALL_CCFLAGS) $(INCLUDES) $(LIBRARIES) -l freeimage test.c)
FREEIMAGE := $(shell find a.out 2>/dev/null)
$(shell rm a.out test.c 2>/dev/null)

# No a.out means the probe failed to build: warn and disable the sample.
ifeq ("$(FREEIMAGE)","")
$(info >>> WARNING - FreeImage is not set up correctly. Please ensure FreeImage is set up correctly. <<<)
SAMPLE_ENABLED := 0
endif

# With the sample disabled, EXEC turns every recipe line that starts with
# $(EXEC) into a silent echo of the command instead of running it.
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
179 | 
180 | ################################################################################
181 | 
# Target rules
# These names are commands rather than files; declaring them phony keeps a
# stray file called "all", "build" or "check.deps" from masking the rule.
.PHONY: all build check.deps

# Default goal: build the sample.
all: build

build: mnistCUDNN

# Reports whether the dependency checks above left the sample enabled.
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif
193 | 
# Object files that make up the executable.
OBJ = fp16_dev.o fp16_emu.o mnistCUDNN.o

# Link step: nvcc drives the link so the CUDA runtime options are applied.
mnistCUDNN: $(OBJ)
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

# Plain C++ sources are compiled directly with the host compiler.
%.o: %.cpp
	$(EXEC) $(HOST_COMPILER) $(INCLUDES) $(CCFLAGS) $(EXTRA_CCFLAGS) -o $@ -c $<

# CUDA sources go through nvcc with the gencode options.
%.o: %.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
204 | 
# None of these targets produces a file of the same name.
.PHONY: run clean clobber

# Build the sample if necessary, then execute it.
run: build
	$(EXEC) ./mnistCUDNN

# Remove build products only. The pattern is "*.o" (object files); the former
# "*o" also matched every other name ending in the letter "o".
clean:
	rm -f *.o
	rm -f mnistCUDNN

clobber: clean
213 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/data/conv1.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/conv1.bias.bin


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/data/conv1.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/conv1.bin


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/data/conv2.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/conv2.bias.bin


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/data/conv2.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/conv2.bin


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/data/five_28x28.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/five_28x28.pgm


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/data/ip1.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/ip1.bias.bin


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/data/ip1.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/ip1.bin


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/data/ip2.bias.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/ip2.bias.bin


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/data/ip2.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/ip2.bin


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/data/one_28x28.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/one_28x28.pgm


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/data/three_28x28.pgm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/three_28x28.pgm


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/error_util.h:
--------------------------------------------------------------------------------
  1 | /**
  2 | * Copyright 2014 NVIDIA Corporation.  All rights reserved.
  3 | *
  4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 | * with this source code for terms and conditions that govern your use of
  6 | * this software. Any use, reproduction, disclosure, or distribution of
  7 | * this software and related documentation outside the terms of the EULA
  8 | * is strictly prohibited.
  9 | *
 10 | */
 11 | 
 12 | #if !defined(_ERROR_UTIL_H_)
 13 | #define _ERROR_UTIL_H_
 14 | 
 15 | #include <sstream>
 16 | #include <stdlib.h>
 17 | #include <stdio.h>
 18 | #include <iostream>
 19 | 
 20 | #define TOSTR_(s)   #s
 21 | #define TOSTR(s)    TOSTR_(s)
 22 | #if defined(__GNUC__)
 23 | #define COMPILER_NAME "GCC"
 24 | #define COMPILER_VER  TOSTR(__GNUC__) "." TOSTR(__GNUC_MINOR__) "." TOSTR(__GNUC_PATCHLEVEL__)
 25 | #elif defined(_MSC_VER)
 26 | #if _MSC_VER < 1500
 27 | #define COMPILER_NAME "MSVC_2005"
 28 | #elif _MSC_VER < 1600
 29 | #define COMPILER_NAME "MSVC_2008"
 30 | #elif _MSC_VER < 1700
 31 | #define COMPILER_NAME "MSVC_2010"
 32 | #elif _MSC_VER < 1800
 33 | #define COMPILER_NAME "MSVC_2012"
 34 | #elif _MSC_VER < 1900
 35 | #define COMPILER_NAME "MSVC_2013"
 36 | #elif _MSC_VER < 2000
 37 | #define COMPILER_NAME "MSVC_2014"
 38 | #else
 39 | #define COMPILER_NAME "MSVC"
 40 | #endif
 41 | #define COMPILER_VER  TOSTR(_MSC_FULL_VER) "." TOSTR(_MSC_BUILD)
 42 | #elif defined(__clang_major__)
 43 | #define COMPILER_NAME "CLANG"
 44 | #define COMPILER_VER  TOSTR(__clang_major__ ) "." TOSTR(__clang_minor__) "." TOSTR(__clang_patchlevel__)
 45 | #elif defined(__INTEL_COMPILER)
 46 | #define COMPILER_NAME "ICC"
 47 | #define COMPILER_VER TOSTR(__INTEL_COMPILER) "." TOSTR(__INTEL_COMPILER_BUILD_DATE)
 48 | #else
 49 | #define COMPILER_NAME "unknown"
 50 | #define COMPILER_VER  "???"
 51 | #endif
 52 | 
 53 | #define CUDNN_VERSION_STR  TOSTR(CUDNN_MAJOR) "." TOSTR (CUDNN_MINOR) "." TOSTR(CUDNN_PATCHLEVEL)
 54 | 
 55 | #define FatalError(s) {                                                \
 56 |     std::stringstream _where, _message;                                \
 57 |     _where << __FILE__ << ':' << __LINE__;                             \
 58 |     _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__;\
 59 |     std::cerr << _message.str() << "\nAborting...\n";                  \
 60 |     cudaDeviceReset();                                                 \
 61 |     exit(EXIT_FAILURE);                                                \
 62 | }
 63 | 
 64 | #define checkCUDNN(status) {                                           \
 65 |     std::stringstream _error;                                          \
 66 |     if (status != CUDNN_STATUS_SUCCESS) {                              \
 67 |       _error << "CUDNN failure\nError: " << cudnnGetErrorString(status); \
 68 |       FatalError(_error.str());                                        \
 69 |     }                                                                  \
 70 | }
 71 | 
 72 | #define checkCudaErrors(status) {                                      \
 73 |     std::stringstream _error;                                          \
 74 |     if (status != 0) {                                                 \
 75 |       _error << "Cuda failure\nError: " << cudaGetErrorString(status); \
 76 |       FatalError(_error.str());                                        \
 77 |     }                                                                  \
 78 | }
 79 | 
 80 | #define checkCublasErrors(status) {                                    \
 81 |     std::stringstream _error;                                          \
 82 |     if (status != 0) {                                                 \
 83 |       _error << "Cublas failure\nError code " << status;        \
 84 |       FatalError(_error.str());                                        \
 85 |     }                                                                  \
 86 | }
 87 | 
 88 | // CUDA Utility Helper Functions
 89 | 
 90 | static void  showDevices( void )
 91 | {
 92 |     int totalDevices;
 93 |     checkCudaErrors(cudaGetDeviceCount( &totalDevices ));
 94 |     printf("\nThere are %d CUDA capable devices on your machine :\n", totalDevices);
 95 |     for (int i=0; i< totalDevices; i++) {
 96 |         struct cudaDeviceProp prop;
 97 |         checkCudaErrors(cudaGetDeviceProperties( &prop, i ));
 98 |         printf( "device %d : sms %2d  Capabilities %d.%d, SmClock %.1f Mhz, MemSize (Mb) %d, MemClock %.1f Mhz, Ecc=%d, boardGroupID=%d\n",
 99 |                     i, prop.multiProcessorCount, prop.major, prop.minor,
100 |                     (float)prop.clockRate*1e-3,
101 |                     (int)(prop.totalGlobalMem/(1024*1024)),
102 |                     (float)prop.memoryClockRate*1e-3,
103 |                     prop.ECCEnabled,
104 |                     prop.multiGpuBoardGroupID);
105 |     }
106 | } 
107 | 
// The helpers below call strlen/strchr on every platform, and _strnicmp is
// declared in <string.h> as well, so that header is included unconditionally
// rather than only on the non-Windows branch.
#include <string.h>
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#ifndef _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_DEPRECATE
#endif
// Case-insensitive, length-limited string comparison (Windows spelling).
#ifndef STRNCASECMP
#define STRNCASECMP _strnicmp
#endif
#else // Linux Includes
#include <strings.h>
// Case-insensitive, length-limited string comparison (POSIX spelling).
#ifndef STRNCASECMP
#define STRNCASECMP strncasecmp
#endif
#endif
// Returns the number of leading `delimiter` characters in `string`, i.e. the
// index of the first character after them. Returns 0 instead when that index
// is not strictly before the last character of the string (for example for
// "-a" or "---"), so such arguments are left untouched by callers.
inline int stringRemoveDelimiter(char delimiter, const char *string)
{
    int skipped = 0;
    for (; string[skipped] == delimiter; ++skipped)
    {
        // just counting the leading delimiters
    }

    const int last_index = (int)strlen(string)-1;
    return (skipped >= last_index) ? 0 : skipped;
}
138 | 
139 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
140 | {
141 |     bool bFound = false;
142 | 
143 |     if (argc >= 1)
144 |     {
145 |         for (int i=1; i < argc; i++)
146 |         {
147 |             int string_start = stringRemoveDelimiter('-', argv[i]);
148 |             const char *string_argv = &argv[i][string_start];
149 | 
150 |             const char *equal_pos = strchr(string_argv, '=');
151 |             int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
152 | 
153 |             int length = (int)strlen(string_ref);
154 | 
155 |             if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
156 |             {
157 |                 bFound = true;
158 |                 continue;
159 |             }
160 |         }
161 |     }
162 | 
163 |     return bFound;
164 | }
165 | 
166 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
167 | {
168 |     bool bFound = false;
169 |     int value = -1;
170 | 
171 |     if (argc >= 1)
172 |     {
173 |         for (int i=1; i < argc; i++)
174 |         {
175 |             int string_start = stringRemoveDelimiter('-', argv[i]);
176 |             const char *string_argv = &argv[i][string_start];
177 |             int length = (int)strlen(string_ref);
178 | 
179 |             if (!STRNCASECMP(string_argv, string_ref, length))
180 |             {
181 |                 if (length+1 <= (int)strlen(string_argv))
182 |                 {
183 |                     int auto_inc = (string_argv[length] == '=') ? 1 : 0;
184 |                     value = atoi(&string_argv[length + auto_inc]);
185 |                 }
186 |                 else
187 |                 {
188 |                     value = 0;
189 |                 }
190 | 
191 |                 bFound = true;
192 |                 continue;
193 |             }
194 |         }
195 |     }
196 | 
197 |     if (bFound)
198 |     {
199 |         return value;
200 |     }
201 |     else
202 |     {
203 |         printf("Not found int\n");
204 |         return 0;
205 |     }
206 | }
207 | 
208 | inline bool getCmdLineArgumentString(const int argc, const char **argv,
209 |                                      const char *string_ref, char **string_retval)
210 | {
211 |     bool bFound = false;
212 | 
213 |     if (argc >= 1)
214 |     {
215 |         for (int i=1; i < argc; i++)
216 |         {
217 |             int string_start = stringRemoveDelimiter('-', argv[i]);
218 |             char *string_argv = (char *)&argv[i][string_start];
219 |             int length = (int)strlen(string_ref);
220 | 
221 |             if (!STRNCASECMP(string_argv, string_ref, length))
222 |             {
223 |                 *string_retval = &string_argv[length+1];
224 |                 bFound = true;
225 |                 continue;
226 |             }
227 |         }
228 |     }
229 | 
230 |     if (!bFound)
231 |     {
232 |         *string_retval = NULL;
233 |     }
234 | 
235 |     return bFound;
236 | }
237 | 
238 | #endif // _ERROR_UTIL_H_


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/fp16_dev.cu:
--------------------------------------------------------------------------------
 1 | /**
 2 | * Copyright 2014 NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
 5 | * with this source code for terms and conditions that govern your use of
 6 | * this software. Any use, reproduction, disclosure, or distribution of
 7 | * this software and related documentation outside the terms of the EULA
 8 | * is strictly prohibited.
 9 | *
10 | */
11 | 
12 | #include "error_util.h"
13 | #include "fp16_dev.h"
14 | 
15 | #define BLOCK_SIZE 128
16 | template <class value_type>
17 | __global__ void float2half_rn_kernel(int size, const value_type *buffIn, half1 *buffOut)
18 | {
19 |     const int idx = BLOCK_SIZE*blockIdx.x+threadIdx.x;
20 |     if (idx >= size) {
21 |         return;
22 |     }
23 | #if CUDART_VERSION < 9000
24 |     half1 val;
25 |     val.x = __float2half_rn(float(buffIn[idx]));
26 | #else
27 |     half1 val = __float2half_rn(float(buffIn[idx]));
28 | #endif
29 |     buffOut[idx] = val;
30 | }
31 | 
32 | template <class value_type>
33 | void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut)
34 | {
35 |     int grid_size = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
36 |     float2half_rn_kernel<value_type><<<grid_size, BLOCK_SIZE>>> (size, buffIn, buffOut);
37 |     checkCudaErrors(cudaDeviceSynchronize());
38 | }
39 | 
40 | template void gpu_float2half_rn<float> (int, const float*, half1*);
41 | template void gpu_float2half_rn<double> (int, const double*, half1*);
42 | 
43 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/fp16_dev.h:
--------------------------------------------------------------------------------
 1 | /**
 2 | * Copyright 2014 NVIDIA Corporation.  All rights reserved.
 3 | *
 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
 5 | * with this source code for terms and conditions that govern your use of
 6 | * this software. Any use, reproduction, disclosure, or distribution of
 7 | * this software and related documentation outside the terms of the EULA
 8 | * is strictly prohibited.
 9 | *
10 | */
11 | 
#if !defined(_FP16_DEV_H_)
#define _FP16_DEV_H_

#include "fp16_emu.h"

// Converts `size` elements of buffIn to half1 values stored in buffOut.
// Only declared here; the definition and its explicit instantiations live in
// fp16_dev.cu, where both buffers are accessed from a CUDA kernel.
template <class value_type>
void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut);

#endif // _FP16_DEV_H_
21 | 
22 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/fp16_emu.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NOTICE TO LICENSEE:
  5 |  *
  6 |  * This source code and/or documentation ("Licensed Deliverables") are
  7 |  * subject to NVIDIA intellectual property rights under U.S. and
  8 |  * international Copyright laws.
  9 |  *
 10 |  * These Licensed Deliverables contained herein is PROPRIETARY and
 11 |  * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 12 |  * conditions of a form of NVIDIA software license agreement by and
 13 |  * between NVIDIA and Licensee ("License Agreement") or electronically
 14 |  * accepted by Licensee.  Notwithstanding any terms or conditions to
 15 |  * the contrary in the License Agreement, reproduction or disclosure
 16 |  * of the Licensed Deliverables to any third party without the express
 17 |  * written consent of NVIDIA is prohibited.
 18 |  *
 19 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 20 |  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 21 |  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 22 |  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 23 |  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 24 |  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 25 |  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 26 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 27 |  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 28 |  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 29 |  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 30 |  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 31 |  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 32 |  * OF THESE LICENSED DELIVERABLES.
 33 |  *
 34 |  * U.S. Government End Users.  These Licensed Deliverables are a
 35 |  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 36 |  * 1995), consisting of "commercial computer software" and "commercial
 37 |  * computer software documentation" as such terms are used in 48
 38 |  * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 39 |  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 40 |  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 41 |  * U.S. Government End Users acquire the Licensed Deliverables with
 42 |  * only those rights set forth herein.
 43 |  *
 44 |  * Any use of the Licensed Deliverables in individual and commercial
 45 |  * software must include, in the user documentation and internal
 46 |  * comments to the code, the above Disclaimer and U.S. Government End
 47 |  * Users Notice.
 48 |  */
 49 |  
 50 | #include "fp16_emu.h" 
 51 | 
// Compile-time check: a false condition declares an array type of size -1,
// which fails to compile.
#define STATIC_ASSERT(cond) do { typedef char compile_time_assert[(cond) ? 1 : -1]; } while (0)

// Host functions for converting between FP32 and FP16 formats
// Paulius Micikevicius (pauliusm@nvidia.com)

// Converts a 32-bit float to the 16-bit pattern of a half1 on the host,
// rounding to nearest with ties to even. NaN inputs map to 0x7fff, magnitudes
// above 0x477fefff map to signed infinity and magnitudes below 0x33000001
// map to signed zero.
half1 cpu_float2half_rn(float f)
{
    // x: raw bits of f; u: the same bits with the sign bit cleared.
    unsigned x = *((int*)(void*)(&f));
    unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
    unsigned sign, exponent, mantissa;

    __half_raw hr;

    // Get rid of +NaN/-NaN case first.
    if (u > 0x7f800000) {
        hr.x = 0x7fffU;
        return reinterpret_cast<half1&>(hr);
    }
  
    // Move the float sign bit (bit 31) down to the half sign position (bit 15).
    sign = ((x >> 16) & 0x8000);
  
    // Get rid of +Inf/-Inf, +0/-0.
    if (u > 0x477fefff) {
        hr.x = sign | 0x7c00U;
        return reinterpret_cast<half1&>(hr);
    }
    if (u < 0x33000001) {
        hr.x = sign | 0x0000U;
        return reinterpret_cast<half1&>(hr);
    }

    exponent = ((u >> 23) & 0xff);
    mantissa = (u & 0x7fffff);

    if (exponent > 0x70) {
        // Result keeps a non-zero exponent field: drop the low 13 mantissa
        // bits and re-bias the exponent by 0x70.
        shift = 13;
        exponent -= 0x70;
    } else {
        // Result has a zero exponent field: make the implicit leading 1
        // explicit and shift further the smaller the input is.
        shift = 0x7e - exponent;
        exponent = 0;
        mantissa |= 0x800000;
    }
    // lsb: weight of the lowest kept bit; lsb_s1: half of it (the tie
    // point); lsb_m1: mask of the bits that are shifted out.
    lsb = (1 << shift);
    lsb_s1 = (lsb >> 1);
    lsb_m1 = (lsb - 1);
  
    // Round to nearest even.
    remainder = (mantissa & lsb_m1);
    mantissa >>= shift;
    if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
        ++mantissa;
        // Low 10 bits wrapped to zero: carry into the exponent.
        if (!(mantissa & 0x3ff)) {
            ++exponent;
            mantissa = 0;
        }
    }  

    // Assemble sign (bit 15), exponent (from bit 10 up) and mantissa (low bits).
    hr.x = (sign | (exponent << 10) | mantissa);  

    return reinterpret_cast<half1&>(hr);
}
113 | 
114 | 
115 | float cpu_half2float(half1 h)
116 | {
117 |     STATIC_ASSERT(sizeof(int) == sizeof(float));
118 | 
119 |     __half_raw hr = reinterpret_cast<__half_raw&>(h);
120 | 
121 |     unsigned sign     = ((hr.x >> 15) & 1);
122 |     unsigned exponent = ((hr.x >> 10) & 0x1f);
123 |     unsigned mantissa = ((hr.x & 0x3ff) << 13);
124 | 
125 |     if (exponent == 0x1f) {  /* NaN or Inf */
126 |         mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
127 |         exponent = 0xff;
128 |     } else if (!exponent) {  /* Denorm or Zero */
129 |         if (mantissa) {
130 |             unsigned int msb;
131 |             exponent = 0x71;
132 |             do {
133 |                 msb = (mantissa & 0x400000);
134 |                 mantissa <<= 1;  /* normalize */
135 |                 --exponent;
136 |             } while (!msb);
137 |             mantissa &= 0x7fffff;  /* 1.mantissa is implicit */
138 |         }
139 |     } else {
140 |         exponent += 0x70;
141 |     }
142 | 
143 |     int temp = ((sign << 31) | (exponent << 23) | mantissa);
144 | 
145 |     return reinterpret_cast<float&>(temp);
146 | }
147 | 
148 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/fp16_emu.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NOTICE TO LICENSEE:
  5 |  *
  6 |  * This source code and/or documentation ("Licensed Deliverables") are
  7 |  * subject to NVIDIA intellectual property rights under U.S. and
  8 |  * international Copyright laws.
  9 |  *
 10 |  * These Licensed Deliverables contained herein is PROPRIETARY and
 11 |  * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 12 |  * conditions of a form of NVIDIA software license agreement by and
 13 |  * between NVIDIA and Licensee ("License Agreement") or electronically
 14 |  * accepted by Licensee.  Notwithstanding any terms or conditions to
 15 |  * the contrary in the License Agreement, reproduction or disclosure
 16 |  * of the Licensed Deliverables to any third party without the express
 17 |  * written consent of NVIDIA is prohibited.
 18 |  *
 19 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 20 |  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 21 |  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 22 |  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 23 |  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 24 |  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 25 |  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 26 |  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 27 |  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 28 |  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 29 |  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 30 |  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 31 |  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 32 |  * OF THESE LICENSED DELIVERABLES.
 33 |  *
 34 |  * U.S. Government End Users.  These Licensed Deliverables are a
 35 |  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 36 |  * 1995), consisting of "commercial computer software" and "commercial
 37 |  * computer software documentation" as such terms are used in 48
 38 |  * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 39 |  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 40 |  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 41 |  * U.S. Government End Users acquire the Licensed Deliverables with
 42 |  * only those rights set forth herein.
 43 |  *
 44 |  * Any use of the Licensed Deliverables in individual and commercial
 45 |  * software must include, in the user documentation and internal
 46 |  * comments to the code, the above Disclaimer and U.S. Government End
 47 |  * Users Notice.
 48 |  */
 49 | 
 50 | // Conversion from/to 16-bit floating point (half-precision).
 51 | 
 52 | #if !defined(_FP16_EMU_H_)
 53 | #define _FP16_EMU_H_
 54 | 
 55 | #include <driver_types.h>
 56 | #include <cuda_fp16.h>
 57 | 
 58 | // Necessary to ensure visibility of CUDART_VERSION macro
 59 | #include <cuda_runtime_api.h>
 60 | 
 61 | // Definition of '__half_raw' was not provided before CUDA 9.0.
 62 | // '__half_raw' is our type where the unsigned 16-bit integer 
 63 | // data member 'x' can be accessed in both CUDA 9.0 and 8.0.
 64 | #if CUDART_VERSION < 9000 
 65 | typedef __half __half_raw;
 66 | #endif
 67 | 
// Internally, in CUDNN we use half1 struct as the FP16 type.
typedef __half half1;

// FP16 reference magnitudes written as ordinary floating-point literals.
// HLF_MIN and HLF_MAX match the values quoted for hmin() and hmax() later in
// this header (6.1035e-05 and 6.5504e+04).
// NOTE(review): HLF_EPSILON is presumably a comparison tolerance for FP16
// results - confirm against its users.
#define HLF_EPSILON 4.887581E-04
#define HLF_MIN     6.103516E-05
#define HLF_MAX     6.550400E+04

// Host-side float -> FP16 conversion; defined in fp16_emu.cpp.
half1 cpu_float2half_rn(float f);

// Host-side FP16 -> float conversion; defined in fp16_emu.cpp.
float cpu_half2float(half1 h);
 78 | 
 79 | static __inline__ __device__ __host__ half1 habs(half1 h)
 80 | {
 81 |     __half_raw hr = reinterpret_cast<__half_raw&>(h);
 82 |     hr.x &= 0x7fffU;
 83 |     return reinterpret_cast<half1&>(hr);
 84 | }
 85 | 
 86 | static __inline__ __device__ __host__ half1 hneg(half1 h)
 87 | {
 88 |     __half_raw hr = reinterpret_cast<__half_raw&>(h);
 89 |     hr.x ^= 0x8000U;
 90 |     return reinterpret_cast<half1&>(hr);
 91 | }
 92 | 
 93 | static __inline__ __device__ __host__ int ishnan(half1 h)
 94 | {
 95 |     // When input is NaN, exponent is all ones and mantissa is non-zero.
 96 |     __half_raw hr = reinterpret_cast<__half_raw&>(h);
 97 |     return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) != 0;
 98 | }
 99 | 
100 | static __inline__ __device__ __host__ int ishinf(half1 h)
101 | {
102 |     // When input is +/- inf, exponent is all ones and mantissa is zero.
103 |     __half_raw hr = reinterpret_cast<__half_raw&>(h);
104 |     return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) == 0;
105 | }
106 | 
107 | static __inline__ __device__ __host__ int ishequ(half1 x, half1 y)
108 | {
109 |     __half_raw xr = reinterpret_cast<__half_raw&>(x);
110 |     __half_raw yr = reinterpret_cast<__half_raw&>(y);
111 |     return ishnan(x) == 0 && ishnan(y) == 0 && xr.x == yr.x;
112 | }
113 | 
114 | // Returns 0.0000 in FP16 binary form
115 | static __inline__ __device__ __host__ half1 hzero()
116 | {
117 |     __half_raw hr;
118 |     hr.x = 0x0000U;
119 |     return reinterpret_cast<half1&>(hr);
120 | }
121 | 
122 | // Returns 1.0000 in FP16 binary form
123 | static __inline__ __device__ __host__ half1 hone()
124 | {
125 |     __half_raw hr;
126 |     hr.x = 0x3c00U;
127 |     return reinterpret_cast<half1&>(hr);
128 | }
129 | 
130 | // Returns quiet NaN, the most significant fraction bit #9 is set
131 | static __inline__ __device__ __host__ half1 hnan()
132 | {
133 |     __half_raw hr;
134 |     hr.x = 0x7e00U;
135 |     return reinterpret_cast<half1&>(hr);
136 | }
137 | 
138 | // Largest positive FP16 value, corresponds to 6.5504e+04
139 | static __inline__ __device__ __host__ half1 hmax()
140 | {
141 |     // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff)
142 |     __half_raw hr;
143 |     hr.x = 0x7bffU;
144 |     return reinterpret_cast<half1&>(hr);
145 | }
146 | 
147 | // Smallest positive (normalized) FP16 value, corresponds to 6.1035e-05
148 | static __inline__ __device__ __host__ half1 hmin()
149 | {
150 |     // Exponent is 0x01 (5 bits), mantissa is all zeros (10 bits)
151 |     __half_raw hr;
152 |     hr.x = 0x0400U;
153 |     return reinterpret_cast<half1&>(hr);
154 | }
155 | 
156 | #endif  // _FP16_EMU_H_
157 | 
158 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/gemv.h:
--------------------------------------------------------------------------------
  1 | /**
  2 | * Copyright 2014 NVIDIA Corporation.  All rights reserved.
  3 | *
  4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 | * with this source code for terms and conditions that govern your use of
  6 | * this software. Any use, reproduction, disclosure, or distribution of
  7 | * this software and related documentation outside the terms of the EULA
  8 | * is strictly prohibited.
  9 | *
 10 | */
 11 | 
 12 | #if !defined(_GEMV_H_)
 13 | #define _GEMV_H_
 14 | 
 15 | #include <cuda.h> // CUDA_VERSION
 16 | #include <cublas_v2.h>
 17 | #include "error_util.h"
 18 | 
 19 | //#define DISABLE_GEMV
 20 | 
 21 | void gemv(cublasHandle_t cublasHandle, int m, int n, double alpha, 
 22 |             const double *A, const double *x,
 23 |                                double beta, double *y)
 24 | {
 25 | #ifdef DISABLE_GEMV
 26 |     checkCublasErrors( cublasDgemm (cublasHandle, 
 27 |                       CUBLAS_OP_T,
 28 |                       CUBLAS_OP_N,
 29 |                       n,
 30 |                       1,
 31 |                       m,
 32 |                       &alpha, 
 33 |                       A, 
 34 |                       m,
 35 |                       x,
 36 |                       m, 
 37 |                       &beta, 
 38 |                       y,
 39 |                       m) );
 40 | #else
 41 |     checkCublasErrors( cublasDgemv(cublasHandle, CUBLAS_OP_T,
 42 |                                   m, n,
 43 |                                   &alpha,
 44 |                                   A, m,
 45 |                                   x, 1,
 46 |                                   &beta,
 47 |                                   y, 1) );    
 48 | #endif
 49 | };
 50 | 
 51 | void gemv(cublasHandle_t cublasHandle, int m, int n, float alpha, 
 52 |             const float *A, const float *x,
 53 |                                float beta, float *y)
 54 | {
 55 | #ifdef DISABLE_GEMV
 56 |     checkCublasErrors( cublasSgemm (cublasHandle, 
 57 |                       CUBLAS_OP_T,
 58 |                       CUBLAS_OP_N,
 59 |                       n,
 60 |                       1,
 61 |                       m,
 62 |                       &alpha, 
 63 |                       A, 
 64 |                       m,
 65 |                       x,
 66 |                       m, 
 67 |                       &beta, 
 68 |                       y,
 69 |                       m) );
 70 | #else
 71 |     checkCublasErrors( cublasSgemv(cublasHandle, CUBLAS_OP_T,
 72 |                                   m, n,
 73 |                                   &alpha,
 74 |                                   A, m,
 75 |                                   x, 1,
 76 |                                   &beta,
 77 |                                   y, 1) );    
 78 | #endif
 79 | };
 80 | 
 81 | #if defined(CUDA_VERSION) && (CUDA_VERSION > 7000)
 82 | 
 83 | #if (CUDA_VERSION < 8000)
 84 | #define  CUDA_R_16F CUBLAS_DATA_HALF
 85 | #endif
 86 | void gemv(cublasHandle_t cublasHandle, int m, int n, float alpha, 
 87 |             const half1 *A, const half1 *x,
 88 |                                float beta, half1 *y)
 89 | {
 90 |     checkCublasErrors( cublasSgemmEx  ( cublasHandle, 
 91 |                                       CUBLAS_OP_T,
 92 |                                       CUBLAS_OP_N, 
 93 |                                       n,
 94 |                                       1,
 95 |                                       m,
 96 |                                       &alpha, 
 97 |                                       A,  
 98 |                                       CUDA_R_16F,
 99 |                                       m,
100 |                                       x,
101 |                                       CUDA_R_16F,
102 |                                       m, 
103 |                                       &beta, 
104 |                                       y,
105 |                                       CUDA_R_16F,
106 |                                       m) );
107 | };
108 | #endif
109 | 
110 | #endif  // _GEMV_H_
111 | 


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/mnistCUDNN:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/mnistCUDNN


--------------------------------------------------------------------------------
/cudnn_samples_v7/mnistCUDNN/readme.txt:
--------------------------------------------------------------------------------
 1 | This sample demonstrates how to use the cuDNN library to implement a forward
 2 | pass given a trained network.
 3 | 
 4 | The sample is based on the "Training LeNet on MNIST with Caffe" tutorial, located
 5 | at http://caffe.berkeleyvision.org/. The network is identical except for the
 6 | addition of an LRN layer. All the network weights are obtained and exported
 7 | using Caffe.
 8 | 
 9 | Network layer topology:
10 | 
11 | 1. Convolution
12 | 2. Pooling
13 | 3. Convolution
14 | 4. Pooling
15 | 5. Fully connected
16 | 6. ReLU
17 | 7. LRN
18 | 8. Fully Connected
19 | 9. SoftMax
20 | 
21 | By default, the sample will classify three images, located in the "data"
22 | directory, using precomputed network weights:
23 | 1) Two convolution layers and their bias: conv1.bias.bin conv1.bin conv2.bias.bin conv2.bin
24 | 2) Two fully connected layers and their bias: ip1.bias.bin ip1.bin ip2.bias.bin ip2.bin
25 | 
26 | Supported platforms: identical to cuDNN
27 | 
28 | How to run:
29 | 
30 | mnistCUDNN {<options>}
31 | help                   : display this help
32 | device=<int>           : set the device to run the sample
33 | image=<name>           : classify specific image
34 | 
35 | New in version 3 release
36 | fp16 (three ways of conversion: on host, on device using cuDNN, on device using CUDA)
37 | Local Response Normalization (LRN)
38 | Find fastest config (cudnnFindConvolutionForwardAlgorithm)
39 | FFT convolution
40 | Demonstrate Nd API (first available in cuDNN v2)
41 | 


--------------------------------------------------------------------------------
/home-made/common.hpp:
--------------------------------------------------------------------------------
 1 | #include <cstdlib>
 2 | #include <cstdio>
 3 | #include <iostream>
 4 | 
 5 | 
 6 | #define checkCUDNN(expression)\
 7 |   {                                                          \
 8 |     cudnnStatus_t status = (expression);                     \
 9 |     if (status != CUDNN_STATUS_SUCCESS) {                    \
10 |       std::cerr << "Error on line " << __LINE__ << ": "      \
11 |                 << cudnnGetErrorString(status) << std::endl; \
12 |       std::exit(EXIT_FAILURE);                               \
13 |     }\
14 |   }\
15 | 


--------------------------------------------------------------------------------
/home-made/config_fermi_islip.icnt:
--------------------------------------------------------------------------------
 1 | //21*1 fly with 32 flits per packet under gpgpusim injection mode
 2 | use_map = 0;
 3 | flit_size = 40; 
 4 | 
 5 | // currently we do not use this, see subnets below
 6 | network_count = 2;
 7 | 
 8 | // Topology
 9 | topology = fly;
10 | k = 52;
11 | n = 1;
12 | 
13 | // Routing
14 | 
15 | routing_function = dest_tag;
16 | 
17 | // Flow control
18 | 
19 | num_vcs     = 1;
20 | vc_buf_size = 64;
21 | input_buffer_size = 256;
22 | ejection_buffer_size = 64;
23 | boundary_buffer_size = 64;
24 | 
25 | wait_for_tail_credit = 0;
26 | 
27 | // Router architecture
28 | 
29 | vc_allocator = islip; //separable_input_first;
30 | sw_allocator = islip; //separable_input_first;
31 | alloc_iters  = 1;
32 | 
33 | credit_delay   = 0;
34 | routing_delay  = 0;
35 | vc_alloc_delay = 1;
36 | sw_alloc_delay = 1;
37 | 
38 | input_speedup     = 1;
39 | output_speedup    = 1;
40 | internal_speedup  = 2.0;
41 | 
42 | // Traffic, GPGPU-Sim does not use this
43 | 
44 | traffic                = uniform;
45 | packet_size ={{1,2,3,4},{10,20}};
46 | packet_size_rate={{1,1,1,1},{2,1}};
47 | 
48 | // Simulation - Don't change
49 | 
50 | sim_type       = gpgpusim;
51 | //sim_type = latency;
52 | injection_rate = 0.1;
53 | 
54 | subnets = 2;
55 | 
56 | // Read and write are always used, regardless of the following line
57 | //use_read_write = 1;
58 | 
59 | 
60 | read_request_subnet = 0;
61 | read_reply_subnet = 1;
62 | write_request_subnet = 0;
63 | write_reply_subnet = 1;
64 | 
65 | read_request_begin_vc = 0;
66 | read_request_end_vc = 0;
67 | write_request_begin_vc = 0;
68 | write_request_end_vc = 0;
69 | read_reply_begin_vc = 0;
70 | read_reply_end_vc = 0;
71 | write_reply_begin_vc = 0;
72 | write_reply_end_vc = 0;
73 | 
74 | 


--------------------------------------------------------------------------------
/home-made/gpgpusim.config:
--------------------------------------------------------------------------------
 1 | //21*1 fly with 32 flits per packet under gpgpusim injection mode
 2 | use_map = 0;
 3 | flit_size = 40; 
 4 | 
 5 | // currently we do not use this, see subnets below
 6 | network_count = 2;
 7 | 
 8 | // Topology
 9 | topology = fly;
10 | k = 52;
11 | n = 1;
12 | 
13 | // Routing
14 | 
15 | routing_function = dest_tag;
16 | 
17 | // Flow control
18 | 
19 | num_vcs     = 1;
20 | vc_buf_size = 64;
21 | input_buffer_size = 256;
22 | ejection_buffer_size = 64;
23 | boundary_buffer_size = 64;
24 | 
25 | wait_for_tail_credit = 0;
26 | 
27 | // Router architecture
28 | 
29 | vc_allocator = islip; //separable_input_first;
30 | sw_allocator = islip; //separable_input_first;
31 | alloc_iters  = 1;
32 | 
33 | credit_delay   = 0;
34 | routing_delay  = 0;
35 | vc_alloc_delay = 1;
36 | sw_alloc_delay = 1;
37 | 
38 | input_speedup     = 1;
39 | output_speedup    = 1;
40 | internal_speedup  = 2.0;
41 | 
42 | // Traffic, GPGPU-Sim does not use this
43 | 
44 | traffic                = uniform;
45 | packet_size ={{1,2,3,4},{10,20}};
46 | packet_size_rate={{1,1,1,1},{2,1}};
47 | 
48 | // Simulation - Don't change
49 | 
50 | sim_type       = gpgpusim;
51 | //sim_type = latency;
52 | injection_rate = 0.1;
53 | 
54 | subnets = 2;
55 | 
56 | // Read and write are always used, regardless of the following line
57 | //use_read_write = 1;
58 | 
59 | 
60 | read_request_subnet = 0;
61 | read_reply_subnet = 1;
62 | write_request_subnet = 0;
63 | write_reply_subnet = 1;
64 | 
65 | read_request_begin_vc = 0;
66 | read_request_end_vc = 0;
67 | write_request_begin_vc = 0;
68 | write_request_end_vc = 0;
69 | read_reply_begin_vc = 0;
70 | read_reply_end_vc = 0;
71 | write_reply_begin_vc = 0;
72 | write_reply_end_vc = 0;
73 | 
74 | 


--------------------------------------------------------------------------------
/home-made/helloworld.cpp:
--------------------------------------------------------------------------------
 1 | #include <cudnn.h>
 2 | 
 3 | #include "common.hpp"
 4 | 
 5 | int main(int argc, char const *argv[]) {
 6 |   
 7 |   cudnnHandle_t cudnn;
 8 |   checkCUDNN(cudnnCreate(&cudnn));
 9 | 
10 |   printf("Hello World!\n");
11 | 
12 |   return 0;
13 | }


--------------------------------------------------------------------------------