├── .gitignore ├── LICENSE ├── README.md ├── cudnn_samples_v6 ├── RNN │ ├── Makefile │ ├── RNN_example.cu │ ├── compare.py │ ├── golden_1.txt │ ├── golden_2.txt │ ├── golden_3.txt │ ├── golden_4.txt │ └── result.txt └── mnistCUDNN │ ├── FreeImage │ ├── freeimage-license.txt │ └── include │ │ └── FreeImage.h │ ├── Makefile │ ├── data │ ├── conv1.bias.bin │ ├── conv1.bin │ ├── conv2.bias.bin │ ├── conv2.bin │ ├── five_28x28.pgm │ ├── ip1.bias.bin │ ├── ip1.bin │ ├── ip2.bias.bin │ ├── ip2.bin │ ├── one_28x28.pgm │ └── three_28x28.pgm │ ├── error_util.h │ ├── fp16_dev.cu │ ├── fp16_dev.h │ ├── fp16_emu.cpp │ ├── fp16_emu.h │ ├── gemv.h │ ├── mnistCUDNN.cpp │ └── readme.txt ├── cudnn_samples_v7 ├── RNN │ ├── Makefile │ ├── RNN_example.cu │ ├── compare.py │ ├── golden_1.txt │ ├── golden_2.txt │ ├── golden_3.txt │ └── golden_4.txt ├── conv_sample │ ├── Makefile │ ├── config_fermi_islip.icnt │ ├── conv_sample.cpp │ ├── error_util.h │ ├── fp16_dev.cu │ ├── fp16_dev.h │ ├── fp16_emu.cpp │ ├── fp16_emu.h │ └── gpgpusim.config └── mnistCUDNN │ ├── FreeImage │ ├── freeimage-license.txt │ └── include │ │ └── FreeImage.h │ ├── Makefile │ ├── data │ ├── conv1.bias.bin │ ├── conv1.bin │ ├── conv2.bias.bin │ ├── conv2.bin │ ├── five_28x28.pgm │ ├── ip1.bias.bin │ ├── ip1.bin │ ├── ip2.bias.bin │ ├── ip2.bin │ ├── one_28x28.pgm │ └── three_28x28.pgm │ ├── error_util.h │ ├── fp16_dev.cu │ ├── fp16_dev.h │ ├── fp16_emu.cpp │ ├── fp16_emu.h │ ├── gemv.h │ ├── mnistCUDNN │ ├── mnistCUDNN.cpp │ └── readme.txt └── home-made ├── common.hpp ├── config_fermi_islip.icnt ├── gpgpusim.config └── helloworld.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # 
Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Zheng Liang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cuDNN-sample 2 | 3 | Some cuDNN sample code provided by NVIDIA, along with some home-made examples. 4 | 5 | ## cuDNN static linking 6 | 7 | There is no official guide on how to link cuDNN statically. However, I found an official guide on how to [link cuBLAS statically](https://docs.nvidia.com/cuda/cublas/index.html). 
# Build script for the cuDNN RNN sample (RNN_example.cu -> ./RNN).
#
# Overridable variables (all read below):
#   CUDA_PATH        CUDA Toolkit root (default /usr/local/cuda)
#   TARGET_ARCH      target architecture (default: host architecture)
#   TARGET_OS        target operating system (default: host OS)
#   HOST_COMPILER    host C++ compiler passed to nvcc via -ccbin
#   SMS              list of SM architectures to generate code for
#   GENCODE_FLAGS    explicit -gencode flags (skips the SMS expansion)
#   dbg=1            debug build (-g -G)
#   EXTRA_NVCCFLAGS / EXTRA_CCFLAGS / EXTRA_LDFLAGS   additional flags

# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda

# architecture
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)

# Adjust this for ARMv7 with a 32-bit filesystem
# NOTE(review): grep prints the matching line rather than "1", so this
# comparison looks like it can never succeed - confirm the intended check
# before relying on the armv7l fallback.
ifeq ($(TARGET_ARCH), aarch64)
    ifeq ($(shell file /sbin/init | grep 32-bit), 1)
        TARGET_ARCH=armv7l
    endif
endif

# Derive the pointer width (-m32 / -m64) from the target architecture.
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
            TARGET_SIZE := 64
        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
            TARGET_SIZE := 32
        endif
    else
        TARGET_SIZE := $(shell getconf LONG_BIT)
    endif
else
    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# Only the host/target pairs listed here may be cross compiled.
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
    endif
endif

# operating system
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)

ifeq ($(TARGET_OS),QNX)
    TARGET_OS := qnx
endif

ifeq (,$(filter $(TARGET_OS),linux darwin qnx QNX android))
    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif

# host compiler
ifeq ($(TARGET_OS),darwin)
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-g++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
# Fallback when none of the branches above selected a compiler.
HOST_COMPILER ?= g++
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

# build flags
ifeq ($(TARGET_OS),darwin)
    LDFLAGS += -rpath $(CUDA_PATH)/lib
    CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
    LDFLAGS += -pie
    CCFLAGS += -fpie -fpic -fexceptions
endif

# Cross compiling for armv7l Linux against a target filesystem (TARGET_FS).
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
    endif
endif

# Debug build flags
ifeq ($(dbg),1)
    NVCCFLAGS += -g -G
    BUILD_TYPE := debug
else
    BUILD_TYPE := release
endif

# Host-compiler and linker flags are forwarded through nvcc with
# -Xcompiler / -Xlinker.
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

SAMPLE_ENABLED := 1

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
ifneq ($(TARGET_ARCH), ppc64le)
INCLUDES  := -I$(CUDA_PATH)/include
else
INCLUDES  := -I$(CUDA_PATH)/targets/ppc64le-linux/include
endif
LIBRARIES :=

################################################################################

# Gencode arguments
SMS ?= 30 35 50 53

ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

INCLUDES += -I.
LIBRARIES += -L. -lcublas -lcudnn -lcudart -lstdc++ -lm

# When the sample is waived, every recipe command is echoed instead of run.
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# These targets do not name files; keep make from confusing them with
# same-named entries in the directory.
.PHONY: all build check.deps run clean clobber

# Target rules
all: build

build: RNN

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
	@echo "Sample will be waived due to the above missing dependencies"
else
	@echo "Sample is ready - all dependencies have been met"
endif

OBJ = RNN_example.o

RNN: $(OBJ)
	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

%.o: %.cu
	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

run: build
	$(EXEC) ./RNN 100 4 512 64 2

# Remove build products only. The glob is "*.o": the previous "*o" matched
# every entry whose name merely ends in "o" and removed it recursively.
clean:
	rm -f *.o
	rm -f RNN

clobber: clean
9 | * 10 | */ 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | 17 | // Reference outputs (calculated on an M40 GPU) 18 | // > ./RNN 20 2 512 64 0 19 | // Forward: 1299 GFLOPs 20 | // Backward: 2171 GFLOPs, (1564 GFLOPs), (3549 GFLOPs) 21 | // i checksum 1.315793E+06 h checksum 1.315212E+05 22 | // di checksum 6.676003E+01 dh checksum 6.425067E+01 23 | // dw checksum 1.453750E+09 24 | // 25 | // > ./RNN 20 2 512 64 1 26 | // Forward: 1296 GFLOPs 27 | // Backward: 2235 GFLOPs, (1567 GFLOPs), (3896 GFLOPs) 28 | // i checksum 6.319591E+05 h checksum 6.319605E+04 29 | // di checksum 4.501830E+00 dh checksum 4.489546E+00 30 | // dw checksum 5.012598E+07 31 | // 32 | // > ./RNN 20 2 512 64 2 33 | // Forward: 2635 GFLOPs 34 | // Backward: 2757 GFLOPs, (2001 GFLOPs), (4433 GFLOPs) 35 | // i checksum 5.749536E+05 c checksum 4.365091E+05 h checksum 5.774818E+04 36 | // di checksum 3.842206E+02 dc checksum 9.323785E+03 dh checksum 1.182566E+01 37 | // dw checksum 4.313461E+08 38 | // 39 | // > ./RNN 20 2 512 64 3 40 | // Forward: 2428 GFLOPs 41 | // Backward: 2645 GFLOPs, (1915 GFLOPs), (4270 GFLOPs) 42 | // i checksum 6.358978E+05 h checksum 6.281680E+04 43 | // di checksum 6.296622E+00 dh checksum 2.289960E+05 44 | // dw checksum 5.397419E+07 45 | 46 | 47 | 48 | // Define some error checking macros. 
// Error-checking helpers.
//
// Each macro forwards the status returned by a CUDA / cuDNN call, together
// with the call site (__FILE__, __LINE__), to a reporting function. A failing
// status is printed to stderr and execution continues; nothing is aborted.
// The do { ... } while (0) wrapper makes each macro expand to exactly one
// statement, so it composes safely with un-braced if / else.
#define cudaErrCheck(stat) do { cudaErrCheck_((stat), __FILE__, __LINE__); } while (0)
void cudaErrCheck_(cudaError_t stat, const char *file, int line) {
   if (stat != cudaSuccess) {
      fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(stat), file, line);
   }
}

#define cudnnErrCheck(stat) do { cudnnErrCheck_((stat), __FILE__, __LINE__); } while (0)
void cudnnErrCheck_(cudnnStatus_t stat, const char *file, int line) {
   if (stat != CUDNN_STATUS_SUCCESS) {
      fprintf(stderr, "cuDNN Error: %s %s %d\n", cudnnGetErrorString(stat), file, line);
   }
}

// Kernel: writes `value` into data[tid] for every thread whose global index
// tid is below numElements. Threads past the end do nothing.
__global__ void initGPUData_ker(float *data, int numElements, float value) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < numElements) {
      data[tid] = value;
   }
}

// Fills the first numElements floats of `data` with `value` by launching
// initGPUData_ker with 1024 threads per block and enough blocks to cover
// numElements.
//
//   data         pointer handed to the kernel as its output buffer
//   numElements  number of floats to write; values <= 0 are a no-op
//   value        value stored in every element
//
// A launch failure is reported through cudaErrCheck (stderr, non-fatal).
void initGPUData(float *data, int numElements, float value) {
   // Nothing to fill. Without this guard the block count below would be 0,
   // which is not a valid launch configuration.
   if (numElements <= 0) {
      return;
   }

   const unsigned int threadsPerBlock = 1024;
   // Ceiling division so a partially filled last block is still launched.
   const unsigned int numBlocks = ((unsigned int)numElements + threadsPerBlock - 1) / threadsPerBlock;

   initGPUData_ker <<< numBlocks, threadsPerBlock >>> (data, numElements, value);

   // A kernel launch returns no status of its own; query it explicitly.
   cudaErrCheck(cudaGetLastError());
}
------------------------- 120 | // Set up inputs and outputs 121 | // ------------------------- 122 | void *x; 123 | void *hx = NULL; 124 | void *cx = NULL; 125 | 126 | void *dx; 127 | void *dhx = NULL; 128 | void *dcx = NULL; 129 | 130 | void *y; 131 | void *hy = NULL; 132 | void *cy = NULL; 133 | 134 | void *dy; 135 | void *dhy = NULL; 136 | void *dcy = NULL; 137 | 138 | // Memory allocation. hx, cx, dhx, dcx, hy, cy, dhy and dcy can be NULL. 139 | cudaErrCheck(cudaMalloc((void**)&x, seqLength * inputSize * miniBatch * sizeof(float))); 140 | cudaErrCheck(cudaMalloc((void**)&hx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float))); 141 | cudaErrCheck(cudaMalloc((void**)&cx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float))); 142 | 143 | cudaErrCheck(cudaMalloc((void**)&dx, seqLength * inputSize * miniBatch * sizeof(float))); 144 | cudaErrCheck(cudaMalloc((void**)&dhx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float))); 145 | cudaErrCheck(cudaMalloc((void**)&dcx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float))); 146 | 147 | cudaErrCheck(cudaMalloc((void**)&y, seqLength * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float))); 148 | cudaErrCheck(cudaMalloc((void**)&hy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float))); 149 | cudaErrCheck(cudaMalloc((void**)&cy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float))); 150 | 151 | cudaErrCheck(cudaMalloc((void**)&dy, seqLength * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float))); 152 | cudaErrCheck(cudaMalloc((void**)&dhy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float))); 153 | cudaErrCheck(cudaMalloc((void**)&dcy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1) * sizeof(float))); 154 | 155 | // Set up tensor descriptors. x/y/dx/dy are arrays, one per time step. 
156 | cudnnTensorDescriptor_t *xDesc, *yDesc, *dxDesc, *dyDesc; 157 | cudnnTensorDescriptor_t hxDesc, cxDesc; 158 | cudnnTensorDescriptor_t hyDesc, cyDesc; 159 | cudnnTensorDescriptor_t dhxDesc, dcxDesc; 160 | cudnnTensorDescriptor_t dhyDesc, dcyDesc; 161 | 162 | xDesc = (cudnnTensorDescriptor_t*)malloc(seqLength * sizeof(cudnnTensorDescriptor_t)); 163 | yDesc = (cudnnTensorDescriptor_t*)malloc(seqLength * sizeof(cudnnTensorDescriptor_t)); 164 | dxDesc = (cudnnTensorDescriptor_t*)malloc(seqLength * sizeof(cudnnTensorDescriptor_t)); 165 | dyDesc = (cudnnTensorDescriptor_t*)malloc(seqLength * sizeof(cudnnTensorDescriptor_t)); 166 | 167 | int dimA[3]; 168 | int strideA[3]; 169 | 170 | // In this example dimA[1] is constant across the whole sequence 171 | // This isn't required, all that is required is that it does not increase. 172 | for (int i = 0; i < seqLength; i++) { 173 | cudnnErrCheck(cudnnCreateTensorDescriptor(&xDesc[i])); 174 | cudnnErrCheck(cudnnCreateTensorDescriptor(&yDesc[i])); 175 | cudnnErrCheck(cudnnCreateTensorDescriptor(&dxDesc[i])); 176 | cudnnErrCheck(cudnnCreateTensorDescriptor(&dyDesc[i])); 177 | 178 | dimA[0] = miniBatch; 179 | dimA[1] = inputSize; 180 | dimA[2] = 1; 181 | 182 | strideA[0] = dimA[2] * dimA[1]; 183 | strideA[1] = dimA[2]; 184 | strideA[2] = 1; 185 | 186 | cudnnErrCheck(cudnnSetTensorNdDescriptor(xDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA)); 187 | cudnnErrCheck(cudnnSetTensorNdDescriptor(dxDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA)); 188 | 189 | dimA[0] = miniBatch; 190 | dimA[1] = bidirectional ? hiddenSize * 2 : hiddenSize; 191 | dimA[2] = 1; 192 | 193 | strideA[0] = dimA[2] * dimA[1]; 194 | strideA[1] = dimA[2]; 195 | strideA[2] = 1; 196 | 197 | cudnnErrCheck(cudnnSetTensorNdDescriptor(yDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA)); 198 | cudnnErrCheck(cudnnSetTensorNdDescriptor(dyDesc[i], CUDNN_DATA_FLOAT, 3, dimA, strideA)); 199 | } 200 | 201 | 202 | dimA[0] = numLayers * (bidirectional ? 
2 : 1); 203 | dimA[1] = miniBatch; 204 | dimA[2] = hiddenSize; 205 | 206 | strideA[0] = dimA[2] * dimA[1]; 207 | strideA[1] = dimA[2]; 208 | strideA[2] = 1; 209 | 210 | cudnnErrCheck(cudnnCreateTensorDescriptor(&hxDesc)); 211 | cudnnErrCheck(cudnnCreateTensorDescriptor(&cxDesc)); 212 | cudnnErrCheck(cudnnCreateTensorDescriptor(&hyDesc)); 213 | cudnnErrCheck(cudnnCreateTensorDescriptor(&cyDesc)); 214 | cudnnErrCheck(cudnnCreateTensorDescriptor(&dhxDesc)); 215 | cudnnErrCheck(cudnnCreateTensorDescriptor(&dcxDesc)); 216 | cudnnErrCheck(cudnnCreateTensorDescriptor(&dhyDesc)); 217 | cudnnErrCheck(cudnnCreateTensorDescriptor(&dcyDesc)); 218 | 219 | cudnnErrCheck(cudnnSetTensorNdDescriptor(hxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA)); 220 | cudnnErrCheck(cudnnSetTensorNdDescriptor(cxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA)); 221 | cudnnErrCheck(cudnnSetTensorNdDescriptor(hyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA)); 222 | cudnnErrCheck(cudnnSetTensorNdDescriptor(cyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA)); 223 | cudnnErrCheck(cudnnSetTensorNdDescriptor(dhxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA)); 224 | cudnnErrCheck(cudnnSetTensorNdDescriptor(dcxDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA)); 225 | cudnnErrCheck(cudnnSetTensorNdDescriptor(dhyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA)); 226 | cudnnErrCheck(cudnnSetTensorNdDescriptor(dcyDesc, CUDNN_DATA_FLOAT, 3, dimA, strideA)); 227 | 228 | 229 | // ------------------------- 230 | // Set up the dropout descriptor (needed for the RNN descriptor) 231 | // ------------------------- 232 | unsigned long long seed = 1337ull; // Pick a seed. 233 | 234 | cudnnDropoutDescriptor_t dropoutDesc; 235 | cudnnErrCheck(cudnnCreateDropoutDescriptor(&dropoutDesc)); 236 | 237 | // How much memory does dropout need for states? 
238 | // These states are used to generate random numbers internally 239 | // and should not be freed until the RNN descriptor is no longer used 240 | size_t stateSize; 241 | void *states; 242 | cudnnErrCheck(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize)); 243 | 244 | cudaErrCheck(cudaMalloc(&states, stateSize)); 245 | 246 | cudnnErrCheck(cudnnSetDropoutDescriptor(dropoutDesc, 247 | cudnnHandle, 248 | dropout, 249 | states, 250 | stateSize, 251 | seed)); 252 | 253 | // ------------------------- 254 | // Set up the RNN descriptor 255 | // ------------------------- 256 | cudnnRNNDescriptor_t rnnDesc; 257 | cudnnRNNMode_t RNNMode; 258 | 259 | cudnnErrCheck(cudnnCreateRNNDescriptor(&rnnDesc)); 260 | 261 | if (mode == 0) RNNMode = CUDNN_RNN_RELU; 262 | else if (mode == 1) RNNMode = CUDNN_RNN_TANH; 263 | else if (mode == 2) RNNMode = CUDNN_LSTM; 264 | else if (mode == 3) RNNMode = CUDNN_GRU; 265 | 266 | cudnnErrCheck(cudnnSetRNNDescriptor_v6(cudnnHandle, 267 | rnnDesc, 268 | hiddenSize, 269 | numLayers, 270 | dropoutDesc, 271 | CUDNN_LINEAR_INPUT, // We can also skip the input matrix transformation 272 | bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, 273 | RNNMode, 274 | CUDNN_RNN_ALGO_STANDARD, // Can be changed to use persistent RNNs on Pascal+ GPUs. 
275 | CUDNN_DATA_FLOAT)); 276 | 277 | 278 | // ------------------------- 279 | // Set up parameters 280 | // ------------------------- 281 | // This needs to be done after the rnn descriptor is set as otherwise 282 | // we don't know how many parameters we have to allocate 283 | void *w; 284 | void *dw; 285 | 286 | cudnnFilterDescriptor_t wDesc, dwDesc; 287 | 288 | cudnnErrCheck(cudnnCreateFilterDescriptor(&wDesc)); 289 | cudnnErrCheck(cudnnCreateFilterDescriptor(&dwDesc)); 290 | 291 | size_t weightsSize; 292 | cudnnErrCheck(cudnnGetRNNParamsSize(cudnnHandle, rnnDesc, xDesc[0], &weightsSize, CUDNN_DATA_FLOAT)); 293 | 294 | int dimW[3]; 295 | dimW[0] = weightsSize / sizeof(float); 296 | dimW[1] = 1; 297 | dimW[2] = 1; 298 | 299 | cudnnErrCheck(cudnnSetFilterNdDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dimW)); 300 | cudnnErrCheck(cudnnSetFilterNdDescriptor(dwDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dimW)); 301 | 302 | cudaErrCheck(cudaMalloc((void**)&w, weightsSize)); 303 | cudaErrCheck(cudaMalloc((void**)&dw, weightsSize)); 304 | 305 | 306 | // ------------------------- 307 | // Set up work space and reserved memory 308 | // ------------------------- 309 | void *workspace; 310 | void *reserveSpace; 311 | 312 | size_t workSize; 313 | size_t reserveSize; 314 | 315 | // Need for every pass 316 | cudnnErrCheck(cudnnGetRNNWorkspaceSize(cudnnHandle, rnnDesc, seqLength, xDesc, &workSize)); 317 | // Only needed in training, shouldn't be touched between passes. 
318 | cudnnErrCheck(cudnnGetRNNTrainingReserveSize(cudnnHandle, rnnDesc, seqLength, xDesc, &reserveSize)); 319 | 320 | cudaErrCheck(cudaMalloc((void**)&workspace, workSize)); 321 | cudaErrCheck(cudaMalloc((void**)&reserveSpace, reserveSize)); 322 | 323 | // ********************************************************************************************************* 324 | // Initialise weights and inputs 325 | // ********************************************************************************************************* 326 | // We initialise to something simple. 327 | // Matrices are initialised to 1 / matrixSize, biases to 1, data is 1. 328 | initGPUData((float*)x, seqLength * inputSize * miniBatch, 1.f); 329 | if (hx != NULL) initGPUData((float*)hx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1), 1.f); 330 | if (cx != NULL) initGPUData((float*)cx, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1), 1.f); 331 | 332 | initGPUData((float*)dy, seqLength * hiddenSize * miniBatch * (bidirectional ? 2 : 1), 1.f); 333 | if (dhy != NULL) initGPUData((float*)dhy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1), 1.f); 334 | if (dcy != NULL) initGPUData((float*)dcy, numLayers * hiddenSize * miniBatch * (bidirectional ? 2 : 1), 1.f); 335 | 336 | 337 | // Weights 338 | int numLinearLayers = 0; 339 | if (RNNMode == CUDNN_RNN_RELU || RNNMode == CUDNN_RNN_TANH) { 340 | numLinearLayers = 2; 341 | } 342 | else if (RNNMode == CUDNN_LSTM) { 343 | numLinearLayers = 8; 344 | } 345 | else if (RNNMode == CUDNN_GRU) { 346 | numLinearLayers = 6; 347 | } 348 | 349 | for (int layer = 0; layer < numLayers * (bidirectional ? 
2 : 1); layer++) { 350 | for (int linLayerID = 0; linLayerID < numLinearLayers; linLayerID++) { 351 | cudnnFilterDescriptor_t linLayerMatDesc; 352 | cudnnErrCheck(cudnnCreateFilterDescriptor(&linLayerMatDesc)); 353 | float *linLayerMat; 354 | 355 | cudnnErrCheck(cudnnGetRNNLinLayerMatrixParams( cudnnHandle, 356 | rnnDesc, 357 | layer, 358 | xDesc[0], 359 | wDesc, 360 | w, 361 | linLayerID, 362 | linLayerMatDesc, 363 | (void**)&linLayerMat)); 364 | 365 | cudnnDataType_t dataType; 366 | cudnnTensorFormat_t format; 367 | int nbDims; 368 | int filterDimA[3]; 369 | cudnnErrCheck(cudnnGetFilterNdDescriptor(linLayerMatDesc, 370 | 3, 371 | &dataType, 372 | &format, 373 | &nbDims, 374 | filterDimA)); 375 | 376 | initGPUData(linLayerMat, filterDimA[0] * filterDimA[1] * filterDimA[2], 1.f / (float)(filterDimA[0] * filterDimA[1] * filterDimA[2])); 377 | 378 | cudnnErrCheck(cudnnDestroyFilterDescriptor(linLayerMatDesc)); 379 | 380 | cudnnFilterDescriptor_t linLayerBiasDesc; 381 | cudnnErrCheck(cudnnCreateFilterDescriptor(&linLayerBiasDesc)); 382 | float *linLayerBias; 383 | 384 | cudnnErrCheck(cudnnGetRNNLinLayerBiasParams( cudnnHandle, 385 | rnnDesc, 386 | layer, 387 | xDesc[0], 388 | wDesc, 389 | w, 390 | linLayerID, 391 | linLayerBiasDesc, 392 | (void**)&linLayerBias)); 393 | 394 | cudnnErrCheck(cudnnGetFilterNdDescriptor(linLayerBiasDesc, 395 | 3, 396 | &dataType, 397 | &format, 398 | &nbDims, 399 | filterDimA)); 400 | 401 | initGPUData(linLayerBias, filterDimA[0] * filterDimA[1] * filterDimA[2], 1.f); 402 | 403 | cudnnErrCheck(cudnnDestroyFilterDescriptor(linLayerBiasDesc)); 404 | } 405 | } 406 | 407 | // ********************************************************************************************************* 408 | // At this point all of the setup is done. We now need to pass through the RNN. 
409 | // ********************************************************************************************************* 410 | 411 | 412 | 413 | cudaErrCheck(cudaDeviceSynchronize()); 414 | 415 | cudaEvent_t start, stop; 416 | float timeForward, timeBackward1, timeBackward2; 417 | cudaErrCheck(cudaEventCreate(&start)); 418 | cudaErrCheck(cudaEventCreate(&stop)); 419 | 420 | cudaErrCheck(cudaEventRecord(start)); 421 | 422 | // If we're not training we use this instead 423 | // cudnnErrCheck(cudnnRNNForwardInference(cudnnHandle, 424 | // rnnDesc, 425 | // xDesc, 426 | // x, 427 | // hxDesc, 428 | // hx, 429 | // cxDesc, 430 | // cx, 431 | // wDesc, 432 | // w, 433 | // yDesc, 434 | // y, 435 | // hyDesc, 436 | // hy, 437 | // cyDesc, 438 | // cy, 439 | // workspace, 440 | // workSize)); 441 | 442 | cudnnErrCheck(cudnnRNNForwardTraining(cudnnHandle, 443 | rnnDesc, 444 | seqLength, 445 | xDesc, 446 | x, 447 | hxDesc, 448 | hx, 449 | cxDesc, 450 | cx, 451 | wDesc, 452 | w, 453 | yDesc, 454 | y, 455 | hyDesc, 456 | hy, 457 | cyDesc, 458 | cy, 459 | workspace, 460 | workSize, 461 | reserveSpace, 462 | reserveSize)); 463 | 464 | cudaErrCheck(cudaEventRecord(stop)); 465 | cudaErrCheck(cudaEventSynchronize(stop)); 466 | cudaErrCheck(cudaEventElapsedTime(&timeForward, start, stop)); 467 | 468 | cudaErrCheck(cudaEventRecord(start)); 469 | 470 | cudnnErrCheck(cudnnRNNBackwardData(cudnnHandle, 471 | rnnDesc, 472 | seqLength, 473 | yDesc, 474 | y, 475 | dyDesc, 476 | dy, 477 | dhyDesc, 478 | dhy, 479 | dcyDesc, 480 | dcy, 481 | wDesc, 482 | w, 483 | hxDesc, 484 | hx, 485 | cxDesc, 486 | cx, 487 | dxDesc, 488 | dx, 489 | dhxDesc, 490 | dhx, 491 | dcxDesc, 492 | dcx, 493 | workspace, 494 | workSize, 495 | reserveSpace, 496 | reserveSize )); 497 | 498 | cudaErrCheck(cudaEventRecord(stop)); 499 | cudaErrCheck(cudaEventSynchronize(stop)); 500 | cudaErrCheck(cudaEventElapsedTime(&timeBackward1, start, stop)); 501 | 502 | cudaErrCheck(cudaEventRecord(start)); 503 | 504 | // 
cudnnRNNBackwardWeights adds to the data in dw. 505 | cudaErrCheck(cudaMemset(dw, 0, weightsSize)); 506 | 507 | cudnnErrCheck(cudnnRNNBackwardWeights( cudnnHandle, 508 | rnnDesc, 509 | seqLength, 510 | xDesc, 511 | x, 512 | hxDesc, 513 | hx, 514 | yDesc, 515 | y, 516 | workspace, 517 | workSize, 518 | dwDesc, 519 | dw, 520 | reserveSpace, 521 | reserveSize )); 522 | 523 | 524 | 525 | cudaErrCheck(cudaEventRecord(stop)); 526 | 527 | cudaErrCheck(cudaEventSynchronize(stop)); 528 | cudaErrCheck(cudaEventElapsedTime(&timeBackward2, start, stop)); 529 | 530 | 531 | int numMats = 0; 532 | 533 | if (RNNMode == CUDNN_RNN_RELU || RNNMode == CUDNN_RNN_TANH) { 534 | numMats = 2; 535 | } 536 | else if (RNNMode == CUDNN_LSTM) { 537 | numMats = 8; 538 | } 539 | else if (RNNMode == CUDNN_GRU) { 540 | numMats = 6; 541 | } 542 | 543 | // Calculate FLOPS 544 | printf("Forward: %3.0f GFLOPS\n", numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeForward)); 545 | printf("Backward: %3.0f GFLOPS, ", numMats * 4ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * (timeBackward1 + timeBackward2))); 546 | printf("(%3.0f GFLOPS), ", numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeBackward1)); 547 | printf("(%3.0f GFLOPS)\n", numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeBackward2)); 548 | 549 | // Calculate FLOPS 550 | fprintf(fp,"Forward: %3.0f GFLOPS\n", numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeForward)); 551 | fprintf(fp,"Backward: %3.0f GFLOPS, ", numMats * 4ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * (timeBackward1 + timeBackward2))); 552 | fprintf(fp,"(%3.0f GFLOPS), ", numMats * 2ull * (bidirectional ? 
2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeBackward1)); 553 | fprintf(fp,"(%3.0f GFLOPS)\n", numMats * 2ull * (bidirectional ? 2 : 1) * hiddenSize * hiddenSize * seqLength * miniBatch * numLayers / (1e6 * timeBackward2)); 554 | 555 | // Make double-sure everything is finished before we copy for result checking. 556 | cudaDeviceSynchronize(); 557 | 558 | // ********************************************************************************************************* 559 | // Print checksums. 560 | // ********************************************************************************************************* 561 | if (true) { 562 | float* testOutputi; 563 | float* testOutputh; 564 | float* testOutputc; 565 | 566 | int biDirScale = (bidirectional ? 2 : 1); 567 | 568 | testOutputi = (float*)malloc(hiddenSize * seqLength * miniBatch * biDirScale * sizeof(float)); 569 | testOutputh = (float*)malloc(hiddenSize * miniBatch * numLayers * biDirScale * sizeof(float)); 570 | testOutputc = (float*)malloc(hiddenSize * miniBatch * numLayers * biDirScale * sizeof(float)); 571 | 572 | cudaErrCheck(cudaMemcpy(testOutputi, y, hiddenSize * seqLength * miniBatch * biDirScale * sizeof(float), cudaMemcpyDeviceToHost)); 573 | if (hy != NULL) cudaErrCheck(cudaMemcpy(testOutputh, hy, numLayers * hiddenSize * miniBatch * biDirScale * sizeof(float), cudaMemcpyDeviceToHost)); 574 | if (cy != NULL && RNNMode == CUDNN_LSTM) cudaErrCheck(cudaMemcpy(testOutputc, cy, numLayers * hiddenSize * miniBatch * biDirScale * sizeof(float), cudaMemcpyDeviceToHost)); 575 | 576 | double checksumi = 0.f; 577 | double checksumh = 0.f; 578 | double checksumc = 0.f; 579 | 580 | for (int m = 0; m < miniBatch; m++) { 581 | double localSumi = 0; 582 | double localSumh = 0; 583 | double localSumc = 0; 584 | 585 | for (int j = 0; j < seqLength; j++) { 586 | for (int i = 0; i < hiddenSize * biDirScale; i++) { 587 | localSumi += testOutputi[j * miniBatch * hiddenSize * biDirScale + m * 
hiddenSize * biDirScale + i]; 588 | } 589 | } 590 | for (int j = 0; j < numLayers * biDirScale; j++) { 591 | for (int i = 0; i < hiddenSize; i++) { 592 | if (hy != NULL) localSumh += testOutputh[j * hiddenSize * miniBatch + m * hiddenSize + i]; 593 | if (cy != NULL) if (RNNMode == CUDNN_LSTM) localSumc += testOutputc[j * hiddenSize * miniBatch + m * hiddenSize + i]; 594 | } 595 | } 596 | 597 | checksumi += localSumi; 598 | checksumh += localSumh; 599 | checksumc += localSumc; 600 | } 601 | 602 | printf("i checksum %E ", checksumi); 603 | fprintf(fp,"i checksum %E ", checksumi); 604 | if (RNNMode == CUDNN_LSTM) { printf("c checksum %E ", checksumc); fprintf(fp,"c checksum %E ", checksumc); } 605 | printf("h checksum %E\n", checksumh); 606 | fprintf(fp,"h checksum %E\n", checksumh); 607 | 608 | free(testOutputi); 609 | free(testOutputc); 610 | free(testOutputh); 611 | } 612 | 613 | if (true) { 614 | float* testOutputdi; 615 | float* testOutputdh; 616 | float* testOutputdc; 617 | 618 | int biDirScale = (bidirectional ? 
2 : 1); 619 | 620 | testOutputdi = (float*)malloc(inputSize * seqLength * miniBatch * sizeof(float)); 621 | testOutputdh = (float*)malloc(hiddenSize * miniBatch * numLayers * biDirScale * sizeof(float)); 622 | testOutputdc = (float*)malloc(hiddenSize * miniBatch * numLayers * biDirScale * sizeof(float)); 623 | cudaErrCheck(cudaMemcpy(testOutputdi, dx, seqLength * miniBatch * inputSize * sizeof(float), cudaMemcpyDeviceToHost)); 624 | if (dhx != NULL) cudaErrCheck(cudaMemcpy(testOutputdh, dhx, numLayers * hiddenSize * miniBatch * biDirScale * sizeof(float), cudaMemcpyDeviceToHost)); 625 | if (dcx != NULL) if (RNNMode == CUDNN_LSTM) cudaErrCheck(cudaMemcpy(testOutputdc, dcx, numLayers * hiddenSize * miniBatch * biDirScale * sizeof(float), cudaMemcpyDeviceToHost)); 626 | 627 | float checksumdi = 0.f; 628 | float checksumdh = 0.f; 629 | float checksumdc = 0.f; 630 | 631 | for (int m = 0; m < miniBatch; m++) { 632 | double localSumdi = 0; 633 | double localSumdh = 0; 634 | double localSumdc = 0; 635 | 636 | for (int j = 0; j < seqLength; j++) { 637 | for (int i = 0; i < inputSize; i++) { 638 | localSumdi += testOutputdi[j * miniBatch * inputSize + m * inputSize + i]; 639 | } 640 | } 641 | 642 | for (int j = 0; j < numLayers * biDirScale; j++) { 643 | for (int i = 0; i < hiddenSize; i++) { 644 | localSumdh += testOutputdh[j * hiddenSize * miniBatch + m * hiddenSize + i]; 645 | if (RNNMode == CUDNN_LSTM) localSumdc += testOutputdc[j * hiddenSize * miniBatch + m * hiddenSize + i]; 646 | } 647 | } 648 | 649 | checksumdi += localSumdi; 650 | checksumdh += localSumdh; 651 | checksumdc += localSumdc; 652 | 653 | } 654 | 655 | printf("di checksum %E ", checksumdi); 656 | fprintf(fp,"di checksum %E ", checksumdi); 657 | if (RNNMode == CUDNN_LSTM) { printf("dc checksum %E ", checksumdc); fprintf(fp,"dc checksum %E ", checksumdc); } 658 | printf("dh checksum %E\n", checksumdh); 659 | fprintf(fp,"dh checksum %E\n", checksumdh); 660 | 661 | free(testOutputdi); 662 | 
free(testOutputdh); 663 | free(testOutputdc); 664 | } 665 | 666 | if (true) { 667 | float* testOutputdw; 668 | testOutputdw = (float*)malloc(weightsSize); 669 | 670 | cudaErrCheck(cudaMemcpy(testOutputdw, dw, weightsSize, cudaMemcpyDeviceToHost)); 671 | 672 | double checksumdw = 0.; 673 | 674 | for (int i = 0; i < weightsSize / sizeof(float); i++) { 675 | checksumdw += testOutputdw[i]; 676 | } 677 | 678 | printf("dw checksum %E\n", checksumdw); 679 | fprintf(fp,"dw checksum %E\n", checksumdw); 680 | 681 | free(testOutputdw); 682 | } 683 | 684 | cudaFree(x); 685 | cudaFree(hx); 686 | cudaFree(cx); 687 | cudaFree(y); 688 | cudaFree(hy); 689 | cudaFree(cy); 690 | cudaFree(dx); 691 | cudaFree(dhx); 692 | cudaFree(dcx); 693 | cudaFree(dy); 694 | cudaFree(dhy); 695 | cudaFree(dcy); 696 | cudaFree(workspace); 697 | cudaFree(reserveSpace); 698 | cudaFree(w); 699 | cudaFree(dw); 700 | 701 | cudnnDestroy(cudnnHandle); 702 | fclose(fp); 703 | return 0; 704 | } 705 | 706 | 707 | 708 | -------------------------------------------------------------------------------- /cudnn_samples_v6/RNN/compare.py: -------------------------------------------------------------------------------- 1 | #This script can compare the result files with the golden files and report the status: pass or failed\ 2 | #Usage: python compare_result.py results.txt golden.txt 3 | import os, sys, re 4 | 5 | patterns = ['{key1}\s+checksum\s+([.eE+0-9]+)\s+{key2}\s+checksum\s+([.eE+0-9]+)\s+{key3}\s+checksum\s+([.eE+0-9]+)', #3 similar keys as below each line 6 | '{key1}\s+checksum\s+([.eE+0-9]+)\s+{key2}\s+checksum\s+([.eE+0-9]+)', #2 similar keys as below each line 7 | '{key}\s+checksum\s+([.eE+0-9]+)', #one key each line: di checksum 6.676003E+01 8 | '{key}[: ]+([0-9]+)\s+GFLOPS[, ]+\\(([0-9]+)\s+GFLOPS\\)[, ]+\\(([0-9]+)\s+GFLOPS\\)', #1 key each line with more returns 9 | '{key}[: ]+([0-9]+)\s+GFLOPS'] #one key each line: Forward: 673 GFLOPS 10 | #keys = [('i', 'c', 'h'), ('di', 'dc', 'dh'), ('i', 'h'), ('di', 
'dh'), 'dw', 'Backward', 'Forward'] 11 | keys = [('i', 'c', 'h'), ('di', 'dc', 'dh'), ('i', 'h'), ('di', 'dh'), 'dw'] # skip the last 2 targets 12 | pats = [0,0,1,1,2,3,4] 13 | datnum = [len(k) if isinstance(k, tuple) else (3 if k == 'Backward' else 1) for k in keys] 14 | #tol = 1.0e-3 15 | def compare_results(ftarget, fgolden): 16 | assert ftarget and fgolden, 'No enough input files given!' 17 | print ftarget, fgolden 18 | targ, _ = get_results_from_file(ftarget) 19 | golden, tol = get_results_from_file(fgolden, golden=True) 20 | 21 | ret = 0 22 | assert targ and golden, 'targets or golen results not generated!' 23 | for k, vals in golden.iteritems(): 24 | if not isinstance(vals, list): 25 | vals = [vals] 26 | targ[k] = [targ[k]] 27 | for idx, v in enumerate(vals): 28 | tval = float(targ[k][idx]) 29 | gval = float(v) 30 | err = None 31 | if tol[k]['type'] == 'rel': 32 | err = abs((tval-gval)/max(gval,tval)) # clamp rel_err <= 1 33 | elif tol[k]['type'] == 'abs': 34 | err = abs(tval-gval) 35 | assert err is not None, 'Error is Empty!' 36 | tol_i = tol[k]['val'] 37 | #print 'k,t,g,err',k,tval, gval, err 38 | if err > tol_i: 39 | print 'FAILED %s=%s Error: %.2e vs. golden (%s) with tol (%.2e)'%(k, targ[k][idx], err, v, tol_i) 40 | ret = 1 41 | else: 42 | print 'PASSED %s=%s Error: %.2e vs. golden (%s) with tol (%.2e)'%(k, targ[k][idx], err, v, tol_i) 43 | if ret == 0: 44 | print 'ALL PASSED' 45 | return ret 46 | 47 | def _get_tolerance_line(line): 48 | """get a data item for a tolerance line with format (each line only one item): 49 | i: type=rel, 1e-3 50 | """ 51 | assert line, 'Empty line!' 52 | line = line.strip().replace(' ','') 53 | stmp = line.split(':') 54 | key = stmp[0] 55 | _type, _val = stmp[1].split(',') 56 | _type = _type.split('=')[-1] 57 | tol={key:{'type':_type, 'val':float(_val)}} 58 | return tol 59 | 60 | def get_results_from_file(fname, golden=False): 61 | assert fname, 'No file name given!' 
62 | ret = {} 63 | tol = {} 64 | is_tolerance = False 65 | with open(fname, 'r') as fin: 66 | lines = fin.readlines() 67 | if len(lines) == 1: 68 | lines = lines[0].split('\r') 69 | for idx, line in enumerate(lines): 70 | line = line.strip() 71 | if not line: 72 | continue 73 | val = get_valpat_line(line) 74 | if val: 75 | ret = dict(ret, **val) 76 | if golden: 77 | if 'TOLERANCE' in line: # the next line is the tol value 78 | is_tolerance = True 79 | elif is_tolerance: 80 | _tol = _get_tolerance_line(line) 81 | tol = dict(tol, **_tol) 82 | 83 | return ret, tol 84 | 85 | def get_valpat_line(line): 86 | for idx, key in enumerate(keys): 87 | Ndat = datnum[idx] 88 | if isinstance(key, tuple): 89 | format_expr = {} 90 | for j in range(Ndat): 91 | format_expr['key%d'%(j+1)] = keys[idx][j] 92 | ret = re.search(patterns[pats[idx]].format(**format_expr), line) 93 | if ret: 94 | vals = {} 95 | for j in range(Ndat): 96 | vals[key[j]] = ret.group(j+1) 97 | return vals 98 | else: 99 | ret = re.search(patterns[pats[idx]].format(key=key), line) 100 | if ret: 101 | if Ndat >1: 102 | #print Ndat, key, datnum, idx 103 | return {key:[ret.group(j+1) for j in range(Ndat)]} 104 | else: 105 | return {key:ret.group(1)} 106 | return None 107 | 108 | def str_test(): 109 | s='Forward: 673 GFLOPS' 110 | s1='Backward: 835 GFLOPS, (654 GFLOPS), (1155 GFLOPS)' 111 | s2='i checksum 1.315793E+06 h checksum 1.315212E+05' 112 | s3='di checksum 6.676003E+01 dh checksum 6.425050E+01' 113 | s4='dw checksum 1.453750E+09' 114 | print get_valpat_line(s1) 115 | print get_valpat_line(s) 116 | print get_valpat_line(s2) 117 | print get_valpat_line(s3) 118 | print get_valpat_line(s4) 119 | if __name__ == '__main__': 120 | #str_test() 121 | #print get_results_from_file('results.txt') 122 | #print get_results_from_file('golden.txt', golden=True) 123 | sys.exit(compare_results(sys.argv[1], sys.argv[2])) 124 | 125 | 126 | -------------------------------------------------------------------------------- 
/cudnn_samples_v6/RNN/golden_1.txt: -------------------------------------------------------------------------------- 1 | ------------GOLDEN------------ 2 | Forward: 1250 GFLOPS 3 | Backward: 1896 GFLOPS, (1299 GFLOPS), (3511 GFLOPS) 4 | i checksum 1.315793E+06 h checksum 1.315212E+05 5 | di checksum 6.676003E+01 dh checksum 6.425050E+01 6 | dw checksum 1.453750E+09 7 | -----------TOLERANCE----------- 8 | Forward: type=rel, 1 9 | Backward: type=rel, 1 10 | i: type=rel, 1e-3 11 | h: type=rel, 1e-3 12 | di: type=rel, 1e-3 13 | dh: type=rel, 1e-3 14 | dw: type=rel, 1e-3 15 | 16 | -------------------------------------------------------------------------------- /cudnn_samples_v6/RNN/golden_2.txt: -------------------------------------------------------------------------------- 1 | ------------GOLDEN------------ 2 | Forward: 1225 GFLOPS 3 | Backward: 1910 GFLOPS, (1299 GFLOPS), (3601 GFLOPS) 4 | i checksum 6.319591E+05 h checksum 6.319605E+04 5 | di checksum 4.501830E+00 dh checksum 4.489543E+00 6 | dw checksum 5.012598E+07 7 | -----------TOLERANCE----------- 8 | Forward: type=rel, 1 9 | Backward: type=rel, 1 10 | i: type=rel, 1e-3 11 | h: type=rel, 1e-3 12 | di: type=rel, 1e-3 13 | dh: type=rel, 1e-3 14 | dw: type=rel, 1e-3 15 | 16 | -------------------------------------------------------------------------------- /cudnn_samples_v6/RNN/golden_3.txt: -------------------------------------------------------------------------------- 1 | ------------GOLDEN------------ 2 | Forward: 2569 GFLOPS 3 | Backward: 2654 GFLOPS, (2071 GFLOPS), (3694 GFLOPS) 4 | i checksum 5.749536E+05 c checksum 4.365091E+05 h checksum 5.774818E+04 5 | di checksum 3.842206E+02 dc checksum 9.323785E+03 dh checksum 1.182562E+01 6 | dw checksum 4.313461E+08 7 | -----------TOLERANCE----------- 8 | Forward: type=rel, 1 9 | Backward: type=rel, 1 10 | i: type=rel, 1e-3 11 | h: type=rel, 1e-3 12 | c: type=rel, 1e-3 13 | dc: type=rel, 1e-3 14 | di: type=rel, 1e-3 15 | dh: type=rel, 1e-3 16 | dw: type=rel, 1e-3 17 
| 18 | -------------------------------------------------------------------------------- /cudnn_samples_v6/RNN/golden_4.txt: -------------------------------------------------------------------------------- 1 | ------------GOLDEN------------ 2 | Forward: 2310 GFLOPS 3 | Backward: 2536 GFLOPS, (1955 GFLOPS), (3606 GFLOPS) 4 | i checksum 6.358978E+05 h checksum 6.281680E+04 5 | di checksum 6.296622E+00 dh checksum 2.289960E+05 6 | dw checksum 5.397419E+07 7 | -----------TOLERANCE----------- 8 | Forward: type=rel, 1 9 | Backward: type=rel, 1 10 | i: type=rel, 1e-3 11 | h: type=rel, 1e-3 12 | di: type=rel, 1e-3 13 | dh: type=rel, 1e-3 14 | dw: type=rel, 1e-3 15 | 16 | -------------------------------------------------------------------------------- /cudnn_samples_v6/RNN/result.txt: -------------------------------------------------------------------------------- 1 | Forward: 413 GFLOPS 2 | Backward: 666 GFLOPS, (410 GFLOPS), (1762 GFLOPS) 3 | i checksum 4.210712E+06 h checksum 6.576062E+04 4 | di checksum 4.015642E+01 dh checksum 3.212526E+01 5 | dw checksum 4.379117E+09 6 | -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/FreeImage/freeimage-license.txt: -------------------------------------------------------------------------------- 1 | FreeImage Public License - Version 1.0 2 | --------------------------------------------- 3 | 4 | 1. Definitions. 5 | 6 | 1.1. "Contributor" means each entity that creates or contributes to the creation of Modifications. 7 | 8 | 1.2. "Contributor Version" means the combination of the Original Code, prior Modifications used by a Contributor, and the Modifications made by that particular Contributor. 9 | 10 | 1.3. "Covered Code" means the Original Code or Modifications or the combination of the Original Code and Modifications, in each case including portions thereof. 11 | 12 | 1.4. 
"Electronic Distribution Mechanism" means a mechanism generally accepted in the software development community for the electronic transfer of data. 13 | 14 | 1.5. "Executable" means Covered Code in any form other than Source Code. 15 | 16 | 1.6. "Initial Developer" means the individual or entity identified as the Initial Developer in the Source Code notice required by Exhibit A. 17 | 18 | 1.7. "Larger Work" means a work which combines Covered Code or portions thereof with code not governed by the terms of this License. 19 | 20 | 1.8. "License" means this document. 21 | 22 | 1.9. "Modifications" means any addition to or deletion from the substance or structure of either the Original Code or any previous Modifications. When Covered Code is released as a series of files, a 23 | Modification is: 24 | 25 | A. Any addition to or deletion from the contents of a file containing Original Code or previous Modifications. 26 | 27 | B. Any new file that contains any part of the Original Code or previous Modifications. 28 | 29 | 1.10. "Original Code" means Source Code of computer software code which is described in the Source Code notice required by Exhibit A as Original Code, and which, at the time of its release under this License is not already Covered Code governed by this License. 30 | 31 | 1.11. "Source Code" means the preferred form of the Covered Code for making modifications to it, including all modules it contains, plus any associated interface definition files, scripts used to control 32 | compilation and installation of an Executable, or a list of source code differential comparisons against either the Original Code or another well known, available Covered Code of the Contributor's choice. The Source Code can be in a compressed or archival form, provided the appropriate decompression or de-archiving software is widely available for no charge. 33 | 34 | 1.12. 
"You" means an individual or a legal entity exercising rights under, and complying with all of the terms of, this License or a future version of this License issued under Section 6.1. For legal entities, "You" includes any entity which controls, is controlled by, or is under common control with You. For purposes of this definition, "control" means (a) the power, direct or indirect, to cause the 35 | direction or management of such entity, whether by contract or otherwise, or (b) ownership of fifty percent (50%) or more of the outstanding shares or beneficial ownership of such entity. 36 | 37 | 2. Source Code License. 38 | 39 | 2.1. The Initial Developer Grant. 40 | The Initial Developer hereby grants You a world-wide, royalty-free, non-exclusive license, subject to third party intellectual property claims: 41 | 42 | (a) to use, reproduce, modify, display, perform, sublicense and distribute the Original Code (or portions thereof) with or without Modifications, or as part of a Larger Work; and 43 | 44 | (b) under patents now or hereafter owned or controlled by Initial Developer, to make, have made, use and sell ("Utilize") the Original Code (or portions thereof), but solely to the extent that 45 | any such patent is reasonably necessary to enable You to Utilize the Original Code (or portions thereof) and not to any greater extent that may be necessary to Utilize further Modifications or 46 | combinations. 47 | 48 | 2.2. Contributor Grant. 
49 | Each Contributor hereby grants You a world-wide, royalty-free, non-exclusive license, subject to third party intellectual property claims: 50 | 51 | (a) to use, reproduce, modify, display, perform, sublicense and distribute the Modifications created by such Contributor (or portions thereof) either on an unmodified basis, with other Modifications, as Covered Code or as part of a Larger Work; and 52 | 53 | (b) under patents now or hereafter owned or controlled by Contributor, to Utilize the Contributor Version (or portions thereof), but solely to the extent that any such patent is reasonably necessary to enable You to Utilize the Contributor Version (or portions thereof), and not to any greater extent that 54 | may be necessary to Utilize further Modifications or combinations. 55 | 56 | 3. Distribution Obligations. 57 | 58 | 3.1. Application of License. 59 | The Modifications which You create or to which You contribute are governed by the terms of this License, including without limitation Section 2.2. The Source Code version of Covered Code may be distributed only under the terms of this License or a future version of this License released under Section 6.1, and You must include a copy of this License with every copy of the Source Code You distribute. You may not offer or impose any terms on any Source Code version that alters or 60 | restricts the applicable version of this License or the recipients' rights hereunder. However, You may include an additional document offering the additional rights described in Section 3.5. 61 | 62 | 3.2. Availability of Source Code. 
63 | Any Modification which You create or to which You contribute must be made available in Source Code form under the terms of this License either on the same media as an Executable version or via an accepted Electronic Distribution Mechanism to anyone to whom you made an Executable version available; and if made available via Electronic Distribution Mechanism, must remain available for at least twelve (12) months after the date it initially became available, or at least six (6) months after a subsequent version of that particular Modification has been made available to such recipients. You are responsible for ensuring that the Source Code version remains available even if the Electronic Distribution Mechanism is maintained by a third party. 64 | 65 | 3.3. Description of Modifications. 66 | You must cause all Covered Code to which you contribute to contain a file documenting the changes You made to create that Covered Code and the date of any change. You must include a prominent statement that the Modification is derived, directly or indirectly, from Original Code provided by the Initial Developer and including the name of the Initial Developer in (a) the Source Code, and (b) in any notice in an Executable version or related documentation in which You describe the origin or ownership of the Covered Code. 67 | 68 | 3.4. Intellectual Property Matters 69 | 70 | (a) Third Party Claims. 71 | If You have knowledge that a party claims an intellectual property right in particular functionality or code (or its utilization under this License), you must include a text file with the source code distribution titled "LEGAL" which describes the claim and the party making the claim in sufficient detail that a recipient will know whom to contact. 
If you obtain such knowledge after You make Your Modification available as described in Section 3.2, You shall promptly modify the LEGAL file in all copies You make 72 | available thereafter and shall take other steps (such as notifying appropriate mailing lists or newsgroups) reasonably calculated to inform those who received the Covered Code that new knowledge has been obtained. 73 | 74 | (b) Contributor APIs. 75 | If Your Modification is an application programming interface and You own or control patents which are reasonably necessary to implement that API, you must also include this information in the LEGAL file. 76 | 77 | 3.5. Required Notices. 78 | You must duplicate the notice in Exhibit A in each file of the Source Code, and this License in any documentation for the Source Code, where You describe recipients' rights relating to Covered Code. If You created one or more Modification(s), You may add your name as a Contributor to the notice described in Exhibit A. If it is not possible to put such notice in a particular Source Code file due to its 79 | structure, then you must include such notice in a location (such as a relevant directory file) where a user would be likely to look for such a notice. You may choose to offer, and to charge a fee for, warranty, support, indemnity or liability obligations to one or more recipients of Covered Code. However, You may do so only on Your own behalf, and not on behalf of the Initial Developer or any Contributor. You must make it absolutely clear than any such warranty, support, indemnity or 80 | liability obligation is offered by You alone, and You hereby agree to indemnify the Initial Developer and every Contributor for any liability incurred by the Initial Developer or such Contributor as a result of 81 | warranty, support, indemnity or liability terms You offer. 82 | 83 | 3.6. Distribution of Executable Versions. 
84 | You may distribute Covered Code in Executable form only if the requirements of Section 3.1-3.5 have been met for that Covered Code, and if You include a notice stating that the Source Code version of the Covered Code is available under the terms of this License, including a description of how and where You have fulfilled the obligations of Section 3.2. The notice must be conspicuously included in any notice in an Executable version, related documentation or collateral in which You 85 | describe recipients' rights relating to the Covered Code. You may distribute the Executable version of Covered Code under a license of Your choice, which may contain terms different from this License, 86 | provided that You are in compliance with the terms of this License and that the license for the Executable version does not attempt to limit or alter the recipient's rights in the Source Code version from the rights set forth in this License. If You distribute the Executable version under a different license You must make it absolutely clear that any terms which differ from this License are offered by You alone, not by the Initial Developer or any Contributor. You hereby agree to indemnify the Initial Developer and every Contributor for any liability incurred by the Initial Developer or such Contributor as a result of any such terms You offer. 87 | 88 | 3.7. Larger Works. 89 | You may create a Larger Work by combining Covered Code with other code not governed by the terms of this License and distribute the Larger Work as a single product. In such a case, You must make sure the requirements of this License are fulfilled for the Covered Code. 90 | 91 | 4. Inability to Comply Due to Statute or Regulation. 
92 | 93 | If it is impossible for You to comply with any of the terms of this License with respect to some or all of the Covered Code due to statute or regulation then You must: (a) comply with the terms of this License to the maximum extent possible; and (b) describe the limitations and the code they affect. Such description must be included in the LEGAL file described in Section 3.4 and must be included with all distributions of the Source Code. Except to the extent prohibited by statute or regulation, such description must be sufficiently detailed for a recipient of ordinary skill to be able to understand it. 94 | 95 | 5. Application of this License. 96 | 97 | This License applies to code to which the Initial Developer has attached the notice in Exhibit A, and to related Covered Code. 98 | 99 | 6. Versions of the License. 100 | 101 | 6.1. New Versions. 102 | Floris van den Berg may publish revised and/or new versions of the License from time to time. Each version will be given a distinguishing version number. 103 | 104 | 6.2. Effect of New Versions. 105 | Once Covered Code has been published under a particular version of the License, You may always continue to use it under the terms of that version. You may also choose to use such Covered Code under the terms of any subsequent version of the License published by Floris van den Berg 106 | No one other than Floris van den Berg has the right to modify the terms applicable to Covered Code created under this License. 107 | 108 | 6.3. Derivative Works. 
109 | If you create or use a modified version of this License (which you may only do in order to apply it to code which is not already Covered Code governed by this License), you must (a) rename Your license so that the phrases "FreeImage", `FreeImage Public License", "FIPL", or any confusingly similar phrase do not appear anywhere in your license and (b) otherwise make it clear that your version of the license contains terms which differ from the FreeImage Public License. (Filling in the name of the Initial Developer, Original Code or Contributor in the notice described in Exhibit A shall not of themselves be deemed to be modifications of this License.) 110 | 111 | 7. DISCLAIMER OF WARRANTY. 112 | 113 | COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER. 114 | 115 | 8. TERMINATION. 116 | 117 | This License and the rights granted hereunder will terminate automatically if You fail to comply with terms herein and fail to cure such breach within 30 days of becoming aware of the breach. All sublicenses to the Covered Code which are properly granted shall survive any termination of this License. Provisions which, by their nature, must remain in effect beyond the termination of this License shall survive. 118 | 119 | 9. LIMITATION OF LIABILITY. 
120 | 121 | UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL THE INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO YOU OR ANY OTHER PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE 122 | EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THAT EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU. 123 | 124 | 10. U.S. GOVERNMENT END USERS. 125 | 126 | The Covered Code is a "commercial item," as that term is defined in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer software" and "commercial computer software documentation," as such terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all U.S. Government End Users acquire Covered Code with only those rights set forth herein. 127 | 128 | 11. MISCELLANEOUS. 129 | 130 | This License represents the complete agreement concerning subject matter hereof. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. This License shall be governed by Dutch law provisions (except to the extent applicable law, if any, provides otherwise), excluding its conflict-of-law provisions. 
With respect to disputes in which at least one party is a citizen of, or an entity chartered or registered to do business in, the The Netherlands: (a) unless otherwise agreed in writing, all disputes relating to this License (excepting any dispute relating to intellectual property rights) shall be subject to final and binding arbitration, with the losing party paying all costs of arbitration; (b) any arbitration relating to this Agreement shall be held in Almelo, The Netherlands; and (c) any litigation relating to this Agreement shall be subject to the jurisdiction of the court of Almelo, The Netherlands with the losing party responsible for costs, including without limitation, court costs and reasonable attorneys fees and expenses. Any law or regulation which provides that the language of a contract shall be construed against the drafter shall not apply to this License. 131 | 132 | 12. RESPONSIBILITY FOR CLAIMS. 133 | 134 | Except in cases where another Contributor has failed to comply with Section 3.4, You are responsible for damages arising, directly or indirectly, out of Your utilization of rights under this License, based 135 | on the number of copies of Covered Code you made available, the revenues you received from utilizing such rights, and other relevant factors. You agree to work with affected parties to distribute 136 | responsibility on an equitable basis. 137 | 138 | EXHIBIT A. 139 | 140 | "The contents of this file are subject to the FreeImage Public License Version 1.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://home.wxs.nl/~flvdberg/freeimage-license.txt 141 | 142 | Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific language governing rights and limitations under the License. 
-------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/FreeImage/include/FreeImage.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/FreeImage/include/FreeImage.h -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/Makefile: -------------------------------------------------------------------------------- 1 | # Location of the CUDA Toolkit 2 | CUDA_PATH ?= /usr/local/cuda 3 | 4 | # architecture 5 | HOST_ARCH := $(shell uname -m) 6 | TARGET_ARCH ?= $(HOST_ARCH) 7 | 8 | # Adjust this for ARMv7 with a 32-bit filesystem 9 | ifeq ($(TARGET_ARCH), aarch64) 10 | ifeq ($(shell file /sbin/init | grep 32-bit), 1) 11 | TARGET_ARCH=armv7l 12 | endif 13 | endif 14 | 15 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) 16 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 17 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) 18 | TARGET_SIZE := 64 19 | else ifneq (,$(filter $(TARGET_ARCH),armv7l)) 20 | TARGET_SIZE := 32 21 | endif 22 | else 23 | TARGET_SIZE := $(shell getconf LONG_BIT) 24 | endif 25 | else 26 | $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 27 | endif 28 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 29 | ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) 30 | $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 31 | endif 32 | endif 33 | 34 | # operating system 35 | HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") 36 | TARGET_OS ?= $(HOST_OS) 37 | 38 | ifeq ($(TARGET_OS),QNX) 39 | override TARGET_OS := qnx 40 | endif 41 | 42 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) 43 | $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) 
44 | endif 45 | 46 | # host compiler 47 | ifeq ($(TARGET_OS),darwin) 48 | ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) 49 | HOST_COMPILER ?= clang++ 50 | endif 51 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 52 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) 53 | ifeq ($(TARGET_OS),linux) 54 | HOST_COMPILER ?= arm-linux-gnueabihf-g++ 55 | else ifeq ($(TARGET_OS),qnx) 56 | ifeq ($(QNX_HOST),) 57 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 58 | endif 59 | ifeq ($(QNX_TARGET),) 60 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 61 | endif 62 | export QNX_HOST 63 | export QNX_TARGET 64 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ 65 | else ifeq ($(TARGET_OS),android) 66 | HOST_COMPILER ?= arm-linux-androideabi-g++ 67 | endif 68 | else ifeq ($(TARGET_ARCH),aarch64) 69 | ifeq ($(TARGET_OS), linux) 70 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 71 | else ifeq ($(TARGET_OS), android) 72 | HOST_COMPILER ?= aarch64-linux-android-g++ 73 | endif 74 | else ifeq ($(TARGET_ARCH),ppc64le) 75 | HOST_COMPILER ?= powerpc64le-linux-gnu-g++ 76 | endif 77 | endif 78 | HOST_COMPILER ?= g++ 79 | NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) 80 | 81 | # internal flags 82 | NVCCFLAGS := -m${TARGET_SIZE} 83 | CCFLAGS := 84 | LDFLAGS := 85 | 86 | # build flags 87 | ifeq ($(TARGET_OS),darwin) 88 | LDFLAGS += -rpath $(CUDA_PATH)/lib 89 | CCFLAGS += -arch $(HOST_ARCH) 90 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) 91 | LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 92 | CCFLAGS += -mfloat-abi=hard 93 | else ifeq ($(TARGET_OS),android) 94 | LDFLAGS += -pie 95 | CCFLAGS += -fpie -fpic -fexceptions 96 | endif 97 | 98 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 99 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) 100 | ifneq ($(TARGET_FS),) 101 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 102 | ifeq 
($(GCCVERSIONLTEQ46),1) 103 | CCFLAGS += --sysroot=$(TARGET_FS) 104 | endif 105 | LDFLAGS += --sysroot=$(TARGET_FS) 106 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib 107 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib 108 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf 109 | endif 110 | endif 111 | endif 112 | 113 | # Debug build flags 114 | ifeq ($(dbg),1) 115 | NVCCFLAGS += -g -G 116 | BUILD_TYPE := debug 117 | else 118 | BUILD_TYPE := release 119 | endif 120 | 121 | ALL_CCFLAGS := 122 | ALL_CCFLAGS += $(NVCCFLAGS) 123 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) 124 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) 125 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) 126 | 127 | SAMPLE_ENABLED := 1 128 | 129 | ALL_LDFLAGS := 130 | ALL_LDFLAGS += $(ALL_CCFLAGS) 131 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) 132 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) 133 | 134 | # Common includes and paths for CUDA 135 | ifneq ($(TARGET_ARCH), ppc64le) 136 | INCLUDES := -I$(CUDA_PATH)/include 137 | else 138 | INCLUDES := -I$(CUDA_PATH)/targets/ppc64le-linux/include 139 | endif 140 | LIBRARIES := 141 | 142 | ################################################################################ 143 | 144 | # Gencode arguments 145 | SMS ?= 30 35 50 53 146 | 147 | ifeq ($(SMS),) 148 | $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) 149 | SAMPLE_ENABLED := 0 150 | endif 151 | 152 | ifeq ($(GENCODE_FLAGS),) 153 | # Generate SASS code for each SM architecture listed in $(SMS) 154 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 155 | 156 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility 157 | HIGHEST_SM := $(lastword $(sort $(SMS))) 158 | ifneq ($(HIGHEST_SM),) 159 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 160 | endif 161 | endif 162 | 163 | INCLUDES += -IFreeImage/include 164 | LIBRARIES 
+= -LFreeImage/lib/$(TARGET_OS)/$(TARGET_ARCH) -LFreeImage/lib/$(TARGET_OS) -lcudart -lcublas -lcudnn -lfreeimage -lstdc++ -lm 165 | 166 | # Attempt to compile a minimal application linked against FreeImage. If a.out exists, FreeImage is properly set up. 167 | $(shell echo "#include \"FreeImage.h\"" > test.c; echo "int main() { return 0; }" >> test.c ; $(NVCC) $(ALL_CCFLAGS) $(INCLUDES) $(LIBRARIES) -l freeimage test.c) 168 | FREEIMAGE := $(shell find a.out 2>/dev/null) 169 | $(shell rm a.out test.c 2>/dev/null) 170 | 171 | ifeq ("$(FREEIMAGE)","") 172 | $(info >>> WARNING - FreeImage is not set up correctly. Please ensure FreeImage is set up correctly. <<<) 173 | SAMPLE_ENABLED := 0 174 | endif 175 | 176 | ifeq ($(SAMPLE_ENABLED),0) 177 | EXEC ?= @echo "[@]" 178 | endif 179 | 180 | ################################################################################ 181 | 182 | # Target rules 183 | all: build 184 | 185 | build: mnistCUDNN 186 | 187 | check.deps: 188 | ifeq ($(SAMPLE_ENABLED),0) 189 | @echo "Sample will be waived due to the above missing dependencies" 190 | else 191 | @echo "Sample is ready - all dependencies have been met" 192 | endif 193 | 194 | OBJ = fp16_dev.o fp16_emu.o mnistCUDNN.o 195 | 196 | mnistCUDNN: $(OBJ) 197 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 198 | 199 | %.o: %.cpp 200 | $(EXEC) $(HOST_COMPILER) $(INCLUDES) $(CCFLAGS) $(EXTRA_CCFLAGS) -o $@ -c $< 201 | 202 | %.o: %.cu 203 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 204 | 205 | run: build 206 | $(EXEC) ./mnistCUDNN 207 | 208 | clean: 209 | rm -rf *o 210 | rm -rf mnistCUDNN 211 | 212 | clobber: clean 213 | -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/data/conv1.bias.bin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/conv1.bias.bin -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/data/conv1.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/conv1.bin -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/data/conv2.bias.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/conv2.bias.bin -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/data/conv2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/conv2.bin -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/data/five_28x28.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/five_28x28.pgm -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/data/ip1.bias.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/ip1.bias.bin 
-------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/data/ip1.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/ip1.bin -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/data/ip2.bias.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/ip2.bias.bin -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/data/ip2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/ip2.bin -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/data/one_28x28.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/one_28x28.pgm -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/data/three_28x28.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v6/mnistCUDNN/data/three_28x28.pgm -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/error_util.h: -------------------------------------------------------------------------------- 1 
| /** 2 | * Copyright 2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | #if !defined(_ERROR_UTIL_H_) 13 | #define _ERROR_UTIL_H_ 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #define TOSTR_(s) #s 21 | #define TOSTR(s) TOSTR_(s) 22 | #if defined(__GNUC__) 23 | #define COMPILER_NAME "GCC" 24 | #define COMPILER_VER TOSTR(__GNUC__) "." TOSTR(__GNUC_MINOR__) "." TOSTR(__GNUC_PATCHLEVEL__) 25 | #elif defined(_MSC_VER) 26 | #if _MSC_VER < 1500 27 | #define COMPILER_NAME "MSVC_2005" 28 | #elif _MSC_VER < 1600 29 | #define COMPILER_NAME "MSVC_2008" 30 | #elif _MSC_VER < 1700 31 | #define COMPILER_NAME "MSVC_2010" 32 | #elif _MSC_VER < 1800 33 | #define COMPILER_NAME "MSVC_2012" 34 | #elif _MSC_VER < 1900 35 | #define COMPILER_NAME "MSVC_2013" 36 | #elif _MSC_VER < 2000 37 | #define COMPILER_NAME "MSVC_2014" 38 | #else 39 | #define COMPILER_NAME "MSVC" 40 | #endif 41 | #define COMPILER_VER TOSTR(_MSC_FULL_VER) "." TOSTR(_MSC_BUILD) 42 | #elif defined(__clang_major__) 43 | #define COMPILER_NAME "CLANG" 44 | #define COMPILER_VER TOSTR(__clang_major__ ) "." TOSTR(__clang_minor__) "." TOSTR(__clang_patchlevel__) 45 | #elif defined(__INTEL_COMPILER) 46 | #define COMPILER_NAME "ICC" 47 | #define COMPILER_VER TOSTR(__INTEL_COMPILER) "." TOSTR(__INTEL_COMPILER_BUILD_DATE) 48 | #else 49 | #define COMPILER_NAME "unknown" 50 | #define COMPILER_VER "???" 51 | #endif 52 | 53 | #define CUDNN_VERSION_STR TOSTR(CUDNN_MAJOR) "." TOSTR (CUDNN_MINOR) "." 
TOSTR(CUDNN_PATCHLEVEL) 54 | 55 | #define FatalError(s) { \ 56 | std::stringstream _where, _message; \ 57 | _where << __FILE__ << ':' << __LINE__; \ 58 | _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__;\ 59 | std::cerr << _message.str() << "\nAborting...\n"; \ 60 | cudaDeviceReset(); \ 61 | exit(EXIT_FAILURE); \ 62 | } 63 | 64 | #define checkCUDNN(status) { \ 65 | std::stringstream _error; \ 66 | if (status != CUDNN_STATUS_SUCCESS) { \ 67 | _error << "CUDNN failure\nError: " << cudnnGetErrorString(status); \ 68 | FatalError(_error.str()); \ 69 | } \ 70 | } 71 | 72 | #define checkCudaErrors(status) { \ 73 | std::stringstream _error; \ 74 | if (status != 0) { \ 75 | _error << "Cuda failure\nError: " << cudaGetErrorString(status); \ 76 | FatalError(_error.str()); \ 77 | } \ 78 | } 79 | 80 | #define checkCublasErrors(status) { \ 81 | std::stringstream _error; \ 82 | if (status != 0) { \ 83 | _error << "Cublas failure\nError code " << status; \ 84 | FatalError(_error.str()); \ 85 | } \ 86 | } 87 | 88 | // CUDA Utility Helper Functions 89 | 90 | static void showDevices( void ) 91 | { 92 | int totalDevices; 93 | checkCudaErrors(cudaGetDeviceCount( &totalDevices )); 94 | printf("\nThere are %d CUDA capable devices on your machine :\n", totalDevices); 95 | for (int i=0; i< totalDevices; i++) { 96 | struct cudaDeviceProp prop; 97 | checkCudaErrors(cudaGetDeviceProperties( &prop, i )); 98 | printf( "device %d : sms %2d Capabilities %d.%d, SmClock %.1f Mhz, MemSize (Mb) %d, MemClock %.1f Mhz, Ecc=%d, boardGroupID=%d\n", 99 | i, prop.multiProcessorCount, prop.major, prop.minor, 100 | (float)prop.clockRate*1e-3, 101 | (int)(prop.totalGlobalMem/(1024*1024)), 102 | (float)prop.memoryClockRate*1e-3, 103 | prop.ECCEnabled, 104 | prop.multiGpuBoardGroupID); 105 | } 106 | } 107 | 108 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 109 | #ifndef _CRT_SECURE_NO_DEPRECATE 110 | #define _CRT_SECURE_NO_DEPRECATE 111 | #endif 112 | #ifndef 
STRNCASECMP 113 | #define STRNCASECMP _strnicmp 114 | #endif 115 | #else // Linux Includes 116 | #include 117 | #include 118 | #ifndef STRNCASECMP 119 | #define STRNCASECMP strncasecmp 120 | #endif 121 | #endif 122 | inline int stringRemoveDelimiter(char delimiter, const char *string) 123 | { 124 | int string_start = 0; 125 | 126 | while (string[string_start] == delimiter) 127 | { 128 | string_start++; 129 | } 130 | 131 | if (string_start >= (int)strlen(string)-1) 132 | { 133 | return 0; 134 | } 135 | 136 | return string_start; 137 | } 138 | 139 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) 140 | { 141 | bool bFound = false; 142 | 143 | if (argc >= 1) 144 | { 145 | for (int i=1; i < argc; i++) 146 | { 147 | int string_start = stringRemoveDelimiter('-', argv[i]); 148 | const char *string_argv = &argv[i][string_start]; 149 | 150 | const char *equal_pos = strchr(string_argv, '='); 151 | int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); 152 | 153 | int length = (int)strlen(string_ref); 154 | 155 | if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) 156 | { 157 | bFound = true; 158 | continue; 159 | } 160 | } 161 | } 162 | 163 | return bFound; 164 | } 165 | 166 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) 167 | { 168 | bool bFound = false; 169 | int value = -1; 170 | 171 | if (argc >= 1) 172 | { 173 | for (int i=1; i < argc; i++) 174 | { 175 | int string_start = stringRemoveDelimiter('-', argv[i]); 176 | const char *string_argv = &argv[i][string_start]; 177 | int length = (int)strlen(string_ref); 178 | 179 | if (!STRNCASECMP(string_argv, string_ref, length)) 180 | { 181 | if (length+1 <= (int)strlen(string_argv)) 182 | { 183 | int auto_inc = (string_argv[length] == '=') ? 
1 : 0; 184 | value = atoi(&string_argv[length + auto_inc]); 185 | } 186 | else 187 | { 188 | value = 0; 189 | } 190 | 191 | bFound = true; 192 | continue; 193 | } 194 | } 195 | } 196 | 197 | if (bFound) 198 | { 199 | return value; 200 | } 201 | else 202 | { 203 | printf("Not found int\n"); 204 | return 0; 205 | } 206 | } 207 | 208 | inline bool getCmdLineArgumentString(const int argc, const char **argv, 209 | const char *string_ref, char **string_retval) 210 | { 211 | bool bFound = false; 212 | 213 | if (argc >= 1) 214 | { 215 | for (int i=1; i < argc; i++) 216 | { 217 | int string_start = stringRemoveDelimiter('-', argv[i]); 218 | char *string_argv = (char *)&argv[i][string_start]; 219 | int length = (int)strlen(string_ref); 220 | 221 | if (!STRNCASECMP(string_argv, string_ref, length)) 222 | { 223 | *string_retval = &string_argv[length+1]; 224 | bFound = true; 225 | continue; 226 | } 227 | } 228 | } 229 | 230 | if (!bFound) 231 | { 232 | *string_retval = NULL; 233 | } 234 | 235 | return bFound; 236 | } 237 | 238 | #endif // _ERROR_UTIL_H_ -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/fp16_dev.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 
9 | * 10 | */ 11 | #include "error_util.h" 12 | 13 | #include "fp16_dev.h" 14 | 15 | #define BLOCK_SIZE 128 16 | template 17 | __global__ void float2half_rn_kernel(int size, const value_type *buffIn, 18 | half1 *buffOut) 19 | { 20 | const int idx = BLOCK_SIZE*blockIdx.x+threadIdx.x; 21 | if (idx >= size) return; 22 | half1 val; 23 | val.x = __float2half_rn(float(buffIn[idx])); 24 | buffOut[idx] = val; 25 | } 26 | 27 | template 28 | void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut) 29 | { 30 | int grid_size = (size + BLOCK_SIZE - 1) / BLOCK_SIZE; 31 | float2half_rn_kernel<<>> 32 | (size, buffIn, buffOut); 33 | checkCudaErrors(cudaDeviceSynchronize()); 34 | } 35 | 36 | template void gpu_float2half_rn (int, const float*, half1*); 37 | template void gpu_float2half_rn (int, const double*, half1*); -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/fp16_dev.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 
9 | * 10 | */ 11 | #if !defined(_FP16_DEV_H_) 12 | #define _FP16_DEV_H_ 13 | 14 | #include 15 | 16 | typedef struct __align__(2) { 17 | unsigned short x; 18 | } half1; 19 | 20 | template 21 | void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut); 22 | 23 | #endif // _FP16_DEV_H_ -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/fp16_emu.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO LICENSEE: 5 | * 6 | * This source code and/or documentation ("Licensed Deliverables") are 7 | * subject to NVIDIA intellectual property rights under U.S. and 8 | * international Copyright laws. 9 | * 10 | * These Licensed Deliverables contained herein is PROPRIETARY and 11 | * CONFIDENTIAL to NVIDIA and is being provided under the terms and 12 | * conditions of a form of NVIDIA software license agreement by and 13 | * between NVIDIA and Licensee ("License Agreement") or electronically 14 | * accepted by Licensee. Notwithstanding any terms or conditions to 15 | * the contrary in the License Agreement, reproduction or disclosure 16 | * of the Licensed Deliverables to any third party without the express 17 | * written consent of NVIDIA is prohibited. 18 | * 19 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 20 | * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE 21 | * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS 22 | * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 23 | * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED 24 | * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 25 | * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
26 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 27 | * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY 28 | * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY 29 | * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 30 | * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 31 | * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 32 | * OF THESE LICENSED DELIVERABLES. 33 | * 34 | * U.S. Government End Users. These Licensed Deliverables are a 35 | * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 36 | * 1995), consisting of "commercial computer software" and "commercial 37 | * computer software documentation" as such terms are used in 48 38 | * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government 39 | * only as a commercial end item. Consistent with 48 C.F.R.12.212 and 40 | * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all 41 | * U.S. Government End Users acquire the Licensed Deliverables with 42 | * only those rights set forth herein. 43 | * 44 | * Any use of the Licensed Deliverables in individual and commercial 45 | * software must include, in the user documentation and internal 46 | * comments to the code, the above Disclaimer and U.S. Government End 47 | * Users Notice. 48 | */ 49 | 50 | #include "fp16_emu.h" 51 | 52 | // Host functions for converting between FP32 and FP16 formats 53 | // Paulius Micikevicius (pauliusm@nvidia.com) 54 | 55 | half1 cpu_float2half_rn(float f) 56 | { 57 | half1 ret; 58 | 59 | unsigned x = *((int*)(void*)(&f)); 60 | unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; 61 | unsigned sign, exponent, mantissa; 62 | 63 | // Get rid of +NaN/-NaN case first. 64 | if (u > 0x7f800000) { 65 | ret.x = 0x7fffU; 66 | return ret; 67 | } 68 | 69 | sign = ((x >> 16) & 0x8000); 70 | 71 | // Get rid of +Inf/-Inf, +0/-0. 
72 | if (u > 0x477fefff) { 73 | ret.x = sign | 0x7c00U; 74 | return ret; 75 | } 76 | if (u < 0x33000001) { 77 | ret.x = (sign | 0x0000); 78 | return ret; 79 | } 80 | 81 | exponent = ((u >> 23) & 0xff); 82 | mantissa = (u & 0x7fffff); 83 | 84 | if (exponent > 0x70) { 85 | shift = 13; 86 | exponent -= 0x70; 87 | } else { 88 | shift = 0x7e - exponent; 89 | exponent = 0; 90 | mantissa |= 0x800000; 91 | } 92 | lsb = (1 << shift); 93 | lsb_s1 = (lsb >> 1); 94 | lsb_m1 = (lsb - 1); 95 | 96 | // Round to nearest even. 97 | remainder = (mantissa & lsb_m1); 98 | mantissa >>= shift; 99 | if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { 100 | ++mantissa; 101 | if (!(mantissa & 0x3ff)) { 102 | ++exponent; 103 | mantissa = 0; 104 | } 105 | } 106 | 107 | ret.x = (sign | (exponent << 10) | mantissa); 108 | 109 | return ret; 110 | } 111 | 112 | 113 | float cpu_half2float(half1 h) 114 | { 115 | unsigned sign = ((h.x >> 15) & 1); 116 | unsigned exponent = ((h.x >> 10) & 0x1f); 117 | unsigned mantissa = ((h.x & 0x3ff) << 13); 118 | 119 | if (exponent == 0x1f) { /* NaN or Inf */ 120 | mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); 121 | exponent = 0xff; 122 | } else if (!exponent) { /* Denorm or Zero */ 123 | if (mantissa) { 124 | unsigned int msb; 125 | exponent = 0x71; 126 | do { 127 | msb = (mantissa & 0x400000); 128 | mantissa <<= 1; /* normalize */ 129 | --exponent; 130 | } while (!msb); 131 | mantissa &= 0x7fffff; /* 1.mantissa is implicit */ 132 | } 133 | } else { 134 | exponent += 0x70; 135 | } 136 | 137 | int temp = ((sign << 31) | (exponent << 23) | mantissa); 138 | 139 | return *((float*)((void*)&temp)); 140 | } 141 | 142 | -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/fp16_emu.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * NOTICE TO LICENSEE: 5 | * 6 | * This source code and/or documentation ("Licensed Deliverables") are 7 | * subject to NVIDIA intellectual property rights under U.S. and 8 | * international Copyright laws. 9 | * 10 | * These Licensed Deliverables contained herein is PROPRIETARY and 11 | * CONFIDENTIAL to NVIDIA and is being provided under the terms and 12 | * conditions of a form of NVIDIA software license agreement by and 13 | * between NVIDIA and Licensee ("License Agreement") or electronically 14 | * accepted by Licensee. Notwithstanding any terms or conditions to 15 | * the contrary in the License Agreement, reproduction or disclosure 16 | * of the Licensed Deliverables to any third party without the express 17 | * written consent of NVIDIA is prohibited. 18 | * 19 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 20 | * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE 21 | * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS 22 | * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 23 | * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED 24 | * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 25 | * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 26 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 27 | * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY 28 | * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY 29 | * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 30 | * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 31 | * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 32 | * OF THESE LICENSED DELIVERABLES. 33 | * 34 | * U.S. Government End Users. These Licensed Deliverables are a 35 | * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT 36 | * 1995), consisting of "commercial computer software" and "commercial 37 | * computer software documentation" as such terms are used in 48 38 | * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government 39 | * only as a commercial end item. Consistent with 48 C.F.R.12.212 and 40 | * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all 41 | * U.S. Government End Users acquire the Licensed Deliverables with 42 | * only those rights set forth herein. 43 | * 44 | * Any use of the Licensed Deliverables in individual and commercial 45 | * software must include, in the user documentation and internal 46 | * comments to the code, the above Disclaimer and U.S. Government End 47 | * Users Notice. 48 | */ 49 | 50 | // Conversion from/to 16-bit floating point (half-precision). 51 | 52 | #if !defined(_FP16_EMU_H_) 53 | #define _FP16_EMU_H_ 54 | 55 | #include "fp16_dev.h" 56 | 57 | #define HLF_EPSILON 4.887581E-04 58 | #define HLF_MIN 6.103516E-05 59 | #define HLF_MAX 6.550400E+04 60 | 61 | half1 cpu_float2half_rn(float f); 62 | 63 | float cpu_half2float(half1 h); 64 | 65 | static __inline__ __device__ __host__ half1 habs(half1 h) 66 | { 67 | h.x &= 0x7fffU; 68 | return h; 69 | } 70 | 71 | static __inline__ __device__ __host__ half1 hneg(half1 h) 72 | { 73 | h.x ^= 0x8000U; 74 | return h; 75 | } 76 | 77 | static __inline__ __device__ __host__ int ishnan(half1 h) 78 | { 79 | // When input is NaN, exponent is all ones and mantissa is non-zero. 80 | return (h.x & 0x7c00U) == 0x7c00U && (h.x & 0x03ffU) != 0; 81 | } 82 | 83 | static __inline__ __device__ __host__ int ishinf(half1 h) 84 | { 85 | // When input is +/- inf, exponent is all ones and mantissa is zero. 
86 | return (h.x & 0x7c00U) == 0x7c00U && (h.x & 0x03ffU) == 0; 87 | } 88 | 89 | static __inline__ __device__ __host__ int ishequ(half1 x, half1 y) 90 | { 91 | return ishnan(x) == 0 && ishnan(y) == 0 && x.x == y.x; 92 | } 93 | 94 | // Returns 0.0000 in FP16 binary form 95 | static __inline__ __device__ __host__ half1 hzero() 96 | { 97 | half1 ret; 98 | ret.x = 0x0000U; 99 | return ret; 100 | } 101 | 102 | // Returns 1.0000 in FP16 binary form 103 | static __inline__ __device__ __host__ half1 hone() 104 | { 105 | half1 ret; 106 | ret.x = 0x3c00U; 107 | return ret; 108 | } 109 | 110 | // Returns quiet NaN, the most significant fraction bit #9 is set 111 | static __inline__ __device__ __host__ half1 hnan() 112 | { 113 | half1 ret; 114 | ret.x = 0x7e00U; 115 | return ret; 116 | } 117 | 118 | // Largest positive FP16 value, corresponds to 6.5504e+04 119 | static __inline__ __device__ __host__ half1 hmax() 120 | { 121 | half1 ret; 122 | // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff) 123 | ret.x = 0x7bffU; 124 | return ret; 125 | } 126 | 127 | // Smallest positive (normalized) FP16 value, corresponds to 6.1035e-05 128 | static __inline__ __device__ __host__ half1 hmin() 129 | { 130 | half1 ret; 131 | // Exponent is 0x01 (5 bits), mantissa is all zeros (10 bits) 132 | ret.x = 0x0400U; 133 | return ret; 134 | } 135 | 136 | #endif // _FP16_EMU_H_ 137 | 138 | -------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/gemv.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. 
Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | #if !defined(_GEMV_H_) 13 | #define _GEMV_H_ 14 | 15 | #include // CUDA_VERSION 16 | #include 17 | #include "error_util.h" 18 | 19 | //#define DISABLE_GEMV 20 | 21 | void gemv(cublasHandle_t cublasHandle, int m, int n, double alpha, 22 | const double *A, const double *x, 23 | double beta, double *y) 24 | { 25 | #ifdef DISABLE_GEMV 26 | checkCublasErrors( cublasDgemm (cublasHandle, 27 | CUBLAS_OP_T, 28 | CUBLAS_OP_N, 29 | n, 30 | 1, 31 | m, 32 | &alpha, 33 | A, 34 | m, 35 | x, 36 | m, 37 | &beta, 38 | y, 39 | m) ); 40 | #else 41 | checkCublasErrors( cublasDgemv(cublasHandle, CUBLAS_OP_T, 42 | m, n, 43 | &alpha, 44 | A, m, 45 | x, 1, 46 | &beta, 47 | y, 1) ); 48 | #endif 49 | }; 50 | 51 | void gemv(cublasHandle_t cublasHandle, int m, int n, float alpha, 52 | const float *A, const float *x, 53 | float beta, float *y) 54 | { 55 | #ifdef DISABLE_GEMV 56 | checkCublasErrors( cublasSgemm (cublasHandle, 57 | CUBLAS_OP_T, 58 | CUBLAS_OP_N, 59 | n, 60 | 1, 61 | m, 62 | &alpha, 63 | A, 64 | m, 65 | x, 66 | m, 67 | &beta, 68 | y, 69 | m) ); 70 | #else 71 | checkCublasErrors( cublasSgemv(cublasHandle, CUBLAS_OP_T, 72 | m, n, 73 | &alpha, 74 | A, m, 75 | x, 1, 76 | &beta, 77 | y, 1) ); 78 | #endif 79 | }; 80 | 81 | #if defined(CUDA_VERSION) && (CUDA_VERSION > 7000) 82 | 83 | #if (CUDA_VERSION < 8000) 84 | #define CUDA_R_16F CUBLAS_DATA_HALF 85 | #endif 86 | void gemv(cublasHandle_t cublasHandle, int m, int n, float alpha, 87 | const half1 *A, const half1 *x, 88 | float beta, half1 *y) 89 | { 90 | checkCublasErrors( cublasSgemmEx ( cublasHandle, 91 | CUBLAS_OP_T, 92 | CUBLAS_OP_N, 93 | n, 94 | 1, 95 | m, 96 | &alpha, 97 | A, 98 | CUDA_R_16F, 99 | m, 100 | x, 101 | CUDA_R_16F, 102 | m, 103 | &beta, 104 | y, 105 | CUDA_R_16F, 106 | m) ); 107 | }; 108 | #endif 109 | 110 | #endif // _GEMV_H_ 111 | 
-------------------------------------------------------------------------------- /cudnn_samples_v6/mnistCUDNN/readme.txt: -------------------------------------------------------------------------------- 1 | This sample demonstrates how to use the cuDNN library to implement a forward pass 2 | given a trained network. 3 | 4 | The sample is based on the "Training LeNet on MNIST with Caffe" tutorial, located 5 | at http://caffe.berkeleyvision.org/. The network is identical with the exception 6 | of the addition of an LRN layer. All the network weights are obtained and exported 7 | using Caffe. 8 | 9 | Network layer topology: 10 | 11 | 1. Convolution 12 | 2. Pooling 13 | 3. Convolution 14 | 4. Pooling 15 | 5. Fully connected 16 | 6. Relu 17 | 7. LRN 18 | 8. Fully Connected 19 | 9. SoftMax 20 | 21 | By default, the sample will classify three images, located in the "data" directory 22 | using precomputed network weights: 23 | 1) Two convolution layers and their biases: conv1.bias.bin conv1.bin conv2.bias.bin conv2.bin 24 | 2) Two fully connected layers and their biases: ip1.bias.bin ip1.bin ip2.bias.bin ip2.bin 25 | 26 | Supported platforms: identical to cuDNN 27 | 28 | How to run: 29 | 30 | mnistCUDNN {<options>} 31 | help : display this help 32 | device=<int> : set the device to run the sample 33 | image=<name> : classify specific image 34 | 35 | New in version 3 release 36 | fp16 (three ways of conversion: on host, on device using cuDNN, on device using CUDA) 37 | Local Response Normalization (LRN) 38 | Find fastest config (cudnnFindConvolutionForwardAlgorithm) 39 | FFT convolution 40 | Demonstrate Nd API (first available in cuDNN v2) 41 | -------------------------------------------------------------------------------- /cudnn_samples_v7/RNN/Makefile: -------------------------------------------------------------------------------- 1 | # Location of the CUDA Toolkit 2 | CUDA_PATH ?= /usr/local/cuda 3 | 4 | # architecture 5 | HOST_ARCH := $(shell uname -m) 6 | TARGET_ARCH ?= $(HOST_ARCH) 7 | 8 | # Adjust this for
ARMv7 with a 32-bit filesystem 9 | ifeq ($(TARGET_ARCH), aarch64) 10 | ifeq ($(shell file /sbin/init | grep 32-bit), 1) 11 | TARGET_ARCH=armv7l 12 | endif 13 | endif 14 | 15 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) 16 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 17 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) 18 | TARGET_SIZE := 64 19 | else ifneq (,$(filter $(TARGET_ARCH),armv7l)) 20 | TARGET_SIZE := 32 21 | endif 22 | else 23 | TARGET_SIZE := $(shell getconf LONG_BIT) 24 | endif 25 | else 26 | $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 27 | endif 28 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 29 | ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) 30 | $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 31 | endif 32 | endif 33 | 34 | # operating system 35 | HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") 36 | TARGET_OS ?= $(HOST_OS) 37 | 38 | ifeq ($(TARGET_OS),QNX) 39 | TARGET_OS := qnx 40 | endif 41 | 42 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx QNX android)) 43 | $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) 44 | endif 45 | 46 | # host compiler 47 | ifeq ($(TARGET_OS),darwin) 48 | ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) 49 | HOST_COMPILER ?= clang++ 50 | endif 51 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 52 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) 53 | ifeq ($(TARGET_OS),linux) 54 | HOST_COMPILER ?= arm-linux-gnueabihf-g++ 55 | else ifeq ($(TARGET_OS),qnx) 56 | ifeq ($(QNX_HOST),) 57 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 58 | endif 59 | ifeq ($(QNX_TARGET),) 60 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 61 | endif 62 | export QNX_HOST 63 | export QNX_TARGET 64 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ 65 | else ifeq ($(TARGET_OS),android) 66 | HOST_COMPILER ?= arm-linux-androideabi-g++ 67 | endif 68 | else ifeq ($(TARGET_ARCH),aarch64) 69 | ifeq ($(TARGET_OS), linux) 70 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 71 | else ifeq ($(TARGET_OS), android) 72 | HOST_COMPILER ?= aarch64-linux-android-g++ 73 | endif 74 | else ifeq ($(TARGET_ARCH),ppc64le) 75 | HOST_COMPILER ?= powerpc64le-linux-gnu-g++ 76 | endif 77 | endif 78 | HOST_COMPILER ?= g++ 79 | NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) 80 | 81 | # internal flags 82 | NVCCFLAGS := -m${TARGET_SIZE} 83 | CCFLAGS := 84 | LDFLAGS := 85 | 86 | # build flags 87 | ifeq ($(TARGET_OS),darwin) 88 | LDFLAGS += -rpath $(CUDA_PATH)/lib 89 | CCFLAGS += -arch $(HOST_ARCH) 90 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) 91 | LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 92 | CCFLAGS += -mfloat-abi=hard 93 | else ifeq ($(TARGET_OS),android) 94 | LDFLAGS += -pie 95 | CCFLAGS += -fpie -fpic -fexceptions 96 | endif 97 | 98 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 99 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) 100 | ifneq ($(TARGET_FS),) 101 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 102 | ifeq ($(GCCVERSIONLTEQ46),1) 103 | CCFLAGS += --sysroot=$(TARGET_FS) 104 | endif 105 | LDFLAGS += --sysroot=$(TARGET_FS) 106 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib 
107 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib 108 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf 109 | endif 110 | endif 111 | endif 112 | 113 | # Debug build flags 114 | ifeq ($(dbg),1) 115 | NVCCFLAGS += -g -G 116 | BUILD_TYPE := debug 117 | else 118 | BUILD_TYPE := release 119 | endif 120 | 121 | ALL_CCFLAGS := 122 | ALL_CCFLAGS += $(NVCCFLAGS) 123 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) 124 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) 125 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) 126 | 127 | SAMPLE_ENABLED := 1 128 | 129 | ALL_LDFLAGS := 130 | ALL_LDFLAGS += $(ALL_CCFLAGS) 131 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) 132 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) 133 | 134 | # Common includes and paths for CUDA 135 | ifneq ($(TARGET_ARCH), ppc64le) 136 | INCLUDES := -I$(CUDA_PATH)/include 137 | else 138 | INCLUDES := -I$(CUDA_PATH)/targets/ppc64le-linux/include 139 | endif 140 | LIBRARIES := 141 | 142 | ################################################################################ 143 | 144 | # Gencode arguments 145 | SMS ?= 30 35 50 53 146 | 147 | ifeq ($(SMS),) 148 | $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) 149 | SAMPLE_ENABLED := 0 150 | endif 151 | 152 | ifeq ($(GENCODE_FLAGS),) 153 | # Generate SASS code for each SM architecture listed in $(SMS) 154 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 155 | 156 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility 157 | HIGHEST_SM := $(lastword $(sort $(SMS))) 158 | ifneq ($(HIGHEST_SM),) 159 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 160 | endif 161 | endif 162 | 163 | INCLUDES += -I. 164 | LIBRARIES += -L. 
-lcublas -lcudnn -lcudart -lstdc++ -lm 165 | 166 | ifeq ($(SAMPLE_ENABLED),0) 167 | EXEC ?= @echo "[@]" 168 | endif 169 | 170 | ################################################################################ 171 | 172 | # Target rules 173 | all: build 174 | 175 | build: RNN 176 | 177 | check.deps: 178 | ifeq ($(SAMPLE_ENABLED),0) 179 | @echo "Sample will be waived due to the above missing dependencies" 180 | else 181 | @echo "Sample is ready - all dependencies have been met" 182 | endif 183 | 184 | OBJ = RNN_example.o 185 | 186 | RNN: $(OBJ) 187 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 188 | 189 | %.o: %.cu 190 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 191 | 192 | run: build 193 | $(EXEC) ./RNN 100 4 512 64 2 194 | 195 | clean: 196 | rm -rf *o 197 | rm -rf RNN 198 | 199 | clobber: clean 200 | -------------------------------------------------------------------------------- /cudnn_samples_v7/RNN/compare.py: -------------------------------------------------------------------------------- 1 | #This script can compare the result files with the golden files and report the status: pass or failed\ 2 | #Usage: python compare_result.py results.txt golden.txt 3 | import os, sys, re 4 | 5 | patterns = ['{key1}\s+checksum\s+([.eE+0-9]+)\s+{key2}\s+checksum\s+([.eE+0-9]+)\s+{key3}\s+checksum\s+([.eE+0-9]+)', #3 similar keys as below each line 6 | '{key1}\s+checksum\s+([.eE+0-9]+)\s+{key2}\s+checksum\s+([.eE+0-9]+)', #2 similar keys as below each line 7 | '{key}\s+checksum\s+([.eE+0-9]+)', #one key each line: di checksum 6.676003E+01 8 | '{key}[: ]+([0-9]+)\s+GFLOPS[, ]+\\(([0-9]+)\s+GFLOPS\\)[, ]+\\(([0-9]+)\s+GFLOPS\\)', #1 key each line with more returns 9 | '{key}[: ]+([0-9]+)\s+GFLOPS'] #one key each line: Forward: 673 GFLOPS 10 | #keys = [('i', 'c', 'h'), ('di', 'dc', 'dh'), ('i', 'h'), ('di', 'dh'), 'dw', 'Backward', 'Forward'] 11 | keys = [('i', 'c', 'h'), ('di', 'dc', 'dh'), ('i', 'h'), ('di', 'dh'), 'dw'] # 
# skip the last 2 targets
pats = [0,0,1,1,2,3,4]
datnum = [len(k) if isinstance(k, tuple) else (3 if k == 'Backward' else 1) for k in keys]
#tol = 1.0e-3
def compare_results(ftarget, fgolden):
    """Compare the values parsed from a result file against a golden file.

    ftarget -- path of the result file produced by the sample.
    fgolden -- path of the golden file; its TOLERANCE section supplies, per
               key, the error type ('rel' or 'abs') and the allowed error.

    Prints one PASSED/FAILED line per compared value and 'ALL PASSED' when
    nothing failed.  Returns 0 on success and 1 if any value failed.
    """
    assert ftarget and fgolden, 'No enough input files given!'
    # Single pre-formatted argument so the output is the same on Python 2 and 3.
    print('%s %s' % (ftarget, fgolden))
    targ, _ = get_results_from_file(ftarget)
    golden, tol = get_results_from_file(fgolden, golden=True)

    ret = 0
    assert targ and golden, 'targets or golen results not generated!'
    for k, vals in golden.items():
        # A key missing from the target results or from the tolerance table
        # is reported as a failure instead of aborting with a KeyError.
        if k not in targ:
            print('FAILED %s missing from target results' % k)
            ret = 1
            continue
        if k not in tol:
            print('FAILED %s has no tolerance in golden file' % k)
            ret = 1
            continue

        tvals = targ[k]
        if not isinstance(vals, list):
            # Single-valued entry: wrap both sides so the loop below is uniform.
            vals = [vals]
            tvals = [tvals]
        tol_type = tol[k]['type']
        tol_i = tol[k]['val']
        for idx, v in enumerate(vals):
            tval = float(tvals[idx])
            gval = float(v)
            err = None
            if tol_type == 'rel':
                # Scale by the larger magnitude so the relative error stays
                # in [0, 2] for any signs; two exact zeros compare equal
                # rather than dividing by zero.
                scale = max(abs(gval), abs(tval))
                err = abs(tval - gval) / scale if scale else 0.0
            elif tol_type == 'abs':
                err = abs(tval - gval)
            assert err is not None, 'Error is Empty!'
            #print 'k,t,g,err',k,tval, gval, err
            if err > tol_i:
                print('FAILED %s=%s Error: %.2e vs. golden (%s) with tol (%.2e)'%(k, tvals[idx], err, v, tol_i))
                ret = 1
            else:
                print('PASSED %s=%s Error: %.2e vs. golden (%s) with tol (%.2e)'%(k, tvals[idx], err, v, tol_i))
    if ret == 0:
        print('ALL PASSED')
    return ret

def _get_tolerance_line(line):
    """get a data item for a tolerance line with format (each line only one item):
       i: type=rel, 1e-3

    Returns a one-entry dict {key: {'type': <str>, 'val': <float>}}.
    """
    assert line, 'Empty line!'
    # Drop all blanks first so 'type = rel , 1e-3' parses like 'type=rel,1e-3'.
    line = line.strip().replace(' ','')
    stmp = line.split(':')
    key = stmp[0]
    _type, _val = stmp[1].split(',')
    _type = _type.split('=')[-1]
    tol={key:{'type':_type, 'val':float(_val)}}
    return tol

def get_results_from_file(fname, golden=False):
    assert fname, 'No file name given!'
62 | ret = {} 63 | tol = {} 64 | is_tolerance = False 65 | with open(fname, 'r') as fin: 66 | lines = fin.readlines() 67 | if len(lines) == 1: 68 | lines = lines[0].split('\r') 69 | for idx, line in enumerate(lines): 70 | line = line.strip() 71 | if not line: 72 | continue 73 | val = get_valpat_line(line) 74 | if val: 75 | ret = dict(ret, **val) 76 | if golden: 77 | if 'TOLERANCE' in line: # the next line is the tol value 78 | is_tolerance = True 79 | elif is_tolerance: 80 | _tol = _get_tolerance_line(line) 81 | tol = dict(tol, **_tol) 82 | 83 | return ret, tol 84 | 85 | def get_valpat_line(line): 86 | for idx, key in enumerate(keys): 87 | Ndat = datnum[idx] 88 | if isinstance(key, tuple): 89 | format_expr = {} 90 | for j in range(Ndat): 91 | format_expr['key%d'%(j+1)] = keys[idx][j] 92 | ret = re.search(patterns[pats[idx]].format(**format_expr), line) 93 | if ret: 94 | vals = {} 95 | for j in range(Ndat): 96 | vals[key[j]] = ret.group(j+1) 97 | return vals 98 | else: 99 | ret = re.search(patterns[pats[idx]].format(key=key), line) 100 | if ret: 101 | if Ndat >1: 102 | #print Ndat, key, datnum, idx 103 | return {key:[ret.group(j+1) for j in range(Ndat)]} 104 | else: 105 | return {key:ret.group(1)} 106 | return None 107 | 108 | def str_test(): 109 | s='Forward: 673 GFLOPS' 110 | s1='Backward: 835 GFLOPS, (654 GFLOPS), (1155 GFLOPS)' 111 | s2='i checksum 1.315793E+06 h checksum 1.315212E+05' 112 | s3='di checksum 6.676003E+01 dh checksum 6.425050E+01' 113 | s4='dw checksum 1.453750E+09' 114 | print get_valpat_line(s1) 115 | print get_valpat_line(s) 116 | print get_valpat_line(s2) 117 | print get_valpat_line(s3) 118 | print get_valpat_line(s4) 119 | if __name__ == '__main__': 120 | #str_test() 121 | #print get_results_from_file('results.txt') 122 | #print get_results_from_file('golden.txt', golden=True) 123 | sys.exit(compare_results(sys.argv[1], sys.argv[2])) 124 | 125 | 126 | -------------------------------------------------------------------------------- 
/cudnn_samples_v7/RNN/golden_1.txt: -------------------------------------------------------------------------------- 1 | ------------GOLDEN------------ 2 | Forward: 1250 GFLOPS 3 | Backward: 1896 GFLOPS, (1299 GFLOPS), (3511 GFLOPS) 4 | i checksum 1.315793E+06 h checksum 1.315212E+05 5 | di checksum 6.676003E+01 dh checksum 6.425050E+01 6 | dw checksum 1.453750E+09 7 | -----------TOLERANCE----------- 8 | Forward: type=rel, 1 9 | Backward: type=rel, 1 10 | i: type=rel, 1e-3 11 | h: type=rel, 1e-3 12 | di: type=rel, 1e-3 13 | dh: type=rel, 1e-3 14 | dw: type=rel, 1e-3 15 | 16 | -------------------------------------------------------------------------------- /cudnn_samples_v7/RNN/golden_2.txt: -------------------------------------------------------------------------------- 1 | ------------GOLDEN------------ 2 | Forward: 1225 GFLOPS 3 | Backward: 1910 GFLOPS, (1299 GFLOPS), (3601 GFLOPS) 4 | i checksum 6.319591E+05 h checksum 6.319605E+04 5 | di checksum 4.501830E+00 dh checksum 4.489543E+00 6 | dw checksum 5.012598E+07 7 | -----------TOLERANCE----------- 8 | Forward: type=rel, 1 9 | Backward: type=rel, 1 10 | i: type=rel, 1e-3 11 | h: type=rel, 1e-3 12 | di: type=rel, 1e-3 13 | dh: type=rel, 1e-3 14 | dw: type=rel, 1e-3 15 | 16 | -------------------------------------------------------------------------------- /cudnn_samples_v7/RNN/golden_3.txt: -------------------------------------------------------------------------------- 1 | ------------GOLDEN------------ 2 | Forward: 2569 GFLOPS 3 | Backward: 2654 GFLOPS, (2071 GFLOPS), (3694 GFLOPS) 4 | i checksum 5.749536E+05 c checksum 4.365091E+05 h checksum 5.774818E+04 5 | di checksum 3.842206E+02 dc checksum 9.323785E+03 dh checksum 1.182562E+01 6 | dw checksum 4.313461E+08 7 | -----------TOLERANCE----------- 8 | Forward: type=rel, 1 9 | Backward: type=rel, 1 10 | i: type=rel, 1e-3 11 | h: type=rel, 1e-3 12 | c: type=rel, 1e-3 13 | dc: type=rel, 1e-3 14 | di: type=rel, 1e-3 15 | dh: type=rel, 1e-3 16 | dw: type=rel, 1e-3 17 
| 18 | -------------------------------------------------------------------------------- /cudnn_samples_v7/RNN/golden_4.txt: -------------------------------------------------------------------------------- 1 | ------------GOLDEN------------ 2 | Forward: 2310 GFLOPS 3 | Backward: 2536 GFLOPS, (1955 GFLOPS), (3606 GFLOPS) 4 | i checksum 6.358978E+05 h checksum 6.281680E+04 5 | di checksum 6.296622E+00 dh checksum 2.289960E+05 6 | dw checksum 5.397419E+07 7 | -----------TOLERANCE----------- 8 | Forward: type=rel, 1 9 | Backward: type=rel, 1 10 | i: type=rel, 1e-3 11 | h: type=rel, 1e-3 12 | di: type=rel, 1e-3 13 | dh: type=rel, 1e-3 14 | dw: type=rel, 1e-3 15 | 16 | -------------------------------------------------------------------------------- /cudnn_samples_v7/conv_sample/Makefile: -------------------------------------------------------------------------------- 1 | # Location of the CUDA Toolkit 2 | CUDA_PATH ?= /usr/local/cuda 3 | 4 | # architecture 5 | HOST_ARCH := $(shell uname -m) 6 | TARGET_ARCH ?= $(HOST_ARCH) 7 | 8 | # Adjust this for ARMv7 with a 32-bit filesystem 9 | ifeq ($(TARGET_ARCH), aarch64) 10 | ifeq ($(shell file /sbin/init | grep 32-bit), 1) 11 | TARGET_ARCH=armv7l 12 | endif 13 | endif 14 | 15 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) 16 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 17 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) 18 | TARGET_SIZE := 64 19 | else ifneq (,$(filter $(TARGET_ARCH),armv7l)) 20 | TARGET_SIZE := 32 21 | endif 22 | else 23 | TARGET_SIZE := $(shell getconf LONG_BIT) 24 | endif 25 | else 26 | $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 27 | endif 28 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 29 | ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) 30 | $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 
31 | endif 32 | endif 33 | 34 | # operating system 35 | HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") 36 | TARGET_OS ?= $(HOST_OS) 37 | 38 | ifeq ($(TARGET_OS),QNX) 39 | override TARGET_OS := qnx 40 | endif 41 | 42 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) 43 | $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) 44 | endif 45 | 46 | # host compiler 47 | ifeq ($(TARGET_OS),darwin) 48 | ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) 49 | HOST_COMPILER ?= clang++ 50 | endif 51 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 52 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) 53 | ifeq ($(TARGET_OS),linux) 54 | HOST_COMPILER ?= arm-linux-gnueabihf-g++ 55 | else ifeq ($(TARGET_OS),qnx) 56 | ifeq ($(QNX_HOST),) 57 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 58 | endif 59 | ifeq ($(QNX_TARGET),) 60 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 61 | endif 62 | export QNX_HOST 63 | export QNX_TARGET 64 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ 65 | else ifeq ($(TARGET_OS),android) 66 | HOST_COMPILER ?= arm-linux-androideabi-g++ 67 | endif 68 | else ifeq ($(TARGET_ARCH),aarch64) 69 | ifeq ($(TARGET_OS), linux) 70 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 71 | else ifeq ($(TARGET_OS), android) 72 | HOST_COMPILER ?= aarch64-linux-android-g++ 73 | endif 74 | else ifeq ($(TARGET_ARCH),ppc64le) 75 | HOST_COMPILER ?= powerpc64le-linux-gnu-g++ 76 | endif 77 | endif 78 | HOST_COMPILER ?= g++ 79 | NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) 80 | 81 | # internal flags 82 | NVCCFLAGS := -m${TARGET_SIZE} 83 | CCFLAGS := 84 | LDFLAGS := 85 | 86 | # build flags 87 | ifeq ($(TARGET_OS),darwin) 88 | LDFLAGS += -rpath $(CUDA_PATH)/lib 89 | CCFLAGS += -arch $(HOST_ARCH) 90 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) 91 | LDFLAGS += 
--dynamic-linker=/lib/ld-linux-armhf.so.3 92 | CCFLAGS += -mfloat-abi=hard 93 | else ifeq ($(TARGET_OS),android) 94 | LDFLAGS += -pie 95 | CCFLAGS += -fpie -fpic -fexceptions 96 | endif 97 | 98 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 99 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) 100 | ifneq ($(TARGET_FS),) 101 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 102 | ifeq ($(GCCVERSIONLTEQ46),1) 103 | CCFLAGS += --sysroot=$(TARGET_FS) 104 | endif 105 | LDFLAGS += --sysroot=$(TARGET_FS) 106 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib 107 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib 108 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf 109 | endif 110 | endif 111 | endif 112 | 113 | # Debug build flags 114 | ifeq ($(dbg),1) 115 | NVCCFLAGS += -g -G 116 | BUILD_TYPE := debug 117 | else 118 | BUILD_TYPE := release 119 | endif 120 | 121 | ALL_CCFLAGS := 122 | ALL_CCFLAGS += $(NVCCFLAGS) 123 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) 124 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) 125 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) 126 | 127 | SAMPLE_ENABLED := 1 128 | 129 | ALL_LDFLAGS := 130 | ALL_LDFLAGS += $(ALL_CCFLAGS) 131 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) 132 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) 133 | 134 | # Common includes and paths for CUDA 135 | ifneq ($(TARGET_ARCH), ppc64le) 136 | INCLUDES := -I$(CUDA_PATH)/include 137 | else 138 | INCLUDES := -I$(CUDA_PATH)/targets/ppc64le-linux/include 139 | endif 140 | LIBRARIES := 141 | 142 | ################################################################################ 143 | 144 | # Gencode arguments 145 | #$(warning "print cuda path $(CUDA_PATH)") 146 | 147 | ifneq ($(TARGET_ARCH), ppc64le) 148 | CUDA_VERSION := $(shell cat $(CUDA_PATH)/include/cuda.h |grep "define CUDA_VERSION" |awk '{print $$3}') 149 | else 150 | CUDA_VERSION := $(shell cat $(CUDA_PATH)/targets/ppc64le-linux/include/cuda.h |grep "define CUDA_VERSION" |awk 
'{print $$3}') 151 | endif 152 | #$(warning "print cuda version $(CUDA_VERSION)") 153 | 154 | ifeq ($(CUDA_VERSION),8000 ) 155 | SMS_VOLTA = 156 | else 157 | ifneq ($(TARGET_ARCH), ppc64le) 158 | ifeq ($(CUDA_VERSION),9000 ) 159 | SMS_VOLTA ?= 70 160 | else 161 | SMS_VOLTA ?= 70 72 162 | endif 163 | else 164 | SMS_VOLTA ?= 70 165 | endif 166 | endif 167 | #$(warning "print sms_volta $(SMS_VOLTA)") 168 | 169 | SMS ?= 30 35 50 53 60 61 $(SMS_VOLTA) 170 | $(warning "print CUDA version $(CUDA_VERSION)") 171 | $(warning "print sms new $(SMS)") 172 | 173 | 174 | ifeq ($(SMS),) 175 | $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) 176 | SAMPLE_ENABLED := 0 177 | endif 178 | 179 | ifeq ($(GENCODE_FLAGS),) 180 | # Generate SASS code for each SM architecture listed in $(SMS) 181 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 182 | 183 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility 184 | HIGHEST_SM := $(lastword $(sort $(SMS))) 185 | ifneq ($(HIGHEST_SM),) 186 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 187 | endif 188 | endif 189 | 190 | #INCLUDES += -IFreeImage/include 191 | LIBRARIES += -lcudart -lcublas -lcudnn -lstdc++ -lm 192 | 193 | ifeq ($(SAMPLE_ENABLED),0) 194 | EXEC ?= @echo "[@]" 195 | endif 196 | 197 | ################################################################################ 198 | 199 | # Target rules 200 | all: build 201 | 202 | build: conv_sample 203 | 204 | check.deps: 205 | ifeq ($(SAMPLE_ENABLED),0) 206 | @echo "Sample will be waived due to the above missing dependencies" 207 | else 208 | @echo "Sample is ready - all dependencies have been met" 209 | endif 210 | 211 | OBJ = fp16_dev.o fp16_emu.o conv_sample.o 212 | 213 | conv_sample: $(OBJ) 214 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 215 | 216 | %.o: %.cpp 217 | $(EXEC) $(HOST_COMPILER) $(INCLUDES) 
$(CCFLAGS) $(EXTRA_CCFLAGS) -o $@ -c $< 218 | 219 | %.o: %.cu 220 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 221 | 222 | run: build 223 | $(EXEC) ./conv_sample 224 | 225 | clean: 226 | rm -rf *o 227 | rm -rf conv_sample 228 | 229 | clobber: clean 230 | -------------------------------------------------------------------------------- /cudnn_samples_v7/conv_sample/config_fermi_islip.icnt: -------------------------------------------------------------------------------- 1 | //21*1 fly with 32 flits per packet under gpgpusim injection mode 2 | use_map = 0; 3 | flit_size = 40; 4 | 5 | // currently we do not use this, see subnets below 6 | network_count = 2; 7 | 8 | // Topology 9 | topology = fly; 10 | k = 52; 11 | n = 1; 12 | 13 | // Routing 14 | 15 | routing_function = dest_tag; 16 | 17 | // Flow control 18 | 19 | num_vcs = 1; 20 | vc_buf_size = 64; 21 | input_buffer_size = 256; 22 | ejection_buffer_size = 64; 23 | boundary_buffer_size = 64; 24 | 25 | wait_for_tail_credit = 0; 26 | 27 | // Router architecture 28 | 29 | vc_allocator = islip; //separable_input_first; 30 | sw_allocator = islip; //separable_input_first; 31 | alloc_iters = 1; 32 | 33 | credit_delay = 0; 34 | routing_delay = 0; 35 | vc_alloc_delay = 1; 36 | sw_alloc_delay = 1; 37 | 38 | input_speedup = 1; 39 | output_speedup = 1; 40 | internal_speedup = 2.0; 41 | 42 | // Traffic, GPGPU-Sim does not use this 43 | 44 | traffic = uniform; 45 | packet_size ={{1,2,3,4},{10,20}}; 46 | packet_size_rate={{1,1,1,1},{2,1}}; 47 | 48 | // Simulation - Don't change 49 | 50 | sim_type = gpgpusim; 51 | //sim_type = latency; 52 | injection_rate = 0.1; 53 | 54 | subnets = 2; 55 | 56 | // Always use read and write no matter following line 57 | //use_read_write = 1; 58 | 59 | 60 | read_request_subnet = 0; 61 | read_reply_subnet = 1; 62 | write_request_subnet = 0; 63 | write_reply_subnet = 1; 64 | 65 | read_request_begin_vc = 0; 66 | read_request_end_vc = 0; 67 | write_request_begin_vc = 0; 68 | 
write_request_end_vc = 0; 69 | read_reply_begin_vc = 0; 70 | read_reply_end_vc = 0; 71 | write_reply_begin_vc = 0; 72 | write_reply_end_vc = 0; 73 | 74 | -------------------------------------------------------------------------------- /cudnn_samples_v7/conv_sample/error_util.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | #if !defined(_ERROR_UTIL_H_) 13 | #define _ERROR_UTIL_H_ 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #define TOSTR_(s) #s 21 | #define TOSTR(s) TOSTR_(s) 22 | #if defined(__GNUC__) 23 | #define COMPILER_NAME "GCC" 24 | #define COMPILER_VER TOSTR(__GNUC__) "." TOSTR(__GNUC_MINOR__) "." TOSTR(__GNUC_PATCHLEVEL__) 25 | #elif defined(_MSC_VER) 26 | #if _MSC_VER < 1500 27 | #define COMPILER_NAME "MSVC_2005" 28 | #elif _MSC_VER < 1600 29 | #define COMPILER_NAME "MSVC_2008" 30 | #elif _MSC_VER < 1700 31 | #define COMPILER_NAME "MSVC_2010" 32 | #elif _MSC_VER < 1800 33 | #define COMPILER_NAME "MSVC_2012" 34 | #elif _MSC_VER < 1900 35 | #define COMPILER_NAME "MSVC_2013" 36 | #elif _MSC_VER < 2000 37 | #define COMPILER_NAME "MSVC_2014" 38 | #else 39 | #define COMPILER_NAME "MSVC" 40 | #endif 41 | #define COMPILER_VER TOSTR(_MSC_FULL_VER) "." TOSTR(_MSC_BUILD) 42 | #elif defined(__clang_major__) 43 | #define COMPILER_NAME "CLANG" 44 | #define COMPILER_VER TOSTR(__clang_major__ ) "." TOSTR(__clang_minor__) "." TOSTR(__clang_patchlevel__) 45 | #elif defined(__INTEL_COMPILER) 46 | #define COMPILER_NAME "ICC" 47 | #define COMPILER_VER TOSTR(__INTEL_COMPILER) "." 
TOSTR(__INTEL_COMPILER_BUILD_DATE) 48 | #else 49 | #define COMPILER_NAME "unknown" 50 | #define COMPILER_VER "???" 51 | #endif 52 | 53 | #define CUDNN_VERSION_STR TOSTR(CUDNN_MAJOR) "." TOSTR (CUDNN_MINOR) "." TOSTR(CUDNN_PATCHLEVEL) 54 | 55 | #define FatalError(s) { \ 56 | std::stringstream _where, _message; \ 57 | _where << __FILE__ << ':' << __LINE__; \ 58 | _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__;\ 59 | std::cerr << _message.str() << "\nAborting...\n"; \ 60 | cudaDeviceReset(); \ 61 | exit(EXIT_FAILURE); \ 62 | } 63 | 64 | #define checkCUDNN(status) { \ 65 | std::stringstream _error; \ 66 | if (status != CUDNN_STATUS_SUCCESS) { \ 67 | _error << "CUDNN failure\nError: " << cudnnGetErrorString(status); \ 68 | FatalError(_error.str()); \ 69 | } \ 70 | } 71 | 72 | #define checkCudaErrors(status) { \ 73 | std::stringstream _error; \ 74 | if (status != 0) { \ 75 | _error << "Cuda failure\nError: " << cudaGetErrorString(status); \ 76 | FatalError(_error.str()); \ 77 | } \ 78 | } 79 | 80 | #define checkCublasErrors(status) { \ 81 | std::stringstream _error; \ 82 | if (status != 0) { \ 83 | _error << "Cublas failure\nError code " << status; \ 84 | FatalError(_error.str()); \ 85 | } \ 86 | } 87 | 88 | // CUDA Utility Helper Functions 89 | 90 | static void showDevices( void ) 91 | { 92 | int totalDevices; 93 | checkCudaErrors(cudaGetDeviceCount( &totalDevices )); 94 | printf("\nThere are %d CUDA capable devices on your machine :\n", totalDevices); 95 | for (int i=0; i< totalDevices; i++) { 96 | struct cudaDeviceProp prop; 97 | checkCudaErrors(cudaGetDeviceProperties( &prop, i )); 98 | printf( "device %d : sms %2d Capabilities %d.%d, SmClock %.1f Mhz, MemSize (Mb) %d, MemClock %.1f Mhz, Ecc=%d, boardGroupID=%d\n", 99 | i, prop.multiProcessorCount, prop.major, prop.minor, 100 | (float)prop.clockRate*1e-3, 101 | (int)(prop.totalGlobalMem/(1024*1024)), 102 | (float)prop.memoryClockRate*1e-3, 103 | prop.ECCEnabled, 104 | prop.multiGpuBoardGroupID); 105 
| } 106 | } 107 | 108 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 109 | #ifndef _CRT_SECURE_NO_DEPRECATE 110 | #define _CRT_SECURE_NO_DEPRECATE 111 | #endif 112 | #ifndef STRNCASECMP 113 | #define STRNCASECMP _strnicmp 114 | #endif 115 | #else // Linux Includes 116 | #include 117 | #include 118 | #ifndef STRNCASECMP 119 | #define STRNCASECMP strncasecmp 120 | #endif 121 | #endif 122 | inline int stringRemoveDelimiter(char delimiter, const char *string) 123 | { 124 | int string_start = 0; 125 | 126 | while (string[string_start] == delimiter) 127 | { 128 | string_start++; 129 | } 130 | 131 | if (string_start >= (int)strlen(string)-1) 132 | { 133 | return 0; 134 | } 135 | 136 | return string_start; 137 | } 138 | 139 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) 140 | { 141 | bool bFound = false; 142 | 143 | if (argc >= 1) 144 | { 145 | for (int i=1; i < argc; i++) 146 | { 147 | int string_start = stringRemoveDelimiter('-', argv[i]); 148 | const char *string_argv = &argv[i][string_start]; 149 | 150 | const char *equal_pos = strchr(string_argv, '='); 151 | int argv_length = (int)(equal_pos == 0 ? 
strlen(string_argv) : equal_pos - string_argv); 152 | 153 | int length = (int)strlen(string_ref); 154 | 155 | if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) 156 | { 157 | bFound = true; 158 | continue; 159 | } 160 | } 161 | } 162 | 163 | return bFound; 164 | } 165 | 166 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) 167 | { 168 | bool bFound = false; 169 | int value = -1; 170 | 171 | if (argc >= 1) 172 | { 173 | for (int i=1; i < argc; i++) 174 | { 175 | int string_start = stringRemoveDelimiter('-', argv[i]); 176 | const char *string_argv = &argv[i][string_start]; 177 | int length = (int)strlen(string_ref); 178 | 179 | if (!STRNCASECMP(string_argv, string_ref, length)) 180 | { 181 | if (length+1 <= (int)strlen(string_argv)) 182 | { 183 | int auto_inc = (string_argv[length] == '=') ? 1 : 0; 184 | value = atoi(&string_argv[length + auto_inc]); 185 | } 186 | else 187 | { 188 | value = 0; 189 | } 190 | 191 | bFound = true; 192 | continue; 193 | } 194 | } 195 | } 196 | 197 | if (bFound) 198 | { 199 | return value; 200 | } 201 | else 202 | { 203 | printf("Not found int\n"); 204 | return 0; 205 | } 206 | } 207 | 208 | inline bool getCmdLineArgumentString(const int argc, const char **argv, 209 | const char *string_ref, char **string_retval) 210 | { 211 | bool bFound = false; 212 | 213 | if (argc >= 1) 214 | { 215 | for (int i=1; i < argc; i++) 216 | { 217 | int string_start = stringRemoveDelimiter('-', argv[i]); 218 | char *string_argv = (char *)&argv[i][string_start]; 219 | int length = (int)strlen(string_ref); 220 | 221 | if (!STRNCASECMP(string_argv, string_ref, length)) 222 | { 223 | *string_retval = &string_argv[length+1]; 224 | bFound = true; 225 | continue; 226 | } 227 | } 228 | } 229 | 230 | if (!bFound) 231 | { 232 | *string_retval = NULL; 233 | } 234 | 235 | return bFound; 236 | } 237 | 238 | #endif // _ERROR_UTIL_H_ 
--------------------------------------------------------------------------------
/cudnn_samples_v7/conv_sample/fp16_dev.cu:
--------------------------------------------------------------------------------
/**
 * Copyright 2014 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

#include "error_util.h"
#include "fp16_dev.h"

#define BLOCK_SIZE 128

/**
 * Convert buffIn[idx] to half precision (round-to-nearest) and store it in
 * buffOut[idx], one element per thread; threads with idx >= size do nothing.
 * The index is computed as BLOCK_SIZE * blockIdx.x + threadIdx.x, so the
 * kernel must be launched with BLOCK_SIZE threads per block.
 */
template <class value_type>
__global__ void float2half_rn_kernel(int size, const value_type *buffIn, half1 *buffOut)
{
    const int idx = BLOCK_SIZE*blockIdx.x+threadIdx.x;
    if (idx >= size) {
        return;
    }
#if CUDART_VERSION < 9000
    half1 val;
    val.x = __float2half_rn(float(buffIn[idx]));
#else
    half1 val = __float2half_rn(float(buffIn[idx]));
#endif
    buffOut[idx] = val;
}

/**
 * Convert `size` elements of buffIn to half precision into buffOut by
 * launching float2half_rn_kernel and waiting for it to finish.
 * Both pointers are handed straight to the kernel.
 * Does nothing when size <= 0.
 */
template <class value_type>
void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut)
{
    // A launch with a non-positive block count is an invalid configuration.
    if (size <= 0) {
        return;
    }

    const int grid_size = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
    float2half_rn_kernel<value_type><<<grid_size, BLOCK_SIZE>>> (size, buffIn, buffOut);

    // Statuses are stored before checking: checkCudaErrors() expands its
    // argument more than once, and a second cudaGetLastError() would report
    // success because the first call already cleared the error.
    const cudaError_t launchStatus = cudaGetLastError();
    checkCudaErrors(launchStatus);
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    checkCudaErrors(syncStatus);
}

template void gpu_float2half_rn<float> (int, const float*, half1*);
template void gpu_float2half_rn<double> (int, const double*, half1*);

--------------------------------------------------------------------------------
/cudnn_samples_v7/conv_sample/fp16_dev.h:
--------------------------------------------------------------------------------
/**
 * Copyright 2014 NVIDIA Corporation.  All rights reserved.
3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | #if !defined(_FP16_DEV_H_) 13 | #define _FP16_DEV_H_ 14 | 15 | #include "fp16_emu.h" 16 | 17 | template 18 | void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut); 19 | 20 | #endif // _FP16_DEV_H_ 21 | 22 | -------------------------------------------------------------------------------- /cudnn_samples_v7/conv_sample/fp16_emu.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO LICENSEE: 5 | * 6 | * This source code and/or documentation ("Licensed Deliverables") are 7 | * subject to NVIDIA intellectual property rights under U.S. and 8 | * international Copyright laws. 9 | * 10 | * These Licensed Deliverables contained herein is PROPRIETARY and 11 | * CONFIDENTIAL to NVIDIA and is being provided under the terms and 12 | * conditions of a form of NVIDIA software license agreement by and 13 | * between NVIDIA and Licensee ("License Agreement") or electronically 14 | * accepted by Licensee. Notwithstanding any terms or conditions to 15 | * the contrary in the License Agreement, reproduction or disclosure 16 | * of the Licensed Deliverables to any third party without the express 17 | * written consent of NVIDIA is prohibited. 18 | * 19 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 20 | * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE 21 | * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS 22 | * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
23 | * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED 24 | * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 25 | * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 26 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 27 | * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY 28 | * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY 29 | * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 30 | * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 31 | * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 32 | * OF THESE LICENSED DELIVERABLES. 33 | * 34 | * U.S. Government End Users. These Licensed Deliverables are a 35 | * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 36 | * 1995), consisting of "commercial computer software" and "commercial 37 | * computer software documentation" as such terms are used in 48 38 | * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government 39 | * only as a commercial end item. Consistent with 48 C.F.R.12.212 and 40 | * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all 41 | * U.S. Government End Users acquire the Licensed Deliverables with 42 | * only those rights set forth herein. 43 | * 44 | * Any use of the Licensed Deliverables in individual and commercial 45 | * software must include, in the user documentation and internal 46 | * comments to the code, the above Disclaimer and U.S. Government End 47 | * Users Notice. 48 | */ 49 | 50 | #include "fp16_emu.h" 51 | 52 | #define STATIC_ASSERT(cond) do { typedef char compile_time_assert[(cond) ? 
1 : -1]; } while (0) 53 | 54 | // Host functions for converting between FP32 and FP16 formats 55 | // Paulius Micikevicius (pauliusm@nvidia.com) 56 | 57 | half1 cpu_float2half_rn(float f) 58 | { 59 | unsigned x = *((int*)(void*)(&f)); 60 | unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; 61 | unsigned sign, exponent, mantissa; 62 | 63 | __half_raw hr; 64 | 65 | // Get rid of +NaN/-NaN case first. 66 | if (u > 0x7f800000) { 67 | hr.x = 0x7fffU; 68 | return reinterpret_cast(hr); 69 | } 70 | 71 | sign = ((x >> 16) & 0x8000); 72 | 73 | // Get rid of +Inf/-Inf, +0/-0. 74 | if (u > 0x477fefff) { 75 | hr.x = sign | 0x7c00U; 76 | return reinterpret_cast(hr); 77 | } 78 | if (u < 0x33000001) { 79 | hr.x = sign | 0x0000U; 80 | return reinterpret_cast(hr); 81 | } 82 | 83 | exponent = ((u >> 23) & 0xff); 84 | mantissa = (u & 0x7fffff); 85 | 86 | if (exponent > 0x70) { 87 | shift = 13; 88 | exponent -= 0x70; 89 | } else { 90 | shift = 0x7e - exponent; 91 | exponent = 0; 92 | mantissa |= 0x800000; 93 | } 94 | lsb = (1 << shift); 95 | lsb_s1 = (lsb >> 1); 96 | lsb_m1 = (lsb - 1); 97 | 98 | // Round to nearest even. 99 | remainder = (mantissa & lsb_m1); 100 | mantissa >>= shift; 101 | if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { 102 | ++mantissa; 103 | if (!(mantissa & 0x3ff)) { 104 | ++exponent; 105 | mantissa = 0; 106 | } 107 | } 108 | 109 | hr.x = (sign | (exponent << 10) | mantissa); 110 | 111 | return reinterpret_cast(hr); 112 | } 113 | 114 | 115 | float cpu_half2float(half1 h) 116 | { 117 | STATIC_ASSERT(sizeof(int) == sizeof(float)); 118 | 119 | __half_raw hr = reinterpret_cast<__half_raw&>(h); 120 | 121 | unsigned sign = ((hr.x >> 15) & 1); 122 | unsigned exponent = ((hr.x >> 10) & 0x1f); 123 | unsigned mantissa = ((hr.x & 0x3ff) << 13); 124 | 125 | if (exponent == 0x1f) { /* NaN or Inf */ 126 | mantissa = (mantissa ? 
(sign = 0, 0x7fffff) : 0); 127 | exponent = 0xff; 128 | } else if (!exponent) { /* Denorm or Zero */ 129 | if (mantissa) { 130 | unsigned int msb; 131 | exponent = 0x71; 132 | do { 133 | msb = (mantissa & 0x400000); 134 | mantissa <<= 1; /* normalize */ 135 | --exponent; 136 | } while (!msb); 137 | mantissa &= 0x7fffff; /* 1.mantissa is implicit */ 138 | } 139 | } else { 140 | exponent += 0x70; 141 | } 142 | 143 | int temp = ((sign << 31) | (exponent << 23) | mantissa); 144 | 145 | return reinterpret_cast(temp); 146 | } 147 | 148 | -------------------------------------------------------------------------------- /cudnn_samples_v7/conv_sample/fp16_emu.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO LICENSEE: 5 | * 6 | * This source code and/or documentation ("Licensed Deliverables") are 7 | * subject to NVIDIA intellectual property rights under U.S. and 8 | * international Copyright laws. 9 | * 10 | * These Licensed Deliverables contained herein is PROPRIETARY and 11 | * CONFIDENTIAL to NVIDIA and is being provided under the terms and 12 | * conditions of a form of NVIDIA software license agreement by and 13 | * between NVIDIA and Licensee ("License Agreement") or electronically 14 | * accepted by Licensee. Notwithstanding any terms or conditions to 15 | * the contrary in the License Agreement, reproduction or disclosure 16 | * of the Licensed Deliverables to any third party without the express 17 | * written consent of NVIDIA is prohibited. 18 | * 19 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 20 | * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE 21 | * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS 22 | * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
23 | * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED 24 | * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 25 | * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 26 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 27 | * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY 28 | * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY 29 | * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 30 | * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 31 | * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 32 | * OF THESE LICENSED DELIVERABLES. 33 | * 34 | * U.S. Government End Users. These Licensed Deliverables are a 35 | * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 36 | * 1995), consisting of "commercial computer software" and "commercial 37 | * computer software documentation" as such terms are used in 48 38 | * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government 39 | * only as a commercial end item. Consistent with 48 C.F.R.12.212 and 40 | * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all 41 | * U.S. Government End Users acquire the Licensed Deliverables with 42 | * only those rights set forth herein. 43 | * 44 | * Any use of the Licensed Deliverables in individual and commercial 45 | * software must include, in the user documentation and internal 46 | * comments to the code, the above Disclaimer and U.S. Government End 47 | * Users Notice. 48 | */ 49 | 50 | // Conversion from/to 16-bit floating point (half-precision). 51 | 52 | #if !defined(_FP16_EMU_H_) 53 | #define _FP16_EMU_H_ 54 | 55 | #include 56 | #include 57 | 58 | // Necessary to ensure visibility of CUDART_VERSION macro 59 | #include 60 | 61 | // Definition of '__half_raw' was not provided before CUDA 9.0. 62 | // '__half_raw' is our type where the unsigned 16-bit integer 63 | // data member 'x' can be accessed in both CUDA 9.0 and 8.0. 
64 | #if CUDART_VERSION < 9000 65 | typedef __half __half_raw; 66 | #endif 67 | 68 | // Internally, in CUDNN we use half1 struct as the FP16 type. 69 | typedef __half half1; 70 | 71 | #define HLF_EPSILON 4.887581E-04 72 | #define HLF_MIN 6.103516E-05 73 | #define HLF_MAX 6.550400E+04 74 | 75 | half1 cpu_float2half_rn(float f); 76 | 77 | float cpu_half2float(half1 h); 78 | 79 | static __inline__ __device__ __host__ half1 habs(half1 h) 80 | { 81 | __half_raw hr = reinterpret_cast<__half_raw&>(h); 82 | hr.x &= 0x7fffU; 83 | return reinterpret_cast(hr); 84 | } 85 | 86 | static __inline__ __device__ __host__ half1 hneg(half1 h) 87 | { 88 | __half_raw hr = reinterpret_cast<__half_raw&>(h); 89 | hr.x ^= 0x8000U; 90 | return reinterpret_cast(hr); 91 | } 92 | 93 | static __inline__ __device__ __host__ int ishnan(half1 h) 94 | { 95 | // When input is NaN, exponent is all ones and mantissa is non-zero. 96 | __half_raw hr = reinterpret_cast<__half_raw&>(h); 97 | return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) != 0; 98 | } 99 | 100 | static __inline__ __device__ __host__ int ishinf(half1 h) 101 | { 102 | // When input is +/- inf, exponent is all ones and mantissa is zero. 
103 | __half_raw hr = reinterpret_cast<__half_raw&>(h); 104 | return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) == 0; 105 | } 106 | 107 | static __inline__ __device__ __host__ int ishequ(half1 x, half1 y) 108 | { 109 | __half_raw xr = reinterpret_cast<__half_raw&>(x); 110 | __half_raw yr = reinterpret_cast<__half_raw&>(y); 111 | return ishnan(x) == 0 && ishnan(y) == 0 && xr.x == yr.x; 112 | } 113 | 114 | // Returns 0.0000 in FP16 binary form 115 | static __inline__ __device__ __host__ half1 hzero() 116 | { 117 | __half_raw hr; 118 | hr.x = 0x0000U; 119 | return reinterpret_cast(hr); 120 | } 121 | 122 | // Returns 1.0000 in FP16 binary form 123 | static __inline__ __device__ __host__ half1 hone() 124 | { 125 | __half_raw hr; 126 | hr.x = 0x3c00U; 127 | return reinterpret_cast(hr); 128 | } 129 | 130 | // Returns quiet NaN, the most significant fraction bit #9 is set 131 | static __inline__ __device__ __host__ half1 hnan() 132 | { 133 | __half_raw hr; 134 | hr.x = 0x7e00U; 135 | return reinterpret_cast(hr); 136 | } 137 | 138 | // Largest positive FP16 value, corresponds to 6.5504e+04 139 | static __inline__ __device__ __host__ half1 hmax() 140 | { 141 | // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff) 142 | __half_raw hr; 143 | hr.x = 0x7bffU; 144 | return reinterpret_cast(hr); 145 | } 146 | 147 | // Smallest positive (normalized) FP16 value, corresponds to 6.1035e-05 148 | static __inline__ __device__ __host__ half1 hmin() 149 | { 150 | // Exponent is 0x01 (5 bits), mantissa is all zeros (10 bits) 151 | __half_raw hr; 152 | hr.x = 0x0400U; 153 | return reinterpret_cast(hr); 154 | } 155 | 156 | #endif // _FP16_EMU_H_ 157 | 158 | -------------------------------------------------------------------------------- /cudnn_samples_v7/conv_sample/gpgpusim.config: -------------------------------------------------------------------------------- 1 | //21*1 fly with 32 flits per packet under gpgpusim injection mode 2 | use_map = 0; 3 | flit_size = 40; 4 | 
5 | // currently we do not use this, see subnets below 6 | network_count = 2; 7 | 8 | // Topology 9 | topology = fly; 10 | k = 52; 11 | n = 1; 12 | 13 | // Routing 14 | 15 | routing_function = dest_tag; 16 | 17 | // Flow control 18 | 19 | num_vcs = 1; 20 | vc_buf_size = 64; 21 | input_buffer_size = 256; 22 | ejection_buffer_size = 64; 23 | boundary_buffer_size = 64; 24 | 25 | wait_for_tail_credit = 0; 26 | 27 | // Router architecture 28 | 29 | vc_allocator = islip; //separable_input_first; 30 | sw_allocator = islip; //separable_input_first; 31 | alloc_iters = 1; 32 | 33 | credit_delay = 0; 34 | routing_delay = 0; 35 | vc_alloc_delay = 1; 36 | sw_alloc_delay = 1; 37 | 38 | input_speedup = 1; 39 | output_speedup = 1; 40 | internal_speedup = 2.0; 41 | 42 | // Traffic, GPGPU-Sim does not use this 43 | 44 | traffic = uniform; 45 | packet_size ={{1,2,3,4},{10,20}}; 46 | packet_size_rate={{1,1,1,1},{2,1}}; 47 | 48 | // Simulation - Don't change 49 | 50 | sim_type = gpgpusim; 51 | //sim_type = latency; 52 | injection_rate = 0.1; 53 | 54 | subnets = 2; 55 | 56 | // Always use read and write no matter following line 57 | //use_read_write = 1; 58 | 59 | 60 | read_request_subnet = 0; 61 | read_reply_subnet = 1; 62 | write_request_subnet = 0; 63 | write_reply_subnet = 1; 64 | 65 | read_request_begin_vc = 0; 66 | read_request_end_vc = 0; 67 | write_request_begin_vc = 0; 68 | write_request_end_vc = 0; 69 | read_reply_begin_vc = 0; 70 | read_reply_end_vc = 0; 71 | write_reply_begin_vc = 0; 72 | write_reply_end_vc = 0; 73 | 74 | -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/FreeImage/freeimage-license.txt: -------------------------------------------------------------------------------- 1 | FreeImage Public License - Version 1.0 2 | --------------------------------------------- 3 | 4 | 1. Definitions. 5 | 6 | 1.1. "Contributor" means each entity that creates or contributes to the creation of Modifications. 7 | 8 | 1.2. 
"Contributor Version" means the combination of the Original Code, prior Modifications used by a Contributor, and the Modifications made by that particular Contributor. 9 | 10 | 1.3. "Covered Code" means the Original Code or Modifications or the combination of the Original Code and Modifications, in each case including portions thereof. 11 | 12 | 1.4. "Electronic Distribution Mechanism" means a mechanism generally accepted in the software development community for the electronic transfer of data. 13 | 14 | 1.5. "Executable" means Covered Code in any form other than Source Code. 15 | 16 | 1.6. "Initial Developer" means the individual or entity identified as the Initial Developer in the Source Code notice required by Exhibit A. 17 | 18 | 1.7. "Larger Work" means a work which combines Covered Code or portions thereof with code not governed by the terms of this License. 19 | 20 | 1.8. "License" means this document. 21 | 22 | 1.9. "Modifications" means any addition to or deletion from the substance or structure of either the Original Code or any previous Modifications. When Covered Code is released as a series of files, a 23 | Modification is: 24 | 25 | A. Any addition to or deletion from the contents of a file containing Original Code or previous Modifications. 26 | 27 | B. Any new file that contains any part of the Original Code or previous Modifications. 28 | 29 | 1.10. "Original Code" means Source Code of computer software code which is described in the Source Code notice required by Exhibit A as Original Code, and which, at the time of its release under this License is not already Covered Code governed by this License. 30 | 31 | 1.11. 
"Source Code" means the preferred form of the Covered Code for making modifications to it, including all modules it contains, plus any associated interface definition files, scripts used to control 32 | compilation and installation of an Executable, or a list of source code differential comparisons against either the Original Code or another well known, available Covered Code of the Contributor's choice. The Source Code can be in a compressed or archival form, provided the appropriate decompression or de-archiving software is widely available for no charge. 33 | 34 | 1.12. "You" means an individual or a legal entity exercising rights under, and complying with all of the terms of, this License or a future version of this License issued under Section 6.1. For legal entities, "You" includes any entity which controls, is controlled by, or is under common control with You. For purposes of this definition, "control" means (a) the power, direct or indirect, to cause the 35 | direction or management of such entity, whether by contract or otherwise, or (b) ownership of fifty percent (50%) or more of the outstanding shares or beneficial ownership of such entity. 36 | 37 | 2. Source Code License. 38 | 39 | 2.1. The Initial Developer Grant. 
40 | The Initial Developer hereby grants You a world-wide, royalty-free, non-exclusive license, subject to third party intellectual property claims: 41 | 42 | (a) to use, reproduce, modify, display, perform, sublicense and distribute the Original Code (or portions thereof) with or without Modifications, or as part of a Larger Work; and 43 | 44 | (b) under patents now or hereafter owned or controlled by Initial Developer, to make, have made, use and sell ("Utilize") the Original Code (or portions thereof), but solely to the extent that 45 | any such patent is reasonably necessary to enable You to Utilize the Original Code (or portions thereof) and not to any greater extent that may be necessary to Utilize further Modifications or 46 | combinations. 47 | 48 | 2.2. Contributor Grant. 49 | Each Contributor hereby grants You a world-wide, royalty-free, non-exclusive license, subject to third party intellectual property claims: 50 | 51 | (a) to use, reproduce, modify, display, perform, sublicense and distribute the Modifications created by such Contributor (or portions thereof) either on an unmodified basis, with other Modifications, as Covered Code or as part of a Larger Work; and 52 | 53 | (b) under patents now or hereafter owned or controlled by Contributor, to Utilize the Contributor Version (or portions thereof), but solely to the extent that any such patent is reasonably necessary to enable You to Utilize the Contributor Version (or portions thereof), and not to any greater extent that 54 | may be necessary to Utilize further Modifications or combinations. 55 | 56 | 3. Distribution Obligations. 57 | 58 | 3.1. Application of License. 59 | The Modifications which You create or to which You contribute are governed by the terms of this License, including without limitation Section 2.2. 
The Source Code version of Covered Code may be distributed only under the terms of this License or a future version of this License released under Section 6.1, and You must include a copy of this License with every copy of the Source Code You distribute. You may not offer or impose any terms on any Source Code version that alters or 60 | restricts the applicable version of this License or the recipients' rights hereunder. However, You may include an additional document offering the additional rights described in Section 3.5. 61 | 62 | 3.2. Availability of Source Code. 63 | Any Modification which You create or to which You contribute must be made available in Source Code form under the terms of this License either on the same media as an Executable version or via an accepted Electronic Distribution Mechanism to anyone to whom you made an Executable version available; and if made available via Electronic Distribution Mechanism, must remain available for at least twelve (12) months after the date it initially became available, or at least six (6) months after a subsequent version of that particular Modification has been made available to such recipients. You are responsible for ensuring that the Source Code version remains available even if the Electronic Distribution Mechanism is maintained by a third party. 64 | 65 | 3.3. Description of Modifications. 66 | You must cause all Covered Code to which you contribute to contain a file documenting the changes You made to create that Covered Code and the date of any change. You must include a prominent statement that the Modification is derived, directly or indirectly, from Original Code provided by the Initial Developer and including the name of the Initial Developer in (a) the Source Code, and (b) in any notice in an Executable version or related documentation in which You describe the origin or ownership of the Covered Code. 67 | 68 | 3.4. Intellectual Property Matters 69 | 70 | (a) Third Party Claims. 
71 | If You have knowledge that a party claims an intellectual property right in particular functionality or code (or its utilization under this License), you must include a text file with the source code distribution titled "LEGAL" which describes the claim and the party making the claim in sufficient detail that a recipient will know whom to contact. If you obtain such knowledge after You make Your Modification available as described in Section 3.2, You shall promptly modify the LEGAL file in all copies You make 72 | available thereafter and shall take other steps (such as notifying appropriate mailing lists or newsgroups) reasonably calculated to inform those who received the Covered Code that new knowledge has been obtained. 73 | 74 | (b) Contributor APIs. 75 | If Your Modification is an application programming interface and You own or control patents which are reasonably necessary to implement that API, you must also include this information in the LEGAL file. 76 | 77 | 3.5. Required Notices. 78 | You must duplicate the notice in Exhibit A in each file of the Source Code, and this License in any documentation for the Source Code, where You describe recipients' rights relating to Covered Code. If You created one or more Modification(s), You may add your name as a Contributor to the notice described in Exhibit A. If it is not possible to put such notice in a particular Source Code file due to its 79 | structure, then you must include such notice in a location (such as a relevant directory file) where a user would be likely to look for such a notice. You may choose to offer, and to charge a fee for, warranty, support, indemnity or liability obligations to one or more recipients of Covered Code. However, You may do so only on Your own behalf, and not on behalf of the Initial Developer or any Contributor. 
You must make it absolutely clear than any such warranty, support, indemnity or 80 | liability obligation is offered by You alone, and You hereby agree to indemnify the Initial Developer and every Contributor for any liability incurred by the Initial Developer or such Contributor as a result of 81 | warranty, support, indemnity or liability terms You offer. 82 | 83 | 3.6. Distribution of Executable Versions. 84 | You may distribute Covered Code in Executable form only if the requirements of Section 3.1-3.5 have been met for that Covered Code, and if You include a notice stating that the Source Code version of the Covered Code is available under the terms of this License, including a description of how and where You have fulfilled the obligations of Section 3.2. The notice must be conspicuously included in any notice in an Executable version, related documentation or collateral in which You 85 | describe recipients' rights relating to the Covered Code. You may distribute the Executable version of Covered Code under a license of Your choice, which may contain terms different from this License, 86 | provided that You are in compliance with the terms of this License and that the license for the Executable version does not attempt to limit or alter the recipient's rights in the Source Code version from the rights set forth in this License. If You distribute the Executable version under a different license You must make it absolutely clear that any terms which differ from this License are offered by You alone, not by the Initial Developer or any Contributor. You hereby agree to indemnify the Initial Developer and every Contributor for any liability incurred by the Initial Developer or such Contributor as a result of any such terms You offer. 87 | 88 | 3.7. Larger Works. 89 | You may create a Larger Work by combining Covered Code with other code not governed by the terms of this License and distribute the Larger Work as a single product. 
In such a case, You must make sure the requirements of this License are fulfilled for the Covered Code. 90 | 91 | 4. Inability to Comply Due to Statute or Regulation. 92 | 93 | If it is impossible for You to comply with any of the terms of this License with respect to some or all of the Covered Code due to statute or regulation then You must: (a) comply with the terms of this License to the maximum extent possible; and (b) describe the limitations and the code they affect. Such description must be included in the LEGAL file described in Section 3.4 and must be included with all distributions of the Source Code. Except to the extent prohibited by statute or regulation, such description must be sufficiently detailed for a recipient of ordinary skill to be able to understand it. 94 | 95 | 5. Application of this License. 96 | 97 | This License applies to code to which the Initial Developer has attached the notice in Exhibit A, and to related Covered Code. 98 | 99 | 6. Versions of the License. 100 | 101 | 6.1. New Versions. 102 | Floris van den Berg may publish revised and/or new versions of the License from time to time. Each version will be given a distinguishing version number. 103 | 104 | 6.2. Effect of New Versions. 105 | Once Covered Code has been published under a particular version of the License, You may always continue to use it under the terms of that version. You may also choose to use such Covered Code under the terms of any subsequent version of the License published by Floris van den Berg 106 | No one other than Floris van den Berg has the right to modify the terms applicable to Covered Code created under this License. 107 | 108 | 6.3. Derivative Works. 
109 | If you create or use a modified version of this License (which you may only do in order to apply it to code which is not already Covered Code governed by this License), you must (a) rename Your license so that the phrases "FreeImage", `FreeImage Public License", "FIPL", or any confusingly similar phrase do not appear anywhere in your license and (b) otherwise make it clear that your version of the license contains terms which differ from the FreeImage Public License. (Filling in the name of the Initial Developer, Original Code or Contributor in the notice described in Exhibit A shall not of themselves be deemed to be modifications of this License.) 110 | 111 | 7. DISCLAIMER OF WARRANTY. 112 | 113 | COVERED CODE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED CODE IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE COVERED CODE IS WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF ANY COVERED CODE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER. 114 | 115 | 8. TERMINATION. 116 | 117 | This License and the rights granted hereunder will terminate automatically if You fail to comply with terms herein and fail to cure such breach within 30 days of becoming aware of the breach. All sublicenses to the Covered Code which are properly granted shall survive any termination of this License. Provisions which, by their nature, must remain in effect beyond the termination of this License shall survive. 118 | 119 | 9. LIMITATION OF LIABILITY. 
120 | 121 | UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL THE INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF COVERED CODE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE TO YOU OR ANY OTHER PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE 122 | EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THAT EXCLUSION AND LIMITATION MAY NOT APPLY TO YOU. 123 | 124 | 10. U.S. GOVERNMENT END USERS. 125 | 126 | The Covered Code is a "commercial item," as that term is defined in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer software" and "commercial computer software documentation," as such terms are used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all U.S. Government End Users acquire Covered Code with only those rights set forth herein. 127 | 128 | 11. MISCELLANEOUS. 129 | 130 | This License represents the complete agreement concerning subject matter hereof. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. This License shall be governed by Dutch law provisions (except to the extent applicable law, if any, provides otherwise), excluding its conflict-of-law provisions. 
With respect to disputes in which at least one party is a citizen of, or an entity chartered or registered to do business in, the The Netherlands: (a) unless otherwise agreed in writing, all disputes relating to this License (excepting any dispute relating to intellectual property rights) shall be subject to final and binding arbitration, with the losing party paying all costs of arbitration; (b) any arbitration relating to this Agreement shall be held in Almelo, The Netherlands; and (c) any litigation relating to this Agreement shall be subject to the jurisdiction of the court of Almelo, The Netherlands with the losing party responsible for costs, including without limitation, court costs and reasonable attorneys fees and expenses. Any law or regulation which provides that the language of a contract shall be construed against the drafter shall not apply to this License. 131 | 132 | 12. RESPONSIBILITY FOR CLAIMS. 133 | 134 | Except in cases where another Contributor has failed to comply with Section 3.4, You are responsible for damages arising, directly or indirectly, out of Your utilization of rights under this License, based 135 | on the number of copies of Covered Code you made available, the revenues you received from utilizing such rights, and other relevant factors. You agree to work with affected parties to distribute 136 | responsibility on an equitable basis. 137 | 138 | EXHIBIT A. 139 | 140 | "The contents of this file are subject to the FreeImage Public License Version 1.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://home.wxs.nl/~flvdberg/freeimage-license.txt 141 | 142 | Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific language governing rights and limitations under the License. 
-------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/FreeImage/include/FreeImage.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/FreeImage/include/FreeImage.h -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/Makefile: -------------------------------------------------------------------------------- 1 | # Location of the CUDA Toolkit 2 | CUDA_PATH ?= /usr/local/cuda 3 | 4 | # architecture 5 | HOST_ARCH := $(shell uname -m) 6 | TARGET_ARCH ?= $(HOST_ARCH) 7 | 8 | # Adjust this for ARMv7 with a 32-bit filesystem 9 | ifeq ($(TARGET_ARCH), aarch64) 10 | ifeq ($(shell file /sbin/init | grep 32-bit), 1) 11 | TARGET_ARCH=armv7l 12 | endif 13 | endif 14 | 15 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) 16 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 17 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) 18 | TARGET_SIZE := 64 19 | else ifneq (,$(filter $(TARGET_ARCH),armv7l)) 20 | TARGET_SIZE := 32 21 | endif 22 | else 23 | TARGET_SIZE := $(shell getconf LONG_BIT) 24 | endif 25 | else 26 | $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 27 | endif 28 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 29 | ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) 30 | $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 31 | endif 32 | endif 33 | 34 | # operating system 35 | HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") 36 | TARGET_OS ?= $(HOST_OS) 37 | 38 | ifeq ($(TARGET_OS),QNX) 39 | override TARGET_OS := qnx 40 | endif 41 | 42 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) 43 | $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) 
44 | endif 45 | 46 | # host compiler 47 | ifeq ($(TARGET_OS),darwin) 48 | ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) 49 | HOST_COMPILER ?= clang++ 50 | endif 51 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 52 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) 53 | ifeq ($(TARGET_OS),linux) 54 | HOST_COMPILER ?= arm-linux-gnueabihf-g++ 55 | else ifeq ($(TARGET_OS),qnx) 56 | ifeq ($(QNX_HOST),) 57 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 58 | endif 59 | ifeq ($(QNX_TARGET),) 60 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 61 | endif 62 | export QNX_HOST 63 | export QNX_TARGET 64 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ 65 | else ifeq ($(TARGET_OS),android) 66 | HOST_COMPILER ?= arm-linux-androideabi-g++ 67 | endif 68 | else ifeq ($(TARGET_ARCH),aarch64) 69 | ifeq ($(TARGET_OS), linux) 70 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 71 | else ifeq ($(TARGET_OS), android) 72 | HOST_COMPILER ?= aarch64-linux-android-g++ 73 | endif 74 | else ifeq ($(TARGET_ARCH),ppc64le) 75 | HOST_COMPILER ?= powerpc64le-linux-gnu-g++ 76 | endif 77 | endif 78 | HOST_COMPILER ?= g++ 79 | NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) 80 | 81 | # internal flags 82 | NVCCFLAGS := -m${TARGET_SIZE} 83 | CCFLAGS := 84 | LDFLAGS := 85 | 86 | # build flags 87 | ifeq ($(TARGET_OS),darwin) 88 | LDFLAGS += -rpath $(CUDA_PATH)/lib 89 | CCFLAGS += -arch $(HOST_ARCH) 90 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) 91 | LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 92 | CCFLAGS += -mfloat-abi=hard 93 | else ifeq ($(TARGET_OS),android) 94 | LDFLAGS += -pie 95 | CCFLAGS += -fpie -fpic -fexceptions 96 | endif 97 | 98 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 99 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) 100 | ifneq ($(TARGET_FS),) 101 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 102 | ifeq 
($(GCCVERSIONLTEQ46),1) 103 | CCFLAGS += --sysroot=$(TARGET_FS) 104 | endif 105 | LDFLAGS += --sysroot=$(TARGET_FS) 106 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib 107 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib 108 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf 109 | endif 110 | endif 111 | endif 112 | 113 | # Debug build flags 114 | ifeq ($(dbg),1) 115 | NVCCFLAGS += -g -G 116 | BUILD_TYPE := debug 117 | else 118 | BUILD_TYPE := release 119 | endif 120 | 121 | ALL_CCFLAGS := 122 | ALL_CCFLAGS += $(NVCCFLAGS) 123 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) 124 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) 125 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) 126 | 127 | SAMPLE_ENABLED := 1 128 | 129 | ALL_LDFLAGS := 130 | ALL_LDFLAGS += $(ALL_CCFLAGS) 131 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) 132 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) 133 | 134 | # Common includes and paths for CUDA 135 | ifneq ($(TARGET_ARCH), ppc64le) 136 | INCLUDES := -I$(CUDA_PATH)/include 137 | else 138 | INCLUDES := -I$(CUDA_PATH)/targets/ppc64le-linux/include 139 | endif 140 | LIBRARIES := 141 | 142 | ################################################################################ 143 | 144 | # Gencode arguments 145 | SMS ?= 30 35 50 53 146 | 147 | ifeq ($(SMS),) 148 | $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) 149 | SAMPLE_ENABLED := 0 150 | endif 151 | 152 | ifeq ($(GENCODE_FLAGS),) 153 | # Generate SASS code for each SM architecture listed in $(SMS) 154 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 155 | 156 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility 157 | HIGHEST_SM := $(lastword $(sort $(SMS))) 158 | ifneq ($(HIGHEST_SM),) 159 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 160 | endif 161 | endif 162 | 163 | INCLUDES += -IFreeImage/include 164 | LIBRARIES 
+= -LFreeImage/lib/$(TARGET_OS)/$(TARGET_ARCH) -LFreeImage/lib/$(TARGET_OS) -lcudart -lcublas -lcudnn -lfreeimage -lstdc++ -lm 165 | 166 | # Attempt to compile a minimal application linked against FreeImage. If a.out exists, FreeImage is properly set up. 167 | $(shell echo "#include \"FreeImage.h\"" > test.c; echo "int main() { return 0; }" >> test.c ; $(NVCC) $(ALL_CCFLAGS) $(INCLUDES) $(LIBRARIES) -l freeimage test.c) 168 | FREEIMAGE := $(shell find a.out 2>/dev/null) 169 | $(shell rm a.out test.c 2>/dev/null) 170 | 171 | ifeq ("$(FREEIMAGE)","") 172 | $(info >>> WARNING - FreeImage is not set up correctly. Please ensure FreeImage is set up correctly. <<<) 173 | SAMPLE_ENABLED := 0 174 | endif 175 | 176 | ifeq ($(SAMPLE_ENABLED),0) 177 | EXEC ?= @echo "[@]" 178 | endif 179 | 180 | ################################################################################ 181 | 182 | # Target rules 183 | all: build 184 | 185 | build: mnistCUDNN 186 | 187 | check.deps: 188 | ifeq ($(SAMPLE_ENABLED),0) 189 | @echo "Sample will be waived due to the above missing dependencies" 190 | else 191 | @echo "Sample is ready - all dependencies have been met" 192 | endif 193 | 194 | OBJ = fp16_dev.o fp16_emu.o mnistCUDNN.o 195 | 196 | mnistCUDNN: $(OBJ) 197 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 198 | 199 | %.o: %.cpp 200 | $(EXEC) $(HOST_COMPILER) $(INCLUDES) $(CCFLAGS) $(EXTRA_CCFLAGS) -o $@ -c $< 201 | 202 | %.o: %.cu 203 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 204 | 205 | run: build 206 | $(EXEC) ./mnistCUDNN 207 | # Remove only object files: a bare "*o" glob would also match any file or directory whose name merely ends in "o". 208 | clean: 209 | rm -rf *.o 210 | rm -rf mnistCUDNN 211 | 212 | clobber: clean 213 | -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/data/conv1.bias.bin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/conv1.bias.bin -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/data/conv1.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/conv1.bin -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/data/conv2.bias.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/conv2.bias.bin -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/data/conv2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/conv2.bin -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/data/five_28x28.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/five_28x28.pgm -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/data/ip1.bias.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/ip1.bias.bin 
-------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/data/ip1.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/ip1.bin -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/data/ip2.bias.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/ip2.bias.bin -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/data/ip2.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/ip2.bin -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/data/one_28x28.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/one_28x28.pgm -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/data/three_28x28.pgm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/data/three_28x28.pgm -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/error_util.h: -------------------------------------------------------------------------------- 1 
| /** 2 | * Copyright 2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | #if !defined(_ERROR_UTIL_H_) 13 | #define _ERROR_UTIL_H_ 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #define TOSTR_(s) #s 21 | #define TOSTR(s) TOSTR_(s) 22 | #if defined(__GNUC__) 23 | #define COMPILER_NAME "GCC" 24 | #define COMPILER_VER TOSTR(__GNUC__) "." TOSTR(__GNUC_MINOR__) "." TOSTR(__GNUC_PATCHLEVEL__) 25 | #elif defined(_MSC_VER) 26 | #if _MSC_VER < 1500 27 | #define COMPILER_NAME "MSVC_2005" 28 | #elif _MSC_VER < 1600 29 | #define COMPILER_NAME "MSVC_2008" 30 | #elif _MSC_VER < 1700 31 | #define COMPILER_NAME "MSVC_2010" 32 | #elif _MSC_VER < 1800 33 | #define COMPILER_NAME "MSVC_2012" 34 | #elif _MSC_VER < 1900 35 | #define COMPILER_NAME "MSVC_2013" 36 | #elif _MSC_VER < 2000 37 | #define COMPILER_NAME "MSVC_2014" 38 | #else 39 | #define COMPILER_NAME "MSVC" 40 | #endif 41 | #define COMPILER_VER TOSTR(_MSC_FULL_VER) "." TOSTR(_MSC_BUILD) 42 | #elif defined(__clang_major__) 43 | #define COMPILER_NAME "CLANG" 44 | #define COMPILER_VER TOSTR(__clang_major__ ) "." TOSTR(__clang_minor__) "." TOSTR(__clang_patchlevel__) 45 | #elif defined(__INTEL_COMPILER) 46 | #define COMPILER_NAME "ICC" 47 | #define COMPILER_VER TOSTR(__INTEL_COMPILER) "." TOSTR(__INTEL_COMPILER_BUILD_DATE) 48 | #else 49 | #define COMPILER_NAME "unknown" 50 | #define COMPILER_VER "???" 51 | #endif 52 | 53 | #define CUDNN_VERSION_STR TOSTR(CUDNN_MAJOR) "." TOSTR (CUDNN_MINOR) "." 
TOSTR(CUDNN_PATCHLEVEL) 54 | 55 | #define FatalError(s) { \ 56 | std::stringstream _where, _message; \ 57 | _where << __FILE__ << ':' << __LINE__; \ 58 | _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__;\ 59 | std::cerr << _message.str() << "\nAborting...\n"; \ 60 | cudaDeviceReset(); \ 61 | exit(EXIT_FAILURE); \ 62 | } 63 | 64 | #define checkCUDNN(status) { \ 65 | std::stringstream _error; \ 66 | if (status != CUDNN_STATUS_SUCCESS) { \ 67 | _error << "CUDNN failure\nError: " << cudnnGetErrorString(status); \ 68 | FatalError(_error.str()); \ 69 | } \ 70 | } 71 | 72 | #define checkCudaErrors(status) { \ 73 | std::stringstream _error; \ 74 | if (status != 0) { \ 75 | _error << "Cuda failure\nError: " << cudaGetErrorString(status); \ 76 | FatalError(_error.str()); \ 77 | } \ 78 | } 79 | 80 | #define checkCublasErrors(status) { \ 81 | std::stringstream _error; \ 82 | if (status != 0) { \ 83 | _error << "Cublas failure\nError code " << status; \ 84 | FatalError(_error.str()); \ 85 | } \ 86 | } 87 | 88 | // CUDA Utility Helper Functions 89 | 90 | static void showDevices( void ) 91 | { 92 | int totalDevices; 93 | checkCudaErrors(cudaGetDeviceCount( &totalDevices )); 94 | printf("\nThere are %d CUDA capable devices on your machine :\n", totalDevices); 95 | for (int i=0; i< totalDevices; i++) { 96 | struct cudaDeviceProp prop; 97 | checkCudaErrors(cudaGetDeviceProperties( &prop, i )); 98 | printf( "device %d : sms %2d Capabilities %d.%d, SmClock %.1f Mhz, MemSize (Mb) %d, MemClock %.1f Mhz, Ecc=%d, boardGroupID=%d\n", 99 | i, prop.multiProcessorCount, prop.major, prop.minor, 100 | (float)prop.clockRate*1e-3, 101 | (int)(prop.totalGlobalMem/(1024*1024)), 102 | (float)prop.memoryClockRate*1e-3, 103 | prop.ECCEnabled, 104 | prop.multiGpuBoardGroupID); 105 | } 106 | } 107 | 108 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 109 | #ifndef _CRT_SECURE_NO_DEPRECATE 110 | #define _CRT_SECURE_NO_DEPRECATE 111 | #endif 112 | #ifndef 
STRNCASECMP 113 | #define STRNCASECMP _strnicmp 114 | #endif 115 | #else // Linux Includes 116 | #include 117 | #include 118 | #ifndef STRNCASECMP 119 | #define STRNCASECMP strncasecmp 120 | #endif 121 | #endif 122 | inline int stringRemoveDelimiter(char delimiter, const char *string) 123 | { 124 | int string_start = 0; 125 | 126 | while (string[string_start] == delimiter) 127 | { 128 | string_start++; 129 | } 130 | 131 | if (string_start >= (int)strlen(string)-1) 132 | { 133 | return 0; 134 | } 135 | 136 | return string_start; 137 | } 138 | 139 | inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) 140 | { 141 | bool bFound = false; 142 | 143 | if (argc >= 1) 144 | { 145 | for (int i=1; i < argc; i++) 146 | { 147 | int string_start = stringRemoveDelimiter('-', argv[i]); 148 | const char *string_argv = &argv[i][string_start]; 149 | 150 | const char *equal_pos = strchr(string_argv, '='); 151 | int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); 152 | 153 | int length = (int)strlen(string_ref); 154 | 155 | if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) 156 | { 157 | bFound = true; 158 | continue; 159 | } 160 | } 161 | } 162 | 163 | return bFound; 164 | } 165 | 166 | inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) 167 | { 168 | bool bFound = false; 169 | int value = -1; 170 | 171 | if (argc >= 1) 172 | { 173 | for (int i=1; i < argc; i++) 174 | { 175 | int string_start = stringRemoveDelimiter('-', argv[i]); 176 | const char *string_argv = &argv[i][string_start]; 177 | int length = (int)strlen(string_ref); 178 | 179 | if (!STRNCASECMP(string_argv, string_ref, length)) 180 | { 181 | if (length+1 <= (int)strlen(string_argv)) 182 | { 183 | int auto_inc = (string_argv[length] == '=') ? 
1 : 0; 184 | value = atoi(&string_argv[length + auto_inc]); 185 | } 186 | else 187 | { 188 | value = 0; 189 | } 190 | 191 | bFound = true; 192 | continue; 193 | } 194 | } 195 | } 196 | 197 | if (bFound) 198 | { 199 | return value; 200 | } 201 | else 202 | { 203 | printf("Not found int\n"); 204 | return 0; 205 | } 206 | } 207 | 208 | inline bool getCmdLineArgumentString(const int argc, const char **argv, 209 | const char *string_ref, char **string_retval) 210 | { 211 | bool bFound = false; 212 | 213 | if (argc >= 1) 214 | { 215 | for (int i=1; i < argc; i++) 216 | { 217 | int string_start = stringRemoveDelimiter('-', argv[i]); 218 | char *string_argv = (char *)&argv[i][string_start]; 219 | int length = (int)strlen(string_ref); 220 | 221 | if (!STRNCASECMP(string_argv, string_ref, length)) 222 | { 223 | *string_retval = &string_argv[length+1]; 224 | bFound = true; 225 | continue; 226 | } 227 | } 228 | } 229 | 230 | if (!bFound) 231 | { 232 | *string_retval = NULL; 233 | } 234 | 235 | return bFound; 236 | } 237 | 238 | #endif // _ERROR_UTIL_H_ -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/fp16_dev.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 
9 | * 10 | */ 11 | 12 | #include "error_util.h" 13 | #include "fp16_dev.h" 14 | 15 | #define BLOCK_SIZE 128 16 | template 17 | __global__ void float2half_rn_kernel(int size, const value_type *buffIn, half1 *buffOut) 18 | { 19 | const int idx = BLOCK_SIZE*blockIdx.x+threadIdx.x; 20 | if (idx >= size) { 21 | return; 22 | } 23 | #if CUDART_VERSION < 9000 24 | half1 val; 25 | val.x = __float2half_rn(float(buffIn[idx])); 26 | #else 27 | half1 val = __float2half_rn(float(buffIn[idx])); 28 | #endif 29 | buffOut[idx] = val; 30 | } 31 | 32 | template 33 | void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut) 34 | { 35 | int grid_size = (size + BLOCK_SIZE - 1) / BLOCK_SIZE; 36 | float2half_rn_kernel<<>> (size, buffIn, buffOut); 37 | checkCudaErrors(cudaDeviceSynchronize()); 38 | } 39 | 40 | template void gpu_float2half_rn (int, const float*, half1*); 41 | template void gpu_float2half_rn (int, const double*, half1*); 42 | 43 | -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/fp16_dev.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 
9 | * 10 | */ 11 | 12 | #if !defined(_FP16_DEV_H_) 13 | #define _FP16_DEV_H_ 14 | 15 | #include "fp16_emu.h" 16 | 17 | template 18 | void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut); 19 | 20 | #endif // _FP16_DEV_H_ 21 | 22 | -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/fp16_emu.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO LICENSEE: 5 | * 6 | * This source code and/or documentation ("Licensed Deliverables") are 7 | * subject to NVIDIA intellectual property rights under U.S. and 8 | * international Copyright laws. 9 | * 10 | * These Licensed Deliverables contained herein is PROPRIETARY and 11 | * CONFIDENTIAL to NVIDIA and is being provided under the terms and 12 | * conditions of a form of NVIDIA software license agreement by and 13 | * between NVIDIA and Licensee ("License Agreement") or electronically 14 | * accepted by Licensee. Notwithstanding any terms or conditions to 15 | * the contrary in the License Agreement, reproduction or disclosure 16 | * of the Licensed Deliverables to any third party without the express 17 | * written consent of NVIDIA is prohibited. 18 | * 19 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 20 | * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE 21 | * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS 22 | * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 23 | * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED 24 | * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 25 | * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
26 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 27 | * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY 28 | * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY 29 | * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 30 | * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 31 | * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 32 | * OF THESE LICENSED DELIVERABLES. 33 | * 34 | * U.S. Government End Users. These Licensed Deliverables are a 35 | * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 36 | * 1995), consisting of "commercial computer software" and "commercial 37 | * computer software documentation" as such terms are used in 48 38 | * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government 39 | * only as a commercial end item. Consistent with 48 C.F.R.12.212 and 40 | * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all 41 | * U.S. Government End Users acquire the Licensed Deliverables with 42 | * only those rights set forth herein. 43 | * 44 | * Any use of the Licensed Deliverables in individual and commercial 45 | * software must include, in the user documentation and internal 46 | * comments to the code, the above Disclaimer and U.S. Government End 47 | * Users Notice. 48 | */ 49 | 50 | #include "fp16_emu.h" 51 | 52 | #define STATIC_ASSERT(cond) do { typedef char compile_time_assert[(cond) ? 1 : -1]; } while (0) 53 | 54 | // Host functions for converting between FP32 and FP16 formats 55 | // Paulius Micikevicius (pauliusm@nvidia.com) 56 | 57 | half1 cpu_float2half_rn(float f) 58 | { 59 | unsigned x = *((int*)(void*)(&f)); 60 | unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; 61 | unsigned sign, exponent, mantissa; 62 | 63 | __half_raw hr; 64 | 65 | // Get rid of +NaN/-NaN case first. 
66 | if (u > 0x7f800000) { 67 | hr.x = 0x7fffU; 68 | return reinterpret_cast(hr); 69 | } 70 | 71 | sign = ((x >> 16) & 0x8000); 72 | 73 | // Get rid of +Inf/-Inf, +0/-0. 74 | if (u > 0x477fefff) { 75 | hr.x = sign | 0x7c00U; 76 | return reinterpret_cast(hr); 77 | } 78 | if (u < 0x33000001) { 79 | hr.x = sign | 0x0000U; 80 | return reinterpret_cast(hr); 81 | } 82 | 83 | exponent = ((u >> 23) & 0xff); 84 | mantissa = (u & 0x7fffff); 85 | 86 | if (exponent > 0x70) { 87 | shift = 13; 88 | exponent -= 0x70; 89 | } else { 90 | shift = 0x7e - exponent; 91 | exponent = 0; 92 | mantissa |= 0x800000; 93 | } 94 | lsb = (1 << shift); 95 | lsb_s1 = (lsb >> 1); 96 | lsb_m1 = (lsb - 1); 97 | 98 | // Round to nearest even. 99 | remainder = (mantissa & lsb_m1); 100 | mantissa >>= shift; 101 | if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { 102 | ++mantissa; 103 | if (!(mantissa & 0x3ff)) { 104 | ++exponent; 105 | mantissa = 0; 106 | } 107 | } 108 | 109 | hr.x = (sign | (exponent << 10) | mantissa); 110 | 111 | return reinterpret_cast(hr); 112 | } 113 | 114 | 115 | float cpu_half2float(half1 h) 116 | { 117 | STATIC_ASSERT(sizeof(int) == sizeof(float)); 118 | 119 | __half_raw hr = reinterpret_cast<__half_raw&>(h); 120 | 121 | unsigned sign = ((hr.x >> 15) & 1); 122 | unsigned exponent = ((hr.x >> 10) & 0x1f); 123 | unsigned mantissa = ((hr.x & 0x3ff) << 13); 124 | 125 | if (exponent == 0x1f) { /* NaN or Inf */ 126 | mantissa = (mantissa ? 
(sign = 0, 0x7fffff) : 0); 127 | exponent = 0xff; 128 | } else if (!exponent) { /* Denorm or Zero */ 129 | if (mantissa) { 130 | unsigned int msb; 131 | exponent = 0x71; 132 | do { 133 | msb = (mantissa & 0x400000); 134 | mantissa <<= 1; /* normalize */ 135 | --exponent; 136 | } while (!msb); 137 | mantissa &= 0x7fffff; /* 1.mantissa is implicit */ 138 | } 139 | } else { 140 | exponent += 0x70; 141 | } 142 | 143 | int temp = ((sign << 31) | (exponent << 23) | mantissa); 144 | 145 | return reinterpret_cast(temp); 146 | } 147 | 148 | -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/fp16_emu.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO LICENSEE: 5 | * 6 | * This source code and/or documentation ("Licensed Deliverables") are 7 | * subject to NVIDIA intellectual property rights under U.S. and 8 | * international Copyright laws. 9 | * 10 | * These Licensed Deliverables contained herein is PROPRIETARY and 11 | * CONFIDENTIAL to NVIDIA and is being provided under the terms and 12 | * conditions of a form of NVIDIA software license agreement by and 13 | * between NVIDIA and Licensee ("License Agreement") or electronically 14 | * accepted by Licensee. Notwithstanding any terms or conditions to 15 | * the contrary in the License Agreement, reproduction or disclosure 16 | * of the Licensed Deliverables to any third party without the express 17 | * written consent of NVIDIA is prohibited. 18 | * 19 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 20 | * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE 21 | * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS 22 | * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
23 | * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED 24 | * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 25 | * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 26 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 27 | * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY 28 | * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY 29 | * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 30 | * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS 31 | * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE 32 | * OF THESE LICENSED DELIVERABLES. 33 | * 34 | * U.S. Government End Users. These Licensed Deliverables are a 35 | * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT 36 | * 1995), consisting of "commercial computer software" and "commercial 37 | * computer software documentation" as such terms are used in 48 38 | * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government 39 | * only as a commercial end item. Consistent with 48 C.F.R.12.212 and 40 | * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all 41 | * U.S. Government End Users acquire the Licensed Deliverables with 42 | * only those rights set forth herein. 43 | * 44 | * Any use of the Licensed Deliverables in individual and commercial 45 | * software must include, in the user documentation and internal 46 | * comments to the code, the above Disclaimer and U.S. Government End 47 | * Users Notice. 48 | */ 49 | 50 | // Conversion from/to 16-bit floating point (half-precision). 51 | 52 | #if !defined(_FP16_EMU_H_) 53 | #define _FP16_EMU_H_ 54 | 55 | #include 56 | #include 57 | 58 | // Necessary to ensure visibility of CUDART_VERSION macro 59 | #include 60 | 61 | // Definition of '__half_raw' was not provided before CUDA 9.0. 62 | // '__half_raw' is our type where the unsigned 16-bit integer 63 | // data member 'x' can be accessed in both CUDA 9.0 and 8.0. 
64 | #if CUDART_VERSION < 9000 65 | typedef __half __half_raw; 66 | #endif 67 | 68 | // Internally, in CUDNN we use half1 struct as the FP16 type. 69 | typedef __half half1; 70 | 71 | #define HLF_EPSILON 4.887581E-04 72 | #define HLF_MIN 6.103516E-05 73 | #define HLF_MAX 6.550400E+04 74 | 75 | half1 cpu_float2half_rn(float f); 76 | 77 | float cpu_half2float(half1 h); 78 | 79 | static __inline__ __device__ __host__ half1 habs(half1 h) 80 | { 81 | __half_raw hr = reinterpret_cast<__half_raw&>(h); 82 | hr.x &= 0x7fffU; 83 | return reinterpret_cast(hr); 84 | } 85 | 86 | static __inline__ __device__ __host__ half1 hneg(half1 h) 87 | { 88 | __half_raw hr = reinterpret_cast<__half_raw&>(h); 89 | hr.x ^= 0x8000U; 90 | return reinterpret_cast(hr); 91 | } 92 | 93 | static __inline__ __device__ __host__ int ishnan(half1 h) 94 | { 95 | // When input is NaN, exponent is all ones and mantissa is non-zero. 96 | __half_raw hr = reinterpret_cast<__half_raw&>(h); 97 | return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) != 0; 98 | } 99 | 100 | static __inline__ __device__ __host__ int ishinf(half1 h) 101 | { 102 | // When input is +/- inf, exponent is all ones and mantissa is zero. 
103 | __half_raw hr = reinterpret_cast<__half_raw&>(h); 104 | return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) == 0; 105 | } 106 | 107 | static __inline__ __device__ __host__ int ishequ(half1 x, half1 y) 108 | { 109 | __half_raw xr = reinterpret_cast<__half_raw&>(x); 110 | __half_raw yr = reinterpret_cast<__half_raw&>(y); 111 | return ishnan(x) == 0 && ishnan(y) == 0 && xr.x == yr.x; 112 | } 113 | 114 | // Returns 0.0000 in FP16 binary form 115 | static __inline__ __device__ __host__ half1 hzero() 116 | { 117 | __half_raw hr; 118 | hr.x = 0x0000U; 119 | return reinterpret_cast(hr); 120 | } 121 | 122 | // Returns 1.0000 in FP16 binary form 123 | static __inline__ __device__ __host__ half1 hone() 124 | { 125 | __half_raw hr; 126 | hr.x = 0x3c00U; 127 | return reinterpret_cast(hr); 128 | } 129 | 130 | // Returns quiet NaN, the most significant fraction bit #9 is set 131 | static __inline__ __device__ __host__ half1 hnan() 132 | { 133 | __half_raw hr; 134 | hr.x = 0x7e00U; 135 | return reinterpret_cast(hr); 136 | } 137 | 138 | // Largest positive FP16 value, corresponds to 6.5504e+04 139 | static __inline__ __device__ __host__ half1 hmax() 140 | { 141 | // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff) 142 | __half_raw hr; 143 | hr.x = 0x7bffU; 144 | return reinterpret_cast(hr); 145 | } 146 | 147 | // Smallest positive (normalized) FP16 value, corresponds to 6.1035e-05 148 | static __inline__ __device__ __host__ half1 hmin() 149 | { 150 | // Exponent is 0x01 (5 bits), mantissa is all zeros (10 bits) 151 | __half_raw hr; 152 | hr.x = 0x0400U; 153 | return reinterpret_cast(hr); 154 | } 155 | 156 | #endif // _FP16_EMU_H_ 157 | 158 | -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/gemv.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | #if !defined(_GEMV_H_) 13 | #define _GEMV_H_ 14 | 15 | #include // CUDA_VERSION 16 | #include 17 | #include "error_util.h" 18 | 19 | //#define DISABLE_GEMV 20 | // gemv: y = alpha * A^T * x + beta * y, where A is passed to cuBLAS as an m x n matrix with leading dimension m, x has m elements and y has n elements. With DISABLE_GEMV the same product is issued as an (n x 1) GEMM, whose result needs ldc >= n. Defined inline because this header carries the function bodies. 21 | inline void gemv(cublasHandle_t cublasHandle, int m, int n, double alpha, 22 | const double *A, const double *x, 23 | double beta, double *y) 24 | { 25 | #ifdef DISABLE_GEMV 26 | checkCublasErrors( cublasDgemm (cublasHandle, 27 | CUBLAS_OP_T, 28 | CUBLAS_OP_N, 29 | n, 30 | 1, 31 | m, 32 | &alpha, 33 | A, 34 | m, 35 | x, 36 | m, 37 | &beta, 38 | y, 39 | n) ); 40 | #else 41 | checkCublasErrors( cublasDgemv(cublasHandle, CUBLAS_OP_T, 42 | m, n, 43 | &alpha, 44 | A, m, 45 | x, 1, 46 | &beta, 47 | y, 1) ); 48 | #endif 49 | } 50 | // Single-precision overload of gemv. 51 | inline void gemv(cublasHandle_t cublasHandle, int m, int n, float alpha, 52 | const float *A, const float *x, 53 | float beta, float *y) 54 | { 55 | #ifdef DISABLE_GEMV 56 | checkCublasErrors( cublasSgemm (cublasHandle, 57 | CUBLAS_OP_T, 58 | CUBLAS_OP_N, 59 | n, 60 | 1, 61 | m, 62 | &alpha, 63 | A, 64 | m, 65 | x, 66 | m, 67 | &beta, 68 | y, 69 | n) ); 70 | #else 71 | checkCublasErrors( cublasSgemv(cublasHandle, CUBLAS_OP_T, 72 | m, n, 73 | &alpha, 74 | A, m, 75 | x, 1, 76 | &beta, 77 | y, 1) ); 78 | #endif 79 | } 80 | 81 | #if defined(CUDA_VERSION) && (CUDA_VERSION > 7000) 82 | // half1 overload of gemv: always issued through cublasSgemmEx as an (n x 1) GEMM with float alpha/beta, so the result again needs ldc >= n. 83 | #if (CUDA_VERSION < 8000) 84 | #define CUDA_R_16F CUBLAS_DATA_HALF 85 | #endif 86 | inline void gemv(cublasHandle_t cublasHandle, int m, int n, float alpha, 87 | const half1 *A, const half1 *x, 88 | float beta, half1 *y) 89 | { 90 | checkCublasErrors( cublasSgemmEx ( cublasHandle, 91 | CUBLAS_OP_T, 92 | CUBLAS_OP_N, 93 | n, 94 | 1, 95 | m, 96 | &alpha, 97
| A, 98 | CUDA_R_16F, 99 | m, 100 | x, 101 | CUDA_R_16F, 102 | m, 103 | &beta, 104 | y, 105 | CUDA_R_16F, 106 | n) ); 107 | } 108 | #endif 109 | 110 | #endif // _GEMV_H_ 111 | -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/mnistCUDNN: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hardware-Alchemy/cuDNN-sample/723abc278779ead78a0d80439b0d83da29cd3bae/cudnn_samples_v7/mnistCUDNN/mnistCUDNN -------------------------------------------------------------------------------- /cudnn_samples_v7/mnistCUDNN/readme.txt: -------------------------------------------------------------------------------- 1 | This sample demonstrates how to use cuDNN library to implement forward pass 2 | given a trained network. 3 | 4 | The sample is based on "Training LeNet on MNIST with Caffe" tutorial, located 5 | at http://caffe.berkeleyvision.org/. The network is identical with the exception 6 | of addition of LRN layer. All the network weights are obtained and exported 7 | using Caffe. 8 | 9 | Network layer topology: 10 | 11 | 1. Convolution 12 | 2. Pooling 13 | 3. Convolution 14 | 4. Pooling 15 | 5. Fully connected 16 | 6. Relu 17 | 7. LRN 18 | 8. Fully Connected 19 | 9.
SoftMax 20 | 21 | By default, the sample will classify three images, located in "data" directory 22 | using precomputed network weights: 23 | 1) Two convolution layers and their bias: conv1.bias.bin conv1.bin conv2.bias.bin conv2.bin 24 | 2) Two fully connected layers and their bias: ip1.bias.bin ip1.bin ip2.bias.bin ip2.bin 25 | 26 | Supported platforms: identical to cuDNN 27 | 28 | How to run: 29 | 30 | mnistCUDNN {} 31 | help : display this help 32 | device= : set the device to run the sample 33 | image= : classify specific image 34 | 35 | New in version 3 release 36 | fp16 (three ways of conversion: on host, on device using cuDNN, on device using CUDA) 37 | Local Response Normalization (LRN) 38 | Find fastest config (cudnnFindConvolutionForwardAlgorithm) 39 | FFT convolution 40 | Demonstrate Nd API (first available in cuDNN v2) 41 | -------------------------------------------------------------------------------- /home-made/common.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | #define checkCUDNN(expression)\ 7 | { \ 8 | cudnnStatus_t status = (expression); \ 9 | if (status != CUDNN_STATUS_SUCCESS) { \ 10 | std::cerr << "Error on line " << __LINE__ << ": " \ 11 | << cudnnGetErrorString(status) << std::endl; \ 12 | std::exit(EXIT_FAILURE); \ 13 | }\ 14 | }\ 15 | -------------------------------------------------------------------------------- /home-made/config_fermi_islip.icnt: -------------------------------------------------------------------------------- 1 | //21*1 fly with 32 flits per packet under gpgpusim injection mode 2 | use_map = 0; 3 | flit_size = 40; 4 | 5 | // currently we do not use this, see subnets below 6 | network_count = 2; 7 | 8 | // Topology 9 | topology = fly; 10 | k = 52; 11 | n = 1; 12 | 13 | // Routing 14 | 15 | routing_function = dest_tag; 16 | 17 | // Flow control 18 | 19 | num_vcs = 1; 20 | vc_buf_size = 64; 21 | input_buffer_size = 256; 22 | 
ejection_buffer_size = 64; 23 | boundary_buffer_size = 64; 24 | 25 | wait_for_tail_credit = 0; 26 | 27 | // Router architecture 28 | 29 | vc_allocator = islip; //separable_input_first; 30 | sw_allocator = islip; //separable_input_first; 31 | alloc_iters = 1; 32 | 33 | credit_delay = 0; 34 | routing_delay = 0; 35 | vc_alloc_delay = 1; 36 | sw_alloc_delay = 1; 37 | 38 | input_speedup = 1; 39 | output_speedup = 1; 40 | internal_speedup = 2.0; 41 | 42 | // Traffic, GPGPU-Sim does not use this 43 | 44 | traffic = uniform; 45 | packet_size ={{1,2,3,4},{10,20}}; 46 | packet_size_rate={{1,1,1,1},{2,1}}; 47 | 48 | // Simulation - Don't change 49 | 50 | sim_type = gpgpusim; 51 | //sim_type = latency; 52 | injection_rate = 0.1; 53 | 54 | subnets = 2; 55 | 56 | // Always use read and write no matter following line 57 | //use_read_write = 1; 58 | 59 | 60 | read_request_subnet = 0; 61 | read_reply_subnet = 1; 62 | write_request_subnet = 0; 63 | write_reply_subnet = 1; 64 | 65 | read_request_begin_vc = 0; 66 | read_request_end_vc = 0; 67 | write_request_begin_vc = 0; 68 | write_request_end_vc = 0; 69 | read_reply_begin_vc = 0; 70 | read_reply_end_vc = 0; 71 | write_reply_begin_vc = 0; 72 | write_reply_end_vc = 0; 73 | 74 | -------------------------------------------------------------------------------- /home-made/gpgpusim.config: -------------------------------------------------------------------------------- 1 | //21*1 fly with 32 flits per packet under gpgpusim injection mode 2 | use_map = 0; 3 | flit_size = 40; 4 | 5 | // currently we do not use this, see subnets below 6 | network_count = 2; 7 | 8 | // Topology 9 | topology = fly; 10 | k = 52; 11 | n = 1; 12 | 13 | // Routing 14 | 15 | routing_function = dest_tag; 16 | 17 | // Flow control 18 | 19 | num_vcs = 1; 20 | vc_buf_size = 64; 21 | input_buffer_size = 256; 22 | ejection_buffer_size = 64; 23 | boundary_buffer_size = 64; 24 | 25 | wait_for_tail_credit = 0; 26 | 27 | // Router architecture 28 | 29 | vc_allocator = islip; 
//separable_input_first; 30 | sw_allocator = islip; //separable_input_first; 31 | alloc_iters = 1; 32 | 33 | credit_delay = 0; 34 | routing_delay = 0; 35 | vc_alloc_delay = 1; 36 | sw_alloc_delay = 1; 37 | 38 | input_speedup = 1; 39 | output_speedup = 1; 40 | internal_speedup = 2.0; 41 | 42 | // Traffic, GPGPU-Sim does not use this 43 | 44 | traffic = uniform; 45 | packet_size ={{1,2,3,4},{10,20}}; 46 | packet_size_rate={{1,1,1,1},{2,1}}; 47 | 48 | // Simulation - Don't change 49 | 50 | sim_type = gpgpusim; 51 | //sim_type = latency; 52 | injection_rate = 0.1; 53 | 54 | subnets = 2; 55 | 56 | // Always use read and write no matter following line 57 | //use_read_write = 1; 58 | 59 | 60 | read_request_subnet = 0; 61 | read_reply_subnet = 1; 62 | write_request_subnet = 0; 63 | write_reply_subnet = 1; 64 | 65 | read_request_begin_vc = 0; 66 | read_request_end_vc = 0; 67 | write_request_begin_vc = 0; 68 | write_request_end_vc = 0; 69 | read_reply_begin_vc = 0; 70 | read_reply_end_vc = 0; 71 | write_reply_begin_vc = 0; 72 | write_reply_end_vc = 0; 73 | 74 | -------------------------------------------------------------------------------- /home-made/helloworld.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "common.hpp" 4 | 5 | int main(int argc, char const *argv[]) { 6 | 7 | cudnnHandle_t cudnn; 8 | checkCUDNN(cudnnCreate(&cudnn)); 9 | 10 | printf("Hello World!\n"); 11 | 12 | return 0; 13 | } --------------------------------------------------------------------------------