├── .gitignore ├── Makefile ├── README.md ├── convolution.cu ├── convolution.cuh ├── data └── lena.ppm ├── kernels.h └── main.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Libraries 12 | *.lib 13 | *.a 14 | *.la 15 | *.lo 16 | 17 | # Shared objects (inc. Windows DLLs) 18 | *.dll 19 | *.so 20 | *.so.* 21 | *.dylib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | *.app 27 | *.i*86 28 | *.x86_64 29 | *.hex 30 | 31 | # Debug files 32 | *.dSYM/ 33 | *.su 34 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright 1993-2015 NVIDIA Corporation. All rights reserved. 4 | # 5 | # NOTICE TO USER: 6 | # 7 | # This source code is subject to NVIDIA ownership rights under U.S. and 8 | # international Copyright laws. 9 | # 10 | # NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 11 | # CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 12 | # IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 13 | # REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 14 | # MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 15 | # IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 16 | # OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 17 | # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 18 | # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 19 | # OR PERFORMANCE OF THIS SOURCE CODE. 20 | # 21 | # U.S. Government End Users. This source code is a "commercial item" as 22 | # that term is defined at 48 C.F.R. 
2.101 (OCT 1995), consisting of 23 | # "commercial computer software" and "commercial computer software 24 | # documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 25 | # and is provided to the U.S. Government only as a commercial end item. 26 | # Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 27 | # 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 28 | # source code with only those rights set forth herein. 29 | # 30 | ################################################################################ 31 | # 32 | # Makefile project only supported on Mac OS X and Linux Platforms) 33 | # 34 | ################################################################################ 35 | 36 | # Location of the CUDA Toolkit 37 | CUDA_PATH ?= /usr/local/cuda-8.0 38 | 39 | ############################## 40 | # start deprecated interface # 41 | ############################## 42 | ifeq ($(x86_64),1) 43 | $(info WARNING - x86_64 variable has been deprecated) 44 | $(info WARNING - please use TARGET_ARCH=x86_64 instead) 45 | TARGET_ARCH ?= x86_64 46 | endif 47 | ifeq ($(ARMv7),1) 48 | $(info WARNING - ARMv7 variable has been deprecated) 49 | $(info WARNING - please use TARGET_ARCH=armv7l instead) 50 | TARGET_ARCH ?= armv7l 51 | endif 52 | ifeq ($(aarch64),1) 53 | $(info WARNING - aarch64 variable has been deprecated) 54 | $(info WARNING - please use TARGET_ARCH=aarch64 instead) 55 | TARGET_ARCH ?= aarch64 56 | endif 57 | ifeq ($(ppc64le),1) 58 | $(info WARNING - ppc64le variable has been deprecated) 59 | $(info WARNING - please use TARGET_ARCH=ppc64le instead) 60 | TARGET_ARCH ?= ppc64le 61 | endif 62 | ifneq ($(GCC),) 63 | $(info WARNING - GCC variable has been deprecated) 64 | $(info WARNING - please use HOST_COMPILER=$(GCC) instead) 65 | HOST_COMPILER ?= $(GCC) 66 | endif 67 | ifneq ($(abi),) 68 | $(error ERROR - abi variable has been removed) 69 | endif 70 | ############################ 71 | # end deprecated interface # 72 | 
############################ 73 | 74 | # architecture 75 | HOST_ARCH := $(shell uname -m) 76 | TARGET_ARCH ?= $(HOST_ARCH) 77 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) 78 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 79 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) 80 | TARGET_SIZE := 64 81 | else ifneq (,$(filter $(TARGET_ARCH),armv7l)) 82 | TARGET_SIZE := 32 83 | endif 84 | else 85 | TARGET_SIZE := $(shell getconf LONG_BIT) 86 | endif 87 | else 88 | $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 89 | endif 90 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 91 | ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) 92 | $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 93 | endif 94 | endif 95 | 96 | # When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l 97 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) 98 | TARGET_ARCH = armv7l 99 | endif 100 | 101 | # operating system 102 | HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") 103 | TARGET_OS ?= $(HOST_OS) 104 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) 105 | $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) 106 | endif 107 | 108 | # host compiler 109 | ifeq ($(TARGET_OS),darwin) 110 | ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) 111 | HOST_COMPILER ?= clang++ 112 | endif 113 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 114 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) 115 | ifeq ($(TARGET_OS),linux) 116 | HOST_COMPILER ?= arm-linux-gnueabihf-g++ 117 | else ifeq ($(TARGET_OS),qnx) 118 | ifeq ($(QNX_HOST),) 119 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 120 | endif 121 | ifeq ($(QNX_TARGET),) 122 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 123 | endif 124 | export QNX_HOST 125 | export QNX_TARGET 126 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ 127 | else ifeq ($(TARGET_OS),android) 128 | HOST_COMPILER ?= arm-linux-androideabi-g++ 129 | endif 130 | else ifeq ($(TARGET_ARCH),aarch64) 131 | ifeq ($(TARGET_OS), linux) 132 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 133 | else ifeq ($(TARGET_OS),qnx) 134 | ifeq ($(QNX_HOST),) 135 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 136 | endif 137 | ifeq ($(QNX_TARGET),) 138 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 139 | endif 140 | export QNX_HOST 141 | export QNX_TARGET 142 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ 143 | else ifeq ($(TARGET_OS), android) 144 | HOST_COMPILER ?= aarch64-linux-android-g++ 145 | endif 146 | else ifeq ($(TARGET_ARCH),ppc64le) 147 | HOST_COMPILER ?= powerpc64le-linux-gnu-g++ 148 | endif 149 | endif 150 | HOST_COMPILER ?= g++ 151 | NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) 152 | 153 | # internal flags 154 | NVCCFLAGS := -m${TARGET_SIZE} 155 | CCFLAGS := 156 | LDFLAGS := 157 | 158 | # build flags 159 | ifeq ($(TARGET_OS),darwin) 160 | LDFLAGS += -rpath $(CUDA_PATH)/lib 161 | CCFLAGS += -arch $(HOST_ARCH) 162 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) 163 | LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 164 | CCFLAGS += -mfloat-abi=hard 165 | else ifeq ($(TARGET_OS),android) 166 | LDFLAGS += -pie 
167 | CCFLAGS += -fpie -fpic -fexceptions 168 | endif 169 | 170 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 171 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) 172 | ifneq ($(TARGET_FS),) 173 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 174 | ifeq ($(GCCVERSIONLTEQ46),1) 175 | CCFLAGS += --sysroot=$(TARGET_FS) 176 | endif 177 | LDFLAGS += --sysroot=$(TARGET_FS) 178 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib 179 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib 180 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf 181 | endif 182 | endif 183 | endif 184 | 185 | # Debug build flags 186 | ifeq ($(dbg),1) 187 | NVCCFLAGS += -g -G 188 | BUILD_TYPE := debug 189 | else 190 | BUILD_TYPE := release 191 | endif 192 | 193 | ALL_CCFLAGS := 194 | ALL_CCFLAGS += $(NVCCFLAGS) 195 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) 196 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) 197 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) 198 | 199 | SAMPLE_ENABLED := 1 200 | 201 | ALL_LDFLAGS := 202 | ALL_LDFLAGS += $(ALL_CCFLAGS) 203 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) 204 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) 205 | 206 | # Common includes and paths for CUDA 207 | INCLUDES := -I$(CUDA_PATH)/samples/common/inc 208 | LIBRARIES := 209 | 210 | ################################################################################ 211 | 212 | # Makefile include to help find GL Libraries 213 | include ./findgllib.mk 214 | 215 | # OpenGL specific libraries 216 | ifeq ($(TARGET_OS),darwin) 217 | # Mac OSX specific libraries and paths to include 218 | LIBRARIES += -L/System/Library/Frameworks/OpenGL.framework/Libraries 219 | LIBRARIES += -lGL -lGLU 220 | ALL_LDFLAGS += -Xlinker -framework -Xlinker GLUT 221 | else 222 | LIBRARIES += $(GLLINK) 223 | LIBRARIES += -lGL -lGLU -lX11 -lglut 224 | endif 225 | 226 | # Gencode arguments 227 | #SMS ?= 20 30 35 37 50 52 60 228 | # 52 for GTX 980 229 | SMS = 52 230 | 231 | ifeq ($(SMS),) 232 
| $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) 233 | SAMPLE_ENABLED := 0 234 | endif 235 | 236 | ifeq ($(GENCODE_FLAGS),) 237 | # Generate SASS code for each SM architecture listed in $(SMS) 238 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 239 | 240 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility 241 | HIGHEST_SM := $(lastword $(sort $(SMS))) 242 | ifneq ($(HIGHEST_SM),) 243 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 244 | endif 245 | endif 246 | ################################################################################ 247 | 248 | # Target rules 249 | all: build 250 | 251 | build: convolution 252 | 253 | main.o:main.cpp 254 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 255 | 256 | convolution.o:convolution.cu 257 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 258 | 259 | convolution: main.o convolution.o 260 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 261 | 262 | clean: 263 | rm -f main main.o convolution.o 264 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Created by Rob Golshan (uteid: rpg499) 2 | 3 | Credit: 4 | main.cpp: GL functions / idea of the image loop taken from NVIDIA cuda sample code 5 | convolution.cu: Algorithms implemented based on convolutionSeparable.pdf in cuda sample code and a parallel implementation of algorithms in https://web.archive.org/web/20060718054020/http://www.acm.uiuc.edu/siggraph/workshops/wjarosz_convolution_2001.pdf 6 | 7 | What does the program do? 8 | ======================== 9 | This is an implementation of several image processesing algorithms utilizing the parallelism of an NVIDIA GPU via CUDA. Algorithms implemented are: 10 | 1. 
2D Convolution in parallel that works with any kernel (i.e. filter matrix) 11 | * O(radius^2) assuming all blocks run in parallel 12 | 2. 2D Convolution in parallel similar to #1, but uses shared memory. Works with any kernel. 13 | * This is faster than #1. 14 | * Shared memory requirements are (BLOCK_SIZE x kernel radius) squared 15 | * Could possibly be faster (but same time complexity) by loading the source image in a texture 16 | 3. 2D Convolution in parallel with SEPARABLE kernels ONLY. 17 | * Split into two functions that compute convolution of rows or convolution of columns 18 | * O(radius) assuming all blocks run in parallel 19 | 4. Boxfilter 20 | * Similar to #3, but uses properties of box filters to keep time low when using a big radius 21 | * O(width+height) assuming all blocks run in parallel 22 | * Could possibly be faster (but same time complexity) by loading the source image in a texture 23 | * Time taken independent of radius size 24 | * Multiple iterations of this simulate a Gaussian filter 25 | 26 | Filters I purposely did not implement: 27 | 1. FFT filter 28 | * Requires more math knowledge than I currently have 29 | * Implementation would be padding kernel/image and using FFT library in cuda 30 | * Slower than separable implementation 31 | * Should only really be needed with using BIG kernels that are not separable 32 | 2. Gaussian filters 33 | * We can either use a separable filter (#3) or a box filter several times (#4) to get the same result 34 | 35 | Any other filters I didn't implement were either because I thought it was already a filter mentioned earlier, or I missed it in my research 36 | 37 | 38 | 39 | Why use this over NVIDIA sample code? There is no reason. I doubt my implementations are any faster than the samples provided. 
40 | 41 | BUILDING 42 | ========================== 43 | Build with make 44 | 45 | Tested and built on a single GPU system with a GTX 980 (compute capability 5.2) 46 | Have Xwindow system enabled to visually see results 47 | 48 | Must either use the sample lena.ppm or have your own ppm image file 49 | 50 | Running 51 | ======================== 52 | ./convolution --image [path to image] 53 | 54 | While the program is running and the XWindow is in focus, press h for a help command. 55 | -------------------------------------------------------------------------------- /convolution.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Created by Rob Golshan 3 | * Demos common image filters using parallel gpu algorithms 4 | * Algorithms based of convolutionSeperable.pdf in cuda samples 5 | * and wjarosz_convolution_2001.pdf --> converted to parallel 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "convolution.cuh" 15 | 16 | // Kernel cannot have radius bigger than 15 17 | __constant__ int d_kernel[1024]; 18 | 19 | #define BLOCK_SIZE 16 20 | 21 | /* 22 | * Converts a uint to a uint3, seperating RGB 23 | * colors by every byte. 
24 | * Most significat to least significant: 25 | * Red, Green, Blue 26 | */ 27 | __device__ __forceinline__ int3 d_uintToRGB(unsigned int orig) 28 | { 29 | int3 rgb; 30 | rgb.x = orig & 0xff; 31 | rgb.y = (orig>>8)&0xff; 32 | rgb.z = (orig>>16)&0xff; 33 | return rgb; 34 | } 35 | 36 | /* 37 | * Converts a uint3 to an unsigned int 38 | * Assumes each vector member correspond to RGB colors 39 | * Truncates rgb colors bigger than 1 byte 40 | */ 41 | __device__ __forceinline__ unsigned int d_rgbToUint(int3 rgb) 42 | { 43 | if (rgb.x > 0xff) rgb.x = 0xff; 44 | else if (rgb.x < 0) rgb.x = 0; 45 | if (rgb.y > 0xff) rgb.y = 0xff; 46 | else if (rgb.y < 0) rgb.y = 0; 47 | if (rgb.z > 0xff) rgb.z = 0xff; 48 | else if (rgb.z < 0) rgb.z = 0; 49 | 50 | return (rgb.x & 0xff) | ((rgb.y & 0xff) << 8) | ((rgb.z & 0xff) << 16); 51 | } 52 | 53 | /* 54 | * divides an int3 by an int 55 | * Maybe faster to just multiply by float instead.. 56 | */ 57 | __device__ __forceinline__ int3 d_divide(int3 orig, int op) 58 | { 59 | orig.x = orig.x/op; 60 | orig.y = orig.y/op; 61 | orig.z = orig.z/op; 62 | return orig; 63 | } 64 | 65 | /* The most basic convolution method in parallel 66 | * Does not take advantage of memory optimizations with a GPU 67 | * Can be used with any (square) kernel filter 68 | * SLOW 69 | * Each output pixel does radius^2 multiplications 70 | * T = O(radius^2) 71 | * W = O(radius^2 * width * height) 72 | */ 73 | __global__ void d_slowConvolution(unsigned int *d_img, unsigned int *d_result, int width, int height, int radius, int weight) 74 | { 75 | int x = blockIdx.x*blockDim.x + threadIdx.x; 76 | int y = blockIdx.y*blockDim.y + threadIdx.y; 77 | const unsigned int loc = x + y*width; 78 | int3 accumulation = make_int3(0,0,0); 79 | int3 value; 80 | 81 | if (x >= width || y >= height) return; 82 | assert(x < width); 83 | assert(y < height); 84 | for (int i = -radius; i <= radius; i++) { 85 | for (int j = -radius; j <= radius; j++) { 86 | if ((x + i < 0) || //left side out of 
bounds 87 | (x + i >= width) || //right side OoB 88 | (y + j < 0) || //top OoB 89 | (y + j >= height)) //bot OoB 90 | continue; 91 | value = d_uintToRGB(d_img[loc + i + j * width]); 92 | int temp = d_kernel[i + radius + (j+radius)*((radius << 1) + 1)]; 93 | value *= temp; 94 | accumulation += value; 95 | } 96 | } 97 | accumulation = d_divide(accumulation, weight); 98 | d_result[loc] = d_rgbToUint(accumulation); 99 | } 100 | 101 | /* The most basic convolution method in parallel 102 | * Takes advantage of shared memory in a GPU 103 | * Can be used with any (square) kernel filter 104 | * Faster than without shared memory 105 | * Each output pixel does radius^2 multiplications 106 | * T = O(radius^2) 107 | * W = O(radius^2 * width * height) 108 | */ 109 | __global__ void d_sharedSlowConvolution(unsigned int *d_img, unsigned int *d_result, int width, int height, int radius, int weight) 110 | { 111 | // Use a 1d array instead of 2D in order to coalesce memory access 112 | extern __shared__ unsigned int data[]; 113 | 114 | int x = blockIdx.x*blockDim.x + threadIdx.x; 115 | int y = blockIdx.y*blockDim.y + threadIdx.y; 116 | if (x >= width || y >= height) return; 117 | 118 | // memory location in d_img 119 | const unsigned int loc = x + y*width; 120 | 121 | int3 accumulation = make_int3(0,0,0); 122 | int3 value; 123 | 124 | int w = blockDim.x; 125 | int h = blockDim.y; 126 | 127 | /* to convolute the edges of a block, the shared memory must extend outwards of radius */ 128 | #pragma unroll 3 129 | for (int i = -w; i <= w; i+= w) { 130 | #pragma unroll 3 131 | for (int j = -h; j <= h; j+= h) { 132 | int x0 = threadIdx.x + i; 133 | int y0 = threadIdx.y + j; 134 | int newLoc = loc + i + j*width; 135 | if (x0 < -radius || 136 | x0 >= radius + w || 137 | y0 < -radius || 138 | y0 >= radius + h || 139 | newLoc < 0 || 140 | newLoc >= width*height) 141 | continue; 142 | data[threadIdx.x + i + radius + (threadIdx.y + j + radius)*(blockDim.x+(radius << 1))] = d_img[newLoc]; 143 | } 
144 | } 145 | 146 | __syncthreads(); 147 | 148 | for (int i = -radius; i <= radius; i++) { 149 | for (int j = -radius; j <= radius; j++) { 150 | unsigned int t = data[threadIdx.x + i + radius + (threadIdx.y + j + radius)*(blockDim.x+(radius << 1))]; 151 | int temp = d_kernel[i + radius + (j+radius)*((radius << 1) + 1)]; 152 | value = d_uintToRGB(t); 153 | value *= temp; 154 | accumulation += value; 155 | } 156 | } 157 | accumulation = d_divide(accumulation, weight); 158 | d_result[loc] = d_rgbToUint(accumulation); 159 | } 160 | 161 | /* VERY FAST convolution method in parallel 162 | * Takes advantage of shared memory in a GPU 163 | * Can be used with ONLY WITH SEPARABLE kernel filters 164 | * Each output pixel does radius+radius multiplications 165 | * T = O(radius + radius) 166 | * W = O(radius * width + radius*height) 167 | */ 168 | __global__ void d_sepRowConvolution(unsigned int *d_img, unsigned int *d_result, int width, int height, int radius) 169 | { 170 | // Use a 1d array instead of 2D in order to coalesce memory access 171 | extern __shared__ unsigned int data[]; 172 | 173 | int x = blockIdx.x*blockDim.x + threadIdx.x; 174 | int y = blockIdx.y*blockDim.y + threadIdx.y; 175 | if (x >= width || y >= height) return; 176 | 177 | // memory location in d_img 178 | const unsigned int loc = (blockIdx.x*blockDim.x + threadIdx.x) + (blockIdx.y*blockDim.y)*width + threadIdx.y*width; 179 | 180 | int3 accumulation = make_int3(0,0,0); 181 | int3 value; 182 | int weight = 0; 183 | 184 | 185 | int w = blockDim.x; 186 | 187 | /* to convolute the edges of a block, the shared memory must extend outwards of radius */ 188 | #pragma unroll 3 189 | for (int i = -w; i <= w; i+= w) { 190 | int x0 = threadIdx.x + i; 191 | int newLoc = loc + i; 192 | if (x0 < -radius || 193 | x0 >= radius + w || 194 | newLoc < 0 || 195 | newLoc >= width*height) 196 | continue; 197 | data[threadIdx.x + i + radius + (threadIdx.y) *(blockDim.x+(radius << 1))] = d_img[newLoc]; 198 | } 199 | 200 | 
__syncthreads(); 201 | 202 | for (int i = -radius; i <= radius; i++) { 203 | unsigned int t = data[threadIdx.x + i + radius + (threadIdx.y)*(blockDim.x+(radius << 1))]; 204 | int temp = d_kernel[i + radius]; 205 | value = d_uintToRGB(t); 206 | value *= temp; 207 | weight += temp; 208 | accumulation += value; 209 | } 210 | accumulation = d_divide(accumulation, weight); 211 | d_result[loc] = d_rgbToUint(accumulation); 212 | } 213 | 214 | /* VERY FAST convolution method in parallel 215 | * Takes advantage of shared memory in a GPU 216 | * Can be used with ONLY WITH SEPERABLE kernel filters 217 | * Each output pixel does radius^2 multiplications 218 | * T = O(radius + radius) 219 | * W = O(radius * width + radius*height) 220 | */ 221 | __global__ void d_sepColConvolution(unsigned int *d_result, int width, int height, int radius) 222 | { 223 | // Use a 1d array instead of 2D in order to coalesce memory access 224 | extern __shared__ unsigned int data[]; 225 | 226 | int x = blockIdx.x*blockDim.x + threadIdx.x; 227 | int y = blockIdx.y*blockDim.y + threadIdx.y; 228 | if (x >= width || y >= height) return; 229 | 230 | // memory location in d_img 231 | const unsigned int loc = (blockIdx.x*blockDim.x + threadIdx.x) + (blockIdx.y*blockDim.y)*width + threadIdx.y*width; 232 | 233 | int3 accumulation = make_int3(0,0,0); 234 | int3 value; 235 | int weight = 0; 236 | 237 | 238 | int h = blockDim.y; 239 | 240 | /* to convolute the edges of a block, the shared memory must extend outwards of radius */ 241 | #pragma unroll 3 242 | for (int j = -h; j <= h; j+= h) { 243 | int y0 = threadIdx.y + j; 244 | int newLoc = loc + j*width; 245 | if (y0 < -radius || 246 | y0 >= radius + h || 247 | newLoc < 0 || 248 | newLoc >= width*height) 249 | continue; 250 | data[threadIdx.x + (threadIdx.y + j + radius)*(blockDim.x)] = d_result[newLoc]; 251 | } 252 | 253 | __syncthreads(); 254 | 255 | for (int j = -radius; j <= radius; j++) { 256 | unsigned int t = data[threadIdx.x + (threadIdx.y + j + 
radius)*(blockDim.x)]; 257 | float temp = d_kernel[(j + radius)*((radius << 1)+1)]; 258 | value = d_uintToRGB(t); 259 | value *= temp; 260 | weight += temp; 261 | accumulation += value; 262 | } 263 | accumulation = d_divide(accumulation, weight); 264 | d_result[loc] = d_rgbToUint(accumulation); 265 | } 266 | 267 | 268 | /* 269 | * Fast radius independent box filter 270 | * Do Rows followed by Columns 271 | * T = O(width + height) 272 | * W = O(width*height + width*height) 273 | */ 274 | __global__ void d_boxFilterRow(unsigned int *d_img, unsigned int *d_result, int width, int height, int radius) 275 | { 276 | // memory location in d_img 277 | const unsigned int loc = (blockIdx.x*blockDim.x + threadIdx.x) * width; 278 | if (loc > height*width) return; 279 | 280 | d_img = d_img + loc; 281 | d_result = d_result + loc; 282 | int3 accumulation; 283 | int bWeight = (radius<<1) + 1; //all values in kernel weighted equally 284 | 285 | //initial clamping of left value 286 | accumulation = d_uintToRGB(d_img[0])*radius; 287 | for (int i = 0; i < radius + 1; i++) { 288 | accumulation += d_uintToRGB(d_img[i]); 289 | } 290 | d_result[0] = d_rgbToUint(d_divide(accumulation, bWeight)); 291 | 292 | for (int i = 1; i < radius + 1; i++) { 293 | accumulation += d_uintToRGB(d_img[i + radius]); 294 | accumulation -= d_uintToRGB(d_img[0]); //clamp left side 295 | d_result[i] = d_rgbToUint(d_divide(accumulation, bWeight)); 296 | } 297 | 298 | //resuses previous computed value 299 | for (int i = radius + 1; i < width - radius; i++) { 300 | accumulation += d_uintToRGB(d_img[i + radius]); 301 | accumulation -= d_uintToRGB(d_img[i - radius - 1]); 302 | d_result[i] = d_rgbToUint(d_divide(accumulation, bWeight)); 303 | } 304 | 305 | for (int i = width - radius; i < width; i++){ 306 | //clamp right side 307 | accumulation += d_uintToRGB(d_img[width - 1]); 308 | accumulation -= d_uintToRGB(d_img[i - radius - 1]); 309 | d_result[i] = d_rgbToUint(d_divide(accumulation, bWeight)); 310 | } 311 | } 
312 | 313 | 314 | /* 315 | * Fast radius independent box filter 316 | * Do Rows followed by Columns 317 | * d_img should be d_result from the row filter 318 | * T = O(width + height) 319 | * W = O(width*height + width*height) 320 | */ 321 | __global__ void d_boxFilterCol(unsigned int *d_img, unsigned int *d_result, int width, int height, int radius) 322 | { 323 | // memory location in d_img 324 | const unsigned int loc = (blockIdx.x*blockDim.x + threadIdx.x); 325 | if (loc >= width) return; 326 | 327 | d_img = d_img + loc; 328 | d_result = d_result + loc; 329 | int3 accumulation; 330 | int bWeight = (radius<<1) + 1; //all values in kernel weighted equally 331 | 332 | 333 | //initial clamping of left value 334 | accumulation = d_uintToRGB(d_img[0])*radius; 335 | for (int i = 0; i < radius + 1; i++) { 336 | accumulation += d_uintToRGB(d_img[i * width]); 337 | } 338 | d_result[0] = d_rgbToUint(d_divide(accumulation, bWeight)); 339 | 340 | for (int i = 1; i < radius + 1; i++) { 341 | accumulation += d_uintToRGB(d_img[(i + radius) * width]); 342 | accumulation -= d_uintToRGB(d_img[0]); //clamp left side 343 | d_result[i * width] = d_rgbToUint(d_divide(accumulation, bWeight)); 344 | } 345 | 346 | //resuses previous computed value 347 | for (int i = radius + 1; i < height - radius; i++) { 348 | accumulation += d_uintToRGB(d_img[(i + radius)*width]); 349 | accumulation -= d_uintToRGB(d_img[(i - radius)*width - width]); 350 | d_result[i * width] = d_rgbToUint(d_divide(accumulation, bWeight)); 351 | } 352 | 353 | for (int i = height - radius; i < height; i++){ 354 | //clamp right side 355 | accumulation += d_uintToRGB(d_img[(height - 1)*width]); 356 | accumulation -= d_uintToRGB(d_img[(i - radius)*width - width]); 357 | d_result[i * width] = d_rgbToUint(d_divide(accumulation, bWeight)); 358 | } 359 | } 360 | 361 | extern StopWatchInterface *timer; 362 | 363 | /* 364 | * look at main.cpp kerboard interrupts for descriptions on what type and kernels do 365 | */ 366 | double 
convolution(unsigned int *d_img, unsigned int *d_result, int *h_kernel, int width, int height, 367 | int radius, int type, int weight, int iterations) 368 | { 369 | checkCudaErrors(cudaDeviceSynchronize()); 370 | 371 | // threadsPerBlock needs to be a multiple of 32 for proper coalesce 372 | dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE); 373 | //numBlocks should probably be a multiple of warp size here for proper coalesce.. 374 | dim3 numBlocks(ceil((float)width / threadsPerBlock.x), ceil((float)height/threadsPerBlock.y)); 375 | 376 | //copy kernel to device memory 377 | if (radius < 15) 378 | checkCudaErrors(cudaMemcpyToSymbol(d_kernel, h_kernel, ((radius << 1)+1)*((radius << 1)+1)*sizeof(int))); 379 | 380 | unsigned int *d_temp = NULL; 381 | if (type == 3) 382 | checkCudaErrors(cudaMalloc((void **) &d_temp, width*height*sizeof(unsigned int))); 383 | 384 | sdkResetTimer(&timer); 385 | sdkStartTimer(&timer); 386 | for (int i = 0; i < iterations; i++) { 387 | switch (type) { 388 | case 0: 389 | d_slowConvolution<<< numBlocks, threadsPerBlock>>>(d_img, d_result, width, height, radius, weight); 390 | break; 391 | case 1: 392 | d_sharedSlowConvolution<<< numBlocks, threadsPerBlock, (BLOCK_SIZE+(radius << 1))*(BLOCK_SIZE+(radius << 1))*sizeof(unsigned int)>>>(d_img, d_result, width, height, radius, weight); 393 | break; 394 | case 2: 395 | d_sepRowConvolution<<< numBlocks, threadsPerBlock, (BLOCK_SIZE+(radius << 1))*(BLOCK_SIZE)*sizeof(unsigned int)>>>(d_img, d_result, width, height, radius); 396 | d_sepColConvolution<<< numBlocks, threadsPerBlock, (BLOCK_SIZE)*(BLOCK_SIZE+(radius << 1))*sizeof(unsigned int)>>>(d_result, width, height, radius); 397 | break; 398 | case 3: 399 | d_boxFilterRow<<< ceil((float)height/BLOCK_SIZE), BLOCK_SIZE>>>(d_img, d_temp, width, height, radius); 400 | d_boxFilterCol<<< ceil((float)width/BLOCK_SIZE), BLOCK_SIZE>>>(d_temp, d_result, width, height, radius); 401 | break; 402 | } 403 | checkCudaErrors(cudaDeviceSynchronize()); 404 | d_img = 
d_result; 405 | } 406 | sdkStopTimer(&timer); 407 | printf("time taken: %f\n", sdkGetTimerValue(&timer)); 408 | 409 | checkCudaErrors(cudaFree(d_temp)); 410 | return 0; 411 | } 412 | -------------------------------------------------------------------------------- /convolution.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _CONVOLUTION_CUH_ 2 | #define _CONVOLUTION_CUH_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | double convolution(unsigned int *d_img, unsigned int *d_result, int *d_kernel, int width, int height, int radius, int type, int weight, int iterations); 11 | 12 | #endif // #ifndef _CONVOLUTION_CUH_ 13 | -------------------------------------------------------------------------------- /data/lena.ppm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rpgolshan/CUDA-image-processing/b704508fbc375ddf269579f8a498d68b2952dba5/data/lena.ppm -------------------------------------------------------------------------------- /kernels.h: -------------------------------------------------------------------------------- 1 | // identity, r=1 w=1 2 | int k0[] = 3 | { 4 | 0, 0, 0, 5 | 0, 1, 0, 6 | 0, 0, 0 7 | }; 8 | 9 | // blur, r = 2 w = 13 10 | int k1[] = 11 | { 12 | 0, 0, 1, 0, 0, 13 | 0, 1, 1, 1, 0, 14 | 1, 1, 1, 1, 1, 15 | 0, 1, 1, 1, 0, 16 | 0, 0, 1, 0, 0, 17 | }; 18 | 19 | // motion bur, r=4 w=9 20 | int k2[] = 21 | { 22 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 23 | 0, 1, 0, 0, 0, 0, 0, 0, 0, 24 | 0, 0, 1, 0, 0, 0, 0, 0, 0, 25 | 0, 0, 0, 1, 0, 0, 0, 0, 0, 26 | 0, 0, 0, 0, 1, 0, 0, 0, 0, 27 | 0, 0, 0, 0, 0, 1, 0, 0, 0, 28 | 0, 0, 0, 0, 0, 0, 1, 0, 0, 29 | 0, 0, 0, 0, 0, 0, 0, 1, 0, 30 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 31 | }; 32 | 33 | 34 | // horiz edges, r=2 w=1 35 | int k3[] = 36 | { 37 | 0, 0, -1, 0, 0, 38 | 0, 0, -1, 0, 0, 39 | 0, 0, 2, 0, 0, 40 | 0, 0, 0, 0, 0, 41 | 0, 0, 0, 0, 0, 42 | }; 43 | 44 | // vertical edges, r=2 w=1 45 | int 
k4[] = 46 | { 47 | 0, 0, -1, 0, 0, 48 | 0, 0, -1, 0, 0, 49 | 0, 0, 4, 0, 0, 50 | 0, 0, -1, 0, 0, 51 | 0, 0, -1, 0, 0, 52 | }; 53 | 54 | // all edges, r=1 w=1 55 | int k5[] = 56 | { 57 | -1, -1, -1, 58 | -1, 8, -1, 59 | -1, -1, -1 60 | }; 61 | 62 | // sharpen, r=1 w=1 63 | int k6[] = 64 | { 65 | 0, -1, 0, 66 | -1, 5, -1, 67 | 0, -1, 0 68 | }; 69 | 70 | // super sharpen, r=1 w=1 71 | int k7[] = 72 | { 73 | -1, -1, -1, 74 | -1, 9, -1, 75 | -1, -1, -1 76 | }; 77 | 78 | //emboss r=1, w=1 79 | int k8[] = 80 | { 81 | -2, -1, 0, 82 | -1, 1, 1, 83 | 0, 1, 2 84 | }; 85 | 86 | //box filter, r=9(max) w=(r*2+1)^2 87 | int k9[] = 88 | { 89 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 90 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 91 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 92 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 93 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 94 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 95 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 96 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 97 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 98 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 99 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 100 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 101 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 102 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 103 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 104 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 105 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 106 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 107 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 108 | }; 109 | 110 | // guassian blur, r=2 w=273 111 | int guass[] = 112 | { 113 | 1, 4, 7, 4, 1, 114 | 4, 16, 26, 16, 4, 115 | 7, 26, 41, 26, 7, 116 | 4, 16, 26, 16, 4, 117 | 1, 4, 
7, 4, 1 118 | }; 119 | 120 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Created by Rob Golshan 3 | * gl code and helper functions taken from NVIDIA sample code 4 | * Demos common image filters using parallel gpu algorithms 5 | */ 6 | 7 | // OpenGL Graphics includes 8 | #include 9 | #include 10 | 11 | // CUDA includes and interop headers 12 | #include 13 | #include 14 | 15 | // CUDA utilities and system includes 16 | #include 17 | #include // includes cuda.h and cuda_runtime_api.h 18 | #include // includes cuda_runtime_api.h 19 | 20 | // Includes 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "kernels.h" 27 | #include "convolution.cuh" 28 | 29 | int weight = 1; 30 | int radius = 1; 31 | 32 | const char *image_filename = "./data/lena.ppm"; 33 | int filter = 0; 34 | int type = 0; 35 | unsigned int iterations = 1; 36 | 37 | 38 | unsigned int width, height; 39 | unsigned int *h_img = NULL; 40 | unsigned int *d_img = NULL; 41 | //unsigned int *d_result = NULL; 42 | int *k = k0; 43 | 44 | GLuint pbo = 0; // OpenGL pixel buffer object 45 | GLuint texid = 0; // texture 46 | 47 | StopWatchInterface *timer = 0; 48 | 49 | void print_help() 50 | { 51 | printf("press:\n"); 52 | printf("\t\tq\t- Normal Convolution\n"); 53 | printf("\t\tw\t- Normal Convolution with Shared memory\n"); 54 | printf("\t\te\t- Fast Box Filter\n"); 55 | printf("\t\tr\t- Separable Box Filter\n"); 56 | printf("\t\tt\t- Separable Guassian Filter\n"); 57 | printf("\n\n"); 58 | printf("In q or w mode, press 1 - 9, 0 for crazy filters!"); 59 | printf("\n\n"); 60 | } 61 | 62 | // display results using OpenGL 63 | void display() 64 | { 65 | 66 | // execute filter, writing results to pbo 67 | unsigned int *d_result; 68 | checkCudaErrors(cudaGLMapBufferObject((void **)&d_result, pbo)); 69 | convolution(d_img, d_result, k, width, height, radius, 
type, weight, iterations); 70 | checkCudaErrors(cudaGLUnmapBufferObject(pbo)); 71 | 72 | // load texture from pbo 73 | glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); 74 | glBindTexture(GL_TEXTURE_2D, texid); 75 | glPixelStorei(GL_UNPACK_ALIGNMENT, 1); 76 | glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); 77 | glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); 78 | 79 | // display results 80 | glClear(GL_COLOR_BUFFER_BIT); 81 | 82 | glEnable(GL_TEXTURE_2D); 83 | glDisable(GL_DEPTH_TEST); 84 | 85 | glBegin(GL_QUADS); 86 | glTexCoord2f(0, 1); 87 | glVertex2f(0, 0); 88 | glTexCoord2f(1, 1); 89 | glVertex2f(1, 0); 90 | glTexCoord2f(1, 0); 91 | glVertex2f(1, 1); 92 | glTexCoord2f(0, 0); 93 | glVertex2f(0, 1); 94 | glEnd(); 95 | 96 | glDisable(GL_TEXTURE_2D); 97 | glutSwapBuffers(); 98 | } 99 | 100 | void idle() 101 | { 102 | glutPostRedisplay(); 103 | } 104 | 105 | void cleanup() 106 | { 107 | sdkDeleteTimer(&timer); 108 | 109 | checkCudaErrors(cudaFree(d_img)); 110 | if (pbo) 111 | { 112 | checkCudaErrors(cudaGLUnregisterBufferObject(pbo)); 113 | glDeleteBuffers(1, &pbo); 114 | } 115 | 116 | if (texid) 117 | { 118 | glDeleteTextures(1, &texid); 119 | } 120 | } 121 | 122 | const char *s = "identity"; 123 | const char *s_type = "Normal convolution"; 124 | int prev_type= 0; 125 | void keyboard(unsigned char key, int x, int y) 126 | { 127 | switch (key) 128 | { 129 | case 27: 130 | glutDestroyWindow(glutGetWindow()); 131 | return; 132 | break; 133 | case '=': 134 | case '+': 135 | if (type == 3) 136 | radius++; 137 | else if (filter == 9) 138 | radius >= 9? radius=9: radius++; 139 | break; 140 | case '-': 141 | case '_': 142 | if (filter == 9) 143 | radius <= 0? radius=0: radius--; 144 | break; 145 | case '[': 146 | iterations <= 1? 
iterations=1: iterations--; 147 | break; 148 | case ']': 149 | iterations+=1; 150 | break; 151 | case '0': 152 | filter = 0; 153 | weight = 1; 154 | radius = 1; 155 | k = k0; 156 | s = "identity"; 157 | type = prev_type; 158 | break; 159 | case '1': 160 | filter = 1; 161 | weight = 13; 162 | radius = 2; 163 | k = k1; 164 | s = "blur"; 165 | type = prev_type; 166 | break; 167 | case '2': 168 | filter = 2; 169 | weight = 9; 170 | radius = 4; 171 | k = k2; 172 | s = "motion blur"; 173 | type = prev_type; 174 | break; 175 | case '3': 176 | filter = 3; 177 | weight = 1; 178 | radius = 2; 179 | k = k3; 180 | s = "detect horizontol edges"; 181 | type = prev_type; 182 | break; 183 | case '4': 184 | filter = 4; 185 | weight = 1; 186 | radius = 2; 187 | k = k4; 188 | s = "detect vertical edges"; 189 | type = prev_type; 190 | break; 191 | case '5': 192 | filter = 5; 193 | weight = 1; 194 | radius = 1; 195 | k = k5; 196 | s = "detect all edges"; 197 | type = prev_type; 198 | break; 199 | case '6': 200 | filter = 6; 201 | weight = 1; 202 | radius = 1; 203 | k = k6; 204 | s = "sharpen"; 205 | type = prev_type; 206 | break; 207 | case '7': 208 | filter = 7; 209 | weight = 273; 210 | radius = 2; 211 | k = guass; 212 | s = "guassian blur"; 213 | type = prev_type; 214 | break; 215 | case '8': 216 | filter = 8; 217 | weight = 1; 218 | radius = 1; 219 | k = k8; 220 | s = "emboss"; 221 | type = prev_type; 222 | break; 223 | case '9': 224 | filter = 9; 225 | radius = 9; 226 | weight = ((radius<<1)+1)*((radius<<1)+1); 227 | k = k9; 228 | s = "box filter (max r=9)"; 229 | type = prev_type; 230 | break; 231 | case 'q': 232 | if (filter == 9) { 233 | s = "box filter (max r=9)"; 234 | k = k9; 235 | weight = ((radius<<1)+1)*((radius<<1)+1); 236 | } 237 | if (filter == 7) { 238 | s = "guassian blur"; 239 | k = guass; 240 | weight = 273; 241 | } 242 | prev_type = 0; 243 | type = 0; 244 | if (radius > 9) radius=9; 245 | break; 246 | case 'w': 247 | if (filter == 9) { 248 | s = "box filter (max 
r=9)"; 249 | k = k9; 250 | weight = ((radius<<1)+1)*((radius<<1)+1); 251 | } 252 | if (filter == 7) { 253 | s = "guassian blur"; 254 | k = guass; 255 | weight = 273; 256 | } 257 | prev_type = 1; 258 | type = 1; 259 | if (radius > 9) radius=9; 260 | break; 261 | case 'e': 262 | filter = 9; 263 | type = 3; 264 | s = "fast box filter"; 265 | break; 266 | case 'r': 267 | filter = 9; 268 | type = 2; 269 | k = k9; 270 | s = "separable box filter"; 271 | break; 272 | case 't': 273 | filter = 7; 274 | type = 2; 275 | radius = 2; 276 | k = guass; 277 | s = "separable guassian blur"; 278 | break; 279 | default: 280 | print_help(); 281 | break; 282 | } 283 | 284 | switch (type) { 285 | case 0: 286 | s_type = "Normal"; 287 | break; 288 | case 1: 289 | s_type = "Normalw/ shared memory"; 290 | break; 291 | case 2: 292 | s_type = "Separable"; 293 | break; 294 | case 3: 295 | s_type = "Fast Box Filter"; 296 | break; 297 | } 298 | 299 | 300 | printf("filter: %-30s\tconvolution function: %-30s\tradius: %3d\titerations:%d ", s, s_type, radius, iterations); 301 | 302 | glutPostRedisplay(); 303 | } 304 | 305 | void reshape(int x, int y) 306 | { 307 | glViewport(0, 0, x, y); 308 | 309 | glMatrixMode(GL_MODELVIEW); 310 | glLoadIdentity(); 311 | 312 | glMatrixMode(GL_PROJECTION); 313 | glLoadIdentity(); 314 | glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); 315 | } 316 | 317 | 318 | 319 | void initCudaBuffers() 320 | { 321 | unsigned int size = width * height * sizeof(unsigned int); 322 | unsigned int ksize = (2*radius+1)*(2*radius+1) * sizeof(int); 323 | 324 | // allocate device memory 325 | checkCudaErrors(cudaMalloc((void **) &d_img, size)); 326 | checkCudaErrors(cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice)); 327 | sdkCreateTimer(&timer); 328 | } 329 | 330 | void initGLBuffers() 331 | { 332 | // create pixel buffer object to store final image 333 | glGenBuffers(1, &pbo); 334 | glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); 335 | glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, 
width*height*sizeof(GLubyte)*4, h_img, GL_STREAM_DRAW_ARB); 336 | 337 | glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); 338 | checkCudaErrors(cudaGLRegisterBufferObject(pbo)); 339 | 340 | // create texture for display 341 | glGenTextures(1, &texid); 342 | glBindTexture(GL_TEXTURE_2D, texid); 343 | glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); 344 | glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); 345 | glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); 346 | glBindTexture(GL_TEXTURE_2D, 0); 347 | } 348 | 349 | void initGL(int *argc, char **argv) 350 | { 351 | glutInit(argc, argv); 352 | glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); 353 | glutInitWindowSize(width, height); 354 | glutCreateWindow("CUDA image processing"); 355 | glutDisplayFunc(display); 356 | glutKeyboardFunc(keyboard); 357 | glutReshapeFunc(reshape); 358 | glutIdleFunc(NULL); //IDLE here so its not just endlessly computing 359 | 360 | glutCloseFunc(cleanup); 361 | 362 | printf("Press '+' and '-' to change filter width\n"); 363 | printf("0, 1, 2 - change filter order\n"); 364 | printf("a = slow convolution, b = slow convolution w/ shared memory\n"); 365 | 366 | if (!isGLVersionSupported(2,0) || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) 367 | { 368 | fprintf(stderr, "Required OpenGL extensions missing."); 369 | exit(EXIT_FAILURE); 370 | } 371 | } 372 | 373 | int main(int argc, char **argv) 374 | { 375 | setenv ("DISPLAY", ":0", 0); 376 | printf("[%s] - Starting...\n", argv[0]); 377 | 378 | //Use command-line specified CUDA device, otherwise use device with highest Gflops/s 379 | findCudaDevice(argc, (const char **)argv); 380 | 381 | int nFailures = 0; 382 | 383 | // Get the path of the filename 384 | char *filename; 385 | 386 | if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename)) 387 | { 388 | image_filename = filename; 389 | } 390 | 391 | // load image 392 | 
char *image_path = sdkFindFilePath(image_filename, argv[0]); 393 | 394 | if (image_path == NULL) 395 | { 396 | fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename); 397 | exit(EXIT_FAILURE); 398 | } 399 | 400 | //PPM images only 401 | sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); 402 | 403 | if (!h_img) 404 | { 405 | printf("Error unable to load PPM file: '%s'\n", image_path); 406 | exit(EXIT_FAILURE); 407 | } 408 | 409 | initGL(&argc, argv); 410 | findCudaGLDevice(argc, (const char **)argv); 411 | printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); 412 | 413 | initCudaBuffers(); 414 | 415 | initGLBuffers(); 416 | glutMainLoop(); 417 | exit(EXIT_SUCCESS); 418 | } 419 | 420 | --------------------------------------------------------------------------------