├── .gitignore ├── Makefile ├── README.md ├── convolution.cu ├── convolution.cuh ├── data └── lena.ppm ├── kernels.h └── main.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Libraries 12 | *.lib 13 | *.a 14 | *.la 15 | *.lo 16 | 17 | # Shared objects (inc. Windows DLLs) 18 | *.dll 19 | *.so 20 | *.so.* 21 | *.dylib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | *.app 27 | *.i*86 28 | *.x86_64 29 | *.hex 30 | 31 | # Debug files 32 | *.dSYM/ 33 | *.su 34 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright 1993-2015 NVIDIA Corporation. All rights reserved. 4 | # 5 | # NOTICE TO USER: 6 | # 7 | # This source code is subject to NVIDIA ownership rights under U.S. and 8 | # international Copyright laws. 9 | # 10 | # NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 11 | # CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 12 | # IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 13 | # REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 14 | # MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 15 | # IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 16 | # OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 17 | # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 18 | # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 19 | # OR PERFORMANCE OF THIS SOURCE CODE. 20 | # 21 | # U.S. Government End Users. This source code is a "commercial item" as 22 | # that term is defined at 48 C.F.R. 
2.101 (OCT 1995), consisting of 23 | # "commercial computer software" and "commercial computer software 24 | # documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 25 | # and is provided to the U.S. Government only as a commercial end item. 26 | # Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 27 | # 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 28 | # source code with only those rights set forth herein. 29 | # 30 | ################################################################################ 31 | # 32 | # Makefile project only supported on Mac OS X and Linux Platforms) 33 | # 34 | ################################################################################ 35 | 36 | # Location of the CUDA Toolkit 37 | CUDA_PATH ?= /usr/local/cuda-8.0 38 | 39 | ############################## 40 | # start deprecated interface # 41 | ############################## 42 | ifeq ($(x86_64),1) 43 | $(info WARNING - x86_64 variable has been deprecated) 44 | $(info WARNING - please use TARGET_ARCH=x86_64 instead) 45 | TARGET_ARCH ?= x86_64 46 | endif 47 | ifeq ($(ARMv7),1) 48 | $(info WARNING - ARMv7 variable has been deprecated) 49 | $(info WARNING - please use TARGET_ARCH=armv7l instead) 50 | TARGET_ARCH ?= armv7l 51 | endif 52 | ifeq ($(aarch64),1) 53 | $(info WARNING - aarch64 variable has been deprecated) 54 | $(info WARNING - please use TARGET_ARCH=aarch64 instead) 55 | TARGET_ARCH ?= aarch64 56 | endif 57 | ifeq ($(ppc64le),1) 58 | $(info WARNING - ppc64le variable has been deprecated) 59 | $(info WARNING - please use TARGET_ARCH=ppc64le instead) 60 | TARGET_ARCH ?= ppc64le 61 | endif 62 | ifneq ($(GCC),) 63 | $(info WARNING - GCC variable has been deprecated) 64 | $(info WARNING - please use HOST_COMPILER=$(GCC) instead) 65 | HOST_COMPILER ?= $(GCC) 66 | endif 67 | ifneq ($(abi),) 68 | $(error ERROR - abi variable has been removed) 69 | endif 70 | ############################ 71 | # end deprecated interface # 72 | 
############################ 73 | 74 | # architecture 75 | HOST_ARCH := $(shell uname -m) 76 | TARGET_ARCH ?= $(HOST_ARCH) 77 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) 78 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 79 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) 80 | TARGET_SIZE := 64 81 | else ifneq (,$(filter $(TARGET_ARCH),armv7l)) 82 | TARGET_SIZE := 32 83 | endif 84 | else 85 | TARGET_SIZE := $(shell getconf LONG_BIT) 86 | endif 87 | else 88 | $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 89 | endif 90 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 91 | ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) 92 | $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) 93 | endif 94 | endif 95 | 96 | # When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l 97 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) 98 | TARGET_ARCH = armv7l 99 | endif 100 | 101 | # operating system 102 | HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") 103 | TARGET_OS ?= $(HOST_OS) 104 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) 105 | $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) 106 | endif 107 | 108 | # host compiler 109 | ifeq ($(TARGET_OS),darwin) 110 | ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) 111 | HOST_COMPILER ?= clang++ 112 | endif 113 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 114 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) 115 | ifeq ($(TARGET_OS),linux) 116 | HOST_COMPILER ?= arm-linux-gnueabihf-g++ 117 | else ifeq ($(TARGET_OS),qnx) 118 | ifeq ($(QNX_HOST),) 119 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 120 | endif 121 | ifeq ($(QNX_TARGET),) 122 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 123 | endif 124 | export QNX_HOST 125 | export QNX_TARGET 126 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ 127 | else ifeq ($(TARGET_OS),android) 128 | HOST_COMPILER ?= arm-linux-androideabi-g++ 129 | endif 130 | else ifeq ($(TARGET_ARCH),aarch64) 131 | ifeq ($(TARGET_OS), linux) 132 | HOST_COMPILER ?= aarch64-linux-gnu-g++ 133 | else ifeq ($(TARGET_OS),qnx) 134 | ifeq ($(QNX_HOST),) 135 | $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) 136 | endif 137 | ifeq ($(QNX_TARGET),) 138 | $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) 139 | endif 140 | export QNX_HOST 141 | export QNX_TARGET 142 | HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ 143 | else ifeq ($(TARGET_OS), android) 144 | HOST_COMPILER ?= aarch64-linux-android-g++ 145 | endif 146 | else ifeq ($(TARGET_ARCH),ppc64le) 147 | HOST_COMPILER ?= powerpc64le-linux-gnu-g++ 148 | endif 149 | endif 150 | HOST_COMPILER ?= g++ 151 | NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) 152 | 153 | # internal flags 154 | NVCCFLAGS := -m${TARGET_SIZE} 155 | CCFLAGS := 156 | LDFLAGS := 157 | 158 | # build flags 159 | ifeq ($(TARGET_OS),darwin) 160 | LDFLAGS += -rpath $(CUDA_PATH)/lib 161 | CCFLAGS += -arch $(HOST_ARCH) 162 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) 163 | LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 164 | CCFLAGS += -mfloat-abi=hard 165 | else ifeq ($(TARGET_OS),android) 166 | LDFLAGS += -pie 
167 | CCFLAGS += -fpie -fpic -fexceptions 168 | endif 169 | 170 | ifneq ($(TARGET_ARCH),$(HOST_ARCH)) 171 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) 172 | ifneq ($(TARGET_FS),) 173 | GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 174 | ifeq ($(GCCVERSIONLTEQ46),1) 175 | CCFLAGS += --sysroot=$(TARGET_FS) 176 | endif 177 | LDFLAGS += --sysroot=$(TARGET_FS) 178 | LDFLAGS += -rpath-link=$(TARGET_FS)/lib 179 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib 180 | LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf 181 | endif 182 | endif 183 | endif 184 | 185 | # Debug build flags 186 | ifeq ($(dbg),1) 187 | NVCCFLAGS += -g -G 188 | BUILD_TYPE := debug 189 | else 190 | BUILD_TYPE := release 191 | endif 192 | 193 | ALL_CCFLAGS := 194 | ALL_CCFLAGS += $(NVCCFLAGS) 195 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) 196 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) 197 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) 198 | 199 | SAMPLE_ENABLED := 1 200 | 201 | ALL_LDFLAGS := 202 | ALL_LDFLAGS += $(ALL_CCFLAGS) 203 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) 204 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) 205 | 206 | # Common includes and paths for CUDA 207 | INCLUDES := -I$(CUDA_PATH)/samples/common/inc 208 | LIBRARIES := 209 | 210 | ################################################################################ 211 | 212 | # Makefile include to help find GL Libraries 213 | include ./findgllib.mk 214 | 215 | # OpenGL specific libraries 216 | ifeq ($(TARGET_OS),darwin) 217 | # Mac OSX specific libraries and paths to include 218 | LIBRARIES += -L/System/Library/Frameworks/OpenGL.framework/Libraries 219 | LIBRARIES += -lGL -lGLU 220 | ALL_LDFLAGS += -Xlinker -framework -Xlinker GLUT 221 | else 222 | LIBRARIES += $(GLLINK) 223 | LIBRARIES += -lGL -lGLU -lX11 -lglut 224 | endif 225 | 226 | # Gencode arguments 227 | #SMS ?= 20 30 35 37 50 52 60 228 | # 52 for GTX 980 229 | SMS = 52 230 | 231 | ifeq ($(SMS),) 232 
| $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) 233 | SAMPLE_ENABLED := 0 234 | endif 235 | 236 | ifeq ($(GENCODE_FLAGS),) 237 | # Generate SASS code for each SM architecture listed in $(SMS) 238 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) 239 | 240 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility 241 | HIGHEST_SM := $(lastword $(sort $(SMS))) 242 | ifneq ($(HIGHEST_SM),) 243 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) 244 | endif 245 | endif 246 | ################################################################################ 247 | 248 | # Target rules 249 | all: build 250 | 251 | build: convolution 252 | 253 | main.o:main.cpp 254 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 255 | 256 | convolution.o:convolution.cu 257 | $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< 258 | 259 | convolution: main.o convolution.o 260 | $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) 261 | 262 | clean: 263 | rm -f main main.o convolution.o 264 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Created by Rob Golshan (uteid: rpg499) 2 | 3 | Credit: 4 | main.cpp: GL functions / idea of the image loop taken from NVIDIA cuda sample code 5 | convolution.cu: Algorithms implemented based on convolutionSeparable.pdf in cuda sample code and a parallel implementation of algorithms in https://web.archive.org/web/20060718054020/http://www.acm.uiuc.edu/siggraph/workshops/wjarosz_convolution_2001.pdf 6 | 7 | What does the program do? 8 | ======================== 9 | This is an implementation of several image processesing algorithms utilizing the parallelism of an NVIDIA GPU via CUDA. Algorithms implemented are: 10 | 1. 
2D Convolution in parallel that works with any kernel (i.e. filter matrix) 11 | * O(radius^2) assuming all blocks run in parallel 12 | 2. 2D Convolution in parallel similar to #1, but uses shared memory. Works with any kernel. 13 | * This is faster than #1. 14 | * Shared memory requirements are (BLOCK_SIZE x kernel radius) squared 15 | * Could possibly be faster (but same time complexity) by loading the source image in a texture 16 | 3. 2D Convolution in parallel with SEPARABLE kernels ONLY. 17 | * Split into two functions that compute convolution of rows or convolution of columns 18 | * O(radius) assuming all blocks run in parallel 19 | 4. Boxfilter 20 | * Similar to #3, but uses properties of box filters to keep time low when using a big radius 21 | * O(width+height) assuming all blocks run in parallel 22 | * Could possibly be faster (but same time complexity) by loading the source image in a texture 23 | * Time taken independent of radius size 24 | * Multiple iterations of this simulate a Gaussian filter 25 | 26 | Filters I purposely did not implement: 27 | 1. FFT filter 28 | * Requires more math knowledge than I currently have 29 | * Implementation would be padding kernel/image and using FFT library in cuda 30 | * Slower than separable implementation 31 | * Should only really be needed with using BIG kernels that are not separable 32 | 2. Gaussian filters 33 | * We can either use a separable filter (#3) or a box filter several times (#4) to get the same result 34 | 35 | Any other filters I didn't implement were either because I thought it was already a filter mentioned earlier, or I missed it in my research 36 | 37 | 38 | 39 | Why use this over NVIDIA sample code? There is no reason. I doubt my implementations are any faster than the samples provided. 
40 | 41 | BUILDING 42 | ========================== 43 | Build with make 44 | 45 | Tested and built on a single GPU system with a GTX 980 (compute capability 5.2) 46 | Have Xwindow system enabled to visually see results 47 | 48 | Must either use the sample lena.ppm or have your own ppm image file 49 | 50 | Running 51 | ======================== 52 | ./convolution --image [path to image] 53 | 54 | While the program is running and the XWindow is in focus, press h for a help command. 55 | -------------------------------------------------------------------------------- /convolution.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Created by Rob Golshan 3 | * Demos common image filters using parallel gpu algorithms 4 | * Algorithms based of convolutionSeperable.pdf in cuda samples 5 | * and wjarosz_convolution_2001.pdf --> converted to parallel 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "convolution.cuh" 15 | 16 | // Kernel cannot have radius bigger than 15 17 | __constant__ int d_kernel[1024]; 18 | 19 | #define BLOCK_SIZE 16 20 | 21 | /* 22 | * Converts a uint to a uint3, seperating RGB 23 | * colors by every byte. 
24 | * Most significat to least significant: 25 | * Red, Green, Blue 26 | */ 27 | __device__ __forceinline__ int3 d_uintToRGB(unsigned int orig) 28 | { 29 | int3 rgb; 30 | rgb.x = orig & 0xff; 31 | rgb.y = (orig>>8)&0xff; 32 | rgb.z = (orig>>16)&0xff; 33 | return rgb; 34 | } 35 | 36 | /* 37 | * Converts a uint3 to an unsigned int 38 | * Assumes each vector member correspond to RGB colors 39 | * Truncates rgb colors bigger than 1 byte 40 | */ 41 | __device__ __forceinline__ unsigned int d_rgbToUint(int3 rgb) 42 | { 43 | if (rgb.x > 0xff) rgb.x = 0xff; 44 | else if (rgb.x < 0) rgb.x = 0; 45 | if (rgb.y > 0xff) rgb.y = 0xff; 46 | else if (rgb.y < 0) rgb.y = 0; 47 | if (rgb.z > 0xff) rgb.z = 0xff; 48 | else if (rgb.z < 0) rgb.z = 0; 49 | 50 | return (rgb.x & 0xff) | ((rgb.y & 0xff) << 8) | ((rgb.z & 0xff) << 16); 51 | } 52 | 53 | /* 54 | * divides an int3 by an int 55 | * Maybe faster to just multiply by float instead.. 56 | */ 57 | __device__ __forceinline__ int3 d_divide(int3 orig, int op) 58 | { 59 | orig.x = orig.x/op; 60 | orig.y = orig.y/op; 61 | orig.z = orig.z/op; 62 | return orig; 63 | } 64 | 65 | /* The most basic convolution method in parallel 66 | * Does not take advantage of memory optimizations with a GPU 67 | * Can be used with any (square) kernel filter 68 | * SLOW 69 | * Each output pixel does radius^2 multiplications 70 | * T = O(radius^2) 71 | * W = O(radius^2 * width * height) 72 | */ 73 | __global__ void d_slowConvolution(unsigned int *d_img, unsigned int *d_result, int width, int height, int radius, int weight) 74 | { 75 | int x = blockIdx.x*blockDim.x + threadIdx.x; 76 | int y = blockIdx.y*blockDim.y + threadIdx.y; 77 | const unsigned int loc = x + y*width; 78 | int3 accumulation = make_int3(0,0,0); 79 | int3 value; 80 | 81 | if (x >= width || y >= height) return; 82 | assert(x < width); 83 | assert(y < height); 84 | for (int i = -radius; i <= radius; i++) { 85 | for (int j = -radius; j <= radius; j++) { 86 | if ((x + i < 0) || //left side out of 
bounds 87 | (x + i >= width) || //right side OoB 88 | (y + j < 0) || //top OoB 89 | (y + j >= height)) //bot OoB 90 | continue; 91 | value = d_uintToRGB(d_img[loc + i + j * width]); 92 | int temp = d_kernel[i + radius + (j+radius)*((radius << 1) + 1)]; 93 | value *= temp; 94 | accumulation += value; 95 | } 96 | } 97 | accumulation = d_divide(accumulation, weight); 98 | d_result[loc] = d_rgbToUint(accumulation); 99 | } 100 | 101 | /* The most basic convolution method in parallel 102 | * Takes advantage of shared memory in a GPU 103 | * Can be used with any (square) kernel filter 104 | * Faster than without shared memory 105 | * Each output pixel does radius^2 multiplications 106 | * T = O(radius^2) 107 | * W = O(radius^2 * width * height) 108 | */ 109 | __global__ void d_sharedSlowConvolution(unsigned int *d_img, unsigned int *d_result, int width, int height, int radius, int weight) 110 | { 111 | // Use a 1d array instead of 2D in order to coalesce memory access 112 | extern __shared__ unsigned int data[]; 113 | 114 | int x = blockIdx.x*blockDim.x + threadIdx.x; 115 | int y = blockIdx.y*blockDim.y + threadIdx.y; 116 | if (x >= width || y >= height) return; 117 | 118 | // memory location in d_img 119 | const unsigned int loc = x + y*width; 120 | 121 | int3 accumulation = make_int3(0,0,0); 122 | int3 value; 123 | 124 | int w = blockDim.x; 125 | int h = blockDim.y; 126 | 127 | /* to convolute the edges of a block, the shared memory must extend outwards of radius */ 128 | #pragma unroll 3 129 | for (int i = -w; i <= w; i+= w) { 130 | #pragma unroll 3 131 | for (int j = -h; j <= h; j+= h) { 132 | int x0 = threadIdx.x + i; 133 | int y0 = threadIdx.y + j; 134 | int newLoc = loc + i + j*width; 135 | if (x0 < -radius || 136 | x0 >= radius + w || 137 | y0 < -radius || 138 | y0 >= radius + h || 139 | newLoc < 0 || 140 | newLoc >= width*height) 141 | continue; 142 | data[threadIdx.x + i + radius + (threadIdx.y + j + radius)*(blockDim.x+(radius << 1))] = d_img[newLoc]; 143 | } 
144 | } 145 | 146 | __syncthreads(); 147 | 148 | for (int i = -radius; i <= radius; i++) { 149 | for (int j = -radius; j <= radius; j++) { 150 | unsigned int t = data[threadIdx.x + i + radius + (threadIdx.y + j + radius)*(blockDim.x+(radius << 1))]; 151 | int temp = d_kernel[i + radius + (j+radius)*((radius << 1) + 1)]; 152 | value = d_uintToRGB(t); 153 | value *= temp; 154 | accumulation += value; 155 | } 156 | } 157 | accumulation = d_divide(accumulation, weight); 158 | d_result[loc] = d_rgbToUint(accumulation); 159 | } 160 | 161 | /* VERY FAST convolution method in parallel 162 | * Takes advantage of shared memory in a GPU 163 | * Can be used with ONLY WITH SEPARABLE kernel filters 164 | * Each output pixel does radius+radius multiplications 165 | * T = O(radius + radius) 166 | * W = O(radius * width + radius*height) 167 | */ 168 | __global__ void d_sepRowConvolution(unsigned int *d_img, unsigned int *d_result, int width, int height, int radius) 169 | { 170 | // Use a 1d array instead of 2D in order to coalesce memory access 171 | extern __shared__ unsigned int data[]; 172 | 173 | int x = blockIdx.x*blockDim.x + threadIdx.x; 174 | int y = blockIdx.y*blockDim.y + threadIdx.y; 175 | if (x >= width || y >= height) return; 176 | 177 | // memory location in d_img 178 | const unsigned int loc = (blockIdx.x*blockDim.x + threadIdx.x) + (blockIdx.y*blockDim.y)*width + threadIdx.y*width; 179 | 180 | int3 accumulation = make_int3(0,0,0); 181 | int3 value; 182 | int weight = 0; 183 | 184 | 185 | int w = blockDim.x; 186 | 187 | /* to convolute the edges of a block, the shared memory must extend outwards of radius */ 188 | #pragma unroll 3 189 | for (int i = -w; i <= w; i+= w) { 190 | int x0 = threadIdx.x + i; 191 | int newLoc = loc + i; 192 | if (x0 < -radius || 193 | x0 >= radius + w || 194 | newLoc < 0 || 195 | newLoc >= width*height) 196 | continue; 197 | data[threadIdx.x + i + radius + (threadIdx.y) *(blockDim.x+(radius << 1))] = d_img[newLoc]; 198 | } 199 | 200 | 
__syncthreads(); 201 | 202 | for (int i = -radius; i <= radius; i++) { 203 | unsigned int t = data[threadIdx.x + i + radius + (threadIdx.y)*(blockDim.x+(radius << 1))]; 204 | int temp = d_kernel[i + radius]; 205 | value = d_uintToRGB(t); 206 | value *= temp; 207 | weight += temp; 208 | accumulation += value; 209 | } 210 | accumulation = d_divide(accumulation, weight); 211 | d_result[loc] = d_rgbToUint(accumulation); 212 | } 213 | 214 | /* VERY FAST convolution method in parallel 215 | * Takes advantage of shared memory in a GPU 216 | * Can be used with ONLY WITH SEPERABLE kernel filters 217 | * Each output pixel does radius^2 multiplications 218 | * T = O(radius + radius) 219 | * W = O(radius * width + radius*height) 220 | */ 221 | __global__ void d_sepColConvolution(unsigned int *d_result, int width, int height, int radius) 222 | { 223 | // Use a 1d array instead of 2D in order to coalesce memory access 224 | extern __shared__ unsigned int data[]; 225 | 226 | int x = blockIdx.x*blockDim.x + threadIdx.x; 227 | int y = blockIdx.y*blockDim.y + threadIdx.y; 228 | if (x >= width || y >= height) return; 229 | 230 | // memory location in d_img 231 | const unsigned int loc = (blockIdx.x*blockDim.x + threadIdx.x) + (blockIdx.y*blockDim.y)*width + threadIdx.y*width; 232 | 233 | int3 accumulation = make_int3(0,0,0); 234 | int3 value; 235 | int weight = 0; 236 | 237 | 238 | int h = blockDim.y; 239 | 240 | /* to convolute the edges of a block, the shared memory must extend outwards of radius */ 241 | #pragma unroll 3 242 | for (int j = -h; j <= h; j+= h) { 243 | int y0 = threadIdx.y + j; 244 | int newLoc = loc + j*width; 245 | if (y0 < -radius || 246 | y0 >= radius + h || 247 | newLoc < 0 || 248 | newLoc >= width*height) 249 | continue; 250 | data[threadIdx.x + (threadIdx.y + j + radius)*(blockDim.x)] = d_result[newLoc]; 251 | } 252 | 253 | __syncthreads(); 254 | 255 | for (int j = -radius; j <= radius; j++) { 256 | unsigned int t = data[threadIdx.x + (threadIdx.y + j + 
radius)*(blockDim.x)]; 257 | float temp = d_kernel[(j + radius)*((radius << 1)+1)]; 258 | value = d_uintToRGB(t); 259 | value *= temp; 260 | weight += temp; 261 | accumulation += value; 262 | } 263 | accumulation = d_divide(accumulation, weight); 264 | d_result[loc] = d_rgbToUint(accumulation); 265 | } 266 | 267 | 268 | /* 269 | * Fast radius independent box filter 270 | * Do Rows followed by Columns 271 | * T = O(width + height) 272 | * W = O(width*height + width*height) 273 | */ 274 | __global__ void d_boxFilterRow(unsigned int *d_img, unsigned int *d_result, int width, int height, int radius) 275 | { 276 | // memory location in d_img 277 | const unsigned int loc = (blockIdx.x*blockDim.x + threadIdx.x) * width; 278 | if (loc > height*width) return; 279 | 280 | d_img = d_img + loc; 281 | d_result = d_result + loc; 282 | int3 accumulation; 283 | int bWeight = (radius<<1) + 1; //all values in kernel weighted equally 284 | 285 | //initial clamping of left value 286 | accumulation = d_uintToRGB(d_img[0])*radius; 287 | for (int i = 0; i < radius + 1; i++) { 288 | accumulation += d_uintToRGB(d_img[i]); 289 | } 290 | d_result[0] = d_rgbToUint(d_divide(accumulation, bWeight)); 291 | 292 | for (int i = 1; i < radius + 1; i++) { 293 | accumulation += d_uintToRGB(d_img[i + radius]); 294 | accumulation -= d_uintToRGB(d_img[0]); //clamp left side 295 | d_result[i] = d_rgbToUint(d_divide(accumulation, bWeight)); 296 | } 297 | 298 | //resuses previous computed value 299 | for (int i = radius + 1; i < width - radius; i++) { 300 | accumulation += d_uintToRGB(d_img[i + radius]); 301 | accumulation -= d_uintToRGB(d_img[i - radius - 1]); 302 | d_result[i] = d_rgbToUint(d_divide(accumulation, bWeight)); 303 | } 304 | 305 | for (int i = width - radius; i < width; i++){ 306 | //clamp right side 307 | accumulation += d_uintToRGB(d_img[width - 1]); 308 | accumulation -= d_uintToRGB(d_img[i - radius - 1]); 309 | d_result[i] = d_rgbToUint(d_divide(accumulation, bWeight)); 310 | } 311 | } 
312 | 313 | 314 | /* 315 | * Fast radius independent box filter 316 | * Do Rows followed by Columns 317 | * d_img should be d_result from the row filter 318 | * T = O(width + height) 319 | * W = O(width*height + width*height) 320 | */ 321 | __global__ void d_boxFilterCol(unsigned int *d_img, unsigned int *d_result, int width, int height, int radius) 322 | { 323 | // memory location in d_img 324 | const unsigned int loc = (blockIdx.x*blockDim.x + threadIdx.x); 325 | if (loc >= width) return; 326 | 327 | d_img = d_img + loc; 328 | d_result = d_result + loc; 329 | int3 accumulation; 330 | int bWeight = (radius<<1) + 1; //all values in kernel weighted equally 331 | 332 | 333 | //initial clamping of left value 334 | accumulation = d_uintToRGB(d_img[0])*radius; 335 | for (int i = 0; i < radius + 1; i++) { 336 | accumulation += d_uintToRGB(d_img[i * width]); 337 | } 338 | d_result[0] = d_rgbToUint(d_divide(accumulation, bWeight)); 339 | 340 | for (int i = 1; i < radius + 1; i++) { 341 | accumulation += d_uintToRGB(d_img[(i + radius) * width]); 342 | accumulation -= d_uintToRGB(d_img[0]); //clamp left side 343 | d_result[i * width] = d_rgbToUint(d_divide(accumulation, bWeight)); 344 | } 345 | 346 | //resuses previous computed value 347 | for (int i = radius + 1; i < height - radius; i++) { 348 | accumulation += d_uintToRGB(d_img[(i + radius)*width]); 349 | accumulation -= d_uintToRGB(d_img[(i - radius)*width - width]); 350 | d_result[i * width] = d_rgbToUint(d_divide(accumulation, bWeight)); 351 | } 352 | 353 | for (int i = height - radius; i < height; i++){ 354 | //clamp right side 355 | accumulation += d_uintToRGB(d_img[(height - 1)*width]); 356 | accumulation -= d_uintToRGB(d_img[(i - radius)*width - width]); 357 | d_result[i * width] = d_rgbToUint(d_divide(accumulation, bWeight)); 358 | } 359 | } 360 | 361 | extern StopWatchInterface *timer; 362 | 363 | /* 364 | * look at main.cpp kerboard interrupts for descriptions on what type and kernels do 365 | */ 366 | double 
convolution(unsigned int *d_img, unsigned int *d_result, int *h_kernel, int width, int height, 367 | int radius, int type, int weight, int iterations) 368 | { 369 | checkCudaErrors(cudaDeviceSynchronize()); 370 | 371 | // threadsPerBlock needs to be a multiple of 32 for proper coalesce 372 | dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE); 373 | //numBlocks should probably be a multiple of warp size here for proper coalesce.. 374 | dim3 numBlocks(ceil((float)width / threadsPerBlock.x), ceil((float)height/threadsPerBlock.y)); 375 | 376 | //copy kernel to device memory 377 | if (radius < 15) 378 | checkCudaErrors(cudaMemcpyToSymbol(d_kernel, h_kernel, ((radius << 1)+1)*((radius << 1)+1)*sizeof(int))); 379 | 380 | unsigned int *d_temp = NULL; 381 | if (type == 3) 382 | checkCudaErrors(cudaMalloc((void **) &d_temp, width*height*sizeof(unsigned int))); 383 | 384 | sdkResetTimer(&timer); 385 | sdkStartTimer(&timer); 386 | for (int i = 0; i < iterations; i++) { 387 | switch (type) { 388 | case 0: 389 | d_slowConvolution<<< numBlocks, threadsPerBlock>>>(d_img, d_result, width, height, radius, weight); 390 | break; 391 | case 1: 392 | d_sharedSlowConvolution<<< numBlocks, threadsPerBlock, (BLOCK_SIZE+(radius << 1))*(BLOCK_SIZE+(radius << 1))*sizeof(unsigned int)>>>(d_img, d_result, width, height, radius, weight); 393 | break; 394 | case 2: 395 | d_sepRowConvolution<<< numBlocks, threadsPerBlock, (BLOCK_SIZE+(radius << 1))*(BLOCK_SIZE)*sizeof(unsigned int)>>>(d_img, d_result, width, height, radius); 396 | d_sepColConvolution<<< numBlocks, threadsPerBlock, (BLOCK_SIZE)*(BLOCK_SIZE+(radius << 1))*sizeof(unsigned int)>>>(d_result, width, height, radius); 397 | break; 398 | case 3: 399 | d_boxFilterRow<<< ceil((float)height/BLOCK_SIZE), BLOCK_SIZE>>>(d_img, d_temp, width, height, radius); 400 | d_boxFilterCol<<< ceil((float)width/BLOCK_SIZE), BLOCK_SIZE>>>(d_temp, d_result, width, height, radius); 401 | break; 402 | } 403 | checkCudaErrors(cudaDeviceSynchronize()); 404 | d_img = 
d_result; 405 | } 406 | sdkStopTimer(&timer); 407 | printf("time taken: %f\n", sdkGetTimerValue(&timer)); 408 | 409 | checkCudaErrors(cudaFree(d_temp)); 410 | return 0; 411 | } 412 | -------------------------------------------------------------------------------- /convolution.cuh: -------------------------------------------------------------------------------- 1 | #ifndef _CONVOLUTION_CUH_ 2 | #define _CONVOLUTION_CUH_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | double convolution(unsigned int *d_img, unsigned int *d_result, int *d_kernel, int width, int height, int radius, int type, int weight, int iterations); 11 | 12 | #endif // #ifndef _CONVOLUTION_CUH_ 13 | -------------------------------------------------------------------------------- /data/lena.ppm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rpgolshan/CUDA-image-processing/b704508fbc375ddf269579f8a498d68b2952dba5/data/lena.ppm -------------------------------------------------------------------------------- /kernels.h: -------------------------------------------------------------------------------- 1 | // identity, r=1 w=1 2 | int k0[] = 3 | { 4 | 0, 0, 0, 5 | 0, 1, 0, 6 | 0, 0, 0 7 | }; 8 | 9 | // blur, r = 2 w = 13 10 | int k1[] = 11 | { 12 | 0, 0, 1, 0, 0, 13 | 0, 1, 1, 1, 0, 14 | 1, 1, 1, 1, 1, 15 | 0, 1, 1, 1, 0, 16 | 0, 0, 1, 0, 0, 17 | }; 18 | 19 | // motion bur, r=4 w=9 20 | int k2[] = 21 | { 22 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 23 | 0, 1, 0, 0, 0, 0, 0, 0, 0, 24 | 0, 0, 1, 0, 0, 0, 0, 0, 0, 25 | 0, 0, 0, 1, 0, 0, 0, 0, 0, 26 | 0, 0, 0, 0, 1, 0, 0, 0, 0, 27 | 0, 0, 0, 0, 0, 1, 0, 0, 0, 28 | 0, 0, 0, 0, 0, 0, 1, 0, 0, 29 | 0, 0, 0, 0, 0, 0, 0, 1, 0, 30 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 31 | }; 32 | 33 | 34 | // horiz edges, r=2 w=1 35 | int k3[] = 36 | { 37 | 0, 0, -1, 0, 0, 38 | 0, 0, -1, 0, 0, 39 | 0, 0, 2, 0, 0, 40 | 0, 0, 0, 0, 0, 41 | 0, 0, 0, 0, 0, 42 | }; 43 | 44 | // vertical edges, r=2 w=1 45 | int 
k4[] = 46 | { 47 | 0, 0, -1, 0, 0, 48 | 0, 0, -1, 0, 0, 49 | 0, 0, 4, 0, 0, 50 | 0, 0, -1, 0, 0, 51 | 0, 0, -1, 0, 0, 52 | }; 53 | 54 | // all edges, r=1 w=1 55 | int k5[] = 56 | { 57 | -1, -1, -1, 58 | -1, 8, -1, 59 | -1, -1, -1 60 | }; 61 | 62 | // sharpen, r=1 w=1 63 | int k6[] = 64 | { 65 | 0, -1, 0, 66 | -1, 5, -1, 67 | 0, -1, 0 68 | }; 69 | 70 | // super sharpen, r=1 w=1 71 | int k7[] = 72 | { 73 | -1, -1, -1, 74 | -1, 9, -1, 75 | -1, -1, -1 76 | }; 77 | 78 | //emboss r=1, w=1 79 | int k8[] = 80 | { 81 | -2, -1, 0, 82 | -1, 1, 1, 83 | 0, 1, 2 84 | }; 85 | 86 | //box filter, r=9(max) w=(r*2+1)^2 87 | int k9[] = 88 | { 89 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 90 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 91 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 92 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 93 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 94 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 95 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 96 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 97 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 98 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 99 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 100 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 101 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 102 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 103 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 104 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 105 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 106 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 107 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 108 | }; 109 | 110 | // guassian blur, r=2 w=273 111 | int guass[] = 112 | { 113 | 1, 4, 7, 4, 1, 114 | 4, 16, 26, 16, 4, 115 | 7, 26, 41, 26, 7, 116 | 4, 16, 26, 16, 4, 117 | 1, 4, 
7, 4, 1 118 | }; 119 | 120 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Created by Rob Golshan 3 | * gl code and helper functions taken from NVIDIA sample code 4 | * Demos common image filters using parallel gpu algorithms 5 | */ 6 | 7 | // OpenGL Graphics includes 8 | #include 9 | #include 10 | 11 | // CUDA includes and interop headers 12 | #include 13 | #include 14 | 15 | // CUDA utilities and system includes 16 | #include 17 | #include // includes cuda.h and cuda_runtime_api.h 18 | #include // includes cuda_runtime_api.h 19 | 20 | // Includes 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "kernels.h" 27 | #include "convolution.cuh" 28 | 29 | int weight = 1; 30 | int radius = 1; 31 | 32 | const char *image_filename = "./data/lena.ppm"; 33 | int filter = 0; 34 | int type = 0; 35 | unsigned int iterations = 1; 36 | 37 | 38 | unsigned int width, height; 39 | unsigned int *h_img = NULL; 40 | unsigned int *d_img = NULL; 41 | //unsigned int *d_result = NULL; 42 | int *k = k0; 43 | 44 | GLuint pbo = 0; // OpenGL pixel buffer object 45 | GLuint texid = 0; // texture 46 | 47 | StopWatchInterface *timer = 0; 48 | 49 | void print_help() 50 | { 51 | printf("press:\n"); 52 | printf("\t\tq\t- Normal Convolution\n"); 53 | printf("\t\tw\t- Normal Convolution with Shared memory\n"); 54 | printf("\t\te\t- Fast Box Filter\n"); 55 | printf("\t\tr\t- Separable Box Filter\n"); 56 | printf("\t\tt\t- Separable Guassian Filter\n"); 57 | printf("\n\n"); 58 | printf("In q or w mode, press 1 - 9, 0 for crazy filters!"); 59 | printf("\n\n"); 60 | } 61 | 62 | // display results using OpenGL 63 | void display() 64 | { 65 | 66 | // execute filter, writing results to pbo 67 | unsigned int *d_result; 68 | checkCudaErrors(cudaGLMapBufferObject((void **)&d_result, pbo)); 69 | convolution(d_img, d_result, k, width, height, radius, 
type, weight, iterations); 70 | checkCudaErrors(cudaGLUnmapBufferObject(pbo)); 71 | 72 | // load texture from pbo 73 | glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); 74 | glBindTexture(GL_TEXTURE_2D, texid); 75 | glPixelStorei(GL_UNPACK_ALIGNMENT, 1); 76 | glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); 77 | glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); 78 | 79 | // display results 80 | glClear(GL_COLOR_BUFFER_BIT); 81 | 82 | glEnable(GL_TEXTURE_2D); 83 | glDisable(GL_DEPTH_TEST); 84 | 85 | glBegin(GL_QUADS); 86 | glTexCoord2f(0, 1); 87 | glVertex2f(0, 0); 88 | glTexCoord2f(1, 1); 89 | glVertex2f(1, 0); 90 | glTexCoord2f(1, 0); 91 | glVertex2f(1, 1); 92 | glTexCoord2f(0, 0); 93 | glVertex2f(0, 1); 94 | glEnd(); 95 | 96 | glDisable(GL_TEXTURE_2D); 97 | glutSwapBuffers(); 98 | } 99 | 100 | void idle() 101 | { 102 | glutPostRedisplay(); 103 | } 104 | 105 | void cleanup() 106 | { 107 | sdkDeleteTimer(&timer); 108 | 109 | checkCudaErrors(cudaFree(d_img)); 110 | if (pbo) 111 | { 112 | checkCudaErrors(cudaGLUnregisterBufferObject(pbo)); 113 | glDeleteBuffers(1, &pbo); 114 | } 115 | 116 | if (texid) 117 | { 118 | glDeleteTextures(1, &texid); 119 | } 120 | } 121 | 122 | const char *s = "identity"; 123 | const char *s_type = "Normal convolution"; 124 | int prev_type= 0; 125 | void keyboard(unsigned char key, int x, int y) 126 | { 127 | switch (key) 128 | { 129 | case 27: 130 | glutDestroyWindow(glutGetWindow()); 131 | return; 132 | break; 133 | case '=': 134 | case '+': 135 | if (type == 3) 136 | radius++; 137 | else if (filter == 9) 138 | radius >= 9? radius=9: radius++; 139 | break; 140 | case '-': 141 | case '_': 142 | if (filter == 9) 143 | radius <= 0? radius=0: radius--; 144 | break; 145 | case '[': 146 | iterations <= 1? 
iterations=1: iterations--; 147 | break; 148 | case ']': 149 | iterations+=1; 150 | break; 151 | case '0': 152 | filter = 0; 153 | weight = 1; 154 | radius = 1; 155 | k = k0; 156 | s = "identity"; 157 | type = prev_type; 158 | break; 159 | case '1': 160 | filter = 1; 161 | weight = 13; 162 | radius = 2; 163 | k = k1; 164 | s = "blur"; 165 | type = prev_type; 166 | break; 167 | case '2': 168 | filter = 2; 169 | weight = 9; 170 | radius = 4; 171 | k = k2; 172 | s = "motion blur"; 173 | type = prev_type; 174 | break; 175 | case '3': 176 | filter = 3; 177 | weight = 1; 178 | radius = 2; 179 | k = k3; 180 | s = "detect horizontol edges"; 181 | type = prev_type; 182 | break; 183 | case '4': 184 | filter = 4; 185 | weight = 1; 186 | radius = 2; 187 | k = k4; 188 | s = "detect vertical edges"; 189 | type = prev_type; 190 | break; 191 | case '5': 192 | filter = 5; 193 | weight = 1; 194 | radius = 1; 195 | k = k5; 196 | s = "detect all edges"; 197 | type = prev_type; 198 | break; 199 | case '6': 200 | filter = 6; 201 | weight = 1; 202 | radius = 1; 203 | k = k6; 204 | s = "sharpen"; 205 | type = prev_type; 206 | break; 207 | case '7': 208 | filter = 7; 209 | weight = 273; 210 | radius = 2; 211 | k = guass; 212 | s = "guassian blur"; 213 | type = prev_type; 214 | break; 215 | case '8': 216 | filter = 8; 217 | weight = 1; 218 | radius = 1; 219 | k = k8; 220 | s = "emboss"; 221 | type = prev_type; 222 | break; 223 | case '9': 224 | filter = 9; 225 | radius = 9; 226 | weight = ((radius<<1)+1)*((radius<<1)+1); 227 | k = k9; 228 | s = "box filter (max r=9)"; 229 | type = prev_type; 230 | break; 231 | case 'q': 232 | if (filter == 9) { 233 | s = "box filter (max r=9)"; 234 | k = k9; 235 | weight = ((radius<<1)+1)*((radius<<1)+1); 236 | } 237 | if (filter == 7) { 238 | s = "guassian blur"; 239 | k = guass; 240 | weight = 273; 241 | } 242 | prev_type = 0; 243 | type = 0; 244 | if (radius > 9) radius=9; 245 | break; 246 | case 'w': 247 | if (filter == 9) { 248 | s = "box filter (max 
r=9)"; 249 | k = k9; 250 | weight = ((radius<<1)+1)*((radius<<1)+1); 251 | } 252 | if (filter == 7) { 253 | s = "guassian blur"; 254 | k = guass; 255 | weight = 273; 256 | } 257 | prev_type = 1; 258 | type = 1; 259 | if (radius > 9) radius=9; 260 | break; 261 | case 'e': 262 | filter = 9; 263 | type = 3; 264 | s = "fast box filter"; 265 | break; 266 | case 'r': 267 | filter = 9; 268 | type = 2; 269 | k = k9; 270 | s = "separable box filter"; 271 | break; 272 | case 't': 273 | filter = 7; 274 | type = 2; 275 | radius = 2; 276 | k = guass; 277 | s = "separable guassian blur"; 278 | break; 279 | default: 280 | print_help(); 281 | break; 282 | } 283 | 284 | switch (type) { 285 | case 0: 286 | s_type = "Normal"; 287 | break; 288 | case 1: 289 | s_type = "Normalw/ shared memory"; 290 | break; 291 | case 2: 292 | s_type = "Separable"; 293 | break; 294 | case 3: 295 | s_type = "Fast Box Filter"; 296 | break; 297 | } 298 | 299 | 300 | printf("filter: %-30s\tconvolution function: %-30s\tradius: %3d\titerations:%d ", s, s_type, radius, iterations); 301 | 302 | glutPostRedisplay(); 303 | } 304 | 305 | void reshape(int x, int y) 306 | { 307 | glViewport(0, 0, x, y); 308 | 309 | glMatrixMode(GL_MODELVIEW); 310 | glLoadIdentity(); 311 | 312 | glMatrixMode(GL_PROJECTION); 313 | glLoadIdentity(); 314 | glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); 315 | } 316 | 317 | 318 | 319 | void initCudaBuffers() 320 | { 321 | unsigned int size = width * height * sizeof(unsigned int); 322 | unsigned int ksize = (2*radius+1)*(2*radius+1) * sizeof(int); 323 | 324 | // allocate device memory 325 | checkCudaErrors(cudaMalloc((void **) &d_img, size)); 326 | checkCudaErrors(cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice)); 327 | sdkCreateTimer(&timer); 328 | } 329 | 330 | void initGLBuffers() 331 | { 332 | // create pixel buffer object to store final image 333 | glGenBuffers(1, &pbo); 334 | glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); 335 | glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, 
width*height*sizeof(GLubyte)*4, h_img, GL_STREAM_DRAW_ARB); 336 | 337 | glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); 338 | checkCudaErrors(cudaGLRegisterBufferObject(pbo)); 339 | 340 | // create texture for display 341 | glGenTextures(1, &texid); 342 | glBindTexture(GL_TEXTURE_2D, texid); 343 | glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); 344 | glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); 345 | glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); 346 | glBindTexture(GL_TEXTURE_2D, 0); 347 | } 348 | 349 | void initGL(int *argc, char **argv) 350 | { 351 | glutInit(argc, argv); 352 | glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); 353 | glutInitWindowSize(width, height); 354 | glutCreateWindow("CUDA image processing"); 355 | glutDisplayFunc(display); 356 | glutKeyboardFunc(keyboard); 357 | glutReshapeFunc(reshape); 358 | glutIdleFunc(NULL); //IDLE here so its not just endlessly computing 359 | 360 | glutCloseFunc(cleanup); 361 | 362 | printf("Press '+' and '-' to change filter width\n"); 363 | printf("0, 1, 2 - change filter order\n"); 364 | printf("a = slow convolution, b = slow convolution w/ shared memory\n"); 365 | 366 | if (!isGLVersionSupported(2,0) || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) 367 | { 368 | fprintf(stderr, "Required OpenGL extensions missing."); 369 | exit(EXIT_FAILURE); 370 | } 371 | } 372 | 373 | int main(int argc, char **argv) 374 | { 375 | setenv ("DISPLAY", ":0", 0); 376 | printf("[%s] - Starting...\n", argv[0]); 377 | 378 | //Use command-line specified CUDA device, otherwise use device with highest Gflops/s 379 | findCudaDevice(argc, (const char **)argv); 380 | 381 | int nFailures = 0; 382 | 383 | // Get the path of the filename 384 | char *filename; 385 | 386 | if (getCmdLineArgumentString(argc, (const char **) argv, "image", &filename)) 387 | { 388 | image_filename = filename; 389 | } 390 | 391 | // load image 392 | 
char *image_path = sdkFindFilePath(image_filename, argv[0]); 393 | 394 | if (image_path == NULL) 395 | { 396 | fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename); 397 | exit(EXIT_FAILURE); 398 | } 399 | 400 | //PPM images only 401 | sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); 402 | 403 | if (!h_img) 404 | { 405 | printf("Error unable to load PPM file: '%s'\n", image_path); 406 | exit(EXIT_FAILURE); 407 | } 408 | 409 | initGL(&argc, argv); 410 | findCudaGLDevice(argc, (const char **)argv); 411 | printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); 412 | 413 | initCudaBuffers(); 414 | 415 | initGLBuffers(); 416 | glutMainLoop(); 417 | exit(EXIT_SUCCESS); 418 | } 419 | 420 | --------------------------------------------------------------------------------