├── cris.png
├── doge.jpg
├── huge.jpg
├── large.jpg
├── tiny.jpg
├── medium.jpg
├── source0.png
├── source1.jpg
├── source2.jpg
├── source3.jpg
├── verylarge.jpg
├── destination0.png
├── destination1.jpg
├── destination2.jpg
├── destination3.jpg
├── huge_source.jpg
├── large_source.jpg
├── tiny_source.jpg
├── wall_source.jpg
├── medium_source.jpg
├── verylarge_source.jpg
├── README.md
├── poisson_serial.py
├── parallel_poisson.py
└── parallel_poisson_cuda.py


/cris.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/cris.png


--------------------------------------------------------------------------------
/doge.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/doge.jpg


--------------------------------------------------------------------------------
/huge.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/huge.jpg


--------------------------------------------------------------------------------
/large.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/large.jpg


--------------------------------------------------------------------------------
/tiny.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/tiny.jpg


--------------------------------------------------------------------------------
/medium.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/medium.jpg


--------------------------------------------------------------------------------
/source0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/source0.png


--------------------------------------------------------------------------------
/source1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/source1.jpg


--------------------------------------------------------------------------------
/source2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/source2.jpg


--------------------------------------------------------------------------------
/source3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/source3.jpg


--------------------------------------------------------------------------------
/verylarge.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/verylarge.jpg


--------------------------------------------------------------------------------
/destination0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/destination0.png


--------------------------------------------------------------------------------
/destination1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/destination1.jpg


--------------------------------------------------------------------------------
/destination2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/destination2.jpg


--------------------------------------------------------------------------------
/destination3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/destination3.jpg


--------------------------------------------------------------------------------
/huge_source.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/huge_source.jpg


--------------------------------------------------------------------------------
/large_source.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/large_source.jpg


--------------------------------------------------------------------------------
/tiny_source.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/tiny_source.jpg


--------------------------------------------------------------------------------
/wall_source.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/wall_source.jpg


--------------------------------------------------------------------------------
/medium_source.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/medium_source.jpg


--------------------------------------------------------------------------------
/verylarge_source.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ealehman/parallel-poisson-blending/HEAD/verylarge_source.jpg


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Parallelization of Poisson Blending
 2 | ===================================
 3 | By Jason Ting, Ryan Lee, Alex Lehman
 4 | 
 5 | Final Project for Computer Science 205, Fall 2013 (Cris Cecka)
 6 | --------------------------------------------------------------
 7 | 
 8 | The objective of the Poisson Blending algorithm is to compose a source image and a target image in the gradient domain. The code implements Poisson Blending in parallel with CUDA and Cheetah to efficiently and automatically superimpose images without visible seams.
 9 | 
10 | How to Run:
11 | -----------
12 | There are two ways to run the code:
13 | 
14 | 1) Using the images included in the folder and the course software load, execute the following on the Resonance node:  
15 | $ python parallel_poisson.py [# iterations]
16 | 
17 | 2) Specifying the source and destination images that you would like to process, execute the following, again on the Resonance node:  
18 | $ python parallel_poisson.py [source image] [destination image] [# iterations]
19 | 
20 | Benchmarking:
21 | -------------
22 | For the purposes of analysis, the average time per iteration was computed over 800 iterations (N) for destination images of 5 sizes: (200, 142), (375, 266), (750, 531), (1500, 1062), and (2500, 1770).
23 | 


--------------------------------------------------------------------------------
/poisson_serial.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Jason Ting, Alex Lehman, Ryan Lee
 3 | CS205 Final Project
 4 | Serial Implementation of Discretized Poisson Blending Algorithm
 5 | '''
 6 | 
 7 | import numpy as np
 8 | import os
 9 | from sys import argv
10 | import Image
11 | import time
12 | 
def mask(source_im):
	'''
	Compute the interior pixels of the source region that will be blended.

	A pixel belongs to the mask when it is not pure white, i.e. when at least
	one of its three channels differs from 255.  A mask pixel is "interior"
	when its left, right, upper and lower neighbors all belong to the mask
	as well; pixels on the image edge are therefore never interior.

	source_im -- array indexed as [row, column, channel] with 3 channels

	Returns a list of (row, column) tuples in row-major order.
	'''
	# True wherever ANY channel differs from 255.  The previous np.all(...)
	# test wrongly discarded colored pixels such as (255, 0, 0).
	not_white = np.any(np.asarray(source_im) != [255, 255, 255], axis=2)

	# np.argwhere yields coordinates in row-major order, same as a nested loop
	mask_pixels = [(int(i), int(j)) for i, j in np.argwhere(not_white)]

	# A set gives constant-time neighbor lookups; testing membership against
	# the list made this step quadratic in the number of mask pixels.
	in_mask = set(mask_pixels)

	interior = []
	for i, j in mask_pixels:
		if ((i, j + 1) in in_mask and (i, j - 1) in in_mask
				and (i + 1, j) in in_mask and (i - 1, j) in in_mask):
			interior.append((i, j))
	return interior
27 | 
def poisson_serial(source_im, dest_im, out_im, interior, buffer1, buffer2, N):
	'''
	Blend the source into the destination with N Jacobi iterations per channel.

	source_im -- source image, indexed [row, column, channel]
	dest_im   -- destination image, same indexing; supplies the boundary values
	out_im    -- image that receives the blended interior pixels (modified
	             in place and also returned)
	interior  -- list of (row, column) tuples to solve for, as built by mask()
	buffer1, buffer2 -- kept for backward compatibility only; the solver now
	             uses its own private pair of buffers and leaves these untouched
	N         -- number of Jacobi iterations per color channel

	Prints the iteration counter once per iteration, as before.
	'''
	# Constant-time membership test instead of scanning the list per neighbor
	interior_set = set(interior)

	for color in [0, 1, 2]:
		# Float copy of the channel so differences never wrap around
		source_channel = np.array(source_im[:, :, color], dtype=float)

		# Two distinct buffers: a Jacobi step must read only values from the
		# previous iteration.  The old "buffer1 = buffer2" made both names
		# refer to one array, so updates leaked into the same pass.
		current = source_channel.copy()
		following = source_channel.copy()

		for count in range(N):
			print(count)

			for i, j in interior:
				sum1 = 0.0
				sum2 = 0.0

				for k, l in ((i, j + 1), (i, j - 1), (i + 1, j), (i - 1, j)):
					# Interior neighbors come from the previous iterate,
					# everything else is a fixed destination pixel
					if (k, l) in interior_set:
						sum1 += current[k, l]
					else:
						sum1 += dest_im[k, l, color]

					# Gradient of the source image towards this neighbor
					sum2 += source_channel[i, j] - source_channel[k, l]

				# Clip to the valid intensity range
				following[i, j] = min(255, max(0, (sum1 + sum2) / 4.0))

			# Every interior value was rewritten, so swapping is safe
			current, following = following, current

		# Copy the solved channel into the output image
		for i, j in interior:
			out_im[i, j, color] = current[i, j]

	return out_im
64 | 
if __name__ == '__main__':
	# Command line: [source image] [destination image] [number of iterations]
	if len(argv) != 4:
		print("Usage: python " + argv[0] + " [source image] [destination image] [number of iterations]")
		# A usage error is a failure, so report it with a non-zero status
		raise SystemExit(1)

	# Number of iterations
	N = int(argv[3])

	# Load the images as float arrays.  Forcing RGB guarantees exactly three
	# channels, which mask() and poisson_serial() rely on (PNG inputs may
	# carry an alpha channel).
	source_im = np.array(Image.open(argv[1]).convert('RGB'), dtype = float)
	dest_im = np.array(Image.open(argv[2]).convert('RGB'), dtype = float)

	# Buffers the size of the destination; still part of the solver signature
	buffer1 = np.zeros((dest_im.shape[0], dest_im.shape[1]), dtype = float)
	buffer2 = np.zeros((dest_im.shape[0], dest_im.shape[1]), dtype = float)

	# Work on a copy so the loaded destination image stays untouched
	out_im = dest_im.copy()

	# Time mask construction plus blending
	start = time.time()
	interior = mask(source_im)
	out_im = np.uint8(poisson_serial(source_im, dest_im, out_im, interior, buffer1, buffer2, N))
	stop = time.time()

	print(str(N) + ' Iterations, Serial Time: ' + str(stop - start) + ' seconds')

	out_im = Image.fromarray(out_im, 'RGB')
	out_im.save('result.png')
91 | 	


--------------------------------------------------------------------------------
/parallel_poisson.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Jason Ting, Alex Lehman, Ryan Lee
  3 | CS205 Final Project
  4 | Parallelized Implementation of Discretized Poisson Blending Algorithm using CUDA with Cheetah
  5 | '''
  6 | 
  7 | from PIL import Image
  8 | import numpy as np
  9 | import time
 10 | import glob
 11 | import pycuda.autoinit
 12 | import pycuda.driver as cu
 13 | import pycuda.compiler as nvcc
 14 | import pycuda.gpuarray as gpu
 15 | from Cheetah.Template import Template
 16 | 
 17 | # define the CUDA kernels for the mask and blending
 18 | mask_source = """
 19 | // define the interior pixels and make border pixels white
 20 | __global__ void mask_kernel(uchar3* source)
 21 | {
 22 | 	// Compute thread id in x, y, and coalesced
 23 |     int i = $BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
 24 |     int j = $BLOCK_DIM_X * blockIdx.x + threadIdx.x;
 25 |     int tid = i * $WIDTH + j;
 26 | 
 27 |     // ensure each pixel is within image size and not white
 28 |     if (i >= 0 && i < $HEIGHT && j >= 0 && j < $WIDTH && (source[tid].x < 255 || source[tid].y < 255 || source[tid].z < 255)) {
 29 |     	// set up calculcations
 30 |     	int pos;
 31 | 
 32 |     	// goes over neighbors (up, down, left, right)
 33 |     	#for ($x,$y) in $NEIGHBORS
 34 |     		// define position for neighbor
 35 | 		    pos = tid + $x + $y*$WIDTH;
 36 | 
 37 | 		    // changes pixels in the border to white 
 38 | 		    if (source[pos].x == 255 && source[pos].y == 255 && source[pos].z == 255) {
 39 | 		    	#for $l in $RGB
 40 | 		    		source[tid].$l = 255;
 41 | 		    	#end for
 42 | 		    }
 43 |     	#end for
 44 |     }
 45 | }
 46 | """
 47 | 
 48 | poisson_blending_source = """
 49 | __global__ void poisson_blending_kernel(uchar3* source, uchar3* destination, uchar3* buffer)
 50 | {
 51 |     // Compute thread id in x, y, and coalesced
 52 |     int i = $BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
 53 |     int j = $BLOCK_DIM_X * blockIdx.x + threadIdx.x;
 54 |     int tid = i * $WIDTH + j;
 55 | 
 56 |     // ensure each pixel is within image size and not white
 57 |     if (i >= 0 && i < $HEIGHT && j >= 0 && j < $WIDTH && (buffer[tid].x < 255 || buffer[tid].y < 255 || buffer[tid].z < 255)){
 58 |     	// set up calculations for next buffer
 59 | 		int pos;
 60 | 		float sum;
 61 | 
 62 | 		// iterates over RGB
 63 |     	#for $l in $RGB
 64 | 		    // setup calculations
 65 | 		    sum = 0.0;
 66 | 		    float next_buffer_$l = 0.0;
 67 | 
 68 | 		   	// iterates over neighbors (up, down, left, right)
 69 | 		    #for ($x,$y) in $NEIGHBORS
 70 | 		    	// define position for neighbor
 71 | 		    	pos = tid + $x + $y*$WIDTH;
 72 | 
 73 | 		    	// adds buffer neighbors if pixel is in interior otherwise add destination neighbors
 74 | 		    	if (buffer[pos].x < 255 || buffer[pos].y < 255 || buffer[pos].z < 255)
 75 | 		    		sum += buffer[pos].$l;
 76 | 		    	else
 77 | 		    		sum += destination[pos].$l;
 78 | 
 79 | 		    	//add difference between source and neighbor
 80 | 		    	sum += (source[tid].$l - source[pos].$l);
 81 | 		    #end for
 82 | 
 83 | 		    // updates the next buffer and clip (0,255)
 84 | 		    next_buffer_$l = min(255.f, max(0.f, sum/4.f));
 85 | 		#end for
 86 | 
 87 | 		// updates the destination image and buffer
 88 | 		destination[tid] = make_uchar3(next_buffer_x, next_buffer_y, next_buffer_z);
 89 | 		buffer[tid] = make_uchar3(next_buffer_x, next_buffer_y, next_buffer_z);
 90 | 	}
 91 | }
 92 | """
 93 | 
 94 | def cuda_compile(source_string, function_name):
 95 | 	# compile the CUDA Kernel at runtime
 96 | 	source_module = nvcc.SourceModule(source_string)
 97 | 	# return a handle to the compiled CUDA kernel
 98 | 	return source_module.get_function(function_name)
 99 | 
def interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors):
	'''
	Run the mask kernel on the GPU and return the source image with its
	mask border whitened.

	source_im -- source image, [row, column, 3] array (converted to uint8)
	dest_im   -- destination image; must have the same shape as source_im
	b_size    -- CUDA block size tuple
	g_size    -- CUDA grid size tuple
	RGB       -- component names substituted into the kernel template
	neighbors -- (dx, dy) neighbor offsets substituted into the template

	Returns a uint8 array shaped like source_im.
	Raises ValueError when the two images differ in shape.
	'''
	# The kernel indexes the source with the destination dimensions, so a
	# shape mismatch would make it touch memory outside of the allocation
	if source_im.shape != dest_im.shape:
		raise ValueError("source and destination images must have the same shape")

	# The device copy needs one contiguous block of uint8 pixels
	source_im = np.ascontiguousarray(source_im, dtype=np.uint8)

	# create Cheetah template and fill in variables for mask kernel
	mask_template = Template(mask_source)
	mask_template.BLOCK_DIM_X = b_size[0]
	mask_template.BLOCK_DIM_Y = b_size[1]
	mask_template.WIDTH = dest_im.shape[1]
	mask_template.HEIGHT = dest_im.shape[0]
	mask_template.RGB = RGB
	mask_template.NEIGHBORS = neighbors

	# render the template explicitly and compile the CUDA kernel
	mask_kernel = cuda_compile(str(mask_template), "mask_kernel")

	# alloc memory on the GPU and upload the source image
	d_source = cu.mem_alloc(source_im.nbytes)
	cu.memcpy_htod(d_source, source_im)

	# whiten the mask border on the GPU
	mask_kernel(d_source, block=b_size, grid=g_size)

	# The host buffer must match the device buffer, which holds the SOURCE
	# image (it was previously sized from the destination image)
	inner_buffer = np.empty_like(source_im)
	cu.memcpy_dtoh(inner_buffer, d_source)
	d_source.free()

	# returns the interior buffer
	return inner_buffer
126 | 
def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
	'''
	Run n sweeps of the Poisson blending kernel on the GPU.

	source_im       -- source image, [row, column, 3] array
	dest_im         -- destination image, [row, column, 3] array
	b_size, g_size  -- CUDA block and grid size tuples
	RGB             -- component names substituted into the kernel template
	neighbors       -- (dx, dy) neighbor offsets substituted into the template
	interior_buffer -- source image with whitened mask border
	n               -- number of kernel launches; anything int() accepts

	Returns the blended destination image as a uint8 array.
	'''
	# The command line hands the iteration count over as a string
	iterations = int(n)

	# create Cheetah template and fill in variables for Poisson kernel
	template = Template(poisson_blending_source)
	template.BLOCK_DIM_X = b_size[0]
	template.BLOCK_DIM_Y = b_size[1]
	template.WIDTH = dest_im.shape[1]
	template.HEIGHT = dest_im.shape[0]
	template.RGB = RGB
	template.NEIGHBORS = neighbors

	# render the template explicitly and compile the CUDA kernel
	poisson_blending_kernel = cuda_compile(str(template), "poisson_blending_kernel")

	# Device copies need contiguous uint8 pixel data
	source_im = np.ascontiguousarray(source_im, dtype=np.uint8)
	dest_im = np.ascontiguousarray(dest_im, dtype=np.uint8)
	interior_buffer = np.ascontiguousarray(interior_buffer, dtype=np.uint8)

	# alloc memory on the GPU and upload the three images
	out_image = np.empty_like(dest_im)
	d_source = cu.mem_alloc(source_im.nbytes)
	d_destination = cu.mem_alloc(dest_im.nbytes)
	d_buffer = cu.mem_alloc(interior_buffer.nbytes)
	cu.memcpy_htod(d_source, source_im)
	cu.memcpy_htod(d_destination, dest_im)
	cu.memcpy_htod(d_buffer, interior_buffer)

	# calls CUDA for Poisson Blending n # of times
	for _ in range(iterations):
		poisson_blending_kernel(d_source, d_destination, d_buffer, block=b_size, grid=g_size)

	# retrieves the final output image and returns
	cu.memcpy_dtoh(out_image, d_destination)
	return out_image
154 | 
155 | 
if __name__ == '__main__':
	# argv is only needed by this entry point and was never imported
	from sys import argv

	# checks for proper usage
	if len(argv) == 2:
		# Sorted so that the i-th source is paired with the i-th destination
		source_files = sorted(glob.glob('source*.jpg'))
		dest_files = sorted(glob.glob('dest*.jpg'))
		N = int(argv[1])
		if len(source_files) != len(dest_files):
			print("Please make sure that your files are named sourceN.jpg, destN.jpg, and that each source is paired with a dest image.")
			# Unpaired files cannot be blended, so stop here
			raise SystemExit(1)
	elif len(argv) == 4:
		source_files = [argv[1]]
		dest_files = [argv[2]]
		N = int(argv[3])
	else:
		print("Usage: python " + argv[0] + " [source image] [destination image] [# iterations] OR python " + argv[0] + " [# iterations] (for entire directory)")
		raise SystemExit(1)

	# block size (threads per block)
	b_size = (16,16,1)

	# color components and neighboring positions [(+-1,0),(0,+-1)] for Cheetah
	RGB = ['x','y','z']
	neighbors = [(-1,0), (0,-1), (1,0), (0,1)]

	# iterates over the image files
	for i, (source_file, dest_file) in enumerate(zip(source_files, dest_files)):
		# load in source/dest images as 3-channel uint8 arrays for blending
		source_im = np.array(Image.open(source_file).convert('RGB'), dtype = np.uint8)
		dest_im = np.array(Image.open(dest_file).convert('RGB'), dtype = np.uint8)

		# warmup the GPU (no calculations)
		for k in range(100):
			d_source = gpu.to_gpu(source_im)
			d_dest = gpu.to_gpu(dest_im)
			source_im = d_source.get()
			dest_im = d_dest.get()

		# grid size (blocks per grid), rounded up to cover the whole image
		g_size = (int(np.ceil(float(dest_im.shape[1])/b_size[0])), int(np.ceil(float(dest_im.shape[0])/b_size[1])))

		# apply Poisson blending and time
		start = time.time()
		inner_buffer = interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors)
		out_im = poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, inner_buffer, N)
		end = time.time()
		print('Parallel Time: ' + str(end - start) + ' seconds')

		# creates output and save the image
		out_im = Image.fromarray(out_im, 'RGB')
		out_im.save('results_' + str(i) + '.png')


--------------------------------------------------------------------------------
/parallel_poisson_cuda.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Jason Ting, Alex Lehman, Ryan Lee
  3 | CS205 Final Project
  4 | Parallelized Implementation of Discretized Poisson Blending Algorithm using CUDA
  5 | '''
  6 | 
  7 | from PIL import Image
  8 | import numpy as np
  9 | import time
 10 | import glob
 11 | import pycuda.autoinit
 12 | import pycuda.driver as cu
 13 | import pycuda.compiler as nvcc
 14 | import pycuda.gpuarray as gpu
 15 | from Cheetah.Template import Template
 16 | 
 17 | # define the CUDA kernels for the mask and blending
 18 | mask_source = """
 19 | // define the interior pixels and make border pixels white
 20 | __global__ void mask_kernel(uchar3* source, int width, int height)
 21 | {
 22 | 	// Compute thread id in x, y, and coalesced
 23 |     int i = blockDim.y * blockIdx.y + threadIdx.y;
 24 |     int j = blockDim.x * blockIdx.x + threadIdx.x;
 25 |     int tid = i * width + j;
 26 | 
 27 |     // ensure each pixel is within image size and not white
 28 |     if (i >= 0 && i < height && j >= 0 && j < width && (source[tid].x < 255 || source[tid].y < 255 || source[tid].z < 255)) {
 29 |     	// set up calculcations
 30 |     	int pos;
 31 | 
 32 |     	// goes over neighbors (up, down, left, right)
 33 |     	for (int j = 0; j < 4; j++) {
 34 | 		    // define position for neighbor
 35 | 		    pos = tid + ((j-2) % 2) + ((int)((j-2) / 2)) * width;
 36 | 
 37 | 		    // changes pixels in the border to white 
 38 | 		    if (source[pos].x == 255 && source[pos].y == 255 && source[pos].z == 255) {
 39 | 		    	for (int i = 0; i < 3; i++) {
 40 | 		    		switch(i) {
 41 | 		    			case 0:
 42 | 		    				source[tid].x = 255;
 43 | 		    				break;
 44 | 		    			case 1:
 45 | 		    				source[tid].y = 255;
 46 | 		    				break;
 47 | 		    			case 2:
 48 | 		    				source[tid].z = 255;
 49 | 		    				break;
 50 | 		    		}
 51 | 		    	}
 52 | 		    }
 53 |     	}
 54 |     }
 55 | }
 56 | """
 57 | 
 58 | poisson_blending_source = """
 59 | __global__ void poisson_blending_kernel(uchar3* source, uchar3* destination, uchar3* buffer, int width, int height)
 60 | {
 61 |     // Compute thread id in x, y, and coalesced
 62 |     int i = blockDim.y * blockIdx.y + threadIdx.y;
 63 |     int j = blockDim.x * blockIdx.x + threadIdx.x;
 64 |     int tid = i * width + j;
 65 |     unsigned char source_pix;
 66 |     unsigned char source_pos;
 67 |     unsigned char dest_pix;
 68 |     unsigned char dest_pos;
 69 |     unsigned char buffer_pos;
 70 |     float next_buffer_x = 0.0;
 71 |     float next_buffer_y = 0.0;
 72 |     float next_buffer_z = 0.0;
 73 | 
 74 |     // ensure each pixel is within image size and not white
 75 |     if (i >= 0 && i < height && j >= 0 && j < width && (buffer[tid].x < 255 || buffer[tid].y < 255 || buffer[tid].z < 255)) {
 76 |     	// set up calculations for next buffer
 77 | 		int pos;
 78 | 		float sum;
 79 | 
 80 | 		// iterates over RGB
 81 |     	for (int i = 0; i < 3; i++) {
 82 |     		switch(i) {
 83 |     			case 0:
 84 |     				source_pix = source[tid].x;
 85 |     				dest_pix = dest[tid].x;
 86 |     				break;
 87 |     			case 1:
 88 |     				source_pix = source[tid].y;
 89 |     				dest_pix = dest[tid].y;
 90 |     				break;
 91 |     			case 2:
 92 |     				source_pix = source[tid].z;
 93 |     				dest_pix = dest[tid].z;
 94 |     				break;
 95 |     		}
 96 | 
 97 | 		    // setup calculations
 98 | 		    sum = 0.0;
 99 | 
100 | 		   	// iterates over neighbors (up, down, left, right)
101 | 		    for (int j = 0; j < 4; j++) {
102 | 		    	// define position for neighbor
103 | 		    	pos = tid + ((j-2) % 2) + ((int)((j-2) / 2)) * width;
104 | 		    	switch(i) {
105 |     				case 0:
106 |     					source_pos = source[pos].x;
107 |     					dest_pos = dest[pos].x;
108 |     					buffer_pos = buffer[pos].x;
109 |     					break;
110 |     				case 1:
111 |     					source_pos = source[pos].y;
112 |    						dest_pos = dest[pos].y;
113 |    						buffer_pos = buffer[pos].y;
114 |    						break;
115 |     				case 2:
116 |     					source_pos = source[pos].z;
117 |     					dest_pos = dest[pos].z;
118 |     					buffer_pos = buffer[pos].z;
119 |     					break;
120 |    				}
121 | 
122 | 		    	// adds buffer neighbors if pixel is in interior otherwise add destination neighbors
123 | 		    	if (buffer[pos].x < 255 || buffer[pos].y < 255 || buffer[pos].z < 255)
124 | 		   			sum += buffer_pos;
125 | 		   		else
126 | 		   			sum += dest_pos;
127 | 
128 | 		    	//add difference between source and neighbor
129 | 		    	sum += (source_pix - source_pos);
130 | 		   	}
131 | 
132 | 		   	// updates the next buffer and clip (0,255)
133 | 		   	switch(i) {
134 | 		   		case 0:
135 | 		   			next_buffer_x = min(255.f, max(0.f, sum/4.f));
136 | 		   			break;
137 | 		   		case 1:
138 | 		   			next_buffer_y = min(255.f, max(0.f, sum/4.f));
139 | 		   			break;
140 | 		   		case 2:
141 | 		   			next_buffer_z = min(255.f, max(0.f, sum/4.f));
142 | 		   			break;
143 | 		   	}
144 | 	    }
145 | 	}
146 | 
147 | 	// updates the destination image and buffer
148 | 	destination[tid] = make_uchar3(next_buffer_x, next_buffer_y, next_buffer_z);
149 | 	buffer[tid] = make_uchar3(next_buffer_x, next_buffer_y, next_buffer_z);
150 | 	}
151 | }
152 | """
153 | 
def cuda_compile(source_string, function_name):
	# Build the CUDA source at runtime and hand back the handle of the
	# kernel called function_name inside the compiled module.
	return nvcc.SourceModule(source_string).get_function(function_name)
159 | 
def interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors):
	'''
	Run the mask kernel on the GPU and return the source image with its
	mask border whitened.

	source_im -- source image, [row, column, 3] array (converted to uint8)
	dest_im   -- destination image; must have the same shape as source_im
	b_size    -- CUDA block size tuple
	g_size    -- CUDA grid size tuple
	RGB, neighbors -- unused here; kept so the signature matches the
	             Cheetah-based implementation

	Returns a uint8 array shaped like source_im.
	Raises ValueError when the two images differ in shape.
	'''
	# The kernel indexes the source with the destination dimensions, so a
	# shape mismatch would make it touch memory outside of the allocation
	if source_im.shape != dest_im.shape:
		raise ValueError("source and destination images must have the same shape")

	# The device copy needs one contiguous block of uint8 pixels
	source_im = np.ascontiguousarray(source_im, dtype=np.uint8)

	# compile the CUDA kernel
	mask_kernel = cuda_compile(mask_source, "mask_kernel")

	# alloc memory on the GPU and upload the source image
	d_source = cu.mem_alloc(source_im.nbytes)
	cu.memcpy_htod(d_source, source_im)

	# Scalar kernel arguments must be numpy scalars of the matching C type;
	# plain Python ints are not accepted by PyCUDA
	width = np.int32(dest_im.shape[1])
	height = np.int32(dest_im.shape[0])
	mask_kernel(d_source, width, height, block=b_size, grid=g_size)

	# The host buffer must match the device buffer, which holds the SOURCE
	# image (it was previously sized from the destination image)
	inner_buffer = np.empty_like(source_im)
	cu.memcpy_dtoh(inner_buffer, d_source)
	d_source.free()

	# returns the interior buffer
	return inner_buffer
177 | 
def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
	'''
	Run n sweeps of the Poisson blending kernel on the GPU.

	source_im       -- source image, [row, column, 3] array
	dest_im         -- destination image, [row, column, 3] array
	b_size, g_size  -- CUDA block and grid size tuples
	RGB, neighbors  -- unused here; kept so the signature matches the
	                   Cheetah-based implementation
	interior_buffer -- source image with whitened mask border
	n               -- number of kernel launches; anything int() accepts

	Returns the blended destination image as a uint8 array.
	'''
	# The command line hands the iteration count over as a string
	iterations = int(n)

	# compile the CUDA kernel
	poisson_blending_kernel = cuda_compile(poisson_blending_source, "poisson_blending_kernel")

	# Device copies need contiguous uint8 pixel data
	source_im = np.ascontiguousarray(source_im, dtype=np.uint8)
	dest_im = np.ascontiguousarray(dest_im, dtype=np.uint8)
	interior_buffer = np.ascontiguousarray(interior_buffer, dtype=np.uint8)

	# alloc memory on the GPU and upload the three images
	out_image = np.empty_like(dest_im)
	d_source = cu.mem_alloc(source_im.nbytes)
	d_destination = cu.mem_alloc(dest_im.nbytes)
	d_buffer = cu.mem_alloc(interior_buffer.nbytes)
	cu.memcpy_htod(d_source, source_im)
	cu.memcpy_htod(d_destination, dest_im)
	cu.memcpy_htod(d_buffer, interior_buffer)

	# Scalar kernel arguments must be numpy scalars of the matching C type;
	# plain Python ints are not accepted by PyCUDA
	width = np.int32(dest_im.shape[1])
	height = np.int32(dest_im.shape[0])

	# calls CUDA for Poisson Blending n # of times
	for _ in range(iterations):
		poisson_blending_kernel(d_source, d_destination, d_buffer, width, height, block=b_size, grid=g_size)

	# retrieves the final output image and returns
	cu.memcpy_dtoh(out_image, d_destination)
	return out_image
196 | 
197 | 
if __name__ == '__main__':
	# argv is only needed by this entry point and was never imported
	from sys import argv

	# checks for proper usage
	if len(argv) == 2:
		# Sorted so that the i-th source is paired with the i-th destination
		source_files = sorted(glob.glob('source*.jpg'))
		dest_files = sorted(glob.glob('dest*.jpg'))
		N = int(argv[1])
		if len(source_files) != len(dest_files):
			print("Please make sure that your files are named sourceN.jpg, destN.jpg, and that each source is paired with a dest image.")
			# Unpaired files cannot be blended, so stop here
			raise SystemExit(1)
	elif len(argv) == 4:
		source_files = [argv[1]]
		dest_files = [argv[2]]
		N = int(argv[3])
	else:
		print("Usage: python " + argv[0] + " [source image] [destination image] [# iterations] OR python " + argv[0] + " [# iterations] (for entire directory)")
		raise SystemExit(1)

	# block size (threads per block)
	b_size = (16,16,1)

	# color components and neighboring positions [(+-1,0),(0,+-1)]; the
	# blending functions still take them as arguments
	RGB = ['x','y','z']
	neighbors = [(-1,0), (0,-1), (1,0), (0,1)]

	# iterates over the image files
	for i, (source_file, dest_file) in enumerate(zip(source_files, dest_files)):
		# load in source/dest images as 3-channel uint8 arrays for blending
		source_im = np.array(Image.open(source_file).convert('RGB'), dtype = np.uint8)
		dest_im = np.array(Image.open(dest_file).convert('RGB'), dtype = np.uint8)

		# warmup the GPU (no calculations)
		for k in range(100):
			d_source = gpu.to_gpu(source_im)
			d_dest = gpu.to_gpu(dest_im)
			source_im = d_source.get()
			dest_im = d_dest.get()

		# grid size (blocks per grid), rounded up to cover the whole image
		g_size = (int(np.ceil(float(dest_im.shape[1])/b_size[0])), int(np.ceil(float(dest_im.shape[0])/b_size[1])))

		# apply Poisson blending and time
		start = time.time()
		inner_buffer = interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors)
		out_im = poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, inner_buffer, N)
		end = time.time()
		print('Parallel Time: ' + str(end - start) + ' seconds')

		# creates output and save the image
		out_im = Image.fromarray(out_im, 'RGB')
		out_im.save('results_' + str(i) + '.png')
249 | 		


--------------------------------------------------------------------------------