├── IPython
│   ├── BilateralFilter.ipynb
│   ├── CinqueTerre.jpg
│   ├── Destination.png
│   ├── GaussianBlur.ipynb
│   ├── Histogram.ipynb
│   ├── HistogramEqualization.ipynb
│   ├── Map.ipynb
│   ├── Mask.png
│   ├── PrefixSum.ipynb
│   ├── RadixSort.ipynb
│   ├── RedEye.jpg
│   ├── RedEyeRemoval.ipynb
│   ├── RedEyeTemplate.jpg
│   ├── Reduce.ipynb
│   ├── SeamlessImageCloning.ipynb
│   ├── Source.png
│   ├── Split.ipynb
│   ├── memorial.exr
│   ├── prefixsum.py
│   ├── prefixsum.pyc
│   ├── radixsort.py
│   ├── reduce.py
│   ├── split.py
│   └── split.pyc
└── README.md
/IPython/CinqueTerre.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikeroberts3000/GpuComputing/f2b29376f3c82e116709a102a8b3360ac87f933c/IPython/CinqueTerre.jpg
--------------------------------------------------------------------------------
/IPython/Destination.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikeroberts3000/GpuComputing/f2b29376f3c82e116709a102a8b3360ac87f933c/IPython/Destination.png
--------------------------------------------------------------------------------
/IPython/Mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikeroberts3000/GpuComputing/f2b29376f3c82e116709a102a8b3360ac87f933c/IPython/Mask.png
--------------------------------------------------------------------------------
/IPython/RadixSort.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "RadixSort"
4 | },
5 | "nbformat": 3,
6 | "nbformat_minor": 0,
7 | "worksheets": [
8 | {
9 | "cells": [
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Radix Sort\n",
15 | "\n",
16 | "The purpose of this code is to implement a canonical radix sort algorithm on the GPU."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## generate data"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "collapsed": false,
29 | "input": [
30 | "n = 10000\n",
31 | "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n",
32 | "input_values = input_keys.astype(numpy.float32)\n",
33 | "\n",
34 | "print input_keys\n",
35 | "print input_values"
36 | ],
37 | "language": "python",
38 | "metadata": {},
39 | "outputs": [
40 | {
41 | "output_type": "stream",
42 | "stream": "stdout",
43 | "text": [
44 | "[9875 764 6555 ..., 5036 7990 9689]\n",
45 | "[ 9875. 764. 6555. ..., 5036. 7990. 9689.]\n"
46 | ]
47 | }
48 | ],
49 | "prompt_number": 1
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {},
54 | "source": [
55 | "## perform mostly CPU radix sort "
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "collapsed": false,
61 | "input": [
62 | "import split\n",
63 | "\n",
64 | "num_bits_per_element = 32\n",
65 | "split_manager = split.SplitManager(15000)\n",
66 | "\n",
67 | "flag_data = numpy.zeros_like(input_keys)\n",
68 | "split_keys_old = input_keys.copy()\n",
69 | "split_values_old = input_values.copy()\n",
70 | "split_keys_new = numpy.zeros_like(input_keys)\n",
71 | "split_values_new = numpy.zeros_like(input_values)\n",
72 | "\n",
73 | "for b in range(num_bits_per_element):\n",
74 | "\n",
75 | " mask = 2**b\n",
76 | "\n",
77 | " for i in range(n):\n",
78 | " input_value = split_keys_old[i]\n",
79 | " flag_data[i] = not (input_value & mask)\n",
80 | "\n",
81 | " split_manager.split_host(split_keys_old, flag_data, split_keys_new)\n",
82 | " split_manager.split_host(split_values_old, flag_data, split_values_new)\n",
83 | "\n",
84 | " split_keys_old, split_keys_new = split_keys_new, split_keys_old\n",
85 | " split_values_old, split_values_new = split_values_new, split_values_old\n",
86 | " \n",
87 | " \n",
88 | " \n",
89 | "print input_keys\n",
90 | "print input_values\n",
91 | "print split_keys_old\n",
92 | "print split_values_old\n",
93 | "print numpy.sort(input_keys)\n",
94 | "print\n",
95 | " \n",
96 | "print \"Difference between GPU and CPU keys (should be 0.0%%): %f\" % numpy.linalg.norm(split_keys_old - numpy.sort(input_keys))\n",
97 | "print \"Difference between GPU and CPU values (should be 0.0%%): %f\" % numpy.linalg.norm(split_values_old - numpy.sort(input_keys).astype(numpy.float32))"
98 | ],
99 | "language": "python",
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "output_type": "stream",
104 | "stream": "stdout",
105 | "text": [
106 | "[9875 764 6555 ..., 5036 7990 9689]\n",
107 | "[ 9875. 764. 6555. ..., 5036. 7990. 9689.]\n",
108 | "[ 0 1 2 ..., 9999 9999 9999]\n",
109 | "[ 0.00000000e+00 1.00000000e+00 2.00000000e+00 ..., 9.99900000e+03\n",
110 | " 9.99900000e+03 9.99900000e+03]\n",
111 | "[ 0 1 2 ..., 9999 9999 9999]\n",
112 | "\n",
113 | "Difference between GPU and CPU keys (should be 0.0%): 0.000000\n",
114 | "Difference between GPU and CPU values (should be 0.0%): 0.000000\n"
115 | ]
116 | }
117 | ],
118 | "prompt_number": 2
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "## perform fully GPU radix sort "
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "collapsed": false,
130 | "input": [
131 | "import pycuda.autoinit\n",
132 | "import pycuda.driver\n",
133 | "import pycuda.compiler\n",
134 | "import split\n",
135 | "\n",
136 | "source_module = pycuda.compiler.SourceModule \\\n",
137 | "(\n",
138 | "\"\"\"\n",
139 | "__global__ void radix_sort_compute_flags(\n",
140 | " unsigned int* d_input_data,\n",
141 | " unsigned int* d_output_data,\n",
142 | " int mask,\n",
143 | " int n )\n",
144 | "{\n",
145 | " int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;\n",
146 | "\n",
147 | " if ( global_index_1d < n )\n",
148 | " {\n",
149 | " unsigned int input_value = d_input_data[ global_index_1d ];\n",
150 | "\n",
151 | " if ( input_value & mask )\n",
152 | " {\n",
153 | " d_output_data[ global_index_1d ] = 0;\n",
154 | " }\n",
155 | " else\n",
156 | " {\n",
157 | " d_output_data[ global_index_1d ] = 1;\n",
158 | " }\n",
159 | " }\n",
160 | "}\n",
161 | "\"\"\"\n",
162 | ")\n",
163 | "\n",
164 | "radix_sort_compute_flags_function = source_module.get_function(\"radix_sort_compute_flags\")\n",
165 | "\n",
166 | "size_of_element_bytes = 4\n",
167 | "size_of_element_bits = 32\n",
168 | "max_num_elements = 15000\n",
169 | "num_bytes = max_num_elements * size_of_element_bytes\n",
170 | "\n",
171 | "input_keys_device = pycuda.driver.mem_alloc(num_bytes)\n",
172 | "input_values_device = pycuda.driver.mem_alloc(num_bytes)\n",
173 | "flag_data_device = pycuda.driver.mem_alloc(num_bytes)\n",
174 | "split_keys_old_device = pycuda.driver.mem_alloc(num_bytes)\n",
175 | "split_values_old_device = pycuda.driver.mem_alloc(num_bytes)\n",
176 | "split_keys_new_device = pycuda.driver.mem_alloc(num_bytes)\n",
177 | "split_values_new_device = pycuda.driver.mem_alloc(num_bytes)\n",
178 | "\n",
179 | "split_manager = split.SplitManager(max_num_elements)"
180 | ],
181 | "language": "python",
182 | "metadata": {},
183 | "outputs": [],
184 | "prompt_number": 3
185 | },
186 | {
187 | "cell_type": "code",
188 | "collapsed": false,
189 | "input": [
190 | "pycuda.driver.memcpy_htod(input_keys_device, input_keys)\n",
191 | "pycuda.driver.memcpy_htod(input_values_device, input_values)\n",
192 | "pycuda.driver.memcpy_htod(split_keys_old_device, input_keys)\n",
193 | "pycuda.driver.memcpy_htod(split_values_old_device, input_values)\n",
194 | "\n",
195 | "pycuda.driver.memset_d32(flag_data_device, 0, n)\n",
196 | "pycuda.driver.memset_d32(split_keys_new_device, 0, n)\n",
197 | "pycuda.driver.memset_d32(split_values_new_device, 0, n)\n",
198 | "\n",
199 | "for b in range(num_bits_per_element):\n",
200 | "\n",
201 | "    mask = numpy.uint32(2**b)\n",
202 | "\n",
203 | "    radix_sort_compute_flags_function_block = (512,1,1)\n",
204 | "    num_blocks = int(ceil(float(n) / float(radix_sort_compute_flags_function_block[0])))\n",
205 | "    radix_sort_compute_flags_function_grid = (num_blocks, 1)\n",
206 | "\n",
207 | " radix_sort_compute_flags_function(\n",
208 | " split_keys_old_device,\n",
209 | " flag_data_device,\n",
210 | " numpy.int32(mask),\n",
211 | " numpy.int32(n),\n",
212 | "        block=radix_sort_compute_flags_function_block,\n",
213 | "        grid=radix_sort_compute_flags_function_grid)\n",
214 | "\n",
215 | " split_manager.split_device(split_keys_old_device, flag_data_device, split_keys_new_device, n)\n",
216 | " split_manager.split_device(split_values_old_device, flag_data_device, split_values_new_device, n)\n",
217 | "\n",
218 | " split_keys_old_device, split_keys_new_device = split_keys_new_device, split_keys_old_device\n",
219 | " split_values_old_device, split_values_new_device = split_values_new_device, split_values_old_device\n",
220 | " \n",
221 | "pycuda.driver.memcpy_dtoh(split_keys_new, split_keys_old_device)\n",
222 | "pycuda.driver.memcpy_dtoh(split_values_new, split_values_old_device)\n",
223 | "\n",
224 | "\n",
225 | "\n",
226 | "print input_keys\n",
227 | "print input_values\n",
228 | "print split_keys_new\n",
229 | "print split_values_new\n",
230 | "print numpy.sort(input_keys)\n",
231 | "print\n",
232 | "\n",
233 | "print \"Difference between GPU and CPU keys (should be 0.0%%): %f\" % numpy.linalg.norm(split_keys_new - numpy.sort(input_keys))\n",
234 | "print \"Difference between GPU and CPU values (should be 0.0%%): %f\" % numpy.linalg.norm(split_values_new - numpy.sort(input_keys).astype(numpy.float32))"
235 | ],
236 | "language": "python",
237 | "metadata": {},
238 | "outputs": [
239 | {
240 | "output_type": "stream",
241 | "stream": "stdout",
242 | "text": [
243 | "[9875 764 6555 ..., 5036 7990 9689]\n",
244 | "[ 9875. 764. 6555. ..., 5036. 7990. 9689.]\n",
245 | "[ 0 1 2 ..., 9999 9999 9999]\n",
246 | "[ 0.00000000e+00 1.00000000e+00 2.00000000e+00 ..., 9.99900000e+03\n",
247 | " 9.99900000e+03 9.99900000e+03]\n",
248 | "[ 0 1 2 ..., 9999 9999 9999]\n",
249 | "\n",
250 | "Difference between GPU and CPU keys (should be 0.0%): 0.000000\n",
251 | "Difference between GPU and CPU values (should be 0.0%): 0.000000\n"
252 | ]
253 | }
261 | ],
262 | "prompt_number": 4
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {},
267 | "source": [
268 | "## define GPU radix sort class"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "collapsed": false,
274 | "input": [
275 | "import math\n",
276 | "import numpy\n",
277 | "import pycuda.autoinit\n",
278 | "import pycuda.driver\n",
279 | "import pycuda.compiler\n",
280 | "import split\n",
281 | "\n",
282 | "class RadixSortManager:\n",
283 | " \n",
284 | " source_module = pycuda.compiler.SourceModule \\\n",
285 | " (\n",
286 | " \"\"\"\n",
287 | " __global__ void radix_sort_compute_flags_ascending(\n",
288 | " unsigned int* d_input_data,\n",
289 | " unsigned int* d_output_data,\n",
290 | " int mask,\n",
291 | " int n )\n",
292 | " {\n",
293 | " int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;\n",
294 | "\n",
295 | " if ( global_index_1d < n )\n",
296 | " {\n",
297 | " unsigned int input_value = d_input_data[ global_index_1d ];\n",
298 | "\n",
299 | " if ( input_value & mask )\n",
300 | " {\n",
301 | " d_output_data[ global_index_1d ] = 0;\n",
302 | " }\n",
303 | " else\n",
304 | " {\n",
305 | " d_output_data[ global_index_1d ] = 1;\n",
306 | " }\n",
307 | " }\n",
308 | " }\n",
309 | "\n",
310 | " __global__ void radix_sort_compute_flags_descending(\n",
311 | " unsigned int* d_input_data,\n",
312 | " unsigned int* d_output_data,\n",
313 | " int mask,\n",
314 | " int n )\n",
315 | " {\n",
316 | " int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;\n",
317 | "\n",
318 | " if ( global_index_1d < n )\n",
319 | " {\n",
320 | " unsigned int input_value = d_input_data[ global_index_1d ];\n",
321 | "\n",
322 | " if ( input_value & mask )\n",
323 | " {\n",
324 | " d_output_data[ global_index_1d ] = 1;\n",
325 | " }\n",
326 | " else\n",
327 | " {\n",
328 | " d_output_data[ global_index_1d ] = 0;\n",
329 | " }\n",
330 | " }\n",
331 | " } \n",
332 | " \"\"\"\n",
333 | " )\n",
334 | "\n",
335 | " _radix_sort_compute_flags_ascending_function = source_module.get_function(\"radix_sort_compute_flags_ascending\")\n",
336 | " _radix_sort_compute_flags_descending_function = source_module.get_function(\"radix_sort_compute_flags_descending\")\n",
337 | "\n",
338 | " _size_of_element_bytes = 4\n",
339 | " _size_of_element_bits = 32\n",
340 | " \n",
341 | " _max_num_elements = -1\n",
342 | " _num_bytes = -1\n",
343 | "\n",
344 | " _input_keys_device = -1\n",
345 | " _input_values_device = -1\n",
346 | " _flag_data_device = -1\n",
347 | " _split_keys_old_device = -1\n",
348 | " _split_values_old_device = -1\n",
349 | "    _output_keys_device = -1\n",
350 | "    _output_values_device = -1\n",
351 | "\n",
352 | " _split_manager = -1\n",
353 | " \n",
354 | " def __init__(self, max_num_elements):\n",
355 | " \n",
356 | " self._max_num_elements = max_num_elements\n",
357 | " self._num_bytes = self._max_num_elements * self._size_of_element_bytes\n",
358 | "\n",
359 | " self._input_keys_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
360 | " self._input_values_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
361 | " self._flag_data_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
362 | " self._split_keys_old_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
363 | " self._split_values_old_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
364 | " self._output_keys_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
365 | " self._output_values_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
366 | "\n",
367 | " self._split_manager = split.SplitManager(max_num_elements)\n",
368 | "\n",
369 | " def __copy_input_htod_key(self, input_keys_host):\n",
370 | "\n",
371 | " assert input_keys_host.shape[0] <= self._max_num_elements\n",
372 | "\n",
373 | " assert \\\n",
374 | " input_keys_host.dtype == numpy.uint32 or \\\n",
375 | " input_keys_host.dtype == numpy.int32 or \\\n",
376 | " input_keys_host.dtype == numpy.float32\n",
377 | "\n",
378 | " pycuda.driver.memcpy_htod(self._input_keys_device, input_keys_host)\n",
379 | "\n",
380 | " def __copy_input_htod_key_value(self, input_keys_host, input_values_host):\n",
381 | "\n",
382 | " assert input_keys_host.shape[0] == input_values_host.shape[0]\n",
383 | " assert input_keys_host.shape[0] <= self._max_num_elements\n",
384 | "\n",
385 | " assert \\\n",
386 | " input_keys_host.dtype == numpy.uint32 or \\\n",
387 | " input_keys_host.dtype == numpy.int32 or \\\n",
388 | " input_keys_host.dtype == numpy.float32\n",
389 | "\n",
390 | " assert \\\n",
391 | " input_values_host.dtype == numpy.uint32 or \\\n",
392 | " input_values_host.dtype == numpy.int32 or \\\n",
393 | " input_values_host.dtype == numpy.float32\n",
394 | "\n",
395 | " pycuda.driver.memcpy_htod(self._input_keys_device, input_keys_host)\n",
396 | " pycuda.driver.memcpy_htod(self._input_values_device, input_values_host)\n",
397 | "\n",
398 | " def __radix_sort_key(self, input_keys_device, output_keys_device, num_elements, compute_flags_function):\n",
399 | "\n",
400 | " assert num_elements <= self._max_num_elements\n",
401 | "\n",
402 | " self._n = num_elements\n",
403 | "\n",
404 | " pycuda.driver.memcpy_dtod(self._split_keys_old_device, input_keys_device, self._n * self._size_of_element_bytes)\n",
405 | " \n",
406 | " pycuda.driver.memset_d32(self._flag_data_device, 0, self._n)\n",
407 | " pycuda.driver.memset_d32(output_keys_device, 0, self._n)\n",
408 | "\n",
409 | " for b in range(self._size_of_element_bits):\n",
410 | "\n",
411 | "            mask = numpy.uint32(2**b)\n",
412 | "\n",
413 | "            radix_sort_compute_flags_function_block = (512,1,1)\n",
414 | "            num_blocks = int(math.ceil(float(self._n) / float(radix_sort_compute_flags_function_block[0])))\n",
415 | "            radix_sort_compute_flags_function_grid = (num_blocks, 1)\n",
416 | " \n",
417 | " compute_flags_function(\n",
418 | " self._split_keys_old_device,\n",
419 | " self._flag_data_device,\n",
420 | " numpy.int32(mask),\n",
421 | " numpy.int32(self._n),\n",
422 | "                block=radix_sort_compute_flags_function_block,\n",
423 | "                grid=radix_sort_compute_flags_function_grid)\n",
424 | "\n",
425 | " self._split_manager.split_device(self._split_keys_old_device, self._flag_data_device, output_keys_device, self._n)\n",
426 | "\n",
427 | " self._split_keys_old_device, output_keys_device = output_keys_device, self._split_keys_old_device\n",
428 | "\n",
429 | " pycuda.driver.memcpy_dtod(output_keys_device, self._split_keys_old_device, self._n * self._size_of_element_bytes)\n",
430 | " \n",
431 | " def __radix_sort_key_value(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements, compute_flags_function):\n",
432 | "\n",
433 | " assert num_elements <= self._max_num_elements\n",
434 | "\n",
435 | " self._n = num_elements\n",
436 | "\n",
437 | " pycuda.driver.memcpy_dtod(self._split_keys_old_device, input_keys_device, self._n * self._size_of_element_bytes)\n",
438 | " pycuda.driver.memcpy_dtod(self._split_values_old_device, input_values_device, self._n * self._size_of_element_bytes)\n",
439 | " \n",
440 | " pycuda.driver.memset_d32(self._flag_data_device, 0, self._n)\n",
441 | " pycuda.driver.memset_d32(output_keys_device, 0, self._n)\n",
442 | " pycuda.driver.memset_d32(output_values_device, 0, self._n)\n",
443 | "\n",
444 | " for b in range(self._size_of_element_bits):\n",
445 | "\n",
446 | "            mask = numpy.uint32(2**b)\n",
447 | "\n",
448 | "            radix_sort_compute_flags_function_block = (512,1,1)\n",
449 | "            num_blocks = int(math.ceil(float(self._n) / float(radix_sort_compute_flags_function_block[0])))\n",
450 | "            radix_sort_compute_flags_function_grid = (num_blocks, 1)\n",
451 | " \n",
452 | " compute_flags_function(\n",
453 | " self._split_keys_old_device,\n",
454 | " self._flag_data_device,\n",
455 | " numpy.int32(mask),\n",
456 | " numpy.int32(self._n),\n",
457 | "                block=radix_sort_compute_flags_function_block,\n",
458 | "                grid=radix_sort_compute_flags_function_grid)\n",
459 | "\n",
460 | " self._split_manager.split_device(self._split_keys_old_device, self._flag_data_device, output_keys_device, self._n)\n",
461 | " self._split_manager.split_device(self._split_values_old_device, self._flag_data_device, output_values_device, self._n)\n",
462 | "\n",
463 | " self._split_keys_old_device, output_keys_device = output_keys_device, self._split_keys_old_device\n",
464 | " self._split_values_old_device, output_values_device = output_values_device, self._split_values_old_device\n",
465 | "\n",
466 | " pycuda.driver.memcpy_dtod(output_keys_device, self._split_keys_old_device, self._n * self._size_of_element_bytes)\n",
467 | " pycuda.driver.memcpy_dtod(output_values_device, self._split_values_old_device, self._n * self._size_of_element_bytes)\n",
468 | "\n",
469 | " def __copy_output_dtoh_key(self, output_keys_host):\n",
470 | "\n",
471 | " pycuda.driver.memcpy_dtoh(output_keys_host, self._output_keys_device)\n",
472 | "\n",
473 | " def __copy_output_dtoh_key_value(self, output_keys_host, output_values_host):\n",
474 | "\n",
475 | " pycuda.driver.memcpy_dtoh(output_keys_host, self._output_keys_device)\n",
476 | " pycuda.driver.memcpy_dtoh(output_values_host, self._output_values_device)\n",
477 | "\n",
478 | " def radix_sort_key_ascending_device(self, input_keys_device, output_keys_device, num_elements):\n",
479 | "\n",
480 | " self.__radix_sort_key(\n",
481 | " input_keys_device,\n",
482 | " output_keys_device,\n",
483 | " num_elements,\n",
484 | " self._radix_sort_compute_flags_ascending_function)\n",
485 | "\n",
486 | " def radix_sort_key_descending_device(self, input_keys_device, output_keys_device, num_elements):\n",
487 | "\n",
488 | " self.__radix_sort_key(\n",
489 | " input_keys_device,\n",
490 | " output_keys_device,\n",
491 | " num_elements,\n",
492 | " self._radix_sort_compute_flags_descending_function)\n",
493 | "\n",
494 | " def radix_sort_key_ascending_host(self, input_keys_host, output_keys_host):\n",
495 | "\n",
496 | " num_elements = input_keys_host.shape[0]\n",
497 | " \n",
498 | " self.__copy_input_htod_key(input_keys_host)\n",
499 | " self.radix_sort_key_ascending_device(self._input_keys_device, self._output_keys_device, num_elements)\n",
500 | " self.__copy_output_dtoh_key(output_keys_host)\n",
501 | "\n",
502 | " def radix_sort_key_descending_host(self, input_keys_host, output_keys_host):\n",
503 | "\n",
504 | " num_elements = input_keys_host.shape[0]\n",
505 | " \n",
506 | " self.__copy_input_htod_key(input_keys_host)\n",
507 | " self.radix_sort_key_descending_device(self._input_keys_device, self._output_keys_device, num_elements)\n",
508 | " self.__copy_output_dtoh_key(output_keys_host)\n",
509 | "\n",
510 | " def radix_sort_key_value_ascending_device(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements):\n",
511 | "\n",
512 | " self.__radix_sort_key_value(\n",
513 | " input_keys_device,\n",
514 | " input_values_device,\n",
515 | " output_keys_device,\n",
516 | " output_values_device,\n",
517 | " num_elements,\n",
518 | " self._radix_sort_compute_flags_ascending_function)\n",
519 | "\n",
520 | " def radix_sort_key_value_descending_device(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements):\n",
521 | "\n",
522 | " self.__radix_sort_key_value(\n",
523 | " input_keys_device,\n",
524 | " input_values_device,\n",
525 | " output_keys_device,\n",
526 | " output_values_device,\n",
527 | " num_elements,\n",
528 | " self._radix_sort_compute_flags_descending_function)\n",
529 | "\n",
530 | " def radix_sort_key_value_ascending_host(self, input_keys_host, input_values_host, output_keys_host, output_values_host):\n",
531 | "\n",
532 | " num_elements = input_keys_host.shape[0]\n",
533 | " \n",
534 | " self.__copy_input_htod_key_value(input_keys_host, input_values_host)\n",
535 | " self.radix_sort_key_value_ascending_device(self._input_keys_device, self._input_values_device, self._output_keys_device, self._output_values_device, num_elements)\n",
536 | " self.__copy_output_dtoh_key_value(output_keys_host, output_values_host)\n",
537 | "\n",
538 | " def radix_sort_key_value_descending_host(self, input_keys_host, input_values_host, output_keys_host, output_values_host):\n",
539 | "\n",
540 | " num_elements = input_keys_host.shape[0]\n",
541 | " \n",
542 | " self.__copy_input_htod_key_value(input_keys_host, input_values_host)\n",
543 | " self.radix_sort_key_value_descending_device(self._input_keys_device, self._input_values_device, self._output_keys_device, self._output_values_device, num_elements)\n",
544 | " self.__copy_output_dtoh_key_value(output_keys_host, output_values_host)\n"
545 | ],
546 | "language": "python",
547 | "metadata": {},
548 | "outputs": [],
549 | "prompt_number": 5
550 | },
551 | {
552 | "cell_type": "markdown",
553 | "metadata": {},
554 | "source": [
555 | "## invoke GPU radix sort class"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "collapsed": false,
561 | "input": [
562 | "n = 10000\n",
563 | "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n",
564 | "input_values = input_keys.astype(numpy.float32)\n",
565 | "output_keys = numpy.zeros_like(input_keys)\n",
566 | "output_values = numpy.zeros_like(input_values)\n",
567 | "radix_sort_manager = RadixSortManager(15000)\n",
568 | "\n",
569 | "radix_sort_manager.radix_sort_key_value_ascending_host(input_keys, input_values, output_keys, output_values)\n",
570 | "\n",
571 | "print input_keys\n",
572 | "print input_values\n",
573 | "print output_keys\n",
574 | "print output_values\n",
575 | "print\n",
576 | "print \"Difference between GPU and CPU keys (should be 0.0%%): %f\" % numpy.linalg.norm(output_keys - numpy.sort(input_keys))\n",
577 | "print \"Difference between GPU and CPU values (should be 0.0%%): %f\" % numpy.linalg.norm(output_values - numpy.sort(input_keys).astype(numpy.float32))"
578 | ],
579 | "language": "python",
580 | "metadata": {},
581 | "outputs": [
582 | {
583 | "output_type": "stream",
584 | "stream": "stdout",
585 | "text": [
586 | "[3229 8213 9674 ..., 5248 4543 1902]\n",
587 | "[ 3229. 8213. 9674. ..., 5248. 4543. 1902.]\n",
588 | "[ 0 5 5 ..., 9997 9997 9999]\n",
589 | "[ 0.00000000e+00 5.00000000e+00 5.00000000e+00 ..., 9.99700000e+03\n",
590 | " 9.99700000e+03 9.99900000e+03]\n",
591 | "\n",
592 | "Difference between GPU and CPU keys (should be 0.0%): 0.000000\n",
593 | "Difference between GPU and CPU values (should be 0.0%): 0.000000\n"
594 | ]
595 | }
603 | ],
604 | "prompt_number": 6
605 | },
606 | {
607 | "cell_type": "markdown",
608 | "metadata": {},
609 | "source": [
610 | "## ascending vs. descending and key-value vs. key-only"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "collapsed": false,
616 | "input": [
617 | "n = 10\n",
618 | "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n",
619 | "input_values = input_keys.astype(numpy.float32)\n",
620 | "output_keys = numpy.zeros_like(input_keys)\n",
621 | "output_values = numpy.zeros_like(input_values)\n",
622 | "radix_sort_manager = RadixSortManager(15000)\n",
623 | "\n",
624 | "radix_sort_manager.radix_sort_key_value_ascending_host(input_keys, input_values, output_keys, output_values)\n",
625 | "\n",
626 | "print input_keys\n",
627 | "print input_values\n",
628 | "print output_keys\n",
629 | "print output_values\n",
630 | "print\n",
631 | "\n",
632 | "\n",
633 | "\n",
634 | "n = 10\n",
635 | "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n",
636 | "input_values = input_keys.astype(numpy.float32)\n",
637 | "output_keys = numpy.zeros_like(input_keys)\n",
638 | "output_values = numpy.zeros_like(input_values)\n",
639 | "radix_sort_manager = RadixSortManager(15000)\n",
640 | "\n",
641 | "radix_sort_manager.radix_sort_key_value_descending_host(input_keys, input_values, output_keys, output_values)\n",
642 | "\n",
643 | "print input_keys\n",
644 | "print input_values\n",
645 | "print output_keys\n",
646 | "print output_values\n",
647 | "print\n",
648 | "\n",
649 | "\n",
650 | "\n",
651 | "n = 10\n",
652 | "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n",
653 | "input_values = input_keys.astype(numpy.float32)\n",
654 | "output_keys = numpy.zeros_like(input_keys)\n",
655 | "output_values = numpy.zeros_like(input_values)\n",
656 | "radix_sort_manager = RadixSortManager(15000)\n",
657 | "\n",
658 | "radix_sort_manager.radix_sort_key_ascending_host(input_keys, output_keys)\n",
659 | "\n",
660 | "print input_keys\n",
661 | "print output_keys\n",
662 | "print\n",
663 | "\n",
664 | "\n",
665 | "\n",
666 | "n = 10\n",
667 | "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n",
668 | "input_values = input_keys.astype(numpy.float32)\n",
669 | "output_keys = numpy.zeros_like(input_keys)\n",
670 | "output_values = numpy.zeros_like(input_values)\n",
671 | "radix_sort_manager = RadixSortManager(15000)\n",
672 | "\n",
673 | "radix_sort_manager.radix_sort_key_descending_host(input_keys, output_keys)\n",
674 | "\n",
675 | "print input_keys\n",
676 | "print output_keys\n",
677 | "print\n"
678 | ],
679 | "language": "python",
680 | "metadata": {},
681 | "outputs": [
682 | {
683 | "output_type": "stream",
684 | "stream": "stdout",
685 | "text": [
686 | "[9 8 8 2 4 3 3 8 4 6]\n",
687 | "[ 9. 8. 8. 2. 4. 3. 3. 8. 4. 6.]\n",
688 | "[2 3 3 4 4 6 8 8 8 9]\n",
689 | "[ 2. 3. 3. 4. 4. 6. 8. 8. 8. 9.]\n",
690 | "\n",
691 | "[8 9 8 4 2 1 1 8 8 5]"
692 | ]
693 | },
694 | {
695 | "output_type": "stream",
696 | "stream": "stdout",
697 | "text": [
698 | "\n",
699 | "[ 8. 9. 8. 4. 2. 1. 1. 8. 8. 5.]\n",
700 | "[9 8 8 8 8 5 4 2 1 1]\n",
701 | "[ 9. 8. 8. 8. 8. 5. 4. 2. 1. 1.]\n",
702 | "\n",
703 | "[7 6 8 7 4 4 9 7 8 5]"
704 | ]
705 | },
706 | {
707 | "output_type": "stream",
708 | "stream": "stdout",
709 | "text": [
710 | "\n",
711 | "[4 4 5 6 7 7 7 8 8 9]\n",
712 | "\n",
713 | "[6 5 8 6 5 0 6 0 6 3]"
714 | ]
715 | },
716 | {
717 | "output_type": "stream",
718 | "stream": "stdout",
719 | "text": [
720 | "\n",
721 | "[8 6 6 6 6 5 5 3 0 0]\n",
722 | "\n"
723 | ]
724 | }
732 | ],
733 | "prompt_number": 7
734 | },
735 | {
736 | "cell_type": "markdown",
737 | "metadata": {},
738 | "source": [
739 | "## floating point keys\n\nNote that sorting float keys by their raw bit patterns, as below, is only correct when all keys are non-negative: non-negative IEEE 754 floats happen to compare like unsigned integers, but negative keys would need their bit patterns remapped first."
740 | ]
741 | },
742 | {
743 | "cell_type": "code",
744 | "collapsed": false,
745 | "input": [
746 | "n = 10000\n",
747 | "input_keys = (numpy.random.rand(n) * n).astype(numpy.float32)\n",
748 | "output_keys = numpy.zeros_like(input_keys)\n",
749 | "radix_sort_manager = RadixSortManager(15000)\n",
750 | "\n",
751 | "radix_sort_manager.radix_sort_key_ascending_host(input_keys, output_keys)\n",
752 | "\n",
753 | "print input_keys\n",
754 | "print output_keys\n",
755 | "print\n",
756 | "print \"Difference between GPU and CPU keys (should be 0.0%%): %f\" % numpy.linalg.norm(output_keys - numpy.sort(input_keys))"
757 | ],
758 | "language": "python",
759 | "metadata": {},
760 | "outputs": [
761 | {
762 | "output_type": "stream",
763 | "stream": "stdout",
764 | "text": [
765 | "[ 4938.57958984 9086.15136719 9926.49511719 ..., 9953.21875\n",
766 | " 5300.93457031 2960.62329102]\n",
767 | "[ 1.85716784e+00 2.97219419e+00 3.74738264e+00 ..., 9.99855371e+03\n",
768 | " 9.99873047e+03 9.99973535e+03]\n",
769 | "\n",
770 | "Difference between GPU and CPU keys (should be 0.0%): 0.000000\n"
771 | ]
772 | }
773 | ],
774 | "prompt_number": 8
775 | }
776 | ],
777 | "metadata": {}
778 | }
779 | ]
780 | }
--------------------------------------------------------------------------------
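
A note on RadixSort.ipynb: each pass of the sort above reduces to a single
primitive, a stable split (every element whose flag is set is compacted ahead
of every element whose flag is clear, with order preserved inside each group),
and a stable split is just an exclusive prefix sum of the flags followed by a
scatter. The following CPU-only NumPy sketch mirrors the notebook's per-bit
loop; it is illustrative only, and the function name split_radix_sort does not
appear in the repository.

    import numpy

    def split_radix_sort(keys):
        # One stable split per bit, least-significant bit first.
        keys = keys.astype(numpy.uint32).copy()
        n = keys.size
        for bit in range(32):
            # flag = 1 where the bit is clear, matching the convention of the
            # radix_sort_compute_flags_ascending kernel above.
            flags = 1 - ((keys >> bit) & 1)
            # Exclusive prefix sum of the flags gives each flagged element its
            # scatter offset; unflagged elements land after all flagged ones.
            scan = numpy.cumsum(flags) - flags
            total_flags_set = flags.sum()
            offsets = numpy.where(flags == 1,
                                  scan,
                                  numpy.arange(n) - scan + total_flags_set)
            out = numpy.empty_like(keys)
            out[offsets] = keys
            keys = out
        return keys

As a quick check, split_radix_sort((numpy.random.rand(1000) * 1000).astype(numpy.uint32))
agrees with numpy.sort on the same input.

--------------------------------------------------------------------------------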
/IPython/RedEye.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikeroberts3000/GpuComputing/f2b29376f3c82e116709a102a8b3360ac87f933c/IPython/RedEye.jpg
--------------------------------------------------------------------------------
/IPython/RedEyeTemplate.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikeroberts3000/GpuComputing/f2b29376f3c82e116709a102a8b3360ac87f933c/IPython/RedEyeTemplate.jpg
--------------------------------------------------------------------------------
/IPython/Reduce.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "Reduce"
4 | },
5 | "nbformat": 3,
6 | "nbformat_minor": 0,
7 | "worksheets": [
8 | {
9 | "cells": [
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Reduce\n",
15 | "\n",
16 | "The purpose of this code is to implement a canonical reduce algorithm on the GPU."
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "collapsed": false,
22 | "input": [
23 | "import math\n",
24 | "import numpy\n",
25 | "import pycuda.autoinit\n",
26 | "import pycuda.driver\n",
27 | "import pycuda.compiler\n",
28 | "\n",
29 | "class ReduceManager:\n",
30 | " \n",
31 | " source_module = pycuda.compiler.SourceModule \\\n",
32 | " (\n",
33 | " \"\"\"\n",
34 | " __global__ void reduce_sum( float* d_scratchpad, int n, int num_threads )\n",
35 | " {\n",
36 | " int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;\n",
37 | " \n",
38 | " int left_index = global_index_1d;\n",
39 | " int right_index = global_index_1d + num_threads;\n",
40 | "\n",
41 | " if ( right_index < n )\n",
42 | " {\n",
43 | " d_scratchpad[ left_index ] = d_scratchpad[ left_index ] + d_scratchpad[ right_index ];\n",
44 | " }\n",
45 | " }\n",
46 | "\n",
47 | " __global__ void reduce_product( float* d_scratchpad, int n, int num_threads )\n",
48 | " {\n",
49 | " int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;\n",
50 | " \n",
51 | " int left_index = global_index_1d;\n",
52 | " int right_index = global_index_1d + num_threads;\n",
53 | "\n",
54 | " if ( right_index < n )\n",
55 | " {\n",
56 | " d_scratchpad[ left_index ] = d_scratchpad[ left_index ] * d_scratchpad[ right_index ];\n",
57 | " }\n",
58 | " }\n",
59 | "\n",
60 | " __global__ void reduce_min( float* d_scratchpad, int n, int num_threads )\n",
61 | " {\n",
62 | " int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;\n",
63 | " \n",
64 | " int left_index = global_index_1d;\n",
65 | " int right_index = global_index_1d + num_threads;\n",
66 | "\n",
67 | " if ( right_index < n )\n",
68 | " {\n",
69 | " d_scratchpad[ left_index ] = min( d_scratchpad[ left_index ], d_scratchpad[ right_index ] );\n",
70 | " }\n",
71 | " }\n",
72 | "\n",
73 | " __global__ void reduce_max( float* d_scratchpad, int n, int num_threads )\n",
74 | " {\n",
75 | " int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;\n",
76 | " \n",
77 | " int left_index = global_index_1d;\n",
78 | " int right_index = global_index_1d + num_threads;\n",
79 | "\n",
80 | " if ( right_index < n )\n",
81 | " {\n",
82 | " d_scratchpad[ left_index ] = max( d_scratchpad[ left_index ], d_scratchpad[ right_index ] );\n",
83 | " }\n",
84 | " } \n",
85 | " \"\"\"\n",
86 | " )\n",
87 | "\n",
88 | " _reduce_sum_function = source_module.get_function(\"reduce_sum\")\n",
89 | " _reduce_product_function = source_module.get_function(\"reduce_product\")\n",
90 | " _reduce_min_function = source_module.get_function(\"reduce_min\")\n",
91 | " _reduce_max_function = source_module.get_function(\"reduce_max\")\n",
92 | "\n",
93 | " _size_of_element_bytes = 4\n",
94 | " _block_size_num_elements = 1024\n",
95 | " _block_size_num_threads = _block_size_num_elements / 2\n",
96 | "\n",
97 | " _max_num_elements = -1\n",
98 | " _n = -1\n",
99 | " _scratchpad_device = -1\n",
100 | "\n",
101 | " def __init__(self, max_num_elements):\n",
102 | " \n",
103 | " self._max_num_elements = max_num_elements\n",
104 | " self._num_bytes = self._max_num_elements * self._size_of_element_bytes\n",
105 | " self._scratchpad_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
106 | "\n",
107 | " def __copy_input_htod(self, input_data_host):\n",
108 | "\n",
109 | " assert input_data_host.shape[0] <= self._max_num_elements\n",
110 | " assert input_data_host.dtype == numpy.float32\n",
111 | "\n",
112 | " pycuda.driver.memcpy_htod(self._scratchpad_device, input_data_host)\n",
113 | "\n",
114 | " def __copy_input_dtod(self, input_data_device, num_elements):\n",
115 | "\n",
116 | " pycuda.driver.memcpy_dtod(self._scratchpad_device, input_data_device, int(num_elements * self._size_of_element_bytes))\n",
117 | " \n",
118 | " def __reduce(self, num_elements, reduce_function):\n",
119 | "\n",
120 | " self._n = num_elements\n",
121 | "\n",
122 | " num_sweep_passes = int(math.ceil(math.log(num_elements,2)))\n",
123 | " reduce_num_elements = self._n\n",
124 | " \n",
125 | " for d in range(num_sweep_passes):\n",
126 | "\n",
127 | " reduce_num_threads = int(math.ceil(float(reduce_num_elements) / float(2)))\n",
128 | " \n",
129 | " reduce_function_block = (self._block_size_num_threads,1,1)\n",
130 | " num_blocks = int(math.ceil(float(reduce_num_threads) / float(reduce_function_block[0])))\n",
131 | " reduce_function_grid = (num_blocks, 1)\n",
132 | " \n",
133 | " reduce_function(\n",
134 | " self._scratchpad_device,\n",
135 | " numpy.int32(reduce_num_elements),\n",
136 | " numpy.int32(reduce_num_threads), \n",
137 | " block=reduce_function_block,\n",
138 | " grid=reduce_function_grid)\n",
139 | "\n",
140 | " reduce_num_elements = reduce_num_threads\n",
141 | "\n",
142 | " tmp = numpy.zeros(1, dtype=numpy.float32)\n",
143 | "\n",
144 | " pycuda.driver.memcpy_dtoh(tmp, self._scratchpad_device)\n",
145 | "\n",
146 | " return tmp[0]\n",
147 | "\n",
148 | " def reduce_sum_device(self, input_data_device, num_elements):\n",
149 | "\n",
150 | " self.__copy_input_dtod(input_data_device, num_elements)\n",
151 | " return self.__reduce(num_elements, self._reduce_sum_function)\n",
152 | "\n",
153 | " def reduce_product_device(self, input_data_device, num_elements):\n",
154 | "\n",
155 | " self.__copy_input_dtod(input_data_device, num_elements)\n",
156 | " return self.__reduce(num_elements, self._reduce_product_function)\n",
157 | "\n",
158 | " def reduce_min_device(self, input_data_device, num_elements):\n",
159 | "\n",
160 | " self.__copy_input_dtod(input_data_device, num_elements)\n",
161 | " return self.__reduce(num_elements, self._reduce_min_function)\n",
162 | "\n",
163 | " def reduce_max_device(self, input_data_device, num_elements):\n",
164 | "\n",
165 | " self.__copy_input_dtod(input_data_device, num_elements)\n",
166 | " return self.__reduce(num_elements, self._reduce_max_function)\n",
167 | "\n",
168 | " def reduce_sum_host(self, input_data_host):\n",
169 | "\n",
170 | " num_elements = input_data_host.shape[0]\n",
171 | " \n",
172 | " self.__copy_input_htod(input_data_host)\n",
173 | " return self.__reduce(num_elements, self._reduce_sum_function)\n",
174 | "\n",
175 | " def reduce_product_host(self, input_data_host):\n",
176 | "\n",
177 | " num_elements = input_data_host.shape[0]\n",
178 | " \n",
179 | " self.__copy_input_htod(input_data_host)\n",
180 | " return self.__reduce(num_elements, self._reduce_product_function)\n",
181 | "\n",
182 | " def reduce_min_host(self, input_data_host):\n",
183 | "\n",
184 | " num_elements = input_data_host.shape[0]\n",
185 | " \n",
186 | " self.__copy_input_htod(input_data_host)\n",
187 | " return self.__reduce(num_elements, self._reduce_min_function)\n",
188 | "\n",
189 | " def reduce_max_host(self, input_data_host):\n",
190 | "\n",
191 | " num_elements = input_data_host.shape[0]\n",
192 | " \n",
193 | " self.__copy_input_htod(input_data_host)\n",
194 | " return self.__reduce(num_elements, self._reduce_max_function)"
195 | ],
196 | "language": "python",
197 | "metadata": {},
198 | "outputs": [],
199 | "prompt_number": 1
200 | },
201 | {
202 | "cell_type": "code",
203 | "collapsed": false,
204 | "input": [
205 | "n = 10000\n",
206 | "input_data = (numpy.random.rand(n)).astype(numpy.float32)\n",
207 | "\n",
208 | "reduce_manager = ReduceManager(n)\n",
209 | "\n",
210 | "print \"Difference between GPU and CPU sum reduce (should be less than 0.1): %f\" % abs( reduce_manager.reduce_sum_host(input_data) - numpy.sum(input_data) )\n",
211 | "print\n",
212 | "print \"%f\" % numpy.sum(input_data)\n",
213 | "print \"%f\" % reduce_manager.reduce_sum_host(input_data)"
214 | ],
215 | "language": "python",
216 | "metadata": {},
217 | "outputs": [
218 | {
219 | "output_type": "stream",
220 | "stream": "stdout",
221 | "text": [
222 | "Difference between GPU and CPU sum reduce (should be less than 0.1): 0.004395\n",
223 | "\n",
224 | "5001.355957\n",
225 | "5001.351562\n"
226 | ]
227 | }
228 | ],
229 | "prompt_number": 2
230 | },
231 | {
232 | "cell_type": "code",
233 | "collapsed": false,
234 | "input": [
235 | "n = 20\n",
236 | "input_data = (numpy.random.rand(n) + 1.01).astype(numpy.float32)\n",
237 | "\n",
238 | "reduce_manager = ReduceManager(n)\n",
239 | "\n",
240 | "print \"Difference between GPU and CPU product reduce (should be less than 0.1): %f\" % abs( reduce_manager.reduce_product_host(input_data) - numpy.prod(input_data) )\n",
241 | "print\n",
242 | "print \"%f\" % numpy.prod(input_data)\n",
243 | "print \"%f\" % reduce_manager.reduce_product_host(input_data)"
244 | ],
245 | "language": "python",
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "output_type": "stream",
250 | "stream": "stdout",
251 | "text": [
252 | "Difference between GPU and CPU product reduce (should be less than 0.1): 0.000061\n",
253 | "\n",
254 | "926.643677\n",
255 | "926.643616\n"
256 | ]
257 | }
258 | ],
259 | "prompt_number": 3
260 | },
261 | {
262 | "cell_type": "code",
263 | "collapsed": false,
264 | "input": [
265 | "n = 10000\n",
266 | "input_data = (numpy.random.rand(n)).astype(numpy.float32)\n",
267 | "\n",
268 | "reduce_manager = ReduceManager(n)\n",
269 | "\n",
270 | "print \"Difference between GPU and CPU min reduce (should be 0.0): %f\" % abs( reduce_manager.reduce_min_host(input_data) - numpy.min(input_data) )\n",
271 | "print\n",
272 | "print \"%f\" % numpy.min(input_data)\n",
273 | "print \"%f\" % reduce_manager.reduce_min_host(input_data)"
274 | ],
275 | "language": "python",
276 | "metadata": {},
277 | "outputs": [
278 | {
279 | "output_type": "stream",
280 | "stream": "stdout",
281 | "text": [
282 | "Difference between GPU and CPU min reduce (should be 0.0): 0.000000\n",
283 | "\n",
284 | "0.000093\n",
285 | "0.000093\n"
286 | ]
287 | }
288 | ],
289 | "prompt_number": 4
290 | },
291 | {
292 | "cell_type": "code",
293 | "collapsed": false,
294 | "input": [
295 | "n = 10000\n",
296 | "input_data = (numpy.random.rand(n)).astype(numpy.float32)\n",
297 | "\n",
298 | "reduce_manager = ReduceManager(n)\n",
299 | "\n",
300 | "print \"Difference between GPU and CPU max reduce (should be 0.0): %f\" % abs( reduce_manager.reduce_max_host(input_data) - numpy.max(input_data) )\n",
301 | "print\n",
302 | "print \"%f\" % numpy.max(input_data)\n",
303 | "print \"%f\" % reduce_manager.reduce_max_host(input_data)"
304 | ],
305 | "language": "python",
306 | "metadata": {},
307 | "outputs": [
308 | {
309 | "output_type": "stream",
310 | "stream": "stdout",
311 | "text": [
312 | "Difference between GPU and CPU max reduce (should be 0.0): 0.000000\n",
313 | "\n",
314 | "0.999933\n",
315 | "0.999933\n"
316 | ]
317 | }
318 | ],
319 | "prompt_number": 5
320 | }
321 | ],
322 | "metadata": {}
323 | }
324 | ]
325 | }
--------------------------------------------------------------------------------
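
A note on Reduce.ipynb: all four kernels above share one schedule. Each pass
folds the upper half of the scratchpad onto the lower half with an associative
operator; the write range [0, n - half) and the read range [half, n) of a pass
are disjoint, and synchronization between passes comes from launching a fresh
kernel for each one. A host-side sketch of that schedule, for illustration only
(the name pairwise_reduce does not appear in the repository):

    import numpy

    def pairwise_reduce(data, op=numpy.add):
        # ceil(log2(n)) passes, like ReduceManager.__reduce: element i absorbs
        # element i + half whenever i + half is still in range.
        scratch = data.astype(numpy.float32).copy()
        n = scratch.size
        while n > 1:
            half = (n + 1) // 2    # reduce_num_threads in the notebook
            pairs = n - half       # elements that still have a right-hand partner
            scratch[:pairs] = op(scratch[:pairs], scratch[half:half + pairs])
            n = half
        return scratch[0]

The small discrepancy the notebook reports for the sum (0.004395 against
numpy.sum) is expected: float32 addition is not associative, so this pairwise
order can legitimately differ from numpy.sum's accumulation order.

--------------------------------------------------------------------------------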
/IPython/Source.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikeroberts3000/GpuComputing/f2b29376f3c82e116709a102a8b3360ac87f933c/IPython/Source.png
--------------------------------------------------------------------------------
/IPython/Split.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "Split"
4 | },
5 | "nbformat": 3,
6 | "nbformat_minor": 0,
7 | "worksheets": [
8 | {
9 | "cells": [
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Split\n",
15 | "\n",
16 | "The purpose of this code is to implement a canonical split algorithm on the GPU."
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "collapsed": false,
22 | "input": [
23 | "import math\n",
24 | "import numpy\n",
25 | "import pycuda.autoinit\n",
26 | "import pycuda.driver\n",
27 | "import pycuda.compiler\n",
28 | "import prefixsum\n",
29 | "\n",
30 | "class SplitManager:\n",
31 | " \n",
32 | " source_module = pycuda.compiler.SourceModule \\\n",
33 | " (\n",
34 | " \"\"\"\n",
35 | " __global__ void split_scatter(\n",
36 | " unsigned int* d_input_data,\n",
37 | " unsigned int* d_flag_data,\n",
38 | " unsigned int* d_flag_set_scatter_offset,\n",
39 | " unsigned int* d_output_data,\n",
40 | " int total_flags_set,\n",
41 | " int n )\n",
42 | " {\n",
43 | " int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;\n",
44 | "\n",
45 | " if ( global_index_1d < n )\n",
46 | " {\n",
47 | " unsigned int input_value = d_input_data[ global_index_1d ];\n",
48 | " unsigned int flag_value = d_flag_data[ global_index_1d ];\n",
49 | " unsigned int flag_set_scatter_offset_value = d_flag_set_scatter_offset[ global_index_1d ];\n",
50 | " \n",
51 | " unsigned int scatter_offset_value;\n",
52 | "\n",
53 | " if ( flag_value > 0 )\n",
54 | " {\n",
55 | " scatter_offset_value = flag_set_scatter_offset_value;\n",
56 | " }\n",
57 | " else\n",
58 | " {\n",
59 | " scatter_offset_value = global_index_1d - flag_set_scatter_offset_value + total_flags_set;\n",
60 | " }\n",
61 | " \n",
62 | " d_output_data[ scatter_offset_value ] = input_value;\n",
63 | " }\n",
64 | " }\n",
65 | " \"\"\"\n",
66 | " )\n",
67 | "\n",
68 | "    _split_scatter_function = source_module.get_function(\"split_scatter\")\n",
69 | "\n",
70 | " _prefix_sum_manager = -1\n",
71 | "\n",
72 | " _size_of_element_bytes = 4\n",
73 | " _max_num_elements = -1\n",
74 | " _num_bytes = -1\n",
75 | " _n = -1\n",
76 | " _input_data_device = -1\n",
77 | " _flag_data_device = -1\n",
78 | " _flag_set_scatter_offset_device = -1\n",
79 | " _output_data_device = -1\n",
80 | " _block_sums_device = -1\n",
81 | " \n",
82 | " def __init__(self, max_num_elements):\n",
83 | " \n",
84 | " self._max_num_elements = max_num_elements\n",
85 | " self._num_bytes = self._max_num_elements * self._size_of_element_bytes\n",
86 | " self._input_data_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
87 | " self._flag_data_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
88 | " self._flag_set_scatter_offset_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
89 | " self._output_data_device = pycuda.driver.mem_alloc(self._num_bytes)\n",
90 | " self._prefix_sum_manager = prefixsum.PrefixSumManager(self._max_num_elements)\n",
91 | "\n",
92 | " def __copy_input_htod(self, input_data_host, flag_data_host):\n",
93 | "\n",
94 | " assert input_data_host.shape[0] <= self._max_num_elements\n",
95 | " assert \\\n",
96 | " input_data_host.dtype == numpy.uint32 or \\\n",
97 | " input_data_host.dtype == numpy.int32 or \\\n",
98 | " input_data_host.dtype == numpy.float32\n",
99 | "\n",
100 | " pycuda.driver.memcpy_htod(self._input_data_device, input_data_host)\n",
101 | " pycuda.driver.memcpy_htod(self._flag_data_device, flag_data_host)\n",
102 | " \n",
103 | " def __split(self, input_data_device, flag_data_device, output_data_device, num_elements):\n",
104 | "\n",
105 | " assert num_elements <= self._max_num_elements\n",
106 | "\n",
107 | " self._n = num_elements\n",
108 | " \n",
109 | " pycuda.driver.memset_d32(self._flag_set_scatter_offset_device, 0, self._n)\n",
110 | " pycuda.driver.memset_d32(output_data_device, 0, self._n)\n",
111 | "\n",
112 | " self._prefix_sum_manager.prefix_sum_device(flag_data_device, self._flag_set_scatter_offset_device, self._n)\n",
113 | "\n",
114 | " tmp = numpy.zeros(1, dtype=numpy.uint32)\n",
115 | "\n",
116 | " pycuda.driver.memcpy_dtoh(tmp, int(self._flag_set_scatter_offset_device) + ((self._n - 1) * self._size_of_element_bytes))\n",
117 | " flag_set_scatter_offset_end = tmp[0]\n",
118 | "\n",
119 | " pycuda.driver.memcpy_dtoh(tmp, int(flag_data_device) + ((self._n - 1) * self._size_of_element_bytes))\n",
120 | " flag_data_end = tmp[0]\n",
121 | "\n",
122 | " total_flags_set = flag_set_scatter_offset_end + flag_data_end\n",
123 | "\n",
124 | "        split_scatter_function_block = (512,1,1)\n",
125 | "        num_blocks = int(math.ceil(float(self._n) / float(split_scatter_function_block[0])))\n",
126 | " split_scatter_function_grid = (num_blocks, 1)\n",
127 | "\n",
128 | "        self._split_scatter_function(\n",
129 | " input_data_device,\n",
130 | " flag_data_device,\n",
131 | " self._flag_set_scatter_offset_device,\n",
132 | " output_data_device,\n",
133 | " numpy.int32(total_flags_set),\n",
134 | " numpy.int32(self._n),\n",
135 | "            block=split_scatter_function_block,\n",
136 | " grid=split_scatter_function_grid)\n",
137 | " \n",
138 | " def __copy_output_dtoh(self, output_data_host):\n",
139 | "\n",
140 | " pycuda.driver.memcpy_dtoh(output_data_host, self._output_data_device)\n",
141 | "\n",
142 | " def split_device(self, input_data_device, flag_data_device, output_data_device, num_elements):\n",
143 | "\n",
144 | " self.__split(input_data_device, flag_data_device, output_data_device, num_elements)\n",
145 | "\n",
146 | " def split_host(self, input_data_host, flag_data_host, output_data_host):\n",
147 | "\n",
148 | " num_elements = input_data_host.shape[0]\n",
149 | " \n",
150 | " self.__copy_input_htod(input_data_host, flag_data_host)\n",
151 | " self.split_device(self._input_data_device, self._flag_data_device, self._output_data_device, num_elements)\n",
152 | " self.__copy_output_dtoh(output_data_host)"
153 | ],
154 | "language": "python",
155 | "metadata": {},
156 | "outputs": [],
157 | "prompt_number": 2
158 | },
159 | {
160 | "cell_type": "code",
161 | "collapsed": false,
162 | "input": [
163 | "n = 10000\n",
164 | "input_data = (numpy.random.rand(n) * n).astype(numpy.uint32)\n",
165 | "flag_data = numpy.zeros_like(input_data)\n",
166 | "\n",
167 | "b = 0\n",
168 | "mask = 2**b\n",
169 | " \n",
170 | "for i in range(n):\n",
171 | " input_value = input_data[i]\n",
172 | " flag_data[i] = not (input_value & mask)\n",
173 | "\n",
174 | "\n",
175 | " \n",
176 | "flag_set_scatter_offset_cpu = numpy.zeros_like(input_data)\n",
177 | "output_data_cpu = numpy.zeros_like(input_data)\n",
178 | "\n",
179 | "for i in range(1,n):\n",
180 | " flag_set_scatter_offset_cpu[i] = flag_set_scatter_offset_cpu[i-1] + flag_data[i-1]\n",
181 | "\n",
182 | "total_flags_set = flag_set_scatter_offset_cpu[n-1] + flag_data[n-1]\n",
183 | "\n",
184 | "for i in range(n):\n",
185 | " input_value = input_data[i]\n",
186 | " flag_value = flag_data[i]\n",
187 | " flag_set_scatter_offset_value = flag_set_scatter_offset_cpu[i]\n",
188 | " scatter_offset_value = -1\n",
189 | "\n",
190 | " if flag_value:\n",
191 | " scatter_offset_value = flag_set_scatter_offset_value\n",
192 | " else:\n",
193 | " scatter_offset_value = i - flag_set_scatter_offset_value + total_flags_set\n",
194 | "\n",
195 | " output_data_cpu[scatter_offset_value] = input_value\n",
196 | " \n",
197 | " \n",
198 | "\n",
199 | "output_data_gpu = numpy.zeros_like(input_data)\n",
200 | "split_manager = SplitManager(n)\n",
201 | "\n",
202 | "split_manager.split_host(input_data, flag_data, output_data_gpu)\n",
203 | "\n",
204 | "print output_data_gpu\n",
205 | "print output_data_cpu\n",
206 | "print\n",
207 | "print \"Difference between GPU and CPU (should be 0.0%%): %f\" % numpy.linalg.norm(output_data_gpu - output_data_cpu)\n",
208 | "\n",
209 | "figsize(19,4)\n",
210 | "\n",
211 | "matplotlib.pyplot.subplot(141)\n",
212 | "matplotlib.pyplot.plot(input_data);\n",
213 | "matplotlib.pyplot.title(\"input_data\");\n",
214 | "\n",
215 | "matplotlib.pyplot.subplot(142)\n",
216 | "matplotlib.pyplot.plot(flag_data);\n",
217 | "matplotlib.pyplot.title(\"flag_data\");\n",
218 | "\n",
219 | "matplotlib.pyplot.subplot(143)\n",
220 | "matplotlib.pyplot.plot(output_data_gpu);\n",
221 | "matplotlib.pyplot.title(\"output_data_gpu\");\n",
222 | "\n",
223 | "matplotlib.pyplot.subplot(144)\n",
224 | "matplotlib.pyplot.plot(output_data_cpu);\n",
225 | "matplotlib.pyplot.title(\"output_data_cpu\");"
226 | ],
227 | "language": "python",
228 | "metadata": {},
229 | "outputs": [
230 | {
231 | "output_type": "stream",
232 | "stream": "stdout",
233 | "text": [
234 | "[6676 3142 9384 ..., 5291 5203 3143]\n",
235 | "[6676 3142 9384 ..., 5291 5203 3143]\n",
236 | "\n",
237 | "Difference between GPU and CPU (should be 0.0%): 0.000000\n"
238 | ]
239 | },
240 | {
241 | "output_type": "display_data",
242 | "png": "iVBORw0KGgoAAAANSUhEUgAABGMAAAEICAYAAAD7pjk6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3Xd8VFXex/FvEAuuMSAtj9IMxCSAQoAkSI2KiCJFULGu\nCjbKYgEb6IKu64pgAVTAXZHHR5TFgrqCIqihiCRBRRBDlSZIJCAhaHAFzvPHdfpMMpnM3JmEz/v1\nOq+5c+uZ9ptzf/fce+OMMUYAAAAAAACwRY1oVwAAAAAAAOB4QjIGAAAAAADARiRjAAAAAAAAbEQy\nBgAAAAAAwEYkYwAAAAAAAGxEMgYAAAAAAMBGJGOOQ61bt9bSpUujXY2Abr75Zj3yyCPRrgaAEG3b\ntk1XXXWV6tSpo7p169r+e27WrJk++eQTW7cJoOqrUaOGvv/++2hXA0A1QUxBeUjGHIe+/fZbdevW\nLaLbqExCJS4uTnFxcUHNm52drZdffjmk7QCIjNdee00JCQnat2+f+vbtG/TvOVwqEkNoKAHRFe7f\noB2/6W3btqlGjRo6duxYRLcDoOKIKahKSMYgJhljgprP7p08AOVbvny5srKyVKNGDRljgv49R0us\n1w+o7sL9G7TrN03sAGITMQVVBcmY45CjC//48eN17bXXasSIEUpMTNTVV1+tgoICj/mef/55ZWZm\nqnnz5po+fbp+//13SdKsWbPUtWtXj/XWqFFDW7Zs0UsvvaTXX39dTz31lOLj49WvX78y67N9+3bd\nfvvtSkxM1G233aYjR444p/3888+6/PLL1aBBAyUnJ+uvf/2rfvrpJ0nS2LFjtWzZMo0YMULx8fEa\nOXKkJOmuu+5SkyZN1LBhQ91xxx365ptvwvK+ASjfhRdeqMWLF2vkyJGKj493xgyp7N+zJO3du1f3\n33+/EhMTdcUVV2j06NG68cYby93mF198oZ49e6pZs2aaOnWqx7S8vDydf/75qlOnjs4//3w9//zz\nzhjj6CHYpk0bxcfH680339SBAwfKrCMA/3bt2qWHH35YzZo10y233KKvv/5akm8PVvf2g7/fYE5O\njho1aqTnn39ezZo10yWXXKLc3Fzn8hVdX1kWLFigzMxMpaam+sw7f/58paenKyEhQRdffLFeffVV\n5zTHdmrXrq34+Hjl5uZqy5YtuvDCC1WvXj2dd955mjBhgg4dOlTu++beBrr11lt1ww03OHsWV+a9\nAKo6YkpoMaWoqEjPPPOMzj33XNWrV09/+ctfnK+rS5cuevjhh3XmmWdq0KBBPvt97qd4jx8/Pqg2\nGCqHZMxxyL03yTvvvKM2bdqooKBACQkJeuKJJzzmff755/XMM8/onXfe0UsvvaSZM2eWu+7bb79d\n119/vR544AGVlJTovffeK3OZgQMH6vTTT9e3336rlJQUzZ0711lHY4yGDBmiHTt26KOPPlJeXp6m\nTJkiSfr73/+url276oUXXlBJSYlzfGZmpr755htt2LBBCQkJGj58eIXfIwCh+fTTTz1+lyeddJJz\nWlm/Z0kaOnSoCgsLtXr1ag0cOFAzZswot/fbgQMH1KNHD91www1asWKFcnNztWvXLuf0mjVravLk\nySoqKtLTTz+tSZMmKScnR5Kc185as2aNSkpKdNVVV+nYsWNl1hGAf9ddd52Ki4uVn5+vrl27Kjs7\nW6WlpWWeNujvNyhJP/30k/Ly8rRy5Upde+21uuiii/TLL79IKvs0xEDr8+fbb7/Vn//8Zz3yyCNa\nsGCBZs2a5TH9tNNO02uvvab9+/dr9OjRGjFihDZv3ixJWrZsmSSpuLhYJSUlysrKkmQdJPrxxx/1\n2muvae7cuZozZ06579vAgQNVu3Ztffvtt2rVqpXefPNNj9cX6nsBVHXElNBiypAhQ7R69WrNmzdP\nu3fv1jXXXOOclpeXp8OHD+ubb75Ru3bt1KNHD+c07/eB2GIPkjHHuZSUFN12222qU6eOhgwZosWL\nFzunxcXF6eqrr1aXLl3Upk0b3Xnnnfrggw+CXncwXe0KCwu1bt06Pf7446pXr55Gjx6thg0bOqef\nccYZuuKKK3TKKaeoefPmGj16tE9yx3s7119/verUqaPatWvrkUce0erVq1VUVBR0vQFERlm/5yNH\njmjx4sUaN26cEhMTdcMNNyg9Pb3cdX788cdq3769/vznP+vMM8/UY4895tG7rl27dsrMzNQJJ5yg\nTp066YYbbigzQRxMzAHgqaioSPn5+XryySdVv359DR48WOeee64++uijkNZ35MgRjR8/XomJibr5\n5pt13nnnhbyuQBYsWKDLLrtMffr0UVJSkkaNGuUxvXv37mrVqpVOOOEEXXLJJerXr58zFvhr3zRv\n3lwXXXSRTjzxRJ133nkaOnRoubHD0QZ67LHHVK9ePd1zzz1KTEz0mMeO9wKINcSU0GJKcXGxFi9e\nrGeffVYtWrTQSSedpM6dOzun16hRQ48//rjq16+vBx54QJL01VdfVfaloxJqRrsCiJ64uDi1adPG\n+TwxMVGFhYU6duyYatSw8nRt27Z1Tk9PT9eYMWMqtP7y5OXlqUWLFjrllFOc49q1a+ccPnbsmB5+\n+GEtW7ZMa9eulTFGhw4dkjHGuX7v7cyaNUtvvvmm8vLydOTIEZWWlmrt2rW64IILgq47gPAr6/dc\nUFCgY8eOKSkpyTl/+/bty02k5uXlecSxpKQkJSQkOJ/v2rVLf/vb37RixQpt27ZNR48eVYcOHQKu\nzxjjPAUyUMwB4GnlypVKSkrSn/70J+e4Dh06OI/2VvQ6CKeddppHLGjXrp1WrlypgQMHhqfCsmJH\np06dnM+9k7/r1q3TpEmTtGLFCu3Zs0f//e9/nW0jfw4dOqSxY8dqxYoV2rhxo44dO6Yzzjij3Dok\nJycHbANJ9rwXQKwhpoQWUz7//HM1bdpUdevW9TvdO96kp6friy++8Ik7EtevsQs9Y1Amx/mZkpU5\ndQSZs846S4WFhX7nk6QTTjghqCuCZ2RkaPPmzSotLfXYjsObb76p+fPn65VXXlFRUZHefvttjwuC\nem9n586duvfeezVmzBht375dO3bsUK1atQgoQBQ5khhl/Z5TU1N97ljw5ZdflpsAyczM1OrVq53P\nt2zZouLiYufzxx9/XL///rsWLFig4uJi3XPPPR4xw3GRYYe5c+eWGXMA+OrYsaO+//57Z7d/Sc5T\nC8prL3j/BiVrJ2TLli3O519++aXOP/98SVb7Y8+ePRVanz+ZmZk+bRx3o0ePVqNGjbRkyRIVFxdr\n4MCBHm0PyXNn5YUXXtCGDRs0d+5cHThwQM8++2y57aCMjAxt2rRJhw8fDliPyrwXQFVFTAktpnTq\n1Enbt2/Xvn37/E7ftGmTxz7X119/Xeb7wEGoyCMZcxwrL7AYY/T222/r888/15o1a/TSSy/p8ssv\nlyR16dJFO3fu1Mcff6ydO3fqqaee8li
[... base64-encoded PNG cell output elided ...]"
243 | }
244 | ],
245 | "prompt_number": 8
246 | }
247 | ],
248 | "metadata": {}
249 | }
250 | ]
251 | }
--------------------------------------------------------------------------------
/IPython/memorial.exr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikeroberts3000/GpuComputing/f2b29376f3c82e116709a102a8b3360ac87f933c/IPython/memorial.exr
--------------------------------------------------------------------------------
/IPython/prefixsum.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy
3 | import pycuda.autoinit
4 | import pycuda.driver
5 | import pycuda.compiler
6 |
7 | class PrefixSumManager:
8 |
9 | source_module = pycuda.compiler.SourceModule \
10 | (
11 | """
12 | __global__ void prefix_sum_up_sweep( unsigned int* d_prefix_sum, int n, int d ) // up-sweep (reduce) phase of a work-efficient Blelloch scan
13 | {
14 | int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
15 | int k = global_index_1d * ( 2 << d );
16 |
17 | int left_index;
18 | int right_index;
19 |
20 | if ( d == 0 )
21 | {
22 | left_index = k;
23 | right_index = k + 1;
24 | }
25 | else
26 | {
27 | left_index = k + ( 2 << ( d - 1 ) ) - 1;
28 | right_index = k + ( 2 << d ) - 1;
29 | }
30 |
31 | if ( right_index < n )
32 | {
33 | d_prefix_sum[ right_index ] = d_prefix_sum[ left_index ] + d_prefix_sum[ right_index ];
34 | }
35 | }
36 |
37 | __global__ void prefix_sum_down_sweep( unsigned int* d_prefix_sum, int n, int d ) // down-sweep phase: propagates partial sums back down the implicit tree
38 | {
39 | int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
40 | int k = global_index_1d * ( 2 << d );
41 |
42 | int left_index;
43 | int right_index;
44 |
45 | if ( d == 0 )
46 | {
47 | left_index = k;
48 | right_index = k + 1;
49 | }
50 | else
51 | {
52 | left_index = k + ( 2 << ( d - 1 ) ) - 1;
53 | right_index = k + ( 2 << d ) - 1;
54 | }
55 |
56 | if ( right_index < n )
57 | {
58 | unsigned int temp = d_prefix_sum[ right_index ];
59 | d_prefix_sum[ right_index ] = d_prefix_sum[ left_index ] + d_prefix_sum[ right_index ];
60 | d_prefix_sum[ left_index ] = temp;
61 | }
62 | }
63 |
64 | __global__ void blocked_prefix_sum_set_last_block_elements_to_zero( unsigned int* d_prefix_sums, int n, int block_size_num_elements ) // clears the last element of each block so the scan is exclusive
65 | {
66 | int global_index_1d_left = ( ( ( threadIdx.x * 2 ) + 1 ) * block_size_num_elements ) - 1;
67 | int global_index_1d_right = ( ( ( threadIdx.x * 2 ) + 2 ) * block_size_num_elements ) - 1;
68 |
69 | if ( global_index_1d_left < n )
70 | {
71 | d_prefix_sums[ global_index_1d_left ] = 0;
72 | }
73 |
74 | if ( global_index_1d_right < n )
75 | {
76 | d_prefix_sums[ global_index_1d_right ] = 0;
77 | }
78 | }
79 |
80 | __global__ void blocked_prefix_sum_down_sweep( unsigned int* d_prefix_sum, unsigned int* d_block_sums, unsigned int* d_input_data_resized, int n, int d ) // per-block down-sweep that also records each block's total in d_block_sums
81 | {
82 | int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
83 | int k = global_index_1d * ( 2 << d );
84 |
85 | int left_index;
86 | int right_index;
87 |
88 | if ( d == 0 )
89 | {
90 | left_index = k;
91 | right_index = k + 1;
92 | }
93 | else
94 | {
95 | left_index = k + ( 2 << ( d - 1 ) ) - 1;
96 | right_index = k + ( 2 << d ) - 1;
97 | }
98 |
99 | if ( right_index < n )
100 | {
101 | unsigned int temp = d_prefix_sum[ right_index ];
102 | d_prefix_sum[ right_index ] = d_prefix_sum[ left_index ] + d_prefix_sum[ right_index ];
103 | d_prefix_sum[ left_index ] = temp;
104 | }
105 |
106 | if ( d == 0 && threadIdx.x == blockDim.x - 1 )
107 | {
108 | d_block_sums[ blockIdx.x ] = d_prefix_sum[ right_index ] + d_input_data_resized[ right_index ];
109 | }
110 | }
111 |
112 | __global__ void blocked_prefix_sum_add_block_sums( unsigned int* d_prefix_sums, unsigned int* d_block_sums, int n ) // adds the scanned block sums back so the per-block scans become one global scan
113 | {
114 | int global_index_1d = 2 * ( ( blockIdx.x * blockDim.x ) + threadIdx.x );
115 |
116 | if ( blockIdx.x > 0 && global_index_1d < n - 1 )
117 | {
118 | unsigned int block_sum = d_block_sums[ blockIdx.x ];
119 | d_prefix_sums[ global_index_1d ] = d_prefix_sums[ global_index_1d ] + block_sum;
120 | d_prefix_sums[ global_index_1d + 1 ] = d_prefix_sums[ global_index_1d + 1 ] + block_sum;
121 | }
122 | }
123 | """
124 | )
125 |
126 | _prefix_sum_down_sweep_function = source_module.get_function("prefix_sum_down_sweep")
127 | _prefix_sum_up_sweep_function = source_module.get_function("prefix_sum_up_sweep")
128 | _blocked_prefix_sum_down_sweep_function = source_module.get_function("blocked_prefix_sum_down_sweep")
129 | _blocked_prefix_sum_set_last_block_elements_to_zero_function = source_module.get_function("blocked_prefix_sum_set_last_block_elements_to_zero")
130 | _blocked_prefix_sum_add_block_sums_function = source_module.get_function("blocked_prefix_sum_add_block_sums")
131 |
132 | _size_of_element_bytes = 4
133 | _block_size_num_elements = 1024
134 | _block_size_num_threads = _block_size_num_elements / 2
135 | _num_sweep_passes = int(math.ceil(math.log(_block_size_num_elements,2)))
136 |
137 | _max_num_elements = -1
138 | _n = -1
139 | _input_data_resized_num_elements = -1
140 | _input_data_resized_num_threads = -1
141 | _input_data_device = -1
142 | _input_data_resized_device = -1
143 | _prefix_sum_device = -1
144 | _block_sums_device = -1
145 |
146 | def __init__(self, max_num_elements):
147 |
148 | num_elements_to_pad = 0
149 | if max_num_elements % self._block_size_num_elements != 0:
150 | num_elements_to_pad = self._block_size_num_elements - (max_num_elements % self._block_size_num_elements)
151 |
152 | max_num_elements = max_num_elements + num_elements_to_pad
153 |
154 | assert max_num_elements <= self._block_size_num_elements**2
155 |
156 | self._max_num_elements = max_num_elements
157 | self._num_bytes = self._max_num_elements * self._size_of_element_bytes
158 | self._input_data_device = pycuda.driver.mem_alloc(self._num_bytes)
159 | self._input_data_resized_device = pycuda.driver.mem_alloc(self._num_bytes)
160 | self._output_data_device = pycuda.driver.mem_alloc(self._num_bytes)
161 | self._prefix_sum_device = pycuda.driver.mem_alloc(self._num_bytes)
162 | self._block_sums_device = pycuda.driver.mem_alloc(self._num_bytes)
163 |
164 | def __copy_input_htod(self, input_data_host):
165 |
166 |         assert input_data_host.shape[0] <= self._block_size_num_elements**2
167 | assert input_data_host.shape[0] <= self._max_num_elements
168 | assert input_data_host.dtype == numpy.uint32
169 |
170 | pycuda.driver.memcpy_htod(self._input_data_device, input_data_host)
171 |
172 | def __initialize_prefix_sum(self, input_data_device, num_elements):
173 |
174 | self._n = num_elements
175 |
176 | num_elements_to_pad = 0
177 | if self._n % self._block_size_num_elements != 0:
178 | num_elements_to_pad = self._block_size_num_elements - (self._n % self._block_size_num_elements)
179 |
180 | self._input_data_resized_num_elements = self._n + num_elements_to_pad
181 | self._input_data_resized_num_threads = self._input_data_resized_num_elements / 2
182 |
183 | assert self._input_data_resized_num_elements <= self._max_num_elements
184 |
185 | pycuda.driver.memset_d32(self._input_data_resized_device, 0, self._input_data_resized_num_elements)
186 | pycuda.driver.memset_d32(self._prefix_sum_device, 0, self._input_data_resized_num_elements)
187 | pycuda.driver.memset_d32(self._block_sums_device, 0, self._block_size_num_elements)
188 |
189 | pycuda.driver.memcpy_dtod(self._input_data_resized_device, input_data_device, self._n * self._size_of_element_bytes)
190 | pycuda.driver.memcpy_dtod(self._prefix_sum_device, input_data_device, self._n * self._size_of_element_bytes)
191 |
192 | def __block_prefix_sum_input(self):
193 |
194 | prefix_sum_up_sweep_function_block = (self._block_size_num_threads,1,1)
195 | num_blocks = int(math.ceil(float(self._input_data_resized_num_threads) / float(prefix_sum_up_sweep_function_block[0])))
196 | prefix_sum_up_sweep_function_grid = (num_blocks, 1)
197 |
198 | blocked_prefix_sum_set_last_block_elements_to_zero_function_block = (self._block_size_num_threads,1,1)
199 | num_blocks = int(math.ceil(float(self._block_size_num_threads) / float(blocked_prefix_sum_set_last_block_elements_to_zero_function_block[0])))
200 | blocked_prefix_sum_set_last_block_elements_to_zero_function_grid = (num_blocks, 1)
201 |
202 | blocked_prefix_sum_down_sweep_function_block = (self._block_size_num_threads,1,1)
203 | num_blocks = int(math.ceil(float(self._input_data_resized_num_threads) / float(blocked_prefix_sum_down_sweep_function_block[0])))
204 | blocked_prefix_sum_down_sweep_function_grid = (num_blocks, 1)
205 |
206 | for d in range(self._num_sweep_passes):
207 | self._prefix_sum_up_sweep_function(
208 | self._prefix_sum_device,
209 | numpy.int32(self._input_data_resized_num_elements),
210 | numpy.int32(d),
211 | block=prefix_sum_up_sweep_function_block,
212 | grid=prefix_sum_up_sweep_function_grid)
213 |
214 | self._blocked_prefix_sum_set_last_block_elements_to_zero_function(
215 | self._prefix_sum_device,
216 | numpy.int32(self._input_data_resized_num_elements),
217 | numpy.int32(self._block_size_num_elements),
218 | block=blocked_prefix_sum_set_last_block_elements_to_zero_function_block,
219 | grid=blocked_prefix_sum_set_last_block_elements_to_zero_function_grid)
220 |
221 | for d in range(self._num_sweep_passes - 1,-1,-1):
222 | self._blocked_prefix_sum_down_sweep_function(
223 | self._prefix_sum_device,
224 | self._block_sums_device,
225 | self._input_data_resized_device,
226 | numpy.int32(self._input_data_resized_num_elements),
227 | numpy.int32(d),
228 | block=blocked_prefix_sum_down_sweep_function_block,
229 | grid=blocked_prefix_sum_down_sweep_function_grid)
230 |
231 | def __block_prefix_sum_block_sums(self):
232 |
233 | prefix_sum_up_sweep_function_block = (self._block_size_num_threads,1,1)
234 | num_blocks = int(math.ceil(float(self._block_size_num_threads) / float(prefix_sum_up_sweep_function_block[0])))
235 | prefix_sum_up_sweep_function_grid = (num_blocks, 1)
236 |
237 | blocked_prefix_sum_set_last_block_elements_to_zero_function_block = (self._block_size_num_threads,1,1)
238 | num_blocks = int(math.ceil(float(self._block_size_num_threads) / float(blocked_prefix_sum_set_last_block_elements_to_zero_function_block[0])))
239 | blocked_prefix_sum_set_last_block_elements_to_zero_function_grid = (num_blocks, 1)
240 |
241 | prefix_sum_down_sweep_function_block = (self._block_size_num_threads,1,1)
242 | num_blocks = int(math.ceil(float(self._block_size_num_threads) / float(prefix_sum_down_sweep_function_block[0])))
243 | prefix_sum_down_sweep_function_grid = (num_blocks, 1)
244 |
245 | for d in range(self._num_sweep_passes):
246 | self._prefix_sum_up_sweep_function(
247 | self._block_sums_device,
248 | numpy.int32(self._block_size_num_elements),
249 | numpy.int32(d),
250 | block=prefix_sum_up_sweep_function_block,
251 | grid=prefix_sum_up_sweep_function_grid)
252 |
253 | self._blocked_prefix_sum_set_last_block_elements_to_zero_function(
254 | self._block_sums_device,
255 | numpy.int32(self._block_size_num_elements),
256 | numpy.int32(self._block_size_num_elements),
257 | block=blocked_prefix_sum_set_last_block_elements_to_zero_function_block,
258 | grid=blocked_prefix_sum_set_last_block_elements_to_zero_function_grid)
259 |
260 | for d in range(self._num_sweep_passes - 1,-1,-1):
261 | self._prefix_sum_down_sweep_function(
262 | self._block_sums_device,
263 | numpy.int32(self._block_size_num_elements),
264 | numpy.int32(d),
265 | block=prefix_sum_down_sweep_function_block,
266 | grid=prefix_sum_down_sweep_function_grid)
267 |
268 | def __distribute_block_sums(self):
269 |
270 | blocked_prefix_sum_add_block_sums_function_block = (self._block_size_num_threads,1,1)
271 | num_blocks = int(math.ceil(float(self._input_data_resized_num_threads) / float(blocked_prefix_sum_add_block_sums_function_block[0])))
272 | blocked_prefix_sum_add_block_sums_function_grid = (num_blocks, 1)
273 |
274 | self._blocked_prefix_sum_add_block_sums_function(
275 | self._prefix_sum_device,
276 | self._block_sums_device,
277 | numpy.int32(self._input_data_resized_num_elements),
278 | block=blocked_prefix_sum_add_block_sums_function_block,
279 | grid=blocked_prefix_sum_add_block_sums_function_grid)
280 |
281 | def __copy_output_dtod(self, output_data_device):
282 |
283 | pycuda.driver.memcpy_dtod(output_data_device, self._prefix_sum_device, self._n * self._size_of_element_bytes)
284 |
285 | def __copy_output_dtoh(self, output_data_host):
286 |
287 | pycuda.driver.memcpy_dtoh(output_data_host, self._prefix_sum_device)
288 |
289 | def prefix_sum_device(self, input_data_device, output_data_device, num_elements):
290 |
291 | self.__initialize_prefix_sum(input_data_device, num_elements)
292 | self.__block_prefix_sum_input()
293 | self.__block_prefix_sum_block_sums()
294 | self.__distribute_block_sums()
295 | self.__copy_output_dtod(output_data_device)
296 |
297 | def prefix_sum_host(self, input_data_host, output_data_host):
298 |
299 | num_elements = input_data_host.shape[0]
300 |
301 | self.__copy_input_htod(input_data_host)
302 | self.prefix_sum_device(self._input_data_device, self._output_data_device, num_elements)
303 | self.__copy_output_dtoh(output_data_host)
304 |
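A minimal host-side usage sketch for PrefixSumManager, assuming PyCUDA and a CUDA-capable device (the array size and values are illustrative). The up-sweep and down-sweep kernels above implement a work-efficient Blelloch scan per 1024-element block; the per-block scans are stitched together by scanning the block sums and adding them back, so the result is a global exclusive scan:

    import numpy
    import prefixsum

    n = 10000
    input_data = (numpy.random.rand(n) * n).astype(numpy.uint32)
    output_data = numpy.zeros_like(input_data)

    prefix_sum_manager = prefixsum.PrefixSumManager(n)
    prefix_sum_manager.prefix_sum_host(input_data, output_data)

    # exclusive scan: output_data[i] == sum(input_data[:i])
    reference = numpy.cumsum(input_data).astype(numpy.uint32) - input_data
    assert (output_data == reference).all()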
--------------------------------------------------------------------------------
/IPython/prefixsum.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikeroberts3000/GpuComputing/f2b29376f3c82e116709a102a8b3360ac87f933c/IPython/prefixsum.pyc
--------------------------------------------------------------------------------
/IPython/radixsort.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy
3 | import pycuda.autoinit
4 | import pycuda.driver
5 | import pycuda.compiler
6 | import split
7 |
8 | class RadixSortManager:
9 |
10 | source_module = pycuda.compiler.SourceModule \
11 | (
12 | """
13 | __global__ void radix_sort_compute_flags_ascending(
14 | unsigned int* d_input_data,
15 | unsigned int* d_output_data,
16 | int mask,
17 | int n )
18 | {
19 | int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
20 |
21 | if ( global_index_1d < n )
22 | {
23 | unsigned int input_value = d_input_data[ global_index_1d ];
24 |
25 | if ( input_value & mask )
26 | {
27 | d_output_data[ global_index_1d ] = 0;
28 | }
29 | else
30 | {
31 | d_output_data[ global_index_1d ] = 1;
32 | }
33 | }
34 | }
35 |
36 | __global__ void radix_sort_compute_flags_descending(
37 | unsigned int* d_input_data,
38 | unsigned int* d_output_data,
39 | int mask,
40 | int n )
41 | {
42 | int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
43 |
44 | if ( global_index_1d < n )
45 | {
46 | unsigned int input_value = d_input_data[ global_index_1d ];
47 |
48 | if ( input_value & mask )
49 | {
50 | d_output_data[ global_index_1d ] = 1;
51 | }
52 | else
53 | {
54 | d_output_data[ global_index_1d ] = 0;
55 | }
56 | }
57 | }
58 | """
59 | )
60 |
61 | _radix_sort_compute_flags_ascending_function = source_module.get_function("radix_sort_compute_flags_ascending")
62 | _radix_sort_compute_flags_descending_function = source_module.get_function("radix_sort_compute_flags_descending")
63 |
64 | _size_of_element_bytes = 4
65 | _size_of_element_bits = 32
66 |
67 | _max_num_elements = -1
68 | _num_bytes = -1
69 |
70 | _input_keys_device = -1
71 | _input_values_device = -1
72 | _flag_data_device = -1
73 | _split_keys_old_device = -1
74 | _split_values_old_device = -1
75 | _split_keys_new_device = -1
76 | _split_values_new_device = -1
77 |
78 | _split_manager = -1
79 |
80 | def __init__(self, max_num_elements):
81 |
82 | self._max_num_elements = max_num_elements
83 | self._num_bytes = self._max_num_elements * self._size_of_element_bytes
84 |
85 | self._input_keys_device = pycuda.driver.mem_alloc(self._num_bytes)
86 | self._input_values_device = pycuda.driver.mem_alloc(self._num_bytes)
87 | self._flag_data_device = pycuda.driver.mem_alloc(self._num_bytes)
88 | self._split_keys_old_device = pycuda.driver.mem_alloc(self._num_bytes)
89 | self._split_values_old_device = pycuda.driver.mem_alloc(self._num_bytes)
90 | self._output_keys_device = pycuda.driver.mem_alloc(self._num_bytes)
91 | self._output_values_device = pycuda.driver.mem_alloc(self._num_bytes)
92 |
93 | self._split_manager = split.SplitManager(max_num_elements)
94 |
95 | def __copy_input_htod_key(self, input_keys_host):
96 |
97 | assert input_keys_host.shape[0] <= self._max_num_elements
98 |
99 | assert \
100 | input_keys_host.dtype == numpy.uint32 or \
101 | input_keys_host.dtype == numpy.int32 or \
102 | input_keys_host.dtype == numpy.float32
103 |
104 | pycuda.driver.memcpy_htod(self._input_keys_device, input_keys_host)
105 |
106 | def __copy_input_htod_key_value(self, input_keys_host, input_values_host):
107 |
108 | assert input_keys_host.shape[0] == input_values_host.shape[0]
109 | assert input_keys_host.shape[0] <= self._max_num_elements
110 |
111 | assert \
112 | input_keys_host.dtype == numpy.uint32 or \
113 | input_keys_host.dtype == numpy.int32 or \
114 | input_keys_host.dtype == numpy.float32
115 |
116 | assert \
117 | input_values_host.dtype == numpy.uint32 or \
118 | input_values_host.dtype == numpy.int32 or \
119 | input_values_host.dtype == numpy.float32
120 |
121 | pycuda.driver.memcpy_htod(self._input_keys_device, input_keys_host)
122 | pycuda.driver.memcpy_htod(self._input_values_device, input_values_host)
123 |
124 | def __radix_sort_key(self, input_keys_device, output_keys_device, num_elements, compute_flags_function):
125 |
126 | assert num_elements <= self._max_num_elements
127 |
128 | self._n = num_elements
129 |
130 | pycuda.driver.memcpy_dtod(self._split_keys_old_device, input_keys_device, self._n * self._size_of_element_bytes)
131 |
132 | pycuda.driver.memset_d32(self._flag_data_device, 0, self._n)
133 | pycuda.driver.memset_d32(output_keys_device, 0, self._n)
134 |
135 |         for b in range(self._size_of_element_bits):
136 | 
137 |             mask = numpy.uint32(2**b)  # uint32 so that bit 31 does not overflow a signed type
138 | 
139 |             radix_sort_compute_flags_function_block = (512,1,1)
140 |             num_blocks = int(math.ceil(float(self._n) / float(radix_sort_compute_flags_function_block[0])))
141 |             radix_sort_compute_flags_function_grid = (num_blocks, 1)
142 | 
143 |             compute_flags_function(
144 |                 self._split_keys_old_device,
145 |                 self._flag_data_device,
146 |                 mask,
147 |                 numpy.int32(self._n),
148 |                 block=radix_sort_compute_flags_function_block,
149 |                 grid=radix_sort_compute_flags_function_grid)
150 |
151 | self._split_manager.split_device(self._split_keys_old_device, self._flag_data_device, output_keys_device, self._n)
152 |
153 | self._split_keys_old_device, output_keys_device = output_keys_device, self._split_keys_old_device
154 |
155 | pycuda.driver.memcpy_dtod(output_keys_device, self._split_keys_old_device, self._n * self._size_of_element_bytes)
156 |
157 | def __radix_sort_key_value(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements, compute_flags_function):
158 |
159 | assert num_elements <= self._max_num_elements
160 |
161 | self._n = num_elements
162 |
163 | pycuda.driver.memcpy_dtod(self._split_keys_old_device, input_keys_device, self._n * self._size_of_element_bytes)
164 | pycuda.driver.memcpy_dtod(self._split_values_old_device, input_values_device, self._n * self._size_of_element_bytes)
165 |
166 | pycuda.driver.memset_d32(self._flag_data_device, 0, self._n)
167 | pycuda.driver.memset_d32(output_keys_device, 0, self._n)
168 | pycuda.driver.memset_d32(output_values_device, 0, self._n)
169 |
170 |         for b in range(self._size_of_element_bits):
171 | 
172 |             mask = numpy.uint32(2**b)  # uint32 so that bit 31 does not overflow a signed type
173 | 
174 |             radix_sort_compute_flags_function_block = (512,1,1)
175 |             num_blocks = int(math.ceil(float(self._n) / float(radix_sort_compute_flags_function_block[0])))
176 |             radix_sort_compute_flags_function_grid = (num_blocks, 1)
177 | 
178 |             compute_flags_function(
179 |                 self._split_keys_old_device,
180 |                 self._flag_data_device,
181 |                 mask,
182 |                 numpy.int32(self._n),
183 |                 block=radix_sort_compute_flags_function_block,
184 |                 grid=radix_sort_compute_flags_function_grid)
185 |
186 | self._split_manager.split_device(self._split_keys_old_device, self._flag_data_device, output_keys_device, self._n)
187 | self._split_manager.split_device(self._split_values_old_device, self._flag_data_device, output_values_device, self._n)
188 |
189 | self._split_keys_old_device, output_keys_device = output_keys_device, self._split_keys_old_device
190 | self._split_values_old_device, output_values_device = output_values_device, self._split_values_old_device
191 |
192 | pycuda.driver.memcpy_dtod(output_keys_device, self._split_keys_old_device, self._n * self._size_of_element_bytes)
193 | pycuda.driver.memcpy_dtod(output_values_device, self._split_values_old_device, self._n * self._size_of_element_bytes)
194 |
195 | def __copy_output_dtoh_key(self, output_keys_host):
196 |
197 | pycuda.driver.memcpy_dtoh(output_keys_host, self._output_keys_device)
198 |
199 | def __copy_output_dtoh_key_value(self, output_keys_host, output_values_host):
200 |
201 | pycuda.driver.memcpy_dtoh(output_keys_host, self._output_keys_device)
202 | pycuda.driver.memcpy_dtoh(output_values_host, self._output_values_device)
203 |
204 | def radix_sort_key_ascending_device(self, input_keys_device, output_keys_device, num_elements):
205 |
206 | self.__radix_sort_key(
207 | input_keys_device,
208 | output_keys_device,
209 | num_elements,
210 | self._radix_sort_compute_flags_ascending_function)
211 |
212 | def radix_sort_key_descending_device(self, input_keys_device, output_keys_device, num_elements):
213 |
214 | self.__radix_sort_key(
215 | input_keys_device,
216 | output_keys_device,
217 | num_elements,
218 | self._radix_sort_compute_flags_descending_function)
219 |
220 | def radix_sort_key_ascending_host(self, input_keys_host, output_keys_host):
221 |
222 | num_elements = input_keys_host.shape[0]
223 |
224 | self.__copy_input_htod_key(input_keys_host)
225 | self.radix_sort_key_ascending_device(self._input_keys_device, self._output_keys_device, num_elements)
226 | self.__copy_output_dtoh_key(output_keys_host)
227 |
228 | def radix_sort_key_descending_host(self, input_keys_host, output_keys_host):
229 |
230 | num_elements = input_keys_host.shape[0]
231 |
232 | self.__copy_input_htod_key(input_keys_host)
233 | self.radix_sort_key_descending_device(self._input_keys_device, self._output_keys_device, num_elements)
234 | self.__copy_output_dtoh_key(output_keys_host)
235 |
236 | def radix_sort_key_value_ascending_device(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements):
237 |
238 | self.__radix_sort_key_value(
239 | input_keys_device,
240 | input_values_device,
241 | output_keys_device,
242 | output_values_device,
243 | num_elements,
244 | self._radix_sort_compute_flags_ascending_function)
245 |
246 | def radix_sort_key_value_descending_device(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements):
247 |
248 | self.__radix_sort_key_value(
249 | input_keys_device,
250 | input_values_device,
251 | output_keys_device,
252 | output_values_device,
253 | num_elements,
254 | self._radix_sort_compute_flags_descending_function)
255 |
256 | def radix_sort_key_value_ascending_host(self, input_keys_host, input_values_host, output_keys_host, output_values_host):
257 |
258 | num_elements = input_keys_host.shape[0]
259 |
260 | self.__copy_input_htod_key_value(input_keys_host, input_values_host)
261 | self.radix_sort_key_value_ascending_device(self._input_keys_device, self._input_values_device, self._output_keys_device, self._output_values_device, num_elements)
262 | self.__copy_output_dtoh_key_value(output_keys_host, output_values_host)
263 |
264 | def radix_sort_key_value_descending_host(self, input_keys_host, input_values_host, output_keys_host, output_values_host):
265 |
266 | num_elements = input_keys_host.shape[0]
267 |
268 | self.__copy_input_htod_key_value(input_keys_host, input_values_host)
269 | self.radix_sort_key_value_descending_device(self._input_keys_device, self._input_values_device, self._output_keys_device, self._output_values_device, num_elements)
270 | self.__copy_output_dtoh_key_value(output_keys_host, output_values_host)
271 |
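A minimal usage sketch for RadixSortManager, assuming PyCUDA and a CUDA-capable device (sizes and values are illustrative). Each of the 32 passes flags the keys whose current bit is clear and then uses the scan-based split to stably partition the keys (and, in the key/value variants, their values) on that bit, from the least significant bit upward:

    import numpy
    import radixsort

    n = 10000
    input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)
    input_values = input_keys.astype(numpy.float32)
    output_keys = numpy.zeros_like(input_keys)
    output_values = numpy.zeros_like(input_values)

    radix_sort_manager = radixsort.RadixSortManager(n)
    radix_sort_manager.radix_sort_key_value_ascending_host(
        input_keys, input_values, output_keys, output_values)

    assert (output_keys == numpy.sort(input_keys)).all()
    # every split pass is stable, so values stay paired with their keys
    assert (output_values == numpy.sort(input_keys).astype(numpy.float32)).all()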
--------------------------------------------------------------------------------
/IPython/reduce.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy
3 | import pycuda.autoinit
4 | import pycuda.driver
5 | import pycuda.compiler
6 |
7 | class ReduceManager:
8 |
9 | source_module = pycuda.compiler.SourceModule \
10 | (
11 | """
12 | __global__ void reduce_sum( float* d_scratchpad, int n, int num_threads ) // each pass folds the top half of the array onto the bottom half
13 | {
14 | int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
15 |
16 | int left_index = global_index_1d;
17 | int right_index = global_index_1d + num_threads;
18 |
19 | if ( right_index < n )
20 | {
21 | d_scratchpad[ left_index ] = d_scratchpad[ left_index ] + d_scratchpad[ right_index ];
22 | }
23 | }
24 |
25 | __global__ void reduce_product( float* d_scratchpad, int n, int num_threads )
26 | {
27 | int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
28 |
29 | int left_index = global_index_1d;
30 | int right_index = global_index_1d + num_threads;
31 |
32 | if ( right_index < n )
33 | {
34 | d_scratchpad[ left_index ] = d_scratchpad[ left_index ] * d_scratchpad[ right_index ];
35 | }
36 | }
37 |
38 | __global__ void reduce_min( float* d_scratchpad, int n, int num_threads )
39 | {
40 | int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
41 |
42 | int left_index = global_index_1d;
43 | int right_index = global_index_1d + num_threads;
44 |
45 | if ( right_index < n )
46 | {
47 | d_scratchpad[ left_index ] = min( d_scratchpad[ left_index ], d_scratchpad[ right_index ] );
48 | }
49 | }
50 |
51 | __global__ void reduce_max( float* d_scratchpad, int n, int num_threads )
52 | {
53 | int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
54 |
55 | int left_index = global_index_1d;
56 | int right_index = global_index_1d + num_threads;
57 |
58 | if ( right_index < n )
59 | {
60 | d_scratchpad[ left_index ] = max( d_scratchpad[ left_index ], d_scratchpad[ right_index ] );
61 | }
62 | }
63 | """
64 | )
65 |
66 | _reduce_sum_function = source_module.get_function("reduce_sum")
67 | _reduce_product_function = source_module.get_function("reduce_product")
68 | _reduce_min_function = source_module.get_function("reduce_min")
69 | _reduce_max_function = source_module.get_function("reduce_max")
70 |
71 | _size_of_element_bytes = 4
72 | _block_size_num_elements = 1024
73 | _block_size_num_threads = _block_size_num_elements / 2
74 |
75 | _max_num_elements = -1
76 | _n = -1
77 | _scratchpad_device = -1
78 |
79 | def __init__(self, max_num_elements):
80 |
81 | self._max_num_elements = max_num_elements
82 | self._num_bytes = self._max_num_elements * self._size_of_element_bytes
83 | self._scratchpad_device = pycuda.driver.mem_alloc(self._num_bytes)
84 |
85 | def __copy_input_htod(self, input_data_host):
86 |
87 | assert input_data_host.shape[0] <= self._max_num_elements
88 | assert input_data_host.dtype == numpy.float32
89 |
90 | pycuda.driver.memcpy_htod(self._scratchpad_device, input_data_host)
91 |
92 | def __copy_input_dtod(self, input_data_device, num_elements):
93 |
94 | pycuda.driver.memcpy_dtod(self._scratchpad_device, input_data_device, int(num_elements * self._size_of_element_bytes))
95 |
96 | def __reduce(self, num_elements, reduce_function):
97 |
98 | self._n = num_elements
99 |
100 | num_sweep_passes = int(math.ceil(math.log(num_elements,2)))
101 | reduce_num_elements = self._n
102 |
103 | for d in range(num_sweep_passes):
104 |
105 | reduce_num_threads = int(math.ceil(float(reduce_num_elements) / float(2)))
106 |
107 | reduce_function_block = (self._block_size_num_threads,1,1)
108 | num_blocks = int(math.ceil(float(reduce_num_threads) / float(reduce_function_block[0])))
109 | reduce_function_grid = (num_blocks, 1)
110 |
111 | reduce_function(
112 | self._scratchpad_device,
113 | numpy.int32(reduce_num_elements),
114 | numpy.int32(reduce_num_threads),
115 | block=reduce_function_block,
116 | grid=reduce_function_grid)
117 |
118 | reduce_num_elements = reduce_num_threads
119 |
120 | tmp = numpy.zeros(1, dtype=numpy.float32)
121 |
122 | pycuda.driver.memcpy_dtoh(tmp, self._scratchpad_device)
123 |
124 | return tmp[0]
125 |
126 | def reduce_sum_device(self, input_data_device, num_elements):
127 |
128 | self.__copy_input_dtod(input_data_device, num_elements)
129 | return self.__reduce(num_elements, self._reduce_sum_function)
130 |
131 | def reduce_product_device(self, input_data_device, num_elements):
132 |
133 | self.__copy_input_dtod(input_data_device, num_elements)
134 | return self.__reduce(num_elements, self._reduce_product_function)
135 |
136 | def reduce_min_device(self, input_data_device, num_elements):
137 |
138 | self.__copy_input_dtod(input_data_device, num_elements)
139 | return self.__reduce(num_elements, self._reduce_min_function)
140 |
141 | def reduce_max_device(self, input_data_device, num_elements):
142 |
143 | self.__copy_input_dtod(input_data_device, num_elements)
144 | return self.__reduce(num_elements, self._reduce_max_function)
145 |
146 | def reduce_sum_host(self, input_data_host):
147 |
148 | num_elements = input_data_host.shape[0]
149 |
150 | self.__copy_input_htod(input_data_host)
151 | return self.__reduce(num_elements, self._reduce_sum_function)
152 |
153 | def reduce_product_host(self, input_data_host):
154 |
155 | num_elements = input_data_host.shape[0]
156 |
157 | self.__copy_input_htod(input_data_host)
158 | return self.__reduce(num_elements, self._reduce_product_function)
159 |
160 | def reduce_min_host(self, input_data_host):
161 |
162 | num_elements = input_data_host.shape[0]
163 |
164 | self.__copy_input_htod(input_data_host)
165 | return self.__reduce(num_elements, self._reduce_min_function)
166 |
167 | def reduce_max_host(self, input_data_host):
168 |
169 | num_elements = input_data_host.shape[0]
170 |
171 | self.__copy_input_htod(input_data_host)
172 | return self.__reduce(num_elements, self._reduce_max_function)
173 |
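A minimal usage sketch for ReduceManager, assuming PyCUDA and a CUDA-capable device. Each pass folds the top half of the scratchpad onto the bottom half, so reducing n elements takes ceil(log2(n)) kernel launches:

    import numpy
    import reduce

    n = 10000
    input_data = numpy.random.rand(n).astype(numpy.float32)

    reduce_manager = reduce.ReduceManager(n)

    # min and max are exact; float32 sums accumulate rounding error,
    # so the sum is compared with a tolerance
    assert reduce_manager.reduce_min_host(input_data) == input_data.min()
    assert reduce_manager.reduce_max_host(input_data) == input_data.max()
    assert numpy.allclose(reduce_manager.reduce_sum_host(input_data), input_data.sum(), rtol=1e-4)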
--------------------------------------------------------------------------------
/IPython/split.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy
3 | import pycuda.autoinit
4 | import pycuda.driver
5 | import pycuda.compiler
6 | import prefixsum
7 |
8 | class SplitManager:
9 |
10 | source_module = pycuda.compiler.SourceModule \
11 | (
12 | """
13 | __global__ void split_scatter(
14 | unsigned int* d_input_data,
15 | unsigned int* d_flag_data,
16 | unsigned int* d_flag_set_scatter_offset,
17 | unsigned int* d_output_data,
18 | int total_flags_set,
19 | int n )
20 | {
21 | int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;
22 |
23 | if ( global_index_1d < n )
24 | {
25 | unsigned int input_value = d_input_data[ global_index_1d ];
26 | unsigned int flag_value = d_flag_data[ global_index_1d ];
27 | unsigned int flag_set_scatter_offset_value = d_flag_set_scatter_offset[ global_index_1d ];
28 |
29 | unsigned int scatter_offset_value;
30 |
31 | if ( flag_value > 0 )
32 | {
33 | scatter_offset_value = flag_set_scatter_offset_value;
34 | }
35 | else
36 | {
37 |             scatter_offset_value = global_index_1d - flag_set_scatter_offset_value + total_flags_set; // unflagged elements follow all flagged elements, with relative order preserved
38 | }
39 |
40 | d_output_data[ scatter_offset_value ] = input_value;
41 | }
42 | }
43 | """
44 | )
45 |
46 |     _split_scatter_function = source_module.get_function("split_scatter")
47 |
48 | _prefix_sum_manager = -1
49 |
50 | _size_of_element_bytes = 4
51 | _max_num_elements = -1
52 | _num_bytes = -1
53 | _n = -1
54 | _input_data_device = -1
55 | _flag_data_device = -1
56 | _flag_set_scatter_offset_device = -1
57 | _output_data_device = -1
58 | _block_sums_device = -1
59 |
60 | def __init__(self, max_num_elements):
61 |
62 | self._max_num_elements = max_num_elements
63 | self._num_bytes = self._max_num_elements * self._size_of_element_bytes
64 | self._input_data_device = pycuda.driver.mem_alloc(self._num_bytes)
65 | self._flag_data_device = pycuda.driver.mem_alloc(self._num_bytes)
66 | self._flag_set_scatter_offset_device = pycuda.driver.mem_alloc(self._num_bytes)
67 | self._output_data_device = pycuda.driver.mem_alloc(self._num_bytes)
68 | self._prefix_sum_manager = prefixsum.PrefixSumManager(self._max_num_elements)
69 |
70 | def __copy_input_htod(self, input_data_host, flag_data_host):
71 |
72 | assert input_data_host.shape[0] <= self._max_num_elements
73 | assert \
74 | input_data_host.dtype == numpy.uint32 or \
75 | input_data_host.dtype == numpy.int32 or \
76 | input_data_host.dtype == numpy.float32
77 |
78 | pycuda.driver.memcpy_htod(self._input_data_device, input_data_host)
79 | pycuda.driver.memcpy_htod(self._flag_data_device, flag_data_host)
80 |
81 | def __split(self, input_data_device, flag_data_device, output_data_device, num_elements):
82 |
83 | assert num_elements <= self._max_num_elements
84 |
85 | self._n = num_elements
86 |
87 | pycuda.driver.memset_d32(self._flag_set_scatter_offset_device, 0, self._n)
88 | pycuda.driver.memset_d32(output_data_device, 0, self._n)
89 |
90 |         self._prefix_sum_manager.prefix_sum_device(flag_data_device, self._flag_set_scatter_offset_device, self._n) # exclusive prefix sum of the flags gives each flagged element its output slot
91 |
92 | tmp = numpy.zeros(1, dtype=numpy.uint32)
93 |
94 | pycuda.driver.memcpy_dtoh(tmp, int(self._flag_set_scatter_offset_device) + ((self._n - 1) * self._size_of_element_bytes))
95 | flag_set_scatter_offset_end = tmp[0]
96 |
97 | pycuda.driver.memcpy_dtoh(tmp, int(flag_data_device) + ((self._n - 1) * self._size_of_element_bytes))
98 | flag_data_end = tmp[0]
99 |
100 |         total_flags_set = flag_set_scatter_offset_end + flag_data_end # the scan is exclusive, so add the final flag to the final scan value
101 |
102 |         split_scatter_function_block = (512, 1, 1)
103 | num_blocks = int(math.ceil(float(self._n) / float(split_scatter_funcion_block[0])))
104 | split_scatter_function_grid = (num_blocks, 1)
105 |
106 |         self._split_scatter_function(
107 | input_data_device,
108 | flag_data_device,
109 | self._flag_set_scatter_offset_device,
110 | output_data_device,
111 | numpy.int32(total_flags_set),
112 | numpy.int32(self._n),
113 |             block=split_scatter_function_block,
114 | grid=split_scatter_function_grid)
115 |
116 | def __copy_output_dtoh(self, output_data_host):
117 |
118 | pycuda.driver.memcpy_dtoh(output_data_host, self._output_data_device)
119 |
120 | def split_device(self, input_data_device, flag_data_device, output_data_device, num_elements):
121 |
122 | self.__split(input_data_device, flag_data_device, output_data_device, num_elements)
123 |
124 | def split_host(self, input_data_host, flag_data_host, output_data_host):
125 |
126 | num_elements = input_data_host.shape[0]
127 |
128 | self.__copy_input_htod(input_data_host, flag_data_host)
129 | self.split_device(self._input_data_device, self._flag_data_device, self._output_data_device, num_elements)
130 | self.__copy_output_dtoh(output_data_host)
131 |
--------------------------------------------------------------------------------
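The scatter logic in split.py is easiest to see on the host. The NumPy sketch below (illustrative only, not part of the repository) mirrors what the split_scatter kernel computes: an exclusive prefix sum of the flags gives each flagged element its slot at the front of the output, each unflagged element lands after all flagged ones, and the relative order within each group is preserved, so split is a stable partition by flag:

    import numpy

    input_data = numpy.array([7, 2, 9, 4, 3, 8], dtype=numpy.uint32)
    flag_data  = numpy.array([0, 1, 0, 1, 1, 0], dtype=numpy.uint32)

    # exclusive prefix sum of the flags: the output slot of each flagged element
    scan = numpy.concatenate(([0], numpy.cumsum(flag_data)[:-1]))

    # the scan is exclusive, so add the final flag itself to get the total
    total_flags_set = scan[-1] + flag_data[-1]

    output_data = numpy.zeros_like(input_data)
    for i in range(len(input_data)):
        if flag_data[i] > 0:
            scatter_offset = scan[i]                        # flagged: compact to the front
        else:
            scatter_offset = i - scan[i] + total_flags_set  # unflagged: after all flagged
        output_data[scatter_offset] = input_data[i]

    print output_data  # [2 4 3 7 9 8]

This stability is what the radix sort notebook relies on: sorting one bit at a time from least to most significant is only correct if each pass preserves the order established by the previous passes.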
/IPython/split.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikeroberts3000/GpuComputing/f2b29376f3c82e116709a102a8b3360ac87f933c/IPython/split.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This repository contains easy-to-read Python/CUDA implementations of fundamental GPU computing primitives: map, reduce, prefix sum (scan), split, radix sort, and histogram. I use these primitives to construct equally readable implementations of the following image processing operations: Gaussian blurring, bilateral filtering, histogram equalization, red-eye removal, and seamless image cloning.
2 |
3 | This code can be browsed online with the IPython Notebook Viewer using the links below.
4 |
5 | ### GPU Computing Primitives
6 |
7 | - map
8 | - reduce
9 | - prefix sum (scan)
10 | - split
11 | - radix sort
12 | - histogram
13 |
14 | ### Image Processing Operations
15 |
16 | - Gaussian blurring
17 | - bilateral filtering
18 | - histogram equalization
19 | - red-eye removal
20 | - seamless image cloning
21 |
--------------------------------------------------------------------------------