├── .gitignore
├── CMakeLists.txt
├── Final
    ├── batcher
    │   ├── batcher.cu
    │   ├── compare.h
    │   └── gputimer.h
    ├── smooth
    │   ├── compare.h
    │   ├── gputimer.h
    │   └── smooth.cu
    └── warpreduce
    │   ├── part_a
    │       ├── compare.h
    │       ├── gputimer.h
    │       └── warpreduce.cu
    │   └── part_b
    │       ├── compare.h
    │       ├── gputimer.h
    │       └── warpreduce.cu
├── Lesson Code Snippets
    ├── Lesson 2 Code Snippets
    │   ├── CMakeLists.txt
    │   ├── associative.cu
    │   ├── atomics.cu
    │   ├── gputimer.h
    │   ├── hello.cu
    │   ├── hello_blockIdx.cu
    │   ├── hello_threadIdx.cu
    │   └── memory.cu
    ├── Lesson 3 Code Snippets
    │   ├── CMakeLists.txt
    │   ├── histo.cu
    │   ├── reduce.cu
    │   ├── reduce_minmax.cu
    │   └── reduce_minmax_2.cu
    ├── Lesson 5 Code Snippets
    │   ├── CMakeLists.txt
    │   ├── deviceQuery_simplified.cpp
    │   ├── gputimer.h
    │   └── transpose.cu
    └── Lesson 7 Code Snippets
    │   ├── CMakeLists.txt
    │   ├── cub
    │       └── example_block_scan_cum.cu
    │   ├── opencv
    │       ├── gettime.cc
    │       ├── gettime.h
    │       └── opencv.cu
    │   ├── thrust
    │       ├── gettime.cc
    │       ├── gettime.h
    │       ├── gputimer.h
    │       └── thrust_example.cu
    │   └── tiling
    │       ├── a.exp
    │       ├── gputimer.h
    │       ├── tiling.cu
    │       └── utils.h
├── Problem Sets
    ├── Problem Set 1
    │   ├── CMakeLists.txt
    │   ├── HW1.cpp
    │   ├── Makefile
    │   ├── cinque_terre.gold
    │   ├── cinque_terre_small.jpg
    │   ├── compare.cpp
    │   ├── compare.h
    │   ├── main.cpp
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── student_func.cu
    │   ├── timer.h
    │   └── utils.h
    ├── Problem Set 2
    │   ├── CMakeLists.txt
    │   ├── HW2.cpp
    │   ├── Makefile
    │   ├── cinque_terre.gold
    │   ├── cinque_terre_small.jpg
    │   ├── compare.cpp
    │   ├── compare.h
    │   ├── main.cpp
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── student_func.cu
    │   ├── timer.h
    │   └── utils.h
    ├── Problem Set 3
    │   ├── CMakeLists.txt
    │   ├── HW3.cu
    │   ├── Makefile
    │   ├── compare.cpp
    │   ├── compare.h
    │   ├── loadSaveImage.cpp
    │   ├── loadSaveImage.h
    │   ├── main.cpp
    │   ├── memorial.exr
    │   ├── memorial_large.exr
    │   ├── memorial_png.gold
    │   ├── memorial_png_large.gold
    │   ├── memorial_raw.png
    │   ├── memorial_raw_large.png
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── student_func.cu
    │   ├── timer.h
    │   └── utils.h
    ├── Problem Set 4
    │   ├── CMakeLists.txt
    │   ├── HW4.cu
    │   ├── Makefile
    │   ├── compare.cpp
    │   ├── compare.h
    │   ├── loadSaveImage.cpp
    │   ├── loadSaveImage.h
    │   ├── main.cpp
    │   ├── red_eye_effect.gold
    │   ├── red_eye_effect_5.jpg
    │   ├── red_eye_effect_template_5.jpg
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── student_func.cu
    │   ├── timer.h
    │   └── utils.h
    ├── Problem Set 5
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── main.cu
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── student.cu
    │   ├── timer.h
    │   └── utils.h
    └── Problem Set 6
    │   ├── CMakeLists.txt
    │   ├── HW6.cu
    │   ├── Makefile
    │   ├── blended.gold
    │   ├── compare.cpp
    │   ├── compare.h
    │   ├── destination.png
    │   ├── loadSaveImage.cpp
    │   ├── loadSaveImage.h
    │   ├── main.cpp
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── source.png
    │   ├── student_func.cu
    │   ├── timer.h
    │   └── utils.h
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Object files
 2 | *.o
 3 | 
 4 | # Libraries
 5 | *.lib
 6 | *.a
 7 | 
 8 | # Shared objects (inc. Windows DLLs)
 9 | *.dll
10 | *.so
11 | *.so.*
12 | *.dylib
13 | 
14 | # Executables
15 | *.exe
16 | *.out
17 | *.app
18 | 
19 | # OS X stuff
20 | .DS_Store
21 | 
22 | build*
23 | bin
24 | 
25 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | cmake_minimum_required(VERSION 2.6 FATAL_ERROR)
 9 | project(cs344)
10 | 
11 | find_package(OpenCV REQUIRED)
12 | find_package(CUDA REQUIRED)
13 | 
14 | # Directory-scoped so that every sub-project added below links against
15 | # OpenCV and sees its headers without repeating these two calls.
16 | link_libraries(${OpenCV_LIBS})
17 | include_directories(${OpenCV_INCLUDE_DIRS})
18 | 
19 | # All executables are collected in <source root>/bin.
20 | set(EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/bin/")
21 | 
22 | if(CUDA_FOUND)
23 |   # compared to class settings, we let NVidia's FindCUDA CMake detect
24 |   # whether to build x64.  We tell it to support most devices, though,
25 |   # to make sure more people can easily run class code without knowing
26 |   # about this compiler argument
27 | 
28 |   # Commented out these lines, otherwise there will be some tricky errors
29 |   # set(CUDA_NVCC_FLAGS "
30 |   # -ccbin /usr/bin/clang; 
31 |   # -gencode;arch=compute_30,code=sm_30;  
32 |   # -gencode;arch=compute_35,code=sm_35;
33 |   # -gencode;arch=compute_35,code=compute_35;
34 |   # -gencode;arch=compute_20,code=sm_20; 
35 |   # -gencode;arch=compute_11,code=sm_11; 
36 |   # -gencode;arch=compute_12,code=sm_12;
37 |   # -gencode;arch=compute_13,code=sm_13;")
38 | 
39 |   if(UNIX)
40 |     # Ask the host compiler for -Wextra.  FindCUDA treats CUDA_NVCC_FLAGS as
41 |     # a list, so the two arguments are appended as separate list items
42 |     # rather than as a single item that contains a space.
43 |     list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wextra)
44 | 
45 |     # -stdlib is a Clang option, so it is only passed to Clang, and it is
46 |     # appended to CMAKE_CXX_FLAGS instead of replacing the user's flags.
47 |     if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
48 |       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++")
49 |     endif()
50 |   endif()
51 | 
52 |   # add debugging to CUDA NVCC flags.  For NVidia's NSight tools.
53 |   list(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
54 | 
55 |   add_subdirectory ("Problem Sets/Problem Set 1")
56 |   add_subdirectory ("Problem Sets/Problem Set 2")
57 |   add_subdirectory ("Problem Sets/Problem Set 3")
58 |   add_subdirectory ("Problem Sets/Problem Set 4")
59 |   add_subdirectory ("Problem Sets/Problem Set 5")
60 |   add_subdirectory ("Problem Sets/Problem Set 6")
61 | 
62 |   add_subdirectory ("Lesson Code Snippets/Lesson 7 Code Snippets")
63 |   add_subdirectory ("Lesson Code Snippets/Lesson 5 Code Snippets")
64 |   add_subdirectory ("Lesson Code Snippets/Lesson 3 Code Snippets")
65 |   add_subdirectory ("Lesson Code Snippets/Lesson 2 Code Snippets")
66 | else()
67 |   # Only reachable if REQUIRED is dropped from find_package(CUDA) above.
68 |   message("CUDA is not installed on this system.")
69 | endif()
70 | 


--------------------------------------------------------------------------------
/Final/batcher/batcher.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <cuda_runtime.h>
 4 | #include "compare.h"
 5 | #include "gputimer.h"
 6 | 
 7 | // Kernel skeleton for a 64-element bitonic sort, see http://en.wikipedia.org/wiki/Bitonic_sort
 8 | __global__ void batcherBitonicMergesort64(float * d_out, const float * d_in)
 9 | {
10 |     // you are guaranteed this is called with <<<1, 64, 64*4>>>
11 |     extern __shared__ float sdata[];    // dynamically sized shared buffer, indexed by thread
12 |     int tid  = threadIdx.x;
13 |     sdata[tid] = d_in[tid];             // stage this thread's input element in shared memory
14 |     __syncthreads();                    // every element is staged before any thread goes on
15 |     
16 |     for (int stage = 0; stage <= 5; stage++)    // six stages: 0..5
17 |     {
18 |         for (int substage = stage; substage >= 0; substage--)   // counts down from stage to 0
19 |         {
20 |             // TODO: the work for (stage, substage) is still to be written
21 |         }
22 |     }
23 | 
24 |     d_out[tid] = sdata[tid];            // with the loop body empty this copies the input through unchanged
25 | }
26 | 
27 | int compareFloat (const void * a, const void * b)
28 | {
29 |   const float lhs = *(const float*)a;   // qsort comparator: both arguments point at floats
30 |   const float rhs = *(const float*)b;
31 |   if (lhs < rhs) return -1;
32 |   return (lhs > rhs) ? 1 : 0;           // 0 covers "equal" and the unordered case
33 | }
34 | 
35 | int main(int argc, char **argv)
36 | {
37 |     const int ARRAY_SIZE = 64;
38 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
39 | 
40 |     // generate the input array on the host, plus a host-sorted copy to check against
41 |     float h_in[ARRAY_SIZE];
42 |     float h_sorted[ARRAY_SIZE];
43 |     float h_out[ARRAY_SIZE];
44 |     for(int i = 0; i < ARRAY_SIZE; i++) {
45 |         // generate random float in [0, 1]
46 |         h_in[i] = (float)random()/(float)RAND_MAX;
47 |         h_sorted[i] = h_in[i];
48 |     }
49 |     qsort(h_sorted, ARRAY_SIZE, sizeof(float), compareFloat);   // reference result
50 | 
51 |     // declare GPU memory pointers
52 |     float * d_in, * d_out;
53 | 
54 |     // allocate GPU memory
55 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
56 |     cudaMalloc((void **) &d_out, ARRAY_BYTES);
57 | 
58 |     // transfer the input array to the GPU
59 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
60 | 
61 |     // launch the kernel: one block of ARRAY_SIZE threads, one float of shared memory per thread
62 |     GpuTimer timer;
63 |     timer.Start();
64 |     batcherBitonicMergesort64<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(float)>>>(d_out, d_in);
65 |     timer.Stop();
66 |     
67 |     printf("Your code executed in %g ms\n", timer.Elapsed());
68 |     
69 |     // copy the kernel's output array back from the GPU
70 |     cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
71 | 
72 |     compare(h_out, h_sorted, ARRAY_SIZE);   // prints the verdict; its return value is not used
73 |   
74 |     // free GPU memory allocation
75 |     cudaFree(d_in);
76 |     cudaFree(d_out);
77 |         
78 |     return 0;
79 | }
80 | 


--------------------------------------------------------------------------------
/Final/batcher/compare.h:
--------------------------------------------------------------------------------
 1 | // Compares the GPU result h_out with the host reference h_sorted, element
 2 | // by element, over ARRAY_SIZE floats.  Each mismatch is reported on stdout.
 3 | // Returns 0 when every element matches and 1 otherwise.
 4 | // Uses printf, so the including file must have included <stdio.h> first.
 5 | int compare(float *h_out, float *h_sorted, int ARRAY_SIZE)
 6 | {
 7 |     int failure = 0;
 8 |     for(int i = 0; i < ARRAY_SIZE; i++) {
 9 |         if (h_out[i] != h_sorted[i]) {
10 |             printf("Oops! Index %i is %f, should be %f\n",
11 |                    i, h_out[i], h_sorted[i]);
12 |             failure = 1;
13 |         }
14 |     }
15 | 
16 |     if (failure == 0){
17 |         // newline-terminated so that whatever is printed next starts on its own line
18 |         printf("Success! Your bitonic sort worked.\n");
19 |     }
20 | 
21 |     return failure;
22 | }


--------------------------------------------------------------------------------
/Final/batcher/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | // Stopwatch for GPU work built on two CUDA events: call Start(), enqueue the
 5 | // work, call Stop(), then read Elapsed().  The implicit copy operations are
 6 | // left untouched, so a copied GpuTimer shares (and would re-destroy) the events.
 7 | struct GpuTimer {
 8 |       cudaEvent_t start;   // recorded by Start()
 9 |       cudaEvent_t stop;    // recorded by Stop()
10 | 
11 |       // Creates both events.
12 |       GpuTimer() {
13 |             cudaEventCreate(&start);
14 |             cudaEventCreate(&stop);
15 |       }
16 | 
17 |       // Destroys both events.
18 |       ~GpuTimer() {
19 |             cudaEventDestroy(start);
20 |             cudaEventDestroy(stop);
21 |       }
22 | 
23 |       // Records the start event on stream 0.
24 |       void Start() { cudaEventRecord(start, 0); }
25 | 
26 |       // Records the stop event on stream 0.
27 |       void Stop() { cudaEventRecord(stop, 0); }
28 | 
29 |       // Blocks until the stop event has completed, then returns the
30 |       // start-to-stop interval as reported by cudaEventElapsedTime
31 |       // (the callers in this project print it as milliseconds).
32 |       float Elapsed() {
33 |             float interval;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&interval, start, stop);
36 |             return interval;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Final/smooth/compare.h:
--------------------------------------------------------------------------------
 1 | // Checks two GPU result arrays (h_out and h_out_shared) against the host
 2 | // reference h_cmp over ARRAY_SIZE elements.  Every mismatch is reported on
 3 | // stderr together with the corresponding input value from h_in.
 4 | // Returns 0 when both arrays match everywhere, 1 otherwise.
 5 | int compare(float* h_in, float* h_out, float* h_out_shared, float* h_cmp, int ARRAY_SIZE){
 6 |     int all_equal = 1;
 7 |     for (int idx = 0; idx < ARRAY_SIZE; ++idx) {
 8 |         const float expected = h_cmp[idx];
 9 | 
10 |         if (h_out[idx] != expected) {
11 |             fprintf(stderr, "ERROR: h_in[%d] is %f, h_out[%d] is %f, h_cmp[%d] is %f\n",
12 |                     idx, h_in[idx], idx, h_out[idx], idx, expected);
13 |             all_equal = 0;
14 |         }
15 | 
16 |         if (h_out_shared[idx] != expected) {
17 |             fprintf(stderr, "ERROR: h_in[%d] is %f, h_out_shared[%d] is %f, h_cmp[%d] is %f\n",
18 |                     idx, h_in[idx], idx, h_out_shared[idx], idx, expected);
19 |             all_equal = 0;
20 |         }
21 |     }
22 | 
23 |     if (!all_equal) {
24 |         return 1;
25 |     }
26 | 
27 |     printf("Success! Your smooth code worked!\n");
28 |     return 0;
29 | }


--------------------------------------------------------------------------------
/Final/smooth/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | // Stopwatch for GPU work built on two CUDA events: call Start(), enqueue the
 5 | // work, call Stop(), then read Elapsed().  The implicit copy operations are
 6 | // left untouched, so a copied GpuTimer shares (and would re-destroy) the events.
 7 | struct GpuTimer {
 8 |       cudaEvent_t start;   // recorded by Start()
 9 |       cudaEvent_t stop;    // recorded by Stop()
10 | 
11 |       // Creates both events.
12 |       GpuTimer() {
13 |             cudaEventCreate(&start);
14 |             cudaEventCreate(&stop);
15 |       }
16 | 
17 |       // Destroys both events.
18 |       ~GpuTimer() {
19 |             cudaEventDestroy(start);
20 |             cudaEventDestroy(stop);
21 |       }
22 | 
23 |       // Records the start event on stream 0.
24 |       void Start() { cudaEventRecord(start, 0); }
25 | 
26 |       // Records the stop event on stream 0.
27 |       void Stop() { cudaEventRecord(stop, 0); }
28 | 
29 |       // Blocks until the stop event has completed, then returns the
30 |       // start-to-stop interval as reported by cudaEventElapsedTime
31 |       // (the callers in this project print it as milliseconds).
32 |       float Elapsed() {
33 |             float interval;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&interval, start, stop);
36 |             return interval;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Final/smooth/smooth.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <cuda_runtime.h>
 4 | #include "compare.h"
 5 | #include "gputimer.h"
 6 | 
 7 | // Reference
 8 | __global__ void smooth(float * v_new, const float * v) {
 9 |     // Total thread count of the launch; it doubles as the upper index bound.
10 |     const int total = blockDim.x * gridDim.x;
11 |     // Element handled by this thread: consecutive threads of a block are gridDim.x apart.
12 |     const int self  = threadIdx.x * gridDim.x + blockIdx.x;
13 |     // Neighbour indices; at either end the element stands in for its missing neighbour.
14 |     const int left  = (self > 0) ? self - 1 : 0;
15 |     const int right = (self < total - 1) ? self + 1 : total - 1;
16 |     v_new[self] = 0.25f * v[left] + 0.5f * v[self] + 0.25f * v[right];
17 | }
18 | 
19 | // Your code
20 | __global__ void smooth_shared(float * v_new, const float * v) {
21 |     extern __shared__ float s[];   // dynamically sized shared buffer supplied by the launch
22 |     // TODO: Fill in the rest of this function
23 |     // (a __global__ kernel returns void, so the stub must not return a value)
24 | }
25 | 
26 | int main(int argc, char **argv)
27 | {
28 | 
29 |     const int ARRAY_SIZE = 4096;
30 |     const int BLOCK_SIZE = 256;
31 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
32 | 
33 |     // generate the input array on the host
34 |     float h_in[ARRAY_SIZE];
35 |     float h_cmp[ARRAY_SIZE];            // host-computed reference result
36 |     float h_out[ARRAY_SIZE];            // output of the reference kernel
37 |     float h_out_shared[ARRAY_SIZE];     // output of the shared-memory kernel
38 |     for(int i = 0; i < ARRAY_SIZE; i++) {
39 |         // generate random float in [0, 1]
40 |         h_in[i] = (float)random()/(float)RAND_MAX;
41 |     }
42 |     for(int i = 0; i < ARRAY_SIZE; i++) {
43 |         h_cmp[i] = (0.25f * h_in[(i == 0) ? 0 : i-1] +      // left neighbour, clamped at index 0
44 |                     0.50f * h_in[i] +
45 |                     0.25f * h_in[(i == (ARRAY_SIZE - 1)) ? ARRAY_SIZE - 1 : i+1]);  // right neighbour, clamped at the end
46 |     }
47 | 
48 |     // declare GPU memory pointers
49 |     float * d_in, * d_out, * d_out_shared;
50 | 
51 |     // allocate GPU memory
52 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
53 |     cudaMalloc((void **) &d_out, ARRAY_BYTES);
54 |     cudaMalloc((void **) &d_out_shared, ARRAY_BYTES);
55 | 
56 |     // transfer the input array to the GPU
57 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
58 | 
59 |     // Two launches follow.  The reference kernel runs first and is not
60 |     // timed; the GpuTimer brackets only the shared-memory kernel, which is
61 |     // given BLOCK_SIZE + 2 floats of dynamic shared memory per block.
62 |     // launch the kernel
63 |     smooth<<<ARRAY_SIZE / BLOCK_SIZE, BLOCK_SIZE>>>(d_out, d_in);
64 |     GpuTimer timer;
65 |     timer.Start();
66 |     smooth_shared<<<ARRAY_SIZE / BLOCK_SIZE, BLOCK_SIZE, (BLOCK_SIZE + 2) * sizeof(float)>>>(d_out_shared, d_in);
67 |     timer.Stop();
68 | 
69 |     printf("Your code executed in %g ms\n", timer.Elapsed());
70 |     // timer.Elapsed() synchronizes on the stop event before reading the
71 |     // interval, so no separate event synchronization is written out here.
72 |     // (The GpuTimer struct wraps the raw cudaEvent calls.)
73 | 
74 |     // copy back both results from GPU
75 |     cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
76 |     cudaMemcpy(h_out_shared, d_out_shared, ARRAY_BYTES, cudaMemcpyDeviceToHost);
77 | 
78 |     // testing for correctness (the return value of compare is not used)
79 |     compare(h_in, h_out, h_out_shared, h_cmp, ARRAY_SIZE);
80 | 
81 |     // free GPU memory allocation
82 |     cudaFree(d_in);
83 |     cudaFree(d_out);
84 |     cudaFree(d_out_shared);
85 |         
86 |     return 0;
87 | }
88 | 


--------------------------------------------------------------------------------
/Final/warpreduce/part_a/compare.h:
--------------------------------------------------------------------------------
 1 | // Compares the sum computed on the GPU (h_out_shared) with the host-side
 2 | // reference (sum).  Prints a diagnostic to stderr on mismatch and a verdict
 3 | // to stdout either way.  Returns 0 on match, 1 on mismatch.
 4 | int compare(unsigned int h_out_shared, int sum){
 5 |     int failure = 0;
 6 |     // the comparison was already unsigned through implicit conversion; the cast spells it out
 7 |     if (h_out_shared != (unsigned int)sum) {
 8 |         // %u matches the unsigned GPU value; %d stays for the signed reference
 9 |         fprintf(stderr, "GPU shared sum %u does not match expected sum %d\n",
10 |                 h_out_shared, sum);
11 |         failure = 1;
12 |     }
13 | 
14 |     if (failure == 0)
15 |     {
16 |         printf("Success! Your shared warp reduce worked.\n");
17 |     }
18 |     else{
19 |         printf("Error! Your shared reduce code's output did not match sum.\n");
20 |     }
21 | 
22 |     return failure;
23 | }


--------------------------------------------------------------------------------
/Final/warpreduce/part_a/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | // Stopwatch for GPU work built on two CUDA events: call Start(), enqueue the
 5 | // work, call Stop(), then read Elapsed().  The implicit copy operations are
 6 | // left untouched, so a copied GpuTimer shares (and would re-destroy) the events.
 7 | struct GpuTimer {
 8 |       cudaEvent_t start;   // recorded by Start()
 9 |       cudaEvent_t stop;    // recorded by Stop()
10 | 
11 |       // Creates both events.
12 |       GpuTimer() {
13 |             cudaEventCreate(&start);
14 |             cudaEventCreate(&stop);
15 |       }
16 | 
17 |       // Destroys both events.
18 |       ~GpuTimer() {
19 |             cudaEventDestroy(start);
20 |             cudaEventDestroy(stop);
21 |       }
22 | 
23 |       // Records the start event on stream 0.
24 |       void Start() { cudaEventRecord(start, 0); }
25 | 
26 |       // Records the stop event on stream 0.
27 |       void Stop() { cudaEventRecord(stop, 0); }
28 | 
29 |       // Blocks until the stop event has completed, then returns the
30 |       // start-to-stop interval as reported by cudaEventElapsedTime
31 |       // (the callers in this project print it as milliseconds).
32 |       float Elapsed() {
33 |             float interval;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&interval, start, stop);
36 |             return interval;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Final/warpreduce/part_a/warpreduce.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <cuda_runtime.h>
 4 | #include "compare.h"
 5 | #include "gputimer.h"
 6 | 
 7 | // Subpart A:
 8 | // Write step 1 as a kernel that operates on threads 0--31.
 9 | // Assume that the input flags are 0 for false and 1 for true and are stored
10 | // in a local per-thread register called p (for predicate).
11 | //
12 | // You have access to 32 words of shared memory s[0:31], with s[0]
13 | // corresponding to thread 0 and s[31] corresponding to thread 31.
14 | // You may change the values of s[0:31]. Put the return sum in s[0].
15 | // Your code should execute no more than 5 warp-wide addition operations.
16 | 
17 | __device__ unsigned int shared_reduce(unsigned int p, volatile unsigned int * s) {
18 |     // p: this thread's flag; assumes values in 'p' are either 1 or 0
19 |     // s: shared scratch words; assumes s[0:31] are allocated
20 |     // Sums p across warp, returning the result. Suggest you put
21 |     // result in s[0] and return it
22 |     // You may change any value in s
23 |     // You should execute no more than 5 + operations (if you're doing
24 |     // 31, you're doing it wrong)
25 |     //
26 |     // TODO: Fill in the rest of this function
27 |     // NOTE: as it stands nothing in this function writes s before the read below.
28 |     return s[0];
29 | }
30 | 
31 | // Reads one flag per thread from d_in, reduces the flags with
32 | // shared_reduce(), and has thread 0 store the result into *d_out_shared.
33 | __global__ void reduce(unsigned int * d_out_shared,
34 |                        const unsigned int * d_in)
35 | {
36 |     extern __shared__ unsigned int s[];      // scratch buffer handed to shared_reduce
37 |     const int lane = threadIdx.x;
38 |     const int flag = d_in[lane];
39 |     const unsigned int total = shared_reduce(flag, s);
40 |     // a single thread publishes the result
41 |     if (lane == 0) { *d_out_shared = total; }
42 | }
43 | 
44 | int main(int argc, char **argv)
45 | {
46 |     const int ARRAY_SIZE = 32;
47 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(unsigned int);
48 | 
49 |     // generate the input array on the host and accumulate the expected sum
50 |     unsigned int h_in[ARRAY_SIZE];
51 |     unsigned int sum = 0;
52 |     for(int i = 0; i < ARRAY_SIZE; i++) {
53 |         // draw a random float and threshold it at 0.5 to get a 0/1 flag
54 |         h_in[i] = (float)random()/(float)RAND_MAX > 0.5f ? 1 : 0;
55 |         sum += h_in[i];
56 |     }
57 | 
58 |     // declare GPU memory pointers
59 |     unsigned int * d_in, * d_out_shared;
60 | 
61 |     // allocate GPU memory: the flag array and a single word for the result
62 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
63 |     cudaMalloc((void **) &d_out_shared, sizeof(unsigned int));
64 | 
65 |     // transfer the input array to the GPU
66 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
67 | 
68 |     GpuTimer timer;
69 |     timer.Start();
70 |     // launch the kernel: one block of ARRAY_SIZE threads, one shared word per thread
71 |     reduce<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(unsigned int)>>>
72 |         (d_out_shared, d_in);
73 |     timer.Stop();
74 | 
75 |     printf("Your code executed in %g ms\n", timer.Elapsed());
76 | 
77 |     unsigned int h_out_shared;
78 |     // copy back the sum from GPU
79 |     cudaMemcpy(&h_out_shared, d_out_shared, sizeof(unsigned int), 
80 |                cudaMemcpyDeviceToHost);
81 |     
82 |     compare(h_out_shared, sum);     // prints the verdict; its return value is not used
83 | 
84 |     // free GPU memory allocation
85 |     cudaFree(d_in);
86 |     cudaFree(d_out_shared);
87 | }
88 | 
89 | 


--------------------------------------------------------------------------------
/Final/warpreduce/part_b/compare.h:
--------------------------------------------------------------------------------
 1 | // Compares the sum computed on the GPU (h_out_warp) with the host-side
 2 | // reference (sum).  Prints a diagnostic to stderr on mismatch and a verdict
 3 | // to stdout either way.  Returns 0 on match, 1 on mismatch.
 4 | int compare(unsigned int h_out_warp, int sum){
 5 |     int failure = 0;
 6 |     // the comparison was already unsigned through implicit conversion; the cast spells it out
 7 |     if (h_out_warp != (unsigned int)sum) {
 8 |         // %u matches the unsigned GPU value; %d stays for the signed reference
 9 |         fprintf(stderr, "GPU warp sum %u does not match expected sum %d\n",
10 |                 h_out_warp, sum);
11 |         failure = 1;
12 |     }
13 | 
14 |     if (failure == 0)
15 |     {
16 |         printf("Success! Your warp reduce worked.\n");
17 |     }
18 |     else{
19 |         printf("Error! Your warp reduce code's output did not match sum.\n");
20 |     }
21 | 
22 |     return failure;
23 | }


--------------------------------------------------------------------------------
/Final/warpreduce/part_b/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | // Stopwatch for GPU work built on two CUDA events: call Start(), enqueue the
 5 | // work, call Stop(), then read Elapsed().  The implicit copy operations are
 6 | // left untouched, so a copied GpuTimer shares (and would re-destroy) the events.
 7 | struct GpuTimer {
 8 |       cudaEvent_t start;   // recorded by Start()
 9 |       cudaEvent_t stop;    // recorded by Stop()
10 | 
11 |       // Creates both events.
12 |       GpuTimer() {
13 |             cudaEventCreate(&start);
14 |             cudaEventCreate(&stop);
15 |       }
16 | 
17 |       // Destroys both events.
18 |       ~GpuTimer() {
19 |             cudaEventDestroy(start);
20 |             cudaEventDestroy(stop);
21 |       }
22 | 
23 |       // Records the start event on stream 0.
24 |       void Start() { cudaEventRecord(start, 0); }
25 | 
26 |       // Records the stop event on stream 0.
27 |       void Stop() { cudaEventRecord(stop, 0); }
28 | 
29 |       // Blocks until the stop event has completed, then returns the
30 |       // start-to-stop interval as reported by cudaEventElapsedTime
31 |       // (the callers in this project print it as milliseconds).
32 |       float Elapsed() {
33 |             float interval;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&interval, start, stop);
36 |             return interval;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Final/warpreduce/part_b/warpreduce.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <cuda_runtime.h>
 4 | #include "compare.h"
 5 | #include "gputimer.h"
 6 | 
 7 | // Subpart b:
 8 | // Compute capability 2.0+ GPUs have support for 3 per-warp instructions.
 9 | // Namely, these instructions are:
10 | //
11 | // int __popc(int x) Population Count: Returns the number of bits that are set
12 | // to 1 in the 32-bit integer x.
13 | //
14 | // int __clz(int x) Count Leading Zeros: Returns the number of consecutive zero
15 | // bits beginning at the most significant bit of the 32-bit integer x.
16 | //
17 | // int __ballot(int p) Returns a 32-bit integer in which bit k is set if and only
18 | // if the predicate p provided by the thread in lane k of the warp is non-zero.
19 | 
20 | __device__ unsigned int warp_reduce(unsigned int p, volatile unsigned int * s) {
21 |     // Assumes values in 'p' are either 1 or 0
22 |     // Should not use 's'
23 |     // Sums p across warp, returning the result.
24 |     // You can do this without using the character '+' in your code at all
25 |     //
26 |     // TODO: Fill in the rest of this function
27 |     return 0;   // placeholder so the unfinished stub still returns a defined value
28 | }
29 | 
30 | // Reads one flag per thread from d_in, reduces the flags with
31 | // warp_reduce(), and has thread 0 store the result into *d_out_warp.
32 | __global__ void reduce(unsigned int * d_out_warp,
33 |                        const unsigned int * d_in)
34 | {
35 |     extern __shared__ unsigned int s[];      // passed through to warp_reduce
36 |     const int lane = threadIdx.x;
37 |     const int flag = d_in[lane];
38 | 
39 |     const unsigned int total = warp_reduce(flag, s);
40 |     // a single thread publishes the result
41 |     if (lane == 0) { *d_out_warp = total; }
42 | }
43 | 
44 | int main(int argc, char **argv)
45 | {
46 |     const int ARRAY_SIZE = 32;
47 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(unsigned int);
48 | 
49 |     // generate the input array on the host and accumulate the expected sum
50 |     unsigned int h_in[ARRAY_SIZE];
51 |     unsigned int sum = 0;
52 |     for(int i = 0; i < ARRAY_SIZE; i++) {
53 |         // draw a random float and threshold it at 0.5 to get a 0/1 flag
54 |         h_in[i] = (float)random()/(float)RAND_MAX > 0.5f ? 1 : 0;
55 |         sum += h_in[i];
56 |     }
57 | 
58 |     // declare GPU memory pointers
59 |     unsigned int * d_in, * d_out_warp;
60 | 
61 |     // allocate GPU memory: the flag array and a single word for the result
62 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
63 |     cudaMalloc((void **) &d_out_warp, sizeof(unsigned int));
64 | 
65 |     // transfer the input array to the GPU
66 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
67 | 
68 |     GpuTimer timer;
69 |     timer.Start();
70 |     // launch the kernel: one block of ARRAY_SIZE threads, one shared word per thread
71 |     reduce<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(unsigned int)>>>
72 |         (d_out_warp, d_in);
73 |     timer.Stop();
74 | 
75 |     printf("Your code executed in %g ms\n", timer.Elapsed());  
76 | 
77 |     unsigned int h_out_warp;
78 |     // copy back the sum from GPU
79 |     cudaMemcpy(&h_out_warp, d_out_warp, sizeof(unsigned int), 
80 |                cudaMemcpyDeviceToHost);
81 | 
82 |     // compare your result against the expected reduce sum (return value not used)
83 |     compare(h_out_warp, sum);
84 | 
85 |     // free GPU memory allocation
86 |     cudaFree(d_in);
87 |     cudaFree(d_out_warp);
88 |         
89 | }
90 | 
91 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # One executable per snippet, each built from a single .cu file with
 9 | # cuda_add_executable (provided by the top-level find_package(CUDA)).
10 | cuda_add_executable(Lesson2_atomics atomics.cu)
11 | 
12 | cuda_add_executable(Lesson2_memory memory.cu)
13 | 
14 | cuda_add_executable(Lesson2_hello_world hello.cu)
15 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/associative.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | 
3 | int main(int argc,char **argv) {   // prints one three-term sum grouped two ways
4 |     const double big = 1e99;
5 |     const double left_grouped  = (1.f + big) + -big;
6 |     const double right_grouped = 1.f + (big + -big);
7 |     printf("(%g + %g) + %g == %g\n%g + (%g + %g) == %g\n", 1.f, big, -big, left_grouped, 1.f, big, -big, right_grouped);
8 |     return 0;
9 | }


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/atomics.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <cuda.h>
 3 | #include <cuda_runtime.h>
 4 | #include <cuda_runtime_api.h>
 5 | #include <device_launch_parameters.h>
 6 | #include <device_functions.h>
 7 | #include "gputimer.h"
 8 | 
 9 | #define NUM_THREADS 1000000
10 | #define ARRAY_SIZE  100
11 | 
12 | #define BLOCK_WIDTH 1000
13 | 
14 | // Writes the first `size` entries of `array` to stdout as "{ a b c }" plus a newline.
15 | void print_array(int *array, int size) {
16 |     printf("{ ");
17 |     for (int idx = 0; idx < size; ++idx) printf("%d ", array[idx]);
18 |     printf("}\n");
19 | }
20 | 
21 | // Non-atomic variant: the slot update is a separate read, add and write.
22 | __global__ void increment_naive(int *g) {
23 |     // global thread index across the whole launch
24 |     const int tid  = blockIdx.x * blockDim.x + threadIdx.x;
25 |     // fold the index onto ARRAY_SIZE slots; indices ARRAY_SIZE apart share a slot
26 |     const int slot = tid % ARRAY_SIZE;
27 |     const int seen = g[slot];   // read ...
28 |     g[slot] = seen + 1;         // ... then write back; nothing orders this against other threads
29 | }
30 | 
31 | // Atomic variant: the slot is incremented with a single atomicAdd.
32 | __global__ void increment_atomic(int *g) {
33 |     // global thread index across the whole launch
34 |     const int tid  = blockIdx.x * blockDim.x + threadIdx.x;
35 |     // fold the index onto ARRAY_SIZE slots; indices ARRAY_SIZE apart share a slot
36 |     const int slot = tid % ARRAY_SIZE;
37 |     int * const target = g + slot;
38 |     atomicAdd(target, 1);
39 | }
40 | 
41 | int main(int argc,char **argv)
42 | {   
43 |     GpuTimer timer;
44 |     printf("%d total threads in %d blocks writing into %d array elements\n",
45 |            NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
46 | 
47 |     // declare and allocate host memory
48 |     int h_array[ARRAY_SIZE];
49 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
50 |  
51 |     // declare, allocate, and zero out GPU memory
52 |     int * d_array;
53 |     cudaMalloc((void **) &d_array, ARRAY_BYTES);
54 |     cudaMemset((void *) d_array, 0, ARRAY_BYTES); 
55 | 
56 |     // launch the kernel - comment out one of these (the timer brackets only the launch)
57 |     timer.Start();
58 |     //increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
59 |     increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
60 |     timer.Stop();
61 |     
62 |     // copy back the array of per-slot counts from GPU and print
63 |     cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
64 |     print_array(h_array, ARRAY_SIZE);
65 |     printf("Time elapsed = %g ms\n", timer.Elapsed());
66 |  
67 |     // free GPU memory allocation and exit
68 |     cudaFree(d_array);
69 |     return 0;
70 | }


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | // Stopwatch for GPU work built on two CUDA events: call Start(), enqueue the
 5 | // work, call Stop(), then read Elapsed().  The implicit copy operations are
 6 | // left untouched, so a copied GpuTimer shares (and would re-destroy) the events.
 7 | struct GpuTimer {
 8 |       cudaEvent_t start;   // recorded by Start()
 9 |       cudaEvent_t stop;    // recorded by Stop()
10 | 
11 |       // Creates both events.
12 |       GpuTimer() {
13 |             cudaEventCreate(&start);
14 |             cudaEventCreate(&stop);
15 |       }
16 | 
17 |       // Destroys both events.
18 |       ~GpuTimer() {
19 |             cudaEventDestroy(start);
20 |             cudaEventDestroy(stop);
21 |       }
22 | 
23 |       // Records the start event on stream 0.
24 |       void Start() { cudaEventRecord(start, 0); }
25 | 
26 |       // Records the stop event on stream 0.
27 |       void Stop() { cudaEventRecord(stop, 0); }
28 | 
29 |       // Blocks until the stop event has completed, then returns the
30 |       // start-to-stop interval as reported by cudaEventElapsedTime
31 |       // (the callers in this project print it as milliseconds).
32 |       float Elapsed() {
33 |             float interval;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&interval, start, stop);
36 |             return interval;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/hello.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <cuda.h>
 3 | #include <cuda_runtime.h>
 4 | #include <cuda_runtime_api.h>
 5 | #include <device_launch_parameters.h>
 6 | #include <device_functions.h>
 7 | 
 8 | #define NUM_BLOCKS 4
 9 | #define BLOCK_WIDTH 4
10 | // Kernel: every launched thread prints one line with its thread index and block index.
11 | __global__ void hello()
12 | {
13 |     printf("Hello world! I'm thread %d in block %d\n", threadIdx.x, blockIdx.x);
14 | }
15 | 
16 | // Launches hello() on NUM_BLOCKS blocks of BLOCK_WIDTH threads, waits for the device, then prints a closing line.
17 | int main(int argc,char **argv)
18 | {
19 |     // launch the kernel
20 |     hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();
21 | 
22 |     // wait for the kernel to finish; this forces the device-side printf()s to flush
23 |     cudaDeviceSynchronize();
24 | 
25 |     printf("That's all!\n");
26 | 
27 |     return 0;
28 | }
29 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/hello_blockIdx.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define NUM_BLOCKS 16
 4 | #define BLOCK_WIDTH 1
 5 | // Kernel: every launched thread prints one line with the index of its block.
 6 | __global__ void hello()
 7 | {
 8 |     printf("Hello world! I'm a thread in block %d\n", blockIdx.x);
 9 | }
10 | 
11 | // Launches hello() on NUM_BLOCKS blocks of BLOCK_WIDTH threads, waits for the device, then prints a closing line.
12 | int main(int argc,char **argv)
13 | {
14 |     // launch the kernel
15 |     hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();
16 | 
17 |     // wait for the kernel to finish; this forces the device-side printf()s to flush
18 |     cudaDeviceSynchronize();
19 | 
20 |     printf("That's all!\n");
21 | 
22 |     return 0;
23 | }
24 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/hello_threadIdx.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define NUM_BLOCKS 1
 4 | #define BLOCK_WIDTH 256
 5 | // Kernel: every launched thread prints one line with its index inside the block.
 6 | __global__ void hello()
 7 | {
 8 |     printf("Hello world! I'm thread %d\n", threadIdx.x);
 9 | }
10 | 
11 | // Launches hello() on NUM_BLOCKS blocks of BLOCK_WIDTH threads, waits for the device, then prints a closing line.
12 | int main(int argc,char **argv)
13 | {
14 |     // launch the kernel
15 |     hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();
16 | 
17 |     // wait for the kernel to finish; this forces the device-side printf()s to flush
18 |     cudaDeviceSynchronize();
19 | 
20 |     printf("That's all!\n");
21 | 
22 |     return 0;
23 | }
24 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/memory.cu:
--------------------------------------------------------------------------------
  1 | // Using different memory spaces in CUDA
  2 | #include <stdio.h>
  3 | #include <cuda.h>
  4 | #include <cuda_runtime.h>
  5 | #include <cuda_runtime_api.h>
  6 | #include <device_launch_parameters.h>
  7 | #include <device_functions.h>
  8 | 
  9 | /**********************
 10 |  * using local memory *
 11 |  **********************/
 12 | 
 13 | // Kernel illustrating per-thread variables: it copies its by-value argument into a local and does nothing else. (a __device__ or __global__ function runs on the GPU)
 14 | __global__ void use_local_memory_GPU(float in)
 15 | {
 16 |     float f;    // variable "f" is in local memory and private to each thread
 17 |     f = in;     // parameter "in" is in local memory and private to each thread
 18 |     // ... real code would presumably do other stuff here; as written, f is never read ... 
 19 | }
 20 | 
 21 | /**********************
 22 |  * using global memory *
 23 |  **********************/
 24 | 
 25 | // Kernel: each thread stores 2 * threadIdx.x into array[threadIdx.x]. (a __global__ function runs on the GPU & can be called from host)
 26 | __global__ void use_global_memory_GPU(float *array)
 27 | {
 28 |     // "array" is a pointer into global memory on the device; the index ignores blockIdx, so every block would write the same elements
 29 |     array[threadIdx.x] = 2.0f * (float) threadIdx.x;
 30 | }
 31 | 
 32 | /**********************
 33 |  * using shared memory *
 34 |  **********************/
 35 | 
 36 | // Kernel: stages array[] in shared memory, then caps each element at a value computed from the elements before it. (for clarity, hardcoding 128 threads/elements and omitting out-of-bounds checks)
 37 | __global__ void use_shared_memory_GPU(float *array)
 38 | {
 39 |     // local variables, private to each thread
 40 |     int i, index = threadIdx.x;
 41 |     float average, sum = 0.0f;
 42 | 
 43 |     // __shared__ variables are visible to all threads in the thread block
 44 |     // and have the same lifetime as the thread block
 45 |     __shared__ float sh_arr[128];
 46 | 
 47 |     // copy data from "array" in global memory to sh_arr in shared memory.
 48 |     // here, each thread is responsible for copying a single element.
 49 |     sh_arr[index] = array[index];
 50 | 
 51 |     __syncthreads();    // ensure all the writes to shared memory have completed
 52 | 
 53 |     // now, sh_arr is fully populated. Sum the elements before this thread's own one (sh_arr[0..index-1])
 54 |     for (i=0; i<index; i++) { sum += sh_arr[i]; }
 55 |     average = sum / (index + 1.0f);   // NOTE(review): the sum covers `index` elements but the divisor is index + 1 (thread 0 gets 0 / 1) - confirm this is intended
 56 | 
 57 |     // if array[index] is greater than the value computed above, replace it with that value.
 58 |     // since array[] is in global memory, this change will be seen by the host (and potentially 
 59 |     // other thread blocks, if any)
 60 |     if (array[index] > average) { array[index] = average; }
 61 | 
 62 |     // the following code has NO EFFECT: it modifies shared memory, but 
 63 |     // the resulting modified data is never copied back to global memory
 64 |     // and vanishes when the thread block completes
 65 |     sh_arr[index] = 3.14;
 66 | }
 67 | 
 68 | // Host driver: runs the local-, global- and shared-memory example kernels in turn on one 128-element buffer.
 69 | int main(int argc, char **argv)
 70 | {
 71 |     /*
 72 |      * First, call a kernel that shows using local memory 
 73 |      */
 74 |     use_local_memory_GPU<<<1, 128>>>(2.0f);
 75 | 
 76 |     /*
 77 |      * Next, call a kernel that shows using global memory
 78 |      */
 79 |     float h_arr[128];   // convention: h_ variables live on host
 80 |     float *d_arr;       // convention: d_ variables live on device (GPU global mem)
 81 | 
 82 |     // allocate global memory on the device, place result in "d_arr"
 83 |     cudaMalloc((void **) &d_arr, sizeof(float) * 128);
 84 |     // now copy data from host memory "h_arr" to device memory "d_arr"
 85 |     // (h_arr is not initialized; the kernel below overwrites all 128 elements)
 86 |     cudaMemcpy((void *)d_arr, (void *)h_arr, sizeof(float) * 128, cudaMemcpyHostToDevice);
 87 |     // launch the kernel (1 block of 128 threads)
 88 |     use_global_memory_GPU<<<1, 128>>>(d_arr);  // modifies the contents of array at d_arr
 89 |     // copy the modified array back to the host, overwriting contents of h_arr
 90 |     cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);
 91 |     // ... do other stuff ...
 92 | 
 93 |     /*
 94 |      * Next, call a kernel that shows using shared memory
 95 |      */
 96 | 
 97 |     // as before, pass in a pointer to data in global memory
 98 |     use_shared_memory_GPU<<<1, 128>>>(d_arr); 
 99 |     // copy the modified array back to the host: the source is the device
100 |     // buffer, so the transfer kind has to be device-to-host
101 |     cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);
102 |     // ... do other stuff ...
103 | 
104 |     // release the device allocation before exiting
105 |     cudaFree(d_arr);
106 |     return 0;
107 | }


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # One executable per Lesson 3 example, each built from a single .cu file.
 9 | 
10 | cuda_add_executable(Lesson3_histo histo.cu)
11 | 
12 | cuda_add_executable(Lesson3_reduce reduce.cu)
13 | 
14 | cuda_add_executable(Lesson3_reduce_minmax reduce_minmax.cu)
15 | 
16 | cuda_add_executable(Lesson3_reduce_minmax_2 reduce_minmax_2.cu)
17 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/histo.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <cuda_runtime.h>
  3 | // Integer base-2 logarithm, rounded down, for i >= 1 (returns 0 for i == 0): counts the right shifts needed to reach zero.
  4 | int log2(int i)
  5 | {
  6 |     int shifts = 0;
  7 |     for (i >>= 1; i != 0; i >>= 1) ++shifts;
  8 |     return shifts;
  9 | }
 10 | // Returns the low `bits` bits of w in reversed order; bits of w at position `bits` and above are ignored.
 11 | int bit_reverse(int w, int bits)
 12 | {
 13 |     int reversed = 0;
 14 |     for (int pos = 0; pos < bits; ++pos)
 15 |     {
 16 |         const int isolated = (w & (1 << pos)) >> pos;   // bit `pos` of w, moved down to position 0
 17 |         reversed |= isolated << (bits - 1 - pos);       // place it at the mirrored position
 18 |     }
 19 |     return reversed;
 20 | }
 21 | // Histogram kernel with a plain (non-atomic) increment; each thread bins one input item by value % BIN_COUNT.
 22 | __global__ void naive_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
 23 | {
 24 |     const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 25 |     int *slot = d_bins + (d_in[gid] % BIN_COUNT);
 26 |     // unsynchronized read-modify-write: threads hitting the same bin can overwrite each other's update
 27 |     ++(*slot);
 28 | }
 29 | // Histogram kernel that increments bins with atomicAdd; each thread bins one input item by value % BIN_COUNT.
 30 | __global__ void simple_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
 31 | {
 32 |     const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 33 |     int *slot = d_bins + (d_in[gid] % BIN_COUNT);
 34 |     // the atomic add keeps concurrent increments of the same bin from being lost
 35 |     atomicAdd(slot, 1);
 36 | }
 37 | 
 38 | // Host driver: fills the input with bit-reversed indices, runs the kernel picked by argv[1] (0 = naive, 1 = atomic; default 0) and prints every bin.
 39 | int main(int argc, char **argv)
 40 | {
 41 |     int deviceCount = 0;    // stays 0 (and we bail out below) if the query fails
 42 |     cudaGetDeviceCount(&deviceCount);
 43 |     if (deviceCount == 0) {
 44 |         fprintf(stderr, "error: no devices supporting CUDA.\n");
 45 |         exit(EXIT_FAILURE);
 46 |     }
 47 |     int dev = 0;
 48 |     cudaSetDevice(dev);
 49 |     // report the device; totalGlobalMem is a size_t, so print it at full width (an int cast truncates sizes of 2 GiB and up)
 50 |     cudaDeviceProp devProps;
 51 |     if (cudaGetDeviceProperties(&devProps, dev) == 0)
 52 |     {
 53 |         printf("Using device %d:\n", dev);
 54 |         printf("%s; global mem: %lluB; compute v%d.%d; clock: %d kHz\n",
 55 |                devProps.name, (unsigned long long)devProps.totalGlobalMem, 
 56 |                (int)devProps.major, (int)devProps.minor, 
 57 |                (int)devProps.clockRate);
 58 |     }
 59 | 
 60 |     const int ARRAY_SIZE = 65536;
 61 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
 62 |     const int BIN_COUNT = 16;
 63 |     const int BIN_BYTES = BIN_COUNT * sizeof(int);
 64 | 
 65 |     // generate the input array on the host: element i is i with its log2(ARRAY_SIZE) low bits reversed
 66 |     int h_in[ARRAY_SIZE];
 67 |     for(int i = 0; i < ARRAY_SIZE; i++) {
 68 |         h_in[i] = bit_reverse(i, log2(ARRAY_SIZE));
 69 |     }
 70 |     int h_bins[BIN_COUNT];
 71 |     for(int i = 0; i < BIN_COUNT; i++) {
 72 |         h_bins[i] = 0;
 73 |     }
 74 | 
 75 |     // declare GPU memory pointers
 76 |     int * d_in;
 77 |     int * d_bins;
 78 | 
 79 |     // allocate GPU memory
 80 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
 81 |     cudaMalloc((void **) &d_bins, BIN_BYTES);
 82 | 
 83 |     // transfer the arrays to the GPU (the bins start out as all zeros)
 84 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
 85 |     cudaMemcpy(d_bins, h_bins, BIN_BYTES, cudaMemcpyHostToDevice); 
 86 | 
 87 |     int whichKernel = 0;
 88 |     if (argc == 2) {
 89 |         whichKernel = atoi(argv[1]);
 90 |     }
 91 |         
 92 |     // launch the selected kernel: ARRAY_SIZE / 64 blocks of 64 threads, i.e. one thread per input element
 93 |     switch(whichKernel) {
 94 |     case 0:
 95 |         printf("Running naive histo\n");
 96 |         naive_histo<<<ARRAY_SIZE / 64, 64>>>(d_bins, d_in, BIN_COUNT);
 97 |         break;
 98 |     case 1:
 99 |         printf("Running simple histo\n");
100 |         simple_histo<<<ARRAY_SIZE / 64, 64>>>(d_bins, d_in, BIN_COUNT);
101 |         break;
102 |     default:
103 |         fprintf(stderr, "error: ran no kernel\n");
104 |         exit(EXIT_FAILURE);
105 |     }
106 | 
107 |     // copy the bin counts back from the GPU
108 |     cudaMemcpy(h_bins, d_bins, BIN_BYTES, cudaMemcpyDeviceToHost);
109 | 
110 |     for(int i = 0; i < BIN_COUNT; i++) {
111 |         printf("bin %d: count %d\n", i, h_bins[i]);
112 |     }
113 | 
114 |     // free GPU memory allocation
115 |     cudaFree(d_in);
116 |     cudaFree(d_bins);
117 |         
118 |     return 0;
119 | }
120 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/reduce.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <cuda_runtime.h>
  4 | // Per-block sum done in place in global memory: d_in is overwritten during the reduction and block b's total is stored in d_out[b].
  5 | __global__ void global_reduce_kernel(float * d_out, float * d_in)
  6 | {
  7 |     int myId = threadIdx.x + blockDim.x * blockIdx.x;
  8 |     int tid  = threadIdx.x;
  9 | 
 10 |     // do reduction in global mem: each pass adds the upper half of the active range onto the lower half (an odd active width leaves one element out)
 11 |     for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
 12 |     {
 13 |         if (tid < s)
 14 |         {
 15 |             d_in[myId] += d_in[myId + s];
 16 |         }
 17 |         __syncthreads();        // make sure all adds at one stage are done!
 18 |     }
 19 | 
 20 |     // only thread 0 writes result for this block back to global mem
 21 |     if (tid == 0)
 22 |     {
 23 |         d_out[blockIdx.x] = d_in[myId];
 24 |     }
 25 | }
 26 | // Per-block sum done in shared memory: d_in is only read, and block b's total is stored in d_out[b]. Needs blockDim.x floats of dynamic shared memory.
 27 | __global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
 28 | {
 29 |     // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
 30 |     extern __shared__ float sdata[];
 31 | 
 32 |     int myId = threadIdx.x + blockDim.x * blockIdx.x;
 33 |     int tid  = threadIdx.x;
 34 | 
 35 |     // load shared mem from global mem (one element per thread)
 36 |     sdata[tid] = d_in[myId];
 37 |     __syncthreads();            // make sure entire block is loaded!
 38 | 
 39 |     // do reduction in shared mem: each pass adds the upper half of the active range onto the lower half (an odd active width leaves one element out)
 40 |     for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
 41 |     {
 42 |         if (tid < s)
 43 |         {
 44 |             sdata[tid] += sdata[tid + s];
 45 |         }
 46 |         __syncthreads();        // make sure all adds at one stage are done!
 47 |     }
 48 | 
 49 |     // only thread 0 writes result for this block back to global mem
 50 |     if (tid == 0)
 51 |     {
 52 |         d_out[blockIdx.x] = sdata[0];
 53 |     }
 54 | }
 55 | // Two-stage sum of `size` floats: stage 1 writes one partial sum per block to d_intermediate, stage 2 reduces those partials into d_out[0] with a single block.
 56 | void reduce(float * d_out, float * d_intermediate, float * d_in, 
 57 |             int size, bool usesSharedMemory)
 58 | {   // usesSharedMemory picks shmem_reduce_kernel (true) or global_reduce_kernel (false) for both stages
 59 |     // assumes that size is not greater than maxThreadsPerBlock^2
 60 |     // and that size is a multiple of maxThreadsPerBlock
 61 |     const int maxThreadsPerBlock = 1024;
 62 |     int threads = maxThreadsPerBlock;
 63 |     int blocks = size / maxThreadsPerBlock;
 64 |     if (usesSharedMemory)
 65 |     {   // third launch argument: one float of dynamic shared memory per thread
 66 |         shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
 67 |             (d_intermediate, d_in);
 68 |     }
 69 |     else
 70 |     {   // this kernel reduces in place, so d_in is modified by the call
 71 |         global_reduce_kernel<<<blocks, threads>>>
 72 |             (d_intermediate, d_in);
 73 |     }
 74 |     // now we're down to one block left, so reduce it
 75 |     threads = blocks; // launch one thread for each block in prev step
 76 |     blocks = 1;
 77 |     if (usesSharedMemory)
 78 |     {
 79 |         shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
 80 |             (d_out, d_intermediate);
 81 |     }
 82 |     else
 83 |     {
 84 |         global_reduce_kernel<<<blocks, threads>>>
 85 |             (d_out, d_intermediate);
 86 |     }
 87 | }
 88 | // Host driver: times 100 runs of reduce() using the kernel picked by argv[1] (0 = global memory, 1 = shared memory; default 0) and prints the average.
 89 | int main(int argc, char **argv)
 90 | {
 91 |     int deviceCount = 0;    // stays 0 (and we bail out below) if the query fails
 92 |     cudaGetDeviceCount(&deviceCount);
 93 |     if (deviceCount == 0) {
 94 |         fprintf(stderr, "error: no devices supporting CUDA.\n");
 95 |         exit(EXIT_FAILURE);
 96 |     }
 97 |     int dev = 0;
 98 |     cudaSetDevice(dev);
 99 |     // report the device; totalGlobalMem is a size_t, so print it at full width (an int cast truncates sizes of 2 GiB and up)
100 |     cudaDeviceProp devProps;
101 |     if (cudaGetDeviceProperties(&devProps, dev) == 0)
102 |     {
103 |         printf("Using device %d:\n", dev);
104 |         printf("%s; global mem: %lluB; compute v%d.%d; clock: %d kHz\n",
105 |                devProps.name, (unsigned long long)devProps.totalGlobalMem, 
106 |                (int)devProps.major, (int)devProps.minor, 
107 |                (int)devProps.clockRate);
108 |     }
109 | 
110 |     const int ARRAY_SIZE = 1 << 16;
111 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
112 | 
113 |     // generate the input array on the host
114 |     float h_in[ARRAY_SIZE];
115 |     float sum = 0.0f;           // host-side total; accumulated but not used afterwards
116 |     for(int i = 0; i < ARRAY_SIZE; i++) {
117 |         // generate random float in [-1.0f, 1.0f]
118 |         h_in[i] = -1.0f + (float)rand()/((float)RAND_MAX/2.0f);
119 |         sum += h_in[i];
120 |     }
121 | 
122 |     // declare GPU memory pointers
123 |     float * d_in, * d_intermediate, * d_out;
124 | 
125 |     // allocate GPU memory
126 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
127 |     cudaMalloc((void **) &d_intermediate, ARRAY_BYTES); // overallocated
128 |     cudaMalloc((void **) &d_out, sizeof(float));
129 | 
130 |     // transfer the input array to the GPU
131 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
132 | 
133 |     int whichKernel = 0;
134 |     if (argc == 2) {
135 |         whichKernel = atoi(argv[1]);
136 |     }
137 |         
138 |     cudaEvent_t start, stop;
139 |     cudaEventCreate(&start);
140 |     cudaEventCreate(&stop);
141 |     // launch the kernel 100 times between the two events
142 |     switch(whichKernel) {
143 |     case 0:
144 |         printf("Running global reduce\n");
145 |         cudaEventRecord(start, 0);
146 |         for (int i = 0; i < 100; i++)
147 |         {
148 |             reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
149 |         }
150 |         cudaEventRecord(stop, 0);
151 |         break;
152 |     case 1:
153 |         printf("Running reduce with shared mem\n");
154 |         cudaEventRecord(start, 0);
155 |         for (int i = 0; i < 100; i++)
156 |         {
157 |             reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
158 |         }
159 |         cudaEventRecord(stop, 0);
160 |         break;
161 |     default:
162 |         fprintf(stderr, "error: ran no kernel\n");
163 |         exit(EXIT_FAILURE);
164 |     }
165 |     cudaEventSynchronize(stop);
166 |     float elapsedTime;
167 |     cudaEventElapsedTime(&elapsedTime, start, stop);    
168 |     elapsedTime /= 100.0f;      // 100 trials
169 | 
170 |     // copy back the sum from GPU (the value is fetched but not printed)
171 |     float h_out;
172 |     cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
173 | 
174 |     printf("average time elapsed: %f\n", elapsedTime);
175 |     // release the timing events and the GPU memory allocations
176 |     cudaEventDestroy(start);
177 |     cudaEventDestroy(stop);
178 |     cudaFree(d_in);
179 |     cudaFree(d_intermediate);
180 |     cudaFree(d_out);
181 |     return 0;
182 | }
183 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/reduce_minmax.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <algorithm>
  4 | #include <ctime>
  5 | #include <cuda_runtime.h>
  6 | // Per-block max (is_max == true) or min (is_max == false) done in shared memory; block b's result is stored in d_out[b]. Needs blockDim.x floats of dynamic shared memory.
  7 | __global__ void shmem_reduce_kernel(float * d_out, const float * const d_in, bool is_max)
  8 | {
  9 |   // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
 10 |   extern __shared__ float sdata[];
 11 | 
 12 |   int myId = threadIdx.x + blockDim.x * blockIdx.x;
 13 |   int tid  = threadIdx.x;
 14 | 
 15 |   // load shared mem from global mem (one element per thread)
 16 |   sdata[tid] = d_in[myId];
 17 |   __syncthreads();            // make sure entire block is loaded!
 18 | 
 19 |   // do reduction in shared mem: each pass combines the upper half of the active range into the lower half
 20 |   for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
 21 |   {
 22 |       if (tid < s)
 23 |       {
 24 |           if(is_max)
 25 |             sdata[tid] = max(sdata[tid], sdata[tid + s]);
 26 |           else
 27 |             sdata[tid] = min(sdata[tid], sdata[tid + s]);
 28 |       }
 29 |       __syncthreads();        // make sure all comparisons at one stage are done!
 30 |   }
 31 | 
 32 |   // only thread 0 writes result for this block back to global mem
 33 |   if (tid == 0)
 34 |   {
 35 |       d_out[blockIdx.x] = sdata[0];
 36 |   }
 37 | }
 38 | // Computes the max and the min of `length` device floats with two two-stage reductions; results go to *max_logLum and *min_logLum, which the kernels write on the device.
 39 | void reduce(float *min_logLum, float *max_logLum, const float* const d_logLuminance, int length)
 40 | {
 41 |   // stage 1 uses blocks of m threads; stage 2 uses a single block with one thread per stage-1 block
 42 |   const int m = 1 << 10;
 43 |   int blocks = ceil((float)length / m);   // NOTE(review): the kernel has no bounds check, so this presumably expects length to be a multiple of m - confirm
 44 |   float *d_intermediate; // should not modify d_in
 45 |   cudaMalloc(&d_intermediate, sizeof(float)* blocks); // one partial result per block; reused for the max pass and then the min pass
 46 | 
 47 |   shmem_reduce_kernel<<<blocks, m, m * sizeof(float)>>>(d_intermediate, d_logLuminance, true);
 48 |   shmem_reduce_kernel<<<1, blocks, blocks * sizeof(float)>>>(max_logLum, d_intermediate, true);
 49 | 
 50 |   shmem_reduce_kernel<<<blocks, m, m * sizeof(float)>>>(d_intermediate, d_logLuminance, false);
 51 |   shmem_reduce_kernel<<<1, blocks, blocks * sizeof(float)>>>(min_logLum, d_intermediate, false);
 52 | 
 53 |   cudaFree(d_intermediate);
 54 | }
 55 | // Host driver: fills an array with random floats, computes its max and min on the GPU and on the CPU, and prints both pairs.
 56 | int main(int argc, char **argv)
 57 | {
 58 |     int deviceCount = 0;    // stays 0 (and we bail out below) if the query fails
 59 |     cudaGetDeviceCount(&deviceCount);
 60 |     if (deviceCount == 0) {
 61 |         fprintf(stderr, "error: no devices supporting CUDA.\n");
 62 |         exit(EXIT_FAILURE);
 63 |     }
 64 |     int dev = 0;
 65 |     cudaSetDevice(dev);
 66 |     // report the device; totalGlobalMem is a size_t, so print it at full width (an int cast truncates sizes of 2 GiB and up)
 67 |     cudaDeviceProp devProps;
 68 |     if (cudaGetDeviceProperties(&devProps, dev) == 0)
 69 |     {
 70 |         printf("Using device %d:\n", dev);
 71 |         printf("%s; global mem: %lluB; compute v%d.%d; clock: %d kHz\n",
 72 |                devProps.name, (unsigned long long)devProps.totalGlobalMem, 
 73 |                (int)devProps.major, (int)devProps.minor, 
 74 |                (int)devProps.clockRate);
 75 |     }
 76 | 
 77 |     const int ARRAY_SIZE = 1 << 16;
 78 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
 79 | 
 80 |     // generate the input array on the host
 81 |     float h_in[ARRAY_SIZE];
 82 |     float sum = 0.0f;           // host-side total; accumulated but not used afterwards
 83 |     srand((unsigned)time(0));
 84 |     for(int i = 0; i < ARRAY_SIZE; i++) {
 85 |         // generate random float in [-1.0f, 1.0f]
 86 |         h_in[i] = -1.0f + (float)rand()/((float)RAND_MAX/2.0f);
 87 |         sum += h_in[i];
 88 |     }
 89 | 
 90 |     // declare GPU memory pointers
 91 |     float *d_in;
 92 | 
 93 |     // allocate GPU memory
 94 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
 95 | 
 96 |     // transfer the input array to the GPU
 97 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
 98 | 
 99 |     // launch the kernels; reduce() writes its results into the two one-float device buffers
100 |     printf("Running reduce\n");
101 |     float *d_min, *d_max;
102 |     cudaMalloc((void **) &d_min, sizeof(float));
103 |     cudaMalloc((void **) &d_max, sizeof(float));
104 |     reduce(d_min, d_max, d_in, ARRAY_SIZE);
105 | 
106 |     // copy back the min and max from GPU
107 |     float h_min, h_max;
108 |     cudaMemcpy(&h_min, d_min, sizeof(float), cudaMemcpyDeviceToHost);
109 |     cudaMemcpy(&h_max, d_max, sizeof(float), cudaMemcpyDeviceToHost);
110 | 
111 |     printf("Max_GPU: %f  Min_GPU: %f\n", h_max, h_min);
112 |     h_max = h_in[0]; h_min = h_in[0];   // recompute both on the CPU for comparison
113 |     for (size_t i = 1; i < ARRAY_SIZE; ++i) {
114 |         h_max = std::max(h_in[i], h_max);
115 |         h_min = std::min(h_in[i], h_min);
116 |     }
117 |     printf("Max_CPU: %f  Min_CPU: %f\n", h_max, h_min);
118 | 
119 |     // free GPU memory allocation
120 |     cudaFree(d_in);
121 |     cudaFree(d_min);
122 |     cudaFree(d_max);
123 |     return 0;
124 | }
125 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/reduce_minmax_2.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <algorithm>
  4 | #include <cuda_runtime.h>
  5 | // Per-block max AND min in one pass: block b writes its max to d_out[b] and its min to d_out[gridDim.x + b]. Needs blockDim.x floats of dynamic shared memory.
  6 | __global__ void shmem_reduce_kernel(float * d_out, const float * const d_in)
  7 | {
  8 |   // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
  9 |   extern __shared__ float sdata[];
 10 | 
 11 |   int myId = threadIdx.x + blockDim.x * blockIdx.x;
 12 |   int tid = threadIdx.x;
 13 | 
 14 |   // load shared mem from global mem
 15 |   sdata[tid] = d_in[myId];
 16 |   __syncthreads();            // make sure entire block is loaded!
 17 | 
 18 |   // do reduction in shared mem
 19 |   unsigned int s = blockDim.x / 2;
 20 |   // step 1: split into two halves - the lower half keeps each pair's max, the upper half its min
 21 |   if (tid < s) {
 22 |     float temp = sdata[tid];
 23 |     sdata[tid] = max(temp, sdata[tid + s]);
 24 |     sdata[tid + s] = min(temp, sdata[tid + s]);
 25 |   }
 26 |   __syncthreads();        // make sure the whole split is done before the halves are reduced
 27 | 
 28 |   // step 2: reduce both halves side by side - max over the lower half, min over the upper half
 29 |   for (s = s / 2; s > 0; s >>= 1)
 30 |   {
 31 |     if (tid < s) {
 32 |       sdata[tid] = max(sdata[tid], sdata[tid + s]);
 33 |     }
 34 |     else if (tid >= blockDim.x / 2 && tid < blockDim.x / 2 + s) {
 35 |       sdata[tid] = min(sdata[tid], sdata[tid + s]);
 36 |     }
 37 |     __syncthreads();        // make sure all comparisons at one stage are done!
 38 |   }
 39 | 
 40 |   // only thread 0 writes the results for this block back to global mem
 41 |   if (tid == 0)
 42 |   {
 43 |     d_out[blockIdx.x] = sdata[0];
 44 |     // the minima follow one max per block, so they start at gridDim.x (the block count), not blockDim.x
 45 |     d_out[gridDim.x + blockIdx.x] = sdata[blockDim.x / 2];
 46 |   }
 47 | }
 48 | // Final pass: reads blockDim.x maxima followed by blockDim.x minima from d_in and writes the overall max and min. Indexes by threadIdx.x only; needs 2 * blockDim.x floats of dynamic shared memory.
 49 | __global__ void shmem_reduce_finish_kernel(float *min_logLum, 
 50 |   float *max_logLum, const float * const d_in)
 51 | {
 52 |   // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
 53 |   extern __shared__ float sdata[];
 54 |   int tid = threadIdx.x;
 55 | 
 56 |   // load shared mem from global mem: maxima into [0, blockDim.x), minima into [blockDim.x, 2 * blockDim.x)
 57 |   sdata[tid] = d_in[tid];
 58 |   sdata[tid + blockDim.x] = d_in[tid + blockDim.x];
 59 |   __syncthreads();            // make sure entire block is loaded!
 60 | 
 61 |   // do reduction in shared mem: each thread below s advances the max range and the min range together
 62 |   for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
 63 |   {
 64 |     if (tid < s) {
 65 |       sdata[tid] = max(sdata[tid], sdata[tid + s]);
 66 |       sdata[tid + blockDim.x] = min(sdata[tid + blockDim.x], sdata[tid + blockDim.x + s]);
 67 |     }
 68 |     __syncthreads();        // make sure all comparisons at one stage are done!
 69 |   }
 70 | 
 71 |   // only thread 0 writes the two results back to global mem
 72 |   if (tid == 0)
 73 |   {
 74 |     *max_logLum = sdata[0];
 75 |     *min_logLum = sdata[blockDim.x];
 76 |   }
 77 | }
 78 | // Computes the max and the min of `length` device floats with one per-block kernel plus one finishing kernel; results go to *max_logLum and *min_logLum, which the kernels write on the device.
 79 | void reduce(float *min_logLum, float *max_logLum, const float* const d_logLuminance, int length)
 80 | {
 81 |   // stage 1 uses blocks of m threads; the finishing kernel uses a single block with one thread per stage-1 block
 82 |   const int m = 1 << 6;
 83 |   int blocks = ceil((float)length / m);   // NOTE(review): the kernel has no bounds check, so this presumably expects length to be a multiple of m - confirm
 84 |   float *d_intermediate; // should not modify d_in
 85 |   cudaMalloc(&d_intermediate, sizeof(float)* blocks * 2); // room for one max and one min per block
 86 |   shmem_reduce_kernel<<<blocks, m, m * sizeof(float)>>>(d_intermediate, d_logLuminance);
 87 |   shmem_reduce_finish_kernel<<<1, blocks, 2 * blocks*sizeof(float)>>>(min_logLum, max_logLum, d_intermediate);
 88 |   cudaFree(d_intermediate);
 89 | }
 90 | // Host driver: fills an array with random floats, computes its max and min on the GPU and on the CPU, and prints both pairs.
 91 | int main(int argc, char **argv)
 92 | {
 93 |     int deviceCount = 0;    // stays 0 (and we bail out below) if the query fails
 94 |     cudaGetDeviceCount(&deviceCount);
 95 |     if (deviceCount == 0) {
 96 |         fprintf(stderr, "error: no devices supporting CUDA.\n");
 97 |         exit(EXIT_FAILURE);
 98 |     }
 99 |     int dev = 0;
100 |     cudaSetDevice(dev);
101 |     // report the device; totalGlobalMem is a size_t, so print it at full width (an int cast truncates sizes of 2 GiB and up)
102 |     cudaDeviceProp devProps;
103 |     if (cudaGetDeviceProperties(&devProps, dev) == 0)
104 |     {
105 |         printf("Using device %d:\n", dev);
106 |         printf("%s; global mem: %lluB; compute v%d.%d; clock: %d kHz\n",
107 |                devProps.name, (unsigned long long)devProps.totalGlobalMem, 
108 |                (int)devProps.major, (int)devProps.minor, 
109 |                (int)devProps.clockRate);
110 |     }
111 | 
112 |     const int ARRAY_SIZE = 1 << 12;
113 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
114 | 
115 |     // generate the input array on the host
116 |     float h_in[ARRAY_SIZE];
117 |     float sum = 0.0f;           // host-side total; accumulated but not used afterwards
118 |     for(int i = 0; i < ARRAY_SIZE; i++) {
119 |         // generate random float in [-1.0f, 1.0f]
120 |         h_in[i] = -1.0f + (float)rand()/((float)RAND_MAX/2.0f);
121 |         sum += h_in[i];
122 |     }
123 | 
124 |     // declare GPU memory pointers
125 |     float *d_in;
126 | 
127 |     // allocate GPU memory
128 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
129 | 
130 |     // transfer the input array to the GPU
131 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
132 | 
133 |     // launch the kernels; reduce() writes its results into the two one-float device buffers
134 |     printf("Running reduce\n");
135 |     float *d_min, *d_max;
136 |     cudaMalloc((void **) &d_min, sizeof(float));
137 |     cudaMalloc((void **) &d_max, sizeof(float));
138 |     reduce(d_min, d_max, d_in, ARRAY_SIZE);
139 | 
140 |     // copy back the min and max from GPU
141 |     float h_min, h_max;
142 |     cudaMemcpy(&h_min, d_min, sizeof(float), cudaMemcpyDeviceToHost);
143 |     cudaMemcpy(&h_max, d_max, sizeof(float), cudaMemcpyDeviceToHost);
144 | 
145 |     printf("Max_GPU: %f  Min_GPU: %f\n", h_max, h_min);
146 |     h_max = h_in[0]; h_min = h_in[0];   // recompute both on the CPU for comparison
147 |     for (size_t i = 1; i < ARRAY_SIZE; ++i) {
148 |         h_max = std::max(h_in[i], h_max);
149 |         h_min = std::min(h_in[i], h_min);
150 |     }
151 |     printf("Max_CPU: %f  Min_CPU: %f\n", h_max, h_min);
152 | 
153 |     // free GPU memory allocation
154 |     cudaFree(d_in);
155 |     cudaFree(d_min);
156 |     cudaFree(d_max);
157 |     return 0;
158 | }
159 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 5 Code Snippets/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # One executable per Lesson 5 example; the transpose target also lists the timer header it uses.
 9 | 
10 | cuda_add_executable(Lesson5_deviceQuery deviceQuery_simplified.cpp)
11 | 
12 | cuda_add_executable(Lesson5_transpose transpose.cu gputimer.h)
13 | 
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 5 Code Snippets/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | // Times GPU work with a pair of CUDA events: call Start(), enqueue the work, call Stop(), then read Elapsed().
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;   // recorded by Start()
 9 |   cudaEvent_t stop;    // recorded by Stop()
10 |   // Creates both events.
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 |   // Destroys both events.
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 |   // Records the start event on stream 0.
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 |   // Records the stop event on stream 0.
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 |   // Waits for the stop event, then returns the time between the two events as reported by cudaEventElapsedTime (milliseconds).
33 |   float Elapsed()
34 |   {
35 |     float elapsed = 0.0f;   // start from 0 so a failed query does not return an indeterminate value
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # collect source files
 9 | 
# Tiling sample: the kernel source plus every header found under tiling/.
file(GLOB Lesson7_tiling_hdr
  tiling/*.hpp
  tiling/*.h
)
set(Lesson7_tiling_files
  tiling/tiling.cu
)
cuda_add_executable(Lesson7_tiling
  ${Lesson7_tiling_files}
  ${Lesson7_tiling_hdr}
)

# Thrust sample: the example source, the timing helper and the thrust/ headers.
file(GLOB Lesson7_thrust_hdr
  thrust/*.h
)
set(Lesson7_thrust_files
  thrust/thrust_example.cu
  thrust/gettime.cc
)
cuda_add_executable(Lesson7_thrust
  ${Lesson7_thrust_files}
  ${Lesson7_thrust_hdr}
)

# OpenCV sample: the example source, the timing helper and the opencv/ headers.
file(GLOB Lesson7_opencv_hdr
  opencv/*.h
)
set(Lesson7_opencv_files
  opencv/opencv.cu
  opencv/gettime.cc
)
cuda_add_executable(Lesson7_opencv
  ${Lesson7_opencv_files}
  ${Lesson7_opencv_hdr}
)
21 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/cub/example_block_scan_cum.cu:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /******************************************************************************
 30 |  * Simple demonstration of cub::BlockScan
 31 |  *
 32 |  * Example compilation string:
 33 |  *
 34 |  * nvcc example_block_scan_sum.cu -gencode=arch=compute_20,code=\"sm_20,compute_20\" -o example_block_scan_sum
 35 |  *
 36 |  ******************************************************************************/
 37 | 
 38 | // Ensure printing of CUDA runtime errors to console (define before including cub.h)
 39 | #define CUB_STDERR
 40 | 
 41 | #include <stdio.h>
 42 | #include <iostream>
 43 | 
 44 | #include <cub/cub.cuh>
 45 | 
 46 | using namespace cub;
 47 | 
 48 | //---------------------------------------------------------------------
 49 | // Globals, constants and typedefs
 50 | //---------------------------------------------------------------------
 51 | 
 52 | bool g_verbose      = false;
 53 | int g_iterations    = 100;
 54 | 
 55 | 
 56 | //---------------------------------------------------------------------
 57 | // Kernels
 58 | //---------------------------------------------------------------------
 59 | 
 60 | /**
 61 |  * Simple kernel for performing a block-wide exclusive prefix sum over integers
 62 |  */
 63 | template <
 64 |     int         BLOCK_THREADS,
 65 |     int         ITEMS_PER_THREAD>
 66 | __global__ void BlockPrefixSumKernel(
 67 |     int         *d_in,          // Tile of input
 68 |     int         *d_out,         // Tile of output
 69 |     clock_t     *d_elapsed)     // Elapsed cycle count of block scan
 70 | {
 71 |     // Parameterize BlockScan type for our thread block
 72 |     typedef BlockScan<int, BLOCK_THREADS> BlockScanT;
 73 | 
 74 |     // Shared memory
 75 |     __shared__ typename BlockScanT::SmemStorage smem_storage;
 76 | 
 77 |     // Per-thread tile data
 78 |     int data[ITEMS_PER_THREAD];
 79 |     BlockLoadVectorized(d_in, data);
 80 | 
 81 |     // Start cycle timer
 82 |     clock_t start = clock();
 83 | 
 84 |     // Compute exclusive prefix sum
 85 |     int aggregate;
 86 |     BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate);
 87 | 
 88 |     // Stop cycle timer
 89 |     clock_t stop = clock();
 90 | 
 91 |     // Store output
 92 |     BlockStoreVectorized(d_out, data);
 93 | 
 94 |     // Store aggregate and elapsed clocks
 95 |     if (threadIdx.x == 0)
 96 |     {
 97 |         *d_elapsed = (start > stop) ? start - stop : stop - start;
 98 |         d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
 99 |     }
100 | }
101 | 
102 | 
103 | 
104 | //---------------------------------------------------------------------
105 | // Host utilities
106 | //---------------------------------------------------------------------
107 | 
/**
 * Build an exclusive prefix sum problem and its reference solution.
 *
 * h_in[i] is set to i % 17 and h_reference[i] to the sum of all inputs
 * before position i. Returns the sum of all num_elements inputs.
 */
int Initialize(
    int *h_in,
    int *h_reference,
    int num_elements)
{
    int running_total = 0;
    int pos = 0;

    while (pos < num_elements)
    {
        h_in[pos] = pos % 17;

        // Exclusive scan: record the total *before* adding this element
        h_reference[pos] = running_total;
        running_total += h_in[pos];

        ++pos;
    }

    return running_total;
}
129 | 
130 | 
/**
 * Run and verify one block-scan configuration.
 *
 * Builds a tile of BLOCK_THREADS * ITEMS_PER_THREAD integers on the host,
 * launches BlockPrefixSumKernel on it g_iterations times with a single block
 * of BLOCK_THREADS threads, compares the last launch's output and aggregate
 * against the host reference, and prints the average clock count per item.
 * Extra data is printed when g_verbose is set.
 */
template <
    int BLOCK_THREADS,
    int ITEMS_PER_THREAD>
void Test()
{
    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;

    // Allocate host arrays (h_gpu has one extra slot for the aggregate)
    int *h_in           = new int[TILE_SIZE];
    int *h_reference    = new int[TILE_SIZE];
    int *h_gpu          = new int[TILE_SIZE + 1];

    // Initialize problem and reference output on host
    int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);

    // Initialize device arrays
    int *d_in           = NULL;
    int *d_out          = NULL;
    clock_t *d_elapsed  = NULL;
    cudaMalloc((void**)&d_in,          sizeof(int) * TILE_SIZE);
    cudaMalloc((void**)&d_out,         sizeof(int) * (TILE_SIZE + 1));
    cudaMalloc((void**)&d_elapsed,     sizeof(clock_t));

    // Display input problem data
    if (g_verbose)
    {
        printf("Input data: ");
        for (int i = 0; i < TILE_SIZE; i++)
            printf("%d, ", h_in[i]);
        printf("\n\n");
    }

    // Copy problem to device
    cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);

    printf("BlockScan %d items (%d threads, %d items per thread): ",
        TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD);

    // Run this several times and average the performance results
    clock_t elapsed_scan_clocks     = 0;
    for (int i = 0; i < g_iterations; ++i)
    {
        // Run aggregate/prefix kernel
        BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<1, BLOCK_THREADS>>>(
            d_in,
            d_out,
            d_elapsed);

        // Copy results from device
        clock_t scan_clocks;
        cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost);
        cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost);
        elapsed_scan_clocks += scan_clocks;
    }

    // Check scanned items (stop at the first mismatch)
    bool correct = true;
    for (int i = 0; i < TILE_SIZE; i++)
    {
        if (h_gpu[i] != h_reference[i])
        {
            printf("Incorrect result @ offset %d (%d != %d)\n",
                i, h_gpu[i], h_reference[i]);
            correct = false;
            break;
        }
    }

    // Check total aggregate
    if (h_gpu[TILE_SIZE] != h_aggregate)
    {
        printf("Incorrect aggregate (%d != %d)\n", h_gpu[TILE_SIZE], h_aggregate);
        correct = false;
    }
    if (correct) printf("Correct!\n");

    // Display results problem data
    if (g_verbose)
    {
        printf("GPU output (reference output): ");
        for (int i = 0; i < TILE_SIZE; i++)
            printf("%d (%d), ", h_gpu[i], h_reference[i]);
        printf("\n");
        // The format string needs one conversion per value; without them the
        // two aggregates were passed but never printed.
        printf("GPU aggregate (reference aggregate): %d (%d)", h_gpu[TILE_SIZE], h_aggregate);
        printf("\n\n");
    }

    // Display timing results
    printf("Average clocks per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations);

    // Cleanup (delete[] accepts null, so no guards are needed on the host side)
    delete[] h_in;
    delete[] h_reference;
    delete[] h_gpu;
    if (d_in) cudaFree(d_in);
    if (d_out) cudaFree(d_out);
    if (d_elapsed) cudaFree(d_elapsed);
}
232 | 
233 | 
/**
 * Main: prints the name of device 0 and runs the block-scan tests.
 *
 * Returns 1 if the device properties cannot be queried, 0 otherwise.
 */
int main(int argc, char** argv)
{
    // Display GPU name. The query result is checked so that an unusable
    // device is reported instead of printing an indeterminate name.
    cudaDeviceProp props;
    cudaError_t status = cudaGetDeviceProperties(&props, 0);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    printf("Using device %s\n", props.name);

/** Add tests here **/

    // Run tests
    Test<1024, 1>();
    Test<512, 2>();
    Test<256, 4>();
    Test<128, 8>();
    Test<64, 16>();
    Test<32, 32>();
    Test<16, 64>();

/****/

    return 0;
}
259 | 
260 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/opencv/gettime.cc:
--------------------------------------------------------------------------------
 1 | #define WIN32_LEAN_AND_MEAN
 2 | #include <Windows.h>
 3 | #include <stdint.h> // portable: uint64_t   MSVC: __int64 
 4 | #include "gettime.h"
 5 | 
 6 | int gettimeofday(struct timeval * tp, struct timezone * tzp)
 7 | {
 8 |     // Note: some broken versions only have 8 trailing zero's, the correct epoch has 9 trailing zero's
 9 |     static const uint64_t EPOCH = ((uint64_t) 116444736000000000ULL);
10 | 
11 |     SYSTEMTIME  system_time;
12 |     FILETIME    file_time;
13 |     uint64_t    time;
14 | 
15 |     GetSystemTime( &system_time );
16 |     SystemTimeToFileTime( &system_time, &file_time );
17 |     time =  ((uint64_t)file_time.dwLowDateTime )      ;
18 |     time += ((uint64_t)file_time.dwHighDateTime) << 32;
19 | 
20 |     tp->tv_sec  = (long) ((time - EPOCH) / 10000000L);
21 |     tp->tv_usec = (long) (system_time.wMilliseconds * 1000);
22 |     return 0;
23 | }
24 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/opencv/gettime.h:
--------------------------------------------------------------------------------
 1 | #ifndef GETTIME_H
 2 | #define GETTIME_H
 3 | 
 4 | #include <winsock.h>
 5 | 
 6 | // MSVC defines this in winsock2.h!?
 7 | /*struct timeval {
 8 |     long tv_sec;
 9 |     long tv_usec;
10 | };
11 | */
12 | int gettimeofday(struct timeval * tp, struct timezone * tzp);
13 | 
14 | double tic();
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/opencv/opencv.cu:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include "gettime.h"
 3 | #include <string>
 4 | #include <stdio.h>
 5 | #include <algorithm>
 6 | #include <opencv2/core/core.hpp>
 7 | #include <opencv2/highgui/highgui.hpp>
 8 | #include <opencv2/opencv.hpp>
 9 | #include <opencv2/cudaimgproc.hpp>
10 | 
11 | using namespace std;
12 | using namespace cv;
13 | using namespace cv::cuda;
14 | 
// Draws every segment (x1, y1, x2, y2) onto canvas in red (BGR 0,0,255),
// 3 pixels thick, anti-aliased.
static void drawSegments(Mat &canvas, const vector<Vec4i> &segments)
{
  for (size_t k = 0; k < segments.size(); ++k)
  {
    const Vec4i seg = segments[k];
    line(canvas, Point(seg[0], seg[1]), Point(seg[2], seg[3]), Scalar(0, 0, 255), 3, LINE_AA);
  }
}

// Compares probabilistic Hough line detection on the CPU and on the GPU:
// loads a greyscale image, extracts Canny edges, runs both detectors while
// timing them, and shows the source plus both annotated results.
int main(int argc, char **argv) {

  cv::Mat src = cv::imread("IMAG0179_small.jpg", cv::IMREAD_GRAYSCALE);
  if (src.data == NULL) {
    printf("failed opening jpg\n");
    exit(1);
  }

  // Edge map shared by both detectors.
  Mat mask;
  cv::Canny(src, mask, 100, 200, 3);

  // Colour canvases on which the detected segments are drawn.
  Mat dst_cpu;
  cv::cvtColor(mask, dst_cpu, COLOR_GRAY2BGR);
  Mat dst_gpu = dst_cpu.clone();

  // ---- CPU detection ----
  vector<Vec4i> lines_cpu;
  const int64 cpu_begin = getTickCount();
  cv::HoughLinesP(mask, lines_cpu, 1, CV_PI / 180, 50, 60, 5);
  const double cpu_seconds = (getTickCount() - cpu_begin) / getTickFrequency();
  cout << "CPU Time : " << cpu_seconds * 1000 << " ms" << endl;
  cout << "CPU Found : " << lines_cpu.size() << endl;

  drawSegments(dst_cpu, lines_cpu);

  // ---- GPU detection (timing includes creating the detector) ----
  GpuMat d_src(mask);
  GpuMat d_lines;
  const int64 gpu_begin = getTickCount();
  Ptr<cuda::HoughSegmentDetector> hough = cuda::createHoughSegmentDetector(1.0f, (float)(CV_PI / 180.0f), 60, 5);
  hough->detect(d_src, d_lines);
  const double gpu_seconds = (getTickCount() - gpu_begin) / getTickFrequency();
  cout << "GPU Time : " << gpu_seconds * 1000 << " ms" << endl;
  cout << "GPU Found : " << d_lines.cols << endl;

  // Download the segments straight into the vector's storage.
  vector<Vec4i> lines_gpu;
  if (!d_lines.empty())
  {
    lines_gpu.resize(d_lines.cols);
    Mat h_lines(1, d_lines.cols, CV_32SC4, &lines_gpu[0]);
    d_lines.download(h_lines);
  }

  drawSegments(dst_gpu, lines_gpu);

  imshow("source", src);
  imshow("detected lines [CPU]", dst_cpu);
  imshow("detected lines [GPU]", dst_gpu);
  waitKey();

  return 0;
}
81 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gettime.cc:
--------------------------------------------------------------------------------
 1 | #define WIN32_LEAN_AND_MEAN
 2 | #include <Windows.h>
 3 | #include <stdint.h> // portable: uint64_t   MSVC: __int64 
 4 | #include "gettime.h"
 5 | 
 6 | int gettimeofday(struct timeval * tp, struct timezone * tzp)
 7 | {
 8 |     // Note: some broken versions only have 8 trailing zero's, the correct epoch has 9 trailing zero's
 9 |     static const uint64_t EPOCH = ((uint64_t) 116444736000000000ULL);
10 | 
11 |     SYSTEMTIME  system_time;
12 |     FILETIME    file_time;
13 |     uint64_t    time;
14 | 
15 |     GetSystemTime( &system_time );
16 |     SystemTimeToFileTime( &system_time, &file_time );
17 |     time =  ((uint64_t)file_time.dwLowDateTime )      ;
18 |     time += ((uint64_t)file_time.dwHighDateTime) << 32;
19 | 
20 |     tp->tv_sec  = (long) ((time - EPOCH) / 10000000L);
21 |     tp->tv_usec = (long) (system_time.wMilliseconds * 1000);
22 |     return 0;
23 | }
24 | 
25 | /*double tic() {
26 |   struct timeval t;
27 |   gettimeofday(&t, NULL);
28 |   return ((double)t.tv_sec * 1000 + ((double)t.tv_usec) / 1000.);
29 | }*/
30 | 
31 | double tic() {
32 |   LARGE_INTEGER m_nFreq;
33 |   LARGE_INTEGER m_Time;
34 |   QueryPerformanceFrequency(&m_nFreq);
35 |   QueryPerformanceCounter(&m_Time);
36 |   return (double)m_Time.QuadPart * 1000. / m_nFreq.QuadPart;
37 | }


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gettime.h:
--------------------------------------------------------------------------------
 1 | #ifndef GETTIME_H
 2 | #define GETTIME_H
 3 | 
 4 | #include <winsock.h>
 5 | 
 6 | // MSVC defines this in winsock2.h!?
 7 | /*struct timeval {
 8 |     long tv_sec;
 9 |     long tv_usec;
10 | };
11 | */
12 | int gettimeofday(struct timeval * tp, struct timezone * tzp);
13 | double tic();
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | struct GpuTimer
 5 | {
 6 |       cudaEvent_t start;
 7 |       cudaEvent_t stop;
 8 |  
 9 |       GpuTimer()
10 |       {
11 |             cudaEventCreate(&start);
12 |             cudaEventCreate(&stop);
13 |       }
14 |  
15 |       ~GpuTimer()
16 |       {
17 |             cudaEventDestroy(start);
18 |             cudaEventDestroy(stop);
19 |       }
20 |  
21 |       void Start()
22 |       {
23 |             cudaEventRecord(start, 0);
24 |       }
25 |  
26 |       void Stop()
27 |       {
28 |             cudaEventRecord(stop, 0);
29 |       }
30 |  
31 |       float Elapsed()
32 |       {
33 |             float elapsed;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&elapsed, start, stop);
36 |             return elapsed;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/thrust_example.cu:
--------------------------------------------------------------------------------
 1 | #include <thrust/host_vector.h>
 2 | #include <thrust/device_vector.h>
 3 | #include <thrust/generate.h>
 4 | #include <thrust/sort.h>
 5 | #include <thrust/copy.h>
 6 | #include <algorithm>
 7 | #include <cstdlib>
 8 | #include <vector>
 9 | 
10 | #include "gputimer.h"
11 | #include "gettime.h"
12 | 
// Sorts the same one million pseudo-random chars with thrust::sort and with
// std::sort (both on host vectors), prints how long each took, and exits
// with status 1 at the first position where the two results differ.
int main(void)
{
  // generate N random numbers serially, then duplicate them so both sorts
  // start from identical input
  const int N = 1000000;
  std::vector<char> h_vec(N);
  std::generate(h_vec.begin(), h_vec.end(), rand);
  std::vector<char> h_vec_std(h_vec);

  // thrust::sort timing
  const double thrust_begin = tic();
  thrust::sort(h_vec.begin(), h_vec.end());
  const double thrust_ms = tic() - thrust_begin;
  std::cout << "thrust::sort took " << thrust_ms << " ms." << std::endl;

  // std::sort timing
  const double std_begin = tic();
  std::sort(h_vec_std.begin(), h_vec_std.end());
  const double std_ms = tic() - std_begin;
  std::cout << "std::sort took " << std_ms << " ms." << std::endl;

  // Element-by-element comparison of the two sorted sequences
  int pos = 0;
  while (pos < N) {
    if (!(h_vec[pos] == h_vec_std[pos])) {
      std::cout << pos << " Not same!" << std::endl;
      exit(1);
    }
    ++pos;
  }

  return 0;
}


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/a.exp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/a.exp


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | struct GpuTimer
 5 | {
 6 |       cudaEvent_t start;
 7 |       cudaEvent_t stop;
 8 |  
 9 |       GpuTimer()
10 |       {
11 |             cudaEventCreate(&start);
12 |             cudaEventCreate(&stop);
13 |       }
14 |  
15 |       ~GpuTimer()
16 |       {
17 |             cudaEventDestroy(start);
18 |             cudaEventDestroy(stop);
19 |       }
20 |  
21 |       void Start()
22 |       {
23 |             cudaEventRecord(start, 0);
24 |       }
25 |  
26 |       void Stop()
27 |       {
28 |             cudaEventRecord(stop, 0);
29 |       }
30 |  
31 |       float Elapsed()
32 |       {
33 |             float elapsed;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&elapsed, start, stop);
36 |             return elapsed;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/tiling.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <device_launch_parameters.h>
  3 | #include <device_functions.h>
  4 | #include "gputimer.h"
  5 | #include "utils.h"
  6 | 
  7 | const int BLOCKSIZE	= 128;
  8 | const int NUMBLOCKS = 100;					// set this to 1 or 2 for debugging
  9 | const int N 		= BLOCKSIZE*NUMBLOCKS;
 10 | 
 11 | /* 
 12 |  * TODO: modify the foo and bar kernels to use tiling: 
 13 |  * 		 - copy the input data to shared memory
 14 |  *		 - perform the computation there
 15 |  *	     - copy the result back to global memory
 16 |  *		 - assume thread blocks of 128 threads
 17 |  *		 - handle intra-block boundaries correctly
 18 |  * You can ignore boundary conditions (we ignore the first 2 and last 2 elements)
 19 |  */
 20 | __global__ void foo(float out[], float A[], float B[], float C[], float D[], float E[]){
 21 | 
 22 | 	int i = threadIdx.x + blockIdx.x*blockDim.x; 
 23 | 	
 24 | 	out[i] = (A[i] + B[i] + C[i] + D[i] + E[i]) / 5.0f;
 25 | }
 26 | 
 27 | __global__ void bar(float out[], float in[]) 
 28 | {
 29 | 	int i = threadIdx.x + blockIdx.x*blockDim.x; 
 30 | 
 31 | 	out[i] = (in[i-2] + in[i-1] + in[i] + in[i+1] + in[i+2]) / 5.0f;
 32 | }
 33 | 
 34 | __global__ void bar_tile(float out[], float in[])
 35 | {
 36 |   int i = threadIdx.x + blockIdx.x*blockDim.x;
 37 |   int idx = threadIdx.x;
 38 |   extern __shared__ float sh_din[];  
 39 |   sh_din[idx + 2] = in[i];
 40 |   if (idx == 0) {
 41 |     sh_din[idx] = in[i-2];
 42 |     sh_din[idx+1] = in[i-1];
 43 |   }
 44 |   else if (idx == blockDim.x - 1) {
 45 |     sh_din[idx + 3] = in[i+1];
 46 |     sh_din[idx + 4] = in[i+2];
 47 |   }
 48 |   __syncthreads();
 49 | 
 50 |   out[i] = (sh_din[idx] + sh_din[idx + 1] + sh_din[idx + 2] + sh_din[idx + 3] + sh_din[idx + 4]) / 5.0f;
 51 | }
 52 | 
 53 | __global__ void bar_tile_2(float out[], float in[])
 54 | {
 55 |   int i = threadIdx.x + blockIdx.x*blockDim.x;
 56 |   int idx = threadIdx.x;
 57 |   extern __shared__ float sh_din[];
 58 |   sh_din[idx] = in[i];
 59 |   __syncthreads();
 60 |   if (idx == 0) {
 61 |     out[i] = (in[i - 2] + in[i - 1] + sh_din[idx] + sh_din[idx + 1] + sh_din[idx + 2]) / 5.0f;
 62 |   }
 63 |   else if (idx == 1) {
 64 |     out[i] = (in[i - 2] + sh_din[idx - 1] + sh_din[idx] + sh_din[idx + 1] + sh_din[idx + 2]) / 5.0f;
 65 |   }
 66 |   else if (idx == blockDim.x - 2) {
 67 |     out[i] = (sh_din[idx - 2] + sh_din[idx - 1] + sh_din[idx] + sh_din[idx + 1] + in[i + 2]) / 5.0f;
 68 |   }
 69 |   else if (idx == blockDim.x - 1) {
 70 |     out[i] = (sh_din[idx - 2] + sh_din[idx - 1] + sh_din[idx] + in[i + 1] + in[i + 2]) / 5.0f;
 71 |   }
 72 |   else {
 73 |     out[i] = (sh_din[idx - 2] + sh_din[idx - 1] + sh_din[idx] + sh_din[idx + 1] + sh_din[idx + 2]) / 5.0f;
 74 |   }  
 75 | }
 76 | 
 77 | __global__ void bar_tile_3(float out[], float in[])
 78 | {
 79 |   int idx = threadIdx.x;
 80 |   extern __shared__ float sh_din[];
 81 |   int i_in = blockIdx.x * BLOCKSIZE + idx;
 82 |   sh_din[idx] = in[i_in-2];
 83 |   __syncthreads();
 84 |   if (idx < blockDim.x-4)
 85 |     out[i_in] = (sh_din[idx] + sh_din[idx + 1] + sh_din[idx + 2] + sh_din[idx + 3] + sh_din[idx + 4]) / 5.0f;
 86 | }
 87 | 
 88 | void cpuFoo(float out[], float A[], float B[], float C[], float D[], float E[])
 89 | {
 90 | 	for (int i=0; i<N; i++)
 91 | 	{
 92 | 		out[i] = (A[i] + B[i] + C[i] + D[i] + E[i]) / 5.0f;
 93 | 	}
 94 | }
 95 | 
 96 | void cpuBar(float out[], float in[])
 97 | {
 98 | 	// ignore the boundaries
 99 | 	for (int i=2; i<N-2; i++)
100 | 	{
101 | 		out[i] = (in[i-2] + in[i-1] + in[i] + in[i+1] + in[i+2]) / 5.0f;
102 | 	}
103 | }
104 | 
// Runs foo() and every bar() variant on the GPU, times each launch, and
// checks the results against the CPU references.
//
// Returns 0 when every comparison succeeds and 1 when at least one kernel's
// output deviates from its reference. Any CUDA API or launch error
// terminates the program through checkCudaErrors().
int main(int argc, char **argv)
{
    // declare and fill input arrays for foo() and bar()
    float fooA[N], fooB[N], fooC[N], fooD[N], fooE[N], barIn[N];
    for (int i = 0; i < N; i++)
    {
        fooA[i] = i;
        fooB[i] = i + 1;
        fooC[i] = i + 2;
        fooD[i] = i + 3;
        fooE[i] = i + 4;
        barIn[i] = 2 * i;
    }

    // device arrays
    int numBytes = N * sizeof(float);
    float *d_fooA;  checkCudaErrors(cudaMalloc(&d_fooA, numBytes));
    float *d_fooB;  checkCudaErrors(cudaMalloc(&d_fooB, numBytes));
    float *d_fooC;  checkCudaErrors(cudaMalloc(&d_fooC, numBytes));
    float *d_fooD;  checkCudaErrors(cudaMalloc(&d_fooD, numBytes));
    float *d_fooE;  checkCudaErrors(cudaMalloc(&d_fooE, numBytes));
    float *d_barIn; checkCudaErrors(cudaMalloc(&d_barIn, numBytes));
    checkCudaErrors(cudaMemcpy(d_fooA, fooA, numBytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_fooB, fooB, numBytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_fooC, fooC, numBytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_fooD, fooD, numBytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_fooE, fooE, numBytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_barIn, barIn, numBytes, cudaMemcpyHostToDevice));

    // output arrays for host and device
    float fooOut[N], barOut[N], *d_fooOut, *d_barOut;
    checkCudaErrors(cudaMalloc(&d_fooOut, numBytes));
    checkCudaErrors(cudaMalloc(&d_barOut, numBytes));

    // declare and compute reference solutions
    float ref_fooOut[N], ref_barOut[N];
    cpuFoo(ref_fooOut, fooA, fooB, fooC, fooD, fooE);
    cpuBar(ref_barOut, barIn);

    // Number of kernels whose output did not match the reference.
    int failures = 0;

    // launch and time foo
    GpuTimer fooTimer, barTimer;
    fooTimer.Start();
    foo<<<N/BLOCKSIZE, BLOCKSIZE>>>(d_fooOut, d_fooA, d_fooB, d_fooC, d_fooD, d_fooE);
    fooTimer.Stop();
    // Surface launch-configuration errors at the launch site.
    checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaMemcpy(fooOut, d_fooOut, numBytes, cudaMemcpyDeviceToHost));
    printf("foo<<<>>>(): %g ms elapsed. Verifying solution...", fooTimer.Elapsed());
    failures += compareArrays(ref_fooOut, fooOut, N);

    // untiled bar
    barTimer.Start();
    bar<<<N/BLOCKSIZE, BLOCKSIZE>>>(d_barOut, d_barIn);
    barTimer.Stop();
    checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost));
    printf("bar<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed());
    failures += compareArrays(ref_barOut, barOut, N);

    // tile plus two-element halo on each side in shared memory
    barTimer.Start();
    bar_tile<<<N/BLOCKSIZE, BLOCKSIZE, (BLOCKSIZE + 4) * sizeof(float)>>>(d_barOut, d_barIn);
    barTimer.Stop();
    checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost));
    printf("bar_tile<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed());
    failures += compareArrays(ref_barOut, barOut, N);

    // tile only in shared memory, edges read from global memory
    barTimer.Start();
    bar_tile_2<<<N/BLOCKSIZE, BLOCKSIZE, (BLOCKSIZE) * sizeof(float)>>>(d_barOut, d_barIn);
    barTimer.Stop();
    checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost));
    printf("bar_tile_2<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed());
    failures += compareArrays(ref_barOut, barOut, N);

    // four extra threads per block load the halo
    barTimer.Start();
    bar_tile_3<<<N/BLOCKSIZE, BLOCKSIZE + 4, (BLOCKSIZE + 4) * sizeof(float)>>>(d_barOut, d_barIn);
    barTimer.Stop();
    checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost));
    printf("bar_tile_3<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed());
    failures += compareArrays(ref_barOut, barOut, N);

    checkCudaErrors(cudaFree(d_fooA));
    checkCudaErrors(cudaFree(d_fooB));
    checkCudaErrors(cudaFree(d_fooC));
    checkCudaErrors(cudaFree(d_fooD));
    checkCudaErrors(cudaFree(d_fooE));
    checkCudaErrors(cudaFree(d_barIn));
    checkCudaErrors(cudaFree(d_fooOut));
    checkCudaErrors(cudaFree(d_barOut));

    // Report verification failures through the exit status.
    return failures ? 1 : 0;
}
197 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | #include <algorithm>
12 | 
// Error checking helper: wrap a CUDA runtime call in checkCudaErrors(...) to
// abort with a diagnostic when the call does not return cudaSuccess. The
// macro captures the call's source text, file and line for the message.
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)

// Prints the location, the CUDA error string and the failing expression to
// stderr and exits with status 1 when err is not cudaSuccess; otherwise
// does nothing.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line)
{
  if (err == cudaSuccess)
    return;

  fprintf(stderr, "CUDA error at: %s : %d\n", file, line);
  fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);
  exit(1);
}
25 | 
// Prints the N elements of `in` on one line, separated by spaces.
// Declared inline because it is defined in a header: without it, including
// this header from two translation units produces duplicate definitions.
inline void printArray(float in[], int N)
{
	for (int i=0; i<N; i++) { printf("%g ", in[i]); }
	printf("\n");
}

// Compares `test` against `ref` element by element, skipping the first two
// and last two positions (the boundaries are ignored).
//
// Returns 0 and prints "Verified!" when all compared elements are equal.
// On the first difference it prints the position and both arrays, and
// returns 1. Arrays with fewer than five elements always verify.
// Declared inline for the same header-definition reason as printArray.
inline int compareArrays(float *ref, float *test, int N)
{
	// ignore the boundaries
	for (int i=2; i<N-2; i++)
	{
		if (ref[i] != test[i]) 
		{
			printf("Error: solution does not match reference!\n");
			printf("first deviation at location %d\n", i);
			printf("reference array:\n"); printArray(ref, N);
			printf("solution array:\n"); printArray(test, N);
			return 1;
		}
	}
	printf("Verified!\n");
	return 0;
}
49 | 
50 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # collect source files
 9 | 
# Headers and CUDA sources are picked up by pattern from this directory.
file(GLOB hdr
  *.hpp
  *.h
)
file(GLOB cu
  *.cu
)

# Host-side sources are listed explicitly.
set(HW1_files
  main.cpp
  reference_calc.cpp
  compare.cpp
)

cuda_add_executable(HW1
  ${HW1_files}
  ${hdr}
  ${cu}
)


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/HW1.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/core/core.hpp>
 2 | #include <opencv2/highgui/highgui.hpp>
 3 | #include <opencv2/opencv.hpp>
 4 | #include "utils.h"
 5 | #include <cuda.h>
 6 | #include <cuda_runtime.h>
 7 | #include <string>
 8 | 
 9 | cv::Mat imageRGBA;
10 | cv::Mat imageGrey;
11 | 
12 | uchar4        *d_rgbaImage__;
13 | unsigned char *d_greyImage__;
14 | 
15 | size_t numRows() { return static_cast<size_t>(imageRGBA.rows); } //row count of the loaded RGBA image
16 | size_t numCols() { return static_cast<size_t>(imageRGBA.cols); } //column count of the loaded RGBA image
17 | 
18 | //Loads `filename`, converts it to RGBA and prepares host and device buffers.
19 | //Any failure prints a message and exits, so nothing is returned.
20 | //Outputs: *inputImage / *greyImage point into the host-side RGBA input and
21 | //grey-scale output images; *d_rgbaImage / *d_greyImage are the matching
22 | //device allocations (input copied over, output zero-filled).
23 | void preProcess(uchar4 **inputImage, unsigned char **greyImage,
24 |                 uchar4 **d_rgbaImage, unsigned char **d_greyImage,
25 |                 const std::string &filename) {
26 |   //make sure the context initializes ok
27 |   checkCudaErrors(cudaFree(0));
28 | 
29 |   cv::Mat image;
30 |   image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
31 |   if (image.empty()) {
32 |     std::cerr << "Couldn't open file: " << filename << std::endl;
33 |     exit(1);
34 |   }
35 | 
36 |   cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
37 | 
38 |   //allocate memory for the output
39 |   imageGrey.create(image.rows, image.cols, CV_8UC1);
40 | 
41 |   //This shouldn't ever happen given the way the images are created
42 |   //at least based upon my limited understanding of OpenCV, but better to check
43 |   if (!imageRGBA.isContinuous() || !imageGrey.isContinuous()) {
44 |     std::cerr << "Images aren't continuous!! Exiting." << std::endl;
45 |     exit(1);
46 |   }
47 |   //hand out pointers to the first pixel of each host image (storage stays owned by the cv::Mat globals)
48 |   *inputImage = (uchar4 *)imageRGBA.ptr<unsigned char>(0);
49 |   *greyImage  = imageGrey.ptr<unsigned char>(0);
50 | 
51 |   const size_t numPixels = numRows() * numCols();
52 |   //allocate memory on the device for both input and output
53 |   checkCudaErrors(cudaMalloc(d_rgbaImage, sizeof(uchar4) * numPixels));
54 |   checkCudaErrors(cudaMalloc(d_greyImage, sizeof(unsigned char) * numPixels));
55 |   checkCudaErrors(cudaMemset(*d_greyImage, 0, numPixels * sizeof(unsigned char))); //make sure no memory is left laying around
56 | 
57 |   //copy input array to the GPU
58 |   checkCudaErrors(cudaMemcpy(*d_rgbaImage, *inputImage, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
59 |   //remember the device pointers so cleanup() can free them
60 |   d_rgbaImage__ = *d_rgbaImage;
61 |   d_greyImage__ = *d_greyImage;
62 | }
63 | 
64 | void postProcess(const std::string& output_file, unsigned char* data_ptr) {
65 |   //wrap the caller's buffer as a single-channel 8-bit image of the input's size
66 |   cv::Mat greyOut(numRows(), numCols(), CV_8UC1, (void*)data_ptr);
67 |   //and write it to disk
68 |   cv::imwrite(output_file.c_str(), greyOut);
69 | }
70 | 
71 | void cleanup()
72 | {
73 |   //release the two device buffers recorded by preProcess
74 |   cudaFree(d_greyImage__);
75 |   cudaFree(d_rgbaImage__);
76 | }
77 | 
78 | void generateReferenceImage(std::string input_filename, std::string output_filename)
79 | {
80 |   //load the input as grey-scale ...
81 |   cv::Mat greyReference = cv::imread(input_filename, CV_LOAD_IMAGE_GRAYSCALE);
82 |   //... and write it straight back out under the reference name
83 |   cv::imwrite(output_filename, greyReference);
84 | }
85 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=nvcc
 2 | 
 3 | ###################################
 4 | # These are the default install   #
 5 | # locations on most linux distros #
 6 | ###################################
 7 | 
 8 | OPENCV_LIBPATH=/usr/lib
 9 | OPENCV_INCLUDEPATH=/usr/include
10 | 
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 | 
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 | 
18 | # or if using MacPorts
19 | 
20 | #OPENCV_LIBPATH=/opt/local/lib
21 | #OPENCV_INCLUDEPATH=/opt/local/include
22 | 
23 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
24 | 
25 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
26 | 
27 | ######################################################
28 | # On Macs the default install locations are below    #
29 | # ####################################################
30 | 
31 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
32 | #CUDA_LIBPATH=/usr/local/cuda/lib
33 | 
34 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
35 | 
36 | GCC_OPTS=-O3 -Wall -Wextra -m64
37 | 
38 | # Link step. The target is named "student" but the binary it produces is HW1.
39 | student: main.o student_func.o compare.o reference_calc.o Makefile
40 | 	$(NVCC) -o HW1 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
41 | 
42 | # main.cpp includes HW1.cpp, reference_calc.h and compare.h, so they are prerequisites too
43 | main.o: main.cpp timer.h utils.h reference_calc.h compare.h reference_calc.cpp compare.cpp HW1.cpp
44 | 	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) -I $(OPENCV_INCLUDEPATH)
45 | 
46 | # use the NVCC variable here as well, so overriding it affects every CUDA step
47 | student_func.o: student_func.cu utils.h
48 | 	$(NVCC) -c student_func.cu $(NVCC_OPTS)
49 | 
50 | compare.o: compare.cpp compare.h
51 | 	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
52 | 
53 | reference_calc.o: reference_calc.cpp reference_calc.h
54 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
55 | 
56 | # HW1 is the linked binary; "hw" is kept from the original rule
57 | clean:
58 | 	rm -f *.o *.png hw HW1
59 | 
60 | # neither target creates a file with its own name
61 | .PHONY: student clean


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/cinque_terre.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 1/cinque_terre.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/cinque_terre_small.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 1/cinque_terre_small.jpg


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/compare.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/core/core.hpp>
 2 | #include <opencv2/highgui/highgui.hpp>
 3 | #include <opencv2/opencv.hpp>
 4 | 
 5 | #include "utils.h"
 6 | 
 7 | //Compares the image in test_filename against the one in reference_filename.
 8 | //A contrast-stretched difference image is written to HW1_differenceImage.png,
 9 | //then the raw bytes are checked with checkResultsEps (useEpsCheck == true,
10 | //perPixelError and globalError forwarded as its two tolerances) or with
11 | //checkResultsExact. Both helpers exit(1) on a mismatch, so "PASS" is only
12 | //printed when the check succeeded.
13 | void compareImages(std::string reference_filename, std::string test_filename, 
14 |                    bool useEpsCheck, double perPixelError, double globalError)
15 | {
16 |   cv::Mat reference = cv::imread(reference_filename, -1);
17 |   cv::Mat test = cv::imread(test_filename, -1);
18 | 
19 |   //imread hands back an empty Mat when a file can't be read; say which one
20 |   //instead of failing later inside the arithmetic below
21 |   if (reference.empty()) {
22 |     std::cerr << "Couldn't open file: " << reference_filename << std::endl;
23 |     exit(1);
24 |   }
25 |   if (test.empty()) {
26 |     std::cerr << "Couldn't open file: " << test_filename << std::endl;
27 |     exit(1);
28 |   }
29 | 
30 |   cv::Mat diff = abs(reference - test);
31 | 
32 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
33 | 
34 |   double minVal, maxVal;
35 | 
36 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
37 | 
38 |   //now perform transform so that we bump values to the full range
39 |   //(skipped when every difference is the same value, e.g. identical images:
40 |   //the scale factor would be 255 / 0)
41 |   if (maxVal > minVal) {
42 |     diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
43 |   }
44 | 
45 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
46 | 
47 |   cv::imwrite("HW1_differenceImage.png", diff);
48 |   //OK, now we can start comparing values...
49 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
50 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
51 | 
52 |   if (useEpsCheck) {
53 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
54 |   }
55 |   else
56 |   {
57 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
58 |   }
59 | 
60 |   std::cout << "PASS" << std::endl;
61 |   return;
62 | }
63 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef COMPARE_H__
2 | #define COMPARE_H__
3 | #include <string> //std::string is used in the declaration below
4 | 
5 | //Compares two image files; implemented in compare.cpp.
6 | void compareImages(std::string reference_filename, std::string test_filename, 
7 |                    bool useEpsCheck, double perPixelError, double globalError);
8 | 
9 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/main.cpp:
--------------------------------------------------------------------------------
 1 | //Udacity HW1 Solution
 2 | 
 3 | #include <iostream>
 4 | #include "timer.h"
 5 | #include "utils.h"
 6 | #include <string>
 7 | #include <stdio.h>
 8 | #include "reference_calc.h"
 9 | #include "compare.h"
10 | 
11 | void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, 
12 |                             uchar4 * const d_rgbaImage,
13 |                             unsigned char* const d_greyImage, 
14 |                             size_t numRows, size_t numCols);
15 | 
16 | //include the definitions of the above functions for this homework
17 | #include "HW1.cpp"
18 | 
19 | int main(int argc, char **argv) {
20 |   uchar4        *h_rgbaImage, *d_rgbaImage;
21 |   unsigned char *h_greyImage, *d_greyImage;
22 |   //Flow: parse arguments -> preProcess -> timed student code -> save output -> CPU reference -> compare
23 |   std::string input_file;
24 |   std::string output_file;
25 |   std::string reference_file;
26 |   double perPixelError = 0.0;
27 |   double globalError   = 0.0;
28 |   bool useEpsCheck = false;
29 |   switch (argc)  //argc counts the program name, so "case 2" means one argument
30 |   {
31 | 	case 2:
32 | 	  input_file = std::string(argv[1]);
33 | 	  output_file = "HW1_output.png";
34 | 	  reference_file = "HW1_reference.png";
35 | 	  break;
36 | 	case 3:
37 | 	  input_file  = std::string(argv[1]);
38 |       output_file = std::string(argv[2]);
39 | 	  reference_file = "HW1_reference.png";
40 | 	  break;
41 | 	case 4:
42 | 	  input_file  = std::string(argv[1]);
43 |       output_file = std::string(argv[2]);
44 | 	  reference_file = std::string(argv[3]);
45 | 	  break;
46 | 	case 6:  //all five arguments: switches compareImages to the tolerance-based check
47 | 	  useEpsCheck=true;
48 | 	  input_file  = std::string(argv[1]);
49 | 	  output_file = std::string(argv[2]);
50 | 	  reference_file = std::string(argv[3]);
51 | 	  perPixelError = atof(argv[4]);
52 |       globalError   = atof(argv[5]);
53 | 	  break;
54 | 	default:
55 |       std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
56 |       exit(1);
57 |   }
58 |   //load the image and give us our input and output pointers
59 |   preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);
60 | 
61 |   GpuTimer timer;
62 |   timer.Start();
63 |   //call the students' code
64 |   your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
65 |   timer.Stop();
66 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
67 | 
68 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
69 | 
70 |   if (err < 0) {
71 |     //Couldn't print! Probably the student closed stdout - bad news
72 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
73 |     exit(1);
74 |   }
75 | 
76 |   size_t numPixels = numRows()*numCols();
77 |   checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost));
78 | 
79 |   //check results and output the grey image
80 |   postProcess(output_file, h_greyImage);
81 |   //h_greyImage is then overwritten with the CPU reference, which is saved as the reference file
82 |   referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols());
83 | 
84 |   postProcess(reference_file, h_greyImage);
85 | 
86 |   //generateReferenceImage(input_file, reference_file);
87 |   compareImages(reference_file, output_file, useEpsCheck, perPixelError, 
88 |                 globalError);
89 | 
90 |   cleanup();
91 | 
92 |   return 0;
93 | }
94 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/reference_calc.cpp:
--------------------------------------------------------------------------------
 1 | // for uchar4 struct
 2 | #include <cuda_runtime.h>
 3 | 
 4 | void referenceCalculation(const uchar4* const rgbaImage,
 5 |                           unsigned char *const greyImage,
 6 |                           size_t numRows,
 7 |                           size_t numCols)
 8 | {
 9 |   //CPU reference: weighted sum of the x, y and z channels of every pixel,
10 |   //converted to unsigned char and stored at the same index of greyImage
11 |   const size_t numPixels = numRows * numCols;
12 |   for (size_t idx = 0; idx < numPixels; ++idx) {
13 |     float channelSum = .299f * rgbaImage[idx].x + .587f * rgbaImage[idx].y + .114f * rgbaImage[idx].z;
14 |     greyImage[idx] = channelSum;
15 |   }
16 | }
17 | 
18 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 | #include <cuda_runtime.h> //for uchar4, as in reference_calc.cpp
4 | 
5 | void referenceCalculation(const uchar4* const rgbaImage,
6 |                           unsigned char *const greyImage,
7 |                           size_t numRows,
8 |                           size_t numCols);
9 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/student_func.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 1/student_func.cu


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer //times GPU work with a pair of CUDA events
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 |   //both events are created here and destroyed with the timer object
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 |   //records the start event on stream 0
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 |   //records the stop event on stream 0
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 |   //waits for the stop event, then returns the start-to-stop time (callers print it as msecs)
33 |   float Elapsed()
34 |   {
35 |     float elapsed = 0.f; //defined value even if the CUDA calls below fail
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | #include <algorithm>
12 | 
13 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   //anything other than cudaSuccess is fatal: report where it happened and quit
18 |   if (err == cudaSuccess) return;
19 |   std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
20 |   std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
21 |   exit(1);
22 | }
23 | 
24 | template<typename T>
25 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
26 |   //walk both buffers in step; the first mismatching element is reported
27 |   //on stderr and terminates the program
28 |   for (size_t pos = 0; pos != numElem; ++pos) {
29 |     if (ref[pos] == gpu[pos]) continue;
30 |     std::cerr << "Difference at pos " << pos << std::endl;
31 |     //unary + makes char-sized values print as numbers and leaves
32 |     //other types untouched
33 |     std::cerr << "Reference: " << std::setprecision(17) << +ref[pos] <<
34 |                "\nGPU      : " << +gpu[pos] << std::endl;
35 |     exit(1);
36 |   }
37 | }
38 | 
39 | template<typename T>
40 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
41 |   //eps1: largest per-element difference allowed (anything larger is fatal);
42 |   //eps2: largest allowed fraction (0..1) of elements with a non-zero difference
43 |   assert(eps1 >= 0 && eps2 >= 0);
44 |   unsigned numSmallDifferences = 0;
45 |   for (size_t i = 0; i < numElem; ++i) {
46 |     //subtract smaller from larger in case of unsigned types
47 |     T smaller = std::min(ref[i], gpu[i]);
48 |     T larger = std::max(ref[i], gpu[i]);
49 |     T diff = larger - smaller;
50 |     if (diff > 0 && diff <= eps1) {
51 |       numSmallDifferences++;
52 |     }
53 |     else if (diff > eps1) {
54 |       std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
55 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
56 |         "\nGPU      : " << +gpu[i] << std::endl;
57 |       exit(1);
58 |     }
59 |   }
60 |   double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
61 |   if (percentSmallDifferences > eps2) {
62 |     std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
63 |     std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
64 |     exit(1);
65 |   }
66 | }
67 | 
68 | //Uses the autodesk method of image comparison
69 | //Note that the tolerance here is in PIXELS not a percentage of input pixels
70 | template<typename T>
71 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
72 | {
73 |   //a pixel is "bad" when it differs from the reference by more than variance
74 |   size_t numBadPixels = 0;
75 |   for (size_t i = 0; i < numElem; ++i) {
76 |     T smaller = std::min(ref[i], gpu[i]);
77 |     T larger = std::max(ref[i], gpu[i]);
78 |     T diff = larger - smaller;
79 |     if (diff > variance)
80 |       ++numBadPixels;
81 |   }
82 |   //more than `tolerance` bad pixels is fatal
83 |   if (numBadPixels > tolerance) {
84 |     std::cerr << "Too many bad pixels in the image. " << numBadPixels << "/" << tolerance << std::endl;
85 |     exit(1);
86 |   }
87 | }
88 | 
89 | #endif
90 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # collect source files
 9 | 
10 | file(GLOB hdr *.hpp *.h)  # every header in this directory
11 | file(GLOB cu *.cu)        # every CUDA source in this directory
12 | set(HW2_files main.cpp reference_calc.cpp compare.cpp)  # host-side C++ sources
13 | 
14 | cuda_add_executable(HW2 ${HW2_files} ${hdr} ${cu})
15 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/HW2.cpp:
--------------------------------------------------------------------------------
  1 | #include <opencv2/core/core.hpp>
  2 | #include <opencv2/highgui/highgui.hpp>
  3 | #include <opencv2/opencv.hpp>
  4 | #include "utils.h"
  5 | #include <cuda.h>
  6 | #include <cuda_runtime.h>
  7 | #include <string>
  8 | 
  9 | cv::Mat imageInputRGBA;
 10 | cv::Mat imageOutputRGBA;
 11 | 
 12 | uchar4 *d_inputImageRGBA__;
 13 | uchar4 *d_outputImageRGBA__;
 14 | 
 15 | float *h_filter__;
 16 | 
 17 | size_t numRows() { return static_cast<size_t>(imageInputRGBA.rows); } //row count of the loaded RGBA input
 18 | size_t numCols() { return static_cast<size_t>(imageInputRGBA.cols); } //column count of the loaded RGBA input
 19 | 
 20 | //Loads `filename` as RGBA, allocates the host and device buffers and builds
 21 | //the normalized blur filter. Any failure prints a message and exits.
 22 | //Outputs: host/device pointers to the RGBA input and RGBA output images,
 23 | //three zero-filled single-channel device buffers (red/green/blue blurred),
 24 | //and the host filter (*h_filter, *filterWidth x *filterWidth floats).
 25 | void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA,
 26 |                 uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA,
 27 |                 unsigned char **d_redBlurred,
 28 |                 unsigned char **d_greenBlurred,
 29 |                 unsigned char **d_blueBlurred,
 30 |                 float **h_filter, int *filterWidth,
 31 |                 const std::string &filename) {
 32 | 
 33 |   //make sure the context initializes ok
 34 |   checkCudaErrors(cudaFree(0));
 35 | 
 36 |   cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
 37 |   if (image.empty()) {
 38 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 39 |     exit(1);
 40 |   }
 41 | 
 42 |   cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);
 43 | 
 44 |   //allocate memory for the output
 45 |   imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);
 46 | 
 47 |   //This shouldn't ever happen given the way the images are created
 48 |   //at least based upon my limited understanding of OpenCV, but better to check
 49 |   if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) {
 50 |     std::cerr << "Images aren't continuous!! Exiting." << std::endl;
 51 |     exit(1);
 52 |   }
 53 |   //hand out pointers to the first pixel of each host image (storage stays owned by the cv::Mat globals)
 54 |   *h_inputImageRGBA  = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0);
 55 |   *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0);
 56 | 
 57 |   const size_t numPixels = numRows() * numCols();
 58 |   //allocate memory on the device for both input and output
 59 |   checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels));
 60 |   checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels));
 61 |   checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around
 62 | 
 63 |   //copy input array to the GPU
 64 |   checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
 65 |   //remember the device pointers so cleanUp() can free them
 66 |   d_inputImageRGBA__  = *d_inputImageRGBA;
 67 |   d_outputImageRGBA__ = *d_outputImageRGBA;
 68 | 
 69 |   //now create the filter that they will use
 70 |   const int blurKernelWidth = 9;
 71 |   const float blurKernelSigma = 2.;
 72 | 
 73 |   *filterWidth = blurKernelWidth;
 74 | 
 75 |   //create and fill the filter we will convolve with
 76 |   *h_filter = new float[blurKernelWidth * blurKernelWidth];
 77 |   h_filter__ = *h_filter;
 78 | 
 79 |   float filterSum = 0.f; //for normalization
 80 |   //first pass: unnormalized weight exp(-(c^2 + r^2) / (2 sigma^2)) for every tap, stored row-major
 81 |   for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
 82 |     for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
 83 |       float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma));
 84 |       (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue;
 85 |       filterSum += filterValue;
 86 |     }
 87 |   }
 88 | 
 89 |   float normalizationFactor = 1.f / filterSum;
 90 |   //second pass: scale every tap so the filter weights sum to 1
 91 |   for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
 92 |     for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
 93 |       (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor;
 94 |     }
 95 |   }
 96 | 
 97 |   //per-channel (red/green/blue) "blurred" device buffers, one byte per pixel, zero-initialized
 98 |   checkCudaErrors(cudaMalloc(d_redBlurred,    sizeof(unsigned char) * numPixels));
 99 |   checkCudaErrors(cudaMalloc(d_greenBlurred,  sizeof(unsigned char) * numPixels));
100 |   checkCudaErrors(cudaMalloc(d_blueBlurred,   sizeof(unsigned char) * numPixels)); //NOTE: these three are not recorded in a global, so cleanUp() does not free them
101 |   checkCudaErrors(cudaMemset(*d_redBlurred,   0, sizeof(unsigned char) * numPixels));
102 |   checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
103 |   checkCudaErrors(cudaMemset(*d_blueBlurred,  0, sizeof(unsigned char) * numPixels));
104 | }
105 | 
106 | void postProcess(const std::string& output_file, uchar4* data_ptr) {
107 |   //view the caller's RGBA buffer as an image, then convert it to BGR for writing
108 |   cv::Mat rgbaView(numRows(), numCols(), CV_8UC4, (void*)data_ptr);
109 |   cv::Mat bgrOut;
110 |   cv::cvtColor(rgbaView, bgrOut, CV_RGBA2BGR);
111 | 
112 |   cv::imwrite(output_file.c_str(), bgrOut);
113 | }
114 | 
115 | void cleanUp(void)
116 | {
117 |   delete[] h_filter__;            //host filter allocated in preProcess
118 |   cudaFree(d_outputImageRGBA__);  //device images recorded by preProcess
119 |   cudaFree(d_inputImageRGBA__);
120 | }
121 | 
122 | 
123 | // Unused alternative: produces the reference image with OpenCV's GaussianBlur,
124 | //    which the original author notes is much faster than reference_calc.cpp.
125 | void generateReferenceImage(std::string input_file, std::string reference_file, int kernel_size)
126 | {
127 | 	cv::Mat source = cv::imread(input_file);
128 | 	// second load of the same file, used only as the destination placeholder
129 | 	cv::Mat blurred = cv::imread(input_file);
130 | 	cv::GaussianBlur(source, blurred, cv::Size2i(kernel_size, kernel_size),0);
131 | 	cv::imwrite(reference_file, blurred);
132 | }
133 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=nvcc
 2 | 
 3 | ###################################
 4 | # These are the default install   #
 5 | # locations on most linux distros #
 6 | ###################################
 7 | 
 8 | OPENCV_LIBPATH=/usr/lib
 9 | OPENCV_INCLUDEPATH=/usr/include
10 | 
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 | 
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 | 
18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
19 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
20 | 
21 | ######################################################
22 | # On Macs the default install locations are below    #
23 | # ####################################################
24 | 
25 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
26 | #CUDA_LIBPATH=/usr/local/cuda/lib
27 | 
28 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
29 | 
30 | GCC_OPTS=-O3 -Wall -Wextra -m64
31 | 
32 | # Link step. The target is named "student" but the binary it produces is HW2.
33 | student: main.o student_func.o compare.o reference_calc.o Makefile
34 | 	$(NVCC) -o HW2 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
35 | 
36 | # main.cpp includes HW2.cpp, reference_calc.h and compare.h, so they are prerequisites too
37 | main.o: main.cpp timer.h utils.h reference_calc.h compare.h HW2.cpp
38 | 	g++ -c main.cpp $(GCC_OPTS) -I $(OPENCV_INCLUDEPATH) -I $(CUDA_INCLUDEPATH)
39 | 
40 | # use the NVCC variable here as well, so overriding it affects every CUDA step
41 | student_func.o: student_func.cu reference_calc.cpp utils.h
42 | 	$(NVCC) -c student_func.cu $(NVCC_OPTS)
43 | 
44 | compare.o: compare.cpp compare.h
45 | 	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
46 | 
47 | reference_calc.o: reference_calc.cpp reference_calc.h
48 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
49 | 
50 | # HW2 is the linked binary; "hw" is kept from the original rule
51 | clean:
52 | 	rm -f *.o *.png hw HW2
53 | 
54 | # neither target creates a file with its own name
55 | .PHONY: student clean


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/cinque_terre.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 2/cinque_terre.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/cinque_terre_small.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 2/cinque_terre_small.jpg


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/compare.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/core/core.hpp>
 2 | #include <opencv2/highgui/highgui.hpp>
 3 | #include <opencv2/opencv.hpp>
 4 | 
 5 | #include "utils.h"
 6 | 
 7 | //Compares the image in test_filename against the one in reference_filename.
 8 | //A contrast-stretched difference image is written to HW2_differenceImage.png,
 9 | //then the raw bytes are checked with checkResultsEps (useEpsCheck == true,
10 | //perPixelError and globalError forwarded as its two tolerances) or with
11 | //checkResultsExact. Both helpers exit(1) on a mismatch, so "PASS" is only
12 | //printed when the check succeeded.
13 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
14 | 				   double perPixelError, double globalError)
15 | {
16 |   cv::Mat reference = cv::imread(reference_filename, -1);
17 |   cv::Mat test = cv::imread(test_filename, -1);
18 | 
19 |   //imread hands back an empty Mat when a file can't be read; say which one
20 |   //instead of failing later inside the arithmetic below
21 |   if (reference.empty()) {
22 |     std::cerr << "Couldn't open file: " << reference_filename << std::endl;
23 |     exit(1);
24 |   }
25 |   if (test.empty()) {
26 |     std::cerr << "Couldn't open file: " << test_filename << std::endl;
27 |     exit(1);
28 |   }
29 | 
30 |   cv::Mat diff = abs(reference - test);
31 | 
32 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
33 | 
34 |   double minVal, maxVal;
35 | 
36 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
37 | 
38 |   //now perform transform so that we bump values to the full range
39 |   //(skipped when every difference is the same value, e.g. identical images:
40 |   //the scale factor would be 255 / 0)
41 |   if (maxVal > minVal) {
42 |     diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
43 |   }
44 | 
45 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
46 | 
47 |   cv::imwrite("HW2_differenceImage.png", diff);
48 |   //OK, now we can start comparing values...
49 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
50 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
51 | 
52 |   if (useEpsCheck) {
53 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
54 |   }
55 |   else
56 |   {
57 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
58 |   }
59 | 
60 |   std::cout << "PASS" << std::endl;
61 |   return;
62 | }


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef COMPARE_H__
2 | #define COMPARE_H__
3 | #include <string> //std::string is used in the declaration below
4 | 
5 | //Compares two image files; implemented in compare.cpp.
6 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
7 | 				   double perPixelError, double globalError);
8 | 
9 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/main.cpp:
--------------------------------------------------------------------------------
  1 | //Udacity HW2 Driver
  2 | 
  3 | #include <iostream>
  4 | #include "timer.h"
  5 | #include "utils.h"
  6 | #include <string>
  7 | #include <stdio.h>
  8 | 
  9 | #include "reference_calc.h"
 10 | #include "compare.h"
 11 | 
 12 | //include the definitions of the above functions for this homework
 13 | #include "HW2.cpp"
 14 | 
 15 | 
 16 | /*******  DEFINED IN student_func.cu *********/
 17 | 
 18 | void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_inputImageRGBA,
 19 |                         uchar4* const d_outputImageRGBA,
 20 |                         const size_t numRows, const size_t numCols,
 21 |                         unsigned char *d_redBlurred,
 22 |                         unsigned char *d_greenBlurred,
 23 |                         unsigned char *d_blueBlurred,
 24 |                         const int filterWidth);
 25 | 
 26 | void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsImage,
 27 |                                 const float* const h_filter, const size_t filterWidth);
 28 | 
 29 | 
 30 | /*******  Begin main *********/
 31 | 
 32 | int main(int argc, char **argv) {
 33 |   uchar4 *h_inputImageRGBA,  *d_inputImageRGBA;
 34 |   uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
 35 |   unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;
 36 | 
 37 |   float *h_filter;
 38 |   int    filterWidth;
 39 | 
 40 |   std::string input_file;
 41 |   std::string output_file;
 42 |   std::string reference_file;
 43 |   double perPixelError = 0.0;
 44 |   double globalError   = 0.0;
 45 |   bool useEpsCheck = false;
 46 |   switch (argc)
 47 |   {
 48 | 	case 2:
 49 | 	  input_file = std::string(argv[1]);
 50 | 	  output_file = "HW2_output.png";
 51 | 	  reference_file = "HW2_reference.png";
 52 | 	  break;
 53 | 	case 3:
 54 | 	  input_file  = std::string(argv[1]);
 55 |       output_file = std::string(argv[2]);
 56 | 	  reference_file = "HW2_reference.png";
 57 | 	  break;
 58 | 	case 4:
 59 | 	  input_file  = std::string(argv[1]);
 60 |       output_file = std::string(argv[2]);
 61 | 	  reference_file = std::string(argv[3]);
 62 | 	  break;
 63 | 	case 6:
 64 | 	  useEpsCheck=true;
 65 | 	  input_file  = std::string(argv[1]);
 66 | 	  output_file = std::string(argv[2]);
 67 | 	  reference_file = std::string(argv[3]);
 68 | 	  perPixelError = atof(argv[4]);
 69 |       globalError   = atof(argv[5]);
 70 | 	  break;
 71 | 	default:
 72 |       std::cerr << "Usage: ./HW2 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
 73 |       exit(1);
 74 |   }
 75 |   //load the image and give us our input and output pointers
 76 |   preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
 77 |              &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
 78 |              &h_filter, &filterWidth, input_file);
 79 | 
 80 |   allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
 81 |   GpuTimer timer;
 82 |   timer.Start();
 83 |   //call the students' code
 84 |   your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(),
 85 |                      d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
 86 |   timer.Stop();
 87 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 88 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
 89 | 
 90 |   if (err < 0) {
 91 |     //Couldn't print! Probably the student closed stdout - bad news
 92 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
 93 |     exit(1);
 94 |   }
 95 | 
 96 |   //check results and output the blurred image
 97 | 
 98 |   size_t numPixels = numRows()*numCols();
 99 |   //copy the output back to the host
100 |   checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));
101 | 
102 |   postProcess(output_file, h_outputImageRGBA);
103 | 
104 |   referenceCalculation(h_inputImageRGBA, h_outputImageRGBA,
105 |                        numRows(), numCols(),
106 |                        h_filter, filterWidth);
107 | 
108 |   postProcess(reference_file, h_outputImageRGBA);
109 | 
110 |     //  Cheater easy way with OpenCV
111 |     //generateReferenceImage(input_file, reference_file, filterWidth);
112 | 
113 |   compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
114 | 
115 |   checkCudaErrors(cudaFree(d_redBlurred));
116 |   checkCudaErrors(cudaFree(d_greenBlurred));
117 |   checkCudaErrors(cudaFree(d_blueBlurred));
118 | 
119 |   cleanUp();
120 | 
121 |   return 0;
122 | }
123 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/reference_calc.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <cassert>
 3 | // for uchar4 struct
 4 | #include <cuda_runtime.h>
 5 | 
 6 | void channelConvolution(const unsigned char* const channel,
 7 |                         unsigned char* const channelBlurred,
 8 |                         const size_t numRows, const size_t numCols,
 9 |                         const float *filter, const int filterWidth)
10 | {
11 |   //Dealing with an even width filter is trickier
12 |   assert(filterWidth % 2 == 1);
13 | 
14 |   //For every pixel in the image
15 |   for (int r = 0; r < (int)numRows; ++r) {
16 |     for (int c = 0; c < (int)numCols; ++c) {
17 |       float result = 0.f;
18 |       //For every value in the filter around the pixel (c, r)
19 |       for (int filter_r = -filterWidth/2; filter_r <= filterWidth/2; ++filter_r) {
20 |         for (int filter_c = -filterWidth/2; filter_c <= filterWidth/2; ++filter_c) {
21 |           //Find the global image position for this filter position
22 |           //clamp to boundary of the image
23 | 		  int image_r = std::min(std::max(r + filter_r, 0), static_cast<int>(numRows - 1));
24 |           int image_c = std::min(std::max(c + filter_c, 0), static_cast<int>(numCols - 1));
25 | 
26 |           float image_value = static_cast<float>(channel[image_r * numCols + image_c]);
27 |           float filter_value = filter[(filter_r + filterWidth/2) * filterWidth + filter_c + filterWidth/2];
28 | 
29 |           result += image_value * filter_value;
30 |         }
31 |       }
32 | 
33 |       channelBlurred[r * numCols + c] = result;
34 |     }
35 |   }
36 | }
37 | 
38 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage,
39 |                           size_t numRows, size_t numCols,
40 |                           const float* const filter, const int filterWidth)
41 | {
42 |   unsigned char *red   = new unsigned char[numRows * numCols];
43 |   unsigned char *blue  = new unsigned char[numRows * numCols];
44 |   unsigned char *green = new unsigned char[numRows * numCols];
45 | 
46 |   unsigned char *redBlurred   = new unsigned char[numRows * numCols];
47 |   unsigned char *blueBlurred  = new unsigned char[numRows * numCols];
48 |   unsigned char *greenBlurred = new unsigned char[numRows * numCols];
49 | 
50 |   //First we separate the incoming RGBA image into three separate channels
51 |   //for Red, Green and Blue
52 |   for (size_t i = 0; i < numRows * numCols; ++i) {
53 |     uchar4 rgba = rgbaImage[i];
54 |     red[i]   = rgba.x;
55 |     green[i] = rgba.y;
56 |     blue[i]  = rgba.z;
57 |   }
58 | 
59 |   //Now we can do the convolution for each of the color channels
60 |   channelConvolution(red, redBlurred, numRows, numCols, filter, filterWidth);
61 |   channelConvolution(green, greenBlurred, numRows, numCols, filter, filterWidth);
62 |   channelConvolution(blue, blueBlurred, numRows, numCols, filter, filterWidth);
63 | 
64 |   //now recombine into the output image - Alpha is 255 for no transparency
65 |   for (size_t i = 0; i < numRows * numCols; ++i) {
66 |     uchar4 rgba = make_uchar4(redBlurred[i], greenBlurred[i], blueBlurred[i], 255);
67 |     outputImage[i] = rgba;
68 |   }
69 | 
70 |   delete[] red;
71 |   delete[] green;
72 |   delete[] blue;
73 | 
74 |   delete[] redBlurred;
75 |   delete[] greenBlurred;
76 |   delete[] blueBlurred;
77 | }
78 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 | 
4 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage,
5 |                           size_t numRows, size_t numCols,
6 |                           const float* const filter, const int filterWidth);
7 | 
8 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/student_func.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 2/student_func.cu


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <algorithm>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | template<typename T>
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 |   if (err != cudaSuccess) {
17 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
18 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
19 |     exit(1);
20 |   }
21 | }
22 | 
// Verifies that the first numElem entries of ref and gpu are identical.
// On the first mismatch, prints its position and both values to stderr and
// terminates with exit status 1; returns normally when everything matches.
template<typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
  // Locate the first position where the GPU result departs from the reference.
  size_t pos = 0;
  for (; pos < numElem; ++pos) {
    if (ref[pos] != gpu[pos]) {
      break;
    }
  }

  if (pos == numElem) {
    return;  // every element matched
  }

  std::cerr << "Difference at pos " << pos << std::endl;
  //the unary + promotes char types to int so they print as numbers,
  //without changing how other types are printed
  std::cerr << "Reference: " << std::setprecision(17) << +ref[pos] <<
             "\nGPU      : " << +gpu[pos] << std::endl;
  exit(1);
}
37 | 
// Tolerant comparison of ref and gpu over numElem elements.
//
//   eps1 - largest absolute per-element difference that is accepted; any
//          larger difference is reported and the program exits with status 1
//   eps2 - largest accepted fraction (0..1) of elements that differ at all;
//          exceeding it is reported and the program exits with status 1
//
// Both tolerances must be non-negative (asserted). Returns normally when the
// data passes both checks; an empty input passes trivially.
template<typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
  assert(eps1 >= 0 && eps2 >= 0);

  // Nothing to compare; also avoids forming 0/0 for the fraction below.
  if (numElem == 0) {
    return;
  }

  // size_t so the counter cannot wrap before numElem does.
  size_t numSmallDifferences = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger in case of unsigned types
    T smaller = std::min(ref[i], gpu[i]);
    T larger = std::max(ref[i], gpu[i]);
    T diff = larger - smaller;

    if (diff > eps1) {
      std::cerr << "Difference at pos " << i << " exceeds tolerance of " << eps1 << std::endl;
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
        "\nGPU      : " << +gpu[i] << std::endl;
      exit(1);
    }

    // Within tolerance but not identical: counts towards the global budget.
    if (diff > 0) {
      numSmallDifferences++;
    }
  }

  double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
  if (percentSmallDifferences > eps2) {
    std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
    std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
    exit(1);
  }
}
66 | 
//Image comparison using the "autodesk" method.
//An element is "bad" when |ref - gpu| exceeds variance. Note that the
//tolerance is an absolute COUNT of bad elements, not a percentage of the
//input. When the count exceeds tolerance, a message is printed to stderr and
//the program exits with status 1; otherwise the function returns normally.
template<typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
  size_t numBadPixels = 0;

  for (size_t idx = 0; idx < numElem; ++idx) {
    // Subtract the smaller value from the larger so unsigned types do not wrap.
    const T lo = std::min(ref[idx], gpu[idx]);
    const T hi = std::max(ref[idx], gpu[idx]);
    const T delta = hi - lo;

    numBadPixels += (delta > variance) ? 1 : 0;
  }

  if (numBadPixels <= tolerance) {
    return;
  }

  std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl;
  exit(1);
}
87 | 
88 | #endif
89 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | # minimum required cmake version
 8 | cmake_minimum_required(VERSION 2.8)
 9 | find_package(CUDA QUIET REQUIRED)
10 | 
11 | SET (compare_files compare.cpp)
12 | 
13 | file( GLOB  hdr *.hpp *.h )
14 | file( GLOB  cu  *.cu)
15 | SET (HW3_files main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp)
16 |     
17 | CUDA_ADD_EXECUTABLE(HW3 ${HW3_files} ${hdr} ${cu})
18 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=nvcc
 2 | 
 3 | ###################################
 4 | # These are the default install   #
 5 | # locations on most linux distros #
 6 | ###################################
 7 | 
 8 | OPENCV_LIBPATH=/usr/lib
 9 | OPENCV_INCLUDEPATH=/usr/include
10 | 
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 | 
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 | 
18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
19 | 
20 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
21 | 
22 | ######################################################
23 | # On Macs the default install locations are below    #
24 | # ####################################################
25 | 
26 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
27 | #CUDA_LIBPATH=/usr/local/cuda/lib
28 | 
29 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
30 | 
31 | GCC_OPTS=-O3 -Wall -Wextra -m64
32 | 
33 | student: main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o Makefile
34 | 	$(NVCC) -o HW3 main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
35 | 
36 | main.o: main.cpp timer.h utils.h reference_calc.h compare.h
37 | 	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
38 | 
39 | HW3.o: HW3.cu loadSaveImage.h utils.h
40 | 	$(NVCC) -c HW3.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS)
41 | 
42 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h
43 | 	g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
44 | 
45 | compare.o: compare.cpp compare.h
46 | 	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
47 | 
48 | reference_calc.o: reference_calc.cpp reference_calc.h
49 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
50 | 
51 | student_func.o: student_func.cu utils.h
52 | 	$(NVCC) -c student_func.cu $(NVCC_OPTS)
53 | 
54 | clean:
55 | 	rm -f *.o hw
56 | 	find . -type f -name '*.exr' | grep -v memorial | xargs rm -f
57 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/compare.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/opencv.hpp>
 2 | #include "utils.h"
 3 | 
 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
 5 | 				   double perPixelError, double globalError)
 6 | {
 7 |   cv::Mat reference = cv::imread(reference_filename, -1);
 8 |   cv::Mat test = cv::imread(test_filename, -1);
 9 | 
10 |   cv::Mat diff = abs(reference - test);
11 | 
12 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
13 | 
14 |   double minVal, maxVal;
15 | 
16 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
17 | 
18 |   //now perform transform so that we bump values to the full range
19 | 
20 |   diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
21 | 
22 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
23 | 
24 |   cv::imwrite("HW3_differenceImage.png", diff);
25 |   //OK, now we can start comparing values...
26 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
27 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
28 | 
29 |   if (useEpsCheck) {
30 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
31 |   }
32 |   else
33 |   {
34 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
35 |   }
36 | 
37 |   std::cout << "PASS" << std::endl;
38 |   return;
39 | }
40 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef HW3_H__
2 | #define HW3_H__
3 | 
4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
5 | 				   double perPixelError, double globalError);
6 | 
7 | #endif
8 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/loadSaveImage.cpp:
--------------------------------------------------------------------------------
  1 | #include <opencv2/core/core.hpp>
  2 | #include <opencv2/highgui/highgui.hpp>
  3 | #include <opencv2/opencv.hpp>
  4 | #include <vector>
  5 | #include <stdio.h>
  6 | #include "cuda_runtime.h"
  7 | 
  8 | //The caller becomes responsible for the returned pointer. This
  9 | //is done in the interest of keeping this code as simple as possible.
 10 | //In production code this is a bad idea - we should use RAII
 11 | //to ensure the memory is freed.  DO NOT COPY THIS AND USE IN PRODUCTION
 12 | //CODE!!!
 13 | void loadImageHDR(const std::string &filename,
 14 |                   float **imagePtr,
 15 |                   size_t *numRows, size_t *numCols)
 16 | {
 17 |     cv::Mat originImg = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);
 18 | 
 19 |     cv::Mat image;
 20 | 
 21 |     if(originImg.type() != CV_32FC3){
 22 |       originImg.convertTo(image,CV_32FC3);
 23 |     } else{
 24 |       image = originImg;
 25 |     }
 26 | 
 27 |   if (image.empty()) {
 28 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 29 |     exit(1);
 30 |   }
 31 | 
 32 |   if (image.channels() != 3) {
 33 |     std::cerr << "Image must be color!" << std::endl;
 34 |     exit(1);
 35 |   }
 36 | 
 37 |   if (!image.isContinuous()) {
 38 |     std::cerr << "Image isn't continuous!" << std::endl;
 39 |     exit(1);
 40 |   }
 41 | 
 42 |   *imagePtr = new float[image.rows * image.cols * image.channels()];
 43 | 
 44 |   float *cvPtr = image.ptr<float>(0);
 45 |   for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i)
 46 |     (*imagePtr)[i] = cvPtr[i];
 47 | 
 48 |   *numRows = image.rows;
 49 |   *numCols = image.cols;
 50 | }
 51 | 
 52 | void loadImageRGBA(const std::string &filename,
 53 |                    uchar4 **imagePtr,
 54 |                    size_t *numRows, size_t *numCols)
 55 | {
 56 |   cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
 57 |   if (image.empty()) {
 58 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 59 |     exit(1);
 60 |   }
 61 | 
 62 |   if (image.channels() != 3) {
 63 |     std::cerr << "Image must be color!" << std::endl;
 64 |     exit(1);
 65 |   }
 66 | 
 67 |   if (!image.isContinuous()) {
 68 |     std::cerr << "Image isn't continuous!" << std::endl;
 69 |     exit(1);
 70 |   }
 71 | 
 72 |   cv::Mat imageRGBA;
 73 |   cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
 74 | 
 75 |   *imagePtr = new uchar4[image.rows * image.cols];
 76 | 
 77 |   unsigned char *cvPtr = imageRGBA.ptr<unsigned char>(0);
 78 |   for (size_t i = 0; i < image.rows * image.cols; ++i) {
 79 |     (*imagePtr)[i].x = cvPtr[4 * i + 0];
 80 |     (*imagePtr)[i].y = cvPtr[4 * i + 1];
 81 |     (*imagePtr)[i].z = cvPtr[4 * i + 2];
 82 |     (*imagePtr)[i].w = cvPtr[4 * i + 3];
 83 |   }
 84 | 
 85 |   *numRows = image.rows;
 86 |   *numCols = image.cols;
 87 | }
 88 | 
 89 | void saveImageRGBA(const uchar4* const image,
 90 |                    const size_t numRows, const size_t numCols,
 91 |                    const std::string &output_file)
 92 | {
 93 |   int sizes[2];
 94 |   sizes[0] = numRows;
 95 |   sizes[1] = numCols;
 96 |   cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image);
 97 |   cv::Mat imageOutputBGR;
 98 |   cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR);
 99 |   //output the image
100 |   cv::imwrite(output_file.c_str(), imageOutputBGR);
101 | }
102 | 
103 | //output an exr file
104 | //assumed to already be BGR
105 | void saveImageHDR(const float* const image,
106 |                   const size_t numRows, const size_t numCols,
107 |                   const std::string &output_file)
108 | {
109 |   int sizes[2];
110 |   sizes[0] = numRows;
111 |   sizes[1] = numCols;
112 | 
113 |   cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image);
114 | 
115 |   imageHDR = imageHDR * 255;
116 | 
117 |   cv::imwrite(output_file.c_str(), imageHDR);
118 | }
119 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/loadSaveImage.h:
--------------------------------------------------------------------------------
 1 | #ifndef LOADSAVEIMAGE_H__
 2 | #define LOADSAVEIMAGE_H__
 3 | 
 4 | #include <string>
 5 | #include <cuda_runtime.h> //for uchar4
 6 | 
 7 | void loadImageHDR(const std::string &filename,
 8 |                   float **imagePtr,
 9 |                   size_t *numRows, size_t *numCols);
10 | 
11 | void loadImageRGBA(const std::string &filename,
12 |                    uchar4 **imagePtr,
13 |                    size_t *numRows, size_t *numCols);
14 | 
15 | void saveImageRGBA(const uchar4* const image,
16 |                    const size_t numRows, const size_t numCols,
17 |                    const std::string &output_file);
18 | 
19 | void saveImageHDR(const float* const image,
20 |                   const size_t numRows, const size_t numCols,
21 |                   const std::string &output_file);
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/main.cpp:
--------------------------------------------------------------------------------
  1 | //Udacity HW3 Driver
  2 | 
  3 | #include <iostream>
  4 | #include "timer.h"
  5 | #include "utils.h"
  6 | #include <string>
  7 | #include <stdio.h>
  8 | #include <algorithm>
  9 | 
 10 | #include "compare.h"
 11 | #include "reference_calc.h"
 12 | 
 13 | // Functions from HW3.cu
 14 | void preProcess(float **d_luminance, unsigned int **d_cdf,
 15 |                 size_t *numRows, size_t *numCols, unsigned int *numBins,
 16 |                 const std::string& filename);
 17 | 
 18 | void postProcess(const std::string& output_file, size_t numRows, size_t numCols,
 19 |                  float min_logLum, float max_logLum);
 20 | 
 21 | void cleanupGlobalMemory(void);
 22 | 
 23 | // Function from student_func.cu
 24 | void your_histogram_and_prefixsum(const float* const d_luminance,
 25 |                                   unsigned int* const d_cdf,
 26 |                                   float &min_logLum,
 27 |                                   float &max_logLum,
 28 |                                   const size_t numRows,
 29 |                                   const size_t numCols,
 30 |                                   const size_t numBins);
 31 | 
 32 | 
 33 | int main(int argc, char **argv) {
 34 |   float *d_luminance;
 35 |   unsigned int *d_cdf;
 36 | 
 37 |   size_t numRows, numCols;
 38 |   unsigned int numBins;
 39 | 
 40 |   std::string input_file;
 41 |   std::string output_file;
 42 |   std::string reference_file;
 43 |   double perPixelError = 0.0;
 44 |   double globalError   = 0.0;
 45 |   bool useEpsCheck = false;
 46 | 
 47 |   switch (argc)
 48 |   {
 49 | 	case 2:
 50 | 	  input_file = std::string(argv[1]);
 51 | 	  output_file = "HW3_output.png";
 52 | 	  reference_file = "HW3_reference.png";
 53 | 	  break;
 54 | 	case 3:
 55 | 	  input_file  = std::string(argv[1]);
 56 |       output_file = std::string(argv[2]);
 57 | 	  reference_file = "HW3_reference.png";
 58 | 	  break;
 59 | 	case 4:
 60 | 	  input_file  = std::string(argv[1]);
 61 |       output_file = std::string(argv[2]);
 62 | 	  reference_file = std::string(argv[3]);
 63 | 	  break;
 64 | 	case 6:
 65 | 	  useEpsCheck=true;
 66 | 	  input_file  = std::string(argv[1]);
 67 | 	  output_file = std::string(argv[2]);
 68 | 	  reference_file = std::string(argv[3]);
 69 | 	  perPixelError = atof(argv[4]);
 70 |       globalError   = atof(argv[5]);
 71 | 	  break;
 72 | 	default:
 73 |       std::cerr << "Usage: ./HW3 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
 74 |       exit(1);
 75 |   }
 76 |   //load the image and give us our input and output pointers
 77 |   preProcess(&d_luminance, &d_cdf,
 78 |              &numRows, &numCols, &numBins, input_file);
 79 | 
 80 |   GpuTimer timer;
 81 |   float min_logLum, max_logLum;
 82 |   min_logLum = 0.f;
 83 |   max_logLum = 1.f;
 84 |   timer.Start();
 85 |   //call the students' code
 86 |   your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum,
 87 |                                numRows, numCols, numBins);
 88 |   timer.Stop();
 89 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 90 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
 91 | 
 92 |   if (err < 0) {
 93 |     //Couldn't print! Probably the student closed stdout - bad news
 94 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
 95 |     exit(1);
 96 |   }
 97 | 
 98 |   float *h_luminance = (float *) malloc(sizeof(float)*numRows*numCols);
 99 |   unsigned int *h_cdf = (unsigned int *) malloc(sizeof(unsigned int)*numBins);
100 | 
101 |   checkCudaErrors(cudaMemcpy(h_luminance, d_luminance, numRows*numCols*sizeof(float), cudaMemcpyDeviceToHost));
102 | 
103 |   //check results and output the tone-mapped image
104 |   postProcess(output_file, numRows, numCols, min_logLum, max_logLum);
105 | 
106 |   for (size_t i = 1; i < numCols * numRows; ++i) {
107 | 	min_logLum = std::min(h_luminance[i], min_logLum);
108 |     max_logLum = std::max(h_luminance[i], max_logLum);
109 |   }
110 | 
111 |   referenceCalculation(h_luminance, h_cdf, numRows, numCols, numBins, min_logLum, max_logLum);
112 | 
113 |   checkCudaErrors(cudaMemcpy(d_cdf, h_cdf, sizeof(unsigned int) * numBins, cudaMemcpyHostToDevice));
114 | 
115 |   //check results and output the tone-mapped image
116 |   postProcess(reference_file, numRows, numCols, min_logLum, max_logLum);
117 | 
118 |   cleanupGlobalMemory();
119 | 
120 |   compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
121 | 
122 |   return 0;
123 | }
124 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial.exr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial.exr


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_large.exr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial_large.exr


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_png.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial_png.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_png_large.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial_png_large.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_raw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial_raw.png


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_raw_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial_raw_large.png


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/reference_calc.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <cassert>
 3 | 
 4 | void referenceCalculation(const float* const h_logLuminance, unsigned int* const h_cdf,
 5 |                           const size_t numRows, const size_t numCols, const size_t numBins, 
 6 | 						  float &logLumMin, float &logLumMax)
 7 | {
 8 |   logLumMin = h_logLuminance[0];
 9 |   logLumMax = h_logLuminance[0];
10 | 
11 |   //Step 1
12 |   //first we find the minimum and maximum across the entire image
13 |   for (size_t i = 1; i < numCols * numRows; ++i) {
14 |     logLumMin = std::min(h_logLuminance[i], logLumMin);
15 |     logLumMax = std::max(h_logLuminance[i], logLumMax);
16 |   }
17 | 
18 |   //Step 2
19 |   float logLumRange = logLumMax - logLumMin;
20 | 
21 |   //Step 3
22 |   //next we use the now known range to compute
23 |   //a histogram of numBins bins
24 |   unsigned int *histo = new unsigned int[numBins];
25 | 
26 |   for (size_t i = 0; i < numBins; ++i) histo[i] = 0;
27 | 
28 |   for (size_t i = 0; i < numCols * numRows; ++i) {
29 |     unsigned int bin = std::min(static_cast<unsigned int>(numBins - 1),
30 |                            static_cast<unsigned int>((h_logLuminance[i] - logLumMin) / logLumRange * numBins));
31 |     histo[bin]++;
32 |   }
33 | 
34 |   //Step 4
35 |   //finally we perform and exclusive scan (prefix sum)
36 |   //on the histogram to get the cumulative distribution
37 |   h_cdf[0] = 0;
38 |   for (size_t i = 1; i < numBins; ++i) {
39 |     h_cdf[i] = h_cdf[i - 1] + histo[i - 1];
40 |   }
41 | 
42 |   delete[] histo;
43 | }


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 | 
4 | void referenceCalculation(const float* const h_logLuminance, unsigned int* const h_cdf,
5 |                           const size_t numRows, const size_t numCols, const size_t numBins, 
6 | 						  float &logLumMin, float &logLumMax);
7 | 
8 | #endif
9 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <algorithm>
 7 | #include <cuda.h>
 8 | #include <cuda_runtime.h>
 9 | #include <cuda_runtime_api.h>
10 | #include <cassert>
11 | #include <cmath>
12 | 
13 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   if (err != cudaSuccess) {
18 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |     exit(1);
21 |   }
22 | }
23 | 
// Verifies that the first numElem entries of ref and gpu are identical.
// On the first mismatch the index and both values are printed to stderr and
// the process exits with code 1; otherwise the function simply returns.
template<typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
  // locate the first position where the GPU result departs from the CPU one
  size_t firstBad = numElem;
  for (size_t idx = 0; idx < numElem; ++idx) {
    if (ref[idx] != gpu[idx]) {
      firstBad = idx;
      break;
    }
  }

  if (firstBad == numElem) {
    return;  // everything matched
  }

  std::cerr << "Difference at pos " << firstBad << std::endl;
  //unary + promotes char-sized values so they print as numbers;
  //other arithmetic types are unaffected
  std::cerr << "Reference: " << std::setprecision(17) << +ref[firstBad] <<
             "\nGPU      : " << +gpu[firstBad] << std::endl;
  exit(1);
}
38 | 
// Tolerant comparison of two arrays.
//   eps1 - largest per-element absolute difference that is still accepted;
//          any element exceeding it aborts the process (exit code 1).
//   eps2 - largest accepted fraction (0..1) of elements that differ at all
//          (non-zero difference within eps1); exceeding it aborts as well.
// Returns normally when both criteria are met. An empty range always passes.
template<typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
  assert(eps1 >= 0 && eps2 >= 0);
  if (numElem == 0) {
    return;  // nothing to compare; avoids the 0/0 ratio below
  }

  size_t numSmallDifferences = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger so unsigned types cannot wrap around
    const T smaller = std::min(ref[i], gpu[i]);
    const T larger = std::max(ref[i], gpu[i]);
    const T diff = larger - smaller;

    if (diff > eps1) {
      std::cerr << "Difference at pos " << i << " exceeds tolerance of " << eps1 << std::endl;
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
        "\nGPU      : " << +gpu[i] << std::endl;
      exit(1);
    }
    if (diff > 0) {
      numSmallDifferences++;
    }
  }

  const double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
  if (percentSmallDifferences > eps2) {
    std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
    std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
    exit(1);
  }
}
67 | 
//Uses the autodesk method of image comparison
//Note that the tolerance here is in PIXELS, not a percentage of input pixels
//
// An element counts as "bad" when |ref - gpu| is larger than variance. If
// more than `tolerance` elements are bad, a message is printed to stderr and
// the process exits with code 1.
template<typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
  size_t badCount = 0;

  for (size_t idx = 0; idx < numElem; ++idx) {
    // larger minus smaller keeps unsigned element types from wrapping
    const T lo = std::min(ref[idx], gpu[idx]);
    const T hi = std::max(ref[idx], gpu[idx]);
    const T delta = hi - lo;
    badCount += (delta > variance) ? 1 : 0;
  }

  if (badCount <= tolerance) {
    return;
  }

  std::cerr << "Too many bad pixels in the image." << badCount << "/" << tolerance << std::endl;
  exit(1);
}
88 | 
89 | #endif
90 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # collect source files
 9 | 
10 | file( GLOB  hdr *.hpp *.h )
11 | file( GLOB  cu  *.cu)
12 | SET (HW4_files main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp)
13 | 
14 | CUDA_ADD_EXECUTABLE(HW4 ${HW4_files} ${hdr} ${img} ${cu})
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=/usr/local/cuda-5.0/bin/nvcc
 2 | #NVCC=nvcc
 3 | 
 4 | ###################################
 5 | # These are the default install   #
 6 | # locations on most linux distros #
 7 | ###################################
 8 | 
 9 | OPENCV_LIBPATH=/usr/lib
10 | OPENCV_INCLUDEPATH=/usr/include
11 | 
12 | ###################################################
13 | # On Macs the default install locations are below #
14 | ###################################################
15 | 
16 | #OPENCV_LIBPATH=/usr/local/lib
17 | #OPENCV_INCLUDEPATH=/usr/local/include
18 | 
19 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
20 | 
21 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
22 | # CUDA_INCLUDEPATH=/usr/local/cuda/lib64/include
23 | # CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
24 | # CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include
25 | 
26 | ######################################################
27 | # On Macs the default install locations are below    #
28 | # ####################################################
29 | 
30 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
31 | #CUDA_LIBPATH=/usr/local/cuda/lib
32 | CUDA_LIBPATH=/usr/local/cuda-5.0/lib64
33 | 
34 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
35 | 
36 | GCC_OPTS=-O3 -Wall -Wextra -m64
37 | 
38 | student: main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o Makefile
39 | 	$(NVCC) -o HW4 main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
40 | 
41 | main.o: main.cpp timer.h utils.h reference_calc.h
42 | 	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
43 | 
44 | HW4.o: HW4.cu loadSaveImage.h utils.h
45 | 	$(NVCC) -c HW4.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS)
46 | 
47 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h
48 | 	g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
49 | 
50 | compare.o: compare.cpp compare.h
51 | 	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
52 | 
53 | reference_calc.o: reference_calc.cpp reference_calc.h
54 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
55 | 
56 | student_func.o: student_func.cu reference_calc.cpp utils.h
57 | 	$(NVCC) -c student_func.cu $(NVCC_OPTS)
58 | 
59 | clean:
60 | 	rm -f *.o *.png hw
61 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/compare.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/opencv.hpp>
 2 | #include "utils.h"
 3 | 
 4 | 
 5 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
 6 | 				   double perPixelError, double globalError)
 7 | {
 8 |   cv::Mat reference = cv::imread(reference_filename, -1);
 9 |   cv::Mat test = cv::imread(test_filename, -1);
10 | 
11 |   cv::Mat diff = abs(reference - test);
12 | 
13 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
14 | 
15 |   double minVal, maxVal;
16 | 
17 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
18 | 
19 |   //now perform transform so that we bump values to the full range
20 | 
21 |   diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
22 | 
23 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
24 | 
25 |   cv::imwrite("HW4_differenceImage.png", diff);
26 |   //OK, now we can start comparing values...
27 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
28 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
29 | 
30 |   if (useEpsCheck) {
31 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
32 |   }
33 |   else
34 |   {
35 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
36 |   }
37 | 
38 |   std::cout << "PASS" << std::endl;
39 |   return;
40 | }


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/compare.h:
--------------------------------------------------------------------------------
#ifndef HW4_H__
#define HW4_H__

#include <string>  // std::string is used in the signature below

// Compares the image stored in test_filename against reference_filename.
// useEpsCheck selects a tolerant comparison driven by perPixelError and
// globalError; otherwise the images must match exactly.
void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
				   double perPixelError, double globalError);

#endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/loadSaveImage.cpp:
--------------------------------------------------------------------------------
  1 | #include <opencv2/core/core.hpp>
  2 | #include <opencv2/highgui/highgui.hpp>
  3 | #include <opencv2/opencv.hpp>
  4 | #include <vector>
  5 | #include "cuda_runtime.h"
  6 | 
  7 | //The caller becomes responsible for the returned pointer. This
  8 | //is done in the interest of keeping this code as simple as possible.
  9 | //In production code this is a bad idea - we should use RAII
 10 | //to ensure the memory is freed.  DO NOT COPY THIS AND USE IN PRODUCTION
 11 | //CODE!!!
 12 | void loadImageHDR(const std::string &filename,
 13 |                   float **imagePtr,
 14 |                   size_t *numRows, size_t *numCols)
 15 | {
 16 |   cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);
 17 |   if (image.empty()) {
 18 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 19 |     exit(1);
 20 |   }
 21 | 
 22 |   if (image.channels() != 3) {
 23 |     std::cerr << "Image must be color!" << std::endl;
 24 |     exit(1);
 25 |   }
 26 | 
 27 |   if (!image.isContinuous()) {
 28 |     std::cerr << "Image isn't continuous!" << std::endl;
 29 |     exit(1);
 30 |   }
 31 | 
 32 |   *imagePtr = new float[image.rows * image.cols * image.channels()];
 33 | 
 34 |   float *cvPtr = image.ptr<float>(0);
 35 |   for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i)
 36 |     (*imagePtr)[i] = cvPtr[i];
 37 | 
 38 |   *numRows = image.rows;
 39 |   *numCols = image.cols;
 40 | }
 41 | 
 42 | void loadImageRGBA(const std::string &filename,
 43 |                    uchar4 **imagePtr,
 44 |                    size_t *numRows, size_t *numCols)
 45 | {
 46 |   cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
 47 |   if (image.empty()) {
 48 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 49 |     exit(1);
 50 |   }
 51 | 
 52 |   if (image.channels() != 3) {
 53 |     std::cerr << "Image must be color!" << std::endl;
 54 |     exit(1);
 55 |   }
 56 | 
 57 |   if (!image.isContinuous()) {
 58 |     std::cerr << "Image isn't continuous!" << std::endl;
 59 |     exit(1);
 60 |   }
 61 | 
 62 |   cv::Mat imageRGBA;
 63 |   cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
 64 | 
 65 |   *imagePtr = new uchar4[image.rows * image.cols];
 66 | 
 67 |   unsigned char *cvPtr = imageRGBA.ptr<unsigned char>(0);
 68 |   for (size_t i = 0; i < image.rows * image.cols; ++i) {
 69 |     (*imagePtr)[i].x = cvPtr[4 * i + 0];
 70 |     (*imagePtr)[i].y = cvPtr[4 * i + 1];
 71 |     (*imagePtr)[i].z = cvPtr[4 * i + 2];
 72 |     (*imagePtr)[i].w = cvPtr[4 * i + 3];
 73 |   }
 74 | 
 75 |   *numRows = image.rows;
 76 |   *numCols = image.cols;
 77 | }
 78 | 
 79 | void saveImageRGBA(const uchar4* const image,
 80 |                    const size_t numRows, const size_t numCols,
 81 |                    const std::string &output_file)
 82 | {
 83 |   int sizes[2];
 84 |   sizes[0] = numRows;
 85 |   sizes[1] = numCols;
 86 |   cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image);
 87 |   cv::Mat imageOutputBGR;
 88 |   cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR);
 89 |   //output the image
 90 |   cv::imwrite(output_file.c_str(), imageOutputBGR);
 91 | }
 92 | 
 93 | //output an exr file
 94 | //assumed to already be BGR
 95 | void saveImageHDR(const float* const image,
 96 |                   const size_t numRows, const size_t numCols,
 97 |                   const std::string &output_file)
 98 | {
 99 |   int sizes[2];
100 |   sizes[0] = numRows;
101 |   sizes[1] = numCols;
102 | 
103 |   cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image);
104 | 
105 |   imageHDR = imageHDR * 255;
106 | 
107 |   cv::imwrite(output_file.c_str(), imageHDR);
108 | }
109 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/loadSaveImage.h:
--------------------------------------------------------------------------------
#ifndef LOADSAVEIMAGE_H__
#define LOADSAVEIMAGE_H__

#include <string>
#include <cuda_runtime.h> //for uchar4

// Loads a 3-channel float image. *imagePtr receives a new[]-allocated array
// of rows*cols*3 floats owned by the caller; the dimensions are returned
// through numRows / numCols. Exits the process on failure.
void loadImageHDR(const std::string &filename,
                  float **imagePtr,
                  size_t *numRows, size_t *numCols);

// Loads a colour image as RGBA pixels. *imagePtr receives a new[]-allocated
// array of rows*cols uchar4 values owned by the caller; the dimensions are
// returned through numRows / numCols. Exits the process on failure.
void loadImageRGBA(const std::string &filename,
                   uchar4 **imagePtr,
                   size_t *numRows, size_t *numCols);

// Converts numRows x numCols RGBA pixels to BGR and writes them to
// output_file.
void saveImageRGBA(const uchar4* const image,
                   const size_t numRows, const size_t numCols,
                   const std::string &output_file);

// Writes a numRows x numCols 3-channel float image, scaled by 255, to
// output_file.
void saveImageHDR(const float* const image,
                  const size_t numRows, const size_t numCols,
                  const std::string &output_file);

#endif
24 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/main.cpp:
--------------------------------------------------------------------------------
  1 | //Udacity HW4 Driver
  2 | 
  3 | #include <iostream>
  4 | #include "timer.h"
  5 | #include "utils.h"
  6 | #include <string>
  7 | #include <stdio.h>
  8 | #include <thrust/host_vector.h>
  9 | #include <thrust/device_vector.h>
 10 | 
 11 | #include "compare.h"
 12 | #include "reference_calc.h"
 13 | 
 14 | void preProcess(unsigned int **inputVals,
 15 |                 unsigned int **inputPos,
 16 |                 unsigned int **outputVals,
 17 |                 unsigned int **outputPos,
 18 |                 size_t &numElems,
 19 |                 const std::string& filename,
 20 | 				const std::string& template_file);
 21 | 
 22 | void postProcess(const unsigned int* const outputVals,
 23 |                  const unsigned int* const outputPos,
 24 |                  const size_t numElems,
 25 |                  const std::string& output_file);
 26 | 
 27 | void your_sort(unsigned int* const inputVals,
 28 |                unsigned int* const inputPos,
 29 |                unsigned int* const outputVals,
 30 |                unsigned int* const outputPos,
 31 |                const size_t numElems);
 32 | 
 33 | void PrintFullPath(char * partialPath)
 34 | {
 35 |   char full[_MAX_PATH];
 36 |   if (_fullpath(full, partialPath, _MAX_PATH) != NULL)
 37 |     printf("Full path is: %s\n", full);
 38 |   else
 39 |     printf("Invalid path\n");
 40 | }
 41 | 
 42 | int main(int argc, char **argv) {
 43 |   unsigned int *inputVals;
 44 |   unsigned int *inputPos;
 45 |   unsigned int *outputVals;
 46 |   unsigned int *outputPos;
 47 | 
 48 |   size_t numElems = 4;
 49 |   PrintFullPath(".\\");
 50 |   std::string input_file;
 51 |   std::string template_file;
 52 |   std::string output_file;
 53 |   std::string reference_file;
 54 |   double perPixelError = 0.0;
 55 |   double globalError   = 0.0;
 56 |   bool useEpsCheck = false;
 57 | 
 58 |   switch (argc)
 59 |   {
 60 | 	case 3:
 61 | 	  input_file  = std::string(argv[1]);
 62 |       template_file = std::string(argv[2]);
 63 | 	  output_file = "HW4_output.png";
 64 | 	  break;
 65 | 	case 4:
 66 | 	  input_file  = std::string(argv[1]);
 67 |       template_file = std::string(argv[2]);
 68 | 	  output_file = std::string(argv[3]);
 69 | 	  break;
 70 | 	default:
 71 |           std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl;
 72 |           exit(1);
 73 |   }
 74 |   //load the image and give us our input and output pointers
 75 |   preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems, input_file, template_file);
 76 | 
 77 |   /*
 78 |   // Use small array to Debug
 79 |   checkCudaErrors(cudaMalloc(&inputVals, sizeof(unsigned int)* numElems));
 80 |   checkCudaErrors(cudaMalloc(&inputPos, sizeof(unsigned int)* numElems));
 81 |   checkCudaErrors(cudaMalloc(&outputVals, sizeof(unsigned int)* numElems));
 82 |   checkCudaErrors(cudaMalloc(&outputPos, sizeof(unsigned int)* numElems));
 83 |   unsigned int ll[4] = { 0, 5, 2, 7 };
 84 |   thrust::host_vector<unsigned int> h_v(ll, ll+4);
 85 |   printf("%d %d %d %d\n", h_v[0], h_v[1], h_v[2], h_v[3]);
 86 |   thrust::device_vector<unsigned int> d_v = h_v;
 87 |   cudaMemcpy(inputVals, thrust::raw_pointer_cast(d_v.data()), sizeof(unsigned int)* numElems, cudaMemcpyDeviceToDevice);
 88 |   cudaMemcpy(inputPos, thrust::raw_pointer_cast(d_v.data()), sizeof(unsigned int)* numElems, cudaMemcpyDeviceToDevice);
 89 |   */
 90 | 
 91 |   GpuTimer timer;
 92 |   timer.Start();
 93 | 
 94 |   thrust::device_ptr<unsigned int> d_inputVals(inputVals);
 95 |   thrust::device_ptr<unsigned int> d_inputPos(inputPos);
 96 | 
 97 |   thrust::host_vector<unsigned int> h_inputVals(d_inputVals,
 98 |     d_inputVals + numElems);
 99 |   thrust::host_vector<unsigned int> h_inputPos(d_inputPos,
100 |     d_inputPos + numElems);
101 | 
102 |   //call the students' code
103 |   your_sort(inputVals, inputPos, outputVals, outputPos, numElems);
104 | 
105 |   timer.Stop();
106 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
107 |   printf("\n");
108 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
109 | 
110 |   if (err < 0) {
111 |     //Couldn't print! Probably the student closed stdout - bad news
112 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
113 |     exit(1);
114 |   }
115 | 
116 |   // TODO: something wrong with the function postProcess??
117 |   //check results and output the red-eye corrected image
118 |   //postProcess(outputVals, outputPos, numElems, output_file);
119 | 
120 |   // check code moved from HW4.cu
121 |   /****************************************************************************
122 |   * You can use the code below to help with debugging, but make sure to       *
123 |   * comment it out again before submitting your assignment for grading,       *
124 |   * otherwise this code will take too much time and make it seem like your    *
125 |   * GPU implementation isn't fast enough.                                     *
126 |   *                                                                           *
127 |   * This code MUST RUN BEFORE YOUR CODE in case you accidentally change       *
128 |   * the input values when implementing your radix sort.                       *
129 |   *                                                                           *
130 |   * This code performs the reference radix sort on the host and compares your *
131 |   * sorted values to the reference.                                           *
132 |   *                                                                           *
133 |   * Thrust containers are used for copying memory from the GPU                *
134 |   * ************************************************************************* */
135 |   ;
136 | 
137 |   thrust::host_vector<unsigned int> h_outputVals(numElems);
138 |   thrust::host_vector<unsigned int> h_outputPos(numElems);
139 | 
140 |   reference_calculation(&h_inputVals[0], &h_inputPos[0],
141 | 						&h_outputVals[0], &h_outputPos[0],
142 | 						numElems);
143 | 
144 |   //postProcess(valsPtr, posPtr, numElems, reference_file);
145 | 
146 |   //compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
147 | 
148 |   thrust::device_ptr<unsigned int> d_outputVals(outputVals);
149 |   thrust::device_ptr<unsigned int> d_outputPos(outputPos);
150 | 
151 |   thrust::host_vector<unsigned int> h_yourOutputVals(d_outputVals,
152 |                                                      d_outputVals + numElems);
153 |   thrust::host_vector<unsigned int> h_yourOutputPos(d_outputPos,
154 |                                                     d_outputPos + numElems);
155 | 
156 |   checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems);
157 |   checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems);
158 | 
159 |   checkCudaErrors(cudaFree(inputVals));
160 |   checkCudaErrors(cudaFree(inputPos));
161 |   checkCudaErrors(cudaFree(outputVals));
162 |   checkCudaErrors(cudaFree(outputPos));
163 | 
164 |   return 0;
165 | }
166 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 4/red_eye_effect.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 4/red_eye_effect_5.jpg


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect_template_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 4/red_eye_effect_template_5.jpg


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/reference_calc.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | // For memset
 3 | #include <cstring>
 4 | 
 5 | void reference_calculation(unsigned int* inputVals,
 6 |                            unsigned int* inputPos,
 7 |                            unsigned int* outputVals,
 8 |                            unsigned int* outputPos,
 9 |                            const size_t numElems)
10 | {
11 |   const int numBits = 1;
12 |   const int numBins = 1 << numBits;
13 | 
14 |   unsigned int *binHistogram = new unsigned int[numBins];
15 |   unsigned int *binScan      = new unsigned int[numBins];
16 | 
17 |   unsigned int *vals_src = inputVals;
18 |   unsigned int *pos_src  = inputPos;
19 | 
20 |   unsigned int *vals_dst = outputVals;
21 |   unsigned int *pos_dst  = outputPos;
22 | 
23 |   //a simple radix sort - only guaranteed to work for numBits that are multiples of 2
24 |   for (unsigned int i = 0; i < 8 * sizeof(unsigned int); i += numBits) {
25 |     unsigned int mask = (numBins - 1) << i;
26 | 
27 |     memset(binHistogram, 0, sizeof(unsigned int) * numBins); //zero out the bins
28 |     memset(binScan, 0, sizeof(unsigned int) * numBins); //zero out the bins
29 | 
30 |     //perform histogram of data & mask into bins
31 |     for (unsigned int j = 0; j < numElems; ++j) {
32 |       unsigned int bin = (vals_src[j] & mask) >> i;
33 |       binHistogram[bin]++;
34 |     }
35 | 
36 |     //perform exclusive prefix sum (scan) on binHistogram to get starting
37 |     //location for each bin
38 |     for (unsigned int j = 1; j < numBins; ++j) {
39 |       binScan[j] = binScan[j - 1] + binHistogram[j - 1];
40 |     }
41 | 
42 |     //Gather everything into the correct location
43 |     //need to move vals and positions
44 |     for (unsigned int j = 0; j < numElems; ++j) {
45 |       unsigned int bin = (vals_src[j] & mask) >> i;
46 |       vals_dst[binScan[bin]] = vals_src[j];
47 |       pos_dst[binScan[bin]]  = pos_src[j];
48 |       binScan[bin]++;
49 |     }
50 | 
51 |     //swap the buffers (pointers only)
52 |     std::swap(vals_dst, vals_src);
53 |     std::swap(pos_dst, pos_src);
54 |   }
55 | 
56 |   //we did an even number of iterations, need to copy from input buffer into output
57 |   std::copy(inputVals, inputVals + numElems, outputVals);
58 |   std::copy(inputPos, inputPos + numElems, outputPos);
59 | 
60 |   delete[] binHistogram;
61 |   delete[] binScan;
62 | }
63 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/reference_calc.h:
--------------------------------------------------------------------------------
 1 | #ifndef REFERENCE_H__
 2 | #define REFERENCE_H__
 3 | 
 4 | 
 5 | //A simple un-optimized reference radix sort calculation
 6 | //Only deals with power-of-2 radices
 7 | 
 8 | 
 9 | void reference_calculation(unsigned int* inputVals,
10 |                            unsigned int* inputPos,
11 |                            unsigned int* outputVals,
12 |                            unsigned int* outputPos,
13 |                            const size_t numElems);
14 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/student_func.cu:
--------------------------------------------------------------------------------
  1 | //Udacity HW 4
  2 | //Radix Sorting
  3 | 
  4 | #include "utils.h"
  5 | #include <thrust/host_vector.h>
  6 | #include <thrust/device_vector.h>
  7 | #include <device_launch_parameters.h>
  8 | #include <device_functions.h>
  9 | #include <thrust/sort.h>
 10 | 
 11 | /* Red Eye Removal
 12 | ===============
 13 | 
 14 | For this assignment we are implementing red eye removal.  This is
 15 | accomplished by first creating a score for every pixel that tells us how
 16 | likely it is to be a red eye pixel.  We have already done this for you - you
 17 | are receiving the scores and need to sort them in ascending order so that we
 18 | know which pixels to alter to remove the red eye.
 19 | 
 20 | Note: ascending order == smallest to largest
 21 | 
 22 | Each score is associated with a position, when you sort the scores, you must
 23 | also move the positions accordingly.
 24 | 
 25 | Implementing Parallel Radix Sort with CUDA
 26 | ==========================================
 27 | 
 28 | The basic idea is to construct a histogram on each pass of how many of each
 29 | "digit" there are.   Then we scan this histogram so that we know where to put
 30 | the output of each digit.  For example, the first 1 must come after all the
 31 | 0s so we have to know how many 0s there are to be able to start moving 1s
 32 | into the correct position.
 33 | 
 34 | 1) Histogram of the number of occurrences of each digit
 35 | 2) Exclusive Prefix Sum of Histogram
 36 | 3) Determine relative offset of each digit
 37 | For example [0 0 1 1 0 0 1]
 38 | ->  [0 1 4 5 2 3 6]
 39 | 4) Combine the results of steps 2 & 3 to determine the final
 40 | output location for each element and move it there
 41 | 
 42 | LSB Radix sort is an out-of-place sort and you will need to ping-pong values
 43 | between the input and output buffers we have provided.  Make sure the final
 44 | sorted results end up in the output buffer!  Hint: You may need to do a copy
 45 | at the end.
 46 | 
 47 | */
 48 | 
 49 | //#define USE_THRUST
 50 | 
 51 | __global__ void print_kernel(unsigned int *d_out)
 52 | {
 53 |   printf("%d ", d_out[threadIdx.x]);
 54 | }
 55 | 
 56 | 
 57 | __global__ void histo_kernel(unsigned int * d_out, unsigned int* const d_in,
 58 |   unsigned int shift, const unsigned int numElems)
 59 | {
 60 |   unsigned int mask = 1 << shift;
 61 |   int myId = threadIdx.x + blockDim.x * blockIdx.x;
 62 |   if (myId >= numElems)  return;
 63 |   int bin = (d_in[myId] & mask) >> shift;
 64 |   atomicAdd(&d_out[bin], 1);
 65 | }
 66 | 
 67 | // Blelloch Scan - described in lecture
 68 | __global__ void sumscan_kernel(unsigned int * d_in, const size_t numBins, const unsigned int numElems)
 69 | {
 70 |   int myId = threadIdx.x;
 71 |   if (myId >= numElems)  return;
 72 |   extern __shared__ float sdata[];
 73 |   sdata[myId] = d_in[myId];
 74 |   __syncthreads();            // make sure entire block is loaded!
 75 | 
 76 |   for (int d = 1; d < numBins; d *= 2) {
 77 |     if (myId >= d) {
 78 |       sdata[myId] += sdata[myId - d];
 79 |     }
 80 |     __syncthreads();
 81 |   }
 82 |   if (myId == 0)  d_in[0] = 0;
 83 |   else  d_in[myId] = sdata[myId - 1]; //inclusive->exclusive
 84 | }
 85 | 
 86 | __global__ void makescan_kernel(unsigned int * d_in, unsigned int *d_scan,
 87 |   unsigned int shift, const unsigned int numElems)
 88 | {
 89 |   unsigned int mask = 1 << shift;
 90 |   int myId = threadIdx.x + blockDim.x * blockIdx.x;
 91 |   if (myId >= numElems)  return;
 92 |   d_scan[myId] = ((d_in[myId] & mask) >> shift) ? 0 : 1;
 93 | }
 94 | 
 95 | __global__ void move_kernel(unsigned int* const d_inputVals,
 96 |   unsigned int* const d_inputPos,
 97 |   unsigned int* const d_outputVals,
 98 |   unsigned int* const d_outputPos,
 99 |   const unsigned int numElems,
100 |   unsigned int* const d_histogram,
101 |   unsigned int* const d_scaned,
102 |   unsigned int shift)
103 | {
104 |   unsigned int mask = 1 << shift;
105 |   int myId = threadIdx.x + blockDim.x * blockIdx.x;
106 |   if (myId >= numElems)  return;
107 |   // Important! 
108 |   // Algorithm described in 7.4 of http://wykvictor.github.io/2016/04/03/Cuda-2.html 
109 |   int des_id = 0;
110 |   if ((d_inputVals[myId] & mask) >> shift) {
111 |     des_id = myId + d_histogram[1] - d_scaned[myId];
112 |   } else {
113 |     des_id = d_scaned[myId];
114 |   }
115 |   d_outputVals[des_id] = d_inputVals[myId];
116 |   d_outputPos[des_id] = d_inputPos[myId];
117 | }
118 | 
119 | #ifdef USE_THRUST
// Thrust-based variant (built when USE_THRUST is defined).
// Copies keys and positions to the host, sorts them by key there and uploads
// the result into d_outputVals / d_outputPos. The input buffers are not
// modified.
void your_sort(unsigned int* const d_inputVals,
  unsigned int* const d_inputPos,
  unsigned int* const d_outputVals,
  unsigned int* const d_outputPos,
  const size_t numElems)
{
  // Thrust pointers wrapping the raw GPU data
  thrust::device_ptr<unsigned int> d_inputVals_p(d_inputVals);
  thrust::device_ptr<unsigned int> d_inputPos_p(d_inputPos);

  // device -> host copies
  thrust::host_vector<unsigned int> h_inputVals_vec(d_inputVals_p,
    d_inputVals_p + numElems);
  thrust::host_vector<unsigned int> h_inputPos_vec(d_inputPos_p,
    d_inputPos_p + numElems);

  // The result is compared element by element against a stable reference
  // sort, so equal keys must keep their relative order: use the stable
  // variant instead of sort_by_key.
  thrust::stable_sort_by_key(h_inputVals_vec.begin(), h_inputVals_vec.end(), h_inputPos_vec.begin());

  checkCudaErrors(cudaMemcpy(d_outputVals, thrust::raw_pointer_cast(&h_inputVals_vec[0]),
    numElems * sizeof(unsigned int), cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_outputPos, thrust::raw_pointer_cast(&h_inputPos_vec[0]),
    numElems * sizeof(unsigned int), cudaMemcpyHostToDevice));
}
140 | #else
/**
 * Sorts (value, position) pairs by value with a least-significant-bit-first
 * split sort that examines one bit per pass.
 *
 * Each of the 32 passes:
 *   1. histograms the current bit (histo_kernel),
 *   2. scans the histogram (sumscan_kernel),
 *   3. builds the "bit is 0" flags (makescan_kernel) and exclusive-scans them,
 *   4. scatters the pairs into the output buffers (move_kernel),
 *   5. copies the output buffers back over the input buffers.
 *
 * Because of step 5 the input buffers are overwritten; after the final pass
 * both the input and the output buffers hold the sorted data.
 *
 * @param d_inputVals  device keys (overwritten)
 * @param d_inputPos   device payload (overwritten)
 * @param d_outputVals device buffer receiving the sorted keys
 * @param d_outputPos  device buffer receiving the permuted payload
 * @param numElems     number of elements in each buffer
 */
void your_sort(unsigned int* const d_inputVals,
  unsigned int* const d_inputPos,
  unsigned int* const d_outputVals,
  unsigned int* const d_outputPos,
  const size_t numElems)
{
  // Nothing to sort; this also avoids launching kernels with a zero-block grid.
  if (numElems == 0)  return;

  // Bits examined per pass. move_kernel performs a two-way split on a single
  // bit, so this has to stay 1 unless the kernels are rewritten.
  const int numBits = 1;
  const int numBins = 1 << numBits;
  const int m = 1 << 10;  // threads per block
  // Integer ceiling division. The previous ceil((float)numElems / m) loses
  // precision once numElems exceeds 2^24 and could drop the last block.
  const int blocks = (int)((numElems + m - 1) / m);
  printf("m %d blocks %d\n", m ,blocks);

  // Per-pass histogram of the current bit (numBins entries).
  unsigned int *d_binHistogram;
  checkCudaErrors(cudaMalloc(&d_binHistogram, sizeof(unsigned int)* numBins));
  // One flag / prefix-sum entry per element (not per bin, unlike the CPU version).
  thrust::device_vector<unsigned int> d_scan(numElems);
  unsigned int* const d_scanRaw = thrust::raw_pointer_cast(&d_scan[0]);

  // One pass per bit of the key, least significant bit first.
  for (unsigned int i = 0; i < 8 * sizeof(unsigned int); i++) {
    checkCudaErrors(cudaMemset(d_binHistogram, 0, sizeof(unsigned int)* numBins));

    // 1) histogram of bit i over all keys
    histo_kernel<<<blocks, m>>>(d_binHistogram, d_inputVals, i, numElems);
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaGetLastError());

    // 2) scan the histogram to get the starting location of each bin
    sumscan_kernel<<<1, numBins, sizeof(unsigned int)* numBins>>>(d_binHistogram, numBins, numElems);
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaGetLastError());

    // 3) flag the elements whose bit i is 0 ...
    makescan_kernel<<<blocks, m>>>(d_inputVals, d_scanRaw, i, numElems);
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaGetLastError());

    // ... and turn the flags into per-element offsets.
    // See http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
    thrust::exclusive_scan(d_scan.begin(), d_scan.end(), d_scan.begin());
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaGetLastError());

    // 4) move both values and positions to their destination
    move_kernel<<<blocks, m>>>(d_inputVals, d_inputPos, d_outputVals, d_outputPos,
      numElems, d_binHistogram, d_scanRaw, i);
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaGetLastError());

    // 5) the output of this pass is the input of the next one
    checkCudaErrors(cudaMemcpy(d_inputVals, d_outputVals, numElems * sizeof(unsigned int), cudaMemcpyDeviceToDevice));
    checkCudaErrors(cudaMemcpy(d_inputPos, d_outputPos, numElems * sizeof(unsigned int), cudaMemcpyDeviceToDevice));
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaGetLastError());
  }

  // Free memory
  checkCudaErrors(cudaFree(d_binHistogram));
}
209 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | #include <algorithm>
12 | 
13 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   if (err != cudaSuccess) {
18 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |     exit(1);
21 |   }
22 | }
23 | 
// Compares two arrays element by element. On the first mismatch the position
// and both values are written to stderr and the process exits with status 1;
// when every element matches, "Pass: Same" is printed to stdout.
template<typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
  size_t pos = 0;
  while (pos < numElem) {
    if (ref[pos] != gpu[pos]) {
      std::cerr << "Difference at pos " << pos << std::endl;
      // Unary plus promotes char-sized values so they print as numbers
      // while leaving other types untouched.
      std::cerr << "Reference: " << std::setprecision(17) << +ref[pos] <<
                 "\nGPU      : " << +gpu[pos] << std::endl;
      exit(1);
    }
    ++pos;
  }
  std::cout << "Pass: Same" << std::endl;
}
39 | 
// Tolerant comparison of two arrays.
//
//   eps1 - largest absolute per-element difference that is accepted; any
//          element differing by more aborts the process with status 1.
//   eps2 - largest accepted fraction (0..1) of elements that differ at all
//          (by a non-zero amount not exceeding eps1); a larger fraction
//          aborts the process with status 1.
//
// Returns normally when both limits are respected.
template<typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
  assert(eps1 >= 0 && eps2 >= 0);
  unsigned numSmallDifferences = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger in case of unsigned types
    T smaller = std::min(ref[i], gpu[i]);
    T larger = std::max(ref[i], gpu[i]);
    T diff = larger - smaller;
    if (diff > 0 && diff <= eps1) {
      numSmallDifferences++;
    }
    else if (diff > eps1) {
      std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
        "\nGPU      : " << +gpu[i] << std::endl;
      exit(1);
    }
    // The former `totalDiff += diff * diff` accumulator was never read and
    // forced a lossy conversion for floating-point T, so it has been removed.
  }
  // An empty input has no differing elements; avoid computing 0/0.
  const double percentSmallDifferences =
    (numElem == 0) ? 0.0 : (double)numSmallDifferences / (double)numElem;
  if (percentSmallDifferences > eps2) {
    std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
    std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
    exit(1);
  }
}
68 | 
// Image comparison following the "autodesk" method: an element is "bad" when
// the absolute difference between reference and GPU value exceeds `variance`.
// Note that `tolerance` is an absolute count of bad elements, not a
// percentage of the input. Exits with status 1 when the count is exceeded.
template<typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
  size_t numBadPixels = 0;
  size_t idx = 0;
  while (idx < numElem) {
    // larger - smaller keeps the subtraction valid for unsigned types
    const T hi = std::max(ref[idx], gpu[idx]);
    const T lo = std::min(ref[idx], gpu[idx]);
    const T delta = hi - lo;
    numBadPixels += (delta > variance) ? 1 : 0;
    ++idx;
  }

  if (numBadPixels <= tolerance) {
    return;
  }

  std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl;
  exit(1);
}
89 | 
90 | #endif
91 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # collect source files
 9 | 
# Gather the header files that sit next to this CMakeLists.txt.
file(GLOB hdr *.hpp *.h)

# Source files of the HW5 executable.
set(HW5_files
  main.cu
  student.cu
  reference_calc.cpp
)

# Build HW5 from the sources and the headers gathered above.
cuda_add_executable(HW5 ${HW5_files} ${hdr})
15 | 
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=nvcc
 2 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
 3 | 
 4 | histo: main.cu reference_calc.o student.o Makefile
 5 | 	nvcc -o HW5 main.cu reference_calc.o student.o $(NVCC_OPTS)
 6 | 
 7 | student.o: student.cu
 8 | 	nvcc -c student.cu $(NVCC_OPTS)
 9 | 
10 | reference_calc.o: reference_calc.cpp reference_calc.h
11 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
12 | 
13 | clean:
14 | 	rm -f *.o hw *.bin
15 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/main.cu:
--------------------------------------------------------------------------------
  1 | #include <cstdlib>
  2 | #include <iostream>
  3 | #include <cstdio>
  4 | #include <fstream>
  5 | #include "utils.h"
  6 | #include "timer.h"
  7 | #include <cstdio>
  8 | #if defined(_WIN16) || defined(_WIN32) || defined(_WIN64)
  9 | #include <Windows.h>
 10 | #else
 11 | #include <sys/time.h>
 12 | #endif
 13 | 
 14 | #include <thrust/random/linear_congruential_engine.h>
 15 | #include <thrust/random/normal_distribution.h>
 16 | #include <thrust/random/uniform_int_distribution.h>
 17 | 
 18 | #include "reference_calc.h"
 19 | 
// Histogram routine under test; main() below calls it once per value of
// `types` and compares the contents of d_histo with the CPU reference.
//   d_vals   - device array of numElems input values
//   d_histo  - device array of numBins counters
//   numBins  - number of histogram bins
//   numElems - number of input values
//   types    - integer forwarded from main()'s test loop
void computeHistogram(const unsigned int *const d_vals,
                      unsigned int* const d_histo,
                      const unsigned int numBins,
                      const unsigned int numElems,
                      int types);
 25 | 
 26 | int main(void)
 27 | {
 28 |   const unsigned int numBins = 1024;
 29 |   const unsigned int numElems = 10000 * numBins;
 30 |   const float stddev = 100.f;
 31 | 
 32 |   unsigned int *vals = new unsigned int[numElems];
 33 |   unsigned int *h_vals = new unsigned int[numElems];
 34 |   unsigned int *h_studentHisto = new unsigned int[numBins];
 35 |   unsigned int *h_refHisto = new unsigned int[numBins];
 36 | 
 37 | #if defined(_WIN16) || defined(_WIN32) || defined(_WIN64)
 38 |   srand(GetTickCount());
 39 | #else
 40 |   timeval tv;
 41 |   gettimeofday(&tv, NULL);
 42 | 
 43 |   srand(tv.tv_usec);
 44 | #endif
 45 | 
 46 |   //make the mean unpredictable, but close enough to the middle
 47 |   //so that timings are unaffected
 48 |   unsigned int mean = rand() % 100 + 462;
 49 | 
 50 |   //Output mean so that grading can happen with the same inputs
 51 |   std::cout << mean << std::endl;
 52 | 
 53 |   thrust::minstd_rand rng;
 54 | 
 55 |   thrust::random::normal_distribution<float> normalDist((float)mean, stddev);
 56 | 
 57 |   // Generate the random values
 58 |   for (size_t i = 0; i < numElems; ++i) {
 59 |     vals[i] = std::min((unsigned int) std::max((int)normalDist(rng), 0), numBins - 1);
 60 |   }
 61 | 
 62 |   unsigned int *d_vals, *d_histo;
 63 | 
 64 |   //generate reference for the given mean
 65 |   reference_calculation(vals, h_refHisto, numBins, numElems);
 66 | 
 67 |   for (int i = 0; i < 3; i++) { // test different implenmentations
 68 |     GpuTimer timer;
 69 | 
 70 |     checkCudaErrors(cudaMalloc(&d_vals, sizeof(unsigned int)* numElems));
 71 |     checkCudaErrors(cudaMalloc(&d_histo, sizeof(unsigned int)* numBins));
 72 |     checkCudaErrors(cudaMemset(d_histo, 0, sizeof(unsigned int)* numBins));
 73 | 
 74 |     checkCudaErrors(cudaMemcpy(d_vals, vals, sizeof(unsigned int)* numElems, cudaMemcpyHostToDevice));
 75 | 
 76 |     timer.Start();
 77 |     computeHistogram(d_vals, d_histo, numBins, numElems, i);
 78 |     timer.Stop();
 79 |     int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
 80 | 
 81 |     if (err < 0) {
 82 |       //Couldn't print! Probably the student closed stdout - bad news
 83 |       std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
 84 |       exit(1);
 85 |     }
 86 | 
 87 |     // copy the student-computed histogram back to the host
 88 |     checkCudaErrors(cudaMemcpy(h_studentHisto, d_histo, sizeof(unsigned int)* numBins, cudaMemcpyDeviceToHost));
 89 | 
 90 |     //Now do the comparison
 91 |     checkResultsExact(h_refHisto, h_studentHisto, numBins);
 92 |   }
 93 | 
 94 |   delete[] h_vals;
 95 |   delete[] h_refHisto;
 96 |   delete[] h_studentHisto;
 97 | 
 98 |   cudaFree(d_vals);
 99 |   cudaFree(d_histo);
100 | 
101 |   return 0;
102 | }
103 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/reference_calc.cpp:
--------------------------------------------------------------------------------
 1 | #include <cstdlib>
 2 | //Reference Histogram calculation
 3 | 
 4 | void reference_calculation(const unsigned int* const vals,
 5 |                            unsigned int* const histo,
 6 |                            const size_t numBins,
 7 |                            const size_t numElems)
 8 | 
 9 | {
10 |   //zero out bins
11 |   for (size_t i = 0; i < numBins; ++i)
12 |     histo[i] = 0;
13 | 
14 |   //go through vals and increment appropriate bin
15 |   for (size_t i = 0; i < numElems; ++i)
16 |     histo[vals[i]]++;
17 | }
18 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/reference_calc.h:
--------------------------------------------------------------------------------
 1 | #ifndef REFERENCE_H__
 2 | #define REFERENCE_H__
 3 | 
 4 | //Reference Histogram calculation
 5 | 
// Computes the histogram of `vals` on the CPU.
//   vals     - input values
//   histo    - output counters
//   numBins  - number of counters in histo
//   numElems - number of entries in vals
void reference_calculation(const unsigned int* const vals,
                           unsigned int* const histo,
                           const size_t numBins,
                           const size_t numElems);
10 | 
11 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/student.cu:
--------------------------------------------------------------------------------
 1 | /* Udacity HW5
 2 |    Histogramming for Speed
 3 | 
 4 |    The goal of this assignment is compute a histogram
 5 |    as fast as possible.  We have simplified the problem as much as
 6 |    possible to allow you to focus solely on the histogramming algorithm.
 7 | 
 8 |    The input values that you need to histogram are already the exact
 9 |    bins that need to be updated.  This is unlike in HW3 where you needed
10 |    to compute the range of the data and then do:
11 |    bin = (val - valMin) / valRange to determine the bin.
12 | 
13 |    Here the bin is just:
14 |    bin = val
15 | 
16 |    so the serial histogram calculation looks like:
17 |    for (i = 0; i < numElems; ++i)
18 |      histo[val[i]]++;
19 | 
20 |    That's it!  Your job is to make it run as fast as possible!
21 | 
22 |    The values are normally distributed - you may take
23 |    advantage of this fact in your implementation.
24 | 
25 | */
26 | 
27 | 
28 | #include "utils.h"
29 | #include <thrust/host_vector.h>
30 | #include <thrust/device_vector.h>
31 | #include <thrust/sort.h>
32 | #include <device_launch_parameters.h>
33 | #include <device_functions.h>
34 | 
// Straightforward histogram kernel: each thread reads one input value and
// atomically increments the counter of the bin that value names.
__global__
void atomic_kernel(const unsigned int* const d_vals, //INPUT
               unsigned int* const d_histo,      //OUTPUT
               const unsigned int numElems)
{
  const int myId = blockIdx.x * blockDim.x + threadIdx.x;
  if (myId < numElems) {
    // the value itself is the bin index
    const unsigned int bin = d_vals[myId];
    atomicAdd(d_histo + bin, 1);
  }
}
44 | 
/**
 * Computes the histogram of d_vals into d_histo on the GPU.
 *
 * @param d_vals   device array of numElems values; each value is used
 *                 directly as a bin index
 * @param d_histo  device array of counters that the kernel increments; it is
 *                 not cleared here
 * @param numBins  number of bins (not used by the implementations below)
 * @param numElems number of input values
 * @param types    implementation selector: 0 = one atomicAdd per element;
 *                 1 and 3 are placeholders that do nothing yet; every other
 *                 value also leaves d_histo untouched
 */
void computeHistogram(const unsigned int* const d_vals, //INPUT
                      unsigned int* const d_histo,      //OUTPUT
                      const unsigned int numBins,
                      const unsigned int numElems, int types)
{
  (void)numBins;

  // No input: nothing to count, and a zero-block launch would be rejected.
  if (numElems == 0) return;

  const unsigned int m = 1 << 10;  // threads per block
  // Integer ceiling division (overflow-free). The previous
  // ceil((float)numElems / m) loses precision above 2^24 elements.
  const unsigned int blocks = numElems / m + ((numElems % m != 0) ? 1 : 0);

  switch (types){
  case 0:
    atomic_kernel<<<blocks, m>>>(d_vals, d_histo, numElems);

    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaGetLastError());
    break;

  case 1:
    // TODO: sort + reduce_by_key implementation with thrust, see
    // https://www.ecse.rpi.edu/~wrf/wiki/ParallelComputingSpring2014/thrust/histogram.cu
    break;

  case 3:
    // TODO: not implemented yet.
    break;
  }
}
76 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | #include <algorithm>
12 | 
13 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
14 | 
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   if (err != cudaSuccess) {
18 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |     exit(1);
21 |   }
22 | }
23 | 
// Verifies that the GPU result equals the reference element by element.
// The first mismatch is reported on stderr (position and both values) and
// ends the process with status 1; otherwise the function returns silently.
template<typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
  size_t pos = 0;
  while (pos < numElem) {
    if (ref[pos] != gpu[pos]) {
      std::cerr << "Difference at pos " << pos << std::endl;
      // Unary plus promotes char-sized values so they print as numbers
      // without affecting other types.
      std::cerr << "Reference: " << std::setprecision(17) << +ref[pos] <<
                 "\nGPU      : " << +gpu[pos] << std::endl;
      exit(1);
    }
    ++pos;
  }
}
38 | 
// Tolerant comparison of two arrays.
//
//   eps1 - largest absolute per-element difference that is accepted; an
//          element differing by more ends the process with status 1.
//   eps2 - largest accepted fraction (0..1) of elements that differ by a
//          non-zero amount not exceeding eps1; a larger fraction ends the
//          process with status 1.
//
// Returns normally when both limits hold.
template<typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
  assert(eps1 >= 0 && eps2 >= 0);
  unsigned numSmallDifferences = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger in case of unsigned types
    T smaller = std::min(ref[i], gpu[i]);
    T larger = std::max(ref[i], gpu[i]);
    T diff = larger - smaller;
    if (diff > 0 && diff <= eps1) {
      numSmallDifferences++;
    }
    else if (diff > eps1) {
      std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
        "\nGPU      : " << +gpu[i] << std::endl;
      exit(1);
    }
    // The unused `totalDiff += diff * diff` accumulator was removed: it was
    // never read and forced a lossy conversion for floating-point T.
  }
  // An empty input has no differing elements; avoid computing 0/0.
  const double percentSmallDifferences =
    (numElem == 0) ? 0.0 : (double)numSmallDifferences / (double)numElem;
  if (percentSmallDifferences > eps2) {
    std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
    std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
    exit(1);
  }
}
67 | 
// Image comparison following the "autodesk" method: an element counts as bad
// when the absolute difference between reference and GPU value is larger
// than `variance`. Note that `tolerance` is an absolute number of bad
// elements, not a percentage of the input. Exceeding it ends the process
// with status 1.
template<typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
  size_t numBadPixels = 0;
  size_t idx = 0;
  while (idx < numElem) {
    // larger - smaller keeps the subtraction valid for unsigned types
    const T hi = std::max(ref[idx], gpu[idx]);
    const T lo = std::min(ref[idx], gpu[idx]);
    const T delta = hi - lo;
    numBadPixels += (delta > variance) ? 1 : 0;
    ++idx;
  }

  if (numBadPixels <= tolerance) {
    return;
  }

  std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl;
  exit(1);
}
88 | 
89 | #endif
90 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # collect source files
 9 | 
# Gather the header files that sit next to this CMakeLists.txt.
file(GLOB hdr *.hpp *.h)

# Source files of the HW6 executable.
set(HW6_files
  student_func.cu
  HW6.cu
  main.cpp
  loadSaveImage.cpp
  reference_calc.cpp
  compare.cpp
)

# Build HW6 from the sources and the headers gathered above.
cuda_add_executable(HW6 ${HW6_files} ${hdr})
15 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/HW6.cu:
--------------------------------------------------------------------------------
 1 | #include "utils.h"
 2 | #include <cuda.h>
 3 | #include <cuda_runtime.h>
 4 | #include <string>
 5 | #include <iostream>
 6 | 
 7 | #include "loadSaveImage.h"
 8 | #include <stdio.h>
 9 | 
10 | 
11 | //return types are void since any internal error will be handled by quitting
12 | //no point in returning error codes...
// Loads the source and destination images and prepares the output buffer.
//
//   sourceImg       - receives the buffer allocated by loadImageRGBA for
//                     source_filename
//   numRows/numCols - receive the (shared) image dimensions
//   destImg         - receives the buffer allocated by loadImageRGBA for
//                     dest_filename
//   blendedImg      - receives a new[]-allocated host buffer of
//                     numRows * numCols pixels for the blended result
//
// Both images must have identical dimensions; otherwise an error is printed
// and the process exits with status 1. The caller owns *blendedImg.
void preProcess( uchar4 **sourceImg,
                 size_t &numRows,  size_t &numCols,
                 uchar4 **destImg, 
                 uchar4 **blendedImg, const std::string& source_filename,
                 const std::string& dest_filename){

  //make sure the context initializes ok
  checkCudaErrors(cudaFree(0));

  size_t numRowsSource, numColsSource, numRowsDest, numColsDest;

  loadImageRGBA(source_filename, sourceImg, &numRowsSource, &numColsSource);
  loadImageRGBA(dest_filename, destImg, &numRowsDest, &numColsDest);

  // Explicit check instead of assert(): asserts are compiled out when NDEBUG
  // is defined, which would let mismatched images through unnoticed.
  if (numRowsSource != numRowsDest || numColsSource != numColsDest) {
    std::cerr << "Source and destination images must have the same dimensions!" << std::endl;
    exit(1);
  }

  numRows = numRowsSource;
  numCols = numColsSource;

  *blendedImg = new uchar4[numRows * numCols];

}
36 | 
// Writes the blended image to disk.
//   blendedImg  - pixel data handed to saveImageRGBA
//   numRowsDest - image height in pixels
//   numColsDest - image width in pixels
//   output_file - path of the file to write
void postProcess(const uchar4* const blendedImg,
                 const size_t numRowsDest, const size_t numColsDest,
                 const std::string& output_file)
{
  //just need to save the image...
  saveImageRGBA(blendedImg, numRowsDest, numColsDest, output_file);
}
44 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=/usr/local/cuda-5.0/bin/nvcc
 2 | #NVCC=nvcc
 3 | 
 4 | ###################################
 5 | # These are the default install   #
 6 | # locations on most linux distros #
 7 | ###################################
 8 | 
 9 | OPENCV_LIBPATH=/usr/lib
10 | OPENCV_INCLUDEPATH=/usr/include
11 | 
12 | ###################################################
13 | # On Macs the default install locations are below #
14 | ###################################################
15 | 
16 | #OPENCV_LIBPATH=/usr/local/lib
17 | #OPENCV_INCLUDEPATH=/usr/local/include
18 | 
19 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
20 | 
21 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
22 | # CUDA_INCLUDEPATH=/usr/local/cuda/lib64/include
23 | # CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
24 | # CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include
25 | 
26 | ######################################################
27 | # On Macs the default install locations are below    #
28 | # ####################################################
29 | 
30 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
31 | #CUDA_LIBPATH=/usr/local/cuda/lib
32 | CUDA_LIBPATH=/usr/local/cuda-5.0/lib64
33 | 
34 | #no warnings otherwise thrust explodes output
35 | 
36 | NVCC_OPTS=-O3 -arch=sm_20 -m64
37 | 
38 | GCC_OPTS=-O3 -m64
39 | 
40 | student: main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o Makefile
41 | 	$(NVCC) -o HW6 main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
42 | 
43 | main.o: main.cpp timer.h utils.h
44 | 	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
45 | 
46 | HW6.o: HW6.cu loadSaveImage.h utils.h
47 | 	$(NVCC) -c HW6.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS)
48 | 
49 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h
50 | 	g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
51 | 
52 | student_func.o: student_func.cu reference_calc.cpp utils.h
53 | 	$(NVCC) -c student_func.cu $(NVCC_OPTS)
54 | 
55 | compare.o: compare.cpp compare.h
56 | 	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
57 | 
58 | reference_calc.o: reference_calc.cpp reference_calc.h
59 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
60 | 
61 | clean:
62 | 	rm -f *.o hw
63 | 	find . -type f -name '*.png' | grep -v source.png | grep -v destination.png | xargs rm -f
64 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/blended.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 6/blended.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/compare.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/opencv.hpp>
 2 | #include "utils.h"
 3 | 
 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
 5 | 				   double perPixelError, double globalError)
 6 | {
 7 |   cv::Mat reference = cv::imread(reference_filename, -1);
 8 |   cv::Mat test = cv::imread(test_filename, -1);
 9 | 
10 |   cv::Mat diff = abs(reference - test);
11 | 
12 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
13 | 
14 |   double minVal, maxVal;
15 | 
16 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
17 | 
18 |   //now perform transform so that we bump values to the full range
19 | 
20 |   diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
21 | 
22 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
23 | 
24 |   cv::imwrite("HW6_differenceImage.png", diff);
25 |   //OK, now we can start comparing values...
26 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
27 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
28 | 
29 |   if (useEpsCheck) {
30 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
31 |   }
32 |   else
33 |   {
34 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
35 |   }
36 | 
37 |   std::cout << "PASS" << std::endl;
38 |   return;
39 | }
40 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef HW3_H__
2 | #define HW3_H__
3 | 
// Compares the image stored in test_filename against reference_filename.
//   useEpsCheck   - selects the tolerant comparison instead of the exact one
//   perPixelError - per-element tolerance used by the tolerant comparison
//   globalError   - global tolerance used by the tolerant comparison
void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
				   double perPixelError, double globalError);
6 | 
7 | #endif
8 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/destination.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 6/destination.png


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/loadSaveImage.cpp:
--------------------------------------------------------------------------------
  1 | #include <opencv2/core/core.hpp>
  2 | #include <opencv2/highgui/highgui.hpp>
  3 | #include <opencv2/opencv.hpp>
  4 | #include <vector>
  5 | #include "cuda_runtime.h"
  6 | 
  7 | //Loads a 3-channel 32-bit float image from `filename` into a new[]-allocated
  8 | //buffer stored in *imagePtr and writes its size to *numRows / *numCols; exits
  9 | //on any failure. The caller becomes responsible for the returned pointer
 10 | //(delete[]). In production code this is a bad idea - we should use RAII to
 11 | //ensure the memory is freed.  DO NOT COPY THIS AND USE IN PRODUCTION CODE!!!
 12 | void loadImageHDR(const std::string &filename,
 13 |                   float **imagePtr,
 14 |                   size_t *numRows, size_t *numCols)
 15 | {
 16 |   cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);
 17 |   if (image.empty()) {
 18 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 19 |     exit(1);
 20 |   }
 21 |   if (image.channels() != 3) {
 22 |     std::cerr << "Image must be color!" << std::endl;
 23 |     exit(1);
 24 |   }
 25 |   if (image.depth() != CV_32F) {  //ANYDEPTH keeps the file's depth; ptr<float> below needs floats
 26 |     std::cerr << "Image must be 32-bit floating point!" << std::endl;
 27 |     exit(1);
 28 |   }
 29 |   if (!image.isContinuous()) {
 30 |     std::cerr << "Image isn't continuous!" << std::endl;
 31 |     exit(1);
 32 |   }
 33 |   const size_t numElems = static_cast<size_t>(image.rows) * image.cols * image.channels();
 34 |   *imagePtr = new float[numElems];  //ownership passes to the caller
 35 |   const float *cvPtr = image.ptr<float>(0);
 36 |   for (size_t i = 0; i < numElems; ++i)
 37 |     (*imagePtr)[i] = cvPtr[i];
 38 |   *numRows = image.rows;
 39 |   *numCols = image.cols;
 40 | }
 41 | 
 42 | void loadImageGrey(const std::string &filename,
 43 |                    unsigned char **imagePtr,
 44 |                    size_t *numRows, size_t *numCols)
 45 | {
 46 |   //Request a greyscale decode; an empty Mat means the file could not be read.
 47 |   cv::Mat grey = cv::imread(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE);
 48 |   if (grey.empty()) {
 49 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 50 |     exit(1);
 51 |   }
 52 |   if (grey.channels() != 1) {
 53 |     std::cerr << "Image must be greyscale!" << std::endl;
 54 |     exit(1);
 55 |   }
 56 |   if (!grey.isContinuous()) {
 57 |     std::cerr << "Image isn't continuous!" << std::endl;
 58 |     exit(1);
 59 |   }
 60 | 
 61 |   //Copy the pixels into a buffer the caller owns (release with delete[]).
 62 |   const int pixelCount = grey.rows * grey.cols;
 63 |   unsigned char *out = new unsigned char[pixelCount];
 64 |   const unsigned char *in = grey.ptr<unsigned char>(0);
 65 |   for (int p = 0; p < pixelCount; ++p)
 66 |     out[p] = in[p];
 67 | 
 68 |   *imagePtr = out;
 69 |   *numRows = grey.rows;
 70 |   *numCols = grey.cols;
 71 | }
 72 | void loadImageRGBA(const std::string &filename,
 73 |                    uchar4 **imagePtr,
 74 |                    size_t *numRows, size_t *numCols)
 75 | {
 76 |   //Read as a colour image; each check below exits on unusable input.
 77 |   cv::Mat bgr = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
 78 |   if (bgr.empty()) {
 79 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 80 |     exit(1);
 81 |   }
 82 |   if (bgr.channels() != 3) {
 83 |     std::cerr << "Image must be color!" << std::endl;
 84 |     exit(1);
 85 |   }
 86 |   if (!bgr.isContinuous()) {
 87 |     std::cerr << "Image isn't continuous!" << std::endl;
 88 |     exit(1);
 89 |   }
 90 |   //Reorder the channels and add alpha so every pixel is four consecutive bytes.
 91 |   cv::Mat rgba;
 92 |   cv::cvtColor(bgr, rgba, CV_BGR2RGBA);
 93 |   //Repack into uchar4; the caller owns the buffer (release with delete[]).
 94 |   const int pixelCount = bgr.rows * bgr.cols;
 95 |   uchar4 *out = new uchar4[pixelCount];
 96 |   const unsigned char *px = rgba.ptr<unsigned char>(0);
 97 |   for (int p = 0; p < pixelCount; ++p, px += 4) {
 98 |     out[p].x = px[0];
 99 |     out[p].y = px[1];
100 |     out[p].z = px[2];
101 |     out[p].w = px[3];
102 |   }
103 | 
104 |   *imagePtr = out;
105 |   *numRows = bgr.rows;
106 |   *numCols = bgr.cols;
107 | }
108 | 
109 | void saveImageRGBA(const uchar4* const image,
110 |                    const size_t numRows, const size_t numCols,
111 |                    const std::string &output_file)
112 | {
113 |   //Wrap the caller's pixels in a Mat header (no copy); the cast drops const for cv::Mat.
114 |   const int dims[2] = { static_cast<int>(numRows),
115 |                         static_cast<int>(numCols) };
116 |   cv::Mat rgbaView(2, dims, CV_8UC4, (void *)image);
117 |   //Convert to BGR channel order in a separate Mat, then write that one out.
118 |   cv::Mat bgr;
119 |   cv::cvtColor(rgbaView, bgr, CV_RGBA2BGR);
120 |   cv::imwrite(output_file.c_str(), bgr);
121 | }
122 | 
123 | //Writes a 3-channel float image (assumed to already be BGR) to output_file,
124 | //e.g. an exr file, with every value multiplied by 255.
125 | void saveImageHDR(const float* const image,
126 |                   const size_t numRows, const size_t numCols,
127 |                   const std::string &output_file)
128 | {
129 |   int sizes[2];
130 |   sizes[0] = numRows;
131 |   sizes[1] = numCols;
132 |   //Header over the caller's buffer - no copy is made, so never write through it
133 |   cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image);
134 |   //Scale into a NEW matrix: assigning the product back to imageHDR would
135 |   //overwrite the caller's const pixels in place.
136 |   cv::Mat scaled = imageHDR * 255;
137 |   cv::imwrite(output_file.c_str(), scaled);
138 | }
139 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/loadSaveImage.h:
--------------------------------------------------------------------------------
 1 | #ifndef LOADSAVEIMAGE_H__
 2 | #define LOADSAVEIMAGE_H__
 3 | 
 4 | #include <string>
 5 | #include <cuda_runtime.h> //for uchar4
 6 | //Reads a 3-channel image as floats into a new[] buffer returned via *imagePtr.
 7 | void loadImageHDR(const std::string &filename,
 8 |                   float **imagePtr,
 9 |                   size_t *numRows, size_t *numCols);
10 | //Reads a colour image, converts BGR to RGBA, and returns new[] uchar4 pixels.
11 | void loadImageRGBA(const std::string &filename,
12 |                    uchar4 **imagePtr,
13 |                    size_t *numRows, size_t *numCols);
14 | //Reads an image as greyscale into a new[] unsigned char buffer.
15 | void loadImageGrey(const std::string &filename,
16 |                    unsigned char **imagePtr,
17 |                    size_t *numRows, size_t *numCols);
18 | //Converts the RGBA pixels to BGR and writes them to output_file.
19 | void saveImageRGBA(const uchar4* const image,
20 |                    const size_t numRows, const size_t numCols,
21 |                    const std::string &output_file);
22 | //Writes a 3-channel float image (assumed BGR), scaled by 255, to output_file.
23 | void saveImageHDR(const float* const image,
24 |                   const size_t numRows, const size_t numCols,
25 |                   const std::string &output_file);
26 | 
27 | #endif
28 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/main.cpp:
--------------------------------------------------------------------------------
  1 | //Udacity HW6 Driver
  2 | 
  3 | #include <iostream>
  4 | #include "timer.h"
  5 | #include "utils.h"
  6 | #include <string>
  7 | #include <stdio.h>
  8 | 
  9 | #include <opencv2/core/core.hpp>
 10 | #include <opencv2/highgui/highgui.hpp>
 11 | #include <opencv2/opencv.hpp>
 12 | 
 13 | #include "reference_calc.h"
 14 | #include "compare.h"
 15 | 
 16 | void preProcess( uchar4 **sourceImg, size_t &numRowsSource,  size_t &numColsSource,
 17 |                  uchar4 **destImg,
 18 |                  uchar4 **blendedImg, const std::string& source_filename,
 19 |                  const std::string& dest_filename);
 20 | 
 21 | void postProcess(const uchar4* const blendedImg,
 22 |                  const size_t numRowsDest, const size_t numColsDest,
 23 |                  const std::string& output_file);
 24 | 
 25 | void your_blend(const uchar4* const sourceImg,
 26 |                 const size_t numRowsSource, const size_t numColsSource,
 27 |                 const uchar4* const destImg,
 28 |                 uchar4* const blendedImg);
 29 | 
 30 | int main(int argc, char **argv) {
 31 |   //Accepted forms: 2, 3 or 4 file arguments, or all 4 files plus both tolerances.
 32 |   const bool validArgCount = (argc == 3 || argc == 4 || argc == 5 || argc == 7);
 33 |   if (!validArgCount) {
 34 |     std::cerr << "Usage: ./HW6 input_source_file input_dest_filename [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
 35 |     exit(1);
 36 |   }
 37 | 
 38 |   //Positional arguments; the output and reference names fall back to defaults.
 39 |   const std::string input_source_file(argv[1]);
 40 |   const std::string input_dest_file(argv[2]);
 41 |   const std::string output_file    = (argc > 3) ? std::string(argv[3]) : std::string("HW6_output.png");
 42 |   const std::string reference_file = (argc > 4) ? std::string(argv[4]) : std::string("HW6_reference.png");
 43 | 
 44 |   //The epsilon comparison is enabled only when both tolerances are supplied.
 45 |   const bool useEpsCheck = (argc == 7);
 46 |   const double perPixelError = useEpsCheck ? atof(argv[5]) : 0.0;
 47 |   const double globalError   = useEpsCheck ? atof(argv[6]) : 0.0;
 48 | 
 49 |   //load the images and give us our input and output pointers
 50 |   uchar4 *h_sourceImg, *h_destImg, *h_blendedImg;
 51 |   size_t numRowsSource, numColsSource;
 52 |   preProcess(&h_sourceImg, numRowsSource, numColsSource,
 53 |              &h_destImg, &h_blendedImg,
 54 |              input_source_file, input_dest_file);
 55 | 
 56 |   //time the students' code
 57 |   GpuTimer timer;
 58 |   timer.Start();
 59 |   your_blend(h_sourceImg, numRowsSource, numColsSource, h_destImg, h_blendedImg);
 60 |   timer.Stop();
 61 |   cudaDeviceSynchronize();
 62 |   checkCudaErrors(cudaGetLastError());
 63 | 
 64 |   const int printed = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
 65 |   printf("\n");
 66 |   if (printed < 0) {
 67 |     //Couldn't print! Probably the student closed stdout - bad news
 68 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
 69 |     exit(1);
 70 |   }
 71 | 
 72 |   //pass the student's result to postProcess along with the output file name
 73 |   postProcess(h_blendedImg, numRowsSource, numColsSource, output_file);
 74 | 
 75 |   //calculate the reference image and hand it to postProcess as well
 76 |   uchar4 *h_reference = new uchar4[numRowsSource * numColsSource];
 77 |   reference_calc(h_sourceImg, numRowsSource, numColsSource, h_destImg, h_reference);
 78 |   postProcess(h_reference, numRowsSource, numColsSource, reference_file);
 79 | 
 80 |   //compare the reference file against the output file
 81 |   compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
 82 | 
 83 |   delete[] h_reference;
 84 |   delete[] h_destImg;
 85 |   delete[] h_sourceImg;
 86 |   delete[] h_blendedImg;
 87 |   return 0;
 88 | }
 89 | 
 90 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/reference_calc.h:
--------------------------------------------------------------------------------
 1 | #ifndef REFERENCE_H__
 2 | #define REFERENCE_H__
 3 | #include <cuda_runtime.h> //for uchar4, so this header compiles on its own
 4 | 
 5 | void reference_calc(const uchar4* const h_sourceImg,
 6 |                     const size_t numRowsSource, const size_t numColsSource,
 7 |                     const uchar4* const h_destImg, uchar4* const h_blendedImg);
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/source.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 6/source.png


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/student_func.cu:
--------------------------------------------------------------------------------
  1 | //Udacity HW 6
  2 | //Poisson Blending
  3 | 
  4 | /* Background
  5 |    ==========
  6 | 
  7 |    The goal for this assignment is to take one image (the source) and
  8 |    paste it into another image (the destination) attempting to match the
  9 |    two images so that the pasting is non-obvious. This is
 10 |    known as a "seamless clone".
 11 | 
 12 |    The basic ideas are as follows:
 13 | 
 14 |    1) Figure out the interior and border of the source image
 15 |    2) Use the values of the border pixels in the destination image 
 16 |       as boundary conditions for solving a Poisson equation that tells
 17 |       us how to blend the images.
 18 |    
 19 |       No pixels from the destination except pixels on the border
 20 |       are used to compute the match.
 21 | 
 22 |    Solving the Poisson Equation
 23 |    ============================
 24 | 
 25 |    There are multiple ways to solve this equation - we choose an iterative
 26 |    method - specifically the Jacobi method. Iterative methods start with
 27 |    a guess of the solution and then iterate to try and improve the guess
 28 |    until it stops changing.  If the problem was well-suited for the method
 29 |    then it will stop and where it stops will be the solution.
 30 | 
 31 |    The Jacobi method is the simplest iterative method and converges slowly - 
 32 |    that is we need a lot of iterations to get to the answer, but it is the
 33 |    easiest method to write.
 34 | 
 35 |    Jacobi Iterations
 36 |    =================
 37 | 
 38 |    Our initial guess is going to be the source image itself.  This is a pretty
 39 |    good guess for what the blended image will look like and it means that
 40 |    we won't have to do as many iterations compared to if we had started far
 41 |    from the final solution.
 42 | 
 43 |    ImageGuess_prev (Floating point)
 44 |    ImageGuess_next (Floating point)
 45 | 
 46 |    DestinationImg
 47 |    SourceImg
 48 | 
 49 |    Follow these steps to implement one iteration:
 50 | 
 51 |    1) For every pixel p in the interior, compute two sums over the four neighboring pixels:
 52 |       Sum1: If the neighbor is in the interior then += ImageGuess_prev[neighbor]
 53 |              else if the neighbor is on the border then += DestinationImg[neighbor]
 54 | 
 55 |       Sum2: += SourceImg[p] - SourceImg[neighbor]   (for all four neighbors)
 56 | 
 57 |    2) Calculate the new pixel value:
 58 |       float newVal= (Sum1 + Sum2) / 4.f  <------ Notice that the result is FLOATING POINT
 59 |       ImageGuess_next[p] = min(255, max(0, newVal)); //clamp to [0, 255]
 60 | 
 61 | 
 62 |     In this assignment we will do 800 iterations.
 63 |    */
 64 | 
 65 | 
 66 | 
 67 | #include "utils.h"
 68 | #include <thrust/host_vector.h>
 69 | 
 70 | void your_blend(const uchar4* const h_sourceImg,  //IN
 71 |                 const size_t numRowsSource, const size_t numColsSource,
 72 |                 const uchar4* const h_destImg, //IN
 73 |                 uchar4* const h_blendedImg) //OUT
 74 | {
 75 | 
 76 |   /* To Recap here are the steps you need to implement
 77 |   
 78 |      1) Compute a mask of the pixels from the source image to be copied
 79 |         The pixels that shouldn't be copied are completely white, they
 80 |         have R=255, G=255, B=255.  Any other pixels SHOULD be copied.
 81 | 
 82 |      2) Compute the interior and border regions of the mask.  An interior
 83 |         pixel has all 4 neighbors also inside the mask.  A border pixel is
 84 |         in the mask itself, but has at least one neighbor that isn't.
 85 | 
 86 |      3) Separate out the incoming image into three separate channels
 87 | 
 88 |      4) Create two float(!) buffers for each color channel that will
 89 |         act as our guesses.  Initialize them to the respective color
 90 |         channel of the source image since that will act as our initial guess.
 91 | 
 92 |      5) For each color channel perform the Jacobi iteration described 
 93 |         above 800 times.
 94 | 
 95 |      6) Create the output image by replacing all the interior pixels
 96 |         in the destination image with the result of the Jacobi iterations.
 97 |         Just cast the floating point values to unsigned chars since we have
 98 |         already made sure to clamp them to the correct range.
 99 | 
100 |       Since this is the final assignment we provide little boilerplate code to
101 |       help you.  Notice that all the input/output pointers are HOST pointers.
102 | 
103 |       You will have to allocate all of your own GPU memory and perform your own
104 |       memcopies to get data in and out of the GPU memory.
105 | 
106 |       Remember to wrap all of your calls with checkCudaErrors() to catch
107 |       anything that might go wrong.  After each kernel call do:
108 | 
109 |       cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
110 | 
111 |       to catch any errors that happened while executing the kernel.
112 |   */
113 | }
114 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | //Times GPU work with a pair of CUDA events; create, Start(), Stop(), then Elapsed().
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 |   //Records the start event (second argument 0 = default stream).
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 |   //Records the stop event (second argument 0 = default stream).
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 |   //Waits for the stop event, then returns the start-to-stop time (ms per the CUDA docs).
33 |   float Elapsed()
34 |   {
35 |     float elapsed = 0.0f;  //defined result even if the CUDA calls below fail
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | #include <algorithm>
12 | 
13 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
14 | //Exits with the call site and CUDA error string when err is not cudaSuccess.
15 | template<typename T>
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 |   //Success is the common case: leave before printing anything.
18 |   if (err == cudaSuccess) return;
19 |   std::cerr << "CUDA error at: " << file << ":" << line << std::endl
20 |             << cudaGetErrorString(err) << " " << func << std::endl;
21 |   exit(1);
22 | }
23 | //Exits with a diagnostic at the first index where ref and gpu differ.
24 | template<typename T>
25 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
26 |   //check that the GPU result matches the CPU result
27 |   size_t pos = 0;
28 |   while (pos < numElem && !(ref[pos] != gpu[pos]))
29 |     ++pos;
30 |   if (pos == numElem)
31 |     return;
32 |   std::cerr << "Difference at pos " << pos << std::endl;
33 |   //unary + promotes char-sized values to int so they print as numbers
34 |   std::cerr << "Reference: " << std::setprecision(17) << +ref[pos] <<
35 |              "\nGPU      : " << +gpu[pos] << std::endl;
36 |   exit(1);
37 | }
38 | //Element-wise tolerance check. Exits if any absolute difference exceeds eps1,
39 | //or if the FRACTION (0..1, not percent) of elements with a non-zero difference
40 | //exceeds eps2.
41 | template<typename T>
42 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
43 |   assert(eps1 >= 0 && eps2 >= 0);
44 |   unsigned numSmallDifferences = 0;
45 |   for (size_t i = 0; i < numElem; ++i) {
46 |     //subtract smaller from larger in case of unsigned types
47 |     T smaller = std::min(ref[i], gpu[i]);
48 |     T larger = std::max(ref[i], gpu[i]);
49 |     T diff = larger - smaller;
50 |     if (diff > 0 && diff <= eps1) {
51 |       numSmallDifferences++;
52 |     }
53 |     else if (diff > eps1) {
54 |       std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
55 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
56 |         "\nGPU      : " << +gpu[i] << std::endl;
57 |       exit(1);
58 |     }
59 |   }
60 |   double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
61 |   if (percentSmallDifferences > eps2) {
62 |     std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
63 |     std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
64 |     exit(1);
65 |   }
66 | }
67 | 
68 | //Autodesk-style image comparison: an element is "bad" when it differs from the
69 | //reference by more than variance. Note that tolerance is a COUNT of bad
70 | //elements, not a percentage of the input.
71 | template<typename T>
72 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
73 | {
74 |   size_t badCount = 0;
75 |   for (const T *r = ref, *g = gpu; r != ref + numElem; ++r, ++g) {
76 |     //order the operands so the subtraction cannot wrap for unsigned T
77 |     const T hi = std::max(*r, *g);
78 |     const T lo = std::min(*r, *g);
79 |     const T delta = hi - lo;
80 |     badCount += (delta > variance) ? 1 : 0;
81 |   }
82 |   if (badCount <= tolerance)
83 |     return;
84 | 
85 |   std::cerr << "Too many bad pixels in the image." << badCount << "/" << tolerance << std::endl;
86 |   exit(1);
87 | }
88 | 
89 | #endif
90 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ### Solutions for class: [Introduction to Parallel Programming](https://www.udacity.com/course/intro-to-parallel-programming--cs344)
 2 | 
 3 | #### Building on Windows Visual Studio
 4 | ##### Prerequisites
 5 | * Install Visual Studio 2013:
 6 | 	
 7 | 	**Note**: `Visual Studio Express` and `Visual Studio 2015` are not supported! (I tried them, but they did not work ^_^)
 8 | 
 9 | 	[Nvidia reference](http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-microsoft-windows/index.html#axzz44vwAc5Qx)
10 | 
11 | * Install Cuda 7.5:
12 | 	Also refer to the link above. [download](https://developer.nvidia.com/cuda-downloads)
13 | 
14 | * Install CMake:
15 | 	The latest version is OK. [download](https://cmake.org/) 
16 | 
17 | * Install OpenCV:
18 | 	I installed 2.4.12, other versions should also work. [download](http://opencv.org/)
19 | 	* Run the EXE to extract the files. This EXE does not have an installer. Instead, you put your files where you want, and then add an environment variable
20 | 	* Add an environment variable named "OpenCV_DIR" (no quotes) that points to the "build" subfolder of the folder where you extracted the files. (The exact folder you need will have one very important file in it: OpenCVConfig.cmake - this tells CMake which variables to set for you.)
21 | 	* Add the directory containing the OpenCV binary DLLs to the Windows PATH (for example f:/software/opencv/build/x86/vc12/bin).
22 | 
23 | ##### Compile the solution
24 | ```
25 | git clone https://github.com/wykvictor/cs344.git
26 | cd cs344
27 | mkdir build
28 | cd build
29 | cmake ..
30 | ```
31 | 
32 | **Done!** Just use Visual Studio to open the project-solution in dir build/ and compile everything.
33 | 
34 | =======
35 | ### Original README.md forked from [udacity/cs344](https://github.com/udacity/cs344)
36 | 
37 | ##### Introduction to Parallel Programming class code
38 | 
39 | #### Building on OS X
40 | 
41 | These instructions are for OS X 10.9 "Mavericks".
42 | 
43 | * Step 1. Build and install OpenCV. The best way to do this is with
44 | Homebrew. However, you must slightly alter the Homebrew OpenCV
45 | installation; you must build it with libstdc++ (instead of the default
46 | libc++) so that it will properly link against the nVidia CUDA dev kit. 
47 | [This entry in the Udacity discussion forums](http://forums.udacity.com/questions/100132476/cuda-55-opencv-247-os-x-maverick-it-doesnt-work) describes exactly how to build a compatible OpenCV.
48 | 
49 | * Step 2. You can now create 10.9-compatible makefiles, which will allow you to
50 | build and run your homework on your own machine:
51 | ```
52 | mkdir build
53 | cd build
54 | cmake ..
55 | make
56 | ```
57 | 
58 | 


--------------------------------------------------------------------------------