├── .gitignore ├── CMakeLists.txt ├── Final ├── batcher │ ├── batcher.cu │ ├── compare.h │ └── gputimer.h ├── smooth │ ├── compare.h │ ├── gputimer.h │ └── smooth.cu └── warpreduce │ ├── part_a │ ├── compare.h │ ├── gputimer.h │ └── warpreduce.cu │ └── part_b │ ├── compare.h │ ├── gputimer.h │ └── warpreduce.cu ├── Lesson Code Snippets ├── Lesson 2 Code Snippets │ ├── CMakeLists.txt │ ├── associative.cu │ ├── atomics.cu │ ├── gputimer.h │ ├── hello.cu │ ├── hello_blockIdx.cu │ ├── hello_threadIdx.cu │ └── memory.cu ├── Lesson 3 Code Snippets │ ├── CMakeLists.txt │ ├── histo.cu │ ├── reduce.cu │ ├── reduce_minmax.cu │ └── reduce_minmax_2.cu ├── Lesson 5 Code Snippets │ ├── CMakeLists.txt │ ├── deviceQuery_simplified.cpp │ ├── gputimer.h │ └── transpose.cu └── Lesson 7 Code Snippets │ ├── CMakeLists.txt │ ├── cub │ └── example_block_scan_cum.cu │ ├── opencv │ ├── gettime.cc │ ├── gettime.h │ └── opencv.cu │ ├── thrust │ ├── gettime.cc │ ├── gettime.h │ ├── gputimer.h │ └── thrust_example.cu │ └── tiling │ ├── a.exp │ ├── gputimer.h │ ├── tiling.cu │ └── utils.h ├── Problem Sets ├── Problem Set 1 │ ├── CMakeLists.txt │ ├── HW1.cpp │ ├── Makefile │ ├── cinque_terre.gold │ ├── cinque_terre_small.jpg │ ├── compare.cpp │ ├── compare.h │ ├── main.cpp │ ├── reference_calc.cpp │ ├── reference_calc.h │ ├── student_func.cu │ ├── timer.h │ └── utils.h ├── Problem Set 2 │ ├── CMakeLists.txt │ ├── HW2.cpp │ ├── Makefile │ ├── cinque_terre.gold │ ├── cinque_terre_small.jpg │ ├── compare.cpp │ ├── compare.h │ ├── main.cpp │ ├── reference_calc.cpp │ ├── reference_calc.h │ ├── student_func.cu │ ├── timer.h │ └── utils.h ├── Problem Set 3 │ ├── CMakeLists.txt │ ├── HW3.cu │ ├── Makefile │ ├── compare.cpp │ ├── compare.h │ ├── loadSaveImage.cpp │ ├── loadSaveImage.h │ ├── main.cpp │ ├── memorial.exr │ ├── memorial_large.exr │ ├── memorial_png.gold │ ├── memorial_png_large.gold │ ├── memorial_raw.png │ ├── memorial_raw_large.png │ ├── reference_calc.cpp │ ├── reference_calc.h │ ├── 
student_func.cu │ ├── timer.h │ └── utils.h ├── Problem Set 4 │ ├── CMakeLists.txt │ ├── HW4.cu │ ├── Makefile │ ├── compare.cpp │ ├── compare.h │ ├── loadSaveImage.cpp │ ├── loadSaveImage.h │ ├── main.cpp │ ├── red_eye_effect.gold │ ├── red_eye_effect_5.jpg │ ├── red_eye_effect_template_5.jpg │ ├── reference_calc.cpp │ ├── reference_calc.h │ ├── student_func.cu │ ├── timer.h │ └── utils.h ├── Problem Set 5 │ ├── CMakeLists.txt │ ├── Makefile │ ├── main.cu │ ├── reference_calc.cpp │ ├── reference_calc.h │ ├── student.cu │ ├── timer.h │ └── utils.h └── Problem Set 6 │ ├── CMakeLists.txt │ ├── HW6.cu │ ├── Makefile │ ├── blended.gold │ ├── compare.cpp │ ├── compare.h │ ├── destination.png │ ├── loadSaveImage.cpp │ ├── loadSaveImage.h │ ├── main.cpp │ ├── reference_calc.cpp │ ├── reference_calc.h │ ├── source.png │ ├── student_func.cu │ ├── timer.h │ └── utils.h └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | 4 | # Libraries 5 | *.lib 6 | *.a 7 | 8 | # Shared objects (inc. Windows DLLs) 9 | *.dll 10 | *.so 11 | *.so.* 12 | *.dylib 13 | 14 | # Executables 15 | *.exe 16 | *.out 17 | *.app 18 | 19 | # OS X stuff 20 | .DS_Store 21 | 22 | build* 23 | bin 24 | 25 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. 
edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | 8 | cmake_minimum_required(VERSION 2.6 FATAL_ERROR) 9 | project(cs344) 10 | 11 | find_package(OpenCV REQUIRED) 12 | find_package(CUDA REQUIRED) 13 | 14 | link_libraries(${OpenCV_LIBS}) 15 | include_directories(${OpenCV_INCLUDE_DIRS}) 16 | 17 | set (EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/bin/") 18 | 19 | if(CUDA_FOUND) 20 | # compared to class settings, we let NVidia's FindCUDA CMake detect 21 | # whether to build x64. We tell it to support most devices, though, 22 | # to make sure more people can easily run class code without knowing 23 | # about this compiler argument 24 | 25 | # Commented out these lines, otherwise there will be some tricky errors 26 | # set(CUDA_NVCC_FLAGS " 27 | # -ccbin /usr/bin/clang; 28 | # -gencode;arch=compute_30,code=sm_30; 29 | # -gencode;arch=compute_35,code=sm_35; 30 | # -gencode;arch=compute_35,code=compute_35; 31 | # -gencode;arch=compute_20,code=sm_20; 32 | # -gencode;arch=compute_11,code=sm_11; 33 | # -gencode;arch=compute_12,code=sm_12; 34 | # -gencode;arch=compute_13,code=sm_13;") 35 | 36 | # add -Wextra compiler flag for gcc compilations 37 | if (UNIX) 38 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -Wextra") 39 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++") # append: keep flags supplied by the user or toolchain instead of overwriting them 40 | endif (UNIX) 41 | 42 | # add debugging to CUDA NVCC flags. For NVidia's NSight tools. 
43 | set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} "-G") 44 | 45 | add_subdirectory ("Problem Sets/Problem Set 1") 46 | add_subdirectory ("Problem Sets/Problem Set 2") 47 | add_subdirectory ("Problem Sets/Problem Set 3") 48 | add_subdirectory ("Problem Sets/Problem Set 4") 49 | add_subdirectory ("Problem Sets/Problem Set 5") 50 | add_subdirectory ("Problem Sets/Problem Set 6") 51 | 52 | add_subdirectory ("Lesson Code Snippets/Lesson 7 Code Snippets") 53 | add_subdirectory ("Lesson Code Snippets/Lesson 5 Code Snippets") 54 | add_subdirectory ("Lesson Code Snippets/Lesson 3 Code Snippets") 55 | add_subdirectory ("Lesson Code Snippets/Lesson 2 Code Snippets") 56 | else(CUDA_FOUND) 57 | message("CUDA is not installed on this system.") 58 | endif() 59 | -------------------------------------------------------------------------------- /Final/batcher/batcher.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "compare.h" 5 | #include "gputimer.h" 6 | 7 | // http://en.wikipedia.org/wiki/Bitonic_sort 8 | __global__ void batcherBitonicMergesort64(float * d_out, const float * d_in) 9 | { 10 | // you are guaranteed this is called with <<<1, 64, 64*4>>> 11 | extern __shared__ float sdata[]; 12 | int tid = threadIdx.x; 13 | sdata[tid] = d_in[tid]; 14 | __syncthreads(); 15 | 16 | for (int stage = 0; stage <= 5; stage++) 17 | { 18 | for (int substage = stage; substage >= 0; substage--) 19 | { 20 | // TODO 21 | } 22 | } 23 | 24 | d_out[tid] = sdata[tid]; 25 | } 26 | 27 | int compareFloat (const void * a, const void * b) 28 | { 29 | if ( *(float*)a < *(float*)b ) return -1; 30 | if ( *(float*)a == *(float*)b ) return 0; 31 | if ( *(float*)a > *(float*)b ) return 1; 32 | return 0; // should never reach this 33 | } 34 | 35 | int main(int argc, char **argv) 36 | { 37 | const int ARRAY_SIZE = 64; 38 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float); 39 | 40 | // generate the input array on the 
host 41 | float h_in[ARRAY_SIZE]; 42 | float h_sorted[ARRAY_SIZE]; 43 | float h_out[ARRAY_SIZE]; 44 | for(int i = 0; i < ARRAY_SIZE; i++) { 45 | // generate random float in [0, 1] 46 | h_in[i] = (float)random()/(float)RAND_MAX; 47 | h_sorted[i] = h_in[i]; 48 | } 49 | qsort(h_sorted, ARRAY_SIZE, sizeof(float), compareFloat); 50 | 51 | // declare GPU memory pointers 52 | float * d_in, * d_out; 53 | 54 | // allocate GPU memory 55 | cudaMalloc((void **) &d_in, ARRAY_BYTES); 56 | cudaMalloc((void **) &d_out, ARRAY_BYTES); 57 | 58 | // transfer the input array to the GPU 59 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 60 | 61 | // launch the kernel 62 | GpuTimer timer; 63 | timer.Start(); 64 | batcherBitonicMergesort64<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(float)>>>(d_out, d_in); 65 | timer.Stop(); 66 | 67 | printf("Your code executed in %g ms\n", timer.Elapsed()); 68 | 69 | // copy back the sum from GPU 70 | cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost); 71 | 72 | compare(h_out, h_sorted, ARRAY_SIZE); 73 | 74 | // free GPU memory allocation 75 | cudaFree(d_in); 76 | cudaFree(d_out); 77 | 78 | return 0; 79 | } 80 | -------------------------------------------------------------------------------- /Final/batcher/compare.h: -------------------------------------------------------------------------------- 1 | int compare(float *h_out, float *h_sorted, int ARRAY_SIZE) 2 | { 3 | int failure = 0; 4 | for(int i = 0; i < ARRAY_SIZE; i++) { 5 | if (h_out[i] != h_sorted[i]) { 6 | printf("Oops! Index %i is %f, should be %f\n", 7 | i, h_out[i], h_sorted[i]); 8 | failure = 1; 9 | } 10 | } 11 | 12 | if (failure == 0){ 13 | printf("Success! 
Your bitonic sort worked.\n"); 14 | } 15 | 16 | return failure; 17 | } -------------------------------------------------------------------------------- /Final/batcher/gputimer.h: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TIMER_H__ 2 | #define __GPU_TIMER_H__ 3 | 4 | struct GpuTimer 5 | { 6 | cudaEvent_t start; 7 | cudaEvent_t stop; 8 | 9 | GpuTimer() 10 | { 11 | cudaEventCreate(&start); 12 | cudaEventCreate(&stop); 13 | } 14 | 15 | ~GpuTimer() 16 | { 17 | cudaEventDestroy(start); 18 | cudaEventDestroy(stop); 19 | } 20 | 21 | void Start() 22 | { 23 | cudaEventRecord(start, 0); 24 | } 25 | 26 | void Stop() 27 | { 28 | cudaEventRecord(stop, 0); 29 | } 30 | 31 | float Elapsed() 32 | { 33 | float elapsed; 34 | cudaEventSynchronize(stop); 35 | cudaEventElapsedTime(&elapsed, start, stop); 36 | return elapsed; 37 | } 38 | }; 39 | 40 | #endif /* __GPU_TIMER_H__ */ -------------------------------------------------------------------------------- /Final/smooth/compare.h: -------------------------------------------------------------------------------- 1 | int compare(float* h_in, float* h_out, float* h_out_shared, float* h_cmp, int ARRAY_SIZE){ 2 | int failure = 0; 3 | for(int i = 0; i < ARRAY_SIZE; i++) { 4 | if (h_out[i] != h_cmp[i]) { 5 | fprintf(stderr, "ERROR: h_in[%d] is %f, h_out[%d] is %f, h_cmp[%d] is %f\n", 6 | i, h_in[i], i, h_out[i], i, h_cmp[i]); 7 | failure = 1; 8 | } 9 | if (h_out_shared[i] != h_cmp[i]) { 10 | fprintf(stderr, "ERROR: h_in[%d] is %f, h_out_shared[%d] is %f, h_cmp[%d] is %f\n", 11 | i, h_in[i], i, h_out_shared[i], i, h_cmp[i]); 12 | failure = 1; 13 | } 14 | } 15 | 16 | if (failure == 0) 17 | { 18 | printf("Success! 
Your smooth code worked!\n"); 19 | } 20 | 21 | return failure; 22 | } -------------------------------------------------------------------------------- /Final/smooth/gputimer.h: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TIMER_H__ 2 | #define __GPU_TIMER_H__ 3 | 4 | struct GpuTimer 5 | { 6 | cudaEvent_t start; 7 | cudaEvent_t stop; 8 | 9 | GpuTimer() 10 | { 11 | cudaEventCreate(&start); 12 | cudaEventCreate(&stop); 13 | } 14 | 15 | ~GpuTimer() 16 | { 17 | cudaEventDestroy(start); 18 | cudaEventDestroy(stop); 19 | } 20 | 21 | void Start() 22 | { 23 | cudaEventRecord(start, 0); 24 | } 25 | 26 | void Stop() 27 | { 28 | cudaEventRecord(stop, 0); 29 | } 30 | 31 | float Elapsed() 32 | { 33 | float elapsed; 34 | cudaEventSynchronize(stop); 35 | cudaEventElapsedTime(&elapsed, start, stop); 36 | return elapsed; 37 | } 38 | }; 39 | 40 | #endif /* __GPU_TIMER_H__ */ -------------------------------------------------------------------------------- /Final/smooth/smooth.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "compare.h" 5 | #include "gputimer.h" 6 | 7 | // Reference 8 | __global__ void smooth(float * v_new, const float * v) { 9 | int myIdx = threadIdx.x * gridDim.x + blockIdx.x; 10 | int numThreads = blockDim.x * gridDim.x; 11 | int myLeftIdx = (myIdx == 0) ? 0 : myIdx - 1; 12 | int myRightIdx = (myIdx == (numThreads - 1)) ? 
numThreads - 1 : myIdx + 1; 13 | float myElt = v[myIdx]; 14 | float myLeftElt = v[myLeftIdx]; 15 | float myRightElt = v[myRightIdx]; 16 | v_new[myIdx] = 0.25f * myLeftElt + 0.5f * myElt + 0.25f * myRightElt; 17 | } 18 | 19 | // Your code 20 | __global__ void smooth_shared(float * v_new, const float * v) { 21 | extern __shared__ float s[]; 22 | // TODO: Fill in the rest of this function 23 | // NOTE: a __global__ kernel must return void; write the smoothed result into v_new[] instead of returning a value 24 | } 25 | 26 | int main(int argc, char **argv) 27 | { 28 | 29 | const int ARRAY_SIZE = 4096; 30 | const int BLOCK_SIZE = 256; 31 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float); 32 | 33 | // generate the input array on the host 34 | float h_in[ARRAY_SIZE]; 35 | float h_cmp[ARRAY_SIZE]; 36 | float h_out[ARRAY_SIZE]; 37 | float h_out_shared[ARRAY_SIZE]; 38 | for(int i = 0; i < ARRAY_SIZE; i++) { 39 | // generate random float in [0, 1] 40 | h_in[i] = (float)random()/(float)RAND_MAX; 41 | } 42 | for(int i = 0; i < ARRAY_SIZE; i++) { 43 | h_cmp[i] = (0.25f * h_in[(i == 0) ? 0 : i-1] + 44 | 0.50f * h_in[i] + 45 | 0.25f * h_in[(i == (ARRAY_SIZE - 1)) ? 
ARRAY_SIZE - 1 : i+1]); 46 | } 47 | 48 | // declare GPU memory pointers 49 | float * d_in, * d_out, * d_out_shared; 50 | 51 | // allocate GPU memory 52 | cudaMalloc((void **) &d_in, ARRAY_BYTES); 53 | cudaMalloc((void **) &d_out, ARRAY_BYTES); 54 | cudaMalloc((void **) &d_out_shared, ARRAY_BYTES); 55 | 56 | // transfer the input array to the GPU 57 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 58 | 59 | // cudaEvent_t start, stop; 60 | // cudaEventCreate(&start); 61 | // cudaEventCreate(&stop); 62 | // launch the kernel 63 | smooth<<>>(d_out, d_in); 64 | GpuTimer timer; 65 | timer.Start(); 66 | smooth_shared<<>>(d_out_shared, d_in); 67 | timer.Stop(); 68 | 69 | printf("Your code executed in %g ms\n", timer.Elapsed()); 70 | // cudaEventSynchronize(stop); 71 | // float elapsedTime; 72 | // cudaEventElapsedTime(&elapsedTime, start, stop); 73 | 74 | // copy back the result from GPU 75 | cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost); 76 | cudaMemcpy(h_out_shared, d_out_shared, ARRAY_BYTES, cudaMemcpyDeviceToHost); 77 | 78 | // testing for correctness 79 | compare(h_in, h_out, h_out_shared, h_cmp, ARRAY_SIZE); 80 | 81 | // free GPU memory allocation 82 | cudaFree(d_in); 83 | cudaFree(d_out); 84 | cudaFree(d_out_shared); 85 | 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /Final/warpreduce/part_a/compare.h: -------------------------------------------------------------------------------- 1 | int compare(unsigned int h_out_shared, int sum){ 2 | int failure = 0; 3 | if (h_out_shared != sum) { 4 | fprintf(stderr, "GPU shared sum %d does not match expected sum %d\n", 5 | h_out_shared, sum); 6 | failure = 1; 7 | } 8 | 9 | if (failure == 0) 10 | { 11 | printf("Success! Your shared warp reduce worked.\n"); 12 | } 13 | else{ 14 | printf("Error! 
Your shared reduce code's output did not match sum.\n"); 15 | } 16 | 17 | return failure; 18 | } -------------------------------------------------------------------------------- /Final/warpreduce/part_a/gputimer.h: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TIMER_H__ 2 | #define __GPU_TIMER_H__ 3 | 4 | struct GpuTimer 5 | { 6 | cudaEvent_t start; 7 | cudaEvent_t stop; 8 | 9 | GpuTimer() 10 | { 11 | cudaEventCreate(&start); 12 | cudaEventCreate(&stop); 13 | } 14 | 15 | ~GpuTimer() 16 | { 17 | cudaEventDestroy(start); 18 | cudaEventDestroy(stop); 19 | } 20 | 21 | void Start() 22 | { 23 | cudaEventRecord(start, 0); 24 | } 25 | 26 | void Stop() 27 | { 28 | cudaEventRecord(stop, 0); 29 | } 30 | 31 | float Elapsed() 32 | { 33 | float elapsed; 34 | cudaEventSynchronize(stop); 35 | cudaEventElapsedTime(&elapsed, start, stop); 36 | return elapsed; 37 | } 38 | }; 39 | 40 | #endif /* __GPU_TIMER_H__ */ -------------------------------------------------------------------------------- /Final/warpreduce/part_a/warpreduce.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "compare.h" 5 | #include "gputimer.h" 6 | 7 | // Subpart A: 8 | // Write step 1 as a kernel that operates on threads 0--31. 9 | // Assume that the input flags are 0 for false and 1 for true and are stored 10 | // in a local per-thread register called p (for predicate). 11 | // 12 | // You have access to 31 words of shared memory s[0:31], with s[0] 13 | // corresponding to thread 0 and s[31] corresponding to thread 31. 14 | // You may change the values of s[0:31]. Put the return sum in s[0]. 15 | // Your code should execute no more than 5 warp-wide addition operations. 
16 | 17 | __device__ unsigned int shared_reduce(unsigned int p, volatile unsigned int * s) { 18 | // Assumes values in 'p' are either 1 or 0 19 | // Assumes s[0:31] are allocated 20 | // Sums p across warp, returning the result. Suggest you put 21 | // result in s[0] and return it 22 | // You may change any value in s 23 | // You should execute no more than 5 + operations (if you're doing 24 | // 31, you're doing it wrong) 25 | // 26 | // TODO: Fill in the rest of this function 27 | 28 | return s[0]; 29 | } 30 | 31 | __global__ void reduce(unsigned int * d_out_shared, 32 | const unsigned int * d_in) 33 | { 34 | extern __shared__ unsigned int s[]; 35 | int t = threadIdx.x; 36 | int p = d_in[t]; 37 | unsigned int sr = shared_reduce(p, s); 38 | if (t == 0) 39 | { 40 | *d_out_shared = sr; 41 | } 42 | } 43 | 44 | int main(int argc, char **argv) 45 | { 46 | const int ARRAY_SIZE = 32; 47 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(unsigned int); 48 | 49 | // generate the input array on the host 50 | unsigned int h_in[ARRAY_SIZE]; 51 | unsigned int sum = 0; 52 | for(int i = 0; i < ARRAY_SIZE; i++) { 53 | // generate random float in [0, 1] 54 | h_in[i] = (float)random()/(float)RAND_MAX > 0.5f ? 
1 : 0; 55 | sum += h_in[i]; 56 | } 57 | 58 | // declare GPU memory pointers 59 | unsigned int * d_in, * d_out_shared; 60 | 61 | // allocate GPU memory 62 | cudaMalloc((void **) &d_in, ARRAY_BYTES); 63 | cudaMalloc((void **) &d_out_shared, sizeof(unsigned int)); 64 | 65 | // transfer the input array to the GPU 66 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 67 | 68 | GpuTimer timer; 69 | timer.Start(); 70 | // launch the kernel 71 | reduce<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(unsigned int)>>> 72 | (d_out_shared, d_in); 73 | timer.Stop(); 74 | 75 | printf("Your code executed in %g ms\n", timer.Elapsed()); 76 | 77 | unsigned int h_out_shared; 78 | // copy back the sum from GPU 79 | cudaMemcpy(&h_out_shared, d_out_shared, sizeof(unsigned int), 80 | cudaMemcpyDeviceToHost); 81 | 82 | compare(h_out_shared, sum); 83 | 84 | // free GPU memory allocation 85 | cudaFree(d_in); 86 | cudaFree(d_out_shared); 87 | } 88 | 89 | -------------------------------------------------------------------------------- /Final/warpreduce/part_b/compare.h: -------------------------------------------------------------------------------- 1 | int compare(unsigned int h_out_warp, int sum){ 2 | int failure = 0; 3 | if (h_out_warp != sum) { 4 | fprintf(stderr, "GPU warp sum %d does not match expected sum %d\n", 5 | h_out_warp, sum); 6 | failure = 1; 7 | } 8 | 9 | if (failure == 0) 10 | { 11 | printf("Success! Your warp reduce worked.\n"); 12 | } 13 | else{ 14 | printf("Error! 
Your warp reduce code's output did not match sum.\n"); 15 | } 16 | 17 | return failure; 18 | } -------------------------------------------------------------------------------- /Final/warpreduce/part_b/gputimer.h: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TIMER_H__ 2 | #define __GPU_TIMER_H__ 3 | 4 | struct GpuTimer 5 | { 6 | cudaEvent_t start; 7 | cudaEvent_t stop; 8 | 9 | GpuTimer() 10 | { 11 | cudaEventCreate(&start); 12 | cudaEventCreate(&stop); 13 | } 14 | 15 | ~GpuTimer() 16 | { 17 | cudaEventDestroy(start); 18 | cudaEventDestroy(stop); 19 | } 20 | 21 | void Start() 22 | { 23 | cudaEventRecord(start, 0); 24 | } 25 | 26 | void Stop() 27 | { 28 | cudaEventRecord(stop, 0); 29 | } 30 | 31 | float Elapsed() 32 | { 33 | float elapsed; 34 | cudaEventSynchronize(stop); 35 | cudaEventElapsedTime(&elapsed, start, stop); 36 | return elapsed; 37 | } 38 | }; 39 | 40 | #endif /* __GPU_TIMER_H__ */ -------------------------------------------------------------------------------- /Final/warpreduce/part_b/warpreduce.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "compare.h" 5 | #include "gputimer.h" 6 | 7 | // Subpart b: 8 | // Compute capability 2.0+ GPUs have support for 3 per-warp instructions. 9 | // Namely, these instructions are: 10 | // 11 | // int __popc(int x) Population Count: Returns the number of bits that are set 12 | // to 1 in the 32-bit integer x. 13 | // 14 | // int __clz(int x) Count Leading Zeros: Returns the number of consecutive zero 15 | // bits beginning at the most significant bit of the 32-bit integer x. 16 | // 17 | // int __ballot(int p) Returns a 32-bit integer in which bit k is set if and only 18 | // if the predicate p provided by the thread in lane k of the warp is non-zero. 
19 | 20 | __device__ unsigned int warp_reduce(unsigned int p, volatile unsigned int * s) { 21 | // Assumes values in 'p' are either 1 or 0 22 | // Should not use 's' 23 | // Sums p across warp, returning the result. 24 | // You can do this without using the character '+' in your code at all 25 | // 26 | // TODO: Fill in the rest of this function 27 | // 28 | } 29 | 30 | __global__ void reduce(unsigned int * d_out_warp, 31 | const unsigned int * d_in) 32 | { 33 | extern __shared__ unsigned int s[]; 34 | int t = threadIdx.x; 35 | int p = d_in[t]; 36 | 37 | unsigned int wr = warp_reduce(p, s); 38 | if (t == 0) 39 | { 40 | *d_out_warp = wr; 41 | } 42 | } 43 | 44 | int main(int argc, char **argv) 45 | { 46 | const int ARRAY_SIZE = 32; 47 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(unsigned int); 48 | 49 | // generate the input array on the host 50 | unsigned int h_in[ARRAY_SIZE]; 51 | unsigned int sum = 0; 52 | for(int i = 0; i < ARRAY_SIZE; i++) { 53 | // generate random float in [0, 1] 54 | h_in[i] = (float)random()/(float)RAND_MAX > 0.5f ? 
1 : 0; 55 | sum += h_in[i]; 56 | } 57 | 58 | // declare GPU memory pointers 59 | unsigned int * d_in, * d_out_warp; 60 | 61 | // allocate GPU memory 62 | cudaMalloc((void **) &d_in, ARRAY_BYTES); 63 | cudaMalloc((void **) &d_out_warp, sizeof(unsigned int)); 64 | 65 | // transfer the input array to the GPU 66 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 67 | 68 | GpuTimer timer; 69 | timer.Start(); 70 | // launch the kernel 71 | reduce<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(unsigned int)>>> 72 | (d_out_warp, d_in); 73 | timer.Stop(); 74 | 75 | printf("Your code executed in %g ms\n", timer.Elapsed()); 76 | 77 | unsigned int h_out_warp; 78 | // copy back the sum from GPU 79 | cudaMemcpy(&h_out_warp, d_out_warp, sizeof(unsigned int), 80 | cudaMemcpyDeviceToHost); 81 | 82 | // compare your result against the expected reduce sum 83 | compare(h_out_warp, sum); 84 | 85 | // free GPU memory allocation 86 | cudaFree(d_in); 87 | cudaFree(d_out_warp); 88 | 89 | } 90 | 91 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 2 Code Snippets/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. 
edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | 8 | # collect source files 9 | 10 | CUDA_ADD_EXECUTABLE(Lesson2_atomics atomics.cu) 11 | 12 | CUDA_ADD_EXECUTABLE(Lesson2_memory memory.cu) 13 | 14 | CUDA_ADD_EXECUTABLE(Lesson2_hello_world hello.cu) 15 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 2 Code Snippets/associative.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main(int argc,char **argv) 4 | { 5 | printf("(%g + %g) + %g == %g\n%g + (%g + %g) == %g\n", 6 | 1.f, 1e99, -1e99, (1.f + 1e99)+ -1e99, 7 | 1.f, 1e99, -1e99, 1.f + (1e99 + -1e99)); 8 | return 0; 9 | } -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 2 Code Snippets/atomics.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "gputimer.h" 8 | 9 | #define NUM_THREADS 1000000 10 | #define ARRAY_SIZE 100 11 | 12 | #define BLOCK_WIDTH 1000 13 | 14 | void print_array(int *array, int size) 15 | { 16 | printf("{ "); 17 | for (int i = 0; i < size; i++) { printf("%d ", array[i]); } 18 | printf("}\n"); 19 | } 20 | 21 | __global__ void increment_naive(int *g) 22 | { 23 | // which thread is this? 24 | int i = blockIdx.x * blockDim.x + threadIdx.x; 25 | 26 | // each thread to increment consecutive elements, wrapping at ARRAY_SIZE 27 | i = i % ARRAY_SIZE; 28 | g[i] = g[i] + 1; 29 | } 30 | 31 | __global__ void increment_atomic(int *g) 32 | { 33 | // which thread is this? 
34 | int i = blockIdx.x * blockDim.x + threadIdx.x; 35 | 36 | // each thread to increment consecutive elements, wrapping at ARRAY_SIZE 37 | i = i % ARRAY_SIZE; 38 | atomicAdd(& g[i], 1); 39 | } 40 | 41 | int main(int argc,char **argv) 42 | { 43 | GpuTimer timer; 44 | printf("%d total threads in %d blocks writing into %d array elements\n", 45 | NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE); 46 | 47 | // declare and allocate host memory 48 | int h_array[ARRAY_SIZE]; 49 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int); 50 | 51 | // declare, allocate, and zero out GPU memory 52 | int * d_array; 53 | cudaMalloc((void **) &d_array, ARRAY_BYTES); 54 | cudaMemset((void *) d_array, 0, ARRAY_BYTES); 55 | 56 | // launch the kernel - comment out one of these 57 | timer.Start(); 58 | //increment_naive<<>>(d_array); 59 | increment_atomic<<>>(d_array); 60 | timer.Stop(); 61 | 62 | // copy back the array of sums from GPU and print 63 | cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost); 64 | print_array(h_array, ARRAY_SIZE); 65 | printf("Time elapsed = %g ms\n", timer.Elapsed()); 66 | 67 | // free GPU memory allocation and exit 68 | cudaFree(d_array); 69 | return 0; 70 | } -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 2 Code Snippets/gputimer.h: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TIMER_H__ 2 | #define __GPU_TIMER_H__ 3 | 4 | struct GpuTimer 5 | { 6 | cudaEvent_t start; 7 | cudaEvent_t stop; 8 | 9 | GpuTimer() 10 | { 11 | cudaEventCreate(&start); 12 | cudaEventCreate(&stop); 13 | } 14 | 15 | ~GpuTimer() 16 | { 17 | cudaEventDestroy(start); 18 | cudaEventDestroy(stop); 19 | } 20 | 21 | void Start() 22 | { 23 | cudaEventRecord(start, 0); 24 | } 25 | 26 | void Stop() 27 | { 28 | cudaEventRecord(stop, 0); 29 | } 30 | 31 | float Elapsed() 32 | { 33 | float elapsed; 34 | cudaEventSynchronize(stop); 35 | cudaEventElapsedTime(&elapsed, 
start, stop); 36 | return elapsed; 37 | } 38 | }; 39 | 40 | #endif /* __GPU_TIMER_H__ */ -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 2 Code Snippets/hello.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define NUM_BLOCKS 4 9 | #define BLOCK_WIDTH 4 10 | 11 | __global__ void hello() 12 | { 13 | printf("Hello world! I'm thread %d in block %d\n", threadIdx.x, blockIdx.x); 14 | } 15 | 16 | 17 | int main(int argc,char **argv) 18 | { 19 | // launch the kernel 20 | hello<<>>(); 21 | 22 | // force the printf()s to flush 23 | cudaDeviceSynchronize(); 24 | 25 | printf("That's all!\n"); 26 | 27 | return 0; 28 | } 29 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 2 Code Snippets/hello_blockIdx.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define NUM_BLOCKS 16 4 | #define BLOCK_WIDTH 1 5 | 6 | __global__ void hello() 7 | { 8 | printf("Hello world! I'm a thread in block %d\n", blockIdx.x); 9 | } 10 | 11 | 12 | int main(int argc,char **argv) 13 | { 14 | // launch the kernel 15 | hello<<>>(); 16 | 17 | // force the printf()s to flush 18 | cudaDeviceSynchronize(); 19 | 20 | printf("That's all!\n"); 21 | 22 | return 0; 23 | } 24 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 2 Code Snippets/hello_threadIdx.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define NUM_BLOCKS 1 4 | #define BLOCK_WIDTH 256 5 | 6 | __global__ void hello() 7 | { 8 | printf("Hello world! 
I'm thread %d\n", threadIdx.x); 9 | } 10 | 11 | 12 | int main(int argc,char **argv) 13 | { 14 | // launch the kernel 15 | hello<<>>(); 16 | 17 | // force the printf()s to flush 18 | cudaDeviceSynchronize(); 19 | 20 | printf("That's all!\n"); 21 | 22 | return 0; 23 | } 24 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 2 Code Snippets/memory.cu: -------------------------------------------------------------------------------- 1 | // Using different memory spaces in CUDA 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /********************** 10 | * using local memory * 11 | **********************/ 12 | 13 | // a __device__ or __global__ function runs on the GPU 14 | __global__ void use_local_memory_GPU(float in) 15 | { 16 | float f; // variable "f" is in local memory and private to each thread 17 | f = in; // parameter "in" is in local memory and private to each thread 18 | // ... real code would presumably do other stuff here ... 
19 | } 20 | 21 | /********************** 22 | * using global memory * 23 | **********************/ 24 | 25 | // a __global__ function runs on the GPU & can be called from host 26 | __global__ void use_global_memory_GPU(float *array) 27 | { 28 | // "array" is a pointer into global memory on the device 29 | array[threadIdx.x] = 2.0f * (float) threadIdx.x; 30 | } 31 | 32 | /********************** 33 | * using shared memory * 34 | **********************/ 35 | 36 | // (for clarity, hardcoding 128 threads/elements and omitting out-of-bounds checks) 37 | __global__ void use_shared_memory_GPU(float *array) 38 | { 39 | // local variables, private to each thread 40 | int i, index = threadIdx.x; 41 | float average, sum = 0.0f; 42 | 43 | // __shared__ variables are visible to all threads in the thread block 44 | // and have the same lifetime as the thread block 45 | __shared__ float sh_arr[128]; 46 | 47 | // copy data from "array" in global memory to sh_arr in shared memory. 48 | // here, each thread is responsible for copying a single element. 49 | sh_arr[index] = array[index]; 50 | 51 | __syncthreads(); // ensure all the writes to shared memory have completed 52 | 53 | // now, sh_arr is fully populated. 
Let's find the average of all previous elements 54 | for (i=0; i<index; i++) { sum += sh_arr[i]; } 55 | average = sum / (index + 1.0f); 56 | 57 | // if array[index] is greater than the average of array[0..index-1], replace with average. 58 | // since array[] is in global memory, this change will be seen by the host (and potentially 59 | // other thread blocks, if any) 60 | if (array[index] > average) { array[index] = average; } 61 | 62 | // the following code has NO EFFECT: it modifies shared memory, but 63 | // the resulting modified data is never copied back to global memory 64 | // and vanishes when the thread block completes 65 | sh_arr[index] = 3.14; 66 | } 67 | 68 | int main(int argc, char **argv) 69 | { 70 | /* 71 | * First, call a kernel that shows using local memory 72 | */ 73 | use_local_memory_GPU<<<1, 128>>>(2.0f); 74 | 75 | /* 76 | * Next, call a kernel that shows using global memory 77 | */ 78 | float h_arr[128]; // convention: h_ variables live on host 79 | float *d_arr; // convention: d_ variables live on device (GPU global mem) 80 | 81 | // allocate global memory on the device, place result in "d_arr" 82 | cudaMalloc((void **) &d_arr, sizeof(float) * 128); 83 | // now copy data from host memory "h_arr" to device memory "d_arr" 84 | cudaMemcpy((void *)d_arr, (void *)h_arr, sizeof(float) * 128, cudaMemcpyHostToDevice); 85 | // launch the kernel (1 block of 128 threads) 86 | use_global_memory_GPU<<<1, 128>>>(d_arr); // modifies the contents of array at d_arr 87 | // copy the modified array back to the host, overwriting contents of h_arr 88 | cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost); 89 | // ... do other stuff ... 90 | 91 | /* 92 | * Next, call a kernel that shows using shared memory 93 | */ 94 | 95 | // as before, pass in a pointer to data in global memory 96 | use_shared_memory_GPU<<<1, 128>>>(d_arr); 97 | // copy the modified array back to the host (destination h_arr, source d_arr => device-to-host) 98 | cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost); 99 | // ... do other stuff ... 
100 | return 0; 101 | } -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 3 Code Snippets/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | 8 | # collect source files 9 | 10 | CUDA_ADD_EXECUTABLE(Lesson3_histo histo.cu) 11 | 12 | CUDA_ADD_EXECUTABLE(Lesson3_reduce reduce.cu) 13 | 14 | CUDA_ADD_EXECUTABLE(Lesson3_reduce_minmax reduce_minmax.cu) 15 | 16 | CUDA_ADD_EXECUTABLE(Lesson3_reduce_minmax_2 reduce_minmax_2.cu) 17 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 3 Code Snippets/histo.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int log2(int i) 5 | { 6 | int r = 0; 7 | while (i >>= 1) r++; 8 | return r; 9 | } 10 | 11 | int bit_reverse(int w, int bits) 12 | { 13 | int r = 0; 14 | for (int i = 0; i < bits; i++) 15 | { 16 | int bit = (w & (1 << i)) >> i; 17 | r |= bit << (bits - i - 1); 18 | } 19 | return r; 20 | } 21 | 22 | __global__ void naive_histo(int *d_bins, const int *d_in, const int BIN_COUNT) 23 | { 24 | int myId = threadIdx.x + blockDim.x * blockIdx.x; 25 | int myItem = d_in[myId]; 26 | int myBin = myItem % BIN_COUNT; 27 | d_bins[myBin]++; 28 | } 29 | 30 | __global__ void simple_histo(int *d_bins, const int *d_in, const int BIN_COUNT) 31 | { 32 | int myId = threadIdx.x + blockDim.x * blockIdx.x; 33 | int myItem = d_in[myId]; 34 | int myBin = myItem % BIN_COUNT; 35 | atomicAdd(&(d_bins[myBin]), 1); 36 | } 37 | 38 | 39 | int main(int argc, char **argv) 40 | { 41 | int 
deviceCount; 42 | cudaGetDeviceCount(&deviceCount); 43 | if (deviceCount == 0) { 44 | fprintf(stderr, "error: no devices supporting CUDA.\n"); 45 | exit(EXIT_FAILURE); 46 | } 47 | int dev = 0; 48 | cudaSetDevice(dev); 49 | 50 | cudaDeviceProp devProps; 51 | if (cudaGetDeviceProperties(&devProps, dev) == 0) 52 | { 53 | printf("Using device %d:\n", dev); 54 | printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n", 55 | devProps.name, (int)devProps.totalGlobalMem, 56 | (int)devProps.major, (int)devProps.minor, 57 | (int)devProps.clockRate); 58 | } 59 | 60 | const int ARRAY_SIZE = 65536; 61 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int); 62 | const int BIN_COUNT = 16; 63 | const int BIN_BYTES = BIN_COUNT * sizeof(int); 64 | 65 | // generate the input array on the host 66 | int h_in[ARRAY_SIZE]; 67 | for(int i = 0; i < ARRAY_SIZE; i++) { 68 | h_in[i] = bit_reverse(i, log2(ARRAY_SIZE)); 69 | } 70 | int h_bins[BIN_COUNT]; 71 | for(int i = 0; i < BIN_COUNT; i++) { 72 | h_bins[i] = 0; 73 | } 74 | 75 | // declare GPU memory pointers 76 | int * d_in; 77 | int * d_bins; 78 | 79 | // allocate GPU memory 80 | cudaMalloc((void **) &d_in, ARRAY_BYTES); 81 | cudaMalloc((void **) &d_bins, BIN_BYTES); 82 | 83 | // transfer the arrays to the GPU 84 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 85 | cudaMemcpy(d_bins, h_bins, BIN_BYTES, cudaMemcpyHostToDevice); 86 | 87 | int whichKernel = 0; 88 | if (argc == 2) { 89 | whichKernel = atoi(argv[1]); 90 | } 91 | 92 | // launch the kernel 93 | switch(whichKernel) { 94 | case 0: 95 | printf("Running naive histo\n"); 96 | naive_histo<<>>(d_bins, d_in, BIN_COUNT); 97 | break; 98 | case 1: 99 | printf("Running simple histo\n"); 100 | simple_histo<<>>(d_bins, d_in, BIN_COUNT); 101 | break; 102 | default: 103 | fprintf(stderr, "error: ran no kernel\n"); 104 | exit(EXIT_FAILURE); 105 | } 106 | 107 | // copy back the sum from GPU 108 | cudaMemcpy(h_bins, d_bins, BIN_BYTES, cudaMemcpyDeviceToHost); 109 | 110 | for(int i = 
0; i < BIN_COUNT; i++) { 111 | printf("bin %d: count %d\n", i, h_bins[i]); 112 | } 113 | 114 | // free GPU memory allocation 115 | cudaFree(d_in); 116 | cudaFree(d_bins); 117 | 118 | return 0; 119 | } 120 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 3 Code Snippets/reduce.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | __global__ void global_reduce_kernel(float * d_out, float * d_in) 6 | { 7 | int myId = threadIdx.x + blockDim.x * blockIdx.x; 8 | int tid = threadIdx.x; 9 | 10 | // do reduction in global mem 11 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) 12 | { 13 | if (tid < s) 14 | { 15 | d_in[myId] += d_in[myId + s]; 16 | } 17 | __syncthreads(); // make sure all adds at one stage are done! 18 | } 19 | 20 | // only thread 0 writes result for this block back to global mem 21 | if (tid == 0) 22 | { 23 | d_out[blockIdx.x] = d_in[myId]; 24 | } 25 | } 26 | 27 | __global__ void shmem_reduce_kernel(float * d_out, const float * d_in) 28 | { 29 | // sdata is allocated in the kernel call: 3rd arg to <<>> 30 | extern __shared__ float sdata[]; 31 | 32 | int myId = threadIdx.x + blockDim.x * blockIdx.x; 33 | int tid = threadIdx.x; 34 | 35 | // load shared mem from global mem 36 | sdata[tid] = d_in[myId]; 37 | __syncthreads(); // make sure entire block is loaded! 38 | 39 | // do reduction in shared mem 40 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) 41 | { 42 | if (tid < s) 43 | { 44 | sdata[tid] += sdata[tid + s]; 45 | } 46 | __syncthreads(); // make sure all adds at one stage are done! 
47 | } 48 | 49 | // only thread 0 writes result for this block back to global mem 50 | if (tid == 0) 51 | { 52 | d_out[blockIdx.x] = sdata[0]; 53 | } 54 | } 55 | 56 | void reduce(float * d_out, float * d_intermediate, float * d_in, 57 | int size, bool usesSharedMemory) 58 | { 59 | // assumes that size is not greater than maxThreadsPerBlock^2 60 | // and that size is a multiple of maxThreadsPerBlock 61 | const int maxThreadsPerBlock = 1024; 62 | int threads = maxThreadsPerBlock; 63 | int blocks = size / maxThreadsPerBlock; 64 | if (usesSharedMemory) 65 | { 66 | shmem_reduce_kernel<<>> 67 | (d_intermediate, d_in); 68 | } 69 | else 70 | { 71 | global_reduce_kernel<<>> 72 | (d_intermediate, d_in); 73 | } 74 | // now we're down to one block left, so reduce it 75 | threads = blocks; // launch one thread for each block in prev step 76 | blocks = 1; 77 | if (usesSharedMemory) 78 | { 79 | shmem_reduce_kernel<<>> 80 | (d_out, d_intermediate); 81 | } 82 | else 83 | { 84 | global_reduce_kernel<<>> 85 | (d_out, d_intermediate); 86 | } 87 | } 88 | 89 | int main(int argc, char **argv) 90 | { 91 | int deviceCount; 92 | cudaGetDeviceCount(&deviceCount); 93 | if (deviceCount == 0) { 94 | fprintf(stderr, "error: no devices supporting CUDA.\n"); 95 | exit(EXIT_FAILURE); 96 | } 97 | int dev = 0; 98 | cudaSetDevice(dev); 99 | 100 | cudaDeviceProp devProps; 101 | if (cudaGetDeviceProperties(&devProps, dev) == 0) 102 | { 103 | printf("Using device %d:\n", dev); 104 | printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n", 105 | devProps.name, (int)devProps.totalGlobalMem, 106 | (int)devProps.major, (int)devProps.minor, 107 | (int)devProps.clockRate); 108 | } 109 | 110 | const int ARRAY_SIZE = 1 << 16; 111 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float); 112 | 113 | // generate the input array on the host 114 | float h_in[ARRAY_SIZE]; 115 | float sum = 0.0f; 116 | for(int i = 0; i < ARRAY_SIZE; i++) { 117 | // generate random float in [-1.0f, 1.0f] 118 | h_in[i] = -1.0f + 
(float)rand()/((float)RAND_MAX/2.0f); 119 | sum += h_in[i]; 120 | } 121 | 122 | // declare GPU memory pointers 123 | float * d_in, * d_intermediate, * d_out; 124 | 125 | // allocate GPU memory 126 | cudaMalloc((void **) &d_in, ARRAY_BYTES); 127 | cudaMalloc((void **) &d_intermediate, ARRAY_BYTES); // overallocated 128 | cudaMalloc((void **) &d_out, sizeof(float)); 129 | 130 | // transfer the input array to the GPU 131 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 132 | 133 | int whichKernel = 0; 134 | if (argc == 2) { 135 | whichKernel = atoi(argv[1]); 136 | } 137 | 138 | cudaEvent_t start, stop; 139 | cudaEventCreate(&start); 140 | cudaEventCreate(&stop); 141 | // launch the kernel 142 | switch(whichKernel) { 143 | case 0: 144 | printf("Running global reduce\n"); 145 | cudaEventRecord(start, 0); 146 | for (int i = 0; i < 100; i++) 147 | { 148 | reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false); 149 | } 150 | cudaEventRecord(stop, 0); 151 | break; 152 | case 1: 153 | printf("Running reduce with shared mem\n"); 154 | cudaEventRecord(start, 0); 155 | for (int i = 0; i < 100; i++) 156 | { 157 | reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true); 158 | } 159 | cudaEventRecord(stop, 0); 160 | break; 161 | default: 162 | fprintf(stderr, "error: ran no kernel\n"); 163 | exit(EXIT_FAILURE); 164 | } 165 | cudaEventSynchronize(stop); 166 | float elapsedTime; 167 | cudaEventElapsedTime(&elapsedTime, start, stop); 168 | elapsedTime /= 100.0f; // 100 trials 169 | 170 | // copy back the sum from GPU 171 | float h_out; 172 | cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost); 173 | 174 | printf("average time elapsed: %f\n", elapsedTime); 175 | 176 | // free GPU memory allocation 177 | cudaFree(d_in); 178 | cudaFree(d_intermediate); 179 | cudaFree(d_out); 180 | 181 | return 0; 182 | } 183 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 3 Code Snippets/reduce_minmax.cu: 
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <algorithm>
#include <cuda_runtime.h>

/**
 * Reduces each thread block's slice of d_in to a single value: the maximum
 * when is_max is true, the minimum otherwise.
 *
 * Launch requirements (none of them is checked here):
 *   - dynamic shared memory of blockDim.x * sizeof(float) bytes
 *     (third launch parameter);
 *   - blockDim.x is a power of two, otherwise the halving loop below skips
 *     some elements;
 *   - d_in holds at least gridDim.x * blockDim.x floats, because every thread
 *     loads one element without a bounds check.
 *
 * d_out receives one result per block, at index blockIdx.x.
 */
__global__ void shmem_reduce_kernel(float * d_out, const float * const d_in, bool is_max)
{
    // sized at launch time through the third launch parameter
    extern __shared__ float sdata[];

    const int myId = threadIdx.x + blockDim.x * blockIdx.x;
    const int tid  = threadIdx.x;

    // stage this block's slice in shared memory
    sdata[tid] = d_in[myId];
    __syncthreads();            // the whole slice must be loaded before reducing

    // tree reduction: each pass folds the upper half of the active range
    // into the lower half
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] = is_max ? max(sdata[tid], sdata[tid + s])
                                : min(sdata[tid], sdata[tid + s]);
        }
        __syncthreads();        // finish this pass before starting the next one
    }

    // thread 0 publishes the block's result
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}

/**
 * Computes the minimum and the maximum of a device array, using two passes of
 * shmem_reduce_kernel for each of them.
 *
 * min_logLum, max_logLum : device pointers to one float each; receive the results
 * d_logLuminance         : device input array, left unmodified
 * length                 : number of input elements
 *
 * Preconditions inherited from shmem_reduce_kernel (not checked): length is a
 * multiple of 1024, and length / 1024 is a power of two that is a valid
 * block size, because the second pass runs one thread per first-pass block.
 */
void reduce(float *min_logLum, float *max_logLum, const float* const d_logLuminance, int length)
{
    const int m = 1 << 10;                      // threads per block in the first pass
    const int blocks = (length + m - 1) / m;    // one partial result per block

    // scratch space for the per-block partial results, so the input stays untouched
    float *d_intermediate;
    cudaMalloc(&d_intermediate, sizeof(float) * blocks);

    // maximum: per-block partials, then a single block over the partials
    shmem_reduce_kernel<<<blocks, m, m * sizeof(float)>>>(d_intermediate, d_logLuminance, true);
    shmem_reduce_kernel<<<1, blocks, blocks * sizeof(float)>>>(max_logLum, d_intermediate, true);

    // minimum: same two passes, reusing the scratch buffer (all four launches
    // go to the default stream and therefore run in issue order)
    shmem_reduce_kernel<<<blocks, m, m * sizeof(float)>>>(d_intermediate, d_logLuminance, false);
    shmem_reduce_kernel<<<1, blocks, blocks * sizeof(float)>>>(min_logLum, d_intermediate, false);

    cudaFree(d_intermediate);
}

/**
 * Fills an array with random floats, reduces it to min/max on the GPU and
 * prints the result next to a CPU reference computed with std::min/std::max.
 */
int main(int argc, char **argv)
{
    int deviceCount = 0;    // stays 0 if the query itself fails
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    int dev = 0;
    cudaSetDevice(dev);

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == 0)
    {
        printf("Using device %d:\n", dev);
        // totalGlobalMem is a size_t; print it at full width instead of
        // truncating it to int
        printf("%s; global mem: %lluB; compute v%d.%d; clock: %d kHz\n",
               devProps.name, (unsigned long long)devProps.totalGlobalMem,
               (int)devProps.major, (int)devProps.minor,
               (int)devProps.clockRate);
    }

    const int ARRAY_SIZE = 1 << 16;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    srand((unsigned)time(0));
    for (int i = 0; i < ARRAY_SIZE; i++) {
        // generate random float in [-1.0f, 1.0f]
        h_in[i] = -1.0f + (float)rand() / ((float)RAND_MAX / 2.0f);
    }

    // declare GPU memory pointers
    float *d_in;

    // allocate GPU memory
    cudaMalloc((void **) &d_in, ARRAY_BYTES);

    // transfer the input array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

    // launch the kernels
    printf("Running reduce\n");
    float *d_min, *d_max;
    cudaMalloc((void **) &d_min, sizeof(float));
    cudaMalloc((void **) &d_max, sizeof(float));
    reduce(d_min, d_max, d_in, ARRAY_SIZE);

    // copy the minimum and maximum back from the GPU
    float h_min, h_max;
    cudaMemcpy(&h_min, d_min, sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(&h_max, d_max, sizeof(float), cudaMemcpyDeviceToHost);

    printf("Max_GPU: %f Min_GPU: %f\n", h_max, h_min);

    // CPU reference for comparison
    h_max = h_in[0]; h_min = h_in[0];
    for (size_t i = 1; i < ARRAY_SIZE; ++i) {
        h_max = std::max(h_in[i], h_max);
        h_min = std::min(h_in[i], h_min);
    }
    printf("Max_CPU: %f Min_CPU: %f\n", h_max, h_min);

    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_min);
    cudaFree(d_max);
    return 0;
}
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/reduce_minmax_2.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <cuda_runtime.h>

/**
 * First pass: reduces each thread block's slice of d_in to its maximum AND
 * its minimum in a single sweep.
 *
 * Output layout in d_out (2 * gridDim.x floats):
 *   d_out[blockIdx.x]              maximum of this block's slice
 *   d_out[gridDim.x + blockIdx.x]  minimum of this block's slice
 *
 * Launch requirements (not checked): dynamic shared memory of
 * blockDim.x * sizeof(float) bytes, blockDim.x a power of two, and d_in
 * holding at least gridDim.x * blockDim.x floats.
 */
__global__ void shmem_reduce_kernel(float * d_out, const float * const d_in)
{
    // sized at launch time through the third launch parameter
    extern __shared__ float sdata[];

    const int myId = threadIdx.x + blockDim.x * blockIdx.x;
    const int tid  = threadIdx.x;
    const unsigned int half = blockDim.x / 2;

    // stage this block's slice in shared memory
    sdata[tid] = d_in[myId];
    __syncthreads();            // the whole slice must be loaded before reducing

    // step 1: split into two halves. Each pair (tid, tid + half) is ordered so
    // that the larger value lands in the lower half and the smaller value in
    // the upper half.
    unsigned int s = half;
    if (tid < s) {
        const float lower = sdata[tid];
        const float upper = sdata[tid + s];
        sdata[tid]     = max(lower, upper);
        sdata[tid + s] = min(lower, upper);
    }
    __syncthreads();

    // step 2: reduce both halves side by side. The lower half converges to the
    // maximum at sdata[0], the upper half to the minimum at sdata[half].
    for (s = s / 2; s > 0; s >>= 1)
    {
        if (tid < s) {
            sdata[tid] = max(sdata[tid], sdata[tid + s]);
        }
        else if (tid >= half && tid < half + s) {
            sdata[tid] = min(sdata[tid], sdata[tid + s]);
        }
        __syncthreads();        // finish this pass before starting the next one
    }

    // thread 0 publishes both results for this block
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
        // The minima start right after the gridDim.x maxima. This has to be
        // gridDim.x (the number of blocks), not blockDim.x: the finish kernel
        // runs one thread per first-pass block and reads the minima at that
        // offset.
        d_out[gridDim.x + blockIdx.x] = sdata[half];
    }
}

/**
 * Second pass: a single block folds the per-block partial results into the
 * final minimum and maximum.
 *
 * d_in layout: blockDim.x maxima followed by blockDim.x minima, i.e. what
 * shmem_reduce_kernel writes when it was launched with blockDim.x blocks.
 *
 * Launch requirements (not checked): exactly one block, blockDim.x a power of
 * two, dynamic shared memory of 2 * blockDim.x * sizeof(float) bytes.
 */
__global__ void shmem_reduce_finish_kernel(float *min_logLum,
    float *max_logLum, const float * const d_in)
{
    // sized at launch time through the third launch parameter
    extern __shared__ float sdata[];
    const int tid = threadIdx.x;

    // lower half of sdata: maxima, upper half: minima
    sdata[tid] = d_in[tid];
    sdata[tid + blockDim.x] = d_in[tid + blockDim.x];
    __syncthreads();            // everything must be loaded before reducing

    // reduce both halves in lock-step
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s) {
            sdata[tid] = max(sdata[tid], sdata[tid + s]);
            sdata[tid + blockDim.x] = min(sdata[tid + blockDim.x], sdata[tid + blockDim.x + s]);
        }
        __syncthreads();        // finish this pass before starting the next one
    }

    // thread 0 writes the final results
    if (tid == 0)
    {
        *max_logLum = sdata[0];
        *min_logLum = sdata[blockDim.x];
    }
}

/**
 * Computes the minimum and the maximum of a device array in two kernel
 * launches.
 *
 * min_logLum, max_logLum : device pointers to one float each; receive the results
 * d_logLuminance         : device input array, left unmodified
 * length                 : number of input elements
 *
 * Preconditions inherited from the kernels (not checked): length is a
 * multiple of 64, and length / 64 is a power of two that is a valid block
 * size, because the finish kernel runs one thread per first-pass block.
 */
void reduce(float *min_logLum, float *max_logLum, const float* const d_logLuminance, int length)
{
    const int m = 1 << 6;                       // threads per block in the first pass
    const int blocks = (length + m - 1) / m;    // one (max, min) pair per block

    // scratch space: `blocks` maxima followed by `blocks` minima
    float *d_intermediate;
    cudaMalloc(&d_intermediate, sizeof(float) * blocks * 2);

    shmem_reduce_kernel<<<blocks, m, m * sizeof(float)>>>(d_intermediate, d_logLuminance);
    shmem_reduce_finish_kernel<<<1, blocks, 2 * blocks * sizeof(float)>>>(min_logLum, max_logLum, d_intermediate);

    cudaFree(d_intermediate);
}

/**
 * Fills an array with random floats, reduces it to min/max on the GPU and
 * prints the result next to a CPU reference computed with std::min/std::max.
 */
int main(int argc, char **argv)
{
    int deviceCount = 0;    // stays 0 if the query itself fails
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    int dev = 0;
    cudaSetDevice(dev);

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == 0)
    {
        printf("Using device %d:\n", dev);
        // totalGlobalMem is a size_t; print it at full width instead of
        // truncating it to int
        printf("%s; global mem: %lluB; compute v%d.%d; clock: %d kHz\n",
               devProps.name, (unsigned long long)devProps.totalGlobalMem,
               (int)devProps.major, (int)devProps.minor,
               (int)devProps.clockRate);
    }

    const int ARRAY_SIZE = 1 << 12;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++) {
        // generate random float in [-1.0f, 1.0f]
        h_in[i] = -1.0f + (float)rand() / ((float)RAND_MAX / 2.0f);
    }

    // declare GPU memory pointers
    float *d_in;

    // allocate GPU memory
    cudaMalloc((void **) &d_in, ARRAY_BYTES);

    // transfer the input array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

    // launch the kernels
    printf("Running reduce\n");
    float *d_min, *d_max;
    cudaMalloc((void **) &d_min, sizeof(float));
    cudaMalloc((void **) &d_max, sizeof(float));
    reduce(d_min, d_max, d_in, ARRAY_SIZE);

    // copy the minimum and maximum back from the GPU
    float h_min, h_max;
    cudaMemcpy(&h_min, d_min, sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(&h_max, d_max, sizeof(float), cudaMemcpyDeviceToHost);

    printf("Max_GPU: %f Min_GPU: %f\n", h_max, h_min);

    // CPU reference for comparison
    h_max = h_in[0]; h_min = h_in[0];
    for (size_t i = 1; i < ARRAY_SIZE; ++i) {
        h_max = std::max(h_in[i], h_max);
        h_min = std::min(h_in[i], h_min);
    }
    printf("Max_CPU: %f Min_CPU: %f\n", h_max, h_min);

    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_min);
    cudaFree(d_max);
    return 0;
}

--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 5 Code Snippets/CMakeLists.txt:
--------------------------------------------------------------------------------
############################################################################
# CMakeLists.txt for OpenCV and CUDA.
# 2012-02-07
# Quan Tran Minh.
edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | 8 | # collect source files 9 | 10 | CUDA_ADD_EXECUTABLE(Lesson5_deviceQuery deviceQuery_simplified.cpp) 11 | 12 | CUDA_ADD_EXECUTABLE(Lesson5_transpose transpose.cu gputimer.h) 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 5 Code Snippets/gputimer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. 
# edit by Johannes Kast, Michael Sarahan
# quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
############################################################################

# Sources and headers are listed explicitly rather than globbed, so that
# adding or removing a file shows up as a change in this file and always
# triggers a re-configure.

# Lesson 7: tiling example
set(Lesson7_tiling_hdr
  tiling/gputimer.h
  tiling/utils.h
)
set(Lesson7_tiling_files
  tiling/tiling.cu
)
cuda_add_executable(Lesson7_tiling ${Lesson7_tiling_files} ${Lesson7_tiling_hdr})

# Lesson 7: Thrust example
set(Lesson7_thrust_hdr
  thrust/gettime.h
  thrust/gputimer.h
)
set(Lesson7_thrust_files
  thrust/thrust_example.cu
  thrust/gettime.cc
)
cuda_add_executable(Lesson7_thrust ${Lesson7_thrust_files} ${Lesson7_thrust_hdr})

# Lesson 7: OpenCV example
set(Lesson7_opencv_hdr
  opencv/gettime.h
)
set(Lesson7_opencv_files
  opencv/opencv.cu
  opencv/gettime.cc
)
cuda_add_executable(Lesson7_opencv ${Lesson7_opencv_files} ${Lesson7_opencv_hdr})

--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/cub/example_block_scan_cum.cu:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Simple demonstration of cub::BlockScan 31 | * 32 | * Example compilation string: 33 | * 34 | * nvcc example_block_scan_sum.cu -gencode=arch=compute_20,code=\"sm_20,compute_20\" -o example_block_scan_sum 35 | * 36 | ******************************************************************************/ 37 | 38 | // Ensure printing of CUDA runtime errors to console (define before including cub.h) 39 | #define CUB_STDERR 40 | 41 | #include 42 | #include 43 | 44 | #include 45 | 46 | using namespace cub; 47 | 48 | //--------------------------------------------------------------------- 49 | // Globals, constants and typedefs 50 | //--------------------------------------------------------------------- 51 | 52 | bool g_verbose = false; 53 | int g_iterations = 100; 54 | 55 | 56 | //--------------------------------------------------------------------- 57 | // Kernels 58 | //--------------------------------------------------------------------- 59 | 60 | /** 61 | * Simple kernel for performing a block-wide exclusive prefix sum over integers 62 | */ 63 | template < 64 | int BLOCK_THREADS, 65 | int ITEMS_PER_THREAD> 66 | __global__ void BlockPrefixSumKernel( 67 | int *d_in, // Tile of input 68 | int *d_out, // Tile of output 69 | clock_t *d_elapsed) // Elapsed cycle count of block scan 70 | { 71 | // Parameterize BlockScan type for our thread block 72 | typedef BlockScan BlockScanT; 73 | 74 | // Shared memory 75 | __shared__ typename BlockScanT::SmemStorage smem_storage; 76 | 77 | // Per-thread tile data 78 | int data[ITEMS_PER_THREAD]; 79 | BlockLoadVectorized(d_in, data); 80 | 81 | // Start cycle timer 82 | clock_t start = clock(); 83 | 84 | // Compute exclusive prefix sum 85 | int aggregate; 86 | BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate); 87 | 88 | // Stop cycle timer 89 | 
clock_t stop = clock(); 90 | 91 | // Store output 92 | BlockStoreVectorized(d_out, data); 93 | 94 | // Store aggregate and elapsed clocks 95 | if (threadIdx.x == 0) 96 | { 97 | *d_elapsed = (start > stop) ? start - stop : stop - start; 98 | d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate; 99 | } 100 | } 101 | 102 | 103 | 104 | //--------------------------------------------------------------------- 105 | // Host utilities 106 | //--------------------------------------------------------------------- 107 | 108 | /** 109 | * Initialize exclusive prefix sum problem (and solution). 110 | * Returns the aggregate 111 | */ 112 | int Initialize( 113 | int *h_in, 114 | int *h_reference, 115 | int num_elements) 116 | { 117 | int inclusive = 0; 118 | 119 | for (int i = 0; i < num_elements; ++i) 120 | { 121 | h_in[i] = i % 17; 122 | 123 | h_reference[i] = inclusive; 124 | inclusive += h_in[i]; 125 | } 126 | 127 | return inclusive; 128 | } 129 | 130 | 131 | /** 132 | * Test thread block scan 133 | */ 134 | template < 135 | int BLOCK_THREADS, 136 | int ITEMS_PER_THREAD> 137 | void Test() 138 | { 139 | const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; 140 | 141 | // Allocate host arrays 142 | int *h_in = new int[TILE_SIZE]; 143 | int *h_reference = new int[TILE_SIZE]; 144 | int *h_gpu = new int[TILE_SIZE + 1]; 145 | 146 | // Initialize problem and reference output on host 147 | int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE); 148 | 149 | // Initialize device arrays 150 | int *d_in = NULL; 151 | int *d_out = NULL; 152 | clock_t *d_elapsed = NULL; 153 | cudaMalloc((void**)&d_in, sizeof(int) * TILE_SIZE); 154 | cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1)); 155 | cudaMalloc((void**)&d_elapsed, sizeof(clock_t)); 156 | 157 | // Display input problem data 158 | if (g_verbose) 159 | { 160 | printf("Input data: "); 161 | for (int i = 0; i < TILE_SIZE; i++) 162 | printf("%d, ", h_in[i]); 163 | printf("\n\n"); 164 | } 165 | 166 | // Copy problem to device 167 
| cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); 168 | 169 | printf("BlockScan %d items (%d threads, %d items per thread): ", 170 | TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD); 171 | 172 | // Run this several times and average the performance results 173 | clock_t elapsed_scan_clocks = 0; 174 | for (int i = 0; i < g_iterations; ++i) 175 | { 176 | // Run aggregate/prefix kernel 177 | BlockPrefixSumKernel<<<1, BLOCK_THREADS>>>( 178 | d_in, 179 | d_out, 180 | d_elapsed); 181 | 182 | // Copy results from device 183 | clock_t scan_clocks; 184 | cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost); 185 | cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost); 186 | elapsed_scan_clocks += scan_clocks; 187 | } 188 | 189 | // Check scanned items 190 | bool correct = true; 191 | for (int i = 0; i < TILE_SIZE; i++) 192 | { 193 | if (h_gpu[i] != h_reference[i]) 194 | { 195 | printf("Incorrect result @ offset %d (%d != %d)\n", 196 | i, h_gpu[i], h_reference[i]); 197 | correct = false; 198 | break; 199 | } 200 | } 201 | 202 | // Check total aggregate 203 | if (h_gpu[TILE_SIZE] != h_aggregate) 204 | { 205 | printf("Incorrect aggregate (%d != %d)\n", h_gpu[TILE_SIZE], h_aggregate); 206 | correct = false; 207 | } 208 | if (correct) printf("Correct!\n"); 209 | 210 | // Display results problem data 211 | if (g_verbose) 212 | { 213 | printf("GPU output (reference output): "); 214 | for (int i = 0; i < TILE_SIZE; i++) 215 | printf("%d (%d), ", h_gpu[i], h_reference[i]); 216 | printf("\n"); 217 | printf("GPU aggregate (reference aggregate)", h_gpu[TILE_SIZE], h_aggregate); 218 | printf("\n\n"); 219 | } 220 | 221 | // Display timing results 222 | printf("Average clocks per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations); 223 | 224 | // Cleanup 225 | if (h_in) delete[] h_in; 226 | if (h_reference) delete[] h_reference; 227 | if (h_gpu) delete[] h_gpu; 228 | if (d_in) 
cudaFree(d_in); 229 | if (d_out) cudaFree(d_out); 230 | if (d_elapsed) cudaFree(d_elapsed); 231 | } 232 | 233 | 234 | /** 235 | * Main 236 | */ 237 | int main(int argc, char** argv) 238 | { 239 | // Display GPU name 240 | cudaDeviceProp props; 241 | cudaGetDeviceProperties(&props, 0); 242 | printf("Using device %s\n", props.name); 243 | 244 | /** Add tests here **/ 245 | 246 | // Run tests 247 | Test<1024, 1>(); 248 | Test<512, 2>(); 249 | Test<256, 4>(); 250 | Test<128, 8>(); 251 | Test<64, 16>(); 252 | Test<32, 32>(); 253 | Test<16, 64>(); 254 | 255 | /****/ 256 | 257 | return 0; 258 | } 259 | 260 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/opencv/gettime.cc: -------------------------------------------------------------------------------- 1 | #define WIN32_LEAN_AND_MEAN 2 | #include 3 | #include // portable: uint64_t MSVC: __int64 4 | #include "gettime.h" 5 | 6 | int gettimeofday(struct timeval * tp, struct timezone * tzp) 7 | { 8 | // Note: some broken versions only have 8 trailing zero's, the correct epoch has 9 trailing zero's 9 | static const uint64_t EPOCH = ((uint64_t) 116444736000000000ULL); 10 | 11 | SYSTEMTIME system_time; 12 | FILETIME file_time; 13 | uint64_t time; 14 | 15 | GetSystemTime( &system_time ); 16 | SystemTimeToFileTime( &system_time, &file_time ); 17 | time = ((uint64_t)file_time.dwLowDateTime ) ; 18 | time += ((uint64_t)file_time.dwHighDateTime) << 32; 19 | 20 | tp->tv_sec = (long) ((time - EPOCH) / 10000000L); 21 | tp->tv_usec = (long) (system_time.wMilliseconds * 1000); 22 | return 0; 23 | } 24 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/opencv/gettime.h: -------------------------------------------------------------------------------- 1 | #ifndef GETTIME_H 2 | #define GETTIME_H 3 | 4 | #include 5 | 6 | // MSVC defines this in winsock2.h!? 
7 | /*struct timeval { 8 | long tv_sec; 9 | long tv_usec; 10 | }; 11 | */ 12 | int gettimeofday(struct timeval * tp, struct timezone * tzp); 13 | 14 | double tic(); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/opencv/opencv.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "gettime.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | using namespace cv; 13 | using namespace cv::cuda; 14 | 15 | int main(int argc, char **argv) { 16 | 17 | cv::Mat src = cv::imread("IMAG0179_small.jpg", cv::IMREAD_GRAYSCALE); 18 | 19 | if (!src.data) { 20 | printf("failed opening jpg\n"); 21 | exit(1); 22 | } 23 | 24 | Mat mask; 25 | cv::Canny(src, mask, 100, 200, 3); 26 | 27 | Mat dst_cpu; 28 | cv::cvtColor(mask, dst_cpu, COLOR_GRAY2BGR); 29 | Mat dst_gpu = dst_cpu.clone(); 30 | 31 | vector lines_cpu; 32 | { 33 | const int64 start = getTickCount(); 34 | 35 | cv::HoughLinesP(mask, lines_cpu, 1, CV_PI / 180, 50, 60, 5); 36 | 37 | const double timeSec = (getTickCount() - start) / getTickFrequency(); 38 | cout << "CPU Time : " << timeSec * 1000 << " ms" << endl; 39 | cout << "CPU Found : " << lines_cpu.size() << endl; 40 | } 41 | 42 | for (size_t i = 0; i < lines_cpu.size(); ++i) 43 | { 44 | Vec4i l = lines_cpu[i]; 45 | line(dst_cpu, Point(l[0], l[1]), Point(l[2], l[3]), Scalar(0, 0, 255), 3, LINE_AA); 46 | } 47 | 48 | GpuMat d_src(mask); 49 | GpuMat d_lines; 50 | { 51 | const int64 start = getTickCount(); 52 | 53 | Ptr hough = cuda::createHoughSegmentDetector(1.0f, (float)(CV_PI / 180.0f), 60, 5); 54 | hough->detect(d_src, d_lines); 55 | 56 | const double timeSec = (getTickCount() - start) / getTickFrequency(); 57 | cout << "GPU Time : " << timeSec * 1000 << " ms" << endl; 58 | cout << "GPU Found : " << d_lines.cols << endl; 59 | } 60 | vector 
lines_gpu; 61 | if (!d_lines.empty()) 62 | { 63 | lines_gpu.resize(d_lines.cols); 64 | Mat h_lines(1, d_lines.cols, CV_32SC4, &lines_gpu[0]); 65 | d_lines.download(h_lines); 66 | } 67 | 68 | for (size_t i = 0; i < lines_gpu.size(); ++i) 69 | { 70 | Vec4i l = lines_gpu[i]; 71 | line(dst_gpu, Point(l[0], l[1]), Point(l[2], l[3]), Scalar(0, 0, 255), 3, LINE_AA); 72 | } 73 | 74 | imshow("source", src); 75 | imshow("detected lines [CPU]", dst_cpu); 76 | imshow("detected lines [GPU]", dst_gpu); 77 | waitKey(); 78 | 79 | return 0; 80 | } 81 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gettime.cc: -------------------------------------------------------------------------------- 1 | #define WIN32_LEAN_AND_MEAN 2 | #include 3 | #include // portable: uint64_t MSVC: __int64 4 | #include "gettime.h" 5 | 6 | int gettimeofday(struct timeval * tp, struct timezone * tzp) 7 | { 8 | // Note: some broken versions only have 8 trailing zero's, the correct epoch has 9 trailing zero's 9 | static const uint64_t EPOCH = ((uint64_t) 116444736000000000ULL); 10 | 11 | SYSTEMTIME system_time; 12 | FILETIME file_time; 13 | uint64_t time; 14 | 15 | GetSystemTime( &system_time ); 16 | SystemTimeToFileTime( &system_time, &file_time ); 17 | time = ((uint64_t)file_time.dwLowDateTime ) ; 18 | time += ((uint64_t)file_time.dwHighDateTime) << 32; 19 | 20 | tp->tv_sec = (long) ((time - EPOCH) / 10000000L); 21 | tp->tv_usec = (long) (system_time.wMilliseconds * 1000); 22 | return 0; 23 | } 24 | 25 | /*double tic() { 26 | struct timeval t; 27 | gettimeofday(&t, NULL); 28 | return ((double)t.tv_sec * 1000 + ((double)t.tv_usec) / 1000.); 29 | }*/ 30 | 31 | double tic() { 32 | LARGE_INTEGER m_nFreq; 33 | LARGE_INTEGER m_Time; 34 | QueryPerformanceFrequency(&m_nFreq); 35 | QueryPerformanceCounter(&m_Time); 36 | return (double)m_Time.QuadPart * 1000. 
/ m_nFreq.QuadPart; 37 | } -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gettime.h: -------------------------------------------------------------------------------- 1 | #ifndef GETTIME_H 2 | #define GETTIME_H 3 | 4 | #include 5 | 6 | // MSVC defines this in winsock2.h!? 7 | /*struct timeval { 8 | long tv_sec; 9 | long tv_usec; 10 | }; 11 | */ 12 | int gettimeofday(struct timeval * tp, struct timezone * tzp); 13 | double tic(); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gputimer.h: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TIMER_H__ 2 | #define __GPU_TIMER_H__ 3 | 4 | struct GpuTimer 5 | { 6 | cudaEvent_t start; 7 | cudaEvent_t stop; 8 | 9 | GpuTimer() 10 | { 11 | cudaEventCreate(&start); 12 | cudaEventCreate(&stop); 13 | } 14 | 15 | ~GpuTimer() 16 | { 17 | cudaEventDestroy(start); 18 | cudaEventDestroy(stop); 19 | } 20 | 21 | void Start() 22 | { 23 | cudaEventRecord(start, 0); 24 | } 25 | 26 | void Stop() 27 | { 28 | cudaEventRecord(stop, 0); 29 | } 30 | 31 | float Elapsed() 32 | { 33 | float elapsed; 34 | cudaEventSynchronize(stop); 35 | cudaEventElapsedTime(&elapsed, start, stop); 36 | return elapsed; 37 | } 38 | }; 39 | 40 | #endif /* __GPU_TIMER_H__ */ -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/thrust/thrust_example.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "gputimer.h" 11 | #include "gettime.h" 12 | 13 | int main(void) 14 | { 15 | // generate N random numbers serially 16 | int N = 1000000; 17 | std::vector h_vec(N); 18 | std::generate(h_vec.begin(), h_vec.end(), 
rand); 19 | std::vector h_vec_std(h_vec); 20 | 21 | double t0 = tic(); 22 | thrust::sort(h_vec.begin(), h_vec.end()); 23 | std::cout << "thrust::sort took " << tic() - t0 << " ms." << std::endl; 24 | 25 | t0 = tic(); 26 | std::sort(h_vec_std.begin(), h_vec_std.end()); 27 | std::cout << "std::sort took " << tic() - t0 << " ms." << std::endl; 28 | 29 | for (int i = 0; i < N; i++) { 30 | if (h_vec[i] != h_vec_std[i]) { 31 | std::cout << i << " Not same!" << std::endl; 32 | exit(1); 33 | } 34 | } 35 | 36 | return 0; 37 | } -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/tiling/a.exp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/a.exp -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/tiling/gputimer.h: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TIMER_H__ 2 | #define __GPU_TIMER_H__ 3 | 4 | struct GpuTimer 5 | { 6 | cudaEvent_t start; 7 | cudaEvent_t stop; 8 | 9 | GpuTimer() 10 | { 11 | cudaEventCreate(&start); 12 | cudaEventCreate(&stop); 13 | } 14 | 15 | ~GpuTimer() 16 | { 17 | cudaEventDestroy(start); 18 | cudaEventDestroy(stop); 19 | } 20 | 21 | void Start() 22 | { 23 | cudaEventRecord(start, 0); 24 | } 25 | 26 | void Stop() 27 | { 28 | cudaEventRecord(stop, 0); 29 | } 30 | 31 | float Elapsed() 32 | { 33 | float elapsed; 34 | cudaEventSynchronize(stop); 35 | cudaEventElapsedTime(&elapsed, start, stop); 36 | return elapsed; 37 | } 38 | }; 39 | 40 | #endif /* __GPU_TIMER_H__ */ -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/tiling/tiling.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "gputimer.h" 5 | #include "utils.h" 6 | 7 | const int BLOCKSIZE = 128; 8 | const int NUMBLOCKS = 100; // set this to 1 or 2 for debugging 9 | const int N = BLOCKSIZE*NUMBLOCKS; 10 | 11 | /* 12 | * TODO: modify the foo and bar kernels to use tiling: 13 | * - copy the input data to shared memory 14 | * - perform the computation there 15 | * - copy the result back to global memory 16 | * - assume thread blocks of 128 threads 17 | * - handle intra-block boundaries correctly 18 | * You can ignore boundary conditions (we ignore the first 2 and last 2 elements) 19 | */ 20 | __global__ void foo(float out[], float A[], float B[], float C[], float D[], float E[]){ 21 | 22 | int i = threadIdx.x + blockIdx.x*blockDim.x; 23 | 24 | out[i] = (A[i] + B[i] + C[i] + D[i] + E[i]) / 5.0f; 25 | } 26 | 27 | __global__ void bar(float out[], float in[]) 28 | { 29 | int i = threadIdx.x + blockIdx.x*blockDim.x; 30 | 31 | out[i] = (in[i-2] + in[i-1] + in[i] + in[i+1] + in[i+2]) / 5.0f; 32 | } 33 | 34 | __global__ void bar_tile(float out[], float in[]) 35 | { 36 | int i = threadIdx.x + blockIdx.x*blockDim.x; 37 | int idx = threadIdx.x; 38 | extern __shared__ float sh_din[]; 39 | sh_din[idx + 2] = in[i]; 40 | if (idx == 0) { 41 | sh_din[idx] = in[i-2]; 42 | sh_din[idx+1] = in[i-1]; 43 | } 44 | else if (idx == blockDim.x - 1) { 45 | sh_din[idx + 3] = in[i+1]; 46 | sh_din[idx + 4] = in[i+2]; 47 | } 48 | __syncthreads(); 49 | 50 | out[i] = (sh_din[idx] + sh_din[idx + 1] + sh_din[idx + 2] + sh_din[idx + 3] + sh_din[idx + 4]) / 5.0f; 51 | } 52 | 53 | __global__ void bar_tile_2(float out[], float in[]) 54 | { 55 | int i = threadIdx.x + blockIdx.x*blockDim.x; 56 | int idx = threadIdx.x; 57 | extern __shared__ float sh_din[]; 58 | sh_din[idx] = in[i]; 59 | __syncthreads(); 60 | if (idx == 0) { 61 | out[i] = (in[i - 2] + in[i - 1] + sh_din[idx] + sh_din[idx + 1] + 
sh_din[idx + 2]) / 5.0f; 62 | } 63 | else if (idx == 1) { 64 | out[i] = (in[i - 2] + sh_din[idx - 1] + sh_din[idx] + sh_din[idx + 1] + sh_din[idx + 2]) / 5.0f; 65 | } 66 | else if (idx == blockDim.x - 2) { 67 | out[i] = (sh_din[idx - 2] + sh_din[idx - 1] + sh_din[idx] + sh_din[idx + 1] + in[i + 2]) / 5.0f; 68 | } 69 | else if (idx == blockDim.x - 1) { 70 | out[i] = (sh_din[idx - 2] + sh_din[idx - 1] + sh_din[idx] + in[i + 1] + in[i + 2]) / 5.0f; 71 | } 72 | else { 73 | out[i] = (sh_din[idx - 2] + sh_din[idx - 1] + sh_din[idx] + sh_din[idx + 1] + sh_din[idx + 2]) / 5.0f; 74 | } 75 | } 76 | 77 | __global__ void bar_tile_3(float out[], float in[]) 78 | { 79 | int idx = threadIdx.x; 80 | extern __shared__ float sh_din[]; 81 | int i_in = blockIdx.x * BLOCKSIZE + idx; 82 | sh_din[idx] = in[i_in-2]; 83 | __syncthreads(); 84 | if (idx < blockDim.x-4) 85 | out[i_in] = (sh_din[idx] + sh_din[idx + 1] + sh_din[idx + 2] + sh_din[idx + 3] + sh_din[idx + 4]) / 5.0f; 86 | } 87 | 88 | void cpuFoo(float out[], float A[], float B[], float C[], float D[], float E[]) 89 | { 90 | for (int i=0; i>>(d_fooOut, d_fooA, d_fooB, d_fooC, d_fooD, d_fooE); 147 | fooTimer.Stop(); 148 | 149 | checkCudaErrors(cudaMemcpy(fooOut, d_fooOut, numBytes, cudaMemcpyDeviceToHost)); 150 | printf("foo<<<>>>(): %g ms elapsed. Verifying solution...", fooTimer.Elapsed()); 151 | compareArrays(ref_fooOut, fooOut, N); 152 | 153 | barTimer.Start(); 154 | bar<<>>(d_barOut, d_barIn); 155 | //bar_tile << > >(d_barOut, d_barIn); 156 | //bar_tile_2 << > >(d_barOut, d_barIn); 157 | //bar_tile_3 << > >(d_barOut, d_barIn); 158 | barTimer.Stop(); 159 | 160 | checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost)); 161 | printf("bar<<<>>>(): %g ms elapsed. 
Verifying solution...", barTimer.Elapsed()); 162 | compareArrays(ref_barOut, barOut, N); 163 | 164 | barTimer.Start(); 165 | bar_tile << > >(d_barOut, d_barIn); 166 | barTimer.Stop(); 167 | 168 | checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost)); 169 | printf("bar_tile<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed()); 170 | compareArrays(ref_barOut, barOut, N); 171 | 172 | barTimer.Start(); 173 | bar_tile_2 << > >(d_barOut, d_barIn); 174 | barTimer.Stop(); 175 | 176 | checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost)); 177 | printf("bar_tile_2<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed()); 178 | compareArrays(ref_barOut, barOut, N); 179 | 180 | barTimer.Start(); 181 | bar_tile_3 << > >(d_barOut, d_barIn); 182 | barTimer.Stop(); 183 | 184 | checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost)); 185 | printf("bar_tile_3<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed()); 186 | compareArrays(ref_barOut, barOut, N); 187 | 188 | checkCudaErrors(cudaFree(d_fooA)); 189 | checkCudaErrors(cudaFree(d_fooB)); 190 | checkCudaErrors(cudaFree(d_fooC)); 191 | checkCudaErrors(cudaFree(d_fooD)); 192 | checkCudaErrors(cudaFree(d_fooE)); 193 | checkCudaErrors(cudaFree(d_barIn)); 194 | checkCudaErrors(cudaFree(d_fooOut)); 195 | checkCudaErrors(cudaFree(d_barOut)); 196 | } 197 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/tiling/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | // error checking utility functions 14 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 15 | 16 | template 17 | void check(T err, const char* const func, const char* 
const file, const int line) 18 | { 19 | if (err != cudaSuccess) { 20 | fprintf(stderr, "CUDA error at: %s : %d\n", file,line); 21 | fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);; 22 | exit(1); 23 | } 24 | } 25 | 26 | void printArray(float in[], int N) 27 | { 28 | for (int i=0; i CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | 8 | # collect source files 9 | 10 | file( GLOB hdr *.hpp *.h ) 11 | file( GLOB cu *.cu) 12 | SET (HW1_files main.cpp reference_calc.cpp compare.cpp) 13 | 14 | CUDA_ADD_EXECUTABLE(HW1 ${HW1_files} ${hdr} ${cu}) -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/HW1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "utils.h" 5 | #include 6 | #include 7 | #include 8 | 9 | cv::Mat imageRGBA; 10 | cv::Mat imageGrey; 11 | 12 | uchar4 *d_rgbaImage__; 13 | unsigned char *d_greyImage__; 14 | 15 | size_t numRows() { return imageRGBA.rows; } 16 | size_t numCols() { return imageRGBA.cols; } 17 | 18 | //return types are void since any internal error will be handled by quitting 19 | //no point in returning error codes... 
20 | //returns a pointer to an RGBA version of the input image 21 | //and a pointer to the single channel grey-scale output 22 | //on both the host and device 23 | void preProcess(uchar4 **inputImage, unsigned char **greyImage, 24 | uchar4 **d_rgbaImage, unsigned char **d_greyImage, 25 | const std::string &filename) { 26 | //make sure the context initializes ok 27 | checkCudaErrors(cudaFree(0)); 28 | 29 | cv::Mat image; 30 | image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); 31 | if (image.empty()) { 32 | std::cerr << "Couldn't open file: " << filename << std::endl; 33 | exit(1); 34 | } 35 | 36 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA); 37 | 38 | //allocate memory for the output 39 | imageGrey.create(image.rows, image.cols, CV_8UC1); 40 | 41 | //This shouldn't ever happen given the way the images are created 42 | //at least based upon my limited understanding of OpenCV, but better to check 43 | if (!imageRGBA.isContinuous() || !imageGrey.isContinuous()) { 44 | std::cerr << "Images aren't continuous!! Exiting." 
<< std::endl; 45 | exit(1); 46 | } 47 | 48 | *inputImage = (uchar4 *)imageRGBA.ptr(0); 49 | *greyImage = imageGrey.ptr(0); 50 | 51 | const size_t numPixels = numRows() * numCols(); 52 | //allocate memory on the device for both input and output 53 | checkCudaErrors(cudaMalloc(d_rgbaImage, sizeof(uchar4) * numPixels)); 54 | checkCudaErrors(cudaMalloc(d_greyImage, sizeof(unsigned char) * numPixels)); 55 | checkCudaErrors(cudaMemset(*d_greyImage, 0, numPixels * sizeof(unsigned char))); //make sure no memory is left laying around 56 | 57 | //copy input array to the GPU 58 | checkCudaErrors(cudaMemcpy(*d_rgbaImage, *inputImage, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice)); 59 | 60 | d_rgbaImage__ = *d_rgbaImage; 61 | d_greyImage__ = *d_greyImage; 62 | } 63 | 64 | void postProcess(const std::string& output_file, unsigned char* data_ptr) { 65 | cv::Mat output(numRows(), numCols(), CV_8UC1, (void*)data_ptr); 66 | 67 | //output the image 68 | cv::imwrite(output_file.c_str(), output); 69 | } 70 | 71 | void cleanup() 72 | { 73 | //cleanup 74 | cudaFree(d_rgbaImage__); 75 | cudaFree(d_greyImage__); 76 | } 77 | 78 | void generateReferenceImage(std::string input_filename, std::string output_filename) 79 | { 80 | cv::Mat reference = cv::imread(input_filename, CV_LOAD_IMAGE_GRAYSCALE); 81 | 82 | cv::imwrite(output_filename, reference); 83 | 84 | } 85 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | 3 | ################################### 4 | # These are the default install # 5 | # locations on most linux distros # 6 | ################################### 7 | 8 | OPENCV_LIBPATH=/usr/lib 9 | OPENCV_INCLUDEPATH=/usr/include 10 | 11 | ################################################### 12 | # On Macs the default install locations are below # 13 | ################################################### 14 | 15 | 
#OPENCV_LIBPATH=/usr/local/lib 16 | #OPENCV_INCLUDEPATH=/usr/local/include 17 | 18 | # or if using MacPorts 19 | 20 | #OPENCV_LIBPATH=/opt/local/lib 21 | #OPENCV_INCLUDEPATH=/opt/local/include 22 | 23 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui 24 | 25 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include 26 | 27 | ###################################################### 28 | # On Macs the default install locations are below # 29 | # #################################################### 30 | 31 | #CUDA_INCLUDEPATH=/usr/local/cuda/include 32 | #CUDA_LIBPATH=/usr/local/cuda/lib 33 | 34 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 35 | 36 | GCC_OPTS=-O3 -Wall -Wextra -m64 37 | 38 | student: main.o student_func.o compare.o reference_calc.o Makefile 39 | $(NVCC) -o HW1 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) 40 | 41 | main.o: main.cpp timer.h utils.h reference_calc.cpp compare.cpp HW1.cpp 42 | g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) -I $(OPENCV_INCLUDEPATH) 43 | 44 | student_func.o: student_func.cu utils.h 45 | nvcc -c student_func.cu $(NVCC_OPTS) 46 | 47 | compare.o: compare.cpp compare.h 48 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 49 | 50 | reference_calc.o: reference_calc.cpp reference_calc.h 51 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 52 | 53 | clean: 54 | rm -f *.o *.png hw 55 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/cinque_terre.gold: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 1/cinque_terre.gold -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/cinque_terre_small.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 1/cinque_terre_small.jpg -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/compare.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "utils.h" 6 | 7 | void compareImages(std::string reference_filename, std::string test_filename, 8 | bool useEpsCheck, double perPixelError, double globalError) 9 | { 10 | cv::Mat reference = cv::imread(reference_filename, -1); 11 | cv::Mat test = cv::imread(test_filename, -1); 12 | 13 | cv::Mat diff = abs(reference - test); 14 | 15 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows 16 | 17 | double minVal, maxVal; 18 | 19 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location 20 | 21 | //now perform transform so that we bump values to the full range 22 | 23 | diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal)); 24 | 25 | diff = diffSingleChannel.reshape(reference.channels(), 0); 26 | 27 | cv::imwrite("HW1_differenceImage.png", diff); 28 | //OK, now we can start comparing values... 
29 | unsigned char *referencePtr = reference.ptr(0); 30 | unsigned char *testPtr = test.ptr(0); 31 | 32 | if (useEpsCheck) { 33 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError); 34 | } 35 | else 36 | { 37 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels()); 38 | } 39 | 40 | std::cout << "PASS" << std::endl; 41 | return; 42 | } 43 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/compare.h: -------------------------------------------------------------------------------- 1 | #ifndef COMPARE_H__ 2 | #define COMPARE_H__ 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, 5 | bool useEpsCheck, double perPixelError, double globalError); 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/main.cpp: -------------------------------------------------------------------------------- 1 | //Udacity HW1 Solution 2 | 3 | #include 4 | #include "timer.h" 5 | #include "utils.h" 6 | #include 7 | #include 8 | #include "reference_calc.h" 9 | #include "compare.h" 10 | 11 | void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, 12 | uchar4 * const d_rgbaImage, 13 | unsigned char* const d_greyImage, 14 | size_t numRows, size_t numCols); 15 | 16 | //include the definitions of the above functions for this homework 17 | #include "HW1.cpp" 18 | 19 | int main(int argc, char **argv) { 20 | uchar4 *h_rgbaImage, *d_rgbaImage; 21 | unsigned char *h_greyImage, *d_greyImage; 22 | 23 | std::string input_file; 24 | std::string output_file; 25 | std::string reference_file; 26 | double perPixelError = 0.0; 27 | double globalError = 0.0; 28 | bool useEpsCheck = false; 29 | switch (argc) 30 | { 31 | case 2: 32 | input_file = std::string(argv[1]); 33 | output_file = "HW1_output.png"; 34 | reference_file = 
"HW1_reference.png"; 35 | break; 36 | case 3: 37 | input_file = std::string(argv[1]); 38 | output_file = std::string(argv[2]); 39 | reference_file = "HW1_reference.png"; 40 | break; 41 | case 4: 42 | input_file = std::string(argv[1]); 43 | output_file = std::string(argv[2]); 44 | reference_file = std::string(argv[3]); 45 | break; 46 | case 6: 47 | useEpsCheck=true; 48 | input_file = std::string(argv[1]); 49 | output_file = std::string(argv[2]); 50 | reference_file = std::string(argv[3]); 51 | perPixelError = atof(argv[4]); 52 | globalError = atof(argv[5]); 53 | break; 54 | default: 55 | std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl; 56 | exit(1); 57 | } 58 | //load the image and give us our input and output pointers 59 | preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file); 60 | 61 | GpuTimer timer; 62 | timer.Start(); 63 | //call the students' code 64 | your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols()); 65 | timer.Stop(); 66 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 67 | 68 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); 69 | 70 | if (err < 0) { 71 | //Couldn't print! Probably the student closed stdout - bad news 72 | std::cerr << "Couldn't print timing information! STDOUT Closed!" 
<< std::endl; 73 | exit(1); 74 | } 75 | 76 | size_t numPixels = numRows()*numCols(); 77 | checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost)); 78 | 79 | //check results and output the grey image 80 | postProcess(output_file, h_greyImage); 81 | 82 | referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols()); 83 | 84 | postProcess(reference_file, h_greyImage); 85 | 86 | //generateReferenceImage(input_file, reference_file); 87 | compareImages(reference_file, output_file, useEpsCheck, perPixelError, 88 | globalError); 89 | 90 | cleanup(); 91 | 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/reference_calc.cpp: -------------------------------------------------------------------------------- 1 | // for uchar4 struct 2 | #include 3 | 4 | void referenceCalculation(const uchar4* const rgbaImage, 5 | unsigned char *const greyImage, 6 | size_t numRows, 7 | size_t numCols) 8 | { 9 | for (size_t r = 0; r < numRows; ++r) { 10 | for (size_t c = 0; c < numCols; ++c) { 11 | uchar4 rgba = rgbaImage[r * numCols + c]; 12 | float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z; 13 | greyImage[r * numCols + c] = channelSum; 14 | } 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/reference_calc.h: -------------------------------------------------------------------------------- 1 | #ifndef REFERENCE_H__ 2 | #define REFERENCE_H__ 3 | 4 | void referenceCalculation(const uchar4* const rgbaImage, 5 | unsigned char *const greyImage, 6 | size_t numRows, 7 | size_t numCols); 8 | 9 | #endif -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/student_func.cu: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 1/student_func.cu -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 14 | 15 | template 16 | void check(T err, const char* const func, const char* const file, const int line) { 17 | if (err != cudaSuccess) { 18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | template 25 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) { 26 | //check that the GPU result matches the CPU result 27 | for (size_t i = 0; i < numElem; ++i) { 28 | if (ref[i] != 
gpu[i]) { 29 | std::cerr << "Difference at pos " << i << std::endl; 30 | //the + is magic to convert char to int without messing 31 | //with other types 32 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 33 | "\nGPU : " << +gpu[i] << std::endl; 34 | exit(1); 35 | } 36 | } 37 | } 38 | 39 | template 40 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) { 41 | assert(eps1 >= 0 && eps2 >= 0); 42 | unsigned long long totalDiff = 0; 43 | unsigned numSmallDifferences = 0; 44 | for (size_t i = 0; i < numElem; ++i) { 45 | //subtract smaller from larger in case of unsigned types 46 | T smaller = std::min(ref[i], gpu[i]); 47 | T larger = std::max(ref[i], gpu[i]); 48 | T diff = larger - smaller; 49 | if (diff > 0 && diff <= eps1) { 50 | numSmallDifferences++; 51 | } 52 | else if (diff > eps1) { 53 | std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl; 54 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 55 | "\nGPU : " << +gpu[i] << std::endl; 56 | exit(1); 57 | } 58 | totalDiff += diff * diff; 59 | } 60 | double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; 61 | if (percentSmallDifferences > eps2) { 62 | std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl; 63 | std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl; 64 | exit(1); 65 | } 66 | } 67 | 68 | //Uses the autodesk method of image comparison 69 | //Note the the tolerance here is in PIXELS not a percentage of input pixels 70 | template 71 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance) 72 | { 73 | 74 | size_t numBadPixels = 0; 75 | for (size_t i = 0; i < numElem; ++i) { 76 | T smaller = std::min(ref[i], gpu[i]); 77 | T larger = std::max(ref[i], gpu[i]); 78 | T 
diff = larger - smaller; 79 | if (diff > variance) 80 | ++numBadPixels; 81 | } 82 | 83 | if (numBadPixels > tolerance) { 84 | std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl; 85 | exit(1); 86 | } 87 | } 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | 8 | # collect source files 9 | 10 | file( GLOB hdr *.hpp *.h ) 11 | file( GLOB cu *.cu) 12 | SET (HW2_files main.cpp reference_calc.cpp compare.cpp) 13 | 14 | CUDA_ADD_EXECUTABLE(HW2 ${HW2_files} ${hdr} ${cu}) 15 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/HW2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "utils.h" 5 | #include 6 | #include 7 | #include 8 | 9 | cv::Mat imageInputRGBA; 10 | cv::Mat imageOutputRGBA; 11 | 12 | uchar4 *d_inputImageRGBA__; 13 | uchar4 *d_outputImageRGBA__; 14 | 15 | float *h_filter__; 16 | 17 | size_t numRows() { return imageInputRGBA.rows; } 18 | size_t numCols() { return imageInputRGBA.cols; } 19 | 20 | //return types are void since any internal error will be handled by quitting 21 | //no point in returning error codes... 
22 | //returns a pointer to an RGBA version of the input image 23 | //and a pointer to the single channel grey-scale output 24 | //on both the host and device 25 | void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA, 26 | uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA, 27 | unsigned char **d_redBlurred, 28 | unsigned char **d_greenBlurred, 29 | unsigned char **d_blueBlurred, 30 | float **h_filter, int *filterWidth, 31 | const std::string &filename) { 32 | 33 | //make sure the context initializes ok 34 | checkCudaErrors(cudaFree(0)); 35 | 36 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); 37 | if (image.empty()) { 38 | std::cerr << "Couldn't open file: " << filename << std::endl; 39 | exit(1); 40 | } 41 | 42 | cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA); 43 | 44 | //allocate memory for the output 45 | imageOutputRGBA.create(image.rows, image.cols, CV_8UC4); 46 | 47 | //This shouldn't ever happen given the way the images are created 48 | //at least based upon my limited understanding of OpenCV, but better to check 49 | if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) { 50 | std::cerr << "Images aren't continuous!! Exiting." 
<< std::endl; 51 | exit(1); 52 | } 53 | 54 | *h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr(0); 55 | *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr(0); 56 | 57 | const size_t numPixels = numRows() * numCols(); 58 | //allocate memory on the device for both input and output 59 | checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels)); 60 | checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels)); 61 | checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around 62 | 63 | //copy input array to the GPU 64 | checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice)); 65 | 66 | d_inputImageRGBA__ = *d_inputImageRGBA; 67 | d_outputImageRGBA__ = *d_outputImageRGBA; 68 | 69 | //now create the filter that they will use 70 | const int blurKernelWidth = 9; 71 | const float blurKernelSigma = 2.; 72 | 73 | *filterWidth = blurKernelWidth; 74 | 75 | //create and fill the filter we will convolve with 76 | *h_filter = new float[blurKernelWidth * blurKernelWidth]; 77 | h_filter__ = *h_filter; 78 | 79 | float filterSum = 0.f; //for normalization 80 | 81 | for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { 82 | for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { 83 | float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma)); 84 | (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue; 85 | filterSum += filterValue; 86 | } 87 | } 88 | 89 | float normalizationFactor = 1.f / filterSum; 90 | 91 | for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { 92 | for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { 93 | (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor; 94 | } 95 | } 96 | 97 | //blurred 98 | checkCudaErrors(cudaMalloc(d_redBlurred, sizeof(unsigned 
char) * numPixels)); 99 | checkCudaErrors(cudaMalloc(d_greenBlurred, sizeof(unsigned char) * numPixels)); 100 | checkCudaErrors(cudaMalloc(d_blueBlurred, sizeof(unsigned char) * numPixels)); 101 | checkCudaErrors(cudaMemset(*d_redBlurred, 0, sizeof(unsigned char) * numPixels)); 102 | checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels)); 103 | checkCudaErrors(cudaMemset(*d_blueBlurred, 0, sizeof(unsigned char) * numPixels)); 104 | } 105 | 106 | void postProcess(const std::string& output_file, uchar4* data_ptr) { 107 | cv::Mat output(numRows(), numCols(), CV_8UC4, (void*)data_ptr); 108 | 109 | cv::Mat imageOutputBGR; 110 | cv::cvtColor(output, imageOutputBGR, CV_RGBA2BGR); 111 | //output the image 112 | cv::imwrite(output_file.c_str(), imageOutputBGR); 113 | } 114 | 115 | void cleanUp(void) 116 | { 117 | cudaFree(d_inputImageRGBA__); 118 | cudaFree(d_outputImageRGBA__); 119 | delete[] h_filter__; 120 | } 121 | 122 | 123 | // An unused bit of code showing how to accomplish this assignment using OpenCV. It is much faster 124 | // than the naive implementation in reference_calc.cpp. 
125 | void generateReferenceImage(std::string input_file, std::string reference_file, int kernel_size) 126 | { 127 | cv::Mat input = cv::imread(input_file); 128 | // Create an identical image for the output as a placeholder 129 | cv::Mat reference = cv::imread(input_file); 130 | cv::GaussianBlur(input, reference, cv::Size2i(kernel_size, kernel_size),0); 131 | cv::imwrite(reference_file, reference); 132 | } 133 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | 3 | ################################### 4 | # These are the default install # 5 | # locations on most linux distros # 6 | ################################### 7 | 8 | OPENCV_LIBPATH=/usr/lib 9 | OPENCV_INCLUDEPATH=/usr/include 10 | 11 | ################################################### 12 | # On Macs the default install locations are below # 13 | ################################################### 14 | 15 | #OPENCV_LIBPATH=/usr/local/lib 16 | #OPENCV_INCLUDEPATH=/usr/local/include 17 | 18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui 19 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include 20 | 21 | ###################################################### 22 | # On Macs the default install locations are below # 23 | # #################################################### 24 | 25 | #CUDA_INCLUDEPATH=/usr/local/cuda/include 26 | #CUDA_LIBPATH=/usr/local/cuda/lib 27 | 28 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 29 | 30 | GCC_OPTS=-O3 -Wall -Wextra -m64 31 | 32 | student: main.o student_func.o compare.o reference_calc.o Makefile 33 | $(NVCC) -o HW2 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) 34 | 35 | main.o: main.cpp timer.h utils.h HW2.cpp 36 | g++ -c main.cpp $(GCC_OPTS) -I $(OPENCV_INCLUDEPATH) -I $(CUDA_INCLUDEPATH) 37 | 38 | student_func.o: 
student_func.cu reference_calc.cpp utils.h 39 | nvcc -c student_func.cu $(NVCC_OPTS) 40 | 41 | compare.o: compare.cpp compare.h 42 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 43 | 44 | reference_calc.o: reference_calc.cpp reference_calc.h 45 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 46 | 47 | clean: 48 | rm -f *.o *.png hw 49 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/cinque_terre.gold: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 2/cinque_terre.gold -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/cinque_terre_small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 2/cinque_terre_small.jpg -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/compare.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "utils.h" 6 | 7 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 8 | double perPixelError, double globalError) 9 | { 10 | cv::Mat reference = cv::imread(reference_filename, -1); 11 | cv::Mat test = cv::imread(test_filename, -1); 12 | 13 | cv::Mat diff = abs(reference - test); 14 | 15 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows 16 | 17 | double minVal, maxVal; 18 | 19 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location 20 | 21 | //now perform 
transform so that we bump values to the full range 22 | 23 | diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal)); 24 | 25 | diff = diffSingleChannel.reshape(reference.channels(), 0); 26 | 27 | cv::imwrite("HW2_differenceImage.png", diff); 28 | //OK, now we can start comparing values... 29 | unsigned char *referencePtr = reference.ptr(0); 30 | unsigned char *testPtr = test.ptr(0); 31 | 32 | if (useEpsCheck) { 33 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError); 34 | } 35 | else 36 | { 37 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels()); 38 | } 39 | 40 | std::cout << "PASS" << std::endl; 41 | return; 42 | } -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/compare.h: -------------------------------------------------------------------------------- 1 | #ifndef COMPARE_H__ 2 | #define COMPARE_H__ 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError); 6 | 7 | #endif -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/main.cpp: -------------------------------------------------------------------------------- 1 | //Udacity HW2 Driver 2 | 3 | #include 4 | #include "timer.h" 5 | #include "utils.h" 6 | #include 7 | #include 8 | 9 | #include "reference_calc.h" 10 | #include "compare.h" 11 | 12 | //include the definitions of the above functions for this homework 13 | #include "HW2.cpp" 14 | 15 | 16 | /******* DEFINED IN student_func.cu *********/ 17 | 18 | void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_inputImageRGBA, 19 | uchar4* const d_outputImageRGBA, 20 | const size_t numRows, const size_t numCols, 21 | unsigned char *d_redBlurred, 22 | unsigned char *d_greenBlurred, 23 | unsigned 
char *d_blueBlurred, 24 | const int filterWidth); 25 | 26 | void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsImage, 27 | const float* const h_filter, const size_t filterWidth); 28 | 29 | 30 | /******* Begin main *********/ 31 | 32 | int main(int argc, char **argv) { 33 | uchar4 *h_inputImageRGBA, *d_inputImageRGBA; 34 | uchar4 *h_outputImageRGBA, *d_outputImageRGBA; 35 | unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred; 36 | 37 | float *h_filter; 38 | int filterWidth; 39 | 40 | std::string input_file; 41 | std::string output_file; 42 | std::string reference_file; 43 | double perPixelError = 0.0; 44 | double globalError = 0.0; 45 | bool useEpsCheck = false; 46 | switch (argc) 47 | { 48 | case 2: 49 | input_file = std::string(argv[1]); 50 | output_file = "HW2_output.png"; 51 | reference_file = "HW2_reference.png"; 52 | break; 53 | case 3: 54 | input_file = std::string(argv[1]); 55 | output_file = std::string(argv[2]); 56 | reference_file = "HW2_reference.png"; 57 | break; 58 | case 4: 59 | input_file = std::string(argv[1]); 60 | output_file = std::string(argv[2]); 61 | reference_file = std::string(argv[3]); 62 | break; 63 | case 6: 64 | useEpsCheck=true; 65 | input_file = std::string(argv[1]); 66 | output_file = std::string(argv[2]); 67 | reference_file = std::string(argv[3]); 68 | perPixelError = atof(argv[4]); 69 | globalError = atof(argv[5]); 70 | break; 71 | default: 72 | std::cerr << "Usage: ./HW2 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl; 73 | exit(1); 74 | } 75 | //load the image and give us our input and output pointers 76 | preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA, 77 | &d_redBlurred, &d_greenBlurred, &d_blueBlurred, 78 | &h_filter, &filterWidth, input_file); 79 | 80 | allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth); 81 | GpuTimer timer; 82 | timer.Start(); 83 | //call the students' code 84 | 
your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(), 85 | d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth); 86 | timer.Stop(); 87 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 88 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); 89 | 90 | if (err < 0) { 91 | //Couldn't print! Probably the student closed stdout - bad news 92 | std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; 93 | exit(1); 94 | } 95 | 96 | //check results and output the blurred image 97 | 98 | size_t numPixels = numRows()*numCols(); 99 | //copy the output back to the host 100 | checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost)); 101 | 102 | postProcess(output_file, h_outputImageRGBA); 103 | 104 | referenceCalculation(h_inputImageRGBA, h_outputImageRGBA, 105 | numRows(), numCols(), 106 | h_filter, filterWidth); 107 | 108 | postProcess(reference_file, h_outputImageRGBA); 109 | 110 | // Cheater easy way with OpenCV 111 | //generateReferenceImage(input_file, reference_file, filterWidth); 112 | 113 | compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError); 114 | 115 | checkCudaErrors(cudaFree(d_redBlurred)); 116 | checkCudaErrors(cudaFree(d_greenBlurred)); 117 | checkCudaErrors(cudaFree(d_blueBlurred)); 118 | 119 | cleanUp(); 120 | 121 | return 0; 122 | } 123 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/reference_calc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | // for uchar4 struct 4 | #include 5 | 6 | void channelConvolution(const unsigned char* const channel, 7 | unsigned char* const channelBlurred, 8 | const size_t numRows, const size_t numCols, 9 | const float *filter, const int filterWidth) 10 | { 11 | //Dealing with an even width filter is trickier 12 
| assert(filterWidth % 2 == 1); 13 | 14 | //For every pixel in the image 15 | for (int r = 0; r < (int)numRows; ++r) { 16 | for (int c = 0; c < (int)numCols; ++c) { 17 | float result = 0.f; 18 | //For every value in the filter around the pixel (c, r) 19 | for (int filter_r = -filterWidth/2; filter_r <= filterWidth/2; ++filter_r) { 20 | for (int filter_c = -filterWidth/2; filter_c <= filterWidth/2; ++filter_c) { 21 | //Find the global image position for this filter position 22 | //clamp to boundary of the image 23 | int image_r = std::min(std::max(r + filter_r, 0), static_cast(numRows - 1)); 24 | int image_c = std::min(std::max(c + filter_c, 0), static_cast(numCols - 1)); 25 | 26 | float image_value = static_cast(channel[image_r * numCols + image_c]); 27 | float filter_value = filter[(filter_r + filterWidth/2) * filterWidth + filter_c + filterWidth/2]; 28 | 29 | result += image_value * filter_value; 30 | } 31 | } 32 | 33 | channelBlurred[r * numCols + c] = result; 34 | } 35 | } 36 | } 37 | 38 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage, 39 | size_t numRows, size_t numCols, 40 | const float* const filter, const int filterWidth) 41 | { 42 | unsigned char *red = new unsigned char[numRows * numCols]; 43 | unsigned char *blue = new unsigned char[numRows * numCols]; 44 | unsigned char *green = new unsigned char[numRows * numCols]; 45 | 46 | unsigned char *redBlurred = new unsigned char[numRows * numCols]; 47 | unsigned char *blueBlurred = new unsigned char[numRows * numCols]; 48 | unsigned char *greenBlurred = new unsigned char[numRows * numCols]; 49 | 50 | //First we separate the incoming RGBA image into three separate channels 51 | //for Red, Green and Blue 52 | for (size_t i = 0; i < numRows * numCols; ++i) { 53 | uchar4 rgba = rgbaImage[i]; 54 | red[i] = rgba.x; 55 | green[i] = rgba.y; 56 | blue[i] = rgba.z; 57 | } 58 | 59 | //Now we can do the convolution for each of the color channels 60 | channelConvolution(red, redBlurred, 
numRows, numCols, filter, filterWidth); 61 | channelConvolution(green, greenBlurred, numRows, numCols, filter, filterWidth); 62 | channelConvolution(blue, blueBlurred, numRows, numCols, filter, filterWidth); 63 | 64 | //now recombine into the output image - Alpha is 255 for no transparency 65 | for (size_t i = 0; i < numRows * numCols; ++i) { 66 | uchar4 rgba = make_uchar4(redBlurred[i], greenBlurred[i], blueBlurred[i], 255); 67 | outputImage[i] = rgba; 68 | } 69 | 70 | delete[] red; 71 | delete[] green; 72 | delete[] blue; 73 | 74 | delete[] redBlurred; 75 | delete[] greenBlurred; 76 | delete[] blueBlurred; 77 | } 78 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/reference_calc.h: -------------------------------------------------------------------------------- 1 | #ifndef REFERENCE_H__ 2 | #define REFERENCE_H__ 3 | 4 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage, 5 | size_t numRows, size_t numCols, 6 | const float* const filter, const int filterWidth); 7 | 8 | #endif -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/student_func.cu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 2/student_func.cu -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | 
{ 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | template 15 | void check(T err, const char* const func, const char* const file, const int line) { 16 | if (err != cudaSuccess) { 17 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 18 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 19 | exit(1); 20 | } 21 | } 22 | 23 | template 24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) { 25 | //check that the GPU result matches the CPU result 26 | for (size_t i = 0; i < numElem; ++i) { 27 | if (ref[i] != gpu[i]) { 28 | std::cerr << "Difference at pos " << i << std::endl; 29 | //the + is magic to convert char to int without messing 30 | //with other types 31 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 32 | "\nGPU : " << +gpu[i] << std::endl; 33 | exit(1); 34 | } 35 | } 36 | } 37 | 38 | template 39 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) { 40 | assert(eps1 >= 0 && eps2 >= 0); 41 | unsigned long long totalDiff = 0; 42 | unsigned numSmallDifferences = 0; 43 | for (size_t i = 0; i < numElem; ++i) { 44 | //subtract smaller from larger in case of unsigned types 45 | T smaller = std::min(ref[i], gpu[i]); 46 | T larger = 
std::max(ref[i], gpu[i]); 47 | T diff = larger - smaller; 48 | if (diff > 0 && diff <= eps1) { 49 | numSmallDifferences++; 50 | } 51 | else if (diff > eps1) { 52 | std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl; 53 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 54 | "\nGPU : " << +gpu[i] << std::endl; 55 | exit(1); 56 | } 57 | totalDiff += diff * diff; 58 | } 59 | double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; 60 | if (percentSmallDifferences > eps2) { 61 | std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl; 62 | std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl; 63 | exit(1); 64 | } 65 | } 66 | 67 | //Uses the autodesk method of image comparison 68 | //Note the the tolerance here is in PIXELS not a percentage of input pixels 69 | template 70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance) 71 | { 72 | 73 | size_t numBadPixels = 0; 74 | for (size_t i = 0; i < numElem; ++i) { 75 | T smaller = std::min(ref[i], gpu[i]); 76 | T larger = std::max(ref[i], gpu[i]); 77 | T diff = larger - smaller; 78 | if (diff > variance) 79 | ++numBadPixels; 80 | } 81 | 82 | if (numBadPixels > tolerance) { 83 | std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl; 84 | exit(1); 85 | } 86 | } 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. 
edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | # minimum required cmake version 8 | cmake_minimum_required(VERSION 2.8) 9 | find_package(CUDA QUIET REQUIRED) 10 | 11 | SET (compare_files compare.cpp) 12 | 13 | file( GLOB hdr *.hpp *.h ) 14 | file( GLOB cu *.cu) 15 | SET (HW3_files main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp) 16 | 17 | CUDA_ADD_EXECUTABLE(HW3 ${HW3_files} ${hdr} ${cu}) 18 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | 3 | ################################### 4 | # These are the default install # 5 | # locations on most linux distros # 6 | ################################### 7 | 8 | OPENCV_LIBPATH=/usr/lib 9 | OPENCV_INCLUDEPATH=/usr/include 10 | 11 | ################################################### 12 | # On Macs the default install locations are below # 13 | ################################################### 14 | 15 | #OPENCV_LIBPATH=/usr/local/lib 16 | #OPENCV_INCLUDEPATH=/usr/local/include 17 | 18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui 19 | 20 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include 21 | 22 | ###################################################### 23 | # On Macs the default install locations are below # 24 | # #################################################### 25 | 26 | #CUDA_INCLUDEPATH=/usr/local/cuda/include 27 | #CUDA_LIBPATH=/usr/local/cuda/lib 28 | 29 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 30 | 31 | GCC_OPTS=-O3 -Wall -Wextra -m64 32 | 33 | student: main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o Makefile 34 | $(NVCC) -o HW3 main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) 
$(NVCC_OPTS) 35 | 36 | main.o: main.cpp timer.h utils.h reference_calc.h compare.h 37 | g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 38 | 39 | HW3.o: HW3.cu loadSaveImage.h utils.h 40 | $(NVCC) -c HW3.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS) 41 | 42 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h 43 | g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 44 | 45 | compare.o: compare.cpp compare.h 46 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 47 | 48 | reference_calc.o: reference_calc.cpp reference_calc.h 49 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 50 | 51 | student_func.o: student_func.cu utils.h 52 | $(NVCC) -c student_func.cu $(NVCC_OPTS) 53 | 54 | clean: 55 | rm -f *.o hw 56 | find . -type f -name '*.exr' | grep -v memorial | xargs rm -f 57 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/compare.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "utils.h" 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError) 6 | { 7 | cv::Mat reference = cv::imread(reference_filename, -1); 8 | cv::Mat test = cv::imread(test_filename, -1); 9 | 10 | cv::Mat diff = abs(reference - test); 11 | 12 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows 13 | 14 | double minVal, maxVal; 15 | 16 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location 17 | 18 | //now perform transform so that we bump values to the full range 19 | 20 | diffSingleChannel = (diffSingleChannel - minVal) * (255. 
/ (maxVal - minVal)); 21 | 22 | diff = diffSingleChannel.reshape(reference.channels(), 0); 23 | 24 | cv::imwrite("HW3_differenceImage.png", diff); 25 | //OK, now we can start comparing values... 26 | unsigned char *referencePtr = reference.ptr(0); 27 | unsigned char *testPtr = test.ptr(0); 28 | 29 | if (useEpsCheck) { 30 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError); 31 | } 32 | else 33 | { 34 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels()); 35 | } 36 | 37 | std::cout << "PASS" << std::endl; 38 | return; 39 | } 40 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/compare.h: -------------------------------------------------------------------------------- 1 | #ifndef HW3_H__ 2 | #define HW3_H__ 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError); 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/loadSaveImage.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "cuda_runtime.h" 7 | 8 | //The caller becomes responsible for the returned pointer. This 9 | //is done in the interest of keeping this code as simple as possible. 10 | //In production code this is a bad idea - we should use RAII 11 | //to ensure the memory is freed. DO NOT COPY THIS AND USE IN PRODUCTION 12 | //CODE!!! 
13 | void loadImageHDR(const std::string &filename, 14 | float **imagePtr, 15 | size_t *numRows, size_t *numCols) 16 | { 17 | cv::Mat originImg = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH); 18 | 19 | cv::Mat image; 20 | 21 | if(originImg.type() != CV_32FC3){ 22 | originImg.convertTo(image,CV_32FC3); 23 | } else{ 24 | image = originImg; 25 | } 26 | 27 | if (image.empty()) { 28 | std::cerr << "Couldn't open file: " << filename << std::endl; 29 | exit(1); 30 | } 31 | 32 | if (image.channels() != 3) { 33 | std::cerr << "Image must be color!" << std::endl; 34 | exit(1); 35 | } 36 | 37 | if (!image.isContinuous()) { 38 | std::cerr << "Image isn't continuous!" << std::endl; 39 | exit(1); 40 | } 41 | 42 | *imagePtr = new float[image.rows * image.cols * image.channels()]; 43 | 44 | float *cvPtr = image.ptr(0); 45 | for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i) 46 | (*imagePtr)[i] = cvPtr[i]; 47 | 48 | *numRows = image.rows; 49 | *numCols = image.cols; 50 | } 51 | 52 | void loadImageRGBA(const std::string &filename, 53 | uchar4 **imagePtr, 54 | size_t *numRows, size_t *numCols) 55 | { 56 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); 57 | if (image.empty()) { 58 | std::cerr << "Couldn't open file: " << filename << std::endl; 59 | exit(1); 60 | } 61 | 62 | if (image.channels() != 3) { 63 | std::cerr << "Image must be color!" << std::endl; 64 | exit(1); 65 | } 66 | 67 | if (!image.isContinuous()) { 68 | std::cerr << "Image isn't continuous!" 
<< std::endl; 69 | exit(1); 70 | } 71 | 72 | cv::Mat imageRGBA; 73 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA); 74 | 75 | *imagePtr = new uchar4[image.rows * image.cols]; 76 | 77 | unsigned char *cvPtr = imageRGBA.ptr(0); 78 | for (size_t i = 0; i < image.rows * image.cols; ++i) { 79 | (*imagePtr)[i].x = cvPtr[4 * i + 0]; 80 | (*imagePtr)[i].y = cvPtr[4 * i + 1]; 81 | (*imagePtr)[i].z = cvPtr[4 * i + 2]; 82 | (*imagePtr)[i].w = cvPtr[4 * i + 3]; 83 | } 84 | 85 | *numRows = image.rows; 86 | *numCols = image.cols; 87 | } 88 | 89 | void saveImageRGBA(const uchar4* const image, 90 | const size_t numRows, const size_t numCols, 91 | const std::string &output_file) 92 | { 93 | int sizes[2]; 94 | sizes[0] = numRows; 95 | sizes[1] = numCols; 96 | cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image); 97 | cv::Mat imageOutputBGR; 98 | cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR); 99 | //output the image 100 | cv::imwrite(output_file.c_str(), imageOutputBGR); 101 | } 102 | 103 | //output an exr file 104 | //assumed to already be BGR 105 | void saveImageHDR(const float* const image, 106 | const size_t numRows, const size_t numCols, 107 | const std::string &output_file) 108 | { 109 | int sizes[2]; 110 | sizes[0] = numRows; 111 | sizes[1] = numCols; 112 | 113 | cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image); 114 | 115 | imageHDR = imageHDR * 255; 116 | 117 | cv::imwrite(output_file.c_str(), imageHDR); 118 | } 119 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/loadSaveImage.h: -------------------------------------------------------------------------------- 1 | #ifndef LOADSAVEIMAGE_H__ 2 | #define LOADSAVEIMAGE_H__ 3 | 4 | #include 5 | #include //for uchar4 6 | 7 | void loadImageHDR(const std::string &filename, 8 | float **imagePtr, 9 | size_t *numRows, size_t *numCols); 10 | 11 | void loadImageRGBA(const std::string &filename, 12 | uchar4 **imagePtr, 13 | size_t *numRows, size_t *numCols); 14 | 15 | 
void saveImageRGBA(const uchar4* const image, 16 | const size_t numRows, const size_t numCols, 17 | const std::string &output_file); 18 | 19 | void saveImageHDR(const float* const image, 20 | const size_t numRows, const size_t numCols, 21 | const std::string &output_file); 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/main.cpp: -------------------------------------------------------------------------------- 1 | //Udacity HW3 Driver 2 | 3 | #include 4 | #include "timer.h" 5 | #include "utils.h" 6 | #include 7 | #include 8 | #include 9 | 10 | #include "compare.h" 11 | #include "reference_calc.h" 12 | 13 | // Functions from HW3.cu 14 | void preProcess(float **d_luminance, unsigned int **d_cdf, 15 | size_t *numRows, size_t *numCols, unsigned int *numBins, 16 | const std::string& filename); 17 | 18 | void postProcess(const std::string& output_file, size_t numRows, size_t numCols, 19 | float min_logLum, float max_logLum); 20 | 21 | void cleanupGlobalMemory(void); 22 | 23 | // Function from student_func.cu 24 | void your_histogram_and_prefixsum(const float* const d_luminance, 25 | unsigned int* const d_cdf, 26 | float &min_logLum, 27 | float &max_logLum, 28 | const size_t numRows, 29 | const size_t numCols, 30 | const size_t numBins); 31 | 32 | 33 | int main(int argc, char **argv) { 34 | float *d_luminance; 35 | unsigned int *d_cdf; 36 | 37 | size_t numRows, numCols; 38 | unsigned int numBins; 39 | 40 | std::string input_file; 41 | std::string output_file; 42 | std::string reference_file; 43 | double perPixelError = 0.0; 44 | double globalError = 0.0; 45 | bool useEpsCheck = false; 46 | 47 | switch (argc) 48 | { 49 | case 2: 50 | input_file = std::string(argv[1]); 51 | output_file = "HW3_output.png"; 52 | reference_file = "HW3_reference.png"; 53 | break; 54 | case 3: 55 | input_file = std::string(argv[1]); 56 | output_file = std::string(argv[2]); 57 | reference_file = "HW3_reference.png"; 
58 | break; 59 | case 4: 60 | input_file = std::string(argv[1]); 61 | output_file = std::string(argv[2]); 62 | reference_file = std::string(argv[3]); 63 | break; 64 | case 6: 65 | useEpsCheck=true; 66 | input_file = std::string(argv[1]); 67 | output_file = std::string(argv[2]); 68 | reference_file = std::string(argv[3]); 69 | perPixelError = atof(argv[4]); 70 | globalError = atof(argv[5]); 71 | break; 72 | default: 73 | std::cerr << "Usage: ./HW3 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl; 74 | exit(1); 75 | } 76 | //load the image and give us our input and output pointers 77 | preProcess(&d_luminance, &d_cdf, 78 | &numRows, &numCols, &numBins, input_file); 79 | 80 | GpuTimer timer; 81 | float min_logLum, max_logLum; 82 | min_logLum = 0.f; 83 | max_logLum = 1.f; 84 | timer.Start(); 85 | //call the students' code 86 | your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum, 87 | numRows, numCols, numBins); 88 | timer.Stop(); 89 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 90 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); 91 | 92 | if (err < 0) { 93 | //Couldn't print! Probably the student closed stdout - bad news 94 | std::cerr << "Couldn't print timing information! STDOUT Closed!" 
<< std::endl; 95 | exit(1); 96 | } 97 | 98 | float *h_luminance = (float *) malloc(sizeof(float)*numRows*numCols); 99 | unsigned int *h_cdf = (unsigned int *) malloc(sizeof(unsigned int)*numBins); 100 | 101 | checkCudaErrors(cudaMemcpy(h_luminance, d_luminance, numRows*numCols*sizeof(float), cudaMemcpyDeviceToHost)); 102 | 103 | //check results and output the tone-mapped image 104 | postProcess(output_file, numRows, numCols, min_logLum, max_logLum); 105 | 106 | for (size_t i = 1; i < numCols * numRows; ++i) { 107 | min_logLum = std::min(h_luminance[i], min_logLum); 108 | max_logLum = std::max(h_luminance[i], max_logLum); 109 | } 110 | 111 | referenceCalculation(h_luminance, h_cdf, numRows, numCols, numBins, min_logLum, max_logLum); 112 | 113 | checkCudaErrors(cudaMemcpy(d_cdf, h_cdf, sizeof(unsigned int) * numBins, cudaMemcpyHostToDevice)); 114 | 115 | //check results and output the tone-mapped image 116 | postProcess(reference_file, numRows, numCols, min_logLum, max_logLum); 117 | 118 | cleanupGlobalMemory(); 119 | 120 | compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError); 121 | 122 | return 0; 123 | } 124 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/memorial.exr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial.exr -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/memorial_large.exr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial_large.exr -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/memorial_png.gold: 
//Host reference for HW3: luminance range, histogram and exclusive scan.
//
//  h_logLuminance  numRows * numCols input values
//  h_cdf           output, numBins entries: h_cdf[i] is the number of inputs
//                  that fall into a bin with index < i
//  logLumMin/Max   outputs: smallest / largest input value
//
//Degenerate inputs are handled instead of running into undefined behaviour:
//numBins == 0 returns immediately, an empty image yields an all-zero cdf and
//leaves the min/max untouched, and an image whose values are all equal
//(range 0) puts every value into bin 0.
void referenceCalculation(const float* const h_logLuminance, unsigned int* const h_cdf,
                          const size_t numRows, const size_t numCols, const size_t numBins,
                          float &logLumMin, float &logLumMax)
{
  const size_t numPixels = numRows * numCols;

  if (numBins == 0) return; //no output slot to write to

  if (numPixels == 0) {
    for (size_t i = 0; i < numBins; ++i) h_cdf[i] = 0;
    return;
  }

  logLumMin = h_logLuminance[0];
  logLumMax = h_logLuminance[0];

  //Step 1
  //first we find the minimum and maximum across the entire image
  for (size_t i = 1; i < numPixels; ++i) {
    logLumMin = std::min(h_logLuminance[i], logLumMin);
    logLumMax = std::max(h_logLuminance[i], logLumMax);
  }

  //Step 2
  const float logLumRange = logLumMax - logLumMin;

  //Step 3
  //next we use the now known range to compute
  //a histogram of numBins bins
  unsigned int *histo = new unsigned int[numBins];

  for (size_t i = 0; i < numBins; ++i) histo[i] = 0;

  const unsigned int lastBin = static_cast<unsigned int>(numBins - 1);

  for (size_t i = 0; i < numPixels; ++i) {
    unsigned int bin = 0;
    //with a zero range the quotient below would be 0/0; every value is the
    //minimum in that case, so it belongs into bin 0
    if (logLumRange > 0.f) {
      bin = std::min(lastBin,
                     static_cast<unsigned int>((h_logLuminance[i] - logLumMin) / logLumRange * numBins));
    }
    histo[bin]++;
  }

  //Step 4
  //finally we perform an exclusive scan (prefix sum)
  //on the histogram to get the cumulative distribution
  h_cdf[0] = 0;
  for (size_t i = 1; i < numBins; ++i) {
    h_cdf[i] = h_cdf[i - 1] + histo[i - 1];
  }

  delete[] histo;
}
Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 14 | 15 | template 16 | void check(T err, const char* const func, const char* const file, const int line) { 17 | if (err != cudaSuccess) { 18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | template 25 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) { 26 | //check that the GPU result matches the CPU result 27 | for (size_t i = 0; i < numElem; ++i) { 28 | if (ref[i] != gpu[i]) { 29 | std::cerr << "Difference at pos " << i << std::endl; 30 | //the + is magic to convert char to int without messing 31 | //with other types 32 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 33 | "\nGPU : " << +gpu[i] << std::endl; 34 | exit(1); 35 | } 36 | } 37 | } 38 | 39 | template 40 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) { 41 | assert(eps1 >= 0 && eps2 >= 0); 42 | unsigned long long totalDiff = 0; 43 | unsigned numSmallDifferences = 0; 44 | for (size_t i = 0; i < numElem; ++i) { 45 | //subtract smaller from larger in case of unsigned types 46 | T smaller = std::min(ref[i], gpu[i]); 47 | T larger = std::max(ref[i], gpu[i]); 48 | T diff = larger - smaller; 
49 | if (diff > 0 && diff <= eps1) { 50 | numSmallDifferences++; 51 | } 52 | else if (diff > eps1) { 53 | std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl; 54 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 55 | "\nGPU : " << +gpu[i] << std::endl; 56 | exit(1); 57 | } 58 | totalDiff += diff * diff; 59 | } 60 | double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; 61 | if (percentSmallDifferences > eps2) { 62 | std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl; 63 | std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl; 64 | exit(1); 65 | } 66 | } 67 | 68 | //Uses the autodesk method of image comparison 69 | //Note the the tolerance here is in PIXELS not a percentage of input pixels 70 | template 71 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance) 72 | { 73 | 74 | size_t numBadPixels = 0; 75 | for (size_t i = 0; i < numElem; ++i) { 76 | T smaller = std::min(ref[i], gpu[i]); 77 | T larger = std::max(ref[i], gpu[i]); 78 | T diff = larger - smaller; 79 | if (diff > variance) 80 | ++numBadPixels; 81 | } 82 | 83 | if (numBadPixels > tolerance) { 84 | std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl; 85 | exit(1); 86 | } 87 | } 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. 
#   edit by Johannes Kast, Michael Sarahan
#   quantm@unist.ac.kr  kast.jo@googlemail.com  msarahan@gmail.com
############################################################################

# Pick up every header and every CUDA source that sits next to this file.
# The globs are evaluated at configure time only, so re-run CMake after
# adding or removing a file.
file(GLOB hdr *.hpp *.h)
file(GLOB cu *.cu)

# Host-side C++ translation units of the HW4 driver.
set(HW4_files
  main.cpp
  loadSaveImage.cpp
  reference_calc.cpp
  compare.cpp
)

# NOTE(review): `img` is not set in this file - presumably it comes from the
# parent CMakeLists.txt or expands to nothing; confirm before removing it.
# cuda_add_executable is not defined here either; it looks like the
# FindCUDA macro, which the parent scope would have to provide - confirm.
cuda_add_executable(HW4 ${HW4_files} ${hdr} ${img} ${cu})
reference_calc.o Makefile 39 | $(NVCC) -o HW4 main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) 40 | 41 | main.o: main.cpp timer.h utils.h reference_calc.h 42 | g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 43 | 44 | HW4.o: HW4.cu loadSaveImage.h utils.h 45 | $(NVCC) -c HW4.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS) 46 | 47 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h 48 | g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 49 | 50 | compare.o: compare.cpp compare.h 51 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 52 | 53 | reference_calc.o: reference_calc.cpp reference_calc.h 54 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 55 | 56 | student_func.o: student_func.cu reference_calc.cpp utils.h 57 | $(NVCC) -c student_func.cu $(NVCC_OPTS) 58 | 59 | clean: 60 | rm -f *.o *.png hw 61 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/compare.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "utils.h" 3 | 4 | 5 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 6 | double perPixelError, double globalError) 7 | { 8 | cv::Mat reference = cv::imread(reference_filename, -1); 9 | cv::Mat test = cv::imread(test_filename, -1); 10 | 11 | cv::Mat diff = abs(reference - test); 12 | 13 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows 14 | 15 | double minVal, maxVal; 16 | 17 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location 18 | 19 | //now perform transform so that we bump values to the full range 20 | 21 | diffSingleChannel = (diffSingleChannel - minVal) * (255. 
/ (maxVal - minVal)); 22 | 23 | diff = diffSingleChannel.reshape(reference.channels(), 0); 24 | 25 | cv::imwrite("HW4_differenceImage.png", diff); 26 | //OK, now we can start comparing values... 27 | unsigned char *referencePtr = reference.ptr(0); 28 | unsigned char *testPtr = test.ptr(0); 29 | 30 | if (useEpsCheck) { 31 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError); 32 | } 33 | else 34 | { 35 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels()); 36 | } 37 | 38 | std::cout << "PASS" << std::endl; 39 | return; 40 | } -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/compare.h: -------------------------------------------------------------------------------- 1 | #ifndef HW4_H__ 2 | #define HW4_H__ 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError); 6 | 7 | #endif -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/loadSaveImage.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "cuda_runtime.h" 6 | 7 | //The caller becomes responsible for the returned pointer. This 8 | //is done in the interest of keeping this code as simple as possible. 9 | //In production code this is a bad idea - we should use RAII 10 | //to ensure the memory is freed. DO NOT COPY THIS AND USE IN PRODUCTION 11 | //CODE!!! 
12 | void loadImageHDR(const std::string &filename, 13 | float **imagePtr, 14 | size_t *numRows, size_t *numCols) 15 | { 16 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH); 17 | if (image.empty()) { 18 | std::cerr << "Couldn't open file: " << filename << std::endl; 19 | exit(1); 20 | } 21 | 22 | if (image.channels() != 3) { 23 | std::cerr << "Image must be color!" << std::endl; 24 | exit(1); 25 | } 26 | 27 | if (!image.isContinuous()) { 28 | std::cerr << "Image isn't continuous!" << std::endl; 29 | exit(1); 30 | } 31 | 32 | *imagePtr = new float[image.rows * image.cols * image.channels()]; 33 | 34 | float *cvPtr = image.ptr(0); 35 | for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i) 36 | (*imagePtr)[i] = cvPtr[i]; 37 | 38 | *numRows = image.rows; 39 | *numCols = image.cols; 40 | } 41 | 42 | void loadImageRGBA(const std::string &filename, 43 | uchar4 **imagePtr, 44 | size_t *numRows, size_t *numCols) 45 | { 46 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); 47 | if (image.empty()) { 48 | std::cerr << "Couldn't open file: " << filename << std::endl; 49 | exit(1); 50 | } 51 | 52 | if (image.channels() != 3) { 53 | std::cerr << "Image must be color!" << std::endl; 54 | exit(1); 55 | } 56 | 57 | if (!image.isContinuous()) { 58 | std::cerr << "Image isn't continuous!" 
<< std::endl; 59 | exit(1); 60 | } 61 | 62 | cv::Mat imageRGBA; 63 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA); 64 | 65 | *imagePtr = new uchar4[image.rows * image.cols]; 66 | 67 | unsigned char *cvPtr = imageRGBA.ptr(0); 68 | for (size_t i = 0; i < image.rows * image.cols; ++i) { 69 | (*imagePtr)[i].x = cvPtr[4 * i + 0]; 70 | (*imagePtr)[i].y = cvPtr[4 * i + 1]; 71 | (*imagePtr)[i].z = cvPtr[4 * i + 2]; 72 | (*imagePtr)[i].w = cvPtr[4 * i + 3]; 73 | } 74 | 75 | *numRows = image.rows; 76 | *numCols = image.cols; 77 | } 78 | 79 | void saveImageRGBA(const uchar4* const image, 80 | const size_t numRows, const size_t numCols, 81 | const std::string &output_file) 82 | { 83 | int sizes[2]; 84 | sizes[0] = numRows; 85 | sizes[1] = numCols; 86 | cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image); 87 | cv::Mat imageOutputBGR; 88 | cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR); 89 | //output the image 90 | cv::imwrite(output_file.c_str(), imageOutputBGR); 91 | } 92 | 93 | //output an exr file 94 | //assumed to already be BGR 95 | void saveImageHDR(const float* const image, 96 | const size_t numRows, const size_t numCols, 97 | const std::string &output_file) 98 | { 99 | int sizes[2]; 100 | sizes[0] = numRows; 101 | sizes[1] = numCols; 102 | 103 | cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image); 104 | 105 | imageHDR = imageHDR * 255; 106 | 107 | cv::imwrite(output_file.c_str(), imageHDR); 108 | } 109 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/loadSaveImage.h: -------------------------------------------------------------------------------- 1 | #ifndef LOADSAVEIMAGE_H__ 2 | #define LOADSAVEIMAGE_H__ 3 | 4 | #include 5 | #include //for uchar4 6 | 7 | void loadImageHDR(const std::string &filename, 8 | float **imagePtr, 9 | size_t *numRows, size_t *numCols); 10 | 11 | void loadImageRGBA(const std::string &filename, 12 | uchar4 **imagePtr, 13 | size_t *numRows, size_t *numCols); 14 | 15 | void 
saveImageRGBA(const uchar4* const image, 16 | const size_t numRows, const size_t numCols, 17 | const std::string &output_file); 18 | 19 | void saveImageHDR(const float* const image, 20 | const size_t numRows, const size_t numCols, 21 | const std::string &output_file); 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/main.cpp: -------------------------------------------------------------------------------- 1 | //Udacity HW4 Driver 2 | 3 | #include 4 | #include "timer.h" 5 | #include "utils.h" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "compare.h" 12 | #include "reference_calc.h" 13 | 14 | void preProcess(unsigned int **inputVals, 15 | unsigned int **inputPos, 16 | unsigned int **outputVals, 17 | unsigned int **outputPos, 18 | size_t &numElems, 19 | const std::string& filename, 20 | const std::string& template_file); 21 | 22 | void postProcess(const unsigned int* const outputVals, 23 | const unsigned int* const outputPos, 24 | const size_t numElems, 25 | const std::string& output_file); 26 | 27 | void your_sort(unsigned int* const inputVals, 28 | unsigned int* const inputPos, 29 | unsigned int* const outputVals, 30 | unsigned int* const outputPos, 31 | const size_t numElems); 32 | 33 | void PrintFullPath(char * partialPath) 34 | { 35 | char full[_MAX_PATH]; 36 | if (_fullpath(full, partialPath, _MAX_PATH) != NULL) 37 | printf("Full path is: %s\n", full); 38 | else 39 | printf("Invalid path\n"); 40 | } 41 | 42 | int main(int argc, char **argv) { 43 | unsigned int *inputVals; 44 | unsigned int *inputPos; 45 | unsigned int *outputVals; 46 | unsigned int *outputPos; 47 | 48 | size_t numElems = 4; 49 | PrintFullPath(".\\"); 50 | std::string input_file; 51 | std::string template_file; 52 | std::string output_file; 53 | std::string reference_file; 54 | double perPixelError = 0.0; 55 | double globalError = 0.0; 56 | bool useEpsCheck = false; 57 | 58 | switch (argc) 
59 | { 60 | case 3: 61 | input_file = std::string(argv[1]); 62 | template_file = std::string(argv[2]); 63 | output_file = "HW4_output.png"; 64 | break; 65 | case 4: 66 | input_file = std::string(argv[1]); 67 | template_file = std::string(argv[2]); 68 | output_file = std::string(argv[3]); 69 | break; 70 | default: 71 | std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl; 72 | exit(1); 73 | } 74 | //load the image and give us our input and output pointers 75 | preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems, input_file, template_file); 76 | 77 | /* 78 | // Use small array to Debug 79 | checkCudaErrors(cudaMalloc(&inputVals, sizeof(unsigned int)* numElems)); 80 | checkCudaErrors(cudaMalloc(&inputPos, sizeof(unsigned int)* numElems)); 81 | checkCudaErrors(cudaMalloc(&outputVals, sizeof(unsigned int)* numElems)); 82 | checkCudaErrors(cudaMalloc(&outputPos, sizeof(unsigned int)* numElems)); 83 | unsigned int ll[4] = { 0, 5, 2, 7 }; 84 | thrust::host_vector h_v(ll, ll+4); 85 | printf("%d %d %d %d\n", h_v[0], h_v[1], h_v[2], h_v[3]); 86 | thrust::device_vector d_v = h_v; 87 | cudaMemcpy(inputVals, thrust::raw_pointer_cast(d_v.data()), sizeof(unsigned int)* numElems, cudaMemcpyDeviceToDevice); 88 | cudaMemcpy(inputPos, thrust::raw_pointer_cast(d_v.data()), sizeof(unsigned int)* numElems, cudaMemcpyDeviceToDevice); 89 | */ 90 | 91 | GpuTimer timer; 92 | timer.Start(); 93 | 94 | thrust::device_ptr d_inputVals(inputVals); 95 | thrust::device_ptr d_inputPos(inputPos); 96 | 97 | thrust::host_vector h_inputVals(d_inputVals, 98 | d_inputVals + numElems); 99 | thrust::host_vector h_inputPos(d_inputPos, 100 | d_inputPos + numElems); 101 | 102 | //call the students' code 103 | your_sort(inputVals, inputPos, outputVals, outputPos, numElems); 104 | 105 | timer.Stop(); 106 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 107 | printf("\n"); 108 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); 109 | 
110 | if (err < 0) { 111 | //Couldn't print! Probably the student closed stdout - bad news 112 | std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; 113 | exit(1); 114 | } 115 | 116 | // TODO: something wrong with the function postProcess?? 117 | //check results and output the red-eye corrected image 118 | //postProcess(outputVals, outputPos, numElems, output_file); 119 | 120 | // check code moved from HW4.cu 121 | /**************************************************************************** 122 | * You can use the code below to help with debugging, but make sure to * 123 | * comment it out again before submitting your assignment for grading, * 124 | * otherwise this code will take too much time and make it seem like your * 125 | * GPU implementation isn't fast enough. * 126 | * * 127 | * This code MUST RUN BEFORE YOUR CODE in case you accidentally change * 128 | * the input values when implementing your radix sort. * 129 | * * 130 | * This code performs the reference radix sort on the host and compares your * 131 | * sorted values to the reference. 
* 132 | * * 133 | * Thrust containers are used for copying memory from the GPU * 134 | * ************************************************************************* */ 135 | ; 136 | 137 | thrust::host_vector h_outputVals(numElems); 138 | thrust::host_vector h_outputPos(numElems); 139 | 140 | reference_calculation(&h_inputVals[0], &h_inputPos[0], 141 | &h_outputVals[0], &h_outputPos[0], 142 | numElems); 143 | 144 | //postProcess(valsPtr, posPtr, numElems, reference_file); 145 | 146 | //compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError); 147 | 148 | thrust::device_ptr d_outputVals(outputVals); 149 | thrust::device_ptr d_outputPos(outputPos); 150 | 151 | thrust::host_vector h_yourOutputVals(d_outputVals, 152 | d_outputVals + numElems); 153 | thrust::host_vector h_yourOutputPos(d_outputPos, 154 | d_outputPos + numElems); 155 | 156 | checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems); 157 | checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems); 158 | 159 | checkCudaErrors(cudaFree(inputVals)); 160 | checkCudaErrors(cudaFree(inputPos)); 161 | checkCudaErrors(cudaFree(outputVals)); 162 | checkCudaErrors(cudaFree(outputPos)); 163 | 164 | return 0; 165 | } 166 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/red_eye_effect.gold: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 4/red_eye_effect.gold -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/red_eye_effect_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 4/red_eye_effect_5.jpg 
//A simple un-optimized reference radix sort (LSD, stable, ascending).
//
//Sorts the numElems keys in inputVals and applies the same permutation to
//inputPos.  The sorted keys/positions are returned in outputVals/outputPos.
//
//NOTE: the input buffers double as scratch space for the ping-pong passes,
//so their contents are overwritten as well.
void reference_calculation(unsigned int* inputVals,
                           unsigned int* inputPos,
                           unsigned int* outputVals,
                           unsigned int* outputPos,
                           const size_t numElems)
{
  //digit width; all arithmetic on the digit mask is unsigned so that the
  //pass over the top bit does not shift into the sign bit of an int
  const unsigned int numBits = 1;
  const unsigned int numBins = 1u << numBits;

  unsigned int *binHistogram = new unsigned int[numBins];
  unsigned int *binScan      = new unsigned int[numBins];

  unsigned int *vals_src = inputVals;
  unsigned int *pos_src  = inputPos;

  unsigned int *vals_dst = outputVals;
  unsigned int *pos_dst  = outputPos;

  //one stable counting-sort pass per digit, least significant digit first
  for (unsigned int shift = 0; shift < 8 * sizeof(unsigned int); shift += numBits) {
    const unsigned int mask = (numBins - 1u) << shift;

    memset(binHistogram, 0, sizeof(unsigned int) * numBins); //zero out the bins
    memset(binScan, 0, sizeof(unsigned int) * numBins); //zero out the bins

    //perform histogram of data & mask into bins
    for (size_t j = 0; j < numElems; ++j) {
      const unsigned int bin = (vals_src[j] & mask) >> shift;
      binHistogram[bin]++;
    }

    //perform exclusive prefix sum (scan) on binHistogram to get starting
    //location for each bin
    for (unsigned int j = 1; j < numBins; ++j) {
      binScan[j] = binScan[j - 1] + binHistogram[j - 1];
    }

    //Gather everything into the correct location
    //need to move vals and positions
    for (size_t j = 0; j < numElems; ++j) {
      const unsigned int bin = (vals_src[j] & mask) >> shift;
      vals_dst[binScan[bin]] = vals_src[j];
      pos_dst[binScan[bin]]  = pos_src[j];
      binScan[bin]++;
    }

    //swap the buffers (pointers only)
    std::swap(vals_dst, vals_src);
    std::swap(pos_dst, pos_src);
  }

  //after the last swap the sorted data sits in *_src; copy it over unless
  //that already is the output buffer (depends on the parity of the number
  //of passes)
  if (vals_src != outputVals) {
    std::copy(vals_src, vals_src + numElems, outputVals);
    std::copy(pos_src, pos_src + numElems, outputPos);
  }

  delete[] binHistogram;
  delete[] binScan;
}
We have already done this for you - you 17 | are receiving the scores and need to sort them in ascending order so that we 18 | know which pixels to alter to remove the red eye. 19 | 20 | Note: ascending order == smallest to largest 21 | 22 | Each score is associated with a position, when you sort the scores, you must 23 | also move the positions accordingly. 24 | 25 | Implementing Parallel Radix Sort with CUDA 26 | ========================================== 27 | 28 | The basic idea is to construct a histogram on each pass of how many of each 29 | "digit" there are. Then we scan this histogram so that we know where to put 30 | the output of each digit. For example, the first 1 must come after all the 31 | 0s so we have to know how many 0s there are to be able to start moving 1s 32 | into the correct position. 33 | 34 | 1) Histogram of the number of occurrences of each digit 35 | 2) Exclusive Prefix Sum of Histogram 36 | 3) Determine relative offset of each digit 37 | For example [0 0 1 1 0 0 1] 38 | -> [0 1 4 5 2 3 6] 39 | 4) Combine the results of steps 2 & 3 to determine the final 40 | output location for each element and move it there 41 | 42 | LSB Radix sort is an out-of-place sort and you will need to ping-pong values 43 | between the input and output buffers we have provided. Make sure the final 44 | sorted results end up in the output buffer! Hint: You may need to do a copy 45 | at the end. 
// Counts how many keys have bit `shift` clear (d_out[0]) and set (d_out[1]).
// d_out must be zeroed by the caller before launch.
__global__ void histo_kernel(unsigned int * d_out, unsigned int* const d_in,
                             unsigned int shift, const unsigned int numElems)
{
  const unsigned int myId = threadIdx.x + blockDim.x * blockIdx.x;
  if (myId >= numElems) return;
  // Shift the key down instead of shifting a signed 1 up to bit 31.
  const unsigned int bin = (d_in[myId] >> shift) & 1u;
  atomicAdd(&d_out[bin], 1u);
}

// In-place exclusive prefix sum of d_in[0 .. numBins), computed by one block.
// This is a Hillis-Steele (step-doubling) inclusive scan followed by a shift
// right by one element.
//
// Launch with a single block of at least numBins threads and
// numBins * sizeof(unsigned int) bytes of dynamic shared memory.
// numElems is kept for interface compatibility and is not used: the scan
// length is numBins, not the number of keys being sorted.
__global__ void sumscan_kernel(unsigned int * d_in, const size_t numBins, const unsigned int numElems)
{
  (void)numElems;
  // Same integer type as the counts; float loses exactness above 2^24.
  extern __shared__ unsigned int sdata[];

  const unsigned int myId = threadIdx.x;
  // Surplus threads stay in the kernel so every thread reaches each barrier.
  const bool active = myId < numBins;

  if (active) sdata[myId] = d_in[myId];
  __syncthreads(); // make sure the entire block is loaded

  for (unsigned int d = 1; d < numBins; d *= 2) {
    // Read the partner value first and publish the sum only after a barrier,
    // so no thread reads a slot another thread is updating in this step.
    unsigned int addend = 0;
    if (active && myId >= d) addend = sdata[myId - d];
    __syncthreads();
    if (active) sdata[myId] += addend;
    __syncthreads();
  }

  if (!active) return;
  d_in[myId] = (myId == 0) ? 0u : sdata[myId - 1]; // inclusive -> exclusive
}

// Writes d_scan[i] = 1 when bit `shift` of d_in[i] is clear and 0 when it is set.
__global__ void makescan_kernel(unsigned int * d_in, unsigned int *d_scan,
                                unsigned int shift, const unsigned int numElems)
{
  const unsigned int myId = threadIdx.x + blockDim.x * blockIdx.x;
  if (myId >= numElems) return;
  d_scan[myId] = ((d_in[myId] >> shift) & 1u) ? 0u : 1u;
}
// Scatters every (value, position) pair to its slot for the pass on bit `shift`.
//
// d_histogram is read after the caller has run sumscan_kernel on it, so
// d_histogram[1] is the number of keys whose bit is clear. d_scaned[i] is the
// number of such keys located before index i.
__global__ void move_kernel(unsigned int* const d_inputVals,
                            unsigned int* const d_inputPos,
                            unsigned int* const d_outputVals,
                            unsigned int* const d_outputPos,
                            const unsigned int numElems,
                            unsigned int* const d_histogram,
                            unsigned int* const d_scaned,
                            unsigned int shift)
{
  const unsigned int myId = threadIdx.x + blockDim.x * blockIdx.x;
  if (myId >= numElems) return;

  // Algorithm described in 7.4 of http://wykvictor.github.io/2016/04/03/Cuda-2.html
  const unsigned int zerosBefore = d_scaned[myId];
  unsigned int des_id;
  if ((d_inputVals[myId] >> shift) & 1u) {
    // Set-bit keys follow all clear-bit keys, in their original relative order.
    des_id = d_histogram[1] + (myId - zerosBefore);
  } else {
    des_id = zerosBefore;
  }
  d_outputVals[des_id] = d_inputVals[myId];
  d_outputPos[des_id] = d_inputPos[myId];
}

#ifdef USE_THRUST
// Thrust-based variant: copies keys and positions to the host, sorts them there
// with thrust::sort_by_key, and uploads the result into the output buffers.
void your_sort(unsigned int* const d_inputVals,
               unsigned int* const d_inputPos,
               unsigned int* const d_outputVals,
               unsigned int* const d_outputPos,
               const size_t numElems)
{
  if (numElems == 0) return; // nothing to sort; also avoids indexing an empty vector

  // Thrust pointers wrapping the raw GPU data
  thrust::device_ptr<unsigned int> d_inputVals_p(d_inputVals);
  thrust::device_ptr<unsigned int> d_inputPos_p(d_inputPos);
  thrust::host_vector<unsigned int> h_inputVals_vec(d_inputVals_p,
                                                    d_inputVals_p + numElems);
  thrust::host_vector<unsigned int> h_inputPos_vec(d_inputPos_p,
                                                   d_inputPos_p + numElems);

  thrust::sort_by_key(h_inputVals_vec.begin(), h_inputVals_vec.end(), h_inputPos_vec.begin());

  checkCudaErrors(cudaMemcpy(d_outputVals, thrust::raw_pointer_cast(&h_inputVals_vec[0]),
                             numElems * sizeof(unsigned int), cudaMemcpyHostToDevice));
  checkCudaErrors(cudaMemcpy(d_outputPos, thrust::raw_pointer_cast(&h_inputPos_vec[0]),
                             numElems * sizeof(unsigned int), cudaMemcpyHostToDevice));
}
#else
// One-bit-per-pass LSB radix sort on the GPU.
//
// Sorts the numElems keys in d_inputVals ascending and carries d_inputPos along.
// The sorted result is left in d_outputVals / d_outputPos. The input buffers are
// used as ping-pong scratch space and are overwritten.
void your_sort(unsigned int* const d_inputVals,
               unsigned int* const d_inputPos,
               unsigned int* const d_outputVals,
               unsigned int* const d_outputPos,
               const size_t numElems)
{
  if (numElems == 0) return; // a zero-block launch is an invalid configuration

  // The kernels above handle exactly one bit (two bins) per pass.
  const int numBits = 1;
  const int numBins = 1 << numBits;
  const int m = 1 << 10; // threads per block
  // Integer ceiling division; float rounding can under-count for large inputs.
  int blocks = (int)((numElems + m - 1) / m);
  printf("m %d blocks %d\n", m ,blocks);

  const unsigned int count = (unsigned int)numElems;
  const size_t bytes = numElems * sizeof(unsigned int);

  // Two-bin histogram of the current bit, scanned in place by sumscan_kernel.
  unsigned int *d_binHistogram;
  checkCudaErrors(cudaMalloc(&d_binHistogram, sizeof(unsigned int) * numBins));
  // One flag per element (not per bin), unlike the CPU reference.
  thrust::device_vector<unsigned int> d_scan(numElems);
  unsigned int* const d_scan_raw = thrust::raw_pointer_cast(&d_scan[0]);

  // Ping-pong between the caller's buffers instead of copying back every pass.
  unsigned int *vals_src = d_inputVals;
  unsigned int *pos_src  = d_inputPos;
  unsigned int *vals_dst = d_outputVals;
  unsigned int *pos_dst  = d_outputPos;

  for (unsigned int i = 0; i < 8 * sizeof(unsigned int); i++) {
    checkCudaErrors(cudaMemset(d_binHistogram, 0, sizeof(unsigned int) * numBins));

    // 1) histogram of bit i
    histo_kernel<<<blocks, m>>>(d_binHistogram, vals_src, i, count);
    checkCudaErrors(cudaGetLastError());

    // 2) exclusive prefix sum of the histogram: start offset of each bin
    sumscan_kernel<<<1, numBins, sizeof(unsigned int) * numBins>>>(d_binHistogram, numBins, count);
    checkCudaErrors(cudaGetLastError());

    // 3) flag the clear-bit keys, then scan the flags to get each key's rank
    //    among them (see http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html)
    makescan_kernel<<<blocks, m>>>(vals_src, d_scan_raw, i, count);
    checkCudaErrors(cudaGetLastError());
    thrust::exclusive_scan(d_scan.begin(), d_scan.end(), d_scan.begin());

    // 4) scatter values and positions to their slots for this pass
    move_kernel<<<blocks, m>>>(vals_src, pos_src, vals_dst, pos_dst,
                               count, d_binHistogram, d_scan_raw, i);
    checkCudaErrors(cudaGetLastError());
    // Surface asynchronous execution errors at the pass that caused them.
    checkCudaErrors(cudaDeviceSynchronize());

    // This pass's output is the next pass's input.
    unsigned int *tmp = vals_src; vals_src = vals_dst; vals_dst = tmp;
    tmp = pos_src; pos_src = pos_dst; pos_dst = tmp;
  }

  // The sorted data is in *_src; copy it only if that is not the output buffer.
  if (vals_src != d_outputVals) {
    checkCudaErrors(cudaMemcpy(d_outputVals, vals_src, bytes, cudaMemcpyDeviceToDevice));
    checkCudaErrors(cudaMemcpy(d_outputPos, pos_src, bytes, cudaMemcpyDeviceToDevice));
  }

  // Free memory
  checkCudaErrors(cudaFree(d_binHistogram));
}
#endif
// Compares two arrays element by element with two tolerances.
//
//   ref, gpu - arrays of numElem values to compare
//   eps1     - largest allowed absolute difference for a single element;
//              a larger difference prints the offending element and exits
//   eps2     - largest allowed fraction (0..1) of elements that differ at all
//
// Prints a diagnostic to std::cerr and calls exit(1) on failure; returns
// normally otherwise. An empty input trivially passes.
template<typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
  assert(eps1 >= 0 && eps2 >= 0);
  size_t numSmallDifferences = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger in case of unsigned types
    const T smaller = std::min(ref[i], gpu[i]);
    const T larger = std::max(ref[i], gpu[i]);
    const T diff = larger - smaller;
    if (diff > eps1) {
      std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
        "\nGPU      : " << +gpu[i] << std::endl;
      exit(1);
    }
    if (diff > 0) {
      numSmallDifferences++;
    }
  }

  // Nothing to compare; also avoids 0/0 below.
  if (numElem == 0) return;

  const double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
  if (percentSmallDifferences > eps2) {
    std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
    std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
    exit(1);
  }
}
// Driver for the histogram assignment: generates normally distributed bin
// indices, computes the reference histogram on the CPU, then runs each of the
// three computeHistogram variants on the GPU, timing and checking every run.
int main(void)
{
  const unsigned int numBins = 1024;
  const unsigned int numElems = 10000 * numBins;
  const float stddev = 100.f;

  unsigned int *vals = new unsigned int[numElems];
  unsigned int *h_studentHisto = new unsigned int[numBins];
  unsigned int *h_refHisto = new unsigned int[numBins];

#if defined(_WIN16) || defined(_WIN32) || defined(_WIN64)
  srand(GetTickCount());
#else
  timeval tv;
  gettimeofday(&tv, NULL);

  srand(tv.tv_usec);
#endif

  //make the mean unpredictable, but close enough to the middle
  //so that timings are unaffected
  unsigned int mean = rand() % 100 + 462;

  //Output mean so that grading can happen with the same inputs
  std::cout << mean << std::endl;

  thrust::minstd_rand rng;

  thrust::random::normal_distribution<float> normalDist((float)mean, stddev);

  // Generate the random values, clamped to the valid bin range [0, numBins - 1]
  for (size_t i = 0; i < numElems; ++i) {
    vals[i] = std::min((unsigned int) std::max((int)normalDist(rng), 0), numBins - 1);
  }

  //generate reference for the given mean
  reference_calculation(vals, h_refHisto, numBins, numElems);

  // Allocate and upload once; the input is read-only for every variant, so
  // only the histogram has to be reset between runs.
  unsigned int *d_vals = NULL;
  unsigned int *d_histo = NULL;
  checkCudaErrors(cudaMalloc(&d_vals, sizeof(unsigned int) * numElems));
  checkCudaErrors(cudaMalloc(&d_histo, sizeof(unsigned int) * numBins));
  checkCudaErrors(cudaMemcpy(d_vals, vals, sizeof(unsigned int) * numElems, cudaMemcpyHostToDevice));

  for (int i = 0; i < 3; i++) { // test different implementations
    GpuTimer timer;

    checkCudaErrors(cudaMemset(d_histo, 0, sizeof(unsigned int) * numBins));

    timer.Start();
    computeHistogram(d_vals, d_histo, numBins, numElems, i);
    timer.Stop();
    int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

    if (err < 0) {
      //Couldn't print! Probably the student closed stdout - bad news
      std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
      exit(1);
    }

    // copy the student-computed histogram back to the host
    checkCudaErrors(cudaMemcpy(h_studentHisto, d_histo, sizeof(unsigned int) * numBins, cudaMemcpyDeviceToHost));

    //Now do the comparison
    checkResultsExact(h_refHisto, h_studentHisto, numBins);
  }

  delete[] vals;
  delete[] h_refHisto;
  delete[] h_studentHisto;

  checkCudaErrors(cudaFree(d_vals));
  checkCudaErrors(cudaFree(d_histo));

  return 0;
}
// One thread per input element: atomically increments the bin named by the
// element's value. Every value in d_vals must be a valid index into d_histo.
__global__
void atomic_kernel(const unsigned int* const d_vals, //INPUT
                   unsigned int* const d_histo,      //OUTPUT
                   const unsigned int numElems)
{
  const unsigned int myId = threadIdx.x + blockIdx.x * blockDim.x;
  if (myId >= numElems) return;
  atomicAdd(&d_histo[d_vals[myId]], 1u);
}

// Accumulates the histogram of d_vals into d_histo (which the caller zeroes).
//
//   d_vals   - numElems bin indices on the device
//   d_histo  - numBins counters on the device
//   numBins  - number of bins; not used by the atomic implementation
//   types    - implementation selector. 0 is the global-atomics kernel. The
//              caller also passes 1 and 2, which have no dedicated
//              implementation yet and fall back to variant 0 so that the
//              result is still a correct histogram.
void computeHistogram(const unsigned int* const d_vals, //INPUT
                      unsigned int* const d_histo,      //OUTPUT
                      const unsigned int numBins,
                      const unsigned int numElems, int types)
{
  (void)numBins;
  if (numElems == 0) return; // a zero-block launch is an invalid configuration

  const unsigned int m = 1 << 10; // threads per block
  // Integer ceiling division, written so it cannot overflow.
  const unsigned int blocks = numElems / m + (numElems % m != 0 ? 1u : 0u);

  switch (types) {
  case 0:
  // Sort + reduce_by_key with Thrust would be an alternative for variant 1, see
  // https://www.ecse.rpi.edu/~wrf/wiki/ParallelComputingSpring2014/thrust/histogram.cu
  case 1:
  case 2:
  default:
    atomic_kernel<<<blocks, m>>>(d_vals, d_histo, numElems);
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaGetLastError());
    break;
  }
}
//Uses the autodesk method of image comparison
//Note that the tolerance here is in PIXELS not a percentage of input pixels
//
//An element counts as bad when the absolute difference between ref and gpu is
//greater than variance. More than `tolerance` bad elements prints a message to
//std::cerr and exits with status 1; otherwise the function returns normally.
template<typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
  size_t badCount = 0;

  for (size_t idx = 0; idx != numElem; ++idx) {
    //larger minus smaller, so unsigned types never wrap around
    const T lo = std::min(ref[idx], gpu[idx]);
    const T hi = std::max(ref[idx], gpu[idx]);
    const T delta = hi - lo;
    badCount += (delta > variance) ? 1 : 0;
  }

  if (badCount <= tolerance) {
    return;
  }

  std::cerr << "Too many bad pixels in the image." << badCount << "/" << tolerance << std::endl;
  exit(1);
}
// Loads the source and destination images and allocates the blended output.
//
//   sourceImg, destImg - receive newly allocated RGBA pixel arrays (owned by
//                        the caller) filled by loadImageRGBA
//   numRows, numCols   - receive the image dimensions shared by both images
//   blendedImg         - receives a new, uninitialised numRows * numCols array
//
// Exits with status 1 when the two images do not have the same dimensions.
void preProcess( uchar4 **sourceImg,
                 size_t &numRows, size_t &numCols,
                 uchar4 **destImg,
                 uchar4 **blendedImg, const std::string& source_filename,
                 const std::string& dest_filename){

  //make sure the context initializes ok
  checkCudaErrors(cudaFree(0));

  size_t numRowsSource, numColsSource, numRowsDest, numColsDest;

  loadImageRGBA(source_filename, sourceImg, &numRowsSource, &numColsSource);
  loadImageRGBA(dest_filename, destImg, &numRowsDest, &numColsDest);

  // Checked explicitly rather than with assert so the validation is still
  // present when the build defines NDEBUG.
  if (numRowsSource != numRowsDest || numColsSource != numColsDest) {
    std::cerr << "Source and destination images must have the same size: "
              << numRowsSource << "x" << numColsSource << " vs "
              << numRowsDest << "x" << numColsDest << std::endl;
    exit(1);
  }

  numRows = numRowsSource;
  numCols = numColsSource;

  *blendedImg = new uchar4[numRows * numCols];

}

// Writes the blended RGBA image (numRowsDest x numColsDest pixels) to output_file.
void postProcess(const uchar4* const blendedImg,
                 const size_t numRowsDest, const size_t numColsDest,
                 const std::string& output_file)
{
  //just need to save the image...
  saveImageRGBA(blendedImg, numRowsDest, numColsDest, output_file);
}
24 | # CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include 25 | 26 | ###################################################### 27 | # On Macs the default install locations are below # 28 | # #################################################### 29 | 30 | #CUDA_INCLUDEPATH=/usr/local/cuda/include 31 | #CUDA_LIBPATH=/usr/local/cuda/lib 32 | CUDA_LIBPATH=/usr/local/cuda-5.0/lib64 33 | 34 | #no warnings otherwise thrust explodes output 35 | 36 | NVCC_OPTS=-O3 -arch=sm_20 -m64 37 | 38 | GCC_OPTS=-O3 -m64 39 | 40 | student: main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o Makefile 41 | $(NVCC) -o HW6 main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) 42 | 43 | main.o: main.cpp timer.h utils.h 44 | g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 45 | 46 | HW6.o: HW6.cu loadSaveImage.h utils.h 47 | $(NVCC) -c HW6.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS) 48 | 49 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h 50 | g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 51 | 52 | student_func.o: student_func.cu reference_calc.cpp utils.h 53 | $(NVCC) -c student_func.cu $(NVCC_OPTS) 54 | 55 | compare.o: compare.cpp compare.h 56 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 57 | 58 | reference_calc.o: reference_calc.cpp reference_calc.h 59 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 60 | 61 | clean: 62 | rm -f *.o hw 63 | find . 
-type f -name '*.png' | grep -v source.png | grep -v destination.png | xargs rm -f 64 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/blended.gold: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 6/blended.gold -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/compare.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "utils.h" 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError) 6 | { 7 | cv::Mat reference = cv::imread(reference_filename, -1); 8 | cv::Mat test = cv::imread(test_filename, -1); 9 | 10 | cv::Mat diff = abs(reference - test); 11 | 12 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows 13 | 14 | double minVal, maxVal; 15 | 16 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location 17 | 18 | //now perform transform so that we bump values to the full range 19 | 20 | diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal)); 21 | 22 | diff = diffSingleChannel.reshape(reference.channels(), 0); 23 | 24 | cv::imwrite("HW6_differenceImage.png", diff); 25 | //OK, now we can start comparing values... 
26 | unsigned char *referencePtr = reference.ptr(0); 27 | unsigned char *testPtr = test.ptr(0); 28 | 29 | if (useEpsCheck) { 30 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError); 31 | } 32 | else 33 | { 34 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels()); 35 | } 36 | 37 | std::cout << "PASS" << std::endl; 38 | return; 39 | } 40 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/compare.h: -------------------------------------------------------------------------------- 1 | #ifndef HW3_H__ 2 | #define HW3_H__ 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError); 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/destination.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 6/destination.png -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/loadSaveImage.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "cuda_runtime.h" 6 | 7 | //The caller becomes responsible for the returned pointer. This 8 | //is done in the interest of keeping this code as simple as possible. 9 | //In production code this is a bad idea - we should use RAII 10 | //to ensure the memory is freed. DO NOT COPY THIS AND USE IN PRODUCTION 11 | //CODE!!! 
12 | void loadImageHDR(const std::string &filename, 13 | float **imagePtr, 14 | size_t *numRows, size_t *numCols) 15 | { 16 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH); 17 | if (image.empty()) { 18 | std::cerr << "Couldn't open file: " << filename << std::endl; 19 | exit(1); 20 | } 21 | 22 | if (image.channels() != 3) { 23 | std::cerr << "Image must be color!" << std::endl; 24 | exit(1); 25 | } 26 | 27 | if (!image.isContinuous()) { 28 | std::cerr << "Image isn't continuous!" << std::endl; 29 | exit(1); 30 | } 31 | 32 | *imagePtr = new float[image.rows * image.cols * image.channels()]; 33 | 34 | float *cvPtr = image.ptr(0); 35 | for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i) 36 | (*imagePtr)[i] = cvPtr[i]; 37 | 38 | *numRows = image.rows; 39 | *numCols = image.cols; 40 | } 41 | 42 | void loadImageGrey(const std::string &filename, 43 | unsigned char **imagePtr, 44 | size_t *numRows, size_t *numCols) 45 | { 46 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE); 47 | if (image.empty()) { 48 | std::cerr << "Couldn't open file: " << filename << std::endl; 49 | exit(1); 50 | } 51 | 52 | if (image.channels() != 1) { 53 | std::cerr << "Image must be greyscale!" << std::endl; 54 | exit(1); 55 | } 56 | 57 | if (!image.isContinuous()) { 58 | std::cerr << "Image isn't continuous!" 
<< std::endl; 59 | exit(1); 60 | } 61 | 62 | *imagePtr = new unsigned char[image.rows * image.cols]; 63 | 64 | unsigned char *cvPtr = image.ptr(0); 65 | for (size_t i = 0; i < image.rows * image.cols; ++i) { 66 | (*imagePtr)[i] = cvPtr[i]; 67 | } 68 | 69 | *numRows = image.rows; 70 | *numCols = image.cols; 71 | } 72 | void loadImageRGBA(const std::string &filename, 73 | uchar4 **imagePtr, 74 | size_t *numRows, size_t *numCols) 75 | { 76 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); 77 | if (image.empty()) { 78 | std::cerr << "Couldn't open file: " << filename << std::endl; 79 | exit(1); 80 | } 81 | 82 | if (image.channels() != 3) { 83 | std::cerr << "Image must be color!" << std::endl; 84 | exit(1); 85 | } 86 | 87 | if (!image.isContinuous()) { 88 | std::cerr << "Image isn't continuous!" << std::endl; 89 | exit(1); 90 | } 91 | 92 | cv::Mat imageRGBA; 93 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA); 94 | 95 | *imagePtr = new uchar4[image.rows * image.cols]; 96 | 97 | unsigned char *cvPtr = imageRGBA.ptr(0); 98 | for (size_t i = 0; i < image.rows * image.cols; ++i) { 99 | (*imagePtr)[i].x = cvPtr[4 * i + 0]; 100 | (*imagePtr)[i].y = cvPtr[4 * i + 1]; 101 | (*imagePtr)[i].z = cvPtr[4 * i + 2]; 102 | (*imagePtr)[i].w = cvPtr[4 * i + 3]; 103 | } 104 | 105 | *numRows = image.rows; 106 | *numCols = image.cols; 107 | } 108 | 109 | void saveImageRGBA(const uchar4* const image, 110 | const size_t numRows, const size_t numCols, 111 | const std::string &output_file) 112 | { 113 | int sizes[2]; 114 | sizes[0] = numRows; 115 | sizes[1] = numCols; 116 | cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image); 117 | cv::Mat imageOutputBGR; 118 | cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR); 119 | //output the image 120 | cv::imwrite(output_file.c_str(), imageOutputBGR); 121 | } 122 | 123 | //output an exr file 124 | //assumed to already be BGR 125 | void saveImageHDR(const float* const image, 126 | const size_t numRows, const size_t numCols, 127 | const 
std::string &output_file) 128 | { 129 | int sizes[2]; 130 | sizes[0] = numRows; 131 | sizes[1] = numCols; 132 | 133 | cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image); 134 | 135 | imageHDR = imageHDR * 255; 136 | 137 | cv::imwrite(output_file.c_str(), imageHDR); 138 | } 139 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/loadSaveImage.h: -------------------------------------------------------------------------------- 1 | #ifndef LOADSAVEIMAGE_H__ 2 | #define LOADSAVEIMAGE_H__ 3 | 4 | #include 5 | #include //for uchar4 6 | 7 | void loadImageHDR(const std::string &filename, 8 | float **imagePtr, 9 | size_t *numRows, size_t *numCols); 10 | 11 | void loadImageRGBA(const std::string &filename, 12 | uchar4 **imagePtr, 13 | size_t *numRows, size_t *numCols); 14 | 15 | void loadImageGrey(const std::string &filename, 16 | unsigned char **imagePtr, 17 | size_t *numRows, size_t *numCols); 18 | 19 | void saveImageRGBA(const uchar4* const image, 20 | const size_t numRows, const size_t numCols, 21 | const std::string &output_file); 22 | 23 | void saveImageHDR(const float* const image, 24 | const size_t numRows, const size_t numCols, 25 | const std::string &output_file); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/main.cpp: -------------------------------------------------------------------------------- 1 | //Udacity HW6 Driver 2 | 3 | #include 4 | #include "timer.h" 5 | #include "utils.h" 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #include "reference_calc.h" 14 | #include "compare.h" 15 | 16 | void preProcess( uchar4 **sourceImg, size_t &numRowsSource, size_t &numColsSource, 17 | uchar4 **destImg, 18 | uchar4 **blendedImg, const std::string& source_filename, 19 | const std::string& dest_filename); 20 | 21 | void postProcess(const uchar4* const blendedImg, 22 | const size_t numRowsDest, const 
size_t numColsDest, 23 | const std::string& output_file); 24 | 25 | void your_blend(const uchar4* const sourceImg, 26 | const size_t numRowsSource, const size_t numColsSource, 27 | const uchar4* const destImg, 28 | uchar4* const blendedImg); 29 | 30 | int main(int argc, char **argv) { 31 | uchar4 *h_sourceImg, *h_destImg, *h_blendedImg; 32 | size_t numRowsSource, numColsSource; 33 | 34 | std::string input_source_file; 35 | std::string input_dest_file; 36 | std::string output_file; 37 | 38 | std::string reference_file; 39 | double perPixelError = 0.0; 40 | double globalError = 0.0; 41 | bool useEpsCheck = false; 42 | 43 | switch (argc) 44 | { 45 | case 3: 46 | input_source_file = std::string(argv[1]); 47 | input_dest_file = std::string(argv[2]); 48 | output_file = "HW6_output.png"; 49 | reference_file = "HW6_reference.png"; 50 | break; 51 | case 4: 52 | input_source_file = std::string(argv[1]); 53 | input_dest_file = std::string(argv[2]); 54 | output_file = std::string(argv[3]); 55 | reference_file = "HW6_reference.png"; 56 | break; 57 | case 5: 58 | input_source_file = std::string(argv[1]); 59 | input_dest_file = std::string(argv[2]); 60 | output_file = std::string(argv[3]); 61 | reference_file = std::string(argv[4]); 62 | break; 63 | case 7: 64 | useEpsCheck=true; 65 | input_source_file = std::string(argv[1]); 66 | input_dest_file = std::string(argv[2]); 67 | output_file = std::string(argv[3]); 68 | reference_file = std::string(argv[4]); 69 | perPixelError = atof(argv[5]); 70 | globalError = atof(argv[6]); 71 | break; 72 | default: 73 | std::cerr << "Usage: ./HW6 input_source_file input_dest_filename [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl; 74 | exit(1); 75 | } 76 | 77 | //load the image and give us our input and output pointers 78 | preProcess(&h_sourceImg, numRowsSource, numColsSource, 79 | &h_destImg, 80 | &h_blendedImg, input_source_file, input_dest_file); 81 | 82 | GpuTimer timer; 83 | timer.Start(); 84 | 85 | //call 
the students' code 86 | your_blend(h_sourceImg, numRowsSource, numColsSource, 87 | h_destImg, 88 | h_blendedImg); 89 | 90 | timer.Stop(); 91 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 92 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); 93 | printf("\n"); 94 | if (err < 0) { 95 | //Couldn't print! Probably the student closed stdout - bad news 96 | std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; 97 | exit(1); 98 | } 99 | 100 | //check results and output the tone-mapped image 101 | postProcess(h_blendedImg, numRowsSource, numColsSource, output_file); 102 | 103 | // calculate the reference image 104 | uchar4* h_reference = new uchar4[numRowsSource*numColsSource]; 105 | reference_calc(h_sourceImg, numRowsSource, numColsSource, 106 | h_destImg, h_reference); 107 | 108 | // save the reference image 109 | postProcess(h_reference, numRowsSource, numColsSource, reference_file); 110 | 111 | compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError); 112 | 113 | delete[] h_reference; 114 | delete[] h_destImg; 115 | delete[] h_sourceImg; 116 | delete[] h_blendedImg; 117 | return 0; 118 | } 119 | 120 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/reference_calc.h: -------------------------------------------------------------------------------- 1 | #ifndef REFERENCE_H__ 2 | #define REFERENCE_H__ 3 | 4 | void reference_calc(const uchar4* const h_sourceImg, 5 | const size_t numRowsSource, const size_t numColsSource, 6 | const uchar4* const h_destImg, 7 | uchar4* const h_blendedImg); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/source.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem 
Set 6/source.png -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/student_func.cu: -------------------------------------------------------------------------------- 1 | //Udacity HW 6 2 | //Poisson Blending 3 | 4 | /* Background 5 | ========== 6 | 7 | The goal for this assignment is to take one image (the source) and 8 | paste it into another image (the destination) attempting to match the 9 | two images so that the pasting is non-obvious. This is 10 | known as a "seamless clone". 11 | 12 | The basic ideas are as follows: 13 | 14 | 1) Figure out the interior and border of the source image 15 | 2) Use the values of the border pixels in the destination image 16 | as boundary conditions for solving a Poisson equation that tells 17 | us how to blend the images. 18 | 19 | No pixels from the destination except pixels on the border 20 | are used to compute the match. 21 | 22 | Solving the Poisson Equation 23 | ============================ 24 | 25 | There are multiple ways to solve this equation - we choose an iterative 26 | method - specifically the Jacobi method. Iterative methods start with 27 | a guess of the solution and then iterate to try and improve the guess 28 | until it stops changing. If the problem was well-suited for the method 29 | then it will stop and where it stops will be the solution. 30 | 31 | The Jacobi method is the simplest iterative method and converges slowly - 32 | that is we need a lot of iterations to get to the answer, but it is the 33 | easiest method to write. 34 | 35 | Jacobi Iterations 36 | ================= 37 | 38 | Our initial guess is going to be the source image itself. This is a pretty 39 | good guess for what the blended image will look like and it means that 40 | we won't have to do as many iterations compared to if we had started far 41 | from the final solution. 
42 | 43 | ImageGuess_prev (Floating point) 44 | ImageGuess_next (Floating point) 45 | 46 | DestinationImg 47 | SourceImg 48 | 49 | Follow these steps to implement one iteration: 50 | 51 | 1) For every pixel p in the interior, compute two sums over the four neighboring pixels: 52 | Sum1: If the neighbor is in the interior then += ImageGuess_prev[neighbor] 53 | else if the neighbor is on the border then += DestinationImg[neighbor] 54 | 55 | Sum2: += SourceImg[p] - SourceImg[neighbor] (for all four neighbors) 56 | 57 | 2) Calculate the new pixel value: 58 | float newVal= (Sum1 + Sum2) / 4.f <------ Notice that the result is FLOATING POINT 59 | ImageGuess_next[p] = min(255, max(0, newVal)); //clamp to [0, 255] 60 | 61 | 62 | In this assignment we will do 800 iterations. 63 | */ 64 | 65 | 66 | 67 | #include "utils.h" 68 | #include 69 | 70 | void your_blend(const uchar4* const h_sourceImg, //IN 71 | const size_t numRowsSource, const size_t numColsSource, 72 | const uchar4* const h_destImg, //IN 73 | uchar4* const h_blendedImg) //OUT 74 | { 75 | 76 | /* To Recap here are the steps you need to implement 77 | 78 | 1) Compute a mask of the pixels from the source image to be copied 79 | The pixels that shouldn't be copied are completely white, they 80 | have R=255, G=255, B=255. Any other pixels SHOULD be copied. 81 | 82 | 2) Compute the interior and border regions of the mask. An interior 83 | pixel has all 4 neighbors also inside the mask. A border pixel is 84 | in the mask itself, but has at least one neighbor that isn't. 85 | 86 | 3) Separate out the incoming image into three separate channels 87 | 88 | 4) Create two float(!) buffers for each color channel that will 89 | act as our guesses. Initialize them to the respective color 90 | channel of the source image since that will act as our initial guess. 91 | 92 | 5) For each color channel perform the Jacobi iteration described 93 | above 800 times.
94 | 95 | 6) Create the output image by replacing all the interior pixels 96 | in the destination image with the result of the Jacobi iterations. 97 | Just cast the floating point values to unsigned chars since we have 98 | already made sure to clamp them to the correct range. 99 | 100 | Since this is the final assignment, we provide little boilerplate code to 101 | help you. Notice that all the input/output pointers are HOST pointers. 102 | 103 | You will have to allocate all of your own GPU memory and perform your own 104 | memcopies to get data in and out of the GPU memory. 105 | 106 | Remember to wrap all of your calls with checkCudaErrors() to catch 107 | anything that might go wrong. After each kernel call do: 108 | 109 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 110 | 111 | to catch any errors that happened while executing the kernel. 112 | */ 113 | } 114 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | 
#define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 14 | 15 | template 16 | void check(T err, const char* const func, const char* const file, const int line) { 17 | if (err != cudaSuccess) { 18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 20 | exit(1); 21 | } 22 | } 23 | 24 | template 25 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) { 26 | //check that the GPU result matches the CPU result 27 | for (size_t i = 0; i < numElem; ++i) { 28 | if (ref[i] != gpu[i]) { 29 | std::cerr << "Difference at pos " << i << std::endl; 30 | //the + is magic to convert char to int without messing 31 | //with other types 32 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 33 | "\nGPU : " << +gpu[i] << std::endl; 34 | exit(1); 35 | } 36 | } 37 | } 38 | 39 | template 40 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) { 41 | assert(eps1 >= 0 && eps2 >= 0); 42 | unsigned long long totalDiff = 0; 43 | unsigned numSmallDifferences = 0; 44 | for (size_t i = 0; i < numElem; ++i) { 45 | //subtract smaller from larger in case of unsigned types 46 | T smaller = std::min(ref[i], gpu[i]); 47 | T larger = std::max(ref[i], gpu[i]); 48 | T diff = larger - smaller; 49 | if (diff > 0 && diff <= eps1) { 50 | numSmallDifferences++; 51 | } 52 | else if (diff > eps1) { 53 | std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl; 54 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 55 | "\nGPU : " << +gpu[i] << std::endl; 56 | exit(1); 57 | } 58 | totalDiff += diff * diff; 59 | } 60 | double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; 61 | if (percentSmallDifferences > 
eps2) { 62 | std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl; 63 | std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl; 64 | exit(1); 65 | } 66 | } 67 | 68 | //Uses the autodesk method of image comparison 69 | //Note that the tolerance here is in PIXELS not a percentage of input pixels 70 | template 71 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance) 72 | { 73 | 74 | size_t numBadPixels = 0; 75 | for (size_t i = 0; i < numElem; ++i) { 76 | T smaller = std::min(ref[i], gpu[i]); 77 | T larger = std::max(ref[i], gpu[i]); 78 | T diff = larger - smaller; 79 | if (diff > variance) 80 | ++numBadPixels; 81 | } 82 | 83 | if (numBadPixels > tolerance) { 84 | std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl; 85 | exit(1); 86 | } 87 | } 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Solutions for class: [Introduction to Parallel Programming](https://www.udacity.com/course/intro-to-parallel-programming--cs344) 2 | 3 | #### Building on Windows Visual Studio 4 | ##### Prerequisites 5 | * Install Visual Studio 2013: 6 | 7 | **Note**: `Visual Studio Express` and `Visual Studio 2015` are not supported! (I tried them, but they did not work ^_^) 8 | 9 | [Nvidia reference](http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-microsoft-windows/index.html#axzz44vwAc5Qx) 10 | 11 | * Install CUDA 7.5: 12 | Also refer to the above link. [download](https://developer.nvidia.com/cuda-downloads) 13 | 14 | * Install CMake: 15 | The latest version is OK. [download](https://cmake.org/) 16 | 17 | * Install OpenCV: 18 | I installed 2.4.12; other versions should also work. 
[download](http://opencv.org/) 19 | * Run the EXE to extract the files. This EXE does not have an installer. Instead, you put the files wherever you want and then add an environment variable. 20 | * Add an environment variable named "OpenCV_DIR" (no quotes) that points to the "build" subfolder of the folder where you extracted the files. (The exact folder you need will have one very important file in it: OpenCVConfig.cmake - this tells CMake which variables to set for you.) 21 | * Add the directory containing the OpenCV binary DLLs to the Windows PATH (e.g. f:/software/opencv/build/x86/vc12/bin). 22 | 23 | ##### Compile the solution 24 | ``` 25 | git clone https://github.com/wykvictor/cs344.git 26 | cd cs344 27 | mkdir build 28 | cd build 29 | cmake .. 30 | ``` 31 | 32 | **Done!** Just use Visual Studio to open the generated solution in the build/ directory and compile everything. 33 | 34 | ======= 35 | ### Original README.md forked from [udacity/cs344](https://github.com/udacity/cs344) 36 | 37 | ##### Introduction to Parallel Programming class code 38 | 39 | #### Building on OS X 40 | 41 | These instructions are for OS X 10.9 "Mavericks". 42 | 43 | * Step 1. Build and install OpenCV. The best way to do this is with 44 | Homebrew. However, you must slightly alter the Homebrew OpenCV 45 | installation; you must build it with libstdc++ (instead of the default 46 | libc++) so that it will properly link against the NVIDIA CUDA dev kit. 47 | [This entry in the Udacity discussion forums](http://forums.udacity.com/questions/100132476/cuda-55-opencv-247-os-x-maverick-it-doesnt-work) describes exactly how to build a compatible OpenCV. 48 | 49 | * Step 2. You can now create 10.9-compatible makefiles, which will allow you to 50 | build and run your homework on your own machine: 51 | ``` 52 | mkdir build 53 | cd build 54 | cmake .. 55 | make 56 | ``` 57 | 58 | --------------------------------------------------------------------------------