├── .gitignore ├── CMakeLists.txt ├── Final ├── batcher │ ├── batcher.cu │ ├── compare.h │ └── gputimer.h ├── smooth │ ├── compare.h │ ├── gputimer.h │ └── smooth.cu └── warpreduce │ ├── part_a │ ├── compare.h │ ├── gputimer.h │ └── warpreduce.cu │ └── part_b │ ├── compare.h │ ├── gputimer.h │ └── warpreduce.cu ├── Lesson Code Snippets ├── Lesson 2 Code Snippets │ ├── associative.cu │ ├── atomics.cu │ ├── gputimer.h │ ├── hello_blockIdx.cu │ ├── hello_threadIdx.cu │ └── memory.cu ├── Lesson 3 Code Snippets │ ├── histo.cu │ └── reduce.cu ├── Lesson 5 Code Snippets │ ├── deviceQuery_simplified.cpp │ └── transpose.cu └── Lesson 7 Code Snippets │ ├── cub │ └── example_block_scan_cum.cu │ ├── thrust │ ├── gputimer.h │ └── thrust_example.cu │ └── tiling │ ├── gputimer.h │ ├── tiling.cu │ └── utils.h ├── Lesson Slides ├── CS344_Lesson1_Slides.pdf ├── CS344_Lesson2_Slides.pdf ├── CS344_Lesson3_Slides.pdf ├── CS344_Lesson4_Slides.pdf ├── CS344_Lesson5_Slides.pdf ├── CS344_Lesson6.1_Slides.pdf ├── CS344_Lesson6.2_Slides.pdf ├── CS344_Lesson7.1_Slides.pdf └── CS344_Lesson7.2_Slides.pdf ├── Problem Sets ├── Problem Set 1.zip ├── Problem Set 1 │ ├── CMakeLists.txt │ ├── HW1.cpp │ ├── Makefile │ ├── cinque_terre.gold │ ├── cinque_terre_small.jpg │ ├── compare.cpp │ ├── compare.h │ ├── main.cpp │ ├── reference_calc.cpp │ ├── reference_calc.h │ ├── student_func.cu │ ├── timer.h │ └── utils.h ├── Problem Set 2.zip ├── Problem Set 2 │ ├── CMakeLists.txt │ ├── HW2.cpp │ ├── Makefile │ ├── cinque_terre.gold │ ├── cinque_terre_small.jpg │ ├── compare.cpp │ ├── compare.h │ ├── main.cpp │ ├── reference_calc.cpp │ ├── reference_calc.h │ ├── student_func.cu │ ├── timer.h │ └── utils.h ├── Problem Set 3.zip ├── Problem Set 3 │ ├── CMakeLists.txt │ ├── HW3.cu │ ├── Makefile │ ├── compare.cpp │ ├── compare.h │ ├── loadSaveImage.cpp │ ├── loadSaveImage.h │ ├── main.cpp │ ├── memorial.exr │ ├── memorial_large.exr │ ├── memorial_png.gold │ ├── memorial_png_large.gold │ ├── memorial_raw.png │ ├── 
############################################################################
# Top-level CMakeLists.txt for the CS344 problem sets (OpenCV + CUDA).
# 2012-02-07
# Quan Tran Minh. Edited by Johannes Kast, Michael Sarahan
# quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
############################################################################

# 3.5 is the oldest compatibility level that current CMake releases still
# accept. The upper bound stays below 3.27 so that policy CMP0146 is left
# unset and the legacy FindCUDA module used below remains available.
cmake_minimum_required(VERSION 3.5...3.26)
project(cs344)

find_package(OpenCV REQUIRED)
# REQUIRED aborts configuration when CUDA is missing, so no CUDA_FOUND
# check is needed afterwards.
find_package(CUDA REQUIRED)

# Directory-scoped on purpose: the homework targets are defined in the
# per-problem-set CMakeLists.txt files (not visible from here) and are
# presumably expected to pick up OpenCV from this scope.
link_libraries(${OpenCV_LIBS})

# Executables are collected in <source>/bin, which .gitignore excludes.
set(EXECUTABLE_OUTPUT_PATH "${PROJECT_SOURCE_DIR}/bin/")

# Compared to the class settings, we let NVidia's FindCUDA detect whether
# to build x64. We ask for a broad set of architectures so that more
# people can run the class code without knowing about this compiler
# argument. The list is a cache variable so it can be trimmed when the
# installed toolkit rejects some of the older entries, e.g.
#   cmake -DCS344_CUDA_GENCODES="-gencode;arch=compute_35,code=sm_35" ..
set(CS344_CUDA_GENCODES
    "-gencode;arch=compute_30,code=sm_30"
    "-gencode;arch=compute_35,code=sm_35"
    "-gencode;arch=compute_35,code=compute_35"
    "-gencode;arch=compute_20,code=sm_20"
    "-gencode;arch=compute_11,code=sm_11"
    "-gencode;arch=compute_12,code=sm_12"
    "-gencode;arch=compute_13,code=sm_13"
    CACHE STRING "nvcc -gencode arguments (semicolon-separated list)")

# CUDA_NVCC_FLAGS is a ;-separated list with one nvcc argument per element.
# Appending (instead of overwriting) keeps flags supplied by the user.
#
# The nvcc host compiler is no longer hard-coded to /usr/bin/clang; pass
# -DCUDA_HOST_COMPILER=/path/to/compiler when a specific one is required.
list(APPEND CUDA_NVCC_FLAGS ${CS344_CUDA_GENCODES})

if(UNIX)
  # Extra host-compiler warnings; -Xcompiler and its value are two
  # separate nvcc arguments.
  list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wextra)

  # -stdlib= is a Clang option, so only add it for Clang, and append so
  # that user-provided CMAKE_CXX_FLAGS survive.
  if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang"
     OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++")
  endif()
endif()

# Device-side debug information for Debug builds (NVidia's NSight tools).
list(APPEND CUDA_NVCC_FLAGS_DEBUG -G)

# The problem sets live in "Problem Sets/Problem Set <n>". Their build
# trees are named HW<n> to keep spaces out of the binary directory.
foreach(problem_set RANGE 1 6)
  add_subdirectory("Problem Sets/Problem Set ${problem_set}" "HW${problem_set}")
endforeach()
// Checks a GPU result against reference values, element by element.
//
//   h_out       values produced by the code under test
//   h_sorted    expected values
//   ARRAY_SIZE  number of elements to compare in both arrays
//
// Prints one line for every index whose values differ (exact float
// comparison). Returns 0 when all elements match, 1 otherwise.
int compare(float *h_out, float *h_sorted, int ARRAY_SIZE)
{
    int failure = 0;
    for(int i = 0; i < ARRAY_SIZE; i++) {
        if (h_out[i] != h_sorted[i]) {
            printf("Oops! Index %i is %f, should be %f\n",
                   i, h_out[i], h_sorted[i]);
            failure = 1;
        }
    }

    if (failure == 0){
        // Terminate the line, as the other compare() helpers in this
        // repository do, so the shell prompt does not end up glued to it.
        printf("Success! Your bitonic sort worked.\n");
    }

    return failure;
}
Your smooth code worked!\n"); 19 | } 20 | 21 | return failure; 22 | } -------------------------------------------------------------------------------- /Final/smooth/gputimer.h: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TIMER_H__ 2 | #define __GPU_TIMER_H__ 3 | 4 | struct GpuTimer 5 | { 6 | cudaEvent_t start; 7 | cudaEvent_t stop; 8 | 9 | GpuTimer() 10 | { 11 | cudaEventCreate(&start); 12 | cudaEventCreate(&stop); 13 | } 14 | 15 | ~GpuTimer() 16 | { 17 | cudaEventDestroy(start); 18 | cudaEventDestroy(stop); 19 | } 20 | 21 | void Start() 22 | { 23 | cudaEventRecord(start, 0); 24 | } 25 | 26 | void Stop() 27 | { 28 | cudaEventRecord(stop, 0); 29 | } 30 | 31 | float Elapsed() 32 | { 33 | float elapsed; 34 | cudaEventSynchronize(stop); 35 | cudaEventElapsedTime(&elapsed, start, stop); 36 | return elapsed; 37 | } 38 | }; 39 | 40 | #endif /* __GPU_TIMER_H__ */ -------------------------------------------------------------------------------- /Final/smooth/smooth.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "compare.h" 5 | #include "gputimer.h" 6 | 7 | // Reference 8 | __global__ void smooth(float * v_new, const float * v) { 9 | int myIdx = threadIdx.x * gridDim.x + blockIdx.x; 10 | int numThreads = blockDim.x * gridDim.x; 11 | int myLeftIdx = (myIdx == 0) ? 0 : myIdx - 1; 12 | int myRightIdx = (myIdx == (numThreads - 1)) ? 
// Your code
// Shared-memory counterpart of smooth(); the body is left as an exercise.
//   v_new  output array
//   v      input array
__global__ void smooth_shared(float * v_new, const float * v) {
    extern __shared__ float s[];
    // TODO: Fill in the rest of this function
    // Note: a __global__ function has a void return type, so the stub must
    // not return a value.
}
// Checks the sum computed on the GPU against the expected sum.
//
//   h_out_shared  sum copied back from the device
//   sum           expected sum
//
// Reports a mismatch on stderr and prints a verdict on stdout.
// Returns 0 on a match, 1 otherwise.
int compare(unsigned int h_out_shared, int sum){
    int failure = 0;
    // Compare in the unsigned domain explicitly; this is the conversion
    // the compiler would otherwise apply implicitly.
    if (h_out_shared != (unsigned int)sum) {
        // %u matches the unsigned argument.
        fprintf(stderr, "GPU shared sum %u does not match expected sum %d\n",
                h_out_shared, sum);
        failure = 1;
    }

    if (failure == 0)
    {
        printf("Success! Your shared warp reduce worked.\n");
    }
    else{
        printf("Error! Your shared reduce code's output did not match sum.\n");
    }

    return failure;
}
// Kernel wrapper around shared_reduce(): every thread loads its own flag
// from d_in, all threads take part in the reduction, and thread 0 stores
// the returned sum in *d_out_shared.
__global__ void reduce(unsigned int * d_out_shared,
                       const unsigned int * d_in)
{
    // dynamically sized shared scratch space handed to shared_reduce()
    extern __shared__ unsigned int s[];

    const int lane = threadIdx.x;
    const int predicate = d_in[lane];

    const unsigned int total = shared_reduce(predicate, s);

    // only thread 0 publishes the result
    if (lane != 0)
    {
        return;
    }
    *d_out_shared = total;
}
// Checks the sum computed on the GPU against the expected sum.
//
//   h_out_warp  sum copied back from the device
//   sum         expected sum
//
// Reports a mismatch on stderr and prints a verdict on stdout.
// Returns 0 on a match, 1 otherwise.
int compare(unsigned int h_out_warp, int sum){
    int failure = 0;
    // Compare in the unsigned domain explicitly; this is the conversion
    // the compiler would otherwise apply implicitly.
    if (h_out_warp != (unsigned int)sum) {
        // %u matches the unsigned argument.
        fprintf(stderr, "GPU warp sum %u does not match expected sum %d\n",
                h_out_warp, sum);
        failure = 1;
    }

    if (failure == 0)
    {
        printf("Success! Your warp reduce worked.\n");
    }
    else{
        printf("Error! Your warp reduce code's output did not match sum.\n");
    }

    return failure;
}
// Kernel wrapper around warp_reduce(): each thread reads its flag from
// d_in, the warp-wide sum is computed, and thread 0 writes it to
// *d_out_warp.
__global__ void reduce(unsigned int * d_out_warp,
                       const unsigned int * d_in)
{
    // dynamically sized shared array, passed through to warp_reduce()
    extern __shared__ unsigned int s[];

    int t = threadIdx.x;
    int p = d_in[t];
    unsigned int wr = warp_reduce(p, s);

    // every thread other than thread 0 is done here
    if (t != 0)
    {
        return;
    }
    *d_out_warp = wr;
}
// Prints the first `size` entries of `array` on a single line, wrapped in
// braces and separated by spaces.
void print_array(int *array, int size)
{
    printf("{ ");
    int idx = 0;
    while (idx < size)
    {
        printf("%d ", array[idx]);
        ++idx;
    }
    printf("}\n");
}
// Launches NUM_THREADS threads that all increment a small array, then
// prints the resulting counts and the elapsed kernel time.
int main(int argc,char **argv)
{
    GpuTimer timer;
    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

    // declare and allocate host memory
    int h_array[ARRAY_SIZE];
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);

    // declare, allocate, and zero out GPU memory
    int * d_array;
    cudaMalloc((void **) &d_array, ARRAY_BYTES);
    cudaMemset((void *) d_array, 0, ARRAY_BYTES);

    // launch the kernel - comment out one of these
    // (grid and block sizes match the numbers printed above)
    timer.Start();
    // increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
    increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
    timer.Stop();

    // copy back the array of sums from GPU and print
    cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    print_array(h_array, ARRAY_SIZE);
    printf("Time elapsed = %g ms\n", timer.Elapsed());

    // free GPU memory allocation and exit
    cudaFree(d_array);
    return 0;
}
// Launches the hello() kernel and waits for its output.
int main(int argc,char **argv)
{
    // launch the kernel: NUM_BLOCKS blocks of BLOCK_WIDTH threads each
    hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();

    // force the printf()s to flush
    cudaDeviceSynchronize();

    printf("That's all!\n");

    return 0;
}
// Launches the hello() kernel and waits for its output.
int main(int argc,char **argv)
{
    // launch the kernel: NUM_BLOCKS blocks of BLOCK_WIDTH threads each
    hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();

    // force the printf()s to flush
    cudaDeviceSynchronize();

    printf("That's all!\n");

    return 0;
}
// Runs the three demonstration kernels in turn: local, global and shared
// memory.
int main(int argc, char **argv)
{
    /*
     * First, call a kernel that shows using local memory
     */
    use_local_memory_GPU<<<1, 128>>>(2.0f);

    /*
     * Next, call a kernel that shows using global memory
     */
    float h_arr[128] = {0};  // convention: h_ variables live on host
                             // (zeroed so the upload below sends defined data)
    float *d_arr;            // convention: d_ variables live on device (GPU global mem)

    // allocate global memory on the device, place result in "d_arr"
    cudaMalloc((void **) &d_arr, sizeof(float) * 128);
    // now copy data from host memory "h_arr" to device memory "d_arr"
    cudaMemcpy((void *)d_arr, (void *)h_arr, sizeof(float) * 128, cudaMemcpyHostToDevice);
    // launch the kernel (1 block of 128 threads)
    use_global_memory_GPU<<<1, 128>>>(d_arr);  // modifies the contents of array at d_arr
    // copy the modified array back to the host, overwriting contents of h_arr
    cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);
    // ... do other stuff ...

    /*
     * Next, call a kernel that shows using shared memory
     */

    // as before, pass in a pointer to data in global memory
    use_shared_memory_GPU<<<1, 128>>>(d_arr);
    // copy the modified array back to the host; the destination is host
    // memory and the source is device memory, so the copy kind must be
    // device-to-host
    cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);
    // ... do other stuff ...

    // release the device allocation made above
    cudaFree(d_arr);
    return 0;
}
h_in[i] = bit_reverse(i, log2(ARRAY_SIZE)); 69 | } 70 | int h_bins[BIN_COUNT]; 71 | for(int i = 0; i < BIN_COUNT; i++) { 72 | h_bins[i] = 0; 73 | } 74 | 75 | // declare GPU memory pointers 76 | int * d_in; 77 | int * d_bins; 78 | 79 | // allocate GPU memory 80 | cudaMalloc((void **) &d_in, ARRAY_BYTES); 81 | cudaMalloc((void **) &d_bins, BIN_BYTES); 82 | 83 | // transfer the arrays to the GPU 84 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 85 | cudaMemcpy(d_bins, h_bins, BIN_BYTES, cudaMemcpyHostToDevice); 86 | 87 | int whichKernel = 0; 88 | if (argc == 2) { 89 | whichKernel = atoi(argv[1]); 90 | } 91 | 92 | // launch the kernel 93 | switch(whichKernel) { 94 | case 0: 95 | printf("Running naive histo\n"); 96 | naive_histo<<>>(d_bins, d_in, BIN_COUNT); 97 | break; 98 | case 1: 99 | printf("Running simple histo\n"); 100 | simple_histo<<>>(d_bins, d_in, BIN_COUNT); 101 | break; 102 | default: 103 | fprintf(stderr, "error: ran no kernel\n"); 104 | exit(EXIT_FAILURE); 105 | } 106 | 107 | // copy back the sum from GPU 108 | cudaMemcpy(h_bins, d_bins, BIN_BYTES, cudaMemcpyDeviceToHost); 109 | 110 | for(int i = 0; i < BIN_COUNT; i++) { 111 | printf("bin %d: count %d\n", i, h_bins[i]); 112 | } 113 | 114 | // free GPU memory allocation 115 | cudaFree(d_in); 116 | cudaFree(d_bins); 117 | 118 | return 0; 119 | } 120 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 3 Code Snippets/reduce.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | __global__ void global_reduce_kernel(float * d_out, float * d_in) 6 | { 7 | int myId = threadIdx.x + blockDim.x * blockIdx.x; 8 | int tid = threadIdx.x; 9 | 10 | // do reduction in global mem 11 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) 12 | { 13 | if (tid < s) 14 | { 15 | d_in[myId] += d_in[myId + s]; 16 | } 17 | __syncthreads(); // make sure all adds at one stage are 
done! 18 | } 19 | 20 | // only thread 0 writes result for this block back to global mem 21 | if (tid == 0) 22 | { 23 | d_out[blockIdx.x] = d_in[myId]; 24 | } 25 | } 26 | 27 | __global__ void shmem_reduce_kernel(float * d_out, const float * d_in) 28 | { 29 | // sdata is allocated in the kernel call: 3rd arg to <<>> 30 | extern __shared__ float sdata[]; 31 | 32 | int myId = threadIdx.x + blockDim.x * blockIdx.x; 33 | int tid = threadIdx.x; 34 | 35 | // load shared mem from global mem 36 | sdata[tid] = d_in[myId]; 37 | __syncthreads(); // make sure entire block is loaded! 38 | 39 | // do reduction in shared mem 40 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) 41 | { 42 | if (tid < s) 43 | { 44 | sdata[tid] += sdata[tid + s]; 45 | } 46 | __syncthreads(); // make sure all adds at one stage are done! 47 | } 48 | 49 | // only thread 0 writes result for this block back to global mem 50 | if (tid == 0) 51 | { 52 | d_out[blockIdx.x] = sdata[0]; 53 | } 54 | } 55 | 56 | void reduce(float * d_out, float * d_intermediate, float * d_in, 57 | int size, bool usesSharedMemory) 58 | { 59 | // assumes that size is not greater than maxThreadsPerBlock^2 60 | // and that size is a multiple of maxThreadsPerBlock 61 | const int maxThreadsPerBlock = 1024; 62 | int threads = maxThreadsPerBlock; 63 | int blocks = size / maxThreadsPerBlock; 64 | if (usesSharedMemory) 65 | { 66 | shmem_reduce_kernel<<>> 67 | (d_intermediate, d_in); 68 | } 69 | else 70 | { 71 | global_reduce_kernel<<>> 72 | (d_intermediate, d_in); 73 | } 74 | // now we're down to one block left, so reduce it 75 | threads = blocks; // launch one thread for each block in prev step 76 | blocks = 1; 77 | if (usesSharedMemory) 78 | { 79 | shmem_reduce_kernel<<>> 80 | (d_out, d_intermediate); 81 | } 82 | else 83 | { 84 | global_reduce_kernel<<>> 85 | (d_out, d_intermediate); 86 | } 87 | } 88 | 89 | int main(int argc, char **argv) 90 | { 91 | int deviceCount; 92 | cudaGetDeviceCount(&deviceCount); 93 | if (deviceCount == 0) { 
94 | fprintf(stderr, "error: no devices supporting CUDA.\n"); 95 | exit(EXIT_FAILURE); 96 | } 97 | int dev = 0; 98 | cudaSetDevice(dev); 99 | 100 | cudaDeviceProp devProps; 101 | if (cudaGetDeviceProperties(&devProps, dev) == 0) 102 | { 103 | printf("Using device %d:\n", dev); 104 | printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n", 105 | devProps.name, (int)devProps.totalGlobalMem, 106 | (int)devProps.major, (int)devProps.minor, 107 | (int)devProps.clockRate); 108 | } 109 | 110 | const int ARRAY_SIZE = 1 << 20; 111 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float); 112 | 113 | // generate the input array on the host 114 | float h_in[ARRAY_SIZE]; 115 | float sum = 0.0f; 116 | for(int i = 0; i < ARRAY_SIZE; i++) { 117 | // generate random float in [-1.0f, 1.0f] 118 | h_in[i] = -1.0f + (float)random()/((float)RAND_MAX/2.0f); 119 | sum += h_in[i]; 120 | } 121 | 122 | // declare GPU memory pointers 123 | float * d_in, * d_intermediate, * d_out; 124 | 125 | // allocate GPU memory 126 | cudaMalloc((void **) &d_in, ARRAY_BYTES); 127 | cudaMalloc((void **) &d_intermediate, ARRAY_BYTES); // overallocated 128 | cudaMalloc((void **) &d_out, sizeof(float)); 129 | 130 | // transfer the input array to the GPU 131 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 132 | 133 | int whichKernel = 0; 134 | if (argc == 2) { 135 | whichKernel = atoi(argv[1]); 136 | } 137 | 138 | cudaEvent_t start, stop; 139 | cudaEventCreate(&start); 140 | cudaEventCreate(&stop); 141 | // launch the kernel 142 | switch(whichKernel) { 143 | case 0: 144 | printf("Running global reduce\n"); 145 | cudaEventRecord(start, 0); 146 | for (int i = 0; i < 100; i++) 147 | { 148 | reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false); 149 | } 150 | cudaEventRecord(stop, 0); 151 | break; 152 | case 1: 153 | printf("Running reduce with shared mem\n"); 154 | cudaEventRecord(start, 0); 155 | for (int i = 0; i < 100; i++) 156 | { 157 | reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true); 
158 | } 159 | cudaEventRecord(stop, 0); 160 | break; 161 | default: 162 | fprintf(stderr, "error: ran no kernel\n"); 163 | exit(EXIT_FAILURE); 164 | } 165 | cudaEventSynchronize(stop); 166 | float elapsedTime; 167 | cudaEventElapsedTime(&elapsedTime, start, stop); 168 | elapsedTime /= 100.0f; // 100 trials 169 | 170 | // copy back the sum from GPU 171 | float h_out; 172 | cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost); 173 | 174 | printf("average time elapsed: %f\n", elapsedTime); 175 | 176 | // free GPU memory allocation 177 | cudaFree(d_in); 178 | cudaFree(d_intermediate); 179 | cudaFree(d_out); 180 | 181 | return 0; 182 | } 183 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 5 Code Snippets/transpose.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "gputimer.h" 3 | 4 | const int N= 1024; // matrix size is NxN 5 | const int K= 32; // tile size is KxK 6 | 7 | // Utility functions: compare, print, and fill matrices 8 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 9 | 10 | template 11 | void check(T err, const char* const func, const char* const file, const int line) 12 | { 13 | if (err != cudaSuccess) { 14 | fprintf(stderr, "CUDA error at: %s : %d\n", file,line); 15 | fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);; 16 | exit(1); 17 | } 18 | } 19 | 20 | int compare_matrices(float *gpu, float *ref) 21 | { 22 | int result = 0; 23 | 24 | for(int j=0; j < N; j++) 25 | for(int i=0; i < N; i++) 26 | if (ref[i + j*N] != gpu[i + j*N]) 27 | { 28 | // printf("reference(%d,%d) = %f but test(%d,%d) = %f\n", 29 | // i,j,ref[i+j*N],i,j,test[i+j*N]); 30 | result = 1; 31 | } 32 | return result; 33 | } 34 | 35 | void print_matrix(float *mat) 36 | { 37 | for(int j=0; j < N; j++) 38 | { 39 | for(int i=0; i < N; i++) { printf("%4.4g ", mat[i + j*N]); } 40 | printf("\n"); 41 | } 42 | } 43 | 44 | // fill a 
matrix with sequential numbers in the range 0..N-1 45 | void fill_matrix(float *mat) 46 | { 47 | for(int j=0; j < N * N; j++) 48 | mat[j] = (float) j; 49 | } 50 | 51 | 52 | 53 | void 54 | transpose_CPU(float in[], float out[]) 55 | { 56 | for(int j=0; j < N; j++) 57 | for(int i=0; i < N; i++) 58 | out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j) 59 | } 60 | 61 | // to be launched on a single thread 62 | __global__ void 63 | transpose_serial(float in[], float out[]) 64 | { 65 | for(int j=0; j < N; j++) 66 | for(int i=0; i < N; i++) 67 | out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j) 68 | } 69 | 70 | // to be launched with one thread per row of output matrix 71 | __global__ void 72 | transpose_parallel_per_row(float in[], float out[]) 73 | { 74 | int i = threadIdx.x; 75 | 76 | for(int j=0; j < N; j++) 77 | out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j) 78 | } 79 | 80 | // to be launched with one thread per element, in KxK threadblocks 81 | // thread (x,y) in grid writes element (i,j) of output matrix 82 | __global__ void 83 | transpose_parallel_per_element(float in[], float out[]) 84 | { 85 | int i = blockIdx.x * K + threadIdx.x; 86 | int j = blockIdx.y * K + threadIdx.y; 87 | 88 | out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j) 89 | } 90 | 91 | // to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks 92 | // thread blocks read & write tiles, in coalesced fashion 93 | // adjacent threads read adjacent input elements, write adjacent output elmts 94 | __global__ void 95 | transpose_parallel_per_element_tiled(float in[], float out[]) 96 | { 97 | // (i,j) locations of the tile corners for input & output matrices: 98 | int in_corner_i = blockIdx.x * K, in_corner_j = blockIdx.y * K; 99 | int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K; 100 | 101 | int x = threadIdx.x, y = threadIdx.y; 102 | 103 | __shared__ float tile[K][K]; 104 | 105 | // coalesced read from global mem, TRANSPOSED write into shared mem: 106 | 
tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N]; 107 | __syncthreads(); 108 | // read from shared mem, coalesced write to global mem: 109 | out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y]; 110 | } 111 | 112 | // to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks 113 | // thread blocks read & write tiles, in coalesced fashion 114 | // adjacent threads read adjacent input elements, write adjacent output elmts 115 | __global__ void 116 | transpose_parallel_per_element_tiled16(float in[], float out[]) 117 | { 118 | // (i,j) locations of the tile corners for input & output matrices: 119 | int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16; 120 | int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16; 121 | 122 | int x = threadIdx.x, y = threadIdx.y; 123 | 124 | __shared__ float tile[16][16]; 125 | 126 | // coalesced read from global mem, TRANSPOSED write into shared mem: 127 | tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N]; 128 | __syncthreads(); 129 | // read from shared mem, coalesced write to global mem: 130 | out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y]; 131 | } 132 | 133 | // to be launched with one thread per element, in KxK threadblocks 134 | // thread blocks read & write tiles, in coalesced fashion 135 | // shared memory array padded to avoid bank conflicts 136 | __global__ void 137 | transpose_parallel_per_element_tiled_padded(float in[], float out[]) 138 | { 139 | // (i,j) locations of the tile corners for input & output matrices: 140 | int in_corner_i = blockIdx.x * K, in_corner_j = blockIdx.y * K; 141 | int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K; 142 | 143 | int x = threadIdx.x, y = threadIdx.y; 144 | 145 | __shared__ float tile[K][K+1]; 146 | 147 | // coalesced read from global mem, TRANSPOSED write into shared mem: 148 | tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N]; 149 | __syncthreads(); 150 | // read from shared mem, 
coalesced write to global mem: 151 | out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y]; 152 | } 153 | 154 | // to be launched with one thread per element, in KxK threadblocks 155 | // thread blocks read & write tiles, in coalesced fashion 156 | // shared memory array padded to avoid bank conflicts 157 | __global__ void 158 | transpose_parallel_per_element_tiled_padded16(float in[], float out[]) 159 | { 160 | // (i,j) locations of the tile corners for input & output matrices: 161 | int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16; 162 | int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16; 163 | 164 | int x = threadIdx.x, y = threadIdx.y; 165 | 166 | __shared__ float tile[16][16+1]; 167 | 168 | // coalesced read from global mem, TRANSPOSED write into shared mem: 169 | tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N]; 170 | __syncthreads(); 171 | // read from shared mem, coalesced write to global mem: 172 | out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y]; 173 | } 174 | 175 | int main(int argc, char **argv) 176 | { 177 | int numbytes = N * N * sizeof(float); 178 | 179 | float *in = (float *) malloc(numbytes); 180 | float *out = (float *) malloc(numbytes); 181 | float *gold = (float *) malloc(numbytes); 182 | 183 | fill_matrix(in); 184 | transpose_CPU(in, gold); 185 | 186 | float *d_in, *d_out; 187 | 188 | cudaMalloc(&d_in, numbytes); 189 | cudaMalloc(&d_out, numbytes); 190 | cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice); 191 | 192 | GpuTimer timer; 193 | 194 | /* 195 | * Now time each kernel and verify that it produces the correct result. 196 | * 197 | * To be really careful about benchmarking purposes, we should run every kernel once 198 | * to "warm" the system and avoid any compilation or code-caching effects, then run 199 | * every kernel 10 or 100 times and average the timings to smooth out any variance. 200 | * But this makes for messy code and our goal is teaching, not detailed benchmarking. 
201 | */ 202 | 203 | timer.Start(); 204 | transpose_serial<<<1,1>>>(d_in, d_out); 205 | timer.Stop(); 206 | cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost); 207 | printf("transpose_serial: %g ms.\nVerifying transpose...%s\n", 208 | timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success"); 209 | 210 | timer.Start(); 211 | transpose_parallel_per_row<<<1,N>>>(d_in, d_out); 212 | timer.Stop(); 213 | cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost); 214 | printf("transpose_parallel_per_row: %g ms.\nVerifying transpose...%s\n", 215 | timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success"); 216 | 217 | dim3 blocks(N/K,N/K); // blocks per grid 218 | dim3 threads(K,K); // threads per block 219 | 220 | timer.Start(); 221 | transpose_parallel_per_element<<>>(d_in, d_out); 222 | timer.Stop(); 223 | cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost); 224 | printf("transpose_parallel_per_element: %g ms.\nVerifying transpose...%s\n", 225 | timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success"); 226 | 227 | timer.Start(); 228 | transpose_parallel_per_element_tiled<<>>(d_in, d_out); 229 | timer.Stop(); 230 | cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost); 231 | printf("transpose_parallel_per_element_tiled %dx%d: %g ms.\nVerifying ...%s\n", 232 | K, K, timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success"); 233 | 234 | dim3 blocks16x16(N/16,N/16); // blocks per grid 235 | dim3 threads16x16(16,16); // threads per block 236 | 237 | timer.Start(); 238 | transpose_parallel_per_element_tiled16<<>>(d_in, d_out); 239 | timer.Stop(); 240 | cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost); 241 | printf("transpose_parallel_per_element_tiled 16x16: %g ms.\nVerifying ...%s\n", 242 | timer.Elapsed(), compare_matrices(out, gold) ? 
"Failed" : "Success"); 243 | 244 | timer.Start(); 245 | transpose_parallel_per_element_tiled_padded16<<>>(d_in, d_out); 246 | timer.Stop(); 247 | cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost); 248 | printf("transpose_parallel_per_element_tiled_padded 16x16: %g ms.\nVerifying...%s\n", 249 | timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success"); 250 | 251 | cudaFree(d_in); 252 | cudaFree(d_out); 253 | } -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/cub/example_block_scan_cum.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Simple demonstration of cub::BlockScan 31 | * 32 | * Example compilation string: 33 | * 34 | * nvcc example_block_scan_sum.cu -gencode=arch=compute_20,code=\"sm_20,compute_20\" -o example_block_scan_sum 35 | * 36 | ******************************************************************************/ 37 | 38 | // Ensure printing of CUDA runtime errors to console (define before including cub.h) 39 | #define CUB_STDERR 40 | 41 | #include 42 | #include 43 | 44 | #include 45 | 46 | using namespace cub; 47 | 48 | //--------------------------------------------------------------------- 49 | // Globals, constants and typedefs 50 | //--------------------------------------------------------------------- 51 | 52 | bool g_verbose = false; 53 | int g_iterations = 100; 54 | 55 | 56 | //--------------------------------------------------------------------- 57 | // Kernels 58 | //--------------------------------------------------------------------- 59 | 60 | /** 61 | * Simple kernel for performing a block-wide exclusive prefix sum over integers 62 | */ 63 | template < 64 | int BLOCK_THREADS, 65 | int ITEMS_PER_THREAD> 66 | __global__ void BlockPrefixSumKernel( 67 | int *d_in, // Tile of input 68 | int *d_out, // Tile of output 69 | clock_t *d_elapsed) // Elapsed cycle count 
of block scan 70 | { 71 | // Parameterize BlockScan type for our thread block 72 | typedef BlockScan BlockScanT; 73 | 74 | // Shared memory 75 | __shared__ typename BlockScanT::SmemStorage smem_storage; 76 | 77 | // Per-thread tile data 78 | int data[ITEMS_PER_THREAD]; 79 | BlockLoadVectorized(d_in, data); 80 | 81 | // Start cycle timer 82 | clock_t start = clock(); 83 | 84 | // Compute exclusive prefix sum 85 | int aggregate; 86 | BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate); 87 | 88 | // Stop cycle timer 89 | clock_t stop = clock(); 90 | 91 | // Store output 92 | BlockStoreVectorized(d_out, data); 93 | 94 | // Store aggregate and elapsed clocks 95 | if (threadIdx.x == 0) 96 | { 97 | *d_elapsed = (start > stop) ? start - stop : stop - start; 98 | d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate; 99 | } 100 | } 101 | 102 | 103 | 104 | //--------------------------------------------------------------------- 105 | // Host utilities 106 | //--------------------------------------------------------------------- 107 | 108 | /** 109 | * Initialize exclusive prefix sum problem (and solution). 
110 | * Returns the aggregate 111 | */ 112 | int Initialize( 113 | int *h_in, 114 | int *h_reference, 115 | int num_elements) 116 | { 117 | int inclusive = 0; 118 | 119 | for (int i = 0; i < num_elements; ++i) 120 | { 121 | h_in[i] = i % 17; 122 | 123 | h_reference[i] = inclusive; 124 | inclusive += h_in[i]; 125 | } 126 | 127 | return inclusive; 128 | } 129 | 130 | 131 | /** 132 | * Test thread block scan 133 | */ 134 | template < 135 | int BLOCK_THREADS, 136 | int ITEMS_PER_THREAD> 137 | void Test() 138 | { 139 | const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; 140 | 141 | // Allocate host arrays 142 | int *h_in = new int[TILE_SIZE]; 143 | int *h_reference = new int[TILE_SIZE]; 144 | int *h_gpu = new int[TILE_SIZE + 1]; 145 | 146 | // Initialize problem and reference output on host 147 | int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE); 148 | 149 | // Initialize device arrays 150 | int *d_in = NULL; 151 | int *d_out = NULL; 152 | clock_t *d_elapsed = NULL; 153 | cudaMalloc((void**)&d_in, sizeof(int) * TILE_SIZE); 154 | cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1)); 155 | cudaMalloc((void**)&d_elapsed, sizeof(clock_t)); 156 | 157 | // Display input problem data 158 | if (g_verbose) 159 | { 160 | printf("Input data: "); 161 | for (int i = 0; i < TILE_SIZE; i++) 162 | printf("%d, ", h_in[i]); 163 | printf("\n\n"); 164 | } 165 | 166 | // Copy problem to device 167 | cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); 168 | 169 | printf("BlockScan %d items (%d threads, %d items per thread): ", 170 | TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD); 171 | 172 | // Run this several times and average the performance results 173 | clock_t elapsed_scan_clocks = 0; 174 | for (int i = 0; i < g_iterations; ++i) 175 | { 176 | // Run aggregate/prefix kernel 177 | BlockPrefixSumKernel<<<1, BLOCK_THREADS>>>( 178 | d_in, 179 | d_out, 180 | d_elapsed); 181 | 182 | // Copy results from device 183 | clock_t scan_clocks; 184 | 
cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost); 185 | cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost); 186 | elapsed_scan_clocks += scan_clocks; 187 | } 188 | 189 | // Check scanned items 190 | bool correct = true; 191 | for (int i = 0; i < TILE_SIZE; i++) 192 | { 193 | if (h_gpu[i] != h_reference[i]) 194 | { 195 | printf("Incorrect result @ offset %d (%d != %d)\n", 196 | i, h_gpu[i], h_reference[i]); 197 | correct = false; 198 | break; 199 | } 200 | } 201 | 202 | // Check total aggregate 203 | if (h_gpu[TILE_SIZE] != h_aggregate) 204 | { 205 | printf("Incorrect aggregate (%d != %d)\n", h_gpu[TILE_SIZE], h_aggregate); 206 | correct = false; 207 | } 208 | if (correct) printf("Correct!\n"); 209 | 210 | // Display results problem data 211 | if (g_verbose) 212 | { 213 | printf("GPU output (reference output): "); 214 | for (int i = 0; i < TILE_SIZE; i++) 215 | printf("%d (%d), ", h_gpu[i], h_reference[i]); 216 | printf("\n"); 217 | printf("GPU aggregate (reference aggregate): %d (%d)", h_gpu[TILE_SIZE], h_aggregate); 218 | printf("\n\n"); 219 | } 220 | 221 | // Display timing results 222 | printf("Average clocks per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations); 223 | 224 | // Cleanup 225 | if (h_in) delete[] h_in; 226 | if (h_reference) delete[] h_reference; 227 | if (h_gpu) delete[] h_gpu; 228 | if (d_in) cudaFree(d_in); 229 | if (d_out) cudaFree(d_out); 230 | if (d_elapsed) cudaFree(d_elapsed); 231 | } 232 | 233 | 234 | /** 235 | * Main 236 | */ 237 | int main(int argc, char** argv) 238 | { 239 | // Display GPU name 240 | cudaDeviceProp props; 241 | cudaGetDeviceProperties(&props, 0); 242 | printf("Using device %s\n", props.name); 243 | 244 | /** Add tests here **/ 245 | 246 | // Run tests 247 | Test<1024, 1>(); 248 | Test<512, 2>(); 249 | Test<256, 4>(); 250 | Test<128, 8>(); 251 | Test<64, 16>(); 252 | Test<32, 32>(); 253 | Test<16, 64>(); 254 | 255 | /****/ 256 | 257 | 
return 0; 258 | } 259 | 260 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gputimer.h: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TIMER_H__ 2 | #define __GPU_TIMER_H__ 3 | 4 | struct GpuTimer 5 | { 6 | cudaEvent_t start; 7 | cudaEvent_t stop; 8 | 9 | GpuTimer() 10 | { 11 | cudaEventCreate(&start); 12 | cudaEventCreate(&stop); 13 | } 14 | 15 | ~GpuTimer() 16 | { 17 | cudaEventDestroy(start); 18 | cudaEventDestroy(stop); 19 | } 20 | 21 | void Start() 22 | { 23 | cudaEventRecord(start, 0); 24 | } 25 | 26 | void Stop() 27 | { 28 | cudaEventRecord(stop, 0); 29 | } 30 | 31 | float Elapsed() 32 | { 33 | float elapsed; 34 | cudaEventSynchronize(stop); 35 | cudaEventElapsedTime(&elapsed, start, stop); 36 | return elapsed; 37 | } 38 | }; 39 | 40 | #endif /* __GPU_TIMER_H__ */ -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/thrust/thrust_example.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "gputimer.h" 10 | 11 | int main(void) 12 | { 13 | // generate N random numbers serially 14 | int N = 1000000; 15 | thrust::host_vector h_vec(N); 16 | std::generate(h_vec.begin(), h_vec.end(), rand); 17 | 18 | // transfer data to the device 19 | thrust::device_vector d_vec = h_vec; 20 | 21 | // sort data on the device (846M keys per second on GeForce GTX 480) 22 | GpuTimer timer; 23 | timer.Start(); 24 | thrust::sort(d_vec.begin(), d_vec.end()); 25 | timer.Stop(); 26 | 27 | // transfer data back to host 28 | thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin()); 29 | 30 | printf("Thrust sorted %d keys in %g ms\n", N, timer.Elapsed()); 31 | return 0; 32 | } 
-------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/tiling/gputimer.h: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TIMER_H__ 2 | #define __GPU_TIMER_H__ 3 | 4 | struct GpuTimer 5 | { 6 | cudaEvent_t start; 7 | cudaEvent_t stop; 8 | 9 | GpuTimer() 10 | { 11 | cudaEventCreate(&start); 12 | cudaEventCreate(&stop); 13 | } 14 | 15 | ~GpuTimer() 16 | { 17 | cudaEventDestroy(start); 18 | cudaEventDestroy(stop); 19 | } 20 | 21 | void Start() 22 | { 23 | cudaEventRecord(start, 0); 24 | } 25 | 26 | void Stop() 27 | { 28 | cudaEventRecord(stop, 0); 29 | } 30 | 31 | float Elapsed() 32 | { 33 | float elapsed; 34 | cudaEventSynchronize(stop); 35 | cudaEventElapsedTime(&elapsed, start, stop); 36 | return elapsed; 37 | } 38 | }; 39 | 40 | #endif /* __GPU_TIMER_H__ */ -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/tiling/tiling.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "gputimer.h" 3 | #include "utils.h" 4 | 5 | const int BLOCKSIZE = 128; 6 | const int NUMBLOCKS = 1000; // set this to 1 or 2 for debugging 7 | const int N = BLOCKSIZE*NUMBLOCKS; 8 | 9 | /* 10 | * TODO: modify the foo and bar kernels to use tiling: 11 | * - copy the input data to shared memory 12 | * - perform the computation there 13 | * - copy the result back to global memory 14 | * - assume thread blocks of 128 threads 15 | * - handle intra-block boundaries correctly 16 | * You can ignore boundary conditions (we ignore the first 2 and last 2 elements) 17 | */ 18 | __global__ void foo(float out[], float A[], float B[], float C[], float D[], float E[]){ 19 | 20 | int i = threadIdx.x + blockIdx.x*blockDim.x; 21 | 22 | out[i] = (A[i] + B[i] + C[i] + D[i] + E[i]) / 5.0f; 23 | } 24 | 25 | __global__ void bar(float out[], float in[]) 26 | { 27 | int 
i = threadIdx.x + blockIdx.x*blockDim.x; 28 | 29 | out[i] = (in[i-2] + in[i-1] + in[i] + in[i+1] + in[i+2]) / 5.0f; 30 | } 31 | 32 | void cpuFoo(float out[], float A[], float B[], float C[], float D[], float E[]) 33 | { 34 | for (int i=0; i>>(d_fooOut, d_fooA, d_fooB, d_fooC, d_fooD, d_fooE); 91 | fooTimer.Stop(); 92 | 93 | barTimer.Start(); 94 | bar<<>>(d_barOut, d_barIn); 95 | barTimer.Stop(); 96 | 97 | cudaMemcpy(fooOut, d_fooOut, numBytes, cudaMemcpyDeviceToHost); 98 | cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost); 99 | printf("foo<<<>>>(): %g ms elapsed. Verifying solution...", fooTimer.Elapsed()); 100 | compareArrays(ref_fooOut, fooOut, N); 101 | printf("bar<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed()); 102 | compareArrays(ref_barOut, barOut, N); 103 | } 104 | -------------------------------------------------------------------------------- /Lesson Code Snippets/Lesson 7 Code Snippets/tiling/utils.h: -------------------------------------------------------------------------------- 1 | // error checking utility functions 2 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 3 | 4 | template 5 | void check(T err, const char* const func, const char* const file, const int line) 6 | { 7 | if (err != cudaSuccess) { 8 | fprintf(stderr, "CUDA error at: %s : %d\n", file,line); 9 | fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);; 10 | exit(1); 11 | } 12 | } 13 | 14 | void printArray(float in[], int N) 15 | { 16 | for (int i=0; i CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. 
edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | 8 | # collect source files (listed explicitly: file(GLOB) does not notice added or removed files until CMake is re-run) 9 | 10 | set(hdr compare.h reference_calc.h timer.h utils.h) 11 | set(cu student_func.cu) 12 | set(HW1_files main.cpp reference_calc.cpp compare.cpp) 13 | 14 | cuda_add_executable(HW1 ${HW1_files} ${hdr} ${cu}) -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/HW1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "utils.h" 5 | #include 6 | #include 7 | #include 8 | 9 | cv::Mat imageRGBA; 10 | cv::Mat imageGrey; 11 | 12 | uchar4 *d_rgbaImage__; 13 | unsigned char *d_greyImage__; 14 | 15 | size_t numRows() { return imageRGBA.rows; } 16 | size_t numCols() { return imageRGBA.cols; } 17 | 18 | //return types are void since any internal error will be handled by quitting 19 | //no point in returning error codes... 
20 | //returns a pointer to an RGBA version of the input image 21 | //and a pointer to the single channel grey-scale output 22 | //on both the host and device 23 | void preProcess(uchar4 **inputImage, unsigned char **greyImage, 24 | uchar4 **d_rgbaImage, unsigned char **d_greyImage, 25 | const std::string &filename) { 26 | //make sure the context initializes ok 27 | checkCudaErrors(cudaFree(0)); 28 | 29 | cv::Mat image; 30 | image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); 31 | if (image.empty()) { 32 | std::cerr << "Couldn't open file: " << filename << std::endl; 33 | exit(1); 34 | } 35 | 36 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA); 37 | 38 | //allocate memory for the output 39 | imageGrey.create(image.rows, image.cols, CV_8UC1); 40 | 41 | //This shouldn't ever happen given the way the images are created 42 | //at least based upon my limited understanding of OpenCV, but better to check 43 | if (!imageRGBA.isContinuous() || !imageGrey.isContinuous()) { 44 | std::cerr << "Images aren't continuous!! Exiting." 
<< std::endl; 45 | exit(1); 46 | } 47 | 48 | *inputImage = (uchar4 *)imageRGBA.ptr(0); 49 | *greyImage = imageGrey.ptr(0); 50 | 51 | const size_t numPixels = numRows() * numCols(); 52 | //allocate memory on the device for both input and output 53 | checkCudaErrors(cudaMalloc(d_rgbaImage, sizeof(uchar4) * numPixels)); 54 | checkCudaErrors(cudaMalloc(d_greyImage, sizeof(unsigned char) * numPixels)); 55 | checkCudaErrors(cudaMemset(*d_greyImage, 0, numPixels * sizeof(unsigned char))); //make sure no memory is left laying around 56 | 57 | //copy input array to the GPU 58 | checkCudaErrors(cudaMemcpy(*d_rgbaImage, *inputImage, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice)); 59 | 60 | d_rgbaImage__ = *d_rgbaImage; 61 | d_greyImage__ = *d_greyImage; 62 | } 63 | 64 | void postProcess(const std::string& output_file, unsigned char* data_ptr) { 65 | cv::Mat output(numRows(), numCols(), CV_8UC1, (void*)data_ptr); 66 | 67 | //output the image 68 | cv::imwrite(output_file.c_str(), output); 69 | } 70 | 71 | void cleanup() 72 | { 73 | //cleanup 74 | cudaFree(d_rgbaImage__); 75 | cudaFree(d_greyImage__); 76 | } 77 | 78 | void generateReferenceImage(std::string input_filename, std::string output_filename) 79 | { 80 | cv::Mat reference = cv::imread(input_filename, CV_LOAD_IMAGE_GRAYSCALE); 81 | 82 | cv::imwrite(output_filename, reference); 83 | 84 | } 85 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | 3 | ################################### 4 | # These are the default install # 5 | # locations on most linux distros # 6 | ################################### 7 | 8 | OPENCV_LIBPATH=/usr/lib 9 | OPENCV_INCLUDEPATH=/usr/include 10 | 11 | ################################################### 12 | # On Macs the default install locations are below # 13 | ################################################### 14 | 15 | 
#OPENCV_LIBPATH=/usr/local/lib
#OPENCV_INCLUDEPATH=/usr/local/include

# or if using MacPorts

#OPENCV_LIBPATH=/opt/local/lib
#OPENCV_INCLUDEPATH=/opt/local/include

OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui

CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include

###################################################
# On Macs the default install locations are below #
###################################################

#CUDA_INCLUDEPATH=/usr/local/cuda/include
#CUDA_LIBPATH=/usr/local/cuda/lib

NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64

GCC_OPTS=-O3 -Wall -Wextra -m64

# Neither target names a file it produces, so keep make from being fooled by
# a stray file called "student" or "clean".
.PHONY: student clean

# Default goal: link the homework binary HW1.
student: main.o student_func.o compare.o reference_calc.o Makefile
	$(NVCC) -o HW1 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)

# main.cpp textually includes HW1.cpp and the two project headers, so they
# are listed as prerequisites alongside the existing ones.
main.o: main.cpp timer.h utils.h reference_calc.h compare.h reference_calc.cpp compare.cpp HW1.cpp
	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) -I $(OPENCV_INCLUDEPATH)

student_func.o: student_func.cu utils.h
	$(NVCC) -c student_func.cu $(NVCC_OPTS)

compare.o: compare.cpp compare.h
	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)

reference_calc.o: reference_calc.cpp reference_calc.h
	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)

# Remove objects, generated images and the linked binary (HW1, see "student").
clean:
	rm -f *.o *.png HW1
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/cinque_terre.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 1/cinque_terre.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/cinque_terre_small.jpg:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 1/cinque_terre_small.jpg -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/compare.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "utils.h" 6 | 7 | void compareImages(std::string reference_filename, std::string test_filename, 8 | bool useEpsCheck, double perPixelError, double globalError) 9 | { 10 | cv::Mat reference = cv::imread(reference_filename, -1); 11 | cv::Mat test = cv::imread(test_filename, -1); 12 | 13 | cv::Mat diff = abs(reference - test); 14 | 15 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows 16 | 17 | double minVal, maxVal; 18 | 19 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location 20 | 21 | //now perform transform so that we bump values to the full range 22 | 23 | diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal)); 24 | 25 | diff = diffSingleChannel.reshape(reference.channels(), 0); 26 | 27 | cv::imwrite("HW1_differenceImage.png", diff); 28 | //OK, now we can start comparing values... 
29 | unsigned char *referencePtr = reference.ptr(0); 30 | unsigned char *testPtr = test.ptr(0); 31 | 32 | if (useEpsCheck) { 33 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError); 34 | } 35 | else 36 | { 37 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels()); 38 | } 39 | 40 | std::cout << "PASS" << std::endl; 41 | return; 42 | } 43 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/compare.h: -------------------------------------------------------------------------------- 1 | #ifndef COMPARE_H__ 2 | #define COMPARE_H__ 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, 5 | bool useEpsCheck, double perPixelError, double globalError); 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/main.cpp: -------------------------------------------------------------------------------- 1 | //Udacity HW1 Solution 2 | 3 | #include 4 | #include "timer.h" 5 | #include "utils.h" 6 | #include 7 | #include 8 | #include "reference_calc.h" 9 | #include "compare.h" 10 | 11 | void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, 12 | uchar4 * const d_rgbaImage, 13 | unsigned char* const d_greyImage, 14 | size_t numRows, size_t numCols); 15 | 16 | //include the definitions of the above functions for this homework 17 | #include "HW1.cpp" 18 | 19 | int main(int argc, char **argv) { 20 | uchar4 *h_rgbaImage, *d_rgbaImage; 21 | unsigned char *h_greyImage, *d_greyImage; 22 | 23 | std::string input_file; 24 | std::string output_file; 25 | std::string reference_file; 26 | double perPixelError = 0.0; 27 | double globalError = 0.0; 28 | bool useEpsCheck = false; 29 | switch (argc) 30 | { 31 | case 2: 32 | input_file = std::string(argv[1]); 33 | output_file = "HW1_output.png"; 34 | reference_file = 
"HW1_reference.png"; 35 | break; 36 | case 3: 37 | input_file = std::string(argv[1]); 38 | output_file = std::string(argv[2]); 39 | reference_file = "HW1_reference.png"; 40 | break; 41 | case 4: 42 | input_file = std::string(argv[1]); 43 | output_file = std::string(argv[2]); 44 | reference_file = std::string(argv[3]); 45 | break; 46 | case 6: 47 | useEpsCheck=true; 48 | input_file = std::string(argv[1]); 49 | output_file = std::string(argv[2]); 50 | reference_file = std::string(argv[3]); 51 | perPixelError = atof(argv[4]); 52 | globalError = atof(argv[5]); 53 | break; 54 | default: 55 | std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl; 56 | exit(1); 57 | } 58 | //load the image and give us our input and output pointers 59 | preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file); 60 | 61 | GpuTimer timer; 62 | timer.Start(); 63 | //call the students' code 64 | your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols()); 65 | timer.Stop(); 66 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 67 | 68 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); 69 | 70 | if (err < 0) { 71 | //Couldn't print! Probably the student closed stdout - bad news 72 | std::cerr << "Couldn't print timing information! STDOUT Closed!" 
<< std::endl; 73 | exit(1); 74 | } 75 | 76 | size_t numPixels = numRows()*numCols(); 77 | checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost)); 78 | 79 | //check results and output the grey image 80 | postProcess(output_file, h_greyImage); 81 | 82 | referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols()); 83 | 84 | postProcess(reference_file, h_greyImage); 85 | 86 | //generateReferenceImage(input_file, reference_file); 87 | compareImages(reference_file, output_file, useEpsCheck, perPixelError, 88 | globalError); 89 | 90 | cleanup(); 91 | 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/reference_calc.cpp: -------------------------------------------------------------------------------- 1 | // for uchar4 struct 2 | #include 3 | 4 | void referenceCalculation(const uchar4* const rgbaImage, 5 | unsigned char *const greyImage, 6 | size_t numRows, 7 | size_t numCols) 8 | { 9 | for (size_t r = 0; r < numRows; ++r) { 10 | for (size_t c = 0; c < numCols; ++c) { 11 | uchar4 rgba = rgbaImage[r * numCols + c]; 12 | float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z; 13 | greyImage[r * numCols + c] = channelSum; 14 | } 15 | } 16 | } 17 | 18 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/reference_calc.h: -------------------------------------------------------------------------------- 1 | #ifndef REFERENCE_H__ 2 | #define REFERENCE_H__ 3 | 4 | void referenceCalculation(const uchar4* const rgbaImage, 5 | unsigned char *const greyImage, 6 | size_t numRows, 7 | size_t numCols); 8 | 9 | #endif -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/student_func.cu: -------------------------------------------------------------------------------- 1 | // Homework 1 2 | // Color to Greyscale 
Conversion 3 | 4 | //A common way to represent color images is known as RGBA - the color 5 | //is specified by how much Red, Green and Blue is in it. 6 | //The 'A' stands for Alpha and is used for transparency; it will be 7 | //ignored in this homework. 8 | 9 | //Each channel Red, Green, Blue and Alpha is represented by one byte. 10 | //Since we are using one byte for each color there are 256 different 11 | //possible values for each color. This means we use 4 bytes per pixel. 12 | 13 | //Greyscale images are represented by a single intensity value per pixel 14 | //which is one byte in size. 15 | 16 | //To convert an image from color to greyscale one simple method is to 17 | //set the intensity to the average of the RGB channels. But we will 18 | //use a more sophisticated method that takes into account how the eye 19 | //perceives color and weights the channels unequally. 20 | 21 | //The eye responds most strongly to green followed by red and then blue. 22 | //The NTSC (National Television System Committee) recommends the following 23 | //formula for color to greyscale conversion: 24 | 25 | //I = .299f * R + .587f * G + .114f * B 26 | 27 | //Notice the trailing f's on the numbers which indicate that they are 28 | //single precision floating point constants and not double precision 29 | //constants. 30 | 31 | //You should fill in the kernel as well as set the block and grid sizes 32 | //so that the entire image is processed. 
33 | 34 | #include "utils.h" 35 | 36 | __global__ 37 | void rgba_to_greyscale(const uchar4* const rgbaImage, 38 | unsigned char* const greyImage, 39 | int numRows, int numCols) 40 | { 41 | //TODO 42 | //Fill in the kernel to convert from color to greyscale 43 | //the mapping from components of a uchar4 to RGBA is: 44 | // .x -> R ; .y -> G ; .z -> B ; .w -> A 45 | // 46 | //The output (greyImage) at each pixel should be the result of 47 | //applying the formula: output = .299f * R + .587f * G + .114f * B; 48 | //Note: We will be ignoring the alpha channel for this conversion 49 | 50 | //First create a mapping from the 2D block and grid locations 51 | //to an absolute 2D location in the image, then use that to 52 | //calculate a 1D offset 53 | } 54 | 55 | void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage, 56 | unsigned char* const d_greyImage, size_t numRows, size_t numCols) 57 | { 58 | //You must fill in the correct sizes for the blockSize and gridSize 59 | //currently only one block with one thread is being launched 60 | const dim3 blockSize(1, 1, 1); //TODO 61 | const dim3 gridSize( 1, 1, 1); //TODO 62 | rgba_to_greyscale<<>>(d_rgbaImage, d_greyImage, numRows, numCols); 63 | 64 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 65 | 66 | } 67 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | 
float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 1/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | template 15 | void check(T err, const char* const func, const char* const file, const int line) { 16 | if (err != cudaSuccess) { 17 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 18 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 19 | exit(1); 20 | } 21 | } 22 | 23 | template 24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) { 25 | //check that the GPU result matches the CPU result 26 | for (size_t i = 0; i < numElem; ++i) { 27 | if (ref[i] != gpu[i]) { 28 | std::cerr << "Difference at pos " << i << std::endl; 29 | //the + is magic to convert char to int without messing 30 | //with other types 31 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 32 | "\nGPU : " << +gpu[i] << std::endl; 33 | exit(1); 34 | } 35 | } 36 | } 37 | 38 | template 39 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) { 40 | assert(eps1 >= 0 && eps2 >= 0); 41 | unsigned long long totalDiff = 0; 42 | unsigned numSmallDifferences = 0; 43 | for (size_t i = 0; i < numElem; ++i) { 44 | //subtract smaller from larger in case of unsigned types 45 | T smaller = std::min(ref[i], gpu[i]); 46 | T larger = std::max(ref[i], gpu[i]); 47 | T diff = larger - smaller; 48 | if (diff > 0 && diff <= eps1) { 49 | numSmallDifferences++; 50 | } 51 
| else if (diff > eps1) { 52 | std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl; 53 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 54 | "\nGPU : " << +gpu[i] << std::endl; 55 | exit(1); 56 | } 57 | totalDiff += diff * diff; 58 | } 59 | double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; 60 | if (percentSmallDifferences > eps2) { 61 | std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl; 62 | std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl; 63 | exit(1); 64 | } 65 | } 66 | 67 | //Uses the autodesk method of image comparison 68 | //Note the the tolerance here is in PIXELS not a percentage of input pixels 69 | template 70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance) 71 | { 72 | 73 | size_t numBadPixels = 0; 74 | for (size_t i = 0; i < numElem; ++i) { 75 | T smaller = std::min(ref[i], gpu[i]); 76 | T larger = std::max(ref[i], gpu[i]); 77 | T diff = larger - smaller; 78 | if (diff > variance) 79 | ++numBadPixels; 80 | } 81 | 82 | if (numBadPixels > tolerance) { 83 | std::cerr << "Too many bad pixels in the image." 
<< numBadPixels << "/" << tolerance << std::endl; 84 | exit(1); 85 | } 86 | } 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 2.zip -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | 8 | # collect source files 9 | 10 | file( GLOB hdr *.hpp *.h ) 11 | file( GLOB cu *.cu) 12 | SET (HW2_files main.cpp reference_calc.cpp compare.cpp) 13 | 14 | CUDA_ADD_EXECUTABLE(HW2 ${HW2_files} ${hdr} ${cu}) 15 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/HW2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "utils.h" 5 | #include 6 | #include 7 | #include 8 | 9 | cv::Mat imageInputRGBA; 10 | cv::Mat imageOutputRGBA; 11 | 12 | uchar4 *d_inputImageRGBA__; 13 | uchar4 *d_outputImageRGBA__; 14 | 15 | float *h_filter__; 16 | 17 | size_t numRows() { return imageInputRGBA.rows; } 18 | size_t numCols() { return imageInputRGBA.cols; } 19 | 20 | //return types are void since any internal error will be handled by quitting 21 | //no point in returning error codes... 
22 | //returns a pointer to an RGBA version of the input image 23 | //and a pointer to the single channel grey-scale output 24 | //on both the host and device 25 | void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA, 26 | uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA, 27 | unsigned char **d_redBlurred, 28 | unsigned char **d_greenBlurred, 29 | unsigned char **d_blueBlurred, 30 | float **h_filter, int *filterWidth, 31 | const std::string &filename) { 32 | 33 | //make sure the context initializes ok 34 | checkCudaErrors(cudaFree(0)); 35 | 36 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); 37 | if (image.empty()) { 38 | std::cerr << "Couldn't open file: " << filename << std::endl; 39 | exit(1); 40 | } 41 | 42 | cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA); 43 | 44 | //allocate memory for the output 45 | imageOutputRGBA.create(image.rows, image.cols, CV_8UC4); 46 | 47 | //This shouldn't ever happen given the way the images are created 48 | //at least based upon my limited understanding of OpenCV, but better to check 49 | if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) { 50 | std::cerr << "Images aren't continuous!! Exiting." 
<< std::endl; 51 | exit(1); 52 | } 53 | 54 | *h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr(0); 55 | *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr(0); 56 | 57 | const size_t numPixels = numRows() * numCols(); 58 | //allocate memory on the device for both input and output 59 | checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels)); 60 | checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels)); 61 | checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around 62 | 63 | //copy input array to the GPU 64 | checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice)); 65 | 66 | d_inputImageRGBA__ = *d_inputImageRGBA; 67 | d_outputImageRGBA__ = *d_outputImageRGBA; 68 | 69 | //now create the filter that they will use 70 | const int blurKernelWidth = 9; 71 | const float blurKernelSigma = 2.; 72 | 73 | *filterWidth = blurKernelWidth; 74 | 75 | //create and fill the filter we will convolve with 76 | *h_filter = new float[blurKernelWidth * blurKernelWidth]; 77 | h_filter__ = *h_filter; 78 | 79 | float filterSum = 0.f; //for normalization 80 | 81 | for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { 82 | for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { 83 | float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma)); 84 | (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue; 85 | filterSum += filterValue; 86 | } 87 | } 88 | 89 | float normalizationFactor = 1.f / filterSum; 90 | 91 | for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { 92 | for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { 93 | (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor; 94 | } 95 | } 96 | 97 | //blurred 98 | checkCudaErrors(cudaMalloc(d_redBlurred, sizeof(unsigned 
char) * numPixels)); 99 | checkCudaErrors(cudaMalloc(d_greenBlurred, sizeof(unsigned char) * numPixels)); 100 | checkCudaErrors(cudaMalloc(d_blueBlurred, sizeof(unsigned char) * numPixels)); 101 | checkCudaErrors(cudaMemset(*d_redBlurred, 0, sizeof(unsigned char) * numPixels)); 102 | checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels)); 103 | checkCudaErrors(cudaMemset(*d_blueBlurred, 0, sizeof(unsigned char) * numPixels)); 104 | } 105 | 106 | void postProcess(const std::string& output_file, uchar4* data_ptr) { 107 | cv::Mat output(numRows(), numCols(), CV_8UC4, (void*)data_ptr); 108 | 109 | cv::Mat imageOutputBGR; 110 | cv::cvtColor(output, imageOutputBGR, CV_RGBA2BGR); 111 | //output the image 112 | cv::imwrite(output_file.c_str(), imageOutputBGR); 113 | } 114 | 115 | void cleanUp(void) 116 | { 117 | cudaFree(d_inputImageRGBA__); 118 | cudaFree(d_outputImageRGBA__); 119 | delete[] h_filter__; 120 | } 121 | 122 | 123 | // An unused bit of code showing how to accomplish this assignment using OpenCV. It is much faster 124 | // than the naive implementation in reference_calc.cpp. 
125 | void generateReferenceImage(std::string input_file, std::string reference_file, int kernel_size) 126 | { 127 | cv::Mat input = cv::imread(input_file); 128 | // Create an identical image for the output as a placeholder 129 | cv::Mat reference = cv::imread(input_file); 130 | cv::GaussianBlur(input, reference, cv::Size2i(kernel_size, kernel_size),0); 131 | cv::imwrite(reference_file, reference); 132 | } 133 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | 3 | ################################### 4 | # These are the default install # 5 | # locations on most linux distros # 6 | ################################### 7 | 8 | OPENCV_LIBPATH=/usr/lib 9 | OPENCV_INCLUDEPATH=/usr/include 10 | 11 | ################################################### 12 | # On Macs the default install locations are below # 13 | ################################################### 14 | 15 | #OPENCV_LIBPATH=/usr/local/lib 16 | #OPENCV_INCLUDEPATH=/usr/local/include 17 | 18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui 19 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include 20 | 21 | ###################################################### 22 | # On Macs the default install locations are below # 23 | # #################################################### 24 | 25 | #CUDA_INCLUDEPATH=/usr/local/cuda/include 26 | #CUDA_LIBPATH=/usr/local/cuda/lib 27 | 28 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 29 | 30 | GCC_OPTS=-O3 -Wall -Wextra -m64 31 | 32 | student: main.o student_func.o compare.o reference_calc.o Makefile 33 | $(NVCC) -o HW2 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) 34 | 35 | main.o: main.cpp timer.h utils.h HW2.cpp 36 | g++ -c main.cpp $(GCC_OPTS) -I $(OPENCV_INCLUDEPATH) -I $(CUDA_INCLUDEPATH) 37 | 38 | student_func.o: 
student_func.cu reference_calc.cpp utils.h 39 | nvcc -c student_func.cu $(NVCC_OPTS) 40 | 41 | compare.o: compare.cpp compare.h 42 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 43 | 44 | reference_calc.o: reference_calc.cpp reference_calc.h 45 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 46 | 47 | clean: 48 | rm -f *.o *.png hw 49 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/cinque_terre.gold: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 2/cinque_terre.gold -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/cinque_terre_small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 2/cinque_terre_small.jpg -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/compare.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "utils.h" 6 | 7 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 8 | double perPixelError, double globalError) 9 | { 10 | cv::Mat reference = cv::imread(reference_filename, -1); 11 | cv::Mat test = cv::imread(test_filename, -1); 12 | 13 | cv::Mat diff = abs(reference - test); 14 | 15 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows 16 | 17 | double minVal, maxVal; 18 | 19 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location 20 | 21 | //now perform transform so that we bump values to 
the full range 22 | 23 | diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal)); 24 | 25 | diff = diffSingleChannel.reshape(reference.channels(), 0); 26 | 27 | cv::imwrite("HW2_differenceImage.png", diff); 28 | //OK, now we can start comparing values... 29 | unsigned char *referencePtr = reference.ptr(0); 30 | unsigned char *testPtr = test.ptr(0); 31 | 32 | if (useEpsCheck) { 33 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError); 34 | } 35 | else 36 | { 37 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels()); 38 | } 39 | 40 | std::cout << "PASS" << std::endl; 41 | return; 42 | } -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/compare.h: -------------------------------------------------------------------------------- 1 | #ifndef COMPARE_H__ 2 | #define COMPARE_H__ 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError); 6 | 7 | #endif -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/main.cpp: -------------------------------------------------------------------------------- 1 | //Udacity HW2 Driver 2 | 3 | #include 4 | #include "timer.h" 5 | #include "utils.h" 6 | #include 7 | #include 8 | 9 | #include "reference_calc.h" 10 | #include "compare.h" 11 | 12 | //include the definitions of the above functions for this homework 13 | #include "HW2.cpp" 14 | 15 | 16 | /******* DEFINED IN student_func.cu *********/ 17 | 18 | void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_inputImageRGBA, 19 | uchar4* const d_outputImageRGBA, 20 | const size_t numRows, const size_t numCols, 21 | unsigned char *d_redBlurred, 22 | unsigned char *d_greenBlurred, 23 | unsigned char *d_blueBlurred, 24 | const int 
filterWidth); 25 | 26 | void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsImage, 27 | const float* const h_filter, const size_t filterWidth); 28 | 29 | 30 | /******* Begin main *********/ 31 | 32 | int main(int argc, char **argv) { 33 | uchar4 *h_inputImageRGBA, *d_inputImageRGBA; 34 | uchar4 *h_outputImageRGBA, *d_outputImageRGBA; 35 | unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred; 36 | 37 | float *h_filter; 38 | int filterWidth; 39 | 40 | std::string input_file; 41 | std::string output_file; 42 | std::string reference_file; 43 | double perPixelError = 0.0; 44 | double globalError = 0.0; 45 | bool useEpsCheck = false; 46 | switch (argc) 47 | { 48 | case 2: 49 | input_file = std::string(argv[1]); 50 | output_file = "HW2_output.png"; 51 | reference_file = "HW2_reference.png"; 52 | break; 53 | case 3: 54 | input_file = std::string(argv[1]); 55 | output_file = std::string(argv[2]); 56 | reference_file = "HW2_reference.png"; 57 | break; 58 | case 4: 59 | input_file = std::string(argv[1]); 60 | output_file = std::string(argv[2]); 61 | reference_file = std::string(argv[3]); 62 | break; 63 | case 6: 64 | useEpsCheck=true; 65 | input_file = std::string(argv[1]); 66 | output_file = std::string(argv[2]); 67 | reference_file = std::string(argv[3]); 68 | perPixelError = atof(argv[4]); 69 | globalError = atof(argv[5]); 70 | break; 71 | default: 72 | std::cerr << "Usage: ./HW2 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl; 73 | exit(1); 74 | } 75 | //load the image and give us our input and output pointers 76 | preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA, 77 | &d_redBlurred, &d_greenBlurred, &d_blueBlurred, 78 | &h_filter, &filterWidth, input_file); 79 | 80 | allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth); 81 | GpuTimer timer; 82 | timer.Start(); 83 | //call the students' code 84 | your_gaussian_blur(h_inputImageRGBA, 
d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(), 85 | d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth); 86 | timer.Stop(); 87 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 88 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); 89 | 90 | if (err < 0) { 91 | //Couldn't print! Probably the student closed stdout - bad news 92 | std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; 93 | exit(1); 94 | } 95 | 96 | //check results and output the blurred image 97 | 98 | size_t numPixels = numRows()*numCols(); 99 | //copy the output back to the host 100 | checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost)); 101 | 102 | postProcess(output_file, h_outputImageRGBA); 103 | 104 | referenceCalculation(h_inputImageRGBA, h_outputImageRGBA, 105 | numRows(), numCols(), 106 | h_filter, filterWidth); 107 | 108 | postProcess(reference_file, h_outputImageRGBA); 109 | 110 | // Cheater easy way with OpenCV 111 | //generateReferenceImage(input_file, reference_file, filterWidth); 112 | 113 | compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError); 114 | 115 | checkCudaErrors(cudaFree(d_redBlurred)); 116 | checkCudaErrors(cudaFree(d_greenBlurred)); 117 | checkCudaErrors(cudaFree(d_blueBlurred)); 118 | 119 | cleanUp(); 120 | 121 | return 0; 122 | } 123 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/reference_calc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | // for uchar4 struct 4 | #include 5 | 6 | void channelConvolution(const unsigned char* const channel, 7 | unsigned char* const channelBlurred, 8 | const size_t numRows, const size_t numCols, 9 | const float *filter, const int filterWidth) 10 | { 11 | //Dealing with an even width filter is trickier 12 | assert(filterWidth % 2 == 1); 13 | 
14 | //For every pixel in the image 15 | for (int r = 0; r < (int)numRows; ++r) { 16 | for (int c = 0; c < (int)numCols; ++c) { 17 | float result = 0.f; 18 | //For every value in the filter around the pixel (c, r) 19 | for (int filter_r = -filterWidth/2; filter_r <= filterWidth/2; ++filter_r) { 20 | for (int filter_c = -filterWidth/2; filter_c <= filterWidth/2; ++filter_c) { 21 | //Find the global image position for this filter position 22 | //clamp to boundary of the image 23 | int image_r = std::min(std::max(r + filter_r, 0), static_cast(numRows - 1)); 24 | int image_c = std::min(std::max(c + filter_c, 0), static_cast(numCols - 1)); 25 | 26 | float image_value = static_cast(channel[image_r * numCols + image_c]); 27 | float filter_value = filter[(filter_r + filterWidth/2) * filterWidth + filter_c + filterWidth/2]; 28 | 29 | result += image_value * filter_value; 30 | } 31 | } 32 | 33 | channelBlurred[r * numCols + c] = result; 34 | } 35 | } 36 | } 37 | 38 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage, 39 | size_t numRows, size_t numCols, 40 | const float* const filter, const int filterWidth) 41 | { 42 | unsigned char *red = new unsigned char[numRows * numCols]; 43 | unsigned char *blue = new unsigned char[numRows * numCols]; 44 | unsigned char *green = new unsigned char[numRows * numCols]; 45 | 46 | unsigned char *redBlurred = new unsigned char[numRows * numCols]; 47 | unsigned char *blueBlurred = new unsigned char[numRows * numCols]; 48 | unsigned char *greenBlurred = new unsigned char[numRows * numCols]; 49 | 50 | //First we separate the incoming RGBA image into three separate channels 51 | //for Red, Green and Blue 52 | for (size_t i = 0; i < numRows * numCols; ++i) { 53 | uchar4 rgba = rgbaImage[i]; 54 | red[i] = rgba.x; 55 | green[i] = rgba.y; 56 | blue[i] = rgba.z; 57 | } 58 | 59 | //Now we can do the convolution for each of the color channels 60 | channelConvolution(red, redBlurred, numRows, numCols, filter, 
filterWidth); 61 | channelConvolution(green, greenBlurred, numRows, numCols, filter, filterWidth); 62 | channelConvolution(blue, blueBlurred, numRows, numCols, filter, filterWidth); 63 | 64 | //now recombine into the output image - Alpha is 255 for no transparency 65 | for (size_t i = 0; i < numRows * numCols; ++i) { 66 | uchar4 rgba = make_uchar4(redBlurred[i], greenBlurred[i], blueBlurred[i], 255); 67 | outputImage[i] = rgba; 68 | } 69 | 70 | delete[] red; 71 | delete[] green; 72 | delete[] blue; 73 | 74 | delete[] redBlurred; 75 | delete[] greenBlurred; 76 | delete[] blueBlurred; 77 | } 78 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/reference_calc.h: -------------------------------------------------------------------------------- 1 | #ifndef REFERENCE_H__ 2 | #define REFERENCE_H__ 3 | 4 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage, 5 | size_t numRows, size_t numCols, 6 | const float* const filter, const int filterWidth); 7 | 8 | #endif -------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | 
-------------------------------------------------------------------------------- /Problem Sets/Problem Set 2/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | template 15 | void check(T err, const char* const func, const char* const file, const int line) { 16 | if (err != cudaSuccess) { 17 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 18 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 19 | exit(1); 20 | } 21 | } 22 | 23 | template 24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) { 25 | //check that the GPU result matches the CPU result 26 | for (size_t i = 0; i < numElem; ++i) { 27 | if (ref[i] != gpu[i]) { 28 | std::cerr << "Difference at pos " << i << std::endl; 29 | //the + is magic to convert char to int without messing 30 | //with other types 31 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 32 | "\nGPU : " << +gpu[i] << std::endl; 33 | exit(1); 34 | } 35 | } 36 | } 37 | 38 | template 39 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) { 40 | assert(eps1 >= 0 && eps2 >= 0); 41 | unsigned long long totalDiff = 0; 42 | unsigned numSmallDifferences = 0; 43 | for (size_t i = 0; i < numElem; ++i) { 44 | //subtract smaller from larger in case of unsigned types 45 | T smaller = std::min(ref[i], gpu[i]); 46 | T larger = std::max(ref[i], gpu[i]); 47 | T diff = larger - smaller; 48 | if (diff > 0 && diff <= eps1) { 49 | numSmallDifferences++; 50 | } 51 | else if (diff > eps1) { 52 | std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl; 53 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 54 | "\nGPU 
: " << +gpu[i] << std::endl; 55 | exit(1); 56 | } 57 | totalDiff += diff * diff; 58 | } 59 | double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; 60 | if (percentSmallDifferences > eps2) { 61 | std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl; 62 | std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl; 63 | exit(1); 64 | } 65 | } 66 | 67 | //Uses the autodesk method of image comparison 68 | //Note that the tolerance here is in PIXELS not a percentage of input pixels 69 | template 70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance) 71 | { 72 | 73 | size_t numBadPixels = 0; 74 | for (size_t i = 0; i < numElem; ++i) { 75 | T smaller = std::min(ref[i], gpu[i]); 76 | T larger = std::max(ref[i], gpu[i]); 77 | T diff = larger - smaller; 78 | if (diff > variance) 79 | ++numBadPixels; 80 | } 81 | 82 | if (numBadPixels > tolerance) { 83 | std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl; 84 | exit(1); 85 | } 86 | } 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3.zip -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. 
edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | # minimum required cmake version 8 | cmake_minimum_required(VERSION 2.8) 9 | find_package(CUDA QUIET REQUIRED) 10 | 11 | SET (compare_files compare.cpp) 12 | 13 | file( GLOB hdr *.hpp *.h ) 14 | file( GLOB cu *.cu) 15 | SET (HW3_files main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp) 16 | 17 | CUDA_ADD_EXECUTABLE(HW3 ${HW3_files} ${hdr} ${cu}) 18 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | 3 | ################################### 4 | # These are the default install # 5 | # locations on most linux distros # 6 | ################################### 7 | 8 | OPENCV_LIBPATH=/usr/lib 9 | OPENCV_INCLUDEPATH=/usr/include 10 | 11 | ################################################### 12 | # On Macs the default install locations are below # 13 | ################################################### 14 | 15 | #OPENCV_LIBPATH=/usr/local/lib 16 | #OPENCV_INCLUDEPATH=/usr/local/include 17 | 18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui 19 | 20 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include 21 | 22 | ###################################################### 23 | # On Macs the default install locations are below # 24 | # #################################################### 25 | 26 | #CUDA_INCLUDEPATH=/usr/local/cuda/include 27 | #CUDA_LIBPATH=/usr/local/cuda/lib 28 | 29 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 30 | 31 | GCC_OPTS=-O3 -Wall -Wextra -m64 32 | 33 | student: main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o Makefile 34 | $(NVCC) -o HW3 main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) 
$(NVCC_OPTS) 35 | 36 | main.o: main.cpp timer.h utils.h reference_calc.h compare.h 37 | g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 38 | 39 | HW3.o: HW3.cu loadSaveImage.h utils.h 40 | $(NVCC) -c HW3.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS) 41 | 42 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h 43 | g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 44 | 45 | compare.o: compare.cpp compare.h 46 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 47 | 48 | reference_calc.o: reference_calc.cpp reference_calc.h 49 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 50 | 51 | student_func.o: student_func.cu utils.h 52 | $(NVCC) -c student_func.cu $(NVCC_OPTS) 53 | 54 | clean: 55 | rm -f *.o hw 56 | find . -type f -name '*.exr' | grep -v memorial | xargs rm -f 57 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/compare.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "utils.h" 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError) 6 | { 7 | cv::Mat reference = cv::imread(reference_filename, -1); 8 | cv::Mat test = cv::imread(test_filename, -1); 9 | 10 | cv::Mat diff = abs(reference - test); 11 | 12 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows 13 | 14 | double minVal, maxVal; 15 | 16 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location 17 | 18 | //now perform transform so that we bump values to the full range 19 | 20 | diffSingleChannel = (diffSingleChannel - minVal) * (255. 
/ (maxVal - minVal)); 21 | 22 | diff = diffSingleChannel.reshape(reference.channels(), 0); 23 | 24 | cv::imwrite("HW3_differenceImage.png", diff); 25 | //OK, now we can start comparing values... 26 | unsigned char *referencePtr = reference.ptr(0); 27 | unsigned char *testPtr = test.ptr(0); 28 | 29 | if (useEpsCheck) { 30 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError); 31 | } 32 | else 33 | { 34 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels()); 35 | } 36 | 37 | std::cout << "PASS" << std::endl; 38 | return; 39 | } 40 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/compare.h: -------------------------------------------------------------------------------- 1 | #ifndef HW3_H__ 2 | #define HW3_H__ 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError); 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/loadSaveImage.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "cuda_runtime.h" 7 | 8 | //The caller becomes responsible for the returned pointer. This 9 | //is done in the interest of keeping this code as simple as possible. 10 | //In production code this is a bad idea - we should use RAII 11 | //to ensure the memory is freed. DO NOT COPY THIS AND USE IN PRODUCTION 12 | //CODE!!! 
13 | void loadImageHDR(const std::string &filename, 14 | float **imagePtr, 15 | size_t *numRows, size_t *numCols) 16 | { 17 | cv::Mat originImg = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH); 18 | 19 | cv::Mat image; 20 | 21 | if(originImg.type() != CV_32FC3){ 22 | originImg.convertTo(image,CV_32FC3); 23 | } else{ 24 | image = originImg; 25 | } 26 | 27 | if (image.empty()) { 28 | std::cerr << "Couldn't open file: " << filename << std::endl; 29 | exit(1); 30 | } 31 | 32 | if (image.channels() != 3) { 33 | std::cerr << "Image must be color!" << std::endl; 34 | exit(1); 35 | } 36 | 37 | if (!image.isContinuous()) { 38 | std::cerr << "Image isn't continuous!" << std::endl; 39 | exit(1); 40 | } 41 | 42 | *imagePtr = new float[image.rows * image.cols * image.channels()]; 43 | 44 | float *cvPtr = image.ptr(0); 45 | for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i) 46 | (*imagePtr)[i] = cvPtr[i]; 47 | 48 | *numRows = image.rows; 49 | *numCols = image.cols; 50 | } 51 | 52 | void loadImageRGBA(const std::string &filename, 53 | uchar4 **imagePtr, 54 | size_t *numRows, size_t *numCols) 55 | { 56 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); 57 | if (image.empty()) { 58 | std::cerr << "Couldn't open file: " << filename << std::endl; 59 | exit(1); 60 | } 61 | 62 | if (image.channels() != 3) { 63 | std::cerr << "Image must be color!" << std::endl; 64 | exit(1); 65 | } 66 | 67 | if (!image.isContinuous()) { 68 | std::cerr << "Image isn't continuous!" 
<< std::endl; 69 | exit(1); 70 | } 71 | 72 | cv::Mat imageRGBA; 73 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA); 74 | 75 | *imagePtr = new uchar4[image.rows * image.cols]; 76 | 77 | unsigned char *cvPtr = imageRGBA.ptr(0); 78 | for (size_t i = 0; i < image.rows * image.cols; ++i) { 79 | (*imagePtr)[i].x = cvPtr[4 * i + 0]; 80 | (*imagePtr)[i].y = cvPtr[4 * i + 1]; 81 | (*imagePtr)[i].z = cvPtr[4 * i + 2]; 82 | (*imagePtr)[i].w = cvPtr[4 * i + 3]; 83 | } 84 | 85 | *numRows = image.rows; 86 | *numCols = image.cols; 87 | } 88 | 89 | void saveImageRGBA(const uchar4* const image, 90 | const size_t numRows, const size_t numCols, 91 | const std::string &output_file) 92 | { 93 | int sizes[2]; 94 | sizes[0] = numRows; 95 | sizes[1] = numCols; 96 | cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image); 97 | cv::Mat imageOutputBGR; 98 | cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR); 99 | //output the image 100 | cv::imwrite(output_file.c_str(), imageOutputBGR); 101 | } 102 | 103 | //output an exr file 104 | //assumed to already be BGR 105 | void saveImageHDR(const float* const image, 106 | const size_t numRows, const size_t numCols, 107 | const std::string &output_file) 108 | { 109 | int sizes[2]; 110 | sizes[0] = numRows; 111 | sizes[1] = numCols; 112 | 113 | cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image); 114 | 115 | imageHDR = imageHDR * 255; 116 | 117 | cv::imwrite(output_file.c_str(), imageHDR); 118 | } 119 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/loadSaveImage.h: -------------------------------------------------------------------------------- 1 | #ifndef LOADSAVEIMAGE_H__ 2 | #define LOADSAVEIMAGE_H__ 3 | 4 | #include 5 | #include //for uchar4 6 | 7 | void loadImageHDR(const std::string &filename, 8 | float **imagePtr, 9 | size_t *numRows, size_t *numCols); 10 | 11 | void loadImageRGBA(const std::string &filename, 12 | uchar4 **imagePtr, 13 | size_t *numRows, size_t *numCols); 14 | 15 | 
void saveImageRGBA(const uchar4* const image, 16 | const size_t numRows, const size_t numCols, 17 | const std::string &output_file); 18 | 19 | void saveImageHDR(const float* const image, 20 | const size_t numRows, const size_t numCols, 21 | const std::string &output_file); 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/main.cpp: -------------------------------------------------------------------------------- 1 | //Udacity HW3 Driver 2 | 3 | #include 4 | #include "timer.h" 5 | #include "utils.h" 6 | #include 7 | #include 8 | #include 9 | 10 | #include "compare.h" 11 | #include "reference_calc.h" 12 | 13 | // Functions from HW3.cu 14 | void preProcess(float **d_luminance, unsigned int **d_cdf, 15 | size_t *numRows, size_t *numCols, unsigned int *numBins, 16 | const std::string& filename); 17 | 18 | void postProcess(const std::string& output_file, size_t numRows, size_t numCols, 19 | float min_logLum, float max_logLum); 20 | 21 | void cleanupGlobalMemory(void); 22 | 23 | // Function from student_func.cu 24 | void your_histogram_and_prefixsum(const float* const d_luminance, 25 | unsigned int* const d_cdf, 26 | float &min_logLum, 27 | float &max_logLum, 28 | const size_t numRows, 29 | const size_t numCols, 30 | const size_t numBins); 31 | 32 | 33 | int main(int argc, char **argv) { 34 | float *d_luminance; 35 | unsigned int *d_cdf; 36 | 37 | size_t numRows, numCols; 38 | unsigned int numBins; 39 | 40 | std::string input_file; 41 | std::string output_file; 42 | std::string reference_file; 43 | double perPixelError = 0.0; 44 | double globalError = 0.0; 45 | bool useEpsCheck = false; 46 | 47 | switch (argc) 48 | { 49 | case 2: 50 | input_file = std::string(argv[1]); 51 | output_file = "HW3_output.png"; 52 | reference_file = "HW3_reference.png"; 53 | break; 54 | case 3: 55 | input_file = std::string(argv[1]); 56 | output_file = std::string(argv[2]); 57 | reference_file = "HW3_reference.png"; 
58 | break; 59 | case 4: 60 | input_file = std::string(argv[1]); 61 | output_file = std::string(argv[2]); 62 | reference_file = std::string(argv[3]); 63 | break; 64 | case 6: 65 | useEpsCheck=true; 66 | input_file = std::string(argv[1]); 67 | output_file = std::string(argv[2]); 68 | reference_file = std::string(argv[3]); 69 | perPixelError = atof(argv[4]); 70 | globalError = atof(argv[5]); 71 | break; 72 | default: 73 | std::cerr << "Usage: ./HW3 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl; 74 | exit(1); 75 | } 76 | //load the image and give us our input and output pointers 77 | preProcess(&d_luminance, &d_cdf, 78 | &numRows, &numCols, &numBins, input_file); 79 | 80 | GpuTimer timer; 81 | float min_logLum, max_logLum; 82 | min_logLum = 0.f; 83 | max_logLum = 1.f; 84 | timer.Start(); 85 | //call the students' code 86 | your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum, 87 | numRows, numCols, numBins); 88 | timer.Stop(); 89 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 90 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); 91 | 92 | if (err < 0) { 93 | //Couldn't print! Probably the student closed stdout - bad news 94 | std::cerr << "Couldn't print timing information! STDOUT Closed!" 
<< std::endl; 95 | exit(1); 96 | } 97 | 98 | float *h_luminance = (float *) malloc(sizeof(float)*numRows*numCols); 99 | unsigned int *h_cdf = (unsigned int *) malloc(sizeof(unsigned int)*numBins); 100 | 101 | checkCudaErrors(cudaMemcpy(h_luminance, d_luminance, numRows*numCols*sizeof(float), cudaMemcpyDeviceToHost)); 102 | 103 | //check results and output the tone-mapped image 104 | postProcess(output_file, numRows, numCols, min_logLum, max_logLum); 105 | 106 | for (size_t i = 1; i < numCols * numRows; ++i) { 107 | min_logLum = std::min(h_luminance[i], min_logLum); 108 | max_logLum = std::max(h_luminance[i], max_logLum); 109 | } 110 | 111 | referenceCalculation(h_luminance, h_cdf, numRows, numCols, numBins, min_logLum, max_logLum); 112 | 113 | checkCudaErrors(cudaMemcpy(d_cdf, h_cdf, sizeof(unsigned int) * numBins, cudaMemcpyHostToDevice)); 114 | 115 | //check results and output the tone-mapped image 116 | postProcess(reference_file, numRows, numCols, min_logLum, max_logLum); 117 | 118 | cleanupGlobalMemory(); 119 | 120 | compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError); 121 | 122 | return 0; 123 | } 124 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/memorial.exr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial.exr -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/memorial_large.exr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_large.exr -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/memorial_png.gold: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_png.gold -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/memorial_png_large.gold: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_png_large.gold -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/memorial_raw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_raw.png -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/memorial_raw_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_raw_large.png -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/reference_calc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void referenceCalculation(const float* const h_logLuminance, unsigned int* const h_cdf, 5 | const size_t numRows, const size_t numCols, const size_t numBins, 6 | float &logLumMin, float &logLumMax) 7 | { 8 | logLumMin = h_logLuminance[0]; 9 | logLumMax = h_logLuminance[0]; 10 | 11 | //Step 1 12 | //first we find the minimum and maximum across the entire image 13 | for (size_t i = 1; i < numCols * numRows; ++i) { 14 | logLumMin = std::min(h_logLuminance[i], logLumMin); 15 | logLumMax = 
std::max(h_logLuminance[i], logLumMax); 16 | } 17 | 18 | //Step 2 19 | float logLumRange = logLumMax - logLumMin; 20 | 21 | //Step 3 22 | //next we use the now known range to compute 23 | //a histogram of numBins bins 24 | unsigned int *histo = new unsigned int[numBins]; 25 | 26 | for (size_t i = 0; i < numBins; ++i) histo[i] = 0; 27 | 28 | for (size_t i = 0; i < numCols * numRows; ++i) { 29 | unsigned int bin = std::min(static_cast(numBins - 1), 30 | static_cast((h_logLuminance[i] - logLumMin) / logLumRange * numBins)); 31 | histo[bin]++; 32 | } 33 | 34 | //Step 4 35 | //finally we perform an exclusive scan (prefix sum) 36 | //on the histogram to get the cumulative distribution 37 | h_cdf[0] = 0; 38 | for (size_t i = 1; i < numBins; ++i) { 39 | h_cdf[i] = h_cdf[i - 1] + histo[i - 1]; 40 | } 41 | 42 | delete[] histo; 43 | } -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/reference_calc.h: -------------------------------------------------------------------------------- 1 | #ifndef REFERENCE_H__ 2 | #define REFERENCE_H__ 3 | 4 | void referenceCalculation(const float* const h_logLuminance, unsigned int* const h_cdf, 5 | const size_t numRows, const size_t numCols, const size_t numBins, 6 | float &logLumMin, float &logLumMax); 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/student_func.cu: -------------------------------------------------------------------------------- 1 | /* Udacity Homework 3 2 | HDR Tone-mapping 3 | 4 | Background HDR 5 | ============== 6 | 7 | A High Dynamic Range (HDR) image contains a wider variation of intensity 8 | and color than is allowed by the RGB format with 1 byte per channel that we 9 | have used in the previous assignment. 10 | 11 | To store this extra information we use single precision floating point for 12 | each channel. This allows for an extremely wide range of intensity values. 
13 | 14 | In the image for this assignment, the inside of a church with light coming in 15 | through stained glass windows, the raw input floating point values for the 16 | channels range from 0 to 275. But the mean is .41 and 98% of the values are 17 | less than 3! This means that certain areas (the windows) are extremely bright 18 | compared to everywhere else. If we linearly map this [0-275] range into the 19 | [0-255] range that we have been using then most values will be mapped to zero! 20 | The only thing we will be able to see are the very brightest areas - the 21 | windows - everything else will appear pitch black. 22 | 23 | The problem is that although we have cameras capable of recording the wide 24 | range of intensity that exists in the real world, our monitors are not capable 25 | of displaying them. Our eyes are also quite capable of observing a much wider 26 | range of intensities than our image formats / monitors are capable of 27 | displaying. 28 | 29 | Tone-mapping is a process that transforms the intensities in the image so that 30 | the brightest values aren't nearly so far away from the mean. That way when 31 | we transform the values into [0-255] we can actually see the entire image. 32 | There are many ways to perform this process and it is as much an art as a 33 | science - there is no single "right" answer. In this homework we will 34 | implement one possible technique. 35 | 36 | Background Chrominance-Luminance 37 | ================================ 38 | 39 | The RGB space that we have been using to represent images can be thought of as 40 | one possible set of axes spanning a three dimensional space of color. We 41 | sometimes choose other axes to represent this space because they make certain 42 | operations more convenient. 43 | 44 | Another possible way of representing a color image is to separate the color 45 | information (chromaticity) from the brightness information. 
There are 46 | multiple different methods for doing this - a common one during the analog 47 | television days was known as Chrominance-Luminance or YUV. 48 | 49 | We choose to represent the image in this way so that we can remap only the 50 | intensity channel and then recombine the new intensity values with the color 51 | information to form the final image. 52 | 53 | Old TV signals used to be transmitted in this way so that black & white 54 | televisions could display the luminance channel while color televisions would 55 | display all three of the channels. 56 | 57 | 58 | Tone-mapping 59 | ============ 60 | 61 | In this assignment we are going to transform the luminance channel (actually 62 | the log of the luminance, but this is unimportant for the parts of the 63 | algorithm that you will be implementing) by compressing its range to [0, 1]. 64 | To do this we need the cumulative distribution of the luminance values. 65 | 66 | Example 67 | ------- 68 | 69 | input : [2 4 3 3 1 7 4 5 7 0 9 4 3 2] 70 | min / max / range: 0 / 9 / 9 71 | 72 | histo with 3 bins: [4 7 3] 73 | 74 | cdf : [0 4 11] (exclusive scan) 75 | 76 | 77 | Your task is to calculate this cumulative distribution by following these 78 | steps. 
79 | 80 | */ 81 | 82 | #include "utils.h" 83 | 84 | void your_histogram_and_prefixsum(const float* const d_logLuminance, 85 | unsigned int* const d_cdf, 86 | float &min_logLum, 87 | float &max_logLum, 88 | const size_t numRows, 89 | const size_t numCols, 90 | const size_t numBins) 91 | { 92 | //TODO 93 | /*Here are the steps you need to implement 94 | 1) find the minimum and maximum value in the input logLuminance channel 95 | store in min_logLum and max_logLum 96 | 2) subtract them to find the range 97 | 3) generate a histogram of all the values in the logLuminance channel using 98 | the formula: bin = (lum[i] - lumMin) / lumRange * numBins 99 | 4) Perform an exclusive scan (prefix sum) on the histogram to get 100 | the cumulative distribution of luminance values (this should go in the 101 | incoming d_cdf pointer which already has been allocated for you) */ 102 | 103 | 104 | } 105 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 3/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef 
UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | template 15 | void check(T err, const char* const func, const char* const file, const int line) { 16 | if (err != cudaSuccess) { 17 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 18 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 19 | exit(1); 20 | } 21 | } 22 | 23 | template 24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) { 25 | //check that the GPU result matches the CPU result 26 | for (size_t i = 0; i < numElem; ++i) { 27 | if (ref[i] != gpu[i]) { 28 | std::cerr << "Difference at pos " << i << std::endl; 29 | //the + is magic to convert char to int without messing 30 | //with other types 31 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 32 | "\nGPU : " << +gpu[i] << std::endl; 33 | exit(1); 34 | } 35 | } 36 | } 37 | 38 | template 39 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) { 40 | assert(eps1 >= 0 && eps2 >= 0); 41 | unsigned long long totalDiff = 0; 42 | unsigned numSmallDifferences = 0; 43 | for (size_t i = 0; i < numElem; ++i) { 44 | //subtract smaller from larger in case of unsigned types 45 | T smaller = std::min(ref[i], gpu[i]); 46 | T larger = std::max(ref[i], gpu[i]); 47 | T diff = larger - smaller; 48 | if (diff > 0 && diff <= eps1) { 49 | numSmallDifferences++; 50 | } 51 | else if (diff > eps1) { 52 | std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl; 53 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 54 | "\nGPU : " << +gpu[i] << std::endl; 55 | exit(1); 56 | } 57 | totalDiff += diff * diff; 58 | } 59 | double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; 60 | if (percentSmallDifferences > 
eps2) { 61 | std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl; 62 | std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl; 63 | exit(1); 64 | } 65 | } 66 | 67 | //Uses the autodesk method of image comparison 68 | //Note that the tolerance here is in PIXELS not a percentage of input pixels 69 | template 70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance) 71 | { 72 | 73 | size_t numBadPixels = 0; 74 | for (size_t i = 0; i < numElem; ++i) { 75 | T smaller = std::min(ref[i], gpu[i]); 76 | T larger = std::max(ref[i], gpu[i]); 77 | T diff = larger - smaller; 78 | if (diff > variance) 79 | ++numBadPixels; 80 | } 81 | 82 | if (numBadPixels > tolerance) { 83 | std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl; 84 | exit(1); 85 | } 86 | } 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4.zip -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. 
edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | 8 | # collect source files 9 | 10 | file( GLOB hdr *.hpp *.h ) 11 | file( GLOB cu *.cu) 12 | SET (HW4_files main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp) 13 | 14 | CUDA_ADD_EXECUTABLE(HW4 ${HW4_files} ${hdr} ${img} ${cu}) 15 | 16 | 17 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=/usr/local/cuda-5.0/bin/nvcc 2 | #NVCC=nvcc 3 | 4 | ################################### 5 | # These are the default install # 6 | # locations on most linux distros # 7 | ################################### 8 | 9 | OPENCV_LIBPATH=/usr/lib 10 | OPENCV_INCLUDEPATH=/usr/include 11 | 12 | ################################################### 13 | # On Macs the default install locations are below # 14 | ################################################### 15 | 16 | #OPENCV_LIBPATH=/usr/local/lib 17 | #OPENCV_INCLUDEPATH=/usr/local/include 18 | 19 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui 20 | 21 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include 22 | # CUDA_INCLUDEPATH=/usr/local/cuda/lib64/include 23 | # CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include 24 | # CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include 25 | 26 | ###################################################### 27 | # On Macs the default install locations are below # 28 | # #################################################### 29 | 30 | #CUDA_INCLUDEPATH=/usr/local/cuda/include 31 | #CUDA_LIBPATH=/usr/local/cuda/lib 32 | CUDA_LIBPATH=/usr/local/cuda-5.0/lib64 33 | 34 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 35 | 36 | GCC_OPTS=-O3 -Wall -Wextra -m64 37 | 38 | student: main.o student_func.o HW4.o loadSaveImage.o compare.o 
reference_calc.o Makefile 39 | $(NVCC) -o HW4 main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) 40 | 41 | main.o: main.cpp timer.h utils.h reference_calc.h compare.h 42 | g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 43 | 44 | HW4.o: HW4.cu loadSaveImage.h utils.h 45 | $(NVCC) -c HW4.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS) 46 | 47 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h 48 | g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 49 | 50 | compare.o: compare.cpp compare.h 51 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 52 | 53 | reference_calc.o: reference_calc.cpp reference_calc.h 54 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 55 | 56 | student_func.o: student_func.cu reference_calc.cpp utils.h 57 | $(NVCC) -c student_func.cu $(NVCC_OPTS) 58 | # remove object files, generated .png images and the HW4 binary produced by the link rule 59 | clean: 60 | rm -f *.o *.png HW4 61 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/compare.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "utils.h" 3 | 4 | 5 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 6 | double perPixelError, double globalError) 7 | { 8 | cv::Mat reference = cv::imread(reference_filename, -1); 9 | cv::Mat test = cv::imread(test_filename, -1); 10 | 11 | cv::Mat diff = abs(reference - test); 12 | 13 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows 14 | 15 | double minVal, maxVal; 16 | 17 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location 18 | 19 | //now perform transform so that we bump values to the full range 20 | 21 | diffSingleChannel = (diffSingleChannel - minVal) * (255. 
/ (maxVal - minVal)); 22 | 23 | diff = diffSingleChannel.reshape(reference.channels(), 0); 24 | 25 | cv::imwrite("HW4_differenceImage.png", diff); 26 | //OK, now we can start comparing values... 27 | unsigned char *referencePtr = reference.ptr(0); 28 | unsigned char *testPtr = test.ptr(0); 29 | 30 | if (useEpsCheck) { 31 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError); 32 | } 33 | else 34 | { 35 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels()); 36 | } 37 | 38 | std::cout << "PASS" << std::endl; 39 | return; 40 | } -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/compare.h: -------------------------------------------------------------------------------- 1 | #ifndef HW4_H__ 2 | #define HW4_H__ 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError); 6 | 7 | #endif -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/loadSaveImage.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "cuda_runtime.h" 6 | 7 | //The caller becomes responsible for the returned pointer. This 8 | //is done in the interest of keeping this code as simple as possible. 9 | //In production code this is a bad idea - we should use RAII 10 | //to ensure the memory is freed. DO NOT COPY THIS AND USE IN PRODUCTION 11 | //CODE!!! 
12 | void loadImageHDR(const std::string &filename, 13 | float **imagePtr, 14 | size_t *numRows, size_t *numCols) 15 | { 16 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH); 17 | if (image.empty()) { 18 | std::cerr << "Couldn't open file: " << filename << std::endl; 19 | exit(1); 20 | } 21 | 22 | if (image.channels() != 3) { 23 | std::cerr << "Image must be color!" << std::endl; 24 | exit(1); 25 | } 26 | 27 | if (!image.isContinuous()) { 28 | std::cerr << "Image isn't continuous!" << std::endl; 29 | exit(1); 30 | } 31 | 32 | *imagePtr = new float[image.rows * image.cols * image.channels()]; 33 | 34 | float *cvPtr = image.ptr(0); 35 | for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i) 36 | (*imagePtr)[i] = cvPtr[i]; 37 | 38 | *numRows = image.rows; 39 | *numCols = image.cols; 40 | } 41 | 42 | void loadImageRGBA(const std::string &filename, 43 | uchar4 **imagePtr, 44 | size_t *numRows, size_t *numCols) 45 | { 46 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); 47 | if (image.empty()) { 48 | std::cerr << "Couldn't open file: " << filename << std::endl; 49 | exit(1); 50 | } 51 | 52 | if (image.channels() != 3) { 53 | std::cerr << "Image must be color!" << std::endl; 54 | exit(1); 55 | } 56 | 57 | if (!image.isContinuous()) { 58 | std::cerr << "Image isn't continuous!" 
<< std::endl; 59 | exit(1); 60 | } 61 | 62 | cv::Mat imageRGBA; 63 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA); 64 | 65 | *imagePtr = new uchar4[image.rows * image.cols]; 66 | 67 | unsigned char *cvPtr = imageRGBA.ptr(0); 68 | for (size_t i = 0; i < image.rows * image.cols; ++i) { 69 | (*imagePtr)[i].x = cvPtr[4 * i + 0]; 70 | (*imagePtr)[i].y = cvPtr[4 * i + 1]; 71 | (*imagePtr)[i].z = cvPtr[4 * i + 2]; 72 | (*imagePtr)[i].w = cvPtr[4 * i + 3]; 73 | } 74 | 75 | *numRows = image.rows; 76 | *numCols = image.cols; 77 | } 78 | 79 | void saveImageRGBA(const uchar4* const image, 80 | const size_t numRows, const size_t numCols, 81 | const std::string &output_file) 82 | { 83 | int sizes[2]; 84 | sizes[0] = numRows; 85 | sizes[1] = numCols; 86 | cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image); 87 | cv::Mat imageOutputBGR; 88 | cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR); 89 | //output the image 90 | cv::imwrite(output_file.c_str(), imageOutputBGR); 91 | } 92 | 93 | //output an exr file 94 | //assumed to already be BGR 95 | void saveImageHDR(const float* const image, 96 | const size_t numRows, const size_t numCols, 97 | const std::string &output_file) 98 | { 99 | int sizes[2]; 100 | sizes[0] = numRows; 101 | sizes[1] = numCols; 102 | 103 | cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image); 104 | 105 | imageHDR = imageHDR * 255; 106 | 107 | cv::imwrite(output_file.c_str(), imageHDR); 108 | } 109 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/loadSaveImage.h: -------------------------------------------------------------------------------- 1 | #ifndef LOADSAVEIMAGE_H__ 2 | #define LOADSAVEIMAGE_H__ 3 | 4 | #include 5 | #include //for uchar4 6 | 7 | void loadImageHDR(const std::string &filename, 8 | float **imagePtr, 9 | size_t *numRows, size_t *numCols); 10 | 11 | void loadImageRGBA(const std::string &filename, 12 | uchar4 **imagePtr, 13 | size_t *numRows, size_t *numCols); 14 | 15 | void 
saveImageRGBA(const uchar4* const image, 16 | const size_t numRows, const size_t numCols, 17 | const std::string &output_file); 18 | 19 | void saveImageHDR(const float* const image, 20 | const size_t numRows, const size_t numCols, 21 | const std::string &output_file); 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/main.cpp: -------------------------------------------------------------------------------- 1 | //Udacity HW4 Driver 2 | 3 | #include 4 | #include "timer.h" 5 | #include "utils.h" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "compare.h" 12 | #include "reference_calc.h" 13 | 14 | void preProcess(unsigned int **inputVals, 15 | unsigned int **inputPos, 16 | unsigned int **outputVals, 17 | unsigned int **outputPos, 18 | size_t &numElems, 19 | const std::string& filename, 20 | const std::string& template_file); 21 | 22 | void postProcess(const unsigned int* const outputVals, 23 | const unsigned int* const outputPos, 24 | const size_t numElems, 25 | const std::string& output_file); 26 | 27 | void your_sort(unsigned int* const inputVals, 28 | unsigned int* const inputPos, 29 | unsigned int* const outputVals, 30 | unsigned int* const outputPos, 31 | const size_t numElems); 32 | 33 | int main(int argc, char **argv) { 34 | unsigned int *inputVals; 35 | unsigned int *inputPos; 36 | unsigned int *outputVals; 37 | unsigned int *outputPos; 38 | 39 | size_t numElems; 40 | 41 | std::string input_file; 42 | std::string template_file; 43 | std::string output_file; 44 | std::string reference_file; 45 | double perPixelError = 0.0; 46 | double globalError = 0.0; 47 | bool useEpsCheck = false; 48 | 49 | switch (argc) 50 | { 51 | case 3: 52 | input_file = std::string(argv[1]); 53 | template_file = std::string(argv[2]); 54 | output_file = "HW4_output.png"; 55 | break; 56 | case 4: 57 | input_file = std::string(argv[1]); 58 | template_file = std::string(argv[2]); 59 | 
output_file = std::string(argv[3]); 60 | break; 61 | default: 62 | std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl; 63 | exit(1); 64 | } 65 | //load the image and give us our input and output pointers 66 | preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems, input_file, template_file); 67 | 68 | GpuTimer timer; 69 | timer.Start(); 70 | 71 | //call the students' code 72 | your_sort(inputVals, inputPos, outputVals, outputPos, numElems); 73 | 74 | timer.Stop(); 75 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 76 | printf("\n"); 77 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); 78 | 79 | if (err < 0) { 80 | //Couldn't print! Probably the student closed stdout - bad news 81 | std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; 82 | exit(1); 83 | } 84 | 85 | //check results and output the red-eye corrected image 86 | postProcess(outputVals, outputPos, numElems, output_file); 87 | 88 | // check code moved from HW4.cu - NOTE(review): as placed here it executes AFTER your_sort() and is not commented out, which contradicts the banner below; confirm the intended order 89 | /**************************************************************************** 90 | * You can use the code below to help with debugging, but make sure to * 91 | * comment it out again before submitting your assignment for grading, * 92 | * otherwise this code will take too much time and make it seem like your * 93 | * GPU implementation isn't fast enough. * 94 | * * 95 | * This code MUST RUN BEFORE YOUR CODE in case you accidentally change * 96 | * the input values when implementing your radix sort. * 97 | * * 98 | * This code performs the reference radix sort on the host and compares your * 99 | * sorted values to the reference. 
* 100 | * * 101 | * Thrust containers are used for copying memory from the GPU * 102 | * ************************************************************************* */ 103 | thrust::device_ptr d_inputVals(inputVals); 104 | thrust::device_ptr d_inputPos(inputPos); 105 | 106 | thrust::host_vector h_inputVals(d_inputVals, 107 | d_inputVals+numElems); 108 | thrust::host_vector h_inputPos(d_inputPos, 109 | d_inputPos + numElems); 110 | 111 | thrust::host_vector h_outputVals(numElems); 112 | thrust::host_vector h_outputPos(numElems); 113 | 114 | reference_calculation(&h_inputVals[0], &h_inputPos[0], 115 | &h_outputVals[0], &h_outputPos[0], 116 | numElems); 117 | 118 | //postProcess(valsPtr, posPtr, numElems, reference_file); 119 | 120 | //compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError); 121 | 122 | thrust::device_ptr d_outputVals(outputVals); 123 | thrust::device_ptr d_outputPos(outputPos); 124 | 125 | thrust::host_vector h_yourOutputVals(d_outputVals, 126 | d_outputVals + numElems); 127 | thrust::host_vector h_yourOutputPos(d_outputPos, 128 | d_outputPos + numElems); 129 | 130 | checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems); 131 | checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems); 132 | 133 | checkCudaErrors(cudaFree(inputVals)); 134 | checkCudaErrors(cudaFree(inputPos)); 135 | checkCudaErrors(cudaFree(outputVals)); 136 | checkCudaErrors(cudaFree(outputPos)); 137 | 138 | return 0; 139 | } 140 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/red_eye_effect.gold: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4/red_eye_effect.gold -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/red_eye_effect_5.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4/red_eye_effect_5.jpg -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/red_eye_effect_template_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4/red_eye_effect_template_5.jpg -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/reference_calc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | // For memset 3 | #include 4 | //Host reference: LSB-first radix sort of inputVals that moves inputPos along with the values; the result ends up in outputVals/outputPos. NOTE: the input arrays are used as the second ping-pong buffer and are overwritten. 5 | void reference_calculation(unsigned int* inputVals, 6 | unsigned int* inputPos, 7 | unsigned int* outputVals, 8 | unsigned int* outputPos, 9 | const size_t numElems) 10 | { 11 | const int numBits = 1; 12 | const int numBins = 1 << numBits; 13 | 14 | unsigned int *binHistogram = new unsigned int[numBins]; 15 | unsigned int *binScan = new unsigned int[numBins]; 16 | 17 | unsigned int *vals_src = inputVals; 18 | unsigned int *pos_src = inputPos; 19 | 20 | unsigned int *vals_dst = outputVals; 21 | unsigned int *pos_dst = outputPos; 22 | 23 | //a simple radix sort - only correct when this loop runs an even number of passes, because the result is copied back from the input buffers afterwards (e.g. 
numBits = 1, 2, 4, 8 or 16 for a 32-bit unsigned int) 24 | for (unsigned int i = 0; i < 8 * sizeof(unsigned int); i += numBits) { 25 | unsigned int mask = (numBins - 1) << i; 26 | 27 | memset(binHistogram, 0, sizeof(unsigned int) * numBins); //zero out the bins 28 | memset(binScan, 0, sizeof(unsigned int) * numBins); //zero out the scan offsets 29 | 30 | //perform histogram of data & mask into bins 31 | for (unsigned int j = 0; j < numElems; ++j) { 32 | unsigned int bin = (vals_src[j] & mask) >> i; 33 | binHistogram[bin]++; 34 | } 35 | 36 | //perform exclusive prefix sum (scan) 
on binHistogram to get starting 37 | //location for each bin 38 | for (unsigned int j = 1; j < numBins; ++j) { 39 | binScan[j] = binScan[j - 1] + binHistogram[j - 1]; 40 | } 41 | 42 | //Gather everything into the correct location 43 | //need to move vals and positions 44 | for (unsigned int j = 0; j < numElems; ++j) { 45 | unsigned int bin = (vals_src[j] & mask) >> i; 46 | vals_dst[binScan[bin]] = vals_src[j]; 47 | pos_dst[binScan[bin]] = pos_src[j]; 48 | binScan[bin]++; 49 | } 50 | 51 | //swap the buffers (pointers only) 52 | std::swap(vals_dst, vals_src); 53 | std::swap(pos_dst, pos_src); 54 | } 55 | 56 | //we did an even number of iterations, need to copy from input buffer into output 57 | std::copy(inputVals, inputVals + numElems, outputVals); 58 | std::copy(inputPos, inputPos + numElems, outputPos); 59 | 60 | delete[] binHistogram; 61 | delete[] binScan; 62 | } 63 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/reference_calc.h: -------------------------------------------------------------------------------- 1 | #ifndef REFERENCE_H__ 2 | #define REFERENCE_H__ 3 | 4 | 5 | //A simple un-optimized reference radix sort calculation 6 | //Only deals with power-of-2 radices 7 | 8 | 9 | void reference_calculation(unsigned int* inputVals, 10 | unsigned int* inputPos, 11 | unsigned int* outputVals, 12 | unsigned int* outputPos, 13 | const size_t numElems); 14 | #endif -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/student_func.cu: -------------------------------------------------------------------------------- 1 | //Udacity HW 4 2 | //Radix Sorting 3 | 4 | #include "utils.h" 5 | #include 6 | 7 | /* Red Eye Removal 8 | =============== 9 | 10 | For this assignment we are implementing red eye removal. This is 11 | accomplished by first creating a score for every pixel that tells us how 12 | likely it is to be a red eye pixel. 
We have already done this for you - you 13 | are receiving the scores and need to sort them in ascending order so that we 14 | know which pixels to alter to remove the red eye. 15 | 16 | Note: ascending order == smallest to largest 17 | 18 | Each score is associated with a position; when you sort the scores, you must 19 | also move the positions accordingly. 20 | 21 | Implementing Parallel Radix Sort with CUDA 22 | ========================================== 23 | 24 | The basic idea is to construct a histogram on each pass of how many of each 25 | "digit" there are. Then we scan this histogram so that we know where to put 26 | the output of each digit. For example, the first 1 must come after all the 27 | 0s so we have to know how many 0s there are to be able to start moving 1s 28 | into the correct position. 29 | 30 | 1) Histogram of the number of occurrences of each digit 31 | 2) Exclusive Prefix Sum of Histogram 32 | 3) Determine relative offset of each digit 33 | For example [0 0 1 1 0 0 1] 34 | -> [0 1 0 1 2 3 2] 35 | 4) Combine the results of steps 2 & 3 to determine the final 36 | output location for each element and move it there 37 | 38 | LSB Radix sort is an out-of-place sort and you will need to ping-pong values 39 | between the input and output buffers we have provided. Make sure the final 40 | sorted results end up in the output buffer! Hint: You may need to do a copy 41 | at the end. 
42 | 43 | */ 44 | 45 | 46 | void your_sort(unsigned int* const d_inputVals, 47 | unsigned int* const d_inputPos, 48 | unsigned int* const d_outputVals, 49 | unsigned int* const d_outputPos, 50 | const size_t numElems) 51 | { 52 | //TODO 53 | //PUT YOUR SORT HERE 54 | } 55 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 4/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | template 15 | void check(T err, const char* const func, const char* const file, const int line) { 16 | if (err != cudaSuccess) { 17 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 18 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 19 | exit(1); 20 | } 21 | } 22 | 23 | template 24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) { 25 | 
//check that the GPU result matches the CPU result 26 | for (size_t i = 0; i < numElem; ++i) { 27 | if (ref[i] != gpu[i]) { 28 | std::cerr << "Difference at pos " << i << std::endl; 29 | //the + is magic to convert char to int without messing 30 | //with other types 31 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 32 | "\nGPU : " << +gpu[i] << std::endl; 33 | exit(1); 34 | } 35 | } 36 | } 37 | 38 | template 39 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) { 40 | assert(eps1 >= 0 && eps2 >= 0); 41 | unsigned long long totalDiff = 0; 42 | unsigned numSmallDifferences = 0; 43 | for (size_t i = 0; i < numElem; ++i) { 44 | //subtract smaller from larger in case of unsigned types 45 | T smaller = std::min(ref[i], gpu[i]); 46 | T larger = std::max(ref[i], gpu[i]); 47 | T diff = larger - smaller; 48 | if (diff > 0 && diff <= eps1) { 49 | numSmallDifferences++; 50 | } 51 | else if (diff > eps1) { 52 | std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl; 53 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 54 | "\nGPU : " << +gpu[i] << std::endl; 55 | exit(1); 56 | } 57 | totalDiff += diff * diff; //NOTE: accumulated but never read in this function 58 | } 59 | double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; //a fraction of numElem, compared against eps2 as-is; scaled by 100 only when printed 60 | if (percentSmallDifferences > eps2) { 61 | std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl; 62 | std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl; 63 | exit(1); 64 | } 65 | } 66 | 67 | //Uses the autodesk method of image comparison 68 | //Note that the tolerance here is in PIXELS, not a percentage of input pixels 69 | template 70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance) 71 | { 72 | 73 | size_t numBadPixels = 0; 74 | for (size_t i = 0; i 
< numElem; ++i) { 75 | T smaller = std::min(ref[i], gpu[i]); 76 | T larger = std::max(ref[i], gpu[i]); 77 | T diff = larger - smaller; 78 | if (diff > variance) 79 | ++numBadPixels; 80 | } 81 | 82 | if (numBadPixels > tolerance) { 83 | std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl; 84 | exit(1); 85 | } 86 | } 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 5.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 5.zip -------------------------------------------------------------------------------- /Problem Sets/Problem Set 5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. 
edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | 8 | # collect source files 9 | 10 | file( GLOB hdr *.hpp *.h ) 11 | 12 | SET (HW5_files main.cu student.cu reference_calc.cpp) 13 | 14 | CUDA_ADD_EXECUTABLE(HW5 ${HW5_files} ${hdr}) 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 5/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64 3 | 4 | histo: main.cu reference_calc.o student.o Makefile 5 | $(NVCC) -o HW5 main.cu reference_calc.o student.o $(NVCC_OPTS) 6 | 7 | student.o: student.cu 8 | $(NVCC) -c student.cu $(NVCC_OPTS) 9 | # host-only reference code: built with plain g++ flags, since this Makefile defines no OpenCV/CUDA include paths or GCC_OPTS 10 | reference_calc.o: reference_calc.cpp reference_calc.h 11 | g++ -c reference_calc.cpp -O3 -Wall -Wextra -m64 12 | # remove object files, generated .bin files and the HW5 binary produced by the histo rule 13 | clean: 14 | rm -f *.o HW5 *.bin 15 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 5/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "utils.h" 6 | #include "timer.h" 7 | #include 8 | #if defined(_WIN16) || defined(_WIN32) || defined(_WIN64) 9 | #include 10 | #else 11 | #include 12 | #endif 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #include "reference_calc.h" 19 | 20 | void computeHistogram(const unsigned int *const d_vals, 21 | unsigned int* const d_histo, 22 | const unsigned int numBins, 23 | const unsigned int numElems); 24 | 25 | int main(void) 26 | { 27 | const unsigned int numBins = 1024; 28 | const unsigned int numElems = 10000 * numBins; 29 | const float stddev = 100.f; 30 | 31 | unsigned int *vals = new unsigned int[numElems]; 32 | unsigned int *h_vals = new unsigned 
int[numElems]; 33 | unsigned int *h_studentHisto = new unsigned int[numBins]; 34 | unsigned int *h_refHisto = new unsigned int[numBins]; 35 | 36 | #if defined(_WIN16) || defined(_WIN32) || defined(_WIN64) 37 | srand(GetTickCount()); 38 | #else 39 | timeval tv; 40 | gettimeofday(&tv, NULL); 41 | 42 | srand(tv.tv_usec); 43 | #endif 44 | 45 | //make the mean unpredictable, but close enough to the middle 46 | //so that timings are unaffected 47 | unsigned int mean = rand() % 100 + 462; 48 | 49 | //Output mean so that grading can happen with the same inputs 50 | std::cout << mean << std::endl; 51 | 52 | thrust::minstd_rand rng; 53 | 54 | thrust::random::experimental::normal_distribution normalDist((float)mean, stddev); 55 | 56 | // Generate the random values 57 | for (size_t i = 0; i < numElems; ++i) { 58 | vals[i] = std::min((unsigned int) std::max((int)normalDist(rng), 0), numBins - 1); 59 | } 60 | 61 | unsigned int *d_vals, *d_histo; 62 | 63 | GpuTimer timer; 64 | 65 | checkCudaErrors(cudaMalloc(&d_vals, sizeof(unsigned int) * numElems)); 66 | checkCudaErrors(cudaMalloc(&d_histo, sizeof(unsigned int) * numBins)); 67 | checkCudaErrors(cudaMemset(d_histo, 0, sizeof(unsigned int) * numBins)); 68 | 69 | checkCudaErrors(cudaMemcpy(d_vals, vals, sizeof(unsigned int) * numElems, cudaMemcpyHostToDevice)); 70 | 71 | timer.Start(); 72 | computeHistogram(d_vals, d_histo, numBins, numElems); 73 | timer.Stop(); 74 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); 75 | 76 | if (err < 0) { 77 | //Couldn't print! Probably the student closed stdout - bad news 78 | std::cerr << "Couldn't print timing information! STDOUT Closed!" 
<< std::endl; 79 | exit(1); 80 | } 81 | 82 | // copy the student-computed histogram back to the host 83 | checkCudaErrors(cudaMemcpy(h_studentHisto, d_histo, sizeof(unsigned int) * numBins, cudaMemcpyDeviceToHost)); 84 | 85 | //generate reference for the given mean 86 | reference_calculation(vals, h_refHisto, numBins, numElems); 87 | 88 | //Now do the comparison 89 | checkResultsExact(h_refHisto, h_studentHisto, numBins); 90 | //release every host buffer allocated with new[] above, including the generated input values in vals 91 | delete[] vals; delete[] h_vals; 92 | delete[] h_refHisto; 93 | delete[] h_studentHisto; 94 | 95 | cudaFree(d_vals); 96 | cudaFree(d_histo); 97 | 98 | return 0; 99 | } 100 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 5/reference_calc.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | //Reference Histogram calculation 3 | 4 | void reference_calculation(const unsigned int* const vals, 5 | unsigned int* const histo, 6 | const size_t numBins, 7 | const size_t numElems) 8 | 9 | { 10 | //zero out bins 11 | for (size_t i = 0; i < numBins; ++i) 12 | histo[i] = 0; 13 | 14 | //go through vals and increment appropriate bin 15 | for (size_t i = 0; i < numElems; ++i) 16 | histo[vals[i]]++; 17 | } 18 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 5/reference_calc.h: -------------------------------------------------------------------------------- 1 | #ifndef REFERENCE_H__ 2 | #define REFERENCE_H__ 3 | 4 | //Reference Histogram calculation 5 | 6 | void reference_calculation(const unsigned int* const vals, 7 | unsigned int* const histo, 8 | const size_t numBins, 9 | const size_t numElems); 10 | 11 | #endif -------------------------------------------------------------------------------- /Problem Sets/Problem Set 5/student.cu: -------------------------------------------------------------------------------- 1 | /* Udacity HW5 2 | Histogramming for Speed 3 | 4 | The goal of this assignment is 
to compute a histogram 5 | as fast as possible. We have simplified the problem as much as 6 | possible to allow you to focus solely on the histogramming algorithm. 7 | 8 | The input values that you need to histogram are already the exact 9 | bins that need to be updated. This is unlike in HW3 where you needed 10 | to compute the range of the data and then do: 11 | bin = (val - valMin) / valRange to determine the bin. 12 | 13 | Here the bin is just: 14 | bin = val 15 | 16 | so the serial histogram calculation looks like: 17 | for (i = 0; i < numElems; ++i) 18 | histo[val[i]]++; 19 | 20 | That's it! Your job is to make it run as fast as possible! 21 | 22 | The values are normally distributed - you may take 23 | advantage of this fact in your implementation. 24 | 25 | */ 26 | 27 | 28 | #include "utils.h" 29 | 30 | __global__ 31 | void yourHisto(const unsigned int* const vals, //INPUT 32 | unsigned int* const histo, //OUTPUT 33 | int numVals) 34 | { 35 | //TODO fill in this kernel to calculate the histogram 36 | //as quickly as possible 37 | 38 | //Although we provide only one kernel skeleton, 39 | //feel free to use more if it will help you 40 | //write faster code 41 | } 42 | 43 | void computeHistogram(const unsigned int* const d_vals, //INPUT 44 | unsigned int* const d_histo, //OUTPUT 45 | const unsigned int numBins, 46 | const unsigned int numElems) 47 | { 48 | //TODO Launch the yourHisto kernel 49 | 50 | //if you want to use/launch more than one kernel, 51 | //feel free 52 | 53 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 54 | } 55 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 5/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | 
cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 5/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | #define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | template 15 | void check(T err, const char* const func, const char* const file, const int line) { 16 | if (err != cudaSuccess) { 17 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 18 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 19 | exit(1); 20 | } 21 | } 22 | 23 | template 24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) { 25 | //check that the GPU result matches the CPU result 26 | for (size_t i = 0; i < numElem; ++i) { 27 | if (ref[i] != gpu[i]) { 28 | std::cerr << "Difference at pos " << i << std::endl; 29 | //the + is magic to convert char to int without messing 30 | //with other types 31 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 32 | "\nGPU : " << +gpu[i] << std::endl; 33 | exit(1); 34 | } 35 | } 36 | } 37 | 38 | template 39 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) { 40 | assert(eps1 >= 0 && eps2 >= 0); 41 | unsigned long long totalDiff = 0; 42 | unsigned numSmallDifferences = 0; 43 | for (size_t i = 0; i 
< numElem; ++i) { 44 | //subtract smaller from larger in case of unsigned types 45 | T smaller = std::min(ref[i], gpu[i]); 46 | T larger = std::max(ref[i], gpu[i]); 47 | T diff = larger - smaller; 48 | if (diff > 0 && diff <= eps1) { 49 | numSmallDifferences++; 50 | } 51 | else if (diff > eps1) { 52 | std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl; 53 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 54 | "\nGPU : " << +gpu[i] << std::endl; 55 | exit(1); 56 | } 57 | totalDiff += diff * diff; 58 | } 59 | double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; 60 | if (percentSmallDifferences > eps2) { 61 | std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl; 62 | std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl; 63 | exit(1); 64 | } 65 | } 66 | 67 | //Uses the autodesk method of image comparison 68 | //Note the the tolerance here is in PIXELS not a percentage of input pixels 69 | template 70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance) 71 | { 72 | 73 | size_t numBadPixels = 0; 74 | for (size_t i = 0; i < numElem; ++i) { 75 | T smaller = std::min(ref[i], gpu[i]); 76 | T larger = std::max(ref[i], gpu[i]); 77 | T diff = larger - smaller; 78 | if (diff > variance) 79 | ++numBadPixels; 80 | } 81 | 82 | if (numBadPixels > tolerance) { 83 | std::cerr << "Too many bad pixels in the image." 
<< numBadPixels << "/" << tolerance << std::endl; 84 | exit(1); 85 | } 86 | } 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 6.zip -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | # CMakeLists.txt for OpenCV and CUDA. 3 | # 2012-02-07 4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan 5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com 6 | ############################################################################ 7 | 8 | # collect source files 9 | 10 | file( GLOB hdr *.hpp *.h ) 11 | 12 | SET (HW6_files student_func.cu HW6.cu main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp) 13 | 14 | CUDA_ADD_EXECUTABLE(HW6 ${HW6_files} ${hdr}) 15 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/HW6.cu: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "loadSaveImage.h" 8 | #include 9 | 10 | 11 | //return types are void since any internal error will be handled by quitting 12 | //no point in returning error codes... 
13 | void preProcess( uchar4 **sourceImg, 14 | size_t &numRows, size_t &numCols, 15 | uchar4 **destImg, 16 | uchar4 **blendedImg, const std::string& source_filename, 17 | const std::string& dest_filename){ 18 | 19 | //make sure the context initializes ok 20 | checkCudaErrors(cudaFree(0)); 21 | 22 | size_t numRowsSource, numColsSource, numRowsDest, numColsDest; 23 | 24 | loadImageRGBA(source_filename, sourceImg, &numRowsSource, &numColsSource); 25 | loadImageRGBA(dest_filename, destImg, &numRowsDest, &numColsDest); 26 | 27 | assert(numRowsSource == numRowsDest); 28 | assert(numColsSource == numColsDest); 29 | 30 | numRows = numRowsSource; 31 | numCols = numColsSource; 32 | 33 | *blendedImg = new uchar4[numRows * numCols]; 34 | 35 | } 36 | 37 | void postProcess(const uchar4* const blendedImg, 38 | const size_t numRowsDest, const size_t numColsDest, 39 | const std::string& output_file) 40 | { 41 | //just need to save the image... 42 | saveImageRGBA(blendedImg, numRowsDest, numColsDest, output_file); 43 | } 44 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=/usr/local/cuda-5.0/bin/nvcc 2 | #NVCC=nvcc 3 | 4 | ################################### 5 | # These are the default install # 6 | # locations on most linux distros # 7 | ################################### 8 | 9 | OPENCV_LIBPATH=/usr/lib 10 | OPENCV_INCLUDEPATH=/usr/include 11 | 12 | ################################################### 13 | # On Macs the default install locations are below # 14 | ################################################### 15 | 16 | #OPENCV_LIBPATH=/usr/local/lib 17 | #OPENCV_INCLUDEPATH=/usr/local/include 18 | 19 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui 20 | 21 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include 22 | # CUDA_INCLUDEPATH=/usr/local/cuda/lib64/include 23 | # CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include 
24 | # CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include 25 | 26 | ###################################################### 27 | # On Macs the default install locations are below # 28 | # #################################################### 29 | 30 | #CUDA_INCLUDEPATH=/usr/local/cuda/include 31 | #CUDA_LIBPATH=/usr/local/cuda/lib 32 | CUDA_LIBPATH=/usr/local/cuda-5.0/lib64 33 | 34 | #no warnings otherwise thrust explodes output 35 | 36 | NVCC_OPTS=-O3 -arch=sm_20 -m64 37 | 38 | GCC_OPTS=-O3 -m64 39 | 40 | student: main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o Makefile 41 | $(NVCC) -o HW6 main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS) 42 | 43 | main.o: main.cpp timer.h utils.h 44 | g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 45 | 46 | HW6.o: HW6.cu loadSaveImage.h utils.h 47 | $(NVCC) -c HW6.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS) 48 | 49 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h 50 | g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 51 | 52 | student_func.o: student_func.cu reference_calc.cpp utils.h 53 | $(NVCC) -c student_func.cu $(NVCC_OPTS) 54 | 55 | compare.o: compare.cpp compare.h 56 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 57 | 58 | reference_calc.o: reference_calc.cpp reference_calc.h 59 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) 60 | 61 | clean: 62 | rm -f *.o hw 63 | find . 
-type f -name '*.png' | grep -v source.png | grep -v destination.png | xargs rm -f 64 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/blended.gold: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 6/blended.gold -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/compare.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "utils.h" 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError) 6 | { 7 | cv::Mat reference = cv::imread(reference_filename, -1); 8 | cv::Mat test = cv::imread(test_filename, -1); 9 | 10 | cv::Mat diff = abs(reference - test); 11 | 12 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows 13 | 14 | double minVal, maxVal; 15 | 16 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location 17 | 18 | //now perform transform so that we bump values to the full range 19 | 20 | diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal)); 21 | 22 | diff = diffSingleChannel.reshape(reference.channels(), 0); 23 | 24 | cv::imwrite("HW6_differenceImage.png", diff); 25 | //OK, now we can start comparing values... 
26 | unsigned char *referencePtr = reference.ptr(0); 27 | unsigned char *testPtr = test.ptr(0); 28 | 29 | if (useEpsCheck) { 30 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError); 31 | } 32 | else 33 | { 34 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels()); 35 | } 36 | 37 | std::cout << "PASS" << std::endl; 38 | return; 39 | } 40 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/compare.h: -------------------------------------------------------------------------------- 1 | #ifndef HW3_H__ 2 | #define HW3_H__ 3 | 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck, 5 | double perPixelError, double globalError); 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/destination.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 6/destination.png -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/loadSaveImage.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "cuda_runtime.h" 6 | 7 | //The caller becomes responsible for the returned pointer. This 8 | //is done in the interest of keeping this code as simple as possible. 9 | //In production code this is a bad idea - we should use RAII 10 | //to ensure the memory is freed. DO NOT COPY THIS AND USE IN PRODUCTION 11 | //CODE!!! 
12 | void loadImageHDR(const std::string &filename, 13 | float **imagePtr, 14 | size_t *numRows, size_t *numCols) 15 | { 16 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH); 17 | if (image.empty()) { 18 | std::cerr << "Couldn't open file: " << filename << std::endl; 19 | exit(1); 20 | } 21 | 22 | if (image.channels() != 3) { 23 | std::cerr << "Image must be color!" << std::endl; 24 | exit(1); 25 | } 26 | 27 | if (!image.isContinuous()) { 28 | std::cerr << "Image isn't continuous!" << std::endl; 29 | exit(1); 30 | } 31 | 32 | *imagePtr = new float[image.rows * image.cols * image.channels()]; 33 | 34 | float *cvPtr = image.ptr(0); 35 | for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i) 36 | (*imagePtr)[i] = cvPtr[i]; 37 | 38 | *numRows = image.rows; 39 | *numCols = image.cols; 40 | } 41 | 42 | void loadImageGrey(const std::string &filename, 43 | unsigned char **imagePtr, 44 | size_t *numRows, size_t *numCols) 45 | { 46 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE); 47 | if (image.empty()) { 48 | std::cerr << "Couldn't open file: " << filename << std::endl; 49 | exit(1); 50 | } 51 | 52 | if (image.channels() != 1) { 53 | std::cerr << "Image must be greyscale!" << std::endl; 54 | exit(1); 55 | } 56 | 57 | if (!image.isContinuous()) { 58 | std::cerr << "Image isn't continuous!" 
<< std::endl; 59 | exit(1); 60 | } 61 | 62 | *imagePtr = new unsigned char[image.rows * image.cols]; 63 | 64 | unsigned char *cvPtr = image.ptr(0); 65 | for (size_t i = 0; i < image.rows * image.cols; ++i) { 66 | (*imagePtr)[i] = cvPtr[i]; 67 | } 68 | 69 | *numRows = image.rows; 70 | *numCols = image.cols; 71 | } 72 | void loadImageRGBA(const std::string &filename, 73 | uchar4 **imagePtr, 74 | size_t *numRows, size_t *numCols) 75 | { 76 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); 77 | if (image.empty()) { 78 | std::cerr << "Couldn't open file: " << filename << std::endl; 79 | exit(1); 80 | } 81 | 82 | if (image.channels() != 3) { 83 | std::cerr << "Image must be color!" << std::endl; 84 | exit(1); 85 | } 86 | 87 | if (!image.isContinuous()) { 88 | std::cerr << "Image isn't continuous!" << std::endl; 89 | exit(1); 90 | } 91 | 92 | cv::Mat imageRGBA; 93 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA); 94 | 95 | *imagePtr = new uchar4[image.rows * image.cols]; 96 | 97 | unsigned char *cvPtr = imageRGBA.ptr(0); 98 | for (size_t i = 0; i < image.rows * image.cols; ++i) { 99 | (*imagePtr)[i].x = cvPtr[4 * i + 0]; 100 | (*imagePtr)[i].y = cvPtr[4 * i + 1]; 101 | (*imagePtr)[i].z = cvPtr[4 * i + 2]; 102 | (*imagePtr)[i].w = cvPtr[4 * i + 3]; 103 | } 104 | 105 | *numRows = image.rows; 106 | *numCols = image.cols; 107 | } 108 | 109 | void saveImageRGBA(const uchar4* const image, 110 | const size_t numRows, const size_t numCols, 111 | const std::string &output_file) 112 | { 113 | int sizes[2]; 114 | sizes[0] = numRows; 115 | sizes[1] = numCols; 116 | cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image); 117 | cv::Mat imageOutputBGR; 118 | cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR); 119 | //output the image 120 | cv::imwrite(output_file.c_str(), imageOutputBGR); 121 | } 122 | 123 | //output an exr file 124 | //assumed to already be BGR 125 | void saveImageHDR(const float* const image, 126 | const size_t numRows, const size_t numCols, 127 | const 
std::string &output_file) 128 | { 129 | int sizes[2]; 130 | sizes[0] = numRows; 131 | sizes[1] = numCols; 132 | 133 | cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image); 134 | 135 | imageHDR = imageHDR * 255; 136 | 137 | cv::imwrite(output_file.c_str(), imageHDR); 138 | } 139 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/loadSaveImage.h: -------------------------------------------------------------------------------- 1 | #ifndef LOADSAVEIMAGE_H__ 2 | #define LOADSAVEIMAGE_H__ 3 | 4 | #include 5 | #include //for uchar4 6 | 7 | void loadImageHDR(const std::string &filename, 8 | float **imagePtr, 9 | size_t *numRows, size_t *numCols); 10 | 11 | void loadImageRGBA(const std::string &filename, 12 | uchar4 **imagePtr, 13 | size_t *numRows, size_t *numCols); 14 | 15 | void loadImageGrey(const std::string &filename, 16 | unsigned char **imagePtr, 17 | size_t *numRows, size_t *numCols); 18 | 19 | void saveImageRGBA(const uchar4* const image, 20 | const size_t numRows, const size_t numCols, 21 | const std::string &output_file); 22 | 23 | void saveImageHDR(const float* const image, 24 | const size_t numRows, const size_t numCols, 25 | const std::string &output_file); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/main.cpp: -------------------------------------------------------------------------------- 1 | //Udacity HW6 Driver 2 | 3 | #include 4 | #include "timer.h" 5 | #include "utils.h" 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #include "reference_calc.h" 14 | #include "compare.h" 15 | 16 | void preProcess( uchar4 **sourceImg, size_t &numRowsSource, size_t &numColsSource, 17 | uchar4 **destImg, 18 | uchar4 **blendedImg, const std::string& source_filename, 19 | const std::string& dest_filename); 20 | 21 | void postProcess(const uchar4* const blendedImg, 22 | const size_t numRowsDest, const 
size_t numColsDest, 23 | const std::string& output_file); 24 | 25 | void your_blend(const uchar4* const sourceImg, 26 | const size_t numRowsSource, const size_t numColsSource, 27 | const uchar4* const destImg, 28 | uchar4* const blendedImg); 29 | 30 | int main(int argc, char **argv) { 31 | uchar4 *h_sourceImg, *h_destImg, *h_blendedImg; 32 | size_t numRowsSource, numColsSource; 33 | 34 | std::string input_source_file; 35 | std::string input_dest_file; 36 | std::string output_file; 37 | 38 | std::string reference_file; 39 | double perPixelError = 0.0; 40 | double globalError = 0.0; 41 | bool useEpsCheck = false; 42 | 43 | switch (argc) 44 | { 45 | case 3: 46 | input_source_file = std::string(argv[1]); 47 | input_dest_file = std::string(argv[2]); 48 | output_file = "HW6_output.png"; 49 | reference_file = "HW6_reference.png"; 50 | break; 51 | case 4: 52 | input_source_file = std::string(argv[1]); 53 | input_dest_file = std::string(argv[2]); 54 | output_file = std::string(argv[3]); 55 | reference_file = "HW6_reference.png"; 56 | break; 57 | case 5: 58 | input_source_file = std::string(argv[1]); 59 | input_dest_file = std::string(argv[2]); 60 | output_file = std::string(argv[3]); 61 | reference_file = std::string(argv[4]); 62 | break; 63 | case 7: 64 | useEpsCheck=true; 65 | input_source_file = std::string(argv[1]); 66 | input_dest_file = std::string(argv[2]); 67 | output_file = std::string(argv[3]); 68 | reference_file = std::string(argv[4]); 69 | perPixelError = atof(argv[5]); 70 | globalError = atof(argv[6]); 71 | break; 72 | default: 73 | std::cerr << "Usage: ./HW6 input_source_file input_dest_filename [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl; 74 | exit(1); 75 | } 76 | 77 | //load the image and give us our input and output pointers 78 | preProcess(&h_sourceImg, numRowsSource, numColsSource, 79 | &h_destImg, 80 | &h_blendedImg, input_source_file, input_dest_file); 81 | 82 | GpuTimer timer; 83 | timer.Start(); 84 | 85 | //call 
the students' code 86 | your_blend(h_sourceImg, numRowsSource, numColsSource, 87 | h_destImg, 88 | h_blendedImg); 89 | 90 | timer.Stop(); 91 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 92 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed()); 93 | printf("\n"); 94 | if (err < 0) { 95 | //Couldn't print! Probably the student closed stdout - bad news 96 | std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl; 97 | exit(1); 98 | } 99 | 100 | //check results and output the tone-mapped image 101 | postProcess(h_blendedImg, numRowsSource, numColsSource, output_file); 102 | 103 | // calculate the reference image 104 | uchar4* h_reference = new uchar4[numRowsSource*numColsSource]; 105 | reference_calc(h_sourceImg, numRowsSource, numColsSource, 106 | h_destImg, h_reference); 107 | 108 | // save the reference image 109 | postProcess(h_reference, numRowsSource, numColsSource, reference_file); 110 | 111 | compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError); 112 | 113 | delete[] h_reference; 114 | delete[] h_destImg; 115 | delete[] h_sourceImg; 116 | delete[] h_blendedImg; 117 | return 0; 118 | } 119 | 120 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/reference_calc.cpp: -------------------------------------------------------------------------------- 1 | //Udacity HW 6 2 | //Poisson Blending Reference Calculation 3 | 4 | #include "utils.h" 5 | #include 6 | 7 | //Performs one iteration of the solver 8 | void computeIteration(const unsigned char* const dstImg, 9 | const unsigned char* const strictInteriorPixels, 10 | const unsigned char* const borderPixels, 11 | const std::vector& interiorPixelList, 12 | const size_t numColsSource, 13 | const float* const f, 14 | const float* const g, 15 | float* const f_next) 16 | { 17 | unsigned int off = interiorPixelList[0].x * numColsSource + interiorPixelList[0].y; 18 | 19 | for (size_t 
i = 0; i < interiorPixelList.size(); ++i) { 20 | float blendedSum = 0.f; 21 | float borderSum = 0.f; 22 | 23 | uint2 coord = interiorPixelList[i]; 24 | 25 | unsigned int offset = coord.x * numColsSource + coord.y; 26 | 27 | //process all 4 neighbor pixels 28 | //for each pixel if it is an interior pixel 29 | //then we add the previous f, otherwise if it is a 30 | //border pixel then we add the value of the destination 31 | //image at the border. These border values are our boundary 32 | //conditions. 33 | if (strictInteriorPixels[offset - 1]) { 34 | blendedSum += f[offset - 1]; 35 | } 36 | else { 37 | borderSum += dstImg[offset - 1]; 38 | } 39 | 40 | if (strictInteriorPixels[offset + 1]) { 41 | blendedSum += f[offset + 1]; 42 | } 43 | else { 44 | borderSum += dstImg[offset + 1]; 45 | } 46 | 47 | if (strictInteriorPixels[offset - numColsSource]) { 48 | blendedSum += f[offset - numColsSource]; 49 | } 50 | else { 51 | borderSum += dstImg[offset - numColsSource]; 52 | } 53 | 54 | if (strictInteriorPixels[offset + numColsSource]) { 55 | blendedSum += f[offset + numColsSource]; 56 | } 57 | else { 58 | borderSum += dstImg[offset + numColsSource]; 59 | } 60 | 61 | float f_next_val = (blendedSum + borderSum + g[offset]) / 4.f; 62 | 63 | f_next[offset] = std::min(255.f, std::max(0.f, f_next_val)); //clip to [0, 255] 64 | } 65 | 66 | } 67 | 68 | //pre-compute the values of g, which depend only the source image 69 | //and aren't iteration dependent. 
70 | void computeG(const unsigned char* const channel, 71 | float* const g, 72 | const size_t numColsSource, 73 | const std::vector& interiorPixelList) 74 | { 75 | for (size_t i = 0; i < interiorPixelList.size(); ++i) { 76 | uint2 coord = interiorPixelList[i]; 77 | unsigned int offset = coord.x * numColsSource + coord.y; 78 | 79 | float sum = 4.f * channel[offset]; 80 | 81 | sum -= (float)channel[offset - 1] + (float)channel[offset + 1]; 82 | sum -= (float)channel[offset + numColsSource] + (float)channel[offset - numColsSource]; 83 | 84 | g[offset] = sum; 85 | } 86 | } 87 | 88 | void reference_calc(const uchar4* const h_sourceImg, 89 | const size_t numRowsSource, const size_t numColsSource, 90 | const uchar4* const h_destImg, 91 | uchar4* const h_blendedImg){ 92 | 93 | //we need to create a list of border pixels and interior pixels 94 | //this is a conceptually simple implementation, not a particularly efficient one... 95 | 96 | //first create mask 97 | size_t srcSize = numRowsSource * numColsSource; 98 | unsigned char* mask = new unsigned char[srcSize]; 99 | 100 | for (int i = 0; i < srcSize; ++i) { 101 | mask[i] = (h_sourceImg[i].x + h_sourceImg[i].y + h_sourceImg[i].z < 3 * 255) ? 1 : 0; 102 | } 103 | 104 | //next compute strictly interior pixels and border pixels 105 | unsigned char *borderPixels = new unsigned char[srcSize]; 106 | unsigned char *strictInteriorPixels = new unsigned char[srcSize]; 107 | 108 | std::vector interiorPixelList; 109 | 110 | //the source region in the homework isn't near an image boundary, so we can 111 | //simplify the conditionals a little... 
112 | for (size_t r = 1; r < numRowsSource - 1; ++r) { 113 | for (size_t c = 1; c < numColsSource - 1; ++c) { 114 | if (mask[r * numColsSource + c]) { 115 | if (mask[(r -1) * numColsSource + c] && mask[(r + 1) * numColsSource + c] && 116 | mask[r * numColsSource + c - 1] && mask[r * numColsSource + c + 1]) { 117 | strictInteriorPixels[r * numColsSource + c] = 1; 118 | borderPixels[r * numColsSource + c] = 0; 119 | interiorPixelList.push_back(make_uint2(r, c)); 120 | } 121 | else { 122 | strictInteriorPixels[r * numColsSource + c] = 0; 123 | borderPixels[r * numColsSource + c] = 1; 124 | } 125 | } 126 | else { 127 | strictInteriorPixels[r * numColsSource + c] = 0; 128 | borderPixels[r * numColsSource + c] = 0; 129 | 130 | } 131 | } 132 | } 133 | 134 | //split the source and destination images into their respective 135 | //channels 136 | unsigned char* red_src = new unsigned char[srcSize]; 137 | unsigned char* blue_src = new unsigned char[srcSize]; 138 | unsigned char* green_src = new unsigned char[srcSize]; 139 | 140 | for (int i = 0; i < srcSize; ++i) { 141 | red_src[i] = h_sourceImg[i].x; 142 | blue_src[i] = h_sourceImg[i].y; 143 | green_src[i] = h_sourceImg[i].z; 144 | } 145 | 146 | unsigned char* red_dst = new unsigned char[srcSize]; 147 | unsigned char* blue_dst = new unsigned char[srcSize]; 148 | unsigned char* green_dst = new unsigned char[srcSize]; 149 | 150 | for (int i = 0; i < srcSize; ++i) { 151 | red_dst[i] = h_destImg[i].x; 152 | blue_dst[i] = h_destImg[i].y; 153 | green_dst[i] = h_destImg[i].z; 154 | } 155 | 156 | //next we'll precompute the g term - it never changes, no need to recompute every iteration 157 | float *g_red = new float[srcSize]; 158 | float *g_blue = new float[srcSize]; 159 | float *g_green = new float[srcSize]; 160 | 161 | memset(g_red, 0, srcSize * sizeof(float)); 162 | memset(g_blue, 0, srcSize * sizeof(float)); 163 | memset(g_green, 0, srcSize * sizeof(float)); 164 | 165 | computeG(red_src, g_red, numColsSource, interiorPixelList); 
166 | computeG(blue_src, g_blue, numColsSource, interiorPixelList); 167 | computeG(green_src, g_green, numColsSource, interiorPixelList); 168 | 169 | //for each color channel we'll need two buffers and we'll ping-pong between them 170 | float *blendedValsRed_1 = new float[srcSize]; 171 | float *blendedValsRed_2 = new float[srcSize]; 172 | 173 | float *blendedValsBlue_1 = new float[srcSize]; 174 | float *blendedValsBlue_2 = new float[srcSize]; 175 | 176 | float *blendedValsGreen_1 = new float[srcSize]; 177 | float *blendedValsGreen_2 = new float[srcSize]; 178 | 179 | //IC is the source image, copy over 180 | for (size_t i = 0; i < srcSize; ++i) { 181 | blendedValsRed_1[i] = red_src[i]; 182 | blendedValsRed_2[i] = red_src[i]; 183 | blendedValsBlue_1[i] = blue_src[i]; 184 | blendedValsBlue_2[i] = blue_src[i]; 185 | blendedValsGreen_1[i] = green_src[i]; 186 | blendedValsGreen_2[i] = green_src[i]; 187 | } 188 | 189 | //Perform the solve on each color channel 190 | const size_t numIterations = 800; 191 | for (size_t i = 0; i < numIterations; ++i) { 192 | computeIteration(red_dst, strictInteriorPixels, borderPixels, 193 | interiorPixelList, numColsSource, blendedValsRed_1, g_red, 194 | blendedValsRed_2); 195 | 196 | std::swap(blendedValsRed_1, blendedValsRed_2); 197 | } 198 | 199 | for (size_t i = 0; i < numIterations; ++i) { 200 | computeIteration(blue_dst, strictInteriorPixels, borderPixels, 201 | interiorPixelList, numColsSource, blendedValsBlue_1, g_blue, 202 | blendedValsBlue_2); 203 | 204 | std::swap(blendedValsBlue_1, blendedValsBlue_2); 205 | } 206 | 207 | for (size_t i = 0; i < numIterations; ++i) { 208 | computeIteration(green_dst, strictInteriorPixels, borderPixels, 209 | interiorPixelList, numColsSource, blendedValsGreen_1, g_green, 210 | blendedValsGreen_2); 211 | 212 | std::swap(blendedValsGreen_1, blendedValsGreen_2); 213 | } 214 | std::swap(blendedValsRed_1, blendedValsRed_2); //put output into _2 215 | std::swap(blendedValsBlue_1, blendedValsBlue_2); 
//put output into _2 216 | std::swap(blendedValsGreen_1, blendedValsGreen_2); //put output into _2 217 | 218 | //copy the destination image to the output 219 | memcpy(h_blendedImg, h_destImg, sizeof(uchar4) * srcSize); 220 | 221 | //copy computed values for the interior into the output 222 | for (size_t i = 0; i < interiorPixelList.size(); ++i) { 223 | uint2 coord = interiorPixelList[i]; 224 | 225 | unsigned int offset = coord.x * numColsSource + coord.y; 226 | 227 | h_blendedImg[offset].x = blendedValsRed_2[offset]; 228 | h_blendedImg[offset].y = blendedValsBlue_2[offset]; 229 | h_blendedImg[offset].z = blendedValsGreen_2[offset]; 230 | } 231 | 232 | //wow, we allocated a lot of memory! 233 | delete[] mask; 234 | delete[] blendedValsRed_1; 235 | delete[] blendedValsRed_2; 236 | delete[] blendedValsBlue_1; 237 | delete[] blendedValsBlue_2; 238 | delete[] blendedValsGreen_1; 239 | delete[] blendedValsGreen_2; 240 | delete[] g_red; 241 | delete[] g_blue; 242 | delete[] g_green; 243 | delete[] red_src; 244 | delete[] red_dst; 245 | delete[] blue_src; 246 | delete[] blue_dst; 247 | delete[] green_src; 248 | delete[] green_dst; 249 | delete[] borderPixels; 250 | delete[] strictInteriorPixels; 251 | } 252 | 253 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/reference_calc.h: -------------------------------------------------------------------------------- 1 | #ifndef REFERENCE_H__ 2 | #define REFERENCE_H__ 3 | 4 | void reference_calc(const uchar4* const h_sourceImg, 5 | const size_t numRowsSource, const size_t numColsSource, 6 | const uchar4* const h_destImg, 7 | uchar4* const h_blendedImg); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/source.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem 
Sets/Problem Set 6/source.png -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/student_func.cu: -------------------------------------------------------------------------------- 1 | //Udacity HW 6 2 | //Poisson Blending 3 | 4 | /* Background 5 | ========== 6 | 7 | The goal for this assignment is to take one image (the source) and 8 | paste it into another image (the destination) attempting to match the 9 | two images so that the pasting is non-obvious. This is 10 | known as a "seamless clone". 11 | 12 | The basic ideas are as follows: 13 | 14 | 1) Figure out the interior and border of the source image 15 | 2) Use the values of the border pixels in the destination image 16 | as boundary conditions for solving a Poisson equation that tells 17 | us how to blend the images. 18 | 19 | No pixels from the destination except pixels on the border 20 | are used to compute the match. 21 | 22 | Solving the Poisson Equation 23 | ============================ 24 | 25 | There are multiple ways to solve this equation - we choose an iterative 26 | method - specifically the Jacobi method. Iterative methods start with 27 | a guess of the solution and then iterate to try and improve the guess 28 | until it stops changing. If the problem was well-suited for the method 29 | then it will stop and where it stops will be the solution. 30 | 31 | The Jacobi method is the simplest iterative method and converges slowly - 32 | that is we need a lot of iterations to get to the answer, but it is the 33 | easiest method to write. 34 | 35 | Jacobi Iterations 36 | ================= 37 | 38 | Our initial guess is going to be the source image itself. This is a pretty 39 | good guess for what the blended image will look like and it means that 40 | we won't have to do as many iterations compared to if we had started far 41 | from the final solution. 
42 | 43 | ImageGuess_prev (Floating point) 44 | ImageGuess_next (Floating point) 45 | 46 | DestinationImg 47 | SourceImg 48 | 49 | Follow these steps to implement one iteration: 50 | 51 | 1) For every pixel p in the interior, compute two sums over the four neighboring pixels: 52 | Sum1: If the neighbor is in the interior then += ImageGuess_prev[neighbor] 53 | else if the neighbor is on the border then += DestinationImg[neighbor] 54 | 55 | Sum2: += SourceImg[p] - SourceImg[neighbor] (for all four neighbors) 56 | 57 | 2) Calculate the new pixel value: 58 | float newVal= (Sum1 + Sum2) / 4.f <------ Notice that the result is FLOATING POINT 59 | ImageGuess_next[p] = min(255, max(0, newVal)); //clamp to [0, 255] 60 | 61 | 62 | In this assignment we will do 800 iterations. 63 | */ 64 | 65 | 66 | 67 | #include "utils.h" 68 | #include 69 | 70 | void your_blend(const uchar4* const h_sourceImg, //IN 71 | const size_t numRowsSource, const size_t numColsSource, 72 | const uchar4* const h_destImg, //IN 73 | uchar4* const h_blendedImg) //OUT 74 | { 75 | 76 | /* To Recap here are the steps you need to implement 77 | 78 | 1) Compute a mask of the pixels from the source image to be copied 79 | The pixels that shouldn't be copied are completely white, they 80 | have R=255, G=255, B=255. Any other pixels SHOULD be copied. 81 | 82 | 2) Compute the interior and border regions of the mask. An interior 83 | pixel has all 4 neighbors also inside the mask. A border pixel is 84 | in the mask itself, but has at least one neighbor that isn't. 85 | 86 | 3) Separate out the incoming image into three separate channels 87 | 88 | 4) Create two float(!) buffers for each color channel that will 89 | act as our guesses. Initialize them to the respective color 90 | channel of the source image since that will act as our initial guess. 91 | 92 | 5) For each color channel perform the Jacobi iteration described 93 | above 800 times. 
94 | 95 | 6) Create the output image by replacing all the interior pixels 96 | in the destination image with the result of the Jacobi iterations. 97 | Just cast the floating point values to unsigned chars since we have 98 | already made sure to clamp them to the correct range. 99 | 100 | Since this is the final assignment we provide little boilerplate code to 101 | help you. Notice that all the input/output pointers are HOST pointers. 102 | 103 | You will have to allocate all of your own GPU memory and perform your own 104 | memcopies to get data in and out of the GPU memory. 105 | 106 | Remember to wrap all of your calls with checkCudaErrors() to catch any 107 | thing that might go wrong. After each kernel call do: 108 | 109 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 110 | 111 | to catch any errors that happened while executing the kernel. 112 | */ 113 | } 114 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_TIMER_H__ 2 | #define GPU_TIMER_H__ 3 | 4 | #include 5 | 6 | struct GpuTimer 7 | { 8 | cudaEvent_t start; 9 | cudaEvent_t stop; 10 | 11 | GpuTimer() 12 | { 13 | cudaEventCreate(&start); 14 | cudaEventCreate(&stop); 15 | } 16 | 17 | ~GpuTimer() 18 | { 19 | cudaEventDestroy(start); 20 | cudaEventDestroy(stop); 21 | } 22 | 23 | void Start() 24 | { 25 | cudaEventRecord(start, 0); 26 | } 27 | 28 | void Stop() 29 | { 30 | cudaEventRecord(stop, 0); 31 | } 32 | 33 | float Elapsed() 34 | { 35 | float elapsed; 36 | cudaEventSynchronize(stop); 37 | cudaEventElapsedTime(&elapsed, start, stop); 38 | return elapsed; 39 | } 40 | }; 41 | 42 | #endif /* GPU_TIMER_H__ */ 43 | -------------------------------------------------------------------------------- /Problem Sets/Problem Set 6/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H__ 2 | 
#define UTILS_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__) 13 | 14 | template 15 | void check(T err, const char* const func, const char* const file, const int line) { 16 | if (err != cudaSuccess) { 17 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl; 18 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl; 19 | exit(1); 20 | } 21 | } 22 | 23 | template 24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) { 25 | //check that the GPU result matches the CPU result 26 | for (size_t i = 0; i < numElem; ++i) { 27 | if (ref[i] != gpu[i]) { 28 | std::cerr << "Difference at pos " << i << std::endl; 29 | //the + is magic to convert char to int without messing 30 | //with other types 31 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 32 | "\nGPU : " << +gpu[i] << std::endl; 33 | exit(1); 34 | } 35 | } 36 | } 37 | 38 | template 39 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) { 40 | assert(eps1 >= 0 && eps2 >= 0); 41 | unsigned long long totalDiff = 0; 42 | unsigned numSmallDifferences = 0; 43 | for (size_t i = 0; i < numElem; ++i) { 44 | //subtract smaller from larger in case of unsigned types 45 | T smaller = std::min(ref[i], gpu[i]); 46 | T larger = std::max(ref[i], gpu[i]); 47 | T diff = larger - smaller; 48 | if (diff > 0 && diff <= eps1) { 49 | numSmallDifferences++; 50 | } 51 | else if (diff > eps1) { 52 | std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl; 53 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] << 54 | "\nGPU : " << +gpu[i] << std::endl; 55 | exit(1); 56 | } 57 | totalDiff += diff * diff; 58 | } 59 | double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; 60 | if (percentSmallDifferences > eps2) { 61 | 
std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl; 62 | std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl; 63 | exit(1); 64 | } 65 | } 66 | 67 | //Uses the autodesk method of image comparison 68 | //Note that the tolerance here is in PIXELS not a percentage of input pixels 69 | template 70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance) 71 | { 72 | 73 | size_t numBadPixels = 0; 74 | for (size_t i = 0; i < numElem; ++i) { 75 | T smaller = std::min(ref[i], gpu[i]); 76 | T larger = std::max(ref[i], gpu[i]); 77 | T diff = larger - smaller; 78 | if (diff > variance) 79 | ++numBadPixels; 80 | } 81 | 82 | if (numBadPixels > tolerance) { 83 | std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl; 84 | exit(1); 85 | } 86 | } 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Archival Note 2 | This repository is deprecated; therefore, we are going to archive it. 3 | However, learners will be able to fork it to their personal GitHub account but cannot submit PRs to this repository. If you have any issues or suggestions to make, feel free to: 4 | - Utilize the https://knowledge.udacity.com/ forum to seek help on content-specific issues. 5 | - Submit a support ticket along with the link to your forked repository if (learners are) blocked for other reasons. Here are the links for the [retail consumers](https://udacity.zendesk.com/hc/en-us/requests/new) and [enterprise learners](https://udacityenterprise.zendesk.com/hc/en-us/requests/new?ticket_form_id=360000279131). 
6 | 7 | cs344 8 | ===== 9 | 10 | Introduction to Parallel Programming class code 11 | 12 | # Building on OS X 13 | 14 | These instructions are for OS X 10.9 "Mavericks". 15 | 16 | * Step 1. Build and install OpenCV. The best way to do this is with 17 | Homebrew. However, you must slightly alter the Homebrew OpenCV 18 | installation; you must build it with libstdc++ (instead of the default 19 | libc++) so that it will properly link against the nVidia CUDA dev kit. 20 | [This entry in the Udacity discussion forums](http://forums.udacity.com/questions/100132476/cuda-55-opencv-247-os-x-maverick-it-doesnt-work) describes exactly how to build a compatible OpenCV. 21 | 22 | * Step 2. You can now create 10.9-compatible makefiles, which will allow you to 23 | build and run your homework on your own machine: 24 | ``` 25 | mkdir build 26 | cd build 27 | cmake .. 28 | make 29 | ``` 30 | 31 | -------------------------------------------------------------------------------- /Student Contributions/Notes/Unit3 Notes/NotesUnit3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Student Contributions/Notes/Unit3 Notes/NotesUnit3.pdf -------------------------------------------------------------------------------- /Student Contributions/Notes/Unit3 Notes/NotesUnit3Small.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Student Contributions/Notes/Unit3 Notes/NotesUnit3Small.pdf -------------------------------------------------------------------------------- /Student Contributions/Notes/Unit4 Notes/NotesUnit4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Student Contributions/Notes/Unit4 Notes/NotesUnit4.pdf 
-------------------------------------------------------------------------------- /Student Contributions/Notes/Unit4 Notes/NotesUnit4_Small.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Student Contributions/Notes/Unit4 Notes/NotesUnit4_Small.pdf --------------------------------------------------------------------------------