├── .gitignore
├── CMakeLists.txt
├── Final
├── batcher
│ ├── batcher.cu
│ ├── compare.h
│ └── gputimer.h
├── smooth
│ ├── compare.h
│ ├── gputimer.h
│ └── smooth.cu
└── warpreduce
│ ├── part_a
│ ├── compare.h
│ ├── gputimer.h
│ └── warpreduce.cu
│ └── part_b
│ ├── compare.h
│ ├── gputimer.h
│ └── warpreduce.cu
├── Lesson Code Snippets
├── Lesson 2 Code Snippets
│ ├── associative.cu
│ ├── atomics.cu
│ ├── gputimer.h
│ ├── hello_blockIdx.cu
│ ├── hello_threadIdx.cu
│ └── memory.cu
├── Lesson 3 Code Snippets
│ ├── histo.cu
│ └── reduce.cu
├── Lesson 5 Code Snippets
│ ├── deviceQuery_simplified.cpp
│ └── transpose.cu
└── Lesson 7 Code Snippets
│ ├── cub
│ └── example_block_scan_cum.cu
│ ├── thrust
│ ├── gputimer.h
│ └── thrust_example.cu
│ └── tiling
│ ├── gputimer.h
│ ├── tiling.cu
│ └── utils.h
├── Lesson Slides
├── CS344_Lesson1_Slides.pdf
├── CS344_Lesson2_Slides.pdf
├── CS344_Lesson3_Slides.pdf
├── CS344_Lesson4_Slides.pdf
├── CS344_Lesson5_Slides.pdf
├── CS344_Lesson6.1_Slides.pdf
├── CS344_Lesson6.2_Slides.pdf
├── CS344_Lesson7.1_Slides.pdf
└── CS344_Lesson7.2_Slides.pdf
├── Problem Sets
├── Problem Set 1.zip
├── Problem Set 1
│ ├── CMakeLists.txt
│ ├── HW1.cpp
│ ├── Makefile
│ ├── cinque_terre.gold
│ ├── cinque_terre_small.jpg
│ ├── compare.cpp
│ ├── compare.h
│ ├── main.cpp
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── student_func.cu
│ ├── timer.h
│ └── utils.h
├── Problem Set 2.zip
├── Problem Set 2
│ ├── CMakeLists.txt
│ ├── HW2.cpp
│ ├── Makefile
│ ├── cinque_terre.gold
│ ├── cinque_terre_small.jpg
│ ├── compare.cpp
│ ├── compare.h
│ ├── main.cpp
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── student_func.cu
│ ├── timer.h
│ └── utils.h
├── Problem Set 3.zip
├── Problem Set 3
│ ├── CMakeLists.txt
│ ├── HW3.cu
│ ├── Makefile
│ ├── compare.cpp
│ ├── compare.h
│ ├── loadSaveImage.cpp
│ ├── loadSaveImage.h
│ ├── main.cpp
│ ├── memorial.exr
│ ├── memorial_large.exr
│ ├── memorial_png.gold
│ ├── memorial_png_large.gold
│ ├── memorial_raw.png
│ ├── memorial_raw_large.png
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── student_func.cu
│ ├── timer.h
│ └── utils.h
├── Problem Set 4.zip
├── Problem Set 4
│ ├── CMakeLists.txt
│ ├── HW4.cu
│ ├── Makefile
│ ├── compare.cpp
│ ├── compare.h
│ ├── loadSaveImage.cpp
│ ├── loadSaveImage.h
│ ├── main.cpp
│ ├── red_eye_effect.gold
│ ├── red_eye_effect_5.jpg
│ ├── red_eye_effect_template_5.jpg
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── student_func.cu
│ ├── timer.h
│ └── utils.h
├── Problem Set 5.zip
├── Problem Set 5
│ ├── CMakeLists.txt
│ ├── Makefile
│ ├── main.cu
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── student.cu
│ ├── timer.h
│ └── utils.h
├── Problem Set 6.zip
└── Problem Set 6
│ ├── CMakeLists.txt
│ ├── HW6.cu
│ ├── Makefile
│ ├── blended.gold
│ ├── compare.cpp
│ ├── compare.h
│ ├── destination.png
│ ├── loadSaveImage.cpp
│ ├── loadSaveImage.h
│ ├── main.cpp
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── source.png
│ ├── student_func.cu
│ ├── timer.h
│ └── utils.h
├── README.md
└── Student Contributions
└── Notes
├── Unit3 Notes
├── NotesUnit3.pdf
└── NotesUnit3Small.pdf
└── Unit4 Notes
├── NotesUnit4.pdf
└── NotesUnit4_Small.pdf
/.gitignore:
--------------------------------------------------------------------------------
1 | # Object files
2 | *.o
3 |
4 | # Libraries
5 | *.lib
6 | *.a
7 |
8 | # Shared objects (inc. Windows DLLs)
9 | *.dll
10 | *.so
11 | *.so.*
12 | *.dylib
13 |
14 | # Executables
15 | *.exe
16 | *.out
17 | *.app
18 |
19 | # OS X stuff
20 | .DS_Store
21 |
22 | build
23 | bin
24 |
25 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | cmake_minimum_required(VERSION 2.6 FATAL_ERROR)
9 | project(cs344)
10 |
11 | find_package(OpenCV REQUIRED)
12 | find_package(CUDA REQUIRED)
13 |
14 | link_libraries(${OpenCV_LIBS} )
15 |
16 | set (EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/bin/")
17 |
18 | if(CUDA_FOUND)
19 |   # compared to class settings, we let NVidia's FindCUDA CMake detect
20 |   # whether to build x64. We tell it to support most devices, though,
21 |   # to make sure more people can easily run class code without knowing
22 |   # about this compiler argument
23 |   set(CUDA_NVCC_FLAGS "
24 | -ccbin /usr/bin/clang;
25 | -gencode;arch=compute_30,code=sm_30;
26 | -gencode;arch=compute_35,code=sm_35;
27 | -gencode;arch=compute_35,code=compute_35;
28 | -gencode;arch=compute_20,code=sm_20;
29 | -gencode;arch=compute_11,code=sm_11;
30 | -gencode;arch=compute_12,code=sm_12;
31 | -gencode;arch=compute_13,code=sm_13;")
32 |
33 |   # add -Wextra compiler flag for gcc compilations
34 |   if (UNIX)
35 |     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -Wextra")
36 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++") # keep flags supplied by the user
37 |   endif (UNIX)
38 |
39 |   # add debugging to CUDA NVCC flags. For NVidia's NSight tools.
40 |   set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} "-G")
41 |   # the homework sources live in "Problem Sets/Problem Set N"; each is built in a space-free HWn directory
42 |   add_subdirectory ("Problem Sets/Problem Set 1" HW1)
43 |   add_subdirectory ("Problem Sets/Problem Set 2" HW2)
44 |   add_subdirectory ("Problem Sets/Problem Set 3" HW3)
45 |   add_subdirectory ("Problem Sets/Problem Set 4" HW4)
46 |   add_subdirectory ("Problem Sets/Problem Set 5" HW5)
47 |   add_subdirectory ("Problem Sets/Problem Set 6" HW6)
48 | else(CUDA_FOUND)
49 |   message("CUDA is not installed on this system.")
50 | endif()
51 |
--------------------------------------------------------------------------------
/Final/batcher/batcher.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "compare.h"
5 | #include "gputimer.h"
6 |
7 | // Sorts exactly 64 floats into ascending order with Batcher's bitonic
8 | // mergesort (http://en.wikipedia.org/wiki/Bitonic_sort).
9 | // d_in and d_out must each hold 64 floats.
10 | __global__ void batcherBitonicMergesort64(float * d_out, const float * d_in)
11 | {
12 |     // you are guaranteed this is called with <<<1, 64, 64*4>>>
13 |     extern __shared__ float sdata[];
14 |     int tid = threadIdx.x;
15 |     sdata[tid] = d_in[tid];
16 |     __syncthreads();
17 |
18 |     // After stage s every aligned run of 2^(s+1) elements is sorted. Runs
19 |     // alternate between ascending and descending so that two neighbouring
20 |     // runs form a bitonic sequence for the next stage; in stage 5 the single
21 |     // run covers all 64 elements and is ascending.
22 |     for (int stage = 0; stage <= 5; stage++)
23 |     {
24 |         // bit (stage + 1) of tid selects the direction of this thread's run
25 |         const bool ascending = ((tid >> (stage + 1)) & 1) == 0;
26 |         for (int substage = stage; substage >= 0; substage--)
27 |         {
28 |             const int partner = tid ^ (1 << substage);
29 |             // only the lower thread of each pair does the compare-exchange
30 |             if (partner > tid)
31 |             {
32 |                 const float mine = sdata[tid];
33 |                 const float theirs = sdata[partner];
34 |                 if (ascending ? (mine > theirs) : (mine < theirs))
35 |                 {
36 |                     sdata[tid] = theirs;
37 |                     sdata[partner] = mine;
38 |                 }
39 |             }
40 |             // reached by every thread: the next substage reads elements
41 |             // that another thread may just have swapped
42 |             __syncthreads();
43 |         }
44 |     }
45 |
46 |     d_out[tid] = sdata[tid];
47 | }
26 |
27 | // qsort comparator: returns -1, 0 or 1 for *a <, == or > *b (NaN compares as 0).
28 | int compareFloat (const void * a, const void * b)
29 | {
30 |     const float lhs = *(const float*)a;
31 |     const float rhs = *(const float*)b;
32 |     return (lhs > rhs) - (lhs < rhs);
33 | }
34 |
35 | // Sorts 64 random floats with the GPU kernel and checks the result against
36 | // qsort(). Returns 0 on success and 1 on a CUDA error or a wrong result.
37 | int main(int argc, char **argv)
38 | {
39 |     const int ARRAY_SIZE = 64;
40 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
41 |
42 |     // generate the input array on the host
43 |     float h_in[ARRAY_SIZE];
44 |     float h_sorted[ARRAY_SIZE];
45 |     float h_out[ARRAY_SIZE];
46 |     for(int i = 0; i < ARRAY_SIZE; i++) {
47 |         // generate random float in [0, 1]
48 |         h_in[i] = (float)random()/(float)RAND_MAX;
49 |         h_sorted[i] = h_in[i];
50 |     }
51 |     qsort(h_sorted, ARRAY_SIZE, sizeof(float), compareFloat);
52 |
53 |     // declare GPU memory pointers (null so the cleanup below is safe on any path)
54 |     float * d_in = 0, * d_out = 0;
55 |
56 |     // allocate GPU memory and transfer the input array to the GPU
57 |     cudaError_t err = cudaMalloc((void **) &d_in, ARRAY_BYTES);
58 |     if (err == cudaSuccess) err = cudaMalloc((void **) &d_out, ARRAY_BYTES);
59 |     if (err == cudaSuccess) err = cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
60 |
61 |     if (err == cudaSuccess) {
62 |         // launch the kernel
63 |         GpuTimer timer;
64 |         timer.Start();
65 |         batcherBitonicMergesort64<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(float)>>>(d_out, d_in);
66 |         timer.Stop();
67 |         err = cudaGetLastError(); // a rejected launch is only reported here
68 |
69 |         printf("Your code executed in %g ms\n", timer.Elapsed());
70 |     }
71 |
72 |     // copy back the sorted array from GPU
73 |     if (err == cudaSuccess) err = cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
74 |
75 |     int failure = 1;
76 |     if (err == cudaSuccess) {
77 |         failure = compare(h_out, h_sorted, ARRAY_SIZE);
78 |     } else {
79 |         fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
80 |     }
81 |
82 |     // free GPU memory allocation
83 |     cudaFree(d_in);
84 |     cudaFree(d_out);
85 |
86 |     return failure;
87 | }
80 |
--------------------------------------------------------------------------------
/Final/batcher/compare.h:
--------------------------------------------------------------------------------
1 | int compare(float *h_out, float *h_sorted, int ARRAY_SIZE)
2 | {
3 |     // Compares the first ARRAY_SIZE elements of h_out with h_sorted and
4 |     // prints every mismatch. Returns 0 if all elements are equal, else 1.
5 |     int failure = 0;
6 |     for(int i = 0; i < ARRAY_SIZE; i++) {
7 |         if (h_out[i] != h_sorted[i]) {
8 |             printf("Oops! Index %i is %f, should be %f\n",
9 |                    i, h_out[i], h_sorted[i]);
10 |             failure = 1;
11 |         }
12 |     }
13 |
14 |     // terminate the line so later output does not run on after the verdict
15 |     if (failure == 0) printf("Success! Your bitonic sort worked.\n");
16 |     return failure;
17 | }
--------------------------------------------------------------------------------
/Final/batcher/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer // times GPU work on stream 0 with a pair of CUDA events
5 | {
6 |     cudaEvent_t start;
7 |     cudaEvent_t stop;
8 |     // creates both events; the destructor destroys them again
9 |     GpuTimer()
10 |     {
11 |         cudaEventCreate(&start);
12 |         cudaEventCreate(&stop);
13 |     }
14 |     // NOTE: do not copy a GpuTimer; the copy's destructor would destroy the same events again
15 |     ~GpuTimer()
16 |     {
17 |         cudaEventDestroy(start);
18 |         cudaEventDestroy(stop);
19 |     }
20 |     // records the start event on stream 0
21 |     void Start()
22 |     {
23 |         cudaEventRecord(start, 0);
24 |     }
25 |     // records the stop event on stream 0
26 |     void Stop()
27 |     {
28 |         cudaEventRecord(stop, 0);
29 |     }
30 |     // waits for the stop event, then returns the start-to-stop time as reported by cudaEventElapsedTime
31 |     float Elapsed()
32 |     {
33 |         float elapsed = 0.0f; // stays 0 instead of garbage if the query below fails
34 |         cudaEventSynchronize(stop);
35 |         cudaEventElapsedTime(&elapsed, start, stop);
36 |         return elapsed;
37 |     }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Final/smooth/compare.h:
--------------------------------------------------------------------------------
1 | int compare(float* h_in, float* h_out, float* h_out_shared, float* h_cmp, int ARRAY_SIZE){
2 |     // Checks both GPU outputs element by element against the expected values
3 |     // in h_cmp and reports every mismatch on stderr (h_in is only printed).
4 |     // Returns 0 when everything matches, 1 otherwise.
5 |     int failure = 0;
6 |     for (int idx = 0; idx < ARRAY_SIZE; ++idx) {
7 |         const float expected = h_cmp[idx];
8 |         if (h_out[idx] != expected) {
9 |             fprintf(stderr, "ERROR: h_in[%d] is %f, h_out[%d] is %f, h_cmp[%d] is %f\n",
10 |                     idx, h_in[idx], idx, h_out[idx], idx, expected);
11 |             failure = 1;
12 |         }
13 |         if (h_out_shared[idx] != expected) {
14 |             fprintf(stderr, "ERROR: h_in[%d] is %f, h_out_shared[%d] is %f, h_cmp[%d] is %f\n",
15 |                     idx, h_in[idx], idx, h_out_shared[idx], idx, expected);
16 |             failure = 1;
17 |         }
18 |     }
19 |     if (failure != 0) return failure;
20 |     printf("Success! Your smooth code worked!\n");
21 |     return 0;
22 | }
--------------------------------------------------------------------------------
/Final/smooth/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer // times GPU work on stream 0 with a pair of CUDA events
5 | {
6 |     cudaEvent_t start;
7 |     cudaEvent_t stop;
8 |     // creates both events; the destructor destroys them again
9 |     GpuTimer()
10 |     {
11 |         cudaEventCreate(&start);
12 |         cudaEventCreate(&stop);
13 |     }
14 |     // NOTE: do not copy a GpuTimer; the copy's destructor would destroy the same events again
15 |     ~GpuTimer()
16 |     {
17 |         cudaEventDestroy(start);
18 |         cudaEventDestroy(stop);
19 |     }
20 |     // records the start event on stream 0
21 |     void Start()
22 |     {
23 |         cudaEventRecord(start, 0);
24 |     }
25 |     // records the stop event on stream 0
26 |     void Stop()
27 |     {
28 |         cudaEventRecord(stop, 0);
29 |     }
30 |     // waits for the stop event, then returns the start-to-stop time as reported by cudaEventElapsedTime
31 |     float Elapsed()
32 |     {
33 |         float elapsed = 0.0f; // stays 0 instead of garbage if the query below fails
34 |         cudaEventSynchronize(stop);
35 |         cudaEventElapsedTime(&elapsed, start, stop);
36 |         return elapsed;
37 |     }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Final/smooth/smooth.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "compare.h"
5 | #include "gputimer.h"
6 |
7 | // Reference: v_new[i] = 0.25*v[i-1] + 0.5*v[i] + 0.25*v[i+1], with the neighbour
8 | // index clamped at both ends. Launch with exactly one thread per element.
9 | __global__ void smooth(float * v_new, const float * v) {
10 |     // block-major index: consecutive threads touch consecutive elements
11 |     const int myIdx = blockIdx.x * blockDim.x + threadIdx.x;
12 |     const int lastIdx = blockDim.x * gridDim.x - 1;
13 |     const float myLeftElt = v[(myIdx == 0) ? 0 : myIdx - 1];
14 |     const float myElt = v[myIdx];
15 |     const float myRightElt = v[(myIdx == lastIdx) ? lastIdx : myIdx + 1];
16 |     v_new[myIdx] = 0.25f * myLeftElt + 0.5f * myElt + 0.25f * myRightElt;
17 | }
18 |
19 | // Your code
20 | // Same 3-point smoothing as smooth(), staged through shared memory. Expects
21 | // one thread per element and (blockDim.x + 2) floats of dynamic shared
22 | // memory: s[1..blockDim.x] holds the block's own elements, s[0] and
23 | // s[blockDim.x + 1] the (clamped) neighbours just outside the block.
24 | __global__ void smooth_shared(float * v_new, const float * v) {
25 |     extern __shared__ float s[];
26 |     const int tid = threadIdx.x;
27 |     const int myIdx = blockIdx.x * blockDim.x + tid;
28 |     const int lastIdx = blockDim.x * gridDim.x - 1;
29 |
30 |     s[tid + 1] = v[myIdx];
31 |     // the first and the last thread of the block also fetch the halo elements
32 |     if (tid == 0) {
33 |         s[0] = v[(myIdx == 0) ? 0 : myIdx - 1];
34 |     }
35 |     if (tid == blockDim.x - 1) {
36 |         s[blockDim.x + 1] = v[(myIdx == lastIdx) ? lastIdx : myIdx + 1];
37 |     }
38 |     __syncthreads(); // every load must have landed before neighbours are read
39 |
40 |     v_new[myIdx] = 0.25f * s[tid] + 0.5f * s[tid + 1] + 0.25f * s[tid + 2];
41 | }
25 |
26 | // Runs the reference and the shared-memory smoothing kernels on 4096 random
27 | // floats and checks both against a CPU result. Returns 0 only if both match.
28 | int main(int argc, char **argv)
29 | {
30 |
31 |     const int ARRAY_SIZE = 4096;
32 |     const int BLOCK_SIZE = 256;
33 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
34 |
35 |     // generate the input array on the host
36 |     float h_in[ARRAY_SIZE];
37 |     float h_cmp[ARRAY_SIZE];
38 |     float h_out[ARRAY_SIZE];
39 |     float h_out_shared[ARRAY_SIZE];
40 |     for(int i = 0; i < ARRAY_SIZE; i++) {
41 |         // generate random float in [0, 1]
42 |         h_in[i] = (float)random()/(float)RAND_MAX;
43 |     }
44 |     for(int i = 0; i < ARRAY_SIZE; i++) {
45 |         h_cmp[i] = (0.25f * h_in[(i == 0) ? 0 : i-1] +
46 |                     0.50f * h_in[i] +
47 |                     0.25f * h_in[(i == (ARRAY_SIZE - 1)) ? ARRAY_SIZE - 1 : i+1]);
48 |     }
49 |
50 |     // declare GPU memory pointers (null so the cleanup below is safe on any path)
51 |     float * d_in = 0, * d_out = 0, * d_out_shared = 0;
52 |
53 |     // allocate GPU memory and transfer the input array to the GPU
54 |     cudaError_t err = cudaMalloc((void **) &d_in, ARRAY_BYTES);
55 |     if (err == cudaSuccess) err = cudaMalloc((void **) &d_out, ARRAY_BYTES);
56 |     if (err == cudaSuccess) err = cudaMalloc((void **) &d_out_shared, ARRAY_BYTES);
57 |     if (err == cudaSuccess) err = cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
58 |
59 |     if (err == cudaSuccess) {
60 |         // launch the kernels: one thread per element, BLOCK_SIZE threads per block
61 |         smooth<<<ARRAY_SIZE / BLOCK_SIZE, BLOCK_SIZE>>>(d_out, d_in);
62 |         GpuTimer timer;
63 |         timer.Start();
64 |         // shared memory: one float per thread plus one neighbour on each side
65 |         smooth_shared<<<ARRAY_SIZE / BLOCK_SIZE, BLOCK_SIZE, (BLOCK_SIZE + 2) * sizeof(float)>>>(d_out_shared, d_in);
66 |         timer.Stop();
67 |         err = cudaGetLastError(); // a rejected launch is only reported here
68 |
69 |         printf("Your code executed in %g ms\n", timer.Elapsed());
70 |     }
71 |
72 |     // copy back the result from GPU
73 |     if (err == cudaSuccess) err = cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
74 |     if (err == cudaSuccess) err = cudaMemcpy(h_out_shared, d_out_shared, ARRAY_BYTES, cudaMemcpyDeviceToHost);
75 |
76 |     // testing for correctness
77 |     int failure = 1;
78 |     if (err == cudaSuccess) {
79 |         failure = compare(h_in, h_out, h_out_shared, h_cmp, ARRAY_SIZE);
80 |     } else {
81 |         fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
82 |     }
83 |
84 |     // free GPU memory allocation
85 |     cudaFree(d_in);
86 |     cudaFree(d_out);
87 |     cudaFree(d_out_shared);
88 |
89 |     return failure;
90 | }
88 |
--------------------------------------------------------------------------------
/Final/warpreduce/part_a/compare.h:
--------------------------------------------------------------------------------
1 | int compare(unsigned int h_out_shared, int sum){
2 |     // Reports whether the GPU result equals the expected sum: prints a verdict
3 |     // and returns 0 on a match, 1 otherwise.
4 |     // The comparison is done explicitly in unsigned arithmetic (what the
5 |     // implicit conversion did before) and the unsigned value is printed with %u.
6 |     if (h_out_shared == (unsigned int) sum) {
7 |         printf("Success! Your shared warp reduce worked.\n");
8 |         return 0;
9 |     }
10 |
11 |     fprintf(stderr, "GPU shared sum %u does not match expected sum %d\n",
12 |             h_out_shared, sum);
13 |     printf("Error! Your shared reduce code's output did not match sum.\n");
14 |     return 1;
15 | }
--------------------------------------------------------------------------------
/Final/warpreduce/part_a/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer // times GPU work on stream 0 with a pair of CUDA events
5 | {
6 |     cudaEvent_t start;
7 |     cudaEvent_t stop;
8 |     // creates both events; the destructor destroys them again
9 |     GpuTimer()
10 |     {
11 |         cudaEventCreate(&start);
12 |         cudaEventCreate(&stop);
13 |     }
14 |     // NOTE: do not copy a GpuTimer; the copy's destructor would destroy the same events again
15 |     ~GpuTimer()
16 |     {
17 |         cudaEventDestroy(start);
18 |         cudaEventDestroy(stop);
19 |     }
20 |     // records the start event on stream 0
21 |     void Start()
22 |     {
23 |         cudaEventRecord(start, 0);
24 |     }
25 |     // records the stop event on stream 0
26 |     void Stop()
27 |     {
28 |         cudaEventRecord(stop, 0);
29 |     }
30 |     // waits for the stop event, then returns the start-to-stop time as reported by cudaEventElapsedTime
31 |     float Elapsed()
32 |     {
33 |         float elapsed = 0.0f; // stays 0 instead of garbage if the query below fails
34 |         cudaEventSynchronize(stop);
35 |         cudaEventElapsedTime(&elapsed, start, stop);
36 |         return elapsed;
37 |     }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Final/warpreduce/part_a/warpreduce.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "compare.h"
5 | #include "gputimer.h"
6 |
7 | // Subpart A:
8 | // Write step 1 as a kernel that operates on threads 0--31.
9 | // Assume that the input flags are 0 for false and 1 for true and are stored
10 | // in a local per-thread register called p (for predicate).
11 | //
12 | // You have access to 32 words of shared memory s[0:31], with s[0]
13 | // corresponding to thread 0 and s[31] corresponding to thread 31.
14 | // You may change the values of s[0:31]. Put the return sum in s[0].
15 | // Your code should execute no more than 5 warp-wide addition operations.
16 |
17 | __device__ unsigned int shared_reduce(unsigned int p, volatile unsigned int * s) {
18 |     // Assumes values in 'p' are either 1 or 0
19 |     // Assumes s[0:31] are allocated and that threads 0--31 (one warp) call
20 |     // this function together
21 |     // Sums p across the warp with a tree reduction in s; the total ends up
22 |     // in s[0] and is returned to every thread
23 |     const unsigned int lane = threadIdx.x;
24 |     s[lane] = p;
25 |
26 |     // halve the active width each step: 16, 8, 4, 2, 1 -> 5 warp-wide additions
27 |     for (unsigned int width = 16; width > 0; width >>= 1) {
28 | #if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9
29 |         // newer GPUs do not guarantee that the lanes of a warp run in
30 |         // lockstep, so order the previous step's stores before these loads
31 |         __syncwarp();
32 | #endif
33 |         if (lane < width) {
34 |             s[lane] = s[lane] + s[lane + width];
35 |         }
36 |     }
37 | #if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9
38 |     __syncwarp(); // s[0] is written by lane 0 but read by every lane
39 | #endif
40 |
41 |     return s[0];
42 | }
30 |
31 | __global__ void reduce(unsigned int * d_out_shared,
32 |                        const unsigned int * d_in)
33 | {
34 |     // each thread feeds its own flag d_in[threadIdx.x] into shared_reduce();
35 |     // only thread 0 stores the total
36 |     extern __shared__ unsigned int s[];
37 |     const int t = threadIdx.x;
38 |     const int p = d_in[t];
39 |     const unsigned int sr = shared_reduce(p, s);
40 |     if (t != 0) return;
41 |     *d_out_shared = sr;
42 | }
43 |
44 | // Sums 32 random 0/1 flags on the GPU with shared_reduce() and checks the
45 | // result against the CPU sum. Returns 0 on success, 1 otherwise.
46 | int main(int argc, char **argv)
47 | {
48 |     const int ARRAY_SIZE = 32;
49 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(unsigned int);
50 |
51 |     // generate the input array on the host
52 |     unsigned int h_in[ARRAY_SIZE];
53 |     unsigned int sum = 0;
54 |     for(int i = 0; i < ARRAY_SIZE; i++) {
55 |         // draw a random float in [0, 1] and threshold it into a 0/1 flag
56 |         h_in[i] = (float)random()/(float)RAND_MAX > 0.5f ? 1 : 0;
57 |         sum += h_in[i];
58 |     }
59 |
60 |     // declare GPU memory pointers (null so the cleanup below is safe on any path)
61 |     unsigned int * d_in = 0, * d_out_shared = 0;
62 |
63 |     // allocate GPU memory and transfer the input array to the GPU
64 |     cudaError_t err = cudaMalloc((void **) &d_in, ARRAY_BYTES);
65 |     if (err == cudaSuccess) err = cudaMalloc((void **) &d_out_shared, sizeof(unsigned int));
66 |     if (err == cudaSuccess) err = cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
67 |
68 |     if (err == cudaSuccess) {
69 |         GpuTimer timer;
70 |         timer.Start();
71 |         // launch the kernel
72 |         reduce<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(unsigned int)>>>
73 |             (d_out_shared, d_in);
74 |         timer.Stop();
75 |         err = cudaGetLastError(); // a rejected launch is only reported here
76 |
77 |         printf("Your code executed in %g ms\n", timer.Elapsed());
78 |     }
79 |
80 |     unsigned int h_out_shared = 0;
81 |     // copy back the sum from GPU
82 |     if (err == cudaSuccess) err = cudaMemcpy(&h_out_shared, d_out_shared, sizeof(unsigned int),
83 |                                              cudaMemcpyDeviceToHost);
84 |
85 |     int failure = 1;
86 |     if (err == cudaSuccess) {
87 |         failure = compare(h_out_shared, sum);
88 |     } else {
89 |         fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
90 |     }
91 |
92 |     // free GPU memory allocation
93 |     cudaFree(d_in);
94 |     cudaFree(d_out_shared);
95 |
96 |     return failure;
97 | }
88 |
89 |
--------------------------------------------------------------------------------
/Final/warpreduce/part_b/compare.h:
--------------------------------------------------------------------------------
1 | int compare(unsigned int h_out_warp, int sum){
2 |     // Reports whether the GPU result equals the expected sum: prints a verdict
3 |     // and returns 0 on a match, 1 otherwise.
4 |     // The comparison is done explicitly in unsigned arithmetic (what the
5 |     // implicit conversion did before) and the unsigned value is printed with %u.
6 |     if (h_out_warp == (unsigned int) sum) {
7 |         printf("Success! Your warp reduce worked.\n");
8 |         return 0;
9 |     }
10 |
11 |     fprintf(stderr, "GPU warp sum %u does not match expected sum %d\n",
12 |             h_out_warp, sum);
13 |     printf("Error! Your warp reduce code's output did not match sum.\n");
14 |     return 1;
15 | }
--------------------------------------------------------------------------------
/Final/warpreduce/part_b/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer // times GPU work on stream 0 with a pair of CUDA events
5 | {
6 |     cudaEvent_t start;
7 |     cudaEvent_t stop;
8 |     // creates both events; the destructor destroys them again
9 |     GpuTimer()
10 |     {
11 |         cudaEventCreate(&start);
12 |         cudaEventCreate(&stop);
13 |     }
14 |     // NOTE: do not copy a GpuTimer; the copy's destructor would destroy the same events again
15 |     ~GpuTimer()
16 |     {
17 |         cudaEventDestroy(start);
18 |         cudaEventDestroy(stop);
19 |     }
20 |     // records the start event on stream 0
21 |     void Start()
22 |     {
23 |         cudaEventRecord(start, 0);
24 |     }
25 |     // records the stop event on stream 0
26 |     void Stop()
27 |     {
28 |         cudaEventRecord(stop, 0);
29 |     }
30 |     // waits for the stop event, then returns the start-to-stop time as reported by cudaEventElapsedTime
31 |     float Elapsed()
32 |     {
33 |         float elapsed = 0.0f; // stays 0 instead of garbage if the query below fails
34 |         cudaEventSynchronize(stop);
35 |         cudaEventElapsedTime(&elapsed, start, stop);
36 |         return elapsed;
37 |     }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Final/warpreduce/part_b/warpreduce.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "compare.h"
5 | #include "gputimer.h"
6 |
7 | // Subpart b:
8 | // Compute capability 2.0+ GPUs have support for 3 per-warp instructions.
9 | // Namely, these instructions are:
10 | //
11 | // int __popc(int x) Population Count: Returns the number of bits that are set
12 | // to 1 in the 32-bit integer x.
13 | //
14 | // int __clz(int x) Count Leading Zeros: Returns the number of consecutive zero
15 | // bits beginning at the most significant bit of the 32-bit integer x.
16 | //
17 | // int __ballot(int p) Returns a 32-bit integer in which bit k is set if and only
18 | // if the predicate p provided by the thread in lane k of the warp is non-zero.
19 |
20 | __device__ unsigned int warp_reduce(unsigned int p, volatile unsigned int * s) {
21 |     // Sums p (assumed to be 0 or 1 in every thread) across the warp and returns
22 |     // the total to every lane: ballot packs the flags into one word, popc counts them.
23 | #if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9
24 |     return __popc(__ballot_sync(0xffffffffu, p)); // assumes all 32 lanes call this together
25 | #else
26 |     return __popc(__ballot(p)); // legacy vote intrinsic of older toolkits; 's' is unused
27 | #endif
28 | }
29 |
30 | __global__ void reduce(unsigned int * d_out_warp,
31 |                        const unsigned int * d_in)
32 | {
33 |     // each thread feeds its own flag d_in[threadIdx.x] into warp_reduce();
34 |     // only thread 0 stores the total
35 |     extern __shared__ unsigned int s[];
36 |     const int t = threadIdx.x;
37 |     const int p = d_in[t];
38 |
39 |     const unsigned int wr = warp_reduce(p, s);
40 |     if (t != 0) return;
41 |     *d_out_warp = wr;
42 | }
43 |
44 | // Sums 32 random 0/1 flags on the GPU with warp_reduce() and checks the
45 | // result against the CPU sum. Returns 0 on success, 1 otherwise.
46 | int main(int argc, char **argv)
47 | {
48 |     const int ARRAY_SIZE = 32;
49 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(unsigned int);
50 |
51 |     // generate the input array on the host
52 |     unsigned int h_in[ARRAY_SIZE];
53 |     unsigned int sum = 0;
54 |     for(int i = 0; i < ARRAY_SIZE; i++) {
55 |         // draw a random float in [0, 1] and threshold it into a 0/1 flag
56 |         h_in[i] = (float)random()/(float)RAND_MAX > 0.5f ? 1 : 0;
57 |         sum += h_in[i];
58 |     }
59 |
60 |     // declare GPU memory pointers (null so the cleanup below is safe on any path)
61 |     unsigned int * d_in = 0, * d_out_warp = 0;
62 |
63 |     // allocate GPU memory and transfer the input array to the GPU
64 |     cudaError_t err = cudaMalloc((void **) &d_in, ARRAY_BYTES);
65 |     if (err == cudaSuccess) err = cudaMalloc((void **) &d_out_warp, sizeof(unsigned int));
66 |     if (err == cudaSuccess) err = cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
67 |
68 |     if (err == cudaSuccess) {
69 |         GpuTimer timer;
70 |         timer.Start();
71 |         // launch the kernel
72 |         reduce<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(unsigned int)>>>
73 |             (d_out_warp, d_in);
74 |         timer.Stop();
75 |         err = cudaGetLastError(); // a rejected launch is only reported here
76 |
77 |         printf("Your code executed in %g ms\n", timer.Elapsed());
78 |     }
79 |
80 |     unsigned int h_out_warp = 0;
81 |     // copy back the sum from GPU
82 |     if (err == cudaSuccess) err = cudaMemcpy(&h_out_warp, d_out_warp, sizeof(unsigned int),
83 |                                              cudaMemcpyDeviceToHost);
84 |
85 |     // compare your result against the expected reduce sum
86 |     int failure = 1;
87 |     if (err == cudaSuccess) {
88 |         failure = compare(h_out_warp, sum);
89 |     } else {
90 |         fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
91 |     }
92 |
93 |     // free GPU memory allocation
94 |     cudaFree(d_in);
95 |     cudaFree(d_out_warp);
96 |
97 |     return failure;
98 | }
90 |
91 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/associative.cu:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | // Prints 1 + 1e99 + -1e99 evaluated with both groupings, so the two results
4 | // can be compared side by side.
5 | int main(int argc,char **argv)
6 | {
7 |     const float one = 1.f;
8 |     const double big = 1e99;
9 |     const double leftGrouped = (one + big) + -big;
10 |     const double rightGrouped = one + (big + -big);
11 |
12 |     printf("(%g + %g) + %g == %g\n%g + (%g + %g) == %g\n",
13 |            one, big, -big, leftGrouped,
14 |            one, big, -big, rightGrouped);
15 |     return 0;
16 | }
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/atomics.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include "gputimer.h"
3 |
4 | #define NUM_THREADS 1000000
5 | #define ARRAY_SIZE 100
6 |
7 | #define BLOCK_WIDTH 1000
8 |
9 | void print_array(int *array, int size) // prints the elements as "{ a b c }" plus a newline
10 | {
11 |     printf("{ ");
12 |     for (int left = size; left > 0; --left) { printf("%d ", array[size - left]); }
13 |     printf("}\n");
14 | }
15 |
16 | __global__ void increment_naive(int *g)
17 | {
18 |     // global thread number across the whole launch
19 |     const int thread = blockIdx.x * blockDim.x + threadIdx.x;
20 |
21 |     // threads wrap around the array; plain, unsynchronised read-modify-write
22 |     int * const slot = g + (thread % ARRAY_SIZE);
23 |     *slot = *slot + 1;
24 | }
25 |
26 | __global__ void increment_atomic(int *g)
27 | {
28 |     // global thread number across the whole launch
29 |     const int thread = blockIdx.x * blockDim.x + threadIdx.x;
30 |
31 |     // threads wrap around the array; the increment itself is atomic
32 |     const int slot = thread % ARRAY_SIZE;
33 |     atomicAdd(g + slot, 1);
34 | }
35 |
36 | int main(int argc,char **argv)
37 | {
38 |     GpuTimer timer;
39 |     printf("%d total threads in %d blocks writing into %d array elements\n",
40 |            NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
41 |
42 |     // declare and allocate host memory
43 |     int h_array[ARRAY_SIZE];
44 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
45 |
46 |     // declare, allocate, and zero out GPU memory
47 |     int * d_array;
48 |     cudaMalloc((void **) &d_array, ARRAY_BYTES);
49 |     cudaMemset((void *) d_array, 0, ARRAY_BYTES);
50 |
51 |     // launch the kernel - comment out one of these (grid matches the block count printed above)
52 |     timer.Start();
53 |     // increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
54 |     increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
55 |     timer.Stop();
56 |
57 |     // copy back the array of sums from GPU and print
58 |     cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
59 |     print_array(h_array, ARRAY_SIZE);
60 |     printf("Time elapsed = %g ms\n", timer.Elapsed());
61 |
62 |     // free GPU memory allocation and exit
63 |     cudaFree(d_array);
64 |     return 0;
65 | }
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer // times GPU work on stream 0 with a pair of CUDA events
5 | {
6 |     cudaEvent_t start;
7 |     cudaEvent_t stop;
8 |     // creates both events; the destructor destroys them again
9 |     GpuTimer()
10 |     {
11 |         cudaEventCreate(&start);
12 |         cudaEventCreate(&stop);
13 |     }
14 |     // NOTE: do not copy a GpuTimer; the copy's destructor would destroy the same events again
15 |     ~GpuTimer()
16 |     {
17 |         cudaEventDestroy(start);
18 |         cudaEventDestroy(stop);
19 |     }
20 |     // records the start event on stream 0
21 |     void Start()
22 |     {
23 |         cudaEventRecord(start, 0);
24 |     }
25 |     // records the stop event on stream 0
26 |     void Stop()
27 |     {
28 |         cudaEventRecord(stop, 0);
29 |     }
30 |     // waits for the stop event, then returns the start-to-stop time as reported by cudaEventElapsedTime
31 |     float Elapsed()
32 |     {
33 |         float elapsed = 0.0f; // stays 0 instead of garbage if the query below fails
34 |         cudaEventSynchronize(stop);
35 |         cudaEventElapsedTime(&elapsed, start, stop);
36 |         return elapsed;
37 |     }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/hello_blockIdx.cu:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | #define NUM_BLOCKS 16
4 | #define BLOCK_WIDTH 1
5 |
6 | __global__ void hello() // every thread prints the index of the block it runs in
7 | {
8 |     const unsigned int block = blockIdx.x;
9 |     printf("Hello world! I'm a thread in block %d\n", block);
10 | }
11 |
12 | int main(int argc,char **argv)
13 | {
14 |     // launch the kernel: NUM_BLOCKS blocks of BLOCK_WIDTH threads each
15 |     hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();
16 |
17 |     // force the printf()s to flush
18 |     cudaDeviceSynchronize();
19 |
20 |     printf("That's all!\n");
21 |
22 |     return 0;
23 | }
24 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/hello_threadIdx.cu:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | #define NUM_BLOCKS 1
4 | #define BLOCK_WIDTH 256
5 |
6 | __global__ void hello() // every thread prints its own index within the block
7 | {
8 |     const unsigned int thread = threadIdx.x;
9 |     printf("Hello world! I'm thread %d\n", thread);
10 | }
11 |
12 | int main(int argc,char **argv)
13 | {
14 |     // launch the kernel: NUM_BLOCKS blocks of BLOCK_WIDTH threads each
15 |     hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();
16 |
17 |     // force the printf()s to flush
18 |     cudaDeviceSynchronize();
19 |
20 |     printf("That's all!\n");
21 |
22 |     return 0;
23 | }
24 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/memory.cu:
--------------------------------------------------------------------------------
1 | // Using different memory spaces in CUDA
2 | #include
3 |
4 | /**********************
5 | * using local memory *
6 | **********************/
7 |
8 | // a __device__ or __global__ function runs on the GPU
9 | __global__ void use_local_memory_GPU(float in)
10 | {
11 |     // both "f" and the parameter "in" are private to each thread (what this
12 |     // lesson calls local memory); f is initialised straight from in
13 |     float f = in; // ... real code would presumably do other stuff here ...
14 | }
15 |
16 | /**********************
17 | * using global memory *
18 | **********************/
19 |
20 | // a __global__ function runs on the GPU & can be called from host
21 | __global__ void use_global_memory_GPU(float *array)
22 | {
23 |     const unsigned int slot = threadIdx.x; // "array" is a pointer into global memory on the device
24 |     array[slot] = 2.0f * (float) slot;
25 | }
26 |
27 | /**********************
28 | * using shared memory *
29 | **********************/
30 |
31 | // (for clarity, hardcoding 128 threads/elements and omitting out-of-bounds checks)
32 | __global__ void use_shared_memory_GPU(float *array)
33 | {
34 |     // local variables, private to each thread
35 |     int i, index = threadIdx.x;
36 |     float average, sum = 0.0f;
37 |
38 |     // __shared__ variables are visible to all threads in the thread block
39 |     // and have the same lifetime as the thread block
40 |     __shared__ float sh_arr[128];
41 |
42 |     // copy data from "array" in global memory to sh_arr in shared memory.
43 |     // here, each thread is responsible for copying a single element.
44 |     sh_arr[index] = array[index];
45 |
46 |     __syncthreads(); // ensure all the writes to shared memory have completed
47 |
48 |     // now, sh_arr is fully populated. Let's find the average of all previous elements
49 |     for (i=0; i<index; i++) { sum += sh_arr[i]; }
50 |     average = sum / (index + 1.0f);
51 |
52 |     // if array[index] is greater than the average computed above, replace it with the average.
53 |     // since array[] is in global memory, this change will be seen by the host (and potentially
54 |     // other thread blocks, if any)
55 |     if (array[index] > average) { array[index] = average; }
56 |
57 |     // the store below has NO EFFECT: it modifies shared memory, but the modified data is
58 |     // never copied back to global memory and vanishes when the thread block completes
59 |     __syncthreads(); // other threads may still be summing sh_arr; do not overwrite it under them
60 |     sh_arr[index] = 3.14f;
61 | }
62 |
63 | int main(int argc, char **argv)
64 | {
65 |     /*
66 |      * First, call a kernel that shows using local memory
67 |      */
68 |     use_local_memory_GPU<<<1, 128>>>(2.0f);
69 |
70 |     /*
71 |      * Next, call a kernel that shows using global memory
72 |      */
73 |     float h_arr[128] = { 0.0f }; // convention: h_ variables live on host (zeroed, since it is uploaded below)
74 |     float *d_arr; // convention: d_ variables live on device (GPU global mem)
75 |
76 |     // allocate global memory on the device, place result in "d_arr"
77 |     cudaMalloc((void **) &d_arr, sizeof(float) * 128);
78 |     // now copy data from host memory "h_arr" to device memory "d_arr"
79 |     cudaMemcpy((void *)d_arr, (void *)h_arr, sizeof(float) * 128, cudaMemcpyHostToDevice);
80 |     // launch the kernel (1 block of 128 threads)
81 |     use_global_memory_GPU<<<1, 128>>>(d_arr); // modifies the contents of array at d_arr
82 |     // copy the modified array back to the host, overwriting contents of h_arr
83 |     cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);
84 |     // ... do other stuff ...
85 |
86 |     /*
87 |      * Next, call a kernel that shows using shared memory
88 |      */
89 |
90 |     // as before, pass in a pointer to data in global memory
91 |     use_shared_memory_GPU<<<1, 128>>>(d_arr);
92 |     // copy the modified array back to the host (destination is host memory, so DeviceToHost)
93 |     cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);
94 |     // ... do other stuff ...
95 |     cudaFree(d_arr); // release the device allocation made above
96 |     return 0;
97 | }
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/histo.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | int log2(int i) // floor(log2(i)) for i >= 1; returns 0 for i <= 1, including negative i
5 | {
6 |     int r = 0;
7 |     for (; i > 1; i >>= 1) r++; // "i > 1" also stops on negative i, where shifting never reaches 0
8 |     return r;
9 | }
10 |
11 | int bit_reverse(int w, int bits) // returns the low `bits` bits of w in reversed order
12 | {
13 |     // work in unsigned so that no shift ever moves a bit into the sign position
14 |     const unsigned int in = (unsigned int) w;
15 |     unsigned int r = 0;
16 |     for (int i = 0; i < bits; i++)
17 |     {
18 |         r |= ((in >> i) & 1u) << (bits - i - 1);
19 |     }
20 |     return (int) r;
   | }
21 |
22 | __global__ void naive_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
23 | {
24 |     // plain (non-atomic) increment of the bin selected by this thread's item
25 |     const int myId = blockDim.x * blockIdx.x + threadIdx.x;
26 |     const int myBin = d_in[myId] % BIN_COUNT;
27 |     d_bins[myBin] = d_bins[myBin] + 1;
28 | }
29 |
30 | __global__ void simple_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
31 | {
32 |     // one input element per thread; there is no bounds check, so the launch must cover d_in exactly
33 |     const int gid = blockIdx.x * blockDim.x + threadIdx.x;
34 |     const int bin = d_in[gid] % BIN_COUNT;
35 |     atomicAdd(d_bins + bin, 1);  // atomic increment, so every colliding thread is counted
36 | }
37 |
38 |
39 | int main(int argc, char **argv)  // usage: histo [0|1] - 0 = naive kernel (default), 1 = atomicAdd kernel
40 | {
41 |     int deviceCount = 0;    // stays 0 (and we exit below) if the query itself fails
42 |     cudaGetDeviceCount(&deviceCount);
43 |     if (deviceCount == 0) {
44 |         fprintf(stderr, "error: no devices supporting CUDA.\n");
45 |         exit(EXIT_FAILURE);
46 |     }
47 |     int dev = 0;
48 |     cudaSetDevice(dev);
49 |
50 |     cudaDeviceProp devProps;
51 |     if (cudaGetDeviceProperties(&devProps, dev) == 0)
52 |     {
53 |         printf("Using device %d:\n", dev);
54 |         printf("%s; global mem: %zuB; compute v%d.%d; clock: %d kHz\n",
55 |                devProps.name, devProps.totalGlobalMem,   // size_t printed as-is: an int cast truncates above 2 GiB
56 |                (int)devProps.major, (int)devProps.minor,
57 |                (int)devProps.clockRate);
58 |     }
59 |
60 |     const int ARRAY_SIZE = 65536;
61 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
62 |     const int BIN_COUNT = 16;
63 |     const int BIN_BYTES = BIN_COUNT * sizeof(int);
64 |
65 |     // generate the input array on the host: element i is i with its low log2(ARRAY_SIZE) bits reversed
66 |     int h_in[ARRAY_SIZE];
67 |     for(int i = 0; i < ARRAY_SIZE; i++) {
68 |         h_in[i] = bit_reverse(i, log2(ARRAY_SIZE));
69 |     }
70 |     int h_bins[BIN_COUNT];
71 |     for(int i = 0; i < BIN_COUNT; i++) {
72 |         h_bins[i] = 0;
73 |     }
74 |
75 |     // declare GPU memory pointers
76 |     int * d_in;
77 |     int * d_bins;
78 |
79 |     // allocate GPU memory
80 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
81 |     cudaMalloc((void **) &d_bins, BIN_BYTES);
82 |
83 |     // transfer the arrays to the GPU (the bins start out zeroed)
84 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
85 |     cudaMemcpy(d_bins, h_bins, BIN_BYTES, cudaMemcpyHostToDevice);
86 |
87 |     int whichKernel = 0;
88 |     if (argc == 2) {
89 |         whichKernel = atoi(argv[1]);
90 |     }
91 |
92 |     // launch the kernel with exactly one thread per element (the kernels have no bounds check); 64 must divide ARRAY_SIZE
93 |     switch(whichKernel) {
94 |     case 0:
95 |         printf("Running naive histo\n");
96 |         naive_histo<<<ARRAY_SIZE / 64, 64>>>(d_bins, d_in, BIN_COUNT);
97 |         break;
98 |     case 1:
99 |         printf("Running simple histo\n");
100 |         simple_histo<<<ARRAY_SIZE / 64, 64>>>(d_bins, d_in, BIN_COUNT);
101 |         break;
102 |     default:
103 |         fprintf(stderr, "error: ran no kernel\n");
104 |         exit(EXIT_FAILURE);
105 |     }
106 |
107 |     // copy the bin counts back from the GPU (this blocking copy also waits for the kernel)
108 |     cudaMemcpy(h_bins, d_bins, BIN_BYTES, cudaMemcpyDeviceToHost);
109 |
110 |     for(int i = 0; i < BIN_COUNT; i++) {
111 |         printf("bin %d: count %d\n", i, h_bins[i]);
112 |     }
113 |
114 |     // free GPU memory allocation
115 |     cudaFree(d_in);
116 |     cudaFree(d_bins);
117 |
118 |     return 0;
119 | }
120 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/reduce.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 |
5 | __global__ void global_reduce_kernel(float * d_out, float * d_in)
6 | {
7 |     const int lane = threadIdx.x;
8 |     const int gid  = blockDim.x * blockIdx.x + lane;
9 |
10 |     // in-place tree reduction over this block's slice of d_in (d_in is overwritten);
11 |     // the stride halves every round, so a non-power-of-two blockDim.x leaves elements out
12 |     unsigned int half = blockDim.x / 2;
13 |     while (half > 0)
14 |     {
15 |         if (lane < half)
16 |             d_in[gid] += d_in[gid + half];
17 |         __syncthreads();        // all adds of this round must land before the next round reads
18 |         half >>= 1;
19 |     }
20 |     // the block total now sits in the slice's first element; thread 0 publishes it
21 |     if (lane == 0)
22 |     {
23 |         d_out[blockIdx.x] = d_in[gid];
24 |     }
25 | }
26 |
27 | __global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
28 | {
29 |     // dynamically sized scratch buffer: its byte count is the 3rd launch-configuration argument
30 |     extern __shared__ float sdata[];
31 |
32 |     const int lane = threadIdx.x;
33 |     const int gid  = blockDim.x * blockIdx.x + lane;
34 |
35 |     // stage this block's slice of the input in shared memory
36 |     sdata[lane] = d_in[gid];
37 |     __syncthreads();            // the whole slice must be staged before anyone starts summing
38 |
39 |     // tree reduction carried out in shared memory; the stride halves every round
40 |     unsigned int half = blockDim.x / 2;
41 |     while (half > 0)
42 |     {
43 |         if (lane < half)
44 |             sdata[lane] += sdata[lane + half];
45 |         __syncthreads();        // finish this round before the next one reads
46 |         half >>= 1;
47 |     }
48 |
49 |     // slot 0 holds the block total; a single thread writes it out
50 |     if (lane == 0)
51 |     {
52 |         d_out[blockIdx.x] = sdata[0];
53 |     }
54 | }
55 |
56 | void reduce(float * d_out, float * d_intermediate, float * d_in,
57 |             int size, bool usesSharedMemory)
58 | {
59 |     // Two-stage sum of d_in[0..size): stage 1 leaves one partial sum per block in d_intermediate,
60 |     // stage 2 folds those partials into d_out[0]. Assumes size is a multiple of maxThreadsPerBlock
61 |     // and not greater than maxThreadsPerBlock^2, so that stage 2 fits in a single block.
62 |     const int maxThreadsPerBlock = 1024;
63 |     int threads = maxThreadsPerBlock;
64 |     int blocks = size / maxThreadsPerBlock;
65 |     // stage 1: every block reduces its own slice of `threads` elements
66 |     if (usesSharedMemory)
67 |     {
68 |         // 3rd launch argument = dynamic shared memory in bytes: one float per thread for sdata[]
69 |         shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>(d_intermediate, d_in);
70 |     }
71 |     else
72 |     {
73 |         global_reduce_kernel<<<blocks, threads>>>(d_intermediate, d_in);
74 |     }
75 |
76 |     // now we're down to one block left, so reduce it
77 |     threads = blocks; // launch one thread for each block in prev step
78 |     blocks = 1;
79 |     if (usesSharedMemory)
80 |     {
81 |         shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>(d_out, d_intermediate);
82 |     }
83 |     else
84 |     {
85 |         global_reduce_kernel<<<blocks, threads>>>(d_out, d_intermediate);
86 |     }
87 | }
88 |
89 | int main(int argc, char **argv)  // usage: reduce [0|1] - 0 = global-memory reduce (default), 1 = shared-memory reduce
90 | {
91 |     int deviceCount = 0;    // stays 0 (and we exit below) if the query itself fails
92 |     cudaGetDeviceCount(&deviceCount);
93 |     if (deviceCount == 0) {
94 |         fprintf(stderr, "error: no devices supporting CUDA.\n");
95 |         exit(EXIT_FAILURE);
96 |     }
97 |     int dev = 0;
98 |     cudaSetDevice(dev);
99 |
100 |     cudaDeviceProp devProps;
101 |     if (cudaGetDeviceProperties(&devProps, dev) == 0)
102 |     {
103 |         printf("Using device %d:\n", dev);
104 |         printf("%s; global mem: %zuB; compute v%d.%d; clock: %d kHz\n",
105 |                devProps.name, devProps.totalGlobalMem,   // size_t printed as-is: an int cast truncates above 2 GiB
106 |                (int)devProps.major, (int)devProps.minor,
107 |                (int)devProps.clockRate);
108 |     }
109 |
110 |     const int ARRAY_SIZE = 1 << 20;
111 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
112 |
113 |     // generate the input array on the host
114 |     static float h_in[ARRAY_SIZE];  // 4 MiB: static storage, because an automatic array this large can overflow the stack
115 |     float sum = 0.0f;
116 |     for(int i = 0; i < ARRAY_SIZE; i++) {
117 |         // generate random float in [-1.0f, 1.0f]
118 |         h_in[i] = -1.0f + (float)random()/((float)RAND_MAX/2.0f);
119 |         sum += h_in[i];
120 |     }
121 |
122 |     // declare GPU memory pointers
123 |     float * d_in, * d_intermediate, * d_out;
124 |
125 |     // allocate GPU memory
126 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
127 |     cudaMalloc((void **) &d_intermediate, ARRAY_BYTES); // overallocated
128 |     cudaMalloc((void **) &d_out, sizeof(float));
129 |
130 |     // transfer the input array to the GPU
131 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
132 |
133 |     int whichKernel = 0;
134 |     if (argc == 2) {
135 |         whichKernel = atoi(argv[1]);
136 |     }
137 |
138 |     cudaEvent_t start, stop;
139 |     cudaEventCreate(&start);
140 |     cudaEventCreate(&stop);
141 |     // launch the kernel 100 times between the two events; only the timing is reported
142 |     switch(whichKernel) {
143 |     case 0:
144 |         printf("Running global reduce\n");
145 |         cudaEventRecord(start, 0);
146 |         for (int i = 0; i < 100; i++)
147 |         {
148 |             reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
149 |         }
150 |         cudaEventRecord(stop, 0);
151 |         break;
152 |     case 1:
153 |         printf("Running reduce with shared mem\n");
154 |         cudaEventRecord(start, 0);
155 |         for (int i = 0; i < 100; i++)
156 |         {
157 |             reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
158 |         }
159 |         cudaEventRecord(stop, 0);
160 |         break;
161 |     default:
162 |         fprintf(stderr, "error: ran no kernel\n");
163 |         exit(EXIT_FAILURE);
164 |     }
165 |     cudaEventSynchronize(stop);
166 |     float elapsedTime;
167 |     cudaEventElapsedTime(&elapsedTime, start, stop);
168 |     elapsedTime /= 100.0f; // 100 trials
169 |
170 |     // copy back the sum from GPU
171 |     float h_out;
172 |     cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
173 |
174 |     printf("average time elapsed: %f\n", elapsedTime);
175 |     // release the timing events and the GPU buffers
176 |     cudaEventDestroy(start);
177 |     cudaEventDestroy(stop);
178 |     cudaFree(d_in);
179 |     cudaFree(d_intermediate);
180 |     cudaFree(d_out);
181 |     return 0;
182 | }
183 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 5 Code Snippets/transpose.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include "gputimer.h"
3 |
4 | const int N= 1024; // matrix is N x N floats; the code below addresses element (i,j) as index i + j*N
5 | const int K= 32; // edge length of the K x K thread blocks and shared-memory tiles used by the non-"16" kernels
6 |
7 | // Utility functions: error check, compare, print, and fill matrices. checkCudaErrors(call) hands the call's result, its source text, file and line to check().
8 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
9 |
10 | template <typename T>   // T: status type of the wrapped call; it is compared against cudaSuccess
11 | void check(T err, const char* const func, const char* const file, const int line)
12 | {
13 |     if (err != cudaSuccess) {
14 |         fprintf(stderr, "CUDA error at: %s : %d\n", file, line);      // where the failing call was written
15 |         fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);    // runtime's description, then the call text
16 |         exit(1);                                                      // any CUDA error is fatal
17 |     }
18 | }
19 |
20 | int compare_matrices(float *gpu, float *ref)
21 | {
22 |     // returns 0 when the two N*N arrays agree element for element, 1 otherwise;
23 |     // the comparison is exact (operator !=), with no floating-point tolerance
24 |     int mismatch = 0;
25 |
26 |     for (int idx = 0; idx < N * N; ++idx)
27 |     {
28 |         if (gpu[idx] != ref[idx])
29 |             mismatch = 1;   // keep scanning; only the final verdict is reported
30 |     }
31 |
32 |     return mismatch;
33 | }
34 |
35 | void print_matrix(float *mat)   // dump N lines of N values each to stdout
36 | {
37 |     for (int row = 0; row < N; ++row)
38 |     {
39 |         for (const float *cell = mat + row*N; cell < mat + (row + 1)*N; ++cell) printf("%4.4g ", *cell);
40 |         printf("\n");
41 |     }
42 | }
43 |
44 | // fill an N x N matrix with its own flat indices: mat[k] = k for k in 0..N*N-1
45 | void fill_matrix(float *mat)
46 | {
47 |     int k = 0;
48 |     while (k < N * N) { mat[k] = (float) k; ++k; }
49 | }
50 |
51 |
52 |
53 | void
54 | transpose_CPU(float in[], float out[])
55 | {
56 |     // host reference result: the value stored at index col + row*N is copied to index row + col*N
57 |     for (int row = 0; row < N; ++row)
58 |         for (int col = 0; col < N; ++col) out[row + col*N] = in[col + row*N];
59 | }
60 |
61 | // launch as <<<1,1>>>: a single GPU thread walks the whole N x N matrix
62 | __global__ void
63 | transpose_serial(float in[], float out[])
64 | {
65 |     for (int row = 0; row < N; ++row)
66 |         for (int col = 0; col < N; ++col)
67 |             out[row + col*N] = in[col + row*N];   // mirror across the diagonal
68 | }
69 |
70 | // launch as <<<1,N>>>: only threadIdx.x is used, so additional blocks would just repeat the same work
71 | __global__ void
72 | transpose_parallel_per_row(float in[], float out[])
73 | {
74 |     const int t = threadIdx.x;
75 |     float *dst = out + t*N;             // the N consecutive outputs written by this thread
76 |
77 |     for (int j = 0; j < N; ++j) dst[j] = in[t + j*N];   // reads are N apart, writes are consecutive
78 | }
79 |
80 | // one thread per matrix element; expects K x K thread blocks on an (N/K) x (N/K) grid
81 | // the thread at grid position (i,j) copies in[i + j*N] to out[j + i*N]
82 | __global__ void
83 | transpose_parallel_per_element(float in[], float out[])
84 | {
85 |     const int i = threadIdx.x + K * blockIdx.x;
86 |     const int j = threadIdx.y + K * blockIdx.y;
87 |     const float value = in[i + j*N];
88 |     out[j + i*N] = value;
89 | }
90 |
91 | // one thread per element; expects K x K thread blocks on an (N/K) x (N/K) grid
92 | // each block stages one K x K input tile in shared memory, then writes it out transposed
93 | // consecutive threadIdx.x values touch consecutive addresses on both the global read and the global write
94 | __global__ void
95 | transpose_parallel_per_element_tiled(float in[], float out[])
96 | {
97 |     __shared__ float tile[K][K];
98 |
99 |     const int tx = threadIdx.x, ty = threadIdx.y;
100 |
101 |     // the tile read at block position (bx,by) is written at block position (by,bx)
102 |     const int src = (blockIdx.x * K + tx) + (blockIdx.y * K + ty) * N;
103 |     const int dst = (blockIdx.y * K + tx) + (blockIdx.x * K + ty) * N;
104 |
105 |     // stage the tile as it is laid out in the input ...
106 |     tile[ty][tx] = in[src];
107 |     __syncthreads();    // the whole tile must be staged before a thread reads another thread's slot
108 |     // ... and transpose on the way out by swapping the two tile indices
109 |     out[dst] = tile[tx][ty];
110 | }
111 |
112 | // one thread per element; expects 16 x 16 thread blocks on an (N/16) x (N/16) grid
113 | // each block stages one 16 x 16 input tile in shared memory, then writes it out transposed
114 | // consecutive threadIdx.x values touch consecutive addresses on both the global read and the global write
115 | __global__ void
116 | transpose_parallel_per_element_tiled16(float in[], float out[])
117 | {
118 |     __shared__ float tile[16][16];
119 |
120 |     const int tx = threadIdx.x, ty = threadIdx.y;
121 |
122 |     // the tile read at block position (bx,by) is written at block position (by,bx)
123 |     const int src = (blockIdx.x * 16 + tx) + (blockIdx.y * 16 + ty) * N;
124 |     const int dst = (blockIdx.y * 16 + tx) + (blockIdx.x * 16 + ty) * N;
125 |
126 |     // stage the tile as it is laid out in the input ...
127 |     tile[ty][tx] = in[src];
128 |     __syncthreads();    // the whole tile must be staged before a thread reads another thread's slot
129 |     // ... and transpose on the way out by swapping the two tile indices
130 |     out[dst] = tile[tx][ty];
131 | }
132 |
133 | // one thread per element; expects K x K thread blocks on an (N/K) x (N/K) grid
134 | // same staging scheme as the unpadded tiled kernel, but each shared-memory row carries
135 | // one unused extra column, so a column of the tile is no longer K floats apart (avoids bank conflicts)
136 | __global__ void
137 | transpose_parallel_per_element_tiled_padded(float in[], float out[])
138 | {
139 |     __shared__ float tile[K][K+1];
140 |
141 |     const int tx = threadIdx.x, ty = threadIdx.y;
142 |
143 |     // the tile read at block position (bx,by) is written at block position (by,bx)
144 |     const int src = (blockIdx.x * K + tx) + (blockIdx.y * K + ty) * N;
145 |     const int dst = (blockIdx.y * K + tx) + (blockIdx.x * K + ty) * N;
146 |
147 |     // stage the tile as it is laid out in the input ...
148 |     tile[ty][tx] = in[src];
149 |     __syncthreads();    // the whole tile must be staged before a thread reads another thread's slot
150 |     // ... and transpose on the way out by swapping the two tile indices
151 |     out[dst] = tile[tx][ty];
152 | }
153 |
154 | // one thread per element; expects 16 x 16 thread blocks on an (N/16) x (N/16) grid
155 | // same staging scheme as the unpadded 16 x 16 kernel, but each shared-memory row carries
156 | // one unused extra column, so a column of the tile is no longer 16 floats apart (avoids bank conflicts)
157 | __global__ void
158 | transpose_parallel_per_element_tiled_padded16(float in[], float out[])
159 | {
160 |     __shared__ float tile[16][16+1];
161 |
162 |     const int tx = threadIdx.x, ty = threadIdx.y;
163 |
164 |     // the tile read at block position (bx,by) is written at block position (by,bx)
165 |     const int src = (blockIdx.x * 16 + tx) + (blockIdx.y * 16 + ty) * N;
166 |     const int dst = (blockIdx.y * 16 + tx) + (blockIdx.x * 16 + ty) * N;
167 |
168 |     // stage the tile as it is laid out in the input ...
169 |     tile[ty][tx] = in[src];
170 |     __syncthreads();    // the whole tile must be staged before a thread reads another thread's slot
171 |     // ... and transpose on the way out by swapping the two tile indices
172 |     out[dst] = tile[tx][ty];
173 | }
174 |
175 | int main(int argc, char **argv)  // times each transpose kernel once and checks it against the CPU result
176 | {
177 |     int numbytes = N * N * sizeof(float);
178 |
179 |     float *in = (float *) malloc(numbytes);
180 |     float *out = (float *) malloc(numbytes);
181 |     float *gold = (float *) malloc(numbytes);
182 |
183 |     fill_matrix(in);
184 |     transpose_CPU(in, gold);
185 |
186 |     float *d_in, *d_out;
187 |
188 |     cudaMalloc(&d_in, numbytes);
189 |     cudaMalloc(&d_out, numbytes);
190 |     cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice);
191 |
192 |     GpuTimer timer;
193 |
194 |     /*
195 |      * Now time each kernel and verify that it produces the correct result.
196 |      *
197 |      * To be really careful about benchmarking purposes, we should run every kernel once
198 |      * to "warm" the system and avoid any compilation or code-caching effects, then run
199 |      * every kernel 10 or 100 times and average the timings to smooth out any variance.
200 |      * But this makes for messy code and our goal is teaching, not detailed benchmarking.
201 |      */
202 |
203 |     timer.Start();
204 |     transpose_serial<<<1,1>>>(d_in, d_out);
205 |     timer.Stop();
206 |     cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
207 |     printf("transpose_serial: %g ms.\nVerifying transpose...%s\n",
208 |            timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
209 |
210 |     timer.Start();
211 |     transpose_parallel_per_row<<<1,N>>>(d_in, d_out);
212 |     timer.Stop();
213 |     cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
214 |     printf("transpose_parallel_per_row: %g ms.\nVerifying transpose...%s\n",
215 |            timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
216 |
217 |     dim3 blocks(N/K,N/K); // blocks per grid
218 |     dim3 threads(K,K); // threads per block
219 |
220 |     timer.Start();
221 |     transpose_parallel_per_element<<<blocks,threads>>>(d_in, d_out);
222 |     timer.Stop();
223 |     cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
224 |     printf("transpose_parallel_per_element: %g ms.\nVerifying transpose...%s\n",
225 |            timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
226 |
227 |     timer.Start();
228 |     transpose_parallel_per_element_tiled<<<blocks,threads>>>(d_in, d_out);
229 |     timer.Stop();
230 |     cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
231 |     printf("transpose_parallel_per_element_tiled %dx%d: %g ms.\nVerifying ...%s\n",
232 |            K, K, timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
233 |
234 |     dim3 blocks16x16(N/16,N/16); // blocks per grid
235 |     dim3 threads16x16(16,16); // threads per block
236 |
237 |     timer.Start();
238 |     transpose_parallel_per_element_tiled16<<<blocks16x16,threads16x16>>>(d_in, d_out);
239 |     timer.Stop();
240 |     cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
241 |     printf("transpose_parallel_per_element_tiled 16x16: %g ms.\nVerifying ...%s\n",
242 |            timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
243 |
244 |     timer.Start();
245 |     transpose_parallel_per_element_tiled_padded16<<<blocks16x16,threads16x16>>>(d_in, d_out);
246 |     timer.Stop();
247 |     cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
248 |     printf("transpose_parallel_per_element_tiled_padded 16x16: %g ms.\nVerifying...%s\n",
249 |            timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
250 |
251 |     cudaFree(d_in);
252 |     cudaFree(d_out);
253 |     free(in);       // host buffers were malloc'ed above and must be released too
254 |     free(out);
255 |     free(gold);
256 |     return 0;
257 | }
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/cub/example_block_scan_cum.cu:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /******************************************************************************
30 | * Simple demonstration of cub::BlockScan
31 | *
32 | * Example compilation string:
33 | *
34 | * nvcc example_block_scan_sum.cu -gencode=arch=compute_20,code=\"sm_20,compute_20\" -o example_block_scan_sum
35 | *
36 | ******************************************************************************/
37 |
38 | // Ensure printing of CUDA runtime errors to console (define before including cub.h)
39 | #define CUB_STDERR
40 |
41 | #include
42 | #include
43 |
44 | #include
45 |
46 | using namespace cub;
47 |
48 | //---------------------------------------------------------------------
49 | // Globals, constants and typedefs
50 | //---------------------------------------------------------------------
51 |
52 | bool g_verbose = false; // when true, Test() also prints the input tile and the GPU/reference outputs
53 | int g_iterations = 100; // number of kernel launches Test() runs and averages the cycle count over
54 |
55 |
56 | //---------------------------------------------------------------------
57 | // Kernels
58 | //---------------------------------------------------------------------
59 |
60 | /**
61 |  * Block-wide exclusive prefix sum over BLOCK_THREADS * ITEMS_PER_THREAD ints; launch with one block of
62 |  * BLOCK_THREADS threads. d_out needs one extra trailing slot, which receives the block aggregate. */
63 | template <
64 |     int BLOCK_THREADS,
65 |     int ITEMS_PER_THREAD>
66 | __global__ void BlockPrefixSumKernel(
67 |     int *d_in, // Tile of input
68 |     int *d_out, // Tile of output, followed by one slot for the aggregate
69 |     clock_t *d_elapsed) // Elapsed cycle count of block scan
70 | {
71 |     // Parameterize BlockScan type for our thread block (int items, BLOCK_THREADS threads)
72 |     typedef BlockScan<int, BLOCK_THREADS> BlockScanT;
73 |
74 |     // Shared memory
75 |     __shared__ typename BlockScanT::SmemStorage smem_storage;
76 |
77 |     // Per-thread tile data
78 |     int data[ITEMS_PER_THREAD];
79 |     BlockLoadVectorized(d_in, data);
80 |
81 |     // Start cycle timer
82 |     clock_t start = clock();
83 |
84 |     // Compute exclusive prefix sum (in place in `data`; `aggregate` receives the tile total)
85 |     int aggregate;
86 |     BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate);
87 |
88 |     // Stop cycle timer
89 |     clock_t stop = clock();
90 |
91 |     // Store output
92 |     BlockStoreVectorized(d_out, data);
93 |
94 |     // Store aggregate and elapsed clocks
95 |     if (threadIdx.x == 0)
96 |     {
97 |         *d_elapsed = (start > stop) ? start - stop : stop - start;  // absolute difference, whichever reading is larger
98 |         d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
99 |     }
100 | }
101 |
102 |
103 |
104 | //---------------------------------------------------------------------
105 | // Host utilities
106 | //---------------------------------------------------------------------
107 |
108 | /**
109 |  * Fill h_in with the pattern i % 17 and h_reference with the exclusive prefix sum of h_in.
110 |  * Returns the sum of all num_elements inputs (0 when num_elements <= 0).
111 |  */
112 | int Initialize(
113 |     int *h_in,
114 |     int *h_reference,
115 |     int num_elements)
116 | {
117 |     int running_total = 0;                  // sum of every input placed before the current slot
118 |     int i = 0;
119 |     while (i < num_elements)
120 |     {
121 |         h_in[i] = i % 17;
122 |         h_reference[i] = running_total;     // exclusive: the current input is not included yet
123 |         running_total += h_in[i];
124 |         ++i;
125 |     }
126 |
127 |     return running_total;
128 | }
129 |
130 |
131 | /**
132 |  * Run the block scan on one tile of BLOCK_THREADS * ITEMS_PER_THREAD ints g_iterations times, check the
133 |  * last run against a host reference, and print the averaged cycle count. CUDA return codes are not checked. */
134 | template <
135 |     int BLOCK_THREADS,
136 |     int ITEMS_PER_THREAD>
137 | void Test()
138 | {
139 |     const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
140 |
141 |     // Allocate host arrays (h_gpu has one extra slot for the aggregate)
142 |     int *h_in = new int[TILE_SIZE];
143 |     int *h_reference = new int[TILE_SIZE];
144 |     int *h_gpu = new int[TILE_SIZE + 1];
145 |
146 |     // Initialize problem and reference output on host
147 |     int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);
148 |
149 |     // Initialize device arrays
150 |     int *d_in = NULL;
151 |     int *d_out = NULL;
152 |     clock_t *d_elapsed = NULL;
153 |     cudaMalloc((void**)&d_in, sizeof(int) * TILE_SIZE);
154 |     cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1));
155 |     cudaMalloc((void**)&d_elapsed, sizeof(clock_t));
156 |
157 |     // Display input problem data
158 |     if (g_verbose)
159 |     {
160 |         printf("Input data: ");
161 |         for (int i = 0; i < TILE_SIZE; i++)
162 |             printf("%d, ", h_in[i]);
163 |         printf("\n\n");
164 |     }
165 |
166 |     // Copy problem to device
167 |     cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
168 |
169 |     printf("BlockScan %d items (%d threads, %d items per thread): ",
170 |         TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD);
171 |
172 |     // Run this several times and average the performance results
173 |     clock_t elapsed_scan_clocks = 0;
174 |     for (int i = 0; i < g_iterations; ++i)
175 |     {
176 |         // Run aggregate/prefix kernel; its template arguments cannot be deduced, so they are spelled out
177 |         BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<1, BLOCK_THREADS>>>(
178 |             d_in,
179 |             d_out,
180 |             d_elapsed);
181 |
182 |         // Copy results from device
183 |         clock_t scan_clocks;
184 |         cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost);
185 |         cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost);
186 |         elapsed_scan_clocks += scan_clocks;
187 |     }
188 |
189 |     // Check scanned items (stops at the first mismatch)
190 |     bool correct = true;
191 |     for (int i = 0; i < TILE_SIZE; i++)
192 |     {
193 |         if (h_gpu[i] != h_reference[i])
194 |         {
195 |             printf("Incorrect result @ offset %d (%d != %d)\n",
196 |                 i, h_gpu[i], h_reference[i]);
197 |             correct = false;
198 |             break;
199 |         }
200 |     }
201 |
202 |     // Check total aggregate
203 |     if (h_gpu[TILE_SIZE] != h_aggregate)
204 |     {
205 |         printf("Incorrect aggregate (%d != %d)\n", h_gpu[TILE_SIZE], h_aggregate);
206 |         correct = false;
207 |     }
208 |     if (correct) printf("Correct!\n");
209 |
210 |     // Display results problem data
211 |     if (g_verbose)
212 |     {
213 |         printf("GPU output (reference output): ");
214 |         for (int i = 0; i < TILE_SIZE; i++)
215 |             printf("%d (%d), ", h_gpu[i], h_reference[i]);
216 |         printf("\n");
217 |         printf("GPU aggregate (reference aggregate): %d (%d)", h_gpu[TILE_SIZE], h_aggregate);
218 |         printf("\n\n");
219 |     }
220 |
221 |     // Display timing results
222 |     printf("Average clocks per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations);
223 |
224 |     // Cleanup
225 |     if (h_in) delete[] h_in;
226 |     if (h_reference) delete[] h_reference;
227 |     if (h_gpu) delete[] h_gpu;
228 |     if (d_in) cudaFree(d_in);
229 |     if (d_out) cudaFree(d_out);
230 |     if (d_elapsed) cudaFree(d_elapsed);
231 | }
232 |
233 |
234 | /**
235 |  * Main: prints the name of device 0, then runs Test() for a series of thread/item splits of a 1024-item tile
236 |  */
237 | int main(int argc, char** argv)
238 | {
239 |     // Display GPU name; bail out if device 0 cannot be queried (props would be left uninitialized)
240 |     cudaDeviceProp props;
241 |     if (cudaGetDeviceProperties(&props, 0) != cudaSuccess) { fprintf(stderr, "error: cannot query CUDA device 0\n"); return 1; }
242 |     printf("Using device %s\n", props.name);
243 |
244 |     /** Add tests here **/
245 |
246 |     // Run tests
247 |     Test<1024, 1>();
248 |     Test<512, 2>();
249 |     Test<256, 4>();
250 |     Test<128, 8>();
251 |     Test<64, 16>();
252 |     Test<32, 32>();
253 |     Test<16, 64>();
254 |
255 |     /****/
256 |
257 |     return 0;
258 | }
259 |
260 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer
5 | {
6 |     cudaEvent_t start;
7 |     cudaEvent_t stop;
8 |
9 |     // Creates the two events; the return codes of the CUDA calls are not inspected.
10 |     GpuTimer() {
11 |         cudaEventCreate(&start);
12 |         cudaEventCreate(&stop);
13 |     }
14 |
15 |     // Releases both events.
16 |     ~GpuTimer() {
17 |         cudaEventDestroy(start);
18 |         cudaEventDestroy(stop);
19 |     }
20 |
21 |     // Marks the beginning of the timed region on stream 0.
22 |     void Start() {
23 |         cudaEventRecord(start, 0);
24 |     }
25 |
26 |     // Marks the end of the timed region on stream 0.
27 |     void Stop() { cudaEventRecord(stop, 0); }
28 |
29 |     // Blocks until the stop event has completed, then returns the
30 |     // start-to-stop time reported by cudaEventElapsedTime (milliseconds).
31 |     float Elapsed()
32 |     {
33 |         float ms;
34 |         cudaEventSynchronize(stop);
35 |         cudaEventElapsedTime(&ms, start, stop);
36 |         return ms;
37 |     }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/thrust_example.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include "gputimer.h"
10 |
11 | int main(void)  // sorts N random ints on the GPU with thrust::sort and prints the elapsed time
12 | {
13 |     // generate N random numbers serially
14 |     int N = 1000000;
15 |     thrust::host_vector<int> h_vec(N);      // element type is int: the values come from rand()
16 |     std::generate(h_vec.begin(), h_vec.end(), rand);
17 |
18 |     // transfer data to the device
19 |     thrust::device_vector<int> d_vec = h_vec;
20 |
21 |     // sort data on the device (846M keys per second on GeForce GTX 480)
22 |     GpuTimer timer;
23 |     timer.Start();
24 |     thrust::sort(d_vec.begin(), d_vec.end());
25 |     timer.Stop();
26 |
27 |     // transfer data back to host
28 |     thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
29 |
30 |     printf("Thrust sorted %d keys in %g ms\n", N, timer.Elapsed());
31 |     return 0;
32 | }
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer
5 | {
6 |     cudaEvent_t start;
7 |     cudaEvent_t stop;
8 |
9 |     // Creates the two events; the return codes of the CUDA calls are not inspected.
10 |     GpuTimer() {
11 |         cudaEventCreate(&start);
12 |         cudaEventCreate(&stop);
13 |     }
14 |
15 |     // Releases both events.
16 |     ~GpuTimer() {
17 |         cudaEventDestroy(start);
18 |         cudaEventDestroy(stop);
19 |     }
20 |
21 |     // Marks the beginning of the timed region on stream 0.
22 |     void Start() {
23 |         cudaEventRecord(start, 0);
24 |     }
25 |
26 |     // Marks the end of the timed region on stream 0.
27 |     void Stop() { cudaEventRecord(stop, 0); }
28 |
29 |     // Blocks until the stop event has completed, then returns the
30 |     // start-to-stop time reported by cudaEventElapsedTime (milliseconds).
31 |     float Elapsed()
32 |     {
33 |         float ms;
34 |         cudaEventSynchronize(stop);
35 |         cudaEventElapsedTime(&ms, start, stop);
36 |         return ms;
37 |     }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/tiling.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include "gputimer.h"
3 | #include "utils.h"
4 |
5 | const int BLOCKSIZE = 128; // block size the TODO below assumes (128 threads per block)
6 | const int NUMBLOCKS = 1000; // set this to 1 or 2 for debugging
7 | const int N = BLOCKSIZE*NUMBLOCKS; // presumably the element count of every array in this file - confirm against main
8 |
9 | /*
10 | * TODO: modify the foo and bar kernels to use tiling:
11 | * - copy the input data to shared memory
12 | * - perform the computation there
13 | * - copy the result back to global memory
14 | * - assume thread blocks of 128 threads
15 | * - handle intra-block boundaries correctly
16 | * You can ignore boundary conditions (we ignore the first 2 and last 2 elements)
17 | */
18 | __global__ void foo(float out[], float A[], float B[], float C[], float D[], float E[]){
19 |     // element-wise mean of five arrays, one element per thread; the index is not bounds-checked
20 |     const int gid = blockIdx.x*blockDim.x + threadIdx.x;
21 |     const float total = A[gid] + B[gid] + C[gid] + D[gid] + E[gid];
22 |     out[gid] = total / 5.0f;
23 | }
24 |
25 | __global__ void bar(float out[], float in[])  // 5-point moving average, one element per thread; assumes in/out hold N floats - confirm against main
26 | {
27 |     int i = threadIdx.x + blockIdx.x*blockDim.x;
28 |     if (i < 2 || i >= N - 2) return;    // first/last two elements are ignored (see the TODO above); without this guard they read outside in[]
29 |     out[i] = (in[i-2] + in[i-1] + in[i] + in[i+1] + in[i+2]) / 5.0f;
30 | }
31 |
32 | void cpuFoo(float out[], float A[], float B[], float C[], float D[], float E[])
33 | {
34 | for (int i=0; i>>(d_fooOut, d_fooA, d_fooB, d_fooC, d_fooD, d_fooE);
91 | fooTimer.Stop();
92 |
93 | barTimer.Start();
94 | bar<<>>(d_barOut, d_barIn);
95 | barTimer.Stop();
96 |
97 | cudaMemcpy(fooOut, d_fooOut, numBytes, cudaMemcpyDeviceToHost);
98 | cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost);
99 | printf("foo<<<>>>(): %g ms elapsed. Verifying solution...", fooTimer.Elapsed());
100 | compareArrays(ref_fooOut, fooOut, N);
101 | printf("bar<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed());
102 | compareArrays(ref_barOut, barOut, N);
103 | }
104 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/utils.h:
--------------------------------------------------------------------------------
1 | // error checking utility functions
2 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
3 |
4 | template <typename T>   // T: status type of the wrapped call; it is compared against cudaSuccess
5 | void check(T err, const char* const func, const char* const file, const int line)
6 | {
7 |     if (err != cudaSuccess) {
8 |         fprintf(stderr, "CUDA error at: %s : %d\n", file, line);      // where the failing call was written
9 |         fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);    // runtime's description, then the call text
10 |         exit(1);                                                      // any CUDA error is fatal
11 |     }
12 | }
13 |
14 | void printArray(float in[], int N)
15 | {
16 | for (int i=0; i CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # collect source files
9 |
10 | file( GLOB hdr *.hpp *.h )
11 | file( GLOB cu *.cu)
12 | SET (HW1_files main.cpp reference_calc.cpp compare.cpp)
13 |
14 | CUDA_ADD_EXECUTABLE(HW1 ${HW1_files} ${hdr} ${cu})
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/HW1.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "utils.h"
5 | #include
6 | #include
7 | #include
8 |
9 | cv::Mat imageRGBA; //host RGBA image, filled by preProcess
10 | cv::Mat imageGrey; //host single-channel output image, created by preProcess
11 | //copies of the device pointers handed out by preProcess, kept so cleanup() can free them
12 | uchar4 *d_rgbaImage__;
13 | unsigned char *d_greyImage__;
14 | //image dimensions, read from imageRGBA
15 | size_t numRows() { return imageRGBA.rows; }
16 | size_t numCols() { return imageRGBA.cols; }
17 |
18 | //Loads `filename`, converts it to RGBA, and prepares host and device buffers.
19 | //Return type is void: any internal error is reported and ends the program via
20 | //exit(1), so there are no error codes to return.
21 | //Outputs (via the pointer arguments): the host RGBA input and host grey output
22 | //buffers, plus newly allocated device buffers for both.
23 | void preProcess(uchar4 **inputImage, unsigned char **greyImage,
24 | uchar4 **d_rgbaImage, unsigned char **d_greyImage,
25 | const std::string &filename) {
26 | //make sure the context initializes ok
27 | checkCudaErrors(cudaFree(0));
28 |
29 | cv::Mat image;
30 | image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
31 | if (image.empty()) {
32 | std::cerr << "Couldn't open file: " << filename << std::endl;
33 | exit(1);
34 | }
35 | //convert from BGR into the file-level RGBA image
36 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
37 |
38 | //allocate memory for the output
39 | imageGrey.create(image.rows, image.cols, CV_8UC1);
40 |
41 | //This shouldn't ever happen given the way the images are created
42 | //at least based upon my limited understanding of OpenCV, but better to check
43 | if (!imageRGBA.isContinuous() || !imageGrey.isContinuous()) {
44 | std::cerr << "Images aren't continuous!! Exiting." << std::endl;
45 | exit(1);
46 | }
47 | //hand the caller pointers to the first row of each host image
48 | *inputImage = (uchar4 *)imageRGBA.ptr(0);
49 | *greyImage = imageGrey.ptr(0);
50 |
51 | const size_t numPixels = numRows() * numCols();
52 | //allocate memory on the device for both input and output
53 | checkCudaErrors(cudaMalloc(d_rgbaImage, sizeof(uchar4) * numPixels));
54 | checkCudaErrors(cudaMalloc(d_greyImage, sizeof(unsigned char) * numPixels));
55 | checkCudaErrors(cudaMemset(*d_greyImage, 0, numPixels * sizeof(unsigned char))); //make sure no memory is left laying around
56 |
57 | //copy input array to the GPU
58 | checkCudaErrors(cudaMemcpy(*d_rgbaImage, *inputImage, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
59 | //keep copies of the device pointers so cleanup() can free them later
60 | d_rgbaImage__ = *d_rgbaImage;
61 | d_greyImage__ = *d_greyImage;
62 | }
63 | //Writes the grey bytes at data_ptr (numRows() x numCols()) to output_file.
64 | void postProcess(const std::string& output_file, unsigned char* data_ptr) {
65 |     //view the caller's buffer as a single-channel image
66 |     cv::Mat greyView(numRows(), numCols(), CV_8UC1, (void*)data_ptr);
67 |     //write the image to disk
68 |     cv::imwrite(output_file.c_str(), greyView);
69 | }
70 | //Frees the device buffers recorded by preProcess; a repeated call frees nothing twice.
71 | void cleanup()
72 | {
73 |     //report a failed free like every other CUDA call in this file, then clear the pointer
74 |     checkCudaErrors(cudaFree(d_rgbaImage__)); d_rgbaImage__ = NULL;
75 |     checkCudaErrors(cudaFree(d_greyImage__)); d_greyImage__ = NULL;
76 | }
77 | //Reads input_filename as greyscale via OpenCV and writes the result to output_filename.
78 | void generateReferenceImage(std::string input_filename, std::string output_filename)
79 | {
80 |     //the greyscale conversion is requested from imread through the load flag
81 |     cv::Mat greyscale = cv::imread(input_filename, CV_LOAD_IMAGE_GRAYSCALE);
82 |
83 |     cv::imwrite(output_filename, greyscale);
84 | }
85 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/Makefile:
--------------------------------------------------------------------------------
1 | NVCC=nvcc
2 |
3 | ###################################
4 | # These are the default install #
5 | # locations on most linux distros #
6 | ###################################
7 |
8 | OPENCV_LIBPATH=/usr/lib
9 | OPENCV_INCLUDEPATH=/usr/include
10 |
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 |
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 |
18 | # or if using MacPorts
19 |
20 | #OPENCV_LIBPATH=/opt/local/lib
21 | #OPENCV_INCLUDEPATH=/opt/local/include
22 |
23 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
24 |
25 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
26 |
27 | ######################################################
28 | # On Macs the default install locations are below #
29 | # ####################################################
30 |
31 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
32 | #CUDA_LIBPATH=/usr/local/cuda/lib
33 |
34 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
35 |
36 | GCC_OPTS=-O3 -Wall -Wextra -m64
37 |
38 | student: main.o student_func.o compare.o reference_calc.o Makefile
39 | $(NVCC) -o HW1 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
40 |
41 | main.o: main.cpp timer.h utils.h reference_calc.cpp compare.cpp HW1.cpp
42 | g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) -I $(OPENCV_INCLUDEPATH)
43 |
44 | student_func.o: student_func.cu utils.h
45 | nvcc -c student_func.cu $(NVCC_OPTS)
46 |
47 | compare.o: compare.cpp compare.h
48 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
49 |
50 | reference_calc.o: reference_calc.cpp reference_calc.h
51 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
52 |
53 | clean: # remove objects, generated images and the linked HW1 binary
54 | rm -f *.o *.png HW1
55 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/cinque_terre.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 1/cinque_terre.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/cinque_terre_small.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 1/cinque_terre_small.jpg
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/compare.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 |
5 | #include "utils.h"
6 | //Compares test_filename against reference_filename (exactly, or within the given tolerances when
7 | //useEpsCheck is set), writes HW1_differenceImage.png and prints PASS; exits with status 1 on failure.
8 | void compareImages(std::string reference_filename, std::string test_filename,
9 |                    bool useEpsCheck, double perPixelError, double globalError)
10 | {
11 |     cv::Mat reference = cv::imread(reference_filename, -1);
12 |     cv::Mat test = cv::imread(test_filename, -1);
13 |     //imread yields an empty Mat on failure, and the subtraction below needs matching size and type
14 |     if (reference.empty() || test.empty() || reference.size() != test.size() || reference.type() != test.type()) {
15 |         std::cerr << "Couldn't load comparable images: " << reference_filename << " / " << test_filename << std::endl;
16 |         exit(1);
17 |     }
18 |
19 |     cv::Mat diff = abs(reference - test);
20 |     cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
21 |
22 |     double minVal, maxVal;
23 |     cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
24 |
25 |     //stretch the differences to the full range; identical images give maxVal == minVal,
26 |     //so use a zero scale there instead of dividing by zero
27 |     const double scale = (maxVal > minVal) ? 255. / (maxVal - minVal) : 0.;
28 |     diffSingleChannel = (diffSingleChannel - minVal) * scale;
29 |     diff = diffSingleChannel.reshape(reference.channels(), 0);
30 |     cv::imwrite("HW1_differenceImage.png", diff);
31 |
32 |     //OK, now we can start comparing values...
33 |     unsigned char *referencePtr = reference.ptr(0);
34 |     unsigned char *testPtr = test.ptr(0);
35 |
36 |     if (useEpsCheck) {
37 |         checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
38 |     } else {
39 |         checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
40 |     }
41 |     std::cout << "PASS" << std::endl;
42 | }
43 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef COMPARE_H__
2 | #define COMPARE_H__
3 |
4 | void compareImages(std::string reference_filename, std::string test_filename,
5 | bool useEpsCheck, double perPixelError, double globalError);
6 |
7 | #endif
8 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/main.cpp:
--------------------------------------------------------------------------------
1 | //Udacity HW1 Solution
2 |
3 | #include
4 | #include "timer.h"
5 | #include "utils.h"
6 | #include
7 | #include
8 | #include "reference_calc.h"
9 | #include "compare.h"
10 |
11 | void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage,
12 | uchar4 * const d_rgbaImage,
13 | unsigned char* const d_greyImage,
14 | size_t numRows, size_t numCols);
15 |
16 | //include the definitions of the above functions for this homework
17 | #include "HW1.cpp"
18 | //Driver: parses the arguments, runs and times the student's conversion, then writes the GPU and reference images and compares them.
19 | int main(int argc, char **argv) {
20 | uchar4 *h_rgbaImage, *d_rgbaImage;
21 | unsigned char *h_greyImage, *d_greyImage;
22 |
23 | std::string input_file;
24 | std::string output_file;
25 | std::string reference_file;
26 | double perPixelError = 0.0;
27 | double globalError = 0.0;
28 | bool useEpsCheck = false;
29 | switch (argc)
30 | {
31 | case 2:
32 | input_file = std::string(argv[1]);
33 | output_file = "HW1_output.png";
34 | reference_file = "HW1_reference.png";
35 | break;
36 | case 3:
37 | input_file = std::string(argv[1]);
38 | output_file = std::string(argv[2]);
39 | reference_file = "HW1_reference.png";
40 | break;
41 | case 4:
42 | input_file = std::string(argv[1]);
43 | output_file = std::string(argv[2]);
44 | reference_file = std::string(argv[3]);
45 | break;
46 | case 6: //both tolerances were supplied, so the epsilon comparison is used
47 | useEpsCheck=true;
48 | input_file = std::string(argv[1]);
49 | output_file = std::string(argv[2]);
50 | reference_file = std::string(argv[3]);
51 | perPixelError = atof(argv[4]);
52 | globalError = atof(argv[5]);
53 | break;
54 | default:
55 | std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
56 | exit(1);
57 | }
58 | //load the image and give us our input and output pointers
59 | preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);
60 | //time only the student's function, using the CUDA events recorded around the call
61 | GpuTimer timer;
62 | timer.Start();
63 | //call the students' code
64 | your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
65 | timer.Stop();
66 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
67 |
68 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
69 |
70 | if (err < 0) {
71 | //Couldn't print! Probably the student closed stdout - bad news
72 | std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
73 | exit(1);
74 | }
75 | //copy the GPU result back into the host grey buffer
76 | size_t numPixels = numRows()*numCols();
77 | checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost));
78 |
79 | //check results and output the grey image
80 | postProcess(output_file, h_greyImage);
81 | //overwrite the same host buffer with the CPU reference result and save that too
82 | referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols());
83 |
84 | postProcess(reference_file, h_greyImage);
85 |
86 | //generateReferenceImage(input_file, reference_file);
87 | compareImages(reference_file, output_file, useEpsCheck, perPixelError,
88 | globalError);
89 |
90 | cleanup();
91 |
92 | return 0;
93 | }
94 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/reference_calc.cpp:
--------------------------------------------------------------------------------
1 | // for uchar4 struct
2 | #include
3 | //CPU reference: converts every RGBA pixel to one grey byte with the weights .299/.587/.114.
4 | void referenceCalculation(const uchar4* const rgbaImage,
5 |                           unsigned char *const greyImage,
6 |                           size_t numRows,
7 |                           size_t numCols)
8 | {
9 |     //pixels are indexed as r * numCols + c, so one flat pass visits the same elements in the same order
10 |     const size_t pixelCount = numRows * numCols;
11 |     for (size_t idx = 0; idx < pixelCount; ++idx) {
12 |         const uchar4 px = rgbaImage[idx];
13 |         const float luminance = .299f * px.x + .587f * px.y + .114f * px.z; //px.w (alpha) is not used
14 |         greyImage[idx] = luminance; //implicit float -> unsigned char conversion, as before
15 |     }
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 |
4 | void referenceCalculation(const uchar4* const rgbaImage,
5 | unsigned char *const greyImage,
6 | size_t numRows,
7 | size_t numCols);
8 |
9 | #endif
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/student_func.cu:
--------------------------------------------------------------------------------
1 | // Homework 1
2 | // Color to Greyscale Conversion
3 |
4 | //A common way to represent color images is known as RGBA - the color
5 | //is specified by how much Red, Green and Blue is in it.
6 | //The 'A' stands for Alpha and is used for transparency, it will be
7 | //ignored in this homework.
8 |
9 | //Each channel Red, Blue, Green and Alpha is represented by one byte.
10 | //Since we are using one byte for each color there are 256 different
11 | //possible values for each color. This means we use 4 bytes per pixel.
12 |
13 | //Greyscale images are represented by a single intensity value per pixel
14 | //which is one byte in size.
15 |
16 | //To convert an image from color to grayscale one simple method is to
17 | //set the intensity to the average of the RGB channels. But we will
18 | //use a more sophisticated method that takes into account how the eye
19 | //perceives color and weights the channels unequally.
20 |
21 | //The eye responds most strongly to green followed by red and then blue.
22 | //The NTSC (National Television System Committee) recommends the following
23 | //formula for color to greyscale conversion:
24 |
25 | //I = .299f * R + .587f * G + .114f * B
26 |
27 | //Notice the trailing f's on the numbers which indicate that they are
28 | //single precision floating point constants and not double precision
29 | //constants.
30 |
31 | //You should fill in the kernel as well as set the block and grid sizes
32 | //so that the entire image is processed.
33 |
34 | #include "utils.h"
35 | //Kernel stub left for the student: rgbaImage is the input, greyImage the output, numRows/numCols the image size.
36 | __global__
37 | void rgba_to_greyscale(const uchar4* const rgbaImage,
38 | unsigned char* const greyImage,
39 | int numRows, int numCols)
40 | {
41 | //TODO
42 | //Fill in the kernel to convert from color to greyscale
43 | //the mapping from components of a uchar4 to RGBA is:
44 | // .x -> R ; .y -> G ; .z -> B ; .w -> A
45 | //
46 | //The output (greyImage) at each pixel should be the result of
47 | //applying the formula: output = .299f * R + .587f * G + .114f * B;
48 | //Note: We will be ignoring the alpha channel for this conversion
49 | //NOTE(review): the body is empty, so greyImage is not written until this is implemented
50 | //First create a mapping from the 2D block and grid locations
51 | //to an absolute 2D location in the image, then use that to
52 | //calculate a 1D offset
53 | }
54 | //Host wrapper: launches rgba_to_greyscale on the device image and waits for it to finish.
55 | void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
56 |                             unsigned char* const d_greyImage, size_t numRows, size_t numCols)
57 | {
58 |   //You must fill in the correct sizes for the blockSize and gridSize
59 |   //currently only one block with one thread is being launched
60 |   const dim3 blockSize(1, 1, 1);  //TODO
61 |   const dim3 gridSize( 1, 1, 1);  //TODO
62 |   rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
63 |   //launch-configuration errors are reported by cudaGetLastError; faults raised while
64 |   //the kernel runs are returned by the synchronizing call
65 |   checkCudaErrors(cudaGetLastError()); checkCudaErrors(cudaDeviceSynchronize());
66 | }
67 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/timer.h:
--------------------------------------------------------------------------------
1 | #ifndef GPU_TIMER_H__
2 | #define GPU_TIMER_H__
3 |
4 | #include
5 | //Times GPU work with a pair of CUDA events recorded on stream 0.
6 | struct GpuTimer
7 | {
8 |     cudaEvent_t start;
9 |     cudaEvent_t stop;
10 |
11 |     GpuTimer()
12 |     {
13 |         cudaEventCreate(&start);
14 |         cudaEventCreate(&stop);
15 |     }
16 |
17 |     ~GpuTimer()
18 |     {
19 |         cudaEventDestroy(start);
20 |         cudaEventDestroy(stop);
21 |     }
22 |
23 |     void Start()
24 |     {
25 |         cudaEventRecord(start, 0);
26 |     }
27 |
28 |     void Stop()
29 |     {
30 |         cudaEventRecord(stop, 0);
31 |     }
32 |     //Returns the time between Start() and Stop() in milliseconds; waits for the stop event first.
33 |     float Elapsed()
34 |     {
35 |         float elapsed = 0.0f; //stays 0 if the query below fails, instead of returning an indeterminate value
36 |         cudaEventSynchronize(stop);
37 |         cudaEventElapsedTime(&elapsed, start, stop);
38 |         return elapsed;
39 |     }
40 | };
41 |
42 | #endif /* GPU_TIMER_H__ */
43 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H__
2 | #define UTILS_H__
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | //Prints the failing location and call, then exits with status 1, when err is not cudaSuccess.
14 | template<typename T>
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 |   if (err != cudaSuccess) {
17 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
18 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl; //func is the stringified call passed by checkCudaErrors
19 |     exit(1);
20 |   }
21 | }
22 | //Exact comparison: prints both values and exits with status 1 at the first index where ref and gpu differ.
23 | template<typename T>
24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
25 |   //check that the GPU result matches the CPU result
26 |   for (size_t i = 0; i < numElem; ++i) {
27 |     if (ref[i] != gpu[i]) {
28 |       std::cerr << "Difference at pos " << i << std::endl;
29 |       //the + is magic to convert char to int without messing
30 |       //with other types
31 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
32 |                    "\nGPU      : " << +gpu[i] << std::endl;
33 |       exit(1);
34 |     }
35 |   }
36 | }
37 | //Tolerance comparison of ref against gpu; prints a diagnostic and exits with status 1 on failure.
38 | //eps1 is the largest allowed per-element difference; eps2 is the largest allowed
39 | //fraction of elements that differ at all (each by no more than eps1).
40 | template<typename T>
41 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
42 |   assert(eps1 >= 0 && eps2 >= 0);
43 |   size_t numSmallDifferences = 0;
44 |   for (size_t i = 0; i < numElem; ++i) {
45 |     //subtract smaller from larger in case of unsigned types
46 |     T smaller = std::min(ref[i], gpu[i]);
47 |     T larger = std::max(ref[i], gpu[i]);
48 |     T diff = larger - smaller;
49 |     if (diff > 0 && diff <= eps1) {
50 |       numSmallDifferences++;
51 |     }
52 |     else if (diff > eps1) {
53 |       std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
54 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
55 |                    "\nGPU      : " << +gpu[i] << std::endl;
56 |       exit(1);
57 |     }
58 |   }
59 |   double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
60 |   if (percentSmallDifferences > eps2) {
61 |     std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
62 |     std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
63 |     exit(1);
64 |   }
65 | }
66 |
67 | //Uses the autodesk method of image comparison: exits with status 1 when more than `tolerance`
68 | //elements differ by more than `variance`. Note that the tolerance here is in PIXELS, not a percentage.
69 | template<typename T>
70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
71 | {
72 |   //count the elements whose absolute difference exceeds the allowed variance
73 |   size_t numBadPixels = 0;
74 |   for (size_t i = 0; i < numElem; ++i) {
75 |     T smaller = std::min(ref[i], gpu[i]);
76 |     T larger = std::max(ref[i], gpu[i]);
77 |     T diff = larger - smaller;
78 |     if (diff > variance)
79 |       ++numBadPixels;
80 |   }
81 |
82 |   if (numBadPixels > tolerance) {
83 |     std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl;
84 |     exit(1);
85 |   }
86 | }
87 |
88 | #endif
89 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 2.zip
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # collect source files
9 |
10 | file( GLOB hdr *.hpp *.h )
11 | file( GLOB cu *.cu)
12 | SET (HW2_files main.cpp reference_calc.cpp compare.cpp)
13 |
14 | CUDA_ADD_EXECUTABLE(HW2 ${HW2_files} ${hdr} ${cu})
15 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/HW2.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "utils.h"
5 | #include
6 | #include
7 | #include
8 |
9 | cv::Mat imageInputRGBA; //host RGBA input image, filled by preProcess
10 | cv::Mat imageOutputRGBA; //host RGBA output image, created by preProcess
11 | //copies of the device pointers handed out by preProcess, kept so cleanUp() can free them
12 | uchar4 *d_inputImageRGBA__;
13 | uchar4 *d_outputImageRGBA__;
14 | //host filter allocated with new[] in preProcess and released by cleanUp()
15 | float *h_filter__;
16 | //image dimensions, read from imageInputRGBA
17 | size_t numRows() { return imageInputRGBA.rows; }
18 | size_t numCols() { return imageInputRGBA.cols; }
19 |
20 | //Loads `filename` as RGBA and prepares the buffers and filter used by the blur.
21 | //Return type is void: any internal error is reported and ends the program via exit(1).
22 | //Outputs (via the pointer arguments): host and device RGBA input/output images,
23 | //three per-channel device buffers for the blurred result, and the normalized
24 | //host filter together with its width.
25 | void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA,
26 | uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA,
27 | unsigned char **d_redBlurred,
28 | unsigned char **d_greenBlurred,
29 | unsigned char **d_blueBlurred,
30 | float **h_filter, int *filterWidth,
31 | const std::string &filename) {
32 |
33 | //make sure the context initializes ok
34 | checkCudaErrors(cudaFree(0));
35 |
36 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
37 | if (image.empty()) {
38 | std::cerr << "Couldn't open file: " << filename << std::endl;
39 | exit(1);
40 | }
41 | //convert from BGR into the file-level RGBA input image
42 | cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);
43 |
44 | //allocate memory for the output
45 | imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);
46 |
47 | //This shouldn't ever happen given the way the images are created
48 | //at least based upon my limited understanding of OpenCV, but better to check
49 | if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) {
50 | std::cerr << "Images aren't continuous!! Exiting." << std::endl;
51 | exit(1);
52 | }
53 | //hand the caller pointers to the first row of each host image
54 | *h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr(0);
55 | *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr(0);
56 |
57 | const size_t numPixels = numRows() * numCols();
58 | //allocate memory on the device for both input and output
59 | checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels));
60 | checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels));
61 | checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around
62 |
63 | //copy input array to the GPU
64 | checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
65 | //keep copies of the device pointers so cleanUp() can free them later
66 | d_inputImageRGBA__ = *d_inputImageRGBA;
67 | d_outputImageRGBA__ = *d_outputImageRGBA;
68 |
69 | //now create the filter that they will use
70 | const int blurKernelWidth = 9;
71 | const float blurKernelSigma = 2.;
72 |
73 | *filterWidth = blurKernelWidth;
74 |
75 | //create and fill the filter we will convolve with
76 | *h_filter = new float[blurKernelWidth * blurKernelWidth];
77 | h_filter__ = *h_filter;
78 |
79 | float filterSum = 0.f; //for normalization
80 | //fill with unnormalized Gaussian weights; r and c are offsets from the centre tap
81 | for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
82 | for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
83 | float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma));
84 | (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue;
85 | filterSum += filterValue;
86 | }
87 | }
88 | //scale every weight by 1/filterSum
89 | float normalizationFactor = 1.f / filterSum;
90 |
91 | for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
92 | for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
93 | (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor;
94 | }
95 | }
96 |
97 | //per-channel device buffers for the blurred result, zero-filled
98 | checkCudaErrors(cudaMalloc(d_redBlurred, sizeof(unsigned char) * numPixels));
99 | checkCudaErrors(cudaMalloc(d_greenBlurred, sizeof(unsigned char) * numPixels));
100 | checkCudaErrors(cudaMalloc(d_blueBlurred, sizeof(unsigned char) * numPixels));
101 | checkCudaErrors(cudaMemset(*d_redBlurred, 0, sizeof(unsigned char) * numPixels));
102 | checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
103 | checkCudaErrors(cudaMemset(*d_blueBlurred, 0, sizeof(unsigned char) * numPixels));
104 | }
105 | //Writes the RGBA pixels at data_ptr (numRows() x numCols()) to output_file after converting to BGR.
106 | void postProcess(const std::string& output_file, uchar4* data_ptr) {
107 |     //view the caller's buffer as a 4-channel image
108 |     cv::Mat rgbaView(numRows(), numCols(), CV_8UC4, (void*)data_ptr);
109 |     //convert RGBA to BGR before writing
110 |     cv::Mat bgrImage;
111 |     cv::cvtColor(rgbaView, bgrImage, CV_RGBA2BGR);
112 |     cv::imwrite(output_file.c_str(), bgrImage);
113 | }
114 | //Releases the device images and host filter recorded by preProcess; a repeated call frees nothing twice.
115 | void cleanUp(void)
116 | {
117 |     checkCudaErrors(cudaFree(d_inputImageRGBA__)); d_inputImageRGBA__ = NULL;
118 |     checkCudaErrors(cudaFree(d_outputImageRGBA__)); d_outputImageRGBA__ = NULL;
119 |     delete[] h_filter__; h_filter__ = NULL; //cleared so a second call cannot delete the filter again
120 | }
121 |
122 |
123 | // An unused bit of code showing how to accomplish this assignment using OpenCV. It is much faster
124 | // than the naive implementation in reference_calc.cpp.
125 | void generateReferenceImage(std::string input_file, std::string reference_file, int kernel_size)
126 | {
127 |     const cv::Mat input = cv::imread(input_file);
128 |     // GaussianBlur allocates its destination, so the input is not decoded a second time as a placeholder
129 |     cv::Mat reference;
130 |     cv::GaussianBlur(input, reference, cv::Size2i(kernel_size, kernel_size),0);
131 |     cv::imwrite(reference_file, reference);
132 | }
133 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/Makefile:
--------------------------------------------------------------------------------
1 | NVCC=nvcc
2 |
3 | ###################################
4 | # These are the default install #
5 | # locations on most linux distros #
6 | ###################################
7 |
8 | OPENCV_LIBPATH=/usr/lib
9 | OPENCV_INCLUDEPATH=/usr/include
10 |
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 |
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 |
18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
19 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
20 |
21 | ######################################################
22 | # On Macs the default install locations are below #
23 | # ####################################################
24 |
25 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
26 | #CUDA_LIBPATH=/usr/local/cuda/lib
27 |
28 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
29 |
30 | GCC_OPTS=-O3 -Wall -Wextra -m64
31 |
32 | student: main.o student_func.o compare.o reference_calc.o Makefile
33 | $(NVCC) -o HW2 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
34 |
35 | main.o: main.cpp timer.h utils.h HW2.cpp
36 | g++ -c main.cpp $(GCC_OPTS) -I $(OPENCV_INCLUDEPATH) -I $(CUDA_INCLUDEPATH)
37 |
38 | student_func.o: student_func.cu reference_calc.cpp utils.h
39 | nvcc -c student_func.cu $(NVCC_OPTS)
40 |
41 | compare.o: compare.cpp compare.h
42 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
43 |
44 | reference_calc.o: reference_calc.cpp reference_calc.h
45 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
46 |
47 | clean: # remove objects, generated images and the linked HW2 binary
48 | rm -f *.o *.png HW2
49 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/cinque_terre.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 2/cinque_terre.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/cinque_terre_small.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 2/cinque_terre_small.jpg
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/compare.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 |
5 | #include "utils.h"
6 | //Compares test_filename against reference_filename (exactly, or within the given tolerances when
7 | //useEpsCheck is set), writes HW2_differenceImage.png and prints PASS; exits with status 1 on failure.
8 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
9 |                    double perPixelError, double globalError)
10 | {
11 |     cv::Mat reference = cv::imread(reference_filename, -1);
12 |     cv::Mat test = cv::imread(test_filename, -1);
13 |     //imread yields an empty Mat on failure, and the subtraction below needs matching size and type
14 |     if (reference.empty() || test.empty() || reference.size() != test.size() || reference.type() != test.type()) {
15 |         std::cerr << "Couldn't load comparable images: " << reference_filename << " / " << test_filename << std::endl;
16 |         exit(1);
17 |     }
18 |
19 |     cv::Mat diff = abs(reference - test);
20 |     cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
21 |
22 |     double minVal, maxVal;
23 |     cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
24 |
25 |     //stretch the differences to the full range; identical images give maxVal == minVal,
26 |     //so use a zero scale there instead of dividing by zero
27 |     const double scale = (maxVal > minVal) ? 255. / (maxVal - minVal) : 0.;
28 |     diffSingleChannel = (diffSingleChannel - minVal) * scale;
29 |     diff = diffSingleChannel.reshape(reference.channels(), 0);
30 |     cv::imwrite("HW2_differenceImage.png", diff);
31 |
32 |     //OK, now we can start comparing values...
33 |     unsigned char *referencePtr = reference.ptr(0);
34 |     unsigned char *testPtr = test.ptr(0);
35 |
36 |     if (useEpsCheck) {
37 |         checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
38 |     } else {
39 |         checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
40 |     }
41 |     std::cout << "PASS" << std::endl;
42 | }
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef COMPARE_H__
2 | #define COMPARE_H__
3 |
4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
5 | double perPixelError, double globalError);
6 |
7 | #endif
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/main.cpp:
--------------------------------------------------------------------------------
1 | //Udacity HW2 Driver
2 |
3 | #include
4 | #include "timer.h"
5 | #include "utils.h"
6 | #include
7 | #include
8 |
9 | #include "reference_calc.h"
10 | #include "compare.h"
11 |
12 | //include the definitions of the above functions for this homework
13 | #include "HW2.cpp"
14 |
15 |
16 | /******* DEFINED IN student_func.cu *********/
17 |
18 | void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_inputImageRGBA,
19 | uchar4* const d_outputImageRGBA,
20 | const size_t numRows, const size_t numCols,
21 | unsigned char *d_redBlurred,
22 | unsigned char *d_greenBlurred,
23 | unsigned char *d_blueBlurred,
24 | const int filterWidth);
25 |
26 | void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsImage,
27 | const float* const h_filter, const size_t filterWidth);
28 |
29 |
30 | /******* Begin main *********/
31 |
32 | int main(int argc, char **argv) {  // HW2 driver: run the student's blur, build the CPU reference, compare both images
33 |   uchar4 *h_inputImageRGBA, *d_inputImageRGBA;
34 |   uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
35 |   unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;
36 |
37 |   float *h_filter;
38 |   int filterWidth;
39 |
40 |   std::string input_file;
41 |   std::string output_file;
42 |   std::string reference_file;
43 |   double perPixelError = 0.0;
44 |   double globalError = 0.0;
45 |   bool useEpsCheck = false;
46 |   switch (argc)  // the argument count decides which optional file names / tolerances were supplied
47 |   {
48 |     case 2:
49 |       input_file = std::string(argv[1]);
50 |       output_file = "HW2_output.png";
51 |       reference_file = "HW2_reference.png";
52 |       break;
53 |     case 3:
54 |       input_file = std::string(argv[1]);
55 |       output_file = std::string(argv[2]);
56 |       reference_file = "HW2_reference.png";
57 |       break;
58 |     case 4:
59 |       input_file = std::string(argv[1]);
60 |       output_file = std::string(argv[2]);
61 |       reference_file = std::string(argv[3]);
62 |       break;
63 |     case 6:
64 |       useEpsCheck=true;
65 |       input_file = std::string(argv[1]);
66 |       output_file = std::string(argv[2]);
67 |       reference_file = std::string(argv[3]);
68 |       perPixelError = atof(argv[4]);
69 |       globalError = atof(argv[5]);
70 |       break;
71 |     default:
72 |       std::cerr << "Usage: ./HW2 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
73 |       exit(1);
74 |   }
75 |   //load the image and give us our input and output pointers
76 |   preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
77 |              &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
78 |              &h_filter, &filterWidth, input_file);
79 |
80 |   allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
81 |   GpuTimer timer;
82 |   timer.Start();
83 |   //call the students' code
84 |   your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(),
85 |                      d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
86 |   timer.Stop();
87 |   checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaGetLastError());  // report execution errors too, not just launch errors
88 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
89 |
90 |   if (err < 0) {
91 |     //Couldn't print! Probably the student closed stdout - bad news
92 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
93 |     exit(1);
94 |   }
95 |
96 |   //check results and output the blurred image
97 |
98 |   size_t numPixels = numRows()*numCols();
99 |   //copy the output back to the host. NOTE(review): d_outputImageRGBA__ is not declared in this file; presumably a global from the included HW2.cpp - confirm
100 |   checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));
101 |
102 |   postProcess(output_file, h_outputImageRGBA);
103 |
104 |   //the CPU reference overwrites h_outputImageRGBA, which is then saved as the reference image
105 |   referenceCalculation(h_inputImageRGBA, h_outputImageRGBA,
106 |                        numRows(), numCols(),
107 |                        h_filter, filterWidth);
108 |   postProcess(reference_file, h_outputImageRGBA);
109 |
110 |   // Cheater easy way with OpenCV
111 |   //generateReferenceImage(input_file, reference_file, filterWidth);
112 |
113 |   compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
114 |
115 |   checkCudaErrors(cudaFree(d_redBlurred));
116 |   checkCudaErrors(cudaFree(d_greenBlurred));
117 |   checkCudaErrors(cudaFree(d_blueBlurred));
118 |
119 |   cleanUp();
120 |
121 |   return 0;
122 | }
123 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/reference_calc.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | // for uchar4 struct
4 | #include
5 |
6 | // Convolves one 8-bit image plane with a square filterWidth x filterWidth filter.
7 | // Samples that fall outside the image are clamped to the nearest edge pixel.
8 | void channelConvolution(const unsigned char* const channel, unsigned char* const channelBlurred,
9 |                         const size_t numRows, const size_t numCols,
10 |                         const float *filter, const int filterWidth)
11 | {
12 |   //Dealing with an even width filter is trickier
13 |   assert(filterWidth % 2 == 1);
14 |
15 |   //For every pixel in the image
16 |   for (int r = 0; r < (int)numRows; ++r) {
17 |     for (int c = 0; c < (int)numCols; ++c) {
18 |       float result = 0.f;
19 |       //For every value in the filter around the pixel (c, r)
20 |       for (int filter_r = -filterWidth/2; filter_r <= filterWidth/2; ++filter_r) {
21 |         for (int filter_c = -filterWidth/2; filter_c <= filterWidth/2; ++filter_c) {
22 |           //Find the global image position for this filter position, clamped to the image boundary
23 |           int image_r = std::min(std::max(r + filter_r, 0), static_cast<int>(numRows - 1));
24 |           int image_c = std::min(std::max(c + filter_c, 0), static_cast<int>(numCols - 1));
25 |
26 |           float image_value = static_cast<float>(channel[image_r * numCols + image_c]);
27 |           float filter_value = filter[(filter_r + filterWidth/2) * filterWidth + filter_c + filterWidth/2];
28 |
29 |           result += image_value * filter_value;
30 |         }
31 |       }
32 |
33 |       channelBlurred[r * numCols + c] = result;  //implicit float -> unsigned char conversion
34 |     }
35 |   }
36 | }
37 |
38 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage,
39 |                           size_t numRows, size_t numCols,
40 |                           const float* const filter, const int filterWidth)
41 | {
42 |   // CPU reference blur: de-interleave RGBA into planar channels, blur each
43 |   // plane with channelConvolution, then re-interleave with opaque alpha.
44 |   const size_t pixelCount = numRows * numCols;
45 |
46 |   unsigned char *red   = new unsigned char[pixelCount];
47 |   unsigned char *blue  = new unsigned char[pixelCount];
48 |   unsigned char *green = new unsigned char[pixelCount];
49 |
50 |   unsigned char *redBlurred   = new unsigned char[pixelCount];
51 |   unsigned char *blueBlurred  = new unsigned char[pixelCount];
52 |   unsigned char *greenBlurred = new unsigned char[pixelCount];
53 |
54 |   // split the interleaved pixels into one plane per colour (the input alpha is not read)
55 |   for (size_t px = 0; px != pixelCount; ++px) {
56 |     const uchar4 src = rgbaImage[px];
57 |     red[px]   = src.x;
58 |     green[px] = src.y;
59 |     blue[px]  = src.z;
60 |   }
61 |
62 |   // blur every plane independently with the same filter
63 |   channelConvolution(red, redBlurred, numRows, numCols, filter, filterWidth);
64 |   channelConvolution(green, greenBlurred, numRows, numCols, filter, filterWidth);
65 |   channelConvolution(blue, blueBlurred, numRows, numCols, filter, filterWidth);
66 |
67 |   // re-interleave; alpha is set to 255 for no transparency
68 |   for (size_t px = 0; px != pixelCount; ++px)
69 |     outputImage[px] = make_uchar4(redBlurred[px], greenBlurred[px], blueBlurred[px], 255);
70 |
71 |   delete[] red;
72 |   delete[] green;
73 |   delete[] blue;
74 |   delete[] redBlurred;
75 |   delete[] greenBlurred;
76 |   delete[] blueBlurred;
77 | }
78 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 |
4 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage,
5 | size_t numRows, size_t numCols,
6 | const float* const filter, const int filterWidth);
7 |
8 | #endif
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/timer.h:
--------------------------------------------------------------------------------
1 | #ifndef GPU_TIMER_H__
2 | #define GPU_TIMER_H__
3 |
4 | #include
5 |
6 | struct GpuTimer  // pairs two CUDA events to time the work issued between Start() and Stop()
7 | {
8 | cudaEvent_t start;
9 | cudaEvent_t stop;
10 | // NOTE(review): none of the CUDA calls in this struct check their return codes
11 | GpuTimer()
12 | {
13 | cudaEventCreate(&start);
14 | cudaEventCreate(&stop);
15 | }
16 | // destroys the two events created by the constructor
17 | ~GpuTimer()
18 | {
19 | cudaEventDestroy(start);
20 | cudaEventDestroy(stop);
21 | }
22 | // records the start event (second argument 0)
23 | void Start()
24 | {
25 | cudaEventRecord(start, 0);
26 | }
27 | // records the stop event (second argument 0)
28 | void Stop()
29 | {
30 | cudaEventRecord(stop, 0);
31 | }
32 | // waits for the stop event, then returns the value cudaEventElapsedTime reports for start -> stop
33 | float Elapsed()
34 | {
35 | float elapsed;
36 | cudaEventSynchronize(stop);
37 | cudaEventElapsedTime(&elapsed, start, stop);
38 | return elapsed;
39 | }
40 | };
41 |
42 | #endif /* GPU_TIMER_H__ */
43 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H__
2 | #define UTILS_H__
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 |
14 | template<typename T>  // T is the status type returned by the wrapped call
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 |   if (err != cudaSuccess) {  // print where the failing expression is and what it was, then exit
17 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
18 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
19 |     exit(1);
20 |   }
21 | }
22 |
23 | template<typename T>  // exits with a diagnostic at the first index where ref and gpu differ
24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
25 |   //check that the GPU result matches the CPU result
26 |   for (size_t i = 0; i < numElem; ++i) {
27 |     if (ref[i] != gpu[i]) {
28 |       std::cerr << "Difference at pos " << i << std::endl;
29 |       //the + is magic to convert char to int without messing
30 |       //with other types
31 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
32 |                    "\nGPU      : " << +gpu[i] << std::endl;
33 |       exit(1);
34 |     }
35 |   }
36 | }
37 |
38 | // Exits with a diagnostic when any element differs by more than eps1, or when the
39 | // fraction of elements with a non-zero difference exceeds eps2; returns normally otherwise.
40 | template<typename T>
41 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
42 |   assert(eps1 >= 0 && eps2 >= 0);
43 |   unsigned numSmallDifferences = 0;
44 |   for (size_t i = 0; i < numElem; ++i) {
45 |     //subtract smaller from larger in case of unsigned types
46 |     T smaller = std::min(ref[i], gpu[i]);
47 |     T larger = std::max(ref[i], gpu[i]);
48 |     T diff = larger - smaller;
49 |     if (diff > 0 && diff <= eps1) {
50 |       numSmallDifferences++;
51 |     }
52 |     else if (diff > eps1) {
53 |       std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
54 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
55 |                    "\nGPU      : " << +gpu[i] << std::endl;
56 |       exit(1);
57 |     }
58 |   }
59 |   double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
60 |   if (percentSmallDifferences > eps2) {
61 |     std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
62 |     std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
63 |     exit(1);
64 |   }
65 | }
66 |
67 | //Uses the autodesk method of image comparison
68 | //Note that the tolerance here is in PIXELS, not a percentage of input pixels
69 | template<typename T>
70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
71 | {
72 |   //an element is "bad" when |ref - gpu| exceeds variance; more than `tolerance` bad elements exits
73 |   size_t numBadPixels = 0;
74 |   for (size_t i = 0; i < numElem; ++i) {
75 |     T smaller = std::min(ref[i], gpu[i]);
76 |     T larger = std::max(ref[i], gpu[i]);
77 |     T diff = larger - smaller;
78 |     if (diff > variance)
79 |       ++numBadPixels;
80 |   }
81 |
82 |   if (numBadPixels > tolerance) {
83 |     std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl;
84 |     exit(1);
85 |   }
86 | }
87 |
88 | #endif
89 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3.zip
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 | # minimum required cmake version
8 | cmake_minimum_required(VERSION 2.8)
9 | find_package(CUDA QUIET REQUIRED)
10 |
11 | SET (compare_files compare.cpp)
12 |
13 | file( GLOB hdr *.hpp *.h )
14 | file( GLOB cu *.cu)
15 | SET (HW3_files main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp)
16 |
17 | CUDA_ADD_EXECUTABLE(HW3 ${HW3_files} ${hdr} ${cu})
18 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/Makefile:
--------------------------------------------------------------------------------
1 | NVCC=nvcc
2 |
3 | ###################################
4 | # These are the default install #
5 | # locations on most linux distros #
6 | ###################################
7 |
8 | OPENCV_LIBPATH=/usr/lib
9 | OPENCV_INCLUDEPATH=/usr/include
10 |
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 |
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 |
18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
19 |
20 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
21 |
22 | ######################################################
23 | # On Macs the default install locations are below #
24 | # ####################################################
25 |
26 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
27 | #CUDA_LIBPATH=/usr/local/cuda/lib
28 |
29 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
30 |
31 | GCC_OPTS=-O3 -Wall -Wextra -m64
32 |
33 | student: main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o Makefile
34 | $(NVCC) -o HW3 main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
35 |
36 | main.o: main.cpp timer.h utils.h reference_calc.h compare.h
37 | g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
38 |
39 | HW3.o: HW3.cu loadSaveImage.h utils.h
40 | $(NVCC) -c HW3.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS)
41 |
42 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h
43 | g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
44 |
45 | compare.o: compare.cpp compare.h
46 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
47 |
48 | reference_calc.o: reference_calc.cpp reference_calc.h
49 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
50 |
51 | student_func.o: student_func.cu utils.h
52 | $(NVCC) -c student_func.cu $(NVCC_OPTS)
53 |
54 | clean: # remove object files, the HW3 binary built by the `student` rule, and generated (non-memorial) .exr files
55 | 	rm -f *.o HW3
56 | 	find . -type f -name '*.exr' | grep -v memorial | xargs rm -f
57 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/compare.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include "utils.h"
3 |
4 | // Loads both images, writes a contrast-stretched difference image to
5 | // HW3_differenceImage.png, then compares the pixel data (exact or tolerance check).
6 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
7 |                    double perPixelError, double globalError)
8 | {
9 |   cv::Mat reference = cv::imread(reference_filename, -1);
10 |   cv::Mat test = cv::imread(test_filename, -1);
11 |
12 |   //the element-wise comparison below indexes both buffers with the reference's extent
13 |   if (reference.empty() || test.empty() || reference.rows != test.rows ||
14 |       reference.cols != test.cols || reference.type() != test.type()) {
15 |     std::cerr << "Couldn't load the images or their sizes/types differ: "
16 |               << reference_filename << " vs " << test_filename << std::endl;
17 |     exit(1);
18 |   }
19 |
20 |   cv::Mat diff = abs(reference - test);
21 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
22 |
23 |   double minVal, maxVal;
24 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
25 |
26 |   //bump values to the full range; a constant difference has maxVal == minVal, so skip the divide
27 |   if (maxVal > minVal)
28 |     diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
29 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
30 |   cv::imwrite("HW3_differenceImage.png", diff);
31 |
32 |   //OK, now we can start comparing values...
33 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
34 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
35 |   const size_t numElem = reference.rows * reference.cols * reference.channels();
36 |   if (useEpsCheck) checkResultsEps(referencePtr, testPtr, numElem, perPixelError, globalError);
37 |   else             checkResultsExact(referencePtr, testPtr, numElem);
38 |   std::cout << "PASS" << std::endl;
39 | }
40 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef HW3_H__
2 | #define HW3_H__
3 |
4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
5 | double perPixelError, double globalError);
6 |
7 | #endif
8 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/loadSaveImage.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include "cuda_runtime.h"
7 |
8 | //The caller becomes responsible for the returned pointer. This
9 | //is done in the interest of keeping this code as simple as possible.
10 | //In production code this is a bad idea - we should use RAII
11 | //to ensure the memory is freed. DO NOT COPY THIS AND USE IN PRODUCTION
12 | //CODE!!!
13 | void loadImageHDR(const std::string &filename,
14 |                   float **imagePtr,
15 |                   size_t *numRows, size_t *numCols)
16 | {
17 |   // Reads `filename` as a 3-channel float image and returns a new[]-allocated
18 |   // copy through *imagePtr (the caller owns it), plus the image dimensions.
19 |   cv::Mat originImg = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);
20 |
21 |   //check for a failed load before converting or touching the pixels
22 |   if (originImg.empty()) {
23 |     std::cerr << "Couldn't open file: " << filename << std::endl;
24 |     exit(1);
25 |   }
26 |
27 |   cv::Mat image;
28 |   if (originImg.type() != CV_32FC3) originImg.convertTo(image, CV_32FC3);
29 |   else                              image = originImg;
30 |
31 |   if (image.channels() != 3) {
32 |     std::cerr << "Image must be color!" << std::endl;
33 |     exit(1);
34 |   }
35 |
36 |   if (!image.isContinuous()) {
37 |     std::cerr << "Image isn't continuous!" << std::endl;
38 |     exit(1);
39 |   }
40 |
41 |   //widen before multiplying so the loop bound is an unsigned element count
42 |   const size_t numValues = static_cast<size_t>(image.rows) * image.cols * image.channels();
43 |   *imagePtr = new float[numValues];
44 |
45 |   const float *cvPtr = image.ptr<float>(0);
46 |   for (size_t i = 0; i < numValues; ++i)
47 |     (*imagePtr)[i] = cvPtr[i];
48 |   *numRows = image.rows;
49 |   *numCols = image.cols;
50 | }
51 |
52 | void loadImageRGBA(const std::string &filename,
53 |                    uchar4 **imagePtr,
54 |                    size_t *numRows, size_t *numCols)
55 | {
56 |   // Reads `filename` as a colour image, converts BGR -> RGBA and returns a new[]-allocated
57 |   // uchar4 copy through *imagePtr (the caller owns it), plus the image dimensions.
58 |   cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
59 |   if (image.empty()) {
60 |     std::cerr << "Couldn't open file: " << filename << std::endl;
61 |     exit(1);
62 |   }
63 |   if (image.channels() != 3) {
64 |     std::cerr << "Image must be color!" << std::endl;
65 |     exit(1);
66 |   }
67 |   if (!image.isContinuous()) {
68 |     std::cerr << "Image isn't continuous!" << std::endl;
69 |     exit(1);
70 |   }
71 |
72 |   cv::Mat imageRGBA;
73 |   cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
74 |
75 |   const size_t numPixels = static_cast<size_t>(image.rows) * image.cols;
76 |   *imagePtr = new uchar4[numPixels];
77 |   const unsigned char *cvPtr = imageRGBA.ptr<unsigned char>(0);
78 |   for (size_t i = 0; i < numPixels; ++i) {
79 |     (*imagePtr)[i].x = cvPtr[4 * i + 0];
80 |     (*imagePtr)[i].y = cvPtr[4 * i + 1];
81 |     (*imagePtr)[i].z = cvPtr[4 * i + 2];
82 |     (*imagePtr)[i].w = cvPtr[4 * i + 3];
83 |   }
84 |
85 |   *numRows = image.rows;
86 |   *numCols = image.cols;
87 | }
88 |
89 | void saveImageRGBA(const uchar4* const image,
90 |                    const size_t numRows, const size_t numCols,
91 |                    const std::string &output_file)
92 | {
93 |   // Wraps the caller's RGBA pixels in a cv::Mat, converts them to BGR and writes the file.
94 |   int dims[2] = { static_cast<int>(numRows), static_cast<int>(numCols) };
95 |   cv::Mat rgbaView(2, dims, CV_8UC4, (void *)image);
96 |
97 |   cv::Mat bgr;
98 |   cv::cvtColor(rgbaView, bgr, CV_RGBA2BGR);
99 |   //output the image
100 |   cv::imwrite(output_file.c_str(), bgr);
101 | }
102 |
103 | //output an exr file
104 | //assumed to already be BGR
105 | void saveImageHDR(const float* const image,
106 |                   const size_t numRows, const size_t numCols,
107 |                   const std::string &output_file)
108 | {
109 |   int sizes[2];
110 |   sizes[0] = numRows;
111 |   sizes[1] = numCols;
112 |
113 |   cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image);
114 |
115 |   //scale into a fresh matrix: assigning the product back to imageHDR would write through to the caller's const buffer
116 |   cv::Mat scaled = imageHDR * 255;
117 |   cv::imwrite(output_file.c_str(), scaled);
118 | }
119 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/loadSaveImage.h:
--------------------------------------------------------------------------------
1 | #ifndef LOADSAVEIMAGE_H__
2 | #define LOADSAVEIMAGE_H__
3 |
4 | #include
5 | #include //for uchar4
6 |
7 | void loadImageHDR(const std::string &filename,
8 | float **imagePtr,
9 | size_t *numRows, size_t *numCols);
10 |
11 | void loadImageRGBA(const std::string &filename,
12 | uchar4 **imagePtr,
13 | size_t *numRows, size_t *numCols);
14 |
15 | void saveImageRGBA(const uchar4* const image,
16 | const size_t numRows, const size_t numCols,
17 | const std::string &output_file);
18 |
19 | void saveImageHDR(const float* const image,
20 | const size_t numRows, const size_t numCols,
21 | const std::string &output_file);
22 |
23 | #endif
24 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/main.cpp:
--------------------------------------------------------------------------------
1 | //Udacity HW3 Driver
2 |
3 | #include
4 | #include "timer.h"
5 | #include "utils.h"
6 | #include
7 | #include
8 | #include
9 |
10 | #include "compare.h"
11 | #include "reference_calc.h"
12 |
13 | // Functions from HW3.cu
14 | void preProcess(float **d_luminance, unsigned int **d_cdf,
15 | size_t *numRows, size_t *numCols, unsigned int *numBins,
16 | const std::string& filename);
17 |
18 | void postProcess(const std::string& output_file, size_t numRows, size_t numCols,
19 | float min_logLum, float max_logLum);
20 |
21 | void cleanupGlobalMemory(void);
22 |
23 | // Function from student_func.cu
24 | void your_histogram_and_prefixsum(const float* const d_luminance,
25 | unsigned int* const d_cdf,
26 | float &min_logLum,
27 | float &max_logLum,
28 | const size_t numRows,
29 | const size_t numCols,
30 | const size_t numBins);
31 |
32 |
33 | int main(int argc, char **argv) {  // HW3 driver: run the student's histogram/scan, build the CPU reference, compare both images
34 |   float *d_luminance;
35 |   unsigned int *d_cdf;
36 |
37 |   size_t numRows, numCols;
38 |   unsigned int numBins;
39 |
40 |   std::string input_file;
41 |   std::string output_file;
42 |   std::string reference_file;
43 |   double perPixelError = 0.0;
44 |   double globalError = 0.0;
45 |   bool useEpsCheck = false;
46 |
47 |   switch (argc)  // the argument count decides which optional file names / tolerances were supplied
48 |   {
49 |     case 2:
50 |       input_file = std::string(argv[1]);
51 |       output_file = "HW3_output.png";
52 |       reference_file = "HW3_reference.png";
53 |       break;
54 |     case 3:
55 |       input_file = std::string(argv[1]);
56 |       output_file = std::string(argv[2]);
57 |       reference_file = "HW3_reference.png";
58 |       break;
59 |     case 4:
60 |       input_file = std::string(argv[1]);
61 |       output_file = std::string(argv[2]);
62 |       reference_file = std::string(argv[3]);
63 |       break;
64 |     case 6:
65 |       useEpsCheck=true;
66 |       input_file = std::string(argv[1]);
67 |       output_file = std::string(argv[2]);
68 |       reference_file = std::string(argv[3]);
69 |       perPixelError = atof(argv[4]);
70 |       globalError = atof(argv[5]);
71 |       break;
72 |     default:
73 |       std::cerr << "Usage: ./HW3 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
74 |       exit(1);
75 |   }
76 |   //load the image and give us our input and output pointers
77 |   preProcess(&d_luminance, &d_cdf,
78 |              &numRows, &numCols, &numBins, input_file);
79 |
80 |   GpuTimer timer;
81 |   float min_logLum, max_logLum;
82 |   min_logLum = 0.f;
83 |   max_logLum = 1.f;
84 |   timer.Start();
85 |   //call the students' code
86 |   your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum,
87 |                                numRows, numCols, numBins);
88 |   timer.Stop();
89 |   checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaGetLastError());  // report execution errors too, not just launch errors
90 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
91 |
92 |   if (err < 0) {
93 |     //Couldn't print! Probably the student closed stdout - bad news
94 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
95 |     exit(1);
96 |   }
97 |
98 |   float *h_luminance = (float *) malloc(sizeof(float)*numRows*numCols);
99 |   unsigned int *h_cdf = (unsigned int *) malloc(sizeof(unsigned int)*numBins);
100 |   if (h_luminance == NULL || h_cdf == NULL) {
101 |     std::cerr << "Couldn't allocate host buffers for the reference calculation!" << std::endl;
102 |     exit(1);
103 |   }
104 |   checkCudaErrors(cudaMemcpy(h_luminance, d_luminance, numRows*numCols*sizeof(float), cudaMemcpyDeviceToHost));
105 |
106 |   //check results and output the tone-mapped image
107 |   postProcess(output_file, numRows, numCols, min_logLum, max_logLum);
108 |
109 |   //referenceCalculation recomputes min/max from h_luminance itself, so no pre-scan is needed here
110 |   referenceCalculation(h_luminance, h_cdf, numRows, numCols, numBins, min_logLum, max_logLum);
111 |
112 |   checkCudaErrors(cudaMemcpy(d_cdf, h_cdf, sizeof(unsigned int) * numBins, cudaMemcpyHostToDevice));
113 |
114 |   //output the tone-mapped image again, this time from the reference cdf and min/max
115 |   postProcess(reference_file, numRows, numCols, min_logLum, max_logLum);
116 |
117 |   free(h_luminance);  //host scratch buffers are no longer needed
118 |   free(h_cdf);
119 |   cleanupGlobalMemory();
120 |
121 |   compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
122 |   return 0;
123 | }
124 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial.exr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial.exr
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_large.exr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_large.exr
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_png.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_png.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_png_large.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_png_large.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_raw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_raw.png
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_raw_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_raw_large.png
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/reference_calc.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | // CPU reference for HW3: finds the min/max of the log-luminance values, builds a
5 | // numBins-bin histogram over that range, and writes its exclusive prefix sum.
6 | //   h_logLuminance       : numRows * numCols input values
7 | //   h_cdf                : receives numBins entries (exclusive scan of the histogram)
8 | //   logLumMin, logLumMax : overwritten with the minimum / maximum input value
9 | void referenceCalculation(const float* const h_logLuminance, unsigned int* const h_cdf,
10 |                           const size_t numRows, const size_t numCols, const size_t numBins,
11 |                           float &logLumMin, float &logLumMax)
12 | {
13 |   logLumMin = h_logLuminance[0];
14 |   logLumMax = h_logLuminance[0];
15 |
16 |   //Step 1
17 |   //first we find the minimum and maximum across the entire image
18 |   for (size_t i = 1; i < numCols * numRows; ++i) {
19 |     logLumMin = std::min(h_logLuminance[i], logLumMin);
20 |     logLumMax = std::max(h_logLuminance[i], logLumMax);
21 |   }
22 |
23 |   //Step 2
24 |   float logLumRange = logLumMax - logLumMin;
25 |
26 |   //Step 3
27 |   //next we use the now known range to compute
28 |   //a histogram of numBins bins
29 |   unsigned int *histo = new unsigned int[numBins];
30 |
31 |   for (size_t i = 0; i < numBins; ++i) histo[i] = 0;
32 |
33 |   for (size_t i = 0; i < numCols * numRows; ++i) {
34 |     //a constant image has a zero range; put everything in bin 0 instead of dividing by zero
35 |     unsigned int bin = 0;
36 |     if (logLumRange > 0.f) {
37 |       //the clamp keeps the maximum value in the last bin
38 |       bin = std::min(static_cast<unsigned int>(numBins - 1),
39 |                      static_cast<unsigned int>((h_logLuminance[i] - logLumMin) / logLumRange * numBins));
40 |     }
41 |     histo[bin]++;
42 |   }
43 |
44 |   //Step 4
45 |   //finally we perform an exclusive scan (prefix sum)
46 |   //on the histogram to get the cumulative distribution
47 |   h_cdf[0] = 0;
48 |   for (size_t i = 1; i < numBins; ++i) {
49 |     h_cdf[i] = h_cdf[i - 1] + histo[i - 1];
50 |   }
51 |
52 |   delete[] histo;
53 | }
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 |
4 | void referenceCalculation(const float* const h_logLuminance, unsigned int* const h_cdf,
5 | const size_t numRows, const size_t numCols, const size_t numBins,
6 | float &logLumMin, float &logLumMax);
7 |
8 | #endif
9 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/student_func.cu:
--------------------------------------------------------------------------------
1 | /* Udacity Homework 3
2 | HDR Tone-mapping
3 |
4 | Background HDR
5 | ==============
6 |
7 | A High Dynamic Range (HDR) image contains a wider variation of intensity
8 | and color than is allowed by the RGB format with 1 byte per channel that we
9 | have used in the previous assignment.
10 |
11 | To store this extra information we use single precision floating point for
12 | each channel. This allows for an extremely wide range of intensity values.
13 |
14 | In the image for this assignment, the inside of a church with light coming in
15 | through stained glass windows, the raw input floating point values for the
16 | channels range from 0 to 275. But the mean is .41 and 98% of the values are
17 | less than 3! This means that certain areas (the windows) are extremely bright
18 | compared to everywhere else. If we linearly map this [0-275] range into the
19 | [0-255] range that we have been using then most values will be mapped to zero!
20 | The only thing we will be able to see are the very brightest areas - the
21 | windows - everything else will appear pitch black.
22 |
23 | The problem is that although we have cameras capable of recording the wide
24 | range of intensity that exists in the real world our monitors are not capable
25 | of displaying them. Our eyes are also quite capable of observing a much wider
26 | range of intensities than our image formats / monitors are capable of
27 | displaying.
28 |
29 | Tone-mapping is a process that transforms the intensities in the image so that
30 | the brightest values aren't nearly so far away from the mean. That way when
31 | we transform the values into [0-255] we can actually see the entire image.
32 | There are many ways to perform this process and it is as much an art as a
33 | science - there is no single "right" answer. In this homework we will
34 | implement one possible technique.
35 |
36 | Background Chrominance-Luminance
37 | ================================
38 |
39 | The RGB space that we have been using to represent images can be thought of as
40 | one possible set of axes spanning a three dimensional space of color. We
41 | sometimes choose other axes to represent this space because they make certain
42 | operations more convenient.
43 |
44 | Another possible way of representing a color image is to separate the color
45 | information (chromaticity) from the brightness information. There are
46 | multiple different methods for doing this - a common one during the analog
47 | television days was known as Chrominance-Luminance or YUV.
48 |
49 | We choose to represent the image in this way so that we can remap only the
50 | intensity channel and then recombine the new intensity values with the color
51 | information to form the final image.
52 |
53 | Old TV signals used to be transmitted in this way so that black & white
54 | televisions could display the luminance channel while color televisions would
55 | display all three of the channels.
56 |
57 |
58 | Tone-mapping
59 | ============
60 |
61 | In this assignment we are going to transform the luminance channel (actually
62 | the log of the luminance, but this is unimportant for the parts of the
63 | algorithm that you will be implementing) by compressing its range to [0, 1].
64 | To do this we need the cumulative distribution of the luminance values.
65 |
66 | Example
67 | -------
68 |
69 | input : [2 4 3 3 1 7 4 5 7 0 9 4 3 2]
70 | min / max / range: 0 / 9 / 9
71 |
72 | histo with 3 bins: [4 7 3]
73 |
74 | cdf : [4 11 14] (inclusive scan; the exclusive scan asked for in step 4 below is [0 4 11])
75 |
76 |
77 | Your task is to calculate this cumulative distribution by following these
78 | steps.
79 |
80 | */
81 |
82 | #include "utils.h"
83 |
84 | void your_histogram_and_prefixsum(const float* const d_logLuminance, //input log-luminance values
85 | unsigned int* const d_cdf, //output: scanned histogram (already allocated by the caller)
86 | float &min_logLum, //output: minimum of d_logLuminance
87 | float &max_logLum, //output: maximum of d_logLuminance
88 | const size_t numRows, //presumably the image height in pixels - confirm against the caller
89 | const size_t numCols, //presumably the image width in pixels - confirm against the caller
90 | const size_t numBins) //number of histogram bins
91 | {
92 | //TODO
93 | /*Here are the steps you need to implement
94 | 1) find the minimum and maximum value in the input logLuminance channel
95 | and store them in min_logLum and max_logLum
96 | 2) subtract them to find the range
97 | 3) generate a histogram of all the values in the logLuminance channel using
98 | the formula: bin = (lum[i] - lumMin) / lumRange * numBins, clamped to numBins - 1 (lum[i] == lumMax would otherwise give numBins)
99 | 4) Perform an exclusive scan (prefix sum) on the histogram to get
100 | the cumulative distribution of luminance values (this should go in the
101 | incoming d_cdf pointer which already has been allocated for you) */
102 |
103 |
104 | }
105 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/timer.h:
--------------------------------------------------------------------------------
1 | #ifndef GPU_TIMER_H__
2 | #define GPU_TIMER_H__
3 |
4 | #include
5 |
6 | struct GpuTimer //times GPU work between Start() and Stop() with a pair of CUDA events
7 | {
8 |   cudaEvent_t start; //recorded by Start()
9 |   cudaEvent_t stop;  //recorded by Stop()
10 |
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 |
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 |
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0); //stream 0
26 |   }
27 |
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0); //stream 0
31 |   }
32 |
33 |   float Elapsed() //waits for the stop event, then returns the time between the two events
34 |   {
35 |     float elapsed = 0.0f; //stays 0 instead of indeterminate if the unchecked calls below fail
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 |
42 | #endif /* GPU_TIMER_H__ */
43 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H__
2 | #define UTILS_H__
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 |
14 | template
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 | if (err != cudaSuccess) {
17 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
18 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
19 | exit(1);
20 | }
21 | }
22 |
23 | template
24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
25 | //check that the GPU result matches the CPU result
26 | for (size_t i = 0; i < numElem; ++i) {
27 | if (ref[i] != gpu[i]) {
28 | std::cerr << "Difference at pos " << i << std::endl;
29 | //the + is magic to convert char to int without messing
30 | //with other types
31 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
32 | "\nGPU : " << +gpu[i] << std::endl;
33 | exit(1);
34 | }
35 | }
36 | }
37 |
38 | //Compares gpu against ref element-wise: exits if any difference exceeds eps1, or if the
39 | //fraction of elements with a non-zero difference (all <= eps1) exceeds eps2.
40 | template<typename T>
41 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
42 |   assert(eps1 >= 0 && eps2 >= 0);
43 |   unsigned numSmallDifferences = 0;
44 |   for (size_t i = 0; i < numElem; ++i) {
45 |     //subtract smaller from larger in case of unsigned types
46 |     T smaller = std::min(ref[i], gpu[i]);
47 |     T larger = std::max(ref[i], gpu[i]);
48 |     T diff = larger - smaller;
49 |     if (diff > 0 && diff <= eps1) {
50 |       numSmallDifferences++;
51 |     }
52 |     else if (diff > eps1) {
53 |       std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
54 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
55 |         "\nGPU : " << +gpu[i] << std::endl;
56 |       exit(1);
57 |     }
58 |   }
59 |   double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; //a fraction in [0, 1]; scaled by 100 only when printed
60 |   if (percentSmallDifferences > eps2) {
61 |     std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
62 |     std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
63 |     exit(1);
64 |   }
65 | }
66 |
67 | //Uses the autodesk method of image comparison
68 | //Note that the tolerance here is in PIXELS not a percentage of input pixels
69 | template<typename T>
70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
71 | {
72 |   //count the elements whose absolute difference exceeds variance
73 |   size_t numBadPixels = 0;
74 |   for (size_t i = 0; i < numElem; ++i) {
75 |     T smaller = std::min(ref[i], gpu[i]);
76 |     T larger = std::max(ref[i], gpu[i]);
77 |     T diff = larger - smaller;
78 |     if (diff > variance)
79 |       ++numBadPixels;
80 |   }
81 |
82 |   if (numBadPixels > tolerance) {
83 |     std::cerr << "Too many bad pixels in the image. " << numBadPixels << "/" << tolerance << std::endl;
84 |     exit(1);
85 |   }
86 | }
87 |
88 | #endif
89 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4.zip
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # collect source files
9 |
10 | file( GLOB hdr *.hpp *.h )
11 | file( GLOB cu *.cu)
12 | SET (HW4_files main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp)
13 |
14 | CUDA_ADD_EXECUTABLE(HW4 ${HW4_files} ${hdr} ${img} ${cu})
15 |
16 |
17 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/Makefile:
--------------------------------------------------------------------------------
1 | NVCC=/usr/local/cuda-5.0/bin/nvcc
2 | #NVCC=nvcc
3 |
4 | ###################################
5 | # These are the default install #
6 | # locations on most linux distros #
7 | ###################################
8 |
9 | OPENCV_LIBPATH=/usr/lib
10 | OPENCV_INCLUDEPATH=/usr/include
11 |
12 | ###################################################
13 | # On Macs the default install locations are below #
14 | ###################################################
15 |
16 | #OPENCV_LIBPATH=/usr/local/lib
17 | #OPENCV_INCLUDEPATH=/usr/local/include
18 |
19 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
20 |
21 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
22 | # CUDA_INCLUDEPATH=/usr/local/cuda/lib64/include
23 | # CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
24 | # CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include
25 |
26 | ######################################################
27 | # On Macs the default install locations are below #
28 | # ####################################################
29 |
30 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
31 | #CUDA_LIBPATH=/usr/local/cuda/lib
32 | CUDA_LIBPATH=/usr/local/cuda-5.0/lib64
33 |
34 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
35 |
36 | GCC_OPTS=-O3 -Wall -Wextra -m64
37 |
38 | student: main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o Makefile
39 | $(NVCC) -o HW4 main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
40 |
41 | main.o: main.cpp timer.h utils.h reference_calc.h
42 | g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
43 |
44 | HW4.o: HW4.cu loadSaveImage.h utils.h
45 | $(NVCC) -c HW4.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS)
46 |
47 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h
48 | g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
49 |
50 | compare.o: compare.cpp compare.h
51 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
52 |
53 | reference_calc.o: reference_calc.cpp reference_calc.h
54 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
55 |
56 | student_func.o: student_func.cu reference_calc.cpp utils.h
57 | $(NVCC) -c student_func.cu $(NVCC_OPTS)
58 |
59 | clean:
60 | rm -f *.o *.png HW4
61 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/compare.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include "utils.h"
3 |
4 |
5 | //Loads both images, writes their contrast-stretched absolute difference to
6 | //HW4_differenceImage.png, then checks the raw pixels with checkResultsEps
7 | //(useEpsCheck) or checkResultsExact. Those helpers exit(1) on a mismatch;
8 | //PASS is printed otherwise.
9 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
10 |                    double perPixelError, double globalError)
11 | {
12 |   cv::Mat reference = cv::imread(reference_filename, -1);
13 |   cv::Mat test = cv::imread(test_filename, -1);
14 |
15 |   //imread returns an empty Mat on failure
16 |   if (reference.empty() || test.empty()) {
17 |     std::cerr << "Couldn't open file: " << (reference.empty() ? reference_filename : test_filename) << std::endl;
18 |     exit(1);
19 |   }
20 |
21 |   //the subtraction and the pointer walk below both require identical layouts
22 |   if (reference.size() != test.size() || reference.type() != test.type()) {
23 |     std::cerr << "Reference and test images differ in size or type!" << std::endl;
24 |     exit(1);
25 |   }
26 |
27 |   cv::Mat diff = abs(reference - test);
28 |
29 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
30 |
31 |   double minVal, maxVal;
32 |
33 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
34 |
35 |   //now perform transform so that we bump values to the full range;
36 |   //identical images give maxVal == minVal, so skip the stretch rather than divide by zero
37 |   const double range = maxVal - minVal;
38 |   if (range > 0) {
39 |     diffSingleChannel = (diffSingleChannel - minVal) * (255. / range);
40 |   }
41 |
42 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
43 |
44 |   cv::imwrite("HW4_differenceImage.png", diff);
45 |   //OK, now we can start comparing values...
46 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
47 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
48 |
49 |   if (useEpsCheck) {
50 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
51 |   }
52 |   else
53 |   {
54 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
55 |   }
56 |
57 |   std::cout << "PASS" << std::endl;
58 |   return;
59 | }
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef HW4_H__
2 | #define HW4_H__
3 |
4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
5 | double perPixelError, double globalError);
6 |
7 | #endif
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/loadSaveImage.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include "cuda_runtime.h"
6 |
7 | //The caller becomes responsible for the returned pointer. This
8 | //is done in the interest of keeping this code as simple as possible.
9 | //In production code this is a bad idea - we should use RAII
10 | //to ensure the memory is freed. DO NOT COPY THIS AND USE IN PRODUCTION
11 | //CODE!!!
12 | void loadImageHDR(const std::string &filename,
13 | float **imagePtr,
14 | size_t *numRows, size_t *numCols)
15 | {
16 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);
17 | if (image.empty()) {
18 | std::cerr << "Couldn't open file: " << filename << std::endl;
19 | exit(1);
20 | }
21 |
22 | if (image.channels() != 3) {
23 | std::cerr << "Image must be color!" << std::endl;
24 | exit(1);
25 | }
26 |
27 | if (!image.isContinuous()) {
28 | std::cerr << "Image isn't continuous!" << std::endl;
29 | exit(1);
30 | }
31 |
32 | *imagePtr = new float[image.rows * image.cols * image.channels()];
33 |
34 | float *cvPtr = image.ptr(0);
35 | for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i)
36 | (*imagePtr)[i] = cvPtr[i];
37 |
38 | *numRows = image.rows;
39 | *numCols = image.cols;
40 | }
41 |
42 | void loadImageRGBA(const std::string &filename,
43 | uchar4 **imagePtr,
44 | size_t *numRows, size_t *numCols)
45 | {
46 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
47 | if (image.empty()) {
48 | std::cerr << "Couldn't open file: " << filename << std::endl;
49 | exit(1);
50 | }
51 |
52 | if (image.channels() != 3) {
53 | std::cerr << "Image must be color!" << std::endl;
54 | exit(1);
55 | }
56 |
57 | if (!image.isContinuous()) {
58 | std::cerr << "Image isn't continuous!" << std::endl;
59 | exit(1);
60 | }
61 |
62 | cv::Mat imageRGBA;
63 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
64 |
65 | *imagePtr = new uchar4[image.rows * image.cols];
66 |
67 | unsigned char *cvPtr = imageRGBA.ptr(0);
68 | for (size_t i = 0; i < image.rows * image.cols; ++i) {
69 | (*imagePtr)[i].x = cvPtr[4 * i + 0];
70 | (*imagePtr)[i].y = cvPtr[4 * i + 1];
71 | (*imagePtr)[i].z = cvPtr[4 * i + 2];
72 | (*imagePtr)[i].w = cvPtr[4 * i + 3];
73 | }
74 |
75 | *numRows = image.rows;
76 | *numCols = image.cols;
77 | }
78 |
79 | void saveImageRGBA(const uchar4* const image,
80 | const size_t numRows, const size_t numCols,
81 | const std::string &output_file)
82 | {
83 | int sizes[2];
84 | sizes[0] = numRows;
85 | sizes[1] = numCols;
86 | cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image);
87 | cv::Mat imageOutputBGR;
88 | cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR);
89 | //output the image
90 | cv::imwrite(output_file.c_str(), imageOutputBGR);
91 | }
92 |
93 | //output an exr file
94 | //assumed to already be BGR
95 | void saveImageHDR(const float* const image,
96 |                   const size_t numRows, const size_t numCols,
97 |                   const std::string &output_file)
98 | {
99 |   int sizes[2];
100 |   sizes[0] = numRows;
101 |   sizes[1] = numCols;
102 |
103 |   //this Mat wraps the caller's buffer; no pixel data is copied
104 |   cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image);
105 |   //scale into a separate Mat: assigning the product back to imageHDR would overwrite the caller's const data
106 |   cv::Mat scaled = imageHDR * 255;
107 |   cv::imwrite(output_file.c_str(), scaled);
108 | }
109 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/loadSaveImage.h:
--------------------------------------------------------------------------------
1 | #ifndef LOADSAVEIMAGE_H__
2 | #define LOADSAVEIMAGE_H__
3 |
4 | #include
5 | #include //for uchar4
6 |
7 | void loadImageHDR(const std::string &filename,
8 | float **imagePtr,
9 | size_t *numRows, size_t *numCols);
10 |
11 | void loadImageRGBA(const std::string &filename,
12 | uchar4 **imagePtr,
13 | size_t *numRows, size_t *numCols);
14 |
15 | void saveImageRGBA(const uchar4* const image,
16 | const size_t numRows, const size_t numCols,
17 | const std::string &output_file);
18 |
19 | void saveImageHDR(const float* const image,
20 | const size_t numRows, const size_t numCols,
21 | const std::string &output_file);
22 |
23 | #endif
24 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/main.cpp:
--------------------------------------------------------------------------------
1 | //Udacity HW4 Driver
2 |
3 | #include
4 | #include "timer.h"
5 | #include "utils.h"
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #include "compare.h"
12 | #include "reference_calc.h"
13 |
14 | void preProcess(unsigned int **inputVals,
15 | unsigned int **inputPos,
16 | unsigned int **outputVals,
17 | unsigned int **outputPos,
18 | size_t &numElems,
19 | const std::string& filename,
20 | const std::string& template_file);
21 |
22 | void postProcess(const unsigned int* const outputVals,
23 | const unsigned int* const outputPos,
24 | const size_t numElems,
25 | const std::string& output_file);
26 |
27 | void your_sort(unsigned int* const inputVals,
28 | unsigned int* const inputPos,
29 | unsigned int* const outputVals,
30 | unsigned int* const outputPos,
31 | const size_t numElems);
32 |
33 | int main(int argc, char **argv) {
34 |   unsigned int *inputVals;
35 |   unsigned int *inputPos;
36 |   unsigned int *outputVals;
37 |   unsigned int *outputPos;
38 |
39 |   size_t numElems;
40 |
41 |   std::string input_file;
42 |   std::string template_file;
43 |   std::string output_file;
44 |   std::string reference_file;
45 |   double perPixelError = 0.0;
46 |   double globalError = 0.0;
47 |   bool useEpsCheck = false;
48 |
49 |   switch (argc)
50 |   {
51 |     case 3:
52 |       input_file = std::string(argv[1]);
53 |       template_file = std::string(argv[2]);
54 |       output_file = "HW4_output.png";
55 |       break;
56 |     case 4:
57 |       input_file = std::string(argv[1]);
58 |       template_file = std::string(argv[2]);
59 |       output_file = std::string(argv[3]);
60 |       break;
61 |     default:
62 |       std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl;
63 |       exit(1);
64 |   }
65 |   //load the image and give us our input and output pointers
66 |   preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems, input_file, template_file);
67 |
68 |   //Snapshot the untouched inputs on the host BEFORE the student code runs (outside the timed
69 |   //region): a ping-pong radix sort overwrites the input buffers, and the reference needs the originals.
70 |   thrust::device_ptr<unsigned int> d_inputVals(inputVals);
71 |   thrust::device_ptr<unsigned int> d_inputPos(inputPos);
72 |   thrust::host_vector<unsigned int> h_inputVals(d_inputVals,
73 |                                                 d_inputVals + numElems);
74 |   thrust::host_vector<unsigned int> h_inputPos(d_inputPos,
75 |                                                d_inputPos + numElems);
76 |
77 |   GpuTimer timer;
78 |   timer.Start();
79 |
80 |   //call the students' code
81 |   your_sort(inputVals, inputPos, outputVals, outputPos, numElems);
82 |
83 |   timer.Stop();
84 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
85 |   printf("\n");
86 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
87 |
88 |   if (err < 0) {
89 |     //Couldn't print! Probably the student closed stdout - bad news
90 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
91 |     exit(1);
92 |   }
93 |
94 |   //check results and output the red-eye corrected image
95 |   postProcess(outputVals, outputPos, numElems, output_file);
96 |
97 |   // check code moved from HW4.cu
98 |   /****************************************************************************
99 |   * You can use the code below to help with debugging, but make sure to *
100 |   * comment it out again before submitting your assignment for grading, *
101 |   * otherwise this code will take too much time and make it seem like your *
102 |   * GPU implementation isn't fast enough. *
103 |   * *
104 |   * The reference sort consumes the host snapshot of the inputs taken before *
105 |   * your_sort ran, so it is unaffected if your radix sort changes the inputs. *
106 |   * *
107 |   * This code performs the reference radix sort on the host and compares your *
108 |   * sorted values to the reference. *
109 |   * *
110 |   * Thrust containers are used for copying memory from the GPU *
111 |   * ************************************************************************* */
112 |   thrust::host_vector<unsigned int> h_outputVals(numElems);
113 |   thrust::host_vector<unsigned int> h_outputPos(numElems);
114 |
115 |   reference_calculation(&h_inputVals[0], &h_inputPos[0],
116 |                         &h_outputVals[0], &h_outputPos[0],
117 |                         numElems);
118 |
119 |   //postProcess(valsPtr, posPtr, numElems, reference_file);
120 |   //compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
121 |
122 |   thrust::device_ptr<unsigned int> d_outputVals(outputVals);
123 |   thrust::device_ptr<unsigned int> d_outputPos(outputPos);
124 |
125 |   thrust::host_vector<unsigned int> h_yourOutputVals(d_outputVals,
126 |                                                      d_outputVals + numElems);
127 |   thrust::host_vector<unsigned int> h_yourOutputPos(d_outputPos,
128 |                                                     d_outputPos + numElems);
129 |
130 |   checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems);
131 |   checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems);
132 |
133 |   checkCudaErrors(cudaFree(inputVals));
134 |   checkCudaErrors(cudaFree(inputPos));
135 |   checkCudaErrors(cudaFree(outputVals));
136 |   checkCudaErrors(cudaFree(outputPos));
137 |
138 |   return 0;
139 | }
140 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4/red_eye_effect.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4/red_eye_effect_5.jpg
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect_template_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4/red_eye_effect_template_5.jpg
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/reference_calc.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | // For memset
3 | #include
4 |
5 | void reference_calculation(unsigned int* inputVals,  //keys to sort; also used as a scratch buffer and overwritten
6 |                            unsigned int* inputPos,   //value paired with each key; moved alongside it and overwritten
7 |                            unsigned int* outputVals, //receives the keys in ascending order
8 |                            unsigned int* outputPos,  //receives the positions in the matching order
9 |                            const size_t numElems)
10 | {
11 |   const int numBits = 1;
12 |   const int numBins = 1 << numBits;
13 |
14 |   unsigned int *binHistogram = new unsigned int[numBins];
15 |   unsigned int *binScan = new unsigned int[numBins];
16 |   //each pass reads from *_src and scatters into *_dst; the pointers are swapped after every pass
17 |   unsigned int *vals_src = inputVals;
18 |   unsigned int *pos_src = inputPos;
19 |
20 |   unsigned int *vals_dst = outputVals;
21 |   unsigned int *pos_dst = outputPos;
22 |
23 |   //a simple LSB radix sort - only guaranteed to work when numBits splits the key width into an even number of passes
24 |   for (unsigned int i = 0; i < 8 * sizeof(unsigned int); i += numBits) {
25 |     unsigned int mask = static_cast<unsigned int>(numBins - 1) << i; //shift as unsigned: the top pass would overflow a signed int
26 |
27 |     memset(binHistogram, 0, sizeof(unsigned int) * numBins); //zero out the bins
28 |     memset(binScan, 0, sizeof(unsigned int) * numBins); //zero out the bins
29 |
30 |     //perform histogram of data & mask into bins
31 |     for (unsigned int j = 0; j < numElems; ++j) {
32 |       unsigned int bin = (vals_src[j] & mask) >> i;
33 |       binHistogram[bin]++;
34 |     }
35 |
36 |     //perform exclusive prefix sum (scan) on binHistogram to get starting
37 |     //location for each bin
38 |     for (unsigned int j = 1; j < numBins; ++j) {
39 |       binScan[j] = binScan[j - 1] + binHistogram[j - 1];
40 |     }
41 |
42 |     //Gather everything into the correct location
43 |     //need to move vals and positions
44 |     for (unsigned int j = 0; j < numElems; ++j) {
45 |       unsigned int bin = (vals_src[j] & mask) >> i;
46 |       vals_dst[binScan[bin]] = vals_src[j];
47 |       pos_dst[binScan[bin]] = pos_src[j];
48 |       binScan[bin]++;
49 |     }
50 |
51 |     //swap the buffers (pointers only)
52 |     std::swap(vals_dst, vals_src);
53 |     std::swap(pos_dst, pos_src);
54 |   }
55 |
56 |   //we did an even number of iterations, need to copy from input buffer into output
57 |   std::copy(inputVals, inputVals + numElems, outputVals);
58 |   std::copy(inputPos, inputPos + numElems, outputPos);
59 |
60 |   delete[] binHistogram;
61 |   delete[] binScan;
62 | }
63 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 |
4 |
5 | //A simple un-optimized reference radix sort calculation
6 | //Only deals with power-of-2 radices
7 |
8 |
9 | void reference_calculation(unsigned int* inputVals,
10 | unsigned int* inputPos,
11 | unsigned int* outputVals,
12 | unsigned int* outputPos,
13 | const size_t numElems);
14 | #endif
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/student_func.cu:
--------------------------------------------------------------------------------
1 | //Udacity HW 4
2 | //Radix Sorting
3 |
4 | #include "utils.h"
5 | #include
6 |
7 | /* Red Eye Removal
8 | ===============
9 |
10 | For this assignment we are implementing red eye removal. This is
11 | accomplished by first creating a score for every pixel that tells us how
12 | likely it is to be a red eye pixel. We have already done this for you - you
13 | are receiving the scores and need to sort them in ascending order so that we
14 | know which pixels to alter to remove the red eye.
15 |
16 | Note: ascending order == smallest to largest
17 |
18 | Each score is associated with a position, when you sort the scores, you must
19 | also move the positions accordingly.
20 |
21 | Implementing Parallel Radix Sort with CUDA
22 | ==========================================
23 |
24 | The basic idea is to construct a histogram on each pass of how many of each
25 | "digit" there are. Then we scan this histogram so that we know where to put
26 | the output of each digit. For example, the first 1 must come after all the
27 | 0s so we have to know how many 0s there are to be able to start moving 1s
28 | into the correct position.
29 |
30 | 1) Histogram of the number of occurrences of each digit
31 | 2) Exclusive Prefix Sum of Histogram
32 | 3) Determine relative offset of each digit
33 | For example [0 0 1 1 0 0 1]
34 | -> [0 1 0 1 2 3 2]
35 | 4) Combine the results of steps 2 & 3 to determine the final
36 | output location for each element and move it there
37 |
38 | LSB Radix sort is an out-of-place sort and you will need to ping-pong values
39 | between the input and output buffers we have provided. Make sure the final
40 | sorted results end up in the output buffer! Hint: You may need to do a copy
41 | at the end.
42 |
43 | */
44 |
45 |
46 | void your_sort(unsigned int* const d_inputVals,
47 | unsigned int* const d_inputPos,
48 | unsigned int* const d_outputVals,
49 | unsigned int* const d_outputPos,
50 | const size_t numElems)
51 | {
52 | //TODO
53 | //PUT YOUR SORT HERE
54 | }
55 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/timer.h:
--------------------------------------------------------------------------------
1 | #ifndef GPU_TIMER_H__
2 | #define GPU_TIMER_H__
3 |
4 | #include
5 |
6 | struct GpuTimer //times GPU work between Start() and Stop() with a pair of CUDA events
7 | {
8 |   cudaEvent_t start; //recorded by Start()
9 |   cudaEvent_t stop;  //recorded by Stop()
10 |
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 |
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 |
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0); //stream 0
26 |   }
27 |
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0); //stream 0
31 |   }
32 |
33 |   float Elapsed() //waits for the stop event, then returns the time between the two events
34 |   {
35 |     float elapsed = 0.0f; //stays 0 instead of indeterminate if the unchecked calls below fail
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 |
42 | #endif /* GPU_TIMER_H__ */
43 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H__
2 | #define UTILS_H__
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 |
14 | template
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 | if (err != cudaSuccess) {
17 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
18 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
19 | exit(1);
20 | }
21 | }
22 |
23 | template
24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
25 | //check that the GPU result matches the CPU result
26 | for (size_t i = 0; i < numElem; ++i) {
27 | if (ref[i] != gpu[i]) {
28 | std::cerr << "Difference at pos " << i << std::endl;
29 | //the + is magic to convert char to int without messing
30 | //with other types
31 | std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
32 | "\nGPU : " << +gpu[i] << std::endl;
33 | exit(1);
34 | }
35 | }
36 | }
37 |
38 | //Compares gpu against ref element-wise: exits if any difference exceeds eps1, or if the
39 | //fraction of elements with a non-zero difference (all <= eps1) exceeds eps2.
40 | template<typename T>
41 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
42 |   assert(eps1 >= 0 && eps2 >= 0);
43 |   unsigned numSmallDifferences = 0;
44 |   for (size_t i = 0; i < numElem; ++i) {
45 |     //subtract smaller from larger in case of unsigned types
46 |     T smaller = std::min(ref[i], gpu[i]);
47 |     T larger = std::max(ref[i], gpu[i]);
48 |     T diff = larger - smaller;
49 |     if (diff > 0 && diff <= eps1) {
50 |       numSmallDifferences++;
51 |     }
52 |     else if (diff > eps1) {
53 |       std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
54 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
55 |         "\nGPU : " << +gpu[i] << std::endl;
56 |       exit(1);
57 |     }
58 |   }
59 |   double percentSmallDifferences = (double)numSmallDifferences / (double)numElem; //a fraction in [0, 1]; scaled by 100 only when printed
60 |   if (percentSmallDifferences > eps2) {
61 |     std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
62 |     std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
63 |     exit(1);
64 |   }
65 | }
66 |
67 | //Uses the autodesk method of image comparison
68 | //Note that the tolerance here is in PIXELS not a percentage of input pixels
69 | template<typename T>
70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
71 | {
72 |   //count the elements whose absolute difference exceeds variance
73 |   size_t numBadPixels = 0;
74 |   for (size_t i = 0; i < numElem; ++i) {
75 |     T smaller = std::min(ref[i], gpu[i]);
76 |     T larger = std::max(ref[i], gpu[i]);
77 |     T diff = larger - smaller;
78 |     if (diff > variance)
79 |       ++numBadPixels;
80 |   }
81 |
82 |   if (numBadPixels > tolerance) {
83 |     std::cerr << "Too many bad pixels in the image. " << numBadPixels << "/" << tolerance << std::endl;
84 |     exit(1);
85 |   }
86 | }
87 |
88 | #endif
89 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 5.zip
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # collect source files
9 |
10 | file( GLOB hdr *.hpp *.h )
11 |
12 | SET (HW5_files main.cu student.cu reference_calc.cpp)
13 |
14 | CUDA_ADD_EXECUTABLE(HW5 ${HW5_files} ${hdr})
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/Makefile:
--------------------------------------------------------------------------------
NVCC=nvcc
NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64

# Host compiler flags for reference_calc.cpp. The rule below always referenced
# GCC_OPTS but nothing defined it; ?= keeps any value supplied by the caller.
GCC_OPTS?=-O3 -m64

# Neither target produces a file with its own name.
.PHONY: histo clean

# Default target: compile main.cu and link the HW5 binary.
histo: main.cu reference_calc.o student.o Makefile
	$(NVCC) -o HW5 main.cu reference_calc.o student.o $(NVCC_OPTS)

student.o: student.cu
	$(NVCC) -c student.cu $(NVCC_OPTS)

# OPENCV_INCLUDEPATH and CUDA_INCLUDEPATH are optional here: addprefix emits a
# -I flag only when the variable is set, so an unset variable no longer leaves
# a dangling "-I" on the command line.
reference_calc.o: reference_calc.cpp reference_calc.h
	g++ -c reference_calc.cpp $(addprefix -I,$(OPENCV_INCLUDEPATH)) $(GCC_OPTS) $(addprefix -I,$(CUDA_INCLUDEPATH))

# Remove objects and the produced binary (HW5); "hw" is kept for older builds.
clean:
	rm -f *.o hw HW5 *.bin
15 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/main.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include "utils.h"
6 | #include "timer.h"
7 | #include
8 | #if defined(_WIN16) || defined(_WIN32) || defined(_WIN64)
9 | #include
10 | #else
11 | #include
12 | #endif
13 |
14 | #include
15 | #include
16 | #include
17 |
18 | #include "reference_calc.h"
19 |
20 | void computeHistogram(const unsigned int *const d_vals,
21 | unsigned int* const d_histo,
22 | const unsigned int numBins,
23 | const unsigned int numElems);
24 |
//Driver for HW5: generates normally distributed bin indices on the host,
//times computeHistogram on the GPU and compares its result with the serial
//reference histogram. Returns 0 on success; every failure path terminates
//the process through exit(1) (checkCudaErrors / checkResultsExact).
int main(void)
{
  const unsigned int numBins = 1024;
  const unsigned int numElems = 10000 * numBins;
  const float stddev = 100.f;

  //host buffers: the input values and the two histograms that get compared
  unsigned int *vals = new unsigned int[numElems];
  unsigned int *h_studentHisto = new unsigned int[numBins];
  unsigned int *h_refHisto = new unsigned int[numBins];

#if defined(_WIN16) || defined(_WIN32) || defined(_WIN64)
  srand(GetTickCount());
#else
  timeval tv;
  gettimeofday(&tv, NULL);

  srand(tv.tv_usec);
#endif

  //make the mean unpredictable, but close enough to the middle
  //so that timings are unaffected
  unsigned int mean = rand() % 100 + 462;

  //Output mean so that grading can happen with the same inputs
  std::cout << mean << std::endl;

  thrust::minstd_rand rng;

  thrust::random::experimental::normal_distribution<float> normalDist((float)mean, stddev);

  // Generate the random values, clamped to the valid bin range [0, numBins - 1]
  for (size_t i = 0; i < numElems; ++i) {
    vals[i] = std::min((unsigned int) std::max((int)normalDist(rng), 0), numBins - 1);
  }

  unsigned int *d_vals, *d_histo;

  GpuTimer timer;

  checkCudaErrors(cudaMalloc(&d_vals, sizeof(unsigned int) * numElems));
  checkCudaErrors(cudaMalloc(&d_histo, sizeof(unsigned int) * numBins));
  checkCudaErrors(cudaMemset(d_histo, 0, sizeof(unsigned int) * numBins));

  checkCudaErrors(cudaMemcpy(d_vals, vals, sizeof(unsigned int) * numElems, cudaMemcpyHostToDevice));

  timer.Start();
  computeHistogram(d_vals, d_histo, numBins, numElems);
  timer.Stop();
  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    //Couldn't print! Probably the student closed stdout - bad news
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  // copy the student-computed histogram back to the host
  checkCudaErrors(cudaMemcpy(h_studentHisto, d_histo, sizeof(unsigned int) * numBins, cudaMemcpyDeviceToHost));

  //generate reference for the given mean
  reference_calculation(vals, h_refHisto, numBins, numElems);

  //Now do the comparison
  checkResultsExact(h_refHisto, h_studentHisto, numBins);

  //release every host buffer allocated above (vals used to be leaked)
  delete[] vals;
  delete[] h_refHisto;
  delete[] h_studentHisto;

  checkCudaErrors(cudaFree(d_vals));
  checkCudaErrors(cudaFree(d_histo));

  return 0;
}
100 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/reference_calc.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | //Reference Histogram calculation
3 |
//Serial CPU histogram used as the ground truth.
//  vals     - numElems values; each one is used directly as an index into histo
//  histo    - array of numBins counters, overwritten by this call
//  numBins  - number of counters in histo
//  numElems - number of entries in vals
void reference_calculation(const unsigned int* const vals,
                           unsigned int* const histo,
                           const size_t numBins,
                           const size_t numElems)
{
  //every counter starts from zero
  size_t bin = 0;
  while (bin < numBins) {
    histo[bin] = 0;
    ++bin;
  }

  //walk the input once; each value names the bin it belongs to
  const unsigned int* const last = vals + numElems;
  for (const unsigned int* cur = vals; cur != last; ++cur) {
    histo[*cur] += 1;
  }
}
18 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 |
4 | //Reference Histogram calculation
5 |
6 | void reference_calculation(const unsigned int* const vals,
7 | unsigned int* const histo,
8 | const size_t numBins,
9 | const size_t numElems);
10 |
11 | #endif
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/student.cu:
--------------------------------------------------------------------------------
1 | /* Udacity HW5
2 | Histogramming for Speed
3 |
   The goal of this assignment is to compute a histogram
5 | as fast as possible. We have simplified the problem as much as
6 | possible to allow you to focus solely on the histogramming algorithm.
7 |
8 | The input values that you need to histogram are already the exact
9 | bins that need to be updated. This is unlike in HW3 where you needed
10 | to compute the range of the data and then do:
11 | bin = (val - valMin) / valRange to determine the bin.
12 |
13 | Here the bin is just:
14 | bin = val
15 |
16 | so the serial histogram calculation looks like:
17 | for (i = 0; i < numElems; ++i)
18 | histo[val[i]]++;
19 |
20 | That's it! Your job is to make it run as fast as possible!
21 |
22 | The values are normally distributed - you may take
23 | advantage of this fact in your implementation.
24 |
25 | */
26 |
27 |
28 | #include "utils.h"
29 |
//Kernel skeleton for the histogram computation; the body is intentionally
//left for the student to write.
//  vals    - input values; per the assignment text above, each value is
//            already the index of the bin it belongs to
//  histo   - output bin counters
//  numVals - presumably the number of entries in vals; confirm against the
//            launch written in computeHistogram
__global__
void yourHisto(const unsigned int* const vals, //INPUT
               unsigned int* const histo,      //OUTPUT
               int numVals)
{
  //TODO fill in this kernel to calculate the histogram
  //as quickly as possible

  //Although we provide only one kernel skeleton,
  //feel free to use more if it will help you
  //write faster code
}
42 |
//Host entry point called by the driver: launches the histogram kernel(s) and
//waits for them to finish.
//  d_vals   - input values
//  d_histo  - output histogram
//  numBins  - number of bins in d_histo
//  numElems - number of values in d_vals
//Any CUDA error reported after the launch or by the synchronisation
//terminates the process through checkCudaErrors.
void computeHistogram(const unsigned int* const d_vals, //INPUT
                      unsigned int* const d_histo,      //OUTPUT
                      const unsigned int numBins,
                      const unsigned int numElems)
{
  //TODO Launch the yourHisto kernel

  //if you want to use/launch more than one kernel,
  //feel free

  //launch-time errors are reported by cudaGetLastError, execution-time
  //errors by the synchronising call - check both instead of dropping the
  //status returned by cudaDeviceSynchronize
  checkCudaErrors(cudaGetLastError());
  checkCudaErrors(cudaDeviceSynchronize());
}
55 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/timer.h:
--------------------------------------------------------------------------------
1 | #ifndef GPU_TIMER_H__
2 | #define GPU_TIMER_H__
3 |
4 | #include
5 |
//Measures the time between Start() and Stop() with a pair of CUDA events.
//The events are created in the constructor and destroyed in the destructor;
//both are recorded on stream 0.
//NOTE(review): the struct is copyable although it destroys its events in the
//destructor - a copy would destroy the same events twice, so do not copy it.
struct GpuTimer
{
  cudaEvent_t start;
  cudaEvent_t stop;

  GpuTimer()
  {
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
  }

  ~GpuTimer()
  {
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
  }

  //Records the start event on stream 0.
  void Start()
  {
    cudaEventRecord(start, 0);
  }

  //Records the stop event on stream 0.
  void Stop()
  {
    cudaEventRecord(stop, 0);
  }

  //Waits for the stop event and returns the value reported by
  //cudaEventElapsedTime for the start/stop pair.
  float Elapsed()
  {
    //start from a defined value so that a failing cudaEventElapsedTime call
    //that leaves the output untouched cannot hand back an indeterminate float
    float elapsed = 0.0f;
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed, start, stop);
    return elapsed;
  }
};
41 |
42 | #endif /* GPU_TIMER_H__ */
43 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H__
2 | #define UTILS_H__
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 |
//Backend of the checkCudaErrors macro: when err is not cudaSuccess, prints
//the file and line, the CUDA error string and the text of the failing
//expression to stderr, then terminates the process with exit(1).
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
  //nothing to report when the call succeeded
  if (err == cudaSuccess) {
    return;
  }

  std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
  std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
  exit(1);
}
22 |
//Verifies that gpu matches ref element for element. On the first mismatch
//the position and both values are printed to stderr and the process exits
//with status 1; when all numElem elements agree the function returns.
template<typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
  for (size_t pos = 0; pos != numElem; ++pos) {
    if (ref[pos] == gpu[pos]) {
      continue;
    }

    std::cerr << "Difference at pos " << pos << std::endl;
    //unary + promotes char types to int so they print as numbers,
    //and leaves every other type untouched
    std::cerr << "Reference: " << std::setprecision(17) << +ref[pos] <<
                 "\nGPU : " << +gpu[pos] << std::endl;
    exit(1);
  }
}
37 |
//Compares gpu against ref with two tolerances.
//  eps1 - largest per-element absolute difference that is accepted; the first
//         element exceeding it is reported on stderr and the process exits
//  eps2 - largest accepted FRACTION (0..1) of elements whose difference is
//         non-zero but within eps1; exceeding it also exits with status 1
//Returns normally when both limits hold.
template<typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
  assert(eps1 >= 0 && eps2 >= 0);
  unsigned numSmallDifferences = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger in case of unsigned types
    T smaller = std::min(ref[i], gpu[i]);
    T larger = std::max(ref[i], gpu[i]);
    T diff = larger - smaller;
    if (diff > 0 && diff <= eps1) {
      numSmallDifferences++;
    }
    else if (diff > eps1) {
      std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
                   "\nGPU : " << +gpu[i] << std::endl;
      exit(1);
    }
    //the former sum of squared differences was never read, so it is no
    //longer accumulated
  }
  //despite the name this is a fraction in [0, 1]; it is scaled by 100 only
  //when printed
  double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
  if (percentSmallDifferences > eps2) {
    std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
    std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
    exit(1);
  }
}
66 |
//Uses the autodesk method of image comparison
//Note that the tolerance here is in PIXELS, not a percentage of input pixels
//
//Counts the elements whose absolute difference between ref and gpu is
//strictly greater than `variance`. If that count is strictly greater than
//`tolerance`, the counts are reported on stderr and the process exits with
//status 1; otherwise the function simply returns.
//  ref, gpu  - arrays of numElem values to compare
//  variance  - largest per-element difference that is still acceptable
//  tolerance - number of unacceptable elements that is still allowed
template<typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
  size_t numBadPixels = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger so unsigned types cannot wrap around
    T smaller = std::min(ref[i], gpu[i]);
    T larger = std::max(ref[i], gpu[i]);
    T diff = larger - smaller;
    if (diff > variance)
      ++numBadPixels;
  }

  if (numBadPixels > tolerance) {
    //keep a space between the sentence and the "bad/allowed" counts
    std::cerr << "Too many bad pixels in the image. " << numBadPixels << "/" << tolerance << std::endl;
    exit(1);
  }
}
87 |
88 | #endif
89 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 6.zip
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # collect source files
9 |
# Gather every header in this directory (*.hpp and *.h) into `hdr`.
file( GLOB hdr *.hpp *.h )

# Source files that make up the HW6 executable.
SET (HW6_files student_func.cu HW6.cu main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp)

# Build HW6 from the sources and headers collected above.
# NOTE(review): CUDA_ADD_EXECUTABLE is not defined in this file; presumably a
# parent CMakeLists.txt loads the CUDA package first - confirm.
CUDA_ADD_EXECUTABLE(HW6 ${HW6_files} ${hdr})
15 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/HW6.cu:
--------------------------------------------------------------------------------
1 | #include "utils.h"
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | #include "loadSaveImage.h"
8 | #include
9 |
10 |
11 | //return types are void since any internal error will be handled by quitting
12 | //no point in returning error codes...
//Loads the source and destination images and allocates the output buffer.
//  sourceImg, destImg - receive buffers allocated by loadImageRGBA
//  numRows, numCols   - receive the (shared) image dimensions
//  blendedImg         - receives a new uchar4[numRows * numCols] buffer
//  source_filename, dest_filename - image files to load
//The caller owns all three buffers. The process exits with status 1 when the
//two images do not have the same dimensions.
void preProcess( uchar4 **sourceImg,
                 size_t &numRows, size_t &numCols,
                 uchar4 **destImg,
                 uchar4 **blendedImg, const std::string& source_filename,
                 const std::string& dest_filename){

  //make sure the context initializes ok
  checkCudaErrors(cudaFree(0));

  size_t numRowsSource, numColsSource, numRowsDest, numColsDest;

  loadImageRGBA(source_filename, sourceImg, &numRowsSource, &numColsSource);
  loadImageRGBA(dest_filename, destImg, &numRowsDest, &numColsDest);

  //a single numRows/numCols pair is reported for both images, so they must
  //match; this is checked at runtime because assert() is compiled out when
  //NDEBUG is defined
  if (numRowsSource != numRowsDest || numColsSource != numColsDest) {
    std::cerr << "Source and destination images must have the same dimensions: "
              << numRowsSource << "x" << numColsSource << " vs "
              << numRowsDest << "x" << numColsDest << std::endl;
    exit(1);
  }

  numRows = numRowsSource;
  numCols = numColsSource;

  *blendedImg = new uchar4[numRows * numCols];

}
36 |
//Writes an image to disk by forwarding all arguments to saveImageRGBA.
//  blendedImg               - pixels to save
//  numRowsDest, numColsDest - image dimensions
//  output_file              - path of the file to write
void postProcess(const uchar4* const blendedImg,
                 const size_t numRowsDest, const size_t numColsDest,
                 const std::string& output_file)
{
  //just need to save the image...
  saveImageRGBA(blendedImg, numRowsDest, numColsDest, output_file);
}
44 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/Makefile:
--------------------------------------------------------------------------------
1 | NVCC=/usr/local/cuda-5.0/bin/nvcc
2 | #NVCC=nvcc
3 |
4 | ###################################
5 | # These are the default install #
6 | # locations on most linux distros #
7 | ###################################
8 |
9 | OPENCV_LIBPATH=/usr/lib
10 | OPENCV_INCLUDEPATH=/usr/include
11 |
12 | ###################################################
13 | # On Macs the default install locations are below #
14 | ###################################################
15 |
16 | #OPENCV_LIBPATH=/usr/local/lib
17 | #OPENCV_INCLUDEPATH=/usr/local/include
18 |
19 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
20 |
21 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
22 | # CUDA_INCLUDEPATH=/usr/local/cuda/lib64/include
23 | # CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
24 | # CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include
25 |
26 | ######################################################
27 | # On Macs the default install locations are below #
28 | # ####################################################
29 |
30 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
31 | #CUDA_LIBPATH=/usr/local/cuda/lib
32 | CUDA_LIBPATH=/usr/local/cuda-5.0/lib64
33 |
34 | #no warnings otherwise thrust explodes output
35 |
36 | NVCC_OPTS=-O3 -arch=sm_20 -m64
37 |
38 | GCC_OPTS=-O3 -m64
39 |
# clean never produces a file called "clean"
.PHONY: clean

# Default target: link all objects into the HW6 binary.
student: main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o Makefile
	$(NVCC) -o HW6 main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)

main.o: main.cpp timer.h utils.h
	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)

HW6.o: HW6.cu loadSaveImage.h utils.h
	$(NVCC) -c HW6.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS)

loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h
	g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)

student_func.o: student_func.cu reference_calc.cpp utils.h
	$(NVCC) -c student_func.cu $(NVCC_OPTS)

compare.o: compare.cpp compare.h
	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)

reference_calc.o: reference_calc.cpp reference_calc.h
	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)

# Remove objects, the linked binary (HW6) and every generated png; the two
# input images source.png and destination.png are kept.
clean:
	rm -f *.o hw HW6
	find . -type f -name '*.png' | grep -v source.png | grep -v destination.png | xargs rm -f
64 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/blended.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 6/blended.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/compare.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include "utils.h"
3 |
4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
5 | double perPixelError, double globalError)
6 | {
7 | cv::Mat reference = cv::imread(reference_filename, -1);
8 | cv::Mat test = cv::imread(test_filename, -1);
9 |
10 | cv::Mat diff = abs(reference - test);
11 |
12 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
13 |
14 | double minVal, maxVal;
15 |
16 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
17 |
18 | //now perform transform so that we bump values to the full range
19 |
20 | diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
21 |
22 | diff = diffSingleChannel.reshape(reference.channels(), 0);
23 |
24 | cv::imwrite("HW6_differenceImage.png", diff);
25 | //OK, now we can start comparing values...
26 | unsigned char *referencePtr = reference.ptr(0);
27 | unsigned char *testPtr = test.ptr(0);
28 |
29 | if (useEpsCheck) {
30 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
31 | }
32 | else
33 | {
34 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
35 | }
36 |
37 | std::cout << "PASS" << std::endl;
38 | return;
39 | }
40 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/compare.h:
--------------------------------------------------------------------------------
#ifndef COMPARE_H__
#define COMPARE_H__

//std::string appears in the signature below, so the header includes what it
//needs instead of relying on the including file
#include <string>

//Compares the image stored in test_filename against the one stored in
//reference_filename. With useEpsCheck the comparison uses the perPixelError
//and globalError tolerances, otherwise the images must match exactly.
void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
                   double perPixelError, double globalError);

#endif
8 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/destination.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 6/destination.png
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/loadSaveImage.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include "cuda_runtime.h"
6 |
7 | //The caller becomes responsible for the returned pointer. This
8 | //is done in the interest of keeping this code as simple as possible.
9 | //In production code this is a bad idea - we should use RAII
10 | //to ensure the memory is freed. DO NOT COPY THIS AND USE IN PRODUCTION
11 | //CODE!!!
12 | void loadImageHDR(const std::string &filename,
13 | float **imagePtr,
14 | size_t *numRows, size_t *numCols)
15 | {
16 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);
17 | if (image.empty()) {
18 | std::cerr << "Couldn't open file: " << filename << std::endl;
19 | exit(1);
20 | }
21 |
22 | if (image.channels() != 3) {
23 | std::cerr << "Image must be color!" << std::endl;
24 | exit(1);
25 | }
26 |
27 | if (!image.isContinuous()) {
28 | std::cerr << "Image isn't continuous!" << std::endl;
29 | exit(1);
30 | }
31 |
32 | *imagePtr = new float[image.rows * image.cols * image.channels()];
33 |
34 | float *cvPtr = image.ptr(0);
35 | for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i)
36 | (*imagePtr)[i] = cvPtr[i];
37 |
38 | *numRows = image.rows;
39 | *numCols = image.cols;
40 | }
41 |
42 | void loadImageGrey(const std::string &filename,
43 | unsigned char **imagePtr,
44 | size_t *numRows, size_t *numCols)
45 | {
46 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE);
47 | if (image.empty()) {
48 | std::cerr << "Couldn't open file: " << filename << std::endl;
49 | exit(1);
50 | }
51 |
52 | if (image.channels() != 1) {
53 | std::cerr << "Image must be greyscale!" << std::endl;
54 | exit(1);
55 | }
56 |
57 | if (!image.isContinuous()) {
58 | std::cerr << "Image isn't continuous!" << std::endl;
59 | exit(1);
60 | }
61 |
62 | *imagePtr = new unsigned char[image.rows * image.cols];
63 |
64 | unsigned char *cvPtr = image.ptr(0);
65 | for (size_t i = 0; i < image.rows * image.cols; ++i) {
66 | (*imagePtr)[i] = cvPtr[i];
67 | }
68 |
69 | *numRows = image.rows;
70 | *numCols = image.cols;
71 | }
72 | void loadImageRGBA(const std::string &filename,
73 | uchar4 **imagePtr,
74 | size_t *numRows, size_t *numCols)
75 | {
76 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
77 | if (image.empty()) {
78 | std::cerr << "Couldn't open file: " << filename << std::endl;
79 | exit(1);
80 | }
81 |
82 | if (image.channels() != 3) {
83 | std::cerr << "Image must be color!" << std::endl;
84 | exit(1);
85 | }
86 |
87 | if (!image.isContinuous()) {
88 | std::cerr << "Image isn't continuous!" << std::endl;
89 | exit(1);
90 | }
91 |
92 | cv::Mat imageRGBA;
93 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
94 |
95 | *imagePtr = new uchar4[image.rows * image.cols];
96 |
97 | unsigned char *cvPtr = imageRGBA.ptr(0);
98 | for (size_t i = 0; i < image.rows * image.cols; ++i) {
99 | (*imagePtr)[i].x = cvPtr[4 * i + 0];
100 | (*imagePtr)[i].y = cvPtr[4 * i + 1];
101 | (*imagePtr)[i].z = cvPtr[4 * i + 2];
102 | (*imagePtr)[i].w = cvPtr[4 * i + 3];
103 | }
104 |
105 | *numRows = image.rows;
106 | *numCols = image.cols;
107 | }
108 |
109 | void saveImageRGBA(const uchar4* const image,
110 | const size_t numRows, const size_t numCols,
111 | const std::string &output_file)
112 | {
113 | int sizes[2];
114 | sizes[0] = numRows;
115 | sizes[1] = numCols;
116 | cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image);
117 | cv::Mat imageOutputBGR;
118 | cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR);
119 | //output the image
120 | cv::imwrite(output_file.c_str(), imageOutputBGR);
121 | }
122 |
123 | //output an exr file
124 | //assumed to already be BGR
125 | void saveImageHDR(const float* const image,
126 | const size_t numRows, const size_t numCols,
127 | const std::string &output_file)
128 | {
129 | int sizes[2];
130 | sizes[0] = numRows;
131 | sizes[1] = numCols;
132 |
133 | cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image);
134 |
135 | imageHDR = imageHDR * 255;
136 |
137 | cv::imwrite(output_file.c_str(), imageHDR);
138 | }
139 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/loadSaveImage.h:
--------------------------------------------------------------------------------
1 | #ifndef LOADSAVEIMAGE_H__
2 | #define LOADSAVEIMAGE_H__
3 |
4 | #include
5 | #include //for uchar4
6 |
7 | void loadImageHDR(const std::string &filename,
8 | float **imagePtr,
9 | size_t *numRows, size_t *numCols);
10 |
11 | void loadImageRGBA(const std::string &filename,
12 | uchar4 **imagePtr,
13 | size_t *numRows, size_t *numCols);
14 |
15 | void loadImageGrey(const std::string &filename,
16 | unsigned char **imagePtr,
17 | size_t *numRows, size_t *numCols);
18 |
19 | void saveImageRGBA(const uchar4* const image,
20 | const size_t numRows, const size_t numCols,
21 | const std::string &output_file);
22 |
23 | void saveImageHDR(const float* const image,
24 | const size_t numRows, const size_t numCols,
25 | const std::string &output_file);
26 |
27 | #endif
28 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/main.cpp:
--------------------------------------------------------------------------------
1 | //Udacity HW6 Driver
2 |
3 | #include
4 | #include "timer.h"
5 | #include "utils.h"
6 | #include
7 | #include
8 |
9 | #include
10 | #include
11 | #include
12 |
13 | #include "reference_calc.h"
14 | #include "compare.h"
15 |
16 | void preProcess( uchar4 **sourceImg, size_t &numRowsSource, size_t &numColsSource,
17 | uchar4 **destImg,
18 | uchar4 **blendedImg, const std::string& source_filename,
19 | const std::string& dest_filename);
20 |
21 | void postProcess(const uchar4* const blendedImg,
22 | const size_t numRowsDest, const size_t numColsDest,
23 | const std::string& output_file);
24 |
25 | void your_blend(const uchar4* const sourceImg,
26 | const size_t numRowsSource, const size_t numColsSource,
27 | const uchar4* const destImg,
28 | uchar4* const blendedImg);
29 |
30 | int main(int argc, char **argv) {
31 | uchar4 *h_sourceImg, *h_destImg, *h_blendedImg;
32 | size_t numRowsSource, numColsSource;
33 |
34 | std::string input_source_file;
35 | std::string input_dest_file;
36 | std::string output_file;
37 |
38 | std::string reference_file;
39 | double perPixelError = 0.0;
40 | double globalError = 0.0;
41 | bool useEpsCheck = false;
42 |
43 | switch (argc)
44 | {
45 | case 3:
46 | input_source_file = std::string(argv[1]);
47 | input_dest_file = std::string(argv[2]);
48 | output_file = "HW6_output.png";
49 | reference_file = "HW6_reference.png";
50 | break;
51 | case 4:
52 | input_source_file = std::string(argv[1]);
53 | input_dest_file = std::string(argv[2]);
54 | output_file = std::string(argv[3]);
55 | reference_file = "HW6_reference.png";
56 | break;
57 | case 5:
58 | input_source_file = std::string(argv[1]);
59 | input_dest_file = std::string(argv[2]);
60 | output_file = std::string(argv[3]);
61 | reference_file = std::string(argv[4]);
62 | break;
63 | case 7:
64 | useEpsCheck=true;
65 | input_source_file = std::string(argv[1]);
66 | input_dest_file = std::string(argv[2]);
67 | output_file = std::string(argv[3]);
68 | reference_file = std::string(argv[4]);
69 | perPixelError = atof(argv[5]);
70 | globalError = atof(argv[6]);
71 | break;
72 | default:
73 | std::cerr << "Usage: ./HW6 input_source_file input_dest_filename [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
74 | exit(1);
75 | }
76 |
77 | //load the image and give us our input and output pointers
78 | preProcess(&h_sourceImg, numRowsSource, numColsSource,
79 | &h_destImg,
80 | &h_blendedImg, input_source_file, input_dest_file);
81 |
82 | GpuTimer timer;
83 | timer.Start();
84 |
85 | //call the students' code
86 | your_blend(h_sourceImg, numRowsSource, numColsSource,
87 | h_destImg,
88 | h_blendedImg);
89 |
90 | timer.Stop();
91 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
92 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
93 | printf("\n");
94 | if (err < 0) {
95 | //Couldn't print! Probably the student closed stdout - bad news
96 | std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
97 | exit(1);
98 | }
99 |
100 | //check results and output the tone-mapped image
101 | postProcess(h_blendedImg, numRowsSource, numColsSource, output_file);
102 |
103 | // calculate the reference image
104 | uchar4* h_reference = new uchar4[numRowsSource*numColsSource];
105 | reference_calc(h_sourceImg, numRowsSource, numColsSource,
106 | h_destImg, h_reference);
107 |
108 | // save the reference image
109 | postProcess(h_reference, numRowsSource, numColsSource, reference_file);
110 |
111 | compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
112 |
113 | delete[] h_reference;
114 | delete[] h_destImg;
115 | delete[] h_sourceImg;
116 | delete[] h_blendedImg;
117 | return 0;
118 | }
119 |
120 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/reference_calc.cpp:
--------------------------------------------------------------------------------
1 | //Udacity HW 6
2 | //Poisson Blending Reference Calculation
3 |
4 | #include "utils.h"
5 | #include
6 |
7 | //Performs one iteration of the solver
8 | void computeIteration(const unsigned char* const dstImg,
9 | const unsigned char* const strictInteriorPixels,
10 | const unsigned char* const borderPixels,
11 | const std::vector& interiorPixelList,
12 | const size_t numColsSource,
13 | const float* const f,
14 | const float* const g,
15 | float* const f_next)
16 | {
17 | unsigned int off = interiorPixelList[0].x * numColsSource + interiorPixelList[0].y;
18 |
19 | for (size_t i = 0; i < interiorPixelList.size(); ++i) {
20 | float blendedSum = 0.f;
21 | float borderSum = 0.f;
22 |
23 | uint2 coord = interiorPixelList[i];
24 |
25 | unsigned int offset = coord.x * numColsSource + coord.y;
26 |
27 | //process all 4 neighbor pixels
28 | //for each pixel if it is an interior pixel
29 | //then we add the previous f, otherwise if it is a
30 | //border pixel then we add the value of the destination
31 | //image at the border. These border values are our boundary
32 | //conditions.
33 | if (strictInteriorPixels[offset - 1]) {
34 | blendedSum += f[offset - 1];
35 | }
36 | else {
37 | borderSum += dstImg[offset - 1];
38 | }
39 |
40 | if (strictInteriorPixels[offset + 1]) {
41 | blendedSum += f[offset + 1];
42 | }
43 | else {
44 | borderSum += dstImg[offset + 1];
45 | }
46 |
47 | if (strictInteriorPixels[offset - numColsSource]) {
48 | blendedSum += f[offset - numColsSource];
49 | }
50 | else {
51 | borderSum += dstImg[offset - numColsSource];
52 | }
53 |
54 | if (strictInteriorPixels[offset + numColsSource]) {
55 | blendedSum += f[offset + numColsSource];
56 | }
57 | else {
58 | borderSum += dstImg[offset + numColsSource];
59 | }
60 |
61 | float f_next_val = (blendedSum + borderSum + g[offset]) / 4.f;
62 |
63 | f_next[offset] = std::min(255.f, std::max(0.f, f_next_val)); //clip to [0, 255]
64 | }
65 |
66 | }
67 |
68 | //pre-compute the values of g, which depend only the source image
69 | //and aren't iteration dependent.
70 | void computeG(const unsigned char* const channel,
71 | float* const g,
72 | const size_t numColsSource,
73 | const std::vector& interiorPixelList)
74 | {
75 | for (size_t i = 0; i < interiorPixelList.size(); ++i) {
76 | uint2 coord = interiorPixelList[i];
77 | unsigned int offset = coord.x * numColsSource + coord.y;
78 |
79 | float sum = 4.f * channel[offset];
80 |
81 | sum -= (float)channel[offset - 1] + (float)channel[offset + 1];
82 | sum -= (float)channel[offset + numColsSource] + (float)channel[offset - numColsSource];
83 |
84 | g[offset] = sum;
85 | }
86 | }
87 | // CPU reference blend: masks the non-white source pixels, runs computeIteration 800 times per channel, and writes the result to h_blendedImg.
88 | void reference_calc(const uchar4* const h_sourceImg,
89 |                     const size_t numRowsSource, const size_t numColsSource,
90 |                     const uchar4* const h_destImg,
91 |                     uchar4* const h_blendedImg){
92 |   //all three images are indexed as numRowsSource * numColsSource pixels, row-major (offset = row * numColsSource + col)
93 |   //we need to create a list of border pixels and interior pixels
94 |   //this is a conceptually simple implementation, not a particularly efficient one...
95 |
96 |   //first create mask: 1 for every source pixel that is not pure white (x + y + z < 3 * 255), 0 otherwise
97 |   const size_t srcSize = numRowsSource * numColsSource;
98 |   unsigned char* mask = new unsigned char[srcSize];
99 |
100 |   for (size_t i = 0; i < srcSize; ++i) {
101 |     mask[i] = (h_sourceImg[i].x + h_sourceImg[i].y + h_sourceImg[i].z < 3 * 255) ? 1 : 0;
102 |   }
103 |
104 |   //next compute strictly interior pixels and border pixels
105 |   unsigned char *borderPixels = new unsigned char[srcSize]();
106 |   unsigned char *strictInteriorPixels = new unsigned char[srcSize]();
107 |   //(zero-initialized above: the loop below never writes the outermost rows/columns, so those entries must not be left indeterminate)
108 |   std::vector<uint2> interiorPixelList;
109 |
110 |   //the source region in the homework isn't near an image boundary, so we can
111 |   //simplify the conditionals a little... ("r + 1 <" instead of "r < ... - 1" avoids size_t underflow on a zero dimension)
112 |   for (size_t r = 1; r + 1 < numRowsSource; ++r) {
113 |     for (size_t c = 1; c + 1 < numColsSource; ++c) {
114 |       const size_t idx = r * numColsSource + c;
115 |       if (mask[idx]) {
116 |         if (mask[idx - numColsSource] && mask[idx + numColsSource] &&
117 |             mask[idx - 1] && mask[idx + 1]) {
118 |           strictInteriorPixels[idx] = 1;
119 |           borderPixels[idx] = 0;
120 |           interiorPixelList.push_back(make_uint2(r, c));
121 |         }
122 |         else {
123 |           strictInteriorPixels[idx] = 0;
124 |           borderPixels[idx] = 1;
125 |         }
126 |       }
127 |       else {
128 |         strictInteriorPixels[idx] = 0;
129 |         borderPixels[idx] = 0;
130 |       }
131 |     }
132 |   }
133 |
134 |   //split the source and destination images into their respective
135 |   //channels
136 |   unsigned char* red_src = new unsigned char[srcSize];
137 |   unsigned char* blue_src = new unsigned char[srcSize];
138 |   unsigned char* green_src = new unsigned char[srcSize];
139 |   //note: the "blue" buffers hold the .y component and the "green" buffers hold the .z component
140 |   for (size_t i = 0; i < srcSize; ++i) {
141 |     red_src[i] = h_sourceImg[i].x;
142 |     blue_src[i] = h_sourceImg[i].y;
143 |     green_src[i] = h_sourceImg[i].z;
144 |   }
145 |
146 |   unsigned char* red_dst = new unsigned char[srcSize];
147 |   unsigned char* blue_dst = new unsigned char[srcSize];
148 |   unsigned char* green_dst = new unsigned char[srcSize];
149 |
150 |   for (size_t i = 0; i < srcSize; ++i) {
151 |     red_dst[i] = h_destImg[i].x;
152 |     blue_dst[i] = h_destImg[i].y;
153 |     green_dst[i] = h_destImg[i].z;
154 |   }
155 |
156 |   //next we'll precompute the g term - it never changes, no need to recompute every iteration
157 |   float *g_red = new float[srcSize];
158 |   float *g_blue = new float[srcSize];
159 |   float *g_green = new float[srcSize];
160 |
161 |   memset(g_red, 0, srcSize * sizeof(float));
162 |   memset(g_blue, 0, srcSize * sizeof(float));
163 |   memset(g_green, 0, srcSize * sizeof(float));
164 |
165 |   computeG(red_src, g_red, numColsSource, interiorPixelList);
166 |   computeG(blue_src, g_blue, numColsSource, interiorPixelList);
167 |   computeG(green_src, g_green, numColsSource, interiorPixelList);
168 |
169 |   //for each color channel we'll need two buffers and we'll ping-pong between them
170 |   float *blendedValsRed_1 = new float[srcSize];
171 |   float *blendedValsRed_2 = new float[srcSize];
172 |
173 |   float *blendedValsBlue_1 = new float[srcSize];
174 |   float *blendedValsBlue_2 = new float[srcSize];
175 |
176 |   float *blendedValsGreen_1 = new float[srcSize];
177 |   float *blendedValsGreen_2 = new float[srcSize];
178 |
179 |   //IC is the source image, copy over
180 |   for (size_t i = 0; i < srcSize; ++i) {
181 |     blendedValsRed_1[i] = red_src[i];
182 |     blendedValsRed_2[i] = red_src[i];
183 |     blendedValsBlue_1[i] = blue_src[i];
184 |     blendedValsBlue_2[i] = blue_src[i];
185 |     blendedValsGreen_1[i] = green_src[i];
186 |     blendedValsGreen_2[i] = green_src[i];
187 |   }
188 |
189 |   //Perform the solve on each color channel
190 |   const size_t numIterations = 800;
191 |   for (size_t i = 0; i < numIterations; ++i) {
192 |     computeIteration(red_dst, strictInteriorPixels, borderPixels,
193 |                      interiorPixelList, numColsSource, blendedValsRed_1, g_red,
194 |                      blendedValsRed_2);
195 |
196 |     std::swap(blendedValsRed_1, blendedValsRed_2);
197 |   }
198 |
199 |   for (size_t i = 0; i < numIterations; ++i) {
200 |     computeIteration(blue_dst, strictInteriorPixels, borderPixels,
201 |                      interiorPixelList, numColsSource, blendedValsBlue_1, g_blue,
202 |                      blendedValsBlue_2);
203 |
204 |     std::swap(blendedValsBlue_1, blendedValsBlue_2);
205 |   }
206 |
207 |   for (size_t i = 0; i < numIterations; ++i) {
208 |     computeIteration(green_dst, strictInteriorPixels, borderPixels,
209 |                      interiorPixelList, numColsSource, blendedValsGreen_1, g_green,
210 |                      blendedValsGreen_2);
211 |
212 |     std::swap(blendedValsGreen_1, blendedValsGreen_2);
213 |   }
214 |   std::swap(blendedValsRed_1, blendedValsRed_2); //put output into _2
215 |   std::swap(blendedValsBlue_1, blendedValsBlue_2); //put output into _2
216 |   std::swap(blendedValsGreen_1, blendedValsGreen_2); //put output into _2
217 |
218 |   //copy the destination image to the output
219 |   memcpy(h_blendedImg, h_destImg, sizeof(uchar4) * srcSize);
220 |
221 |   //copy computed values for the interior into the output
222 |   for (size_t i = 0; i < interiorPixelList.size(); ++i) {
223 |     const uint2 coord = interiorPixelList[i];
224 |     //offset is a size_t: an unsigned int would truncate the size_t product on very large images
225 |     const size_t offset = coord.x * numColsSource + coord.y;
226 |
227 |     h_blendedImg[offset].x = (unsigned char)blendedValsRed_2[offset];
228 |     h_blendedImg[offset].y = (unsigned char)blendedValsBlue_2[offset];
229 |     h_blendedImg[offset].z = (unsigned char)blendedValsGreen_2[offset];
230 |   }
231 |
232 |   //wow, we allocated a lot of memory!
233 |   delete[] mask;
234 |   delete[] blendedValsRed_1;
235 |   delete[] blendedValsRed_2;
236 |   delete[] blendedValsBlue_1;
237 |   delete[] blendedValsBlue_2;
238 |   delete[] blendedValsGreen_1;
239 |   delete[] blendedValsGreen_2;
240 |   delete[] g_red;
241 |   delete[] g_blue;
242 |   delete[] g_green;
243 |   delete[] red_src;
244 |   delete[] red_dst;
245 |   delete[] blue_src;
246 |   delete[] blue_dst;
247 |   delete[] green_src;
248 |   delete[] green_dst;
249 |   delete[] borderPixels;
250 |   delete[] strictInteriorPixels;
251 | }
252 |
253 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 | // CPU reference blend of h_sourceImg into h_destImg; the result is written to h_blendedImg (each image holds numRowsSource * numColsSource pixels).
4 | void reference_calc(const uchar4* const h_sourceImg,
5 | const size_t numRowsSource, const size_t numColsSource,
6 | const uchar4* const h_destImg,
7 | uchar4* const h_blendedImg);
8 |
9 | #endif
10 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/source.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 6/source.png
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/student_func.cu:
--------------------------------------------------------------------------------
1 | //Udacity HW 6
2 | //Poisson Blending
3 |
4 | /* Background
5 | ==========
6 |
7 | The goal for this assignment is to take one image (the source) and
8 | paste it into another image (the destination) attempting to match the
9 | two images so that the pasting is non-obvious. This is
10 | known as a "seamless clone".
11 |
12 | The basic ideas are as follows:
13 |
14 | 1) Figure out the interior and border of the source image
15 | 2) Use the values of the border pixels in the destination image
16 | as boundary conditions for solving a Poisson equation that tells
17 | us how to blend the images.
18 |
19 | No pixels from the destination except pixels on the border
20 | are used to compute the match.
21 |
22 | Solving the Poisson Equation
23 | ============================
24 |
25 | There are multiple ways to solve this equation - we choose an iterative
26 | method - specifically the Jacobi method. Iterative methods start with
27 | a guess of the solution and then iterate to try and improve the guess
28 | until it stops changing. If the problem was well-suited for the method
29 | then it will stop and where it stops will be the solution.
30 |
31 | The Jacobi method is the simplest iterative method and converges slowly -
32 | that is we need a lot of iterations to get to the answer, but it is the
33 | easiest method to write.
34 |
35 | Jacobi Iterations
36 | =================
37 |
38 | Our initial guess is going to be the source image itself. This is a pretty
39 | good guess for what the blended image will look like and it means that
40 | we won't have to do as many iterations compared to if we had started far
41 | from the final solution.
42 |
43 | ImageGuess_prev (Floating point)
44 | ImageGuess_next (Floating point)
45 |
46 | DestinationImg
47 | SourceImg
48 |
49 | Follow these steps to implement one iteration:
50 |
51 | 1) For every pixel p in the interior, compute two sums over the four neighboring pixels:
52 | Sum1: If the neighbor is in the interior then += ImageGuess_prev[neighbor]
53 | else if the neighbor is on the border then += DestinationImg[neighbor]
54 |
55 | Sum2: += SourceImg[p] - SourceImg[neighbor] (for all four neighbors)
56 |
57 | 2) Calculate the new pixel value:
58 | float newVal= (Sum1 + Sum2) / 4.f <------ Notice that the result is FLOATING POINT
59 | ImageGuess_next[p] = min(255, max(0, newVal)); //clamp to [0, 255]
60 |
61 |
62 | In this assignment we will do 800 iterations.
63 | */
64 |
65 |
66 |
67 | #include "utils.h"
68 | #include
69 |
70 | void your_blend(const uchar4* const h_sourceImg, //IN
71 | const size_t numRowsSource, const size_t numColsSource,
72 | const uchar4* const h_destImg, //IN
73 | uchar4* const h_blendedImg) //OUT
74 | {
75 |
76 | /* To Recap here are the steps you need to implement
77 |
78 | 1) Compute a mask of the pixels from the source image to be copied
79 | The pixels that shouldn't be copied are completely white, they
80 | have R=255, G=255, B=255. Any other pixels SHOULD be copied.
81 |
82 | 2) Compute the interior and border regions of the mask. An interior
83 | pixel has all 4 neighbors also inside the mask. A border pixel is
84 | in the mask itself, but has at least one neighbor that isn't.
85 |
86 | 3) Separate out the incoming image into three separate channels
87 |
88 | 4) Create two float(!) buffers for each color channel that will
89 | act as our guesses. Initialize them to the respective color
90 | channel of the source image since that will act as our initial guess.
91 |
92 | 5) For each color channel perform the Jacobi iteration described
93 | above 800 times.
94 |
95 | 6) Create the output image by replacing all the interior pixels
96 | in the destination image with the result of the Jacobi iterations.
97 | Just cast the floating point values to unsigned chars since we have
98 | already made sure to clamp them to the correct range.
99 |
100 | Since this is the final assignment we provide little boilerplate code to
101 | help you. Notice that all the input/output pointers are HOST pointers.
102 |
103 | You will have to allocate all of your own GPU memory and perform your own
104 | memcopies to get data in and out of the GPU memory.
105 |
106 | Remember to wrap all of your calls with checkCudaErrors() to catch
107 | anything that might go wrong. After each kernel call do:
108 |
109 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
110 |
111 | to catch any errors that happened while executing the kernel.
112 | */
113 | }
114 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/timer.h:
--------------------------------------------------------------------------------
1 | #ifndef GPU_TIMER_H__
2 | #define GPU_TIMER_H__
3 |
4 | #include
5 | // Times GPU work with a pair of CUDA events recorded on stream 0: call Start(), enqueue the work, call Stop(), then read Elapsed().
6 | struct GpuTimer
7 | {
8 |   cudaEvent_t start;
9 |   cudaEvent_t stop;
10 |   // Creates both events. Return codes of the CUDA calls in this struct are not checked.
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 |   // Destroys both events. NOTE(review): the struct is copyable, and a copy would destroy the same events a second time.
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 |   // Records the start event on stream 0.
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 |   // Records the stop event on stream 0.
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 |   // Waits for the stop event, then returns the start-to-stop time in milliseconds; stays 0.0f if cudaEventElapsedTime writes no value.
33 |   float Elapsed()
34 |   {
35 |     float elapsed = 0.0f;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 |
42 | #endif /* GPU_TIMER_H__ */
43 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H__
2 | #define UTILS_H__
3 |
4 | #include
5 | #include
6 | #include
7 | #include