├── .gitignore
├── CMakeLists.txt
├── Final
├── batcher
│ ├── batcher.cu
│ ├── compare.h
│ └── gputimer.h
├── smooth
│ ├── compare.h
│ ├── gputimer.h
│ └── smooth.cu
└── warpreduce
│ ├── part_a
│ ├── compare.h
│ ├── gputimer.h
│ └── warpreduce.cu
│ └── part_b
│ ├── compare.h
│ ├── gputimer.h
│ └── warpreduce.cu
├── Lesson Code Snippets
├── Lesson 2 Code Snippets
│ ├── CMakeLists.txt
│ ├── associative.cu
│ ├── atomics.cu
│ ├── gputimer.h
│ ├── hello.cu
│ ├── hello_blockIdx.cu
│ ├── hello_threadIdx.cu
│ └── memory.cu
├── Lesson 3 Code Snippets
│ ├── CMakeLists.txt
│ ├── histo.cu
│ ├── reduce.cu
│ ├── reduce_minmax.cu
│ └── reduce_minmax_2.cu
├── Lesson 5 Code Snippets
│ ├── CMakeLists.txt
│ ├── deviceQuery_simplified.cpp
│ ├── gputimer.h
│ └── transpose.cu
└── Lesson 7 Code Snippets
│ ├── CMakeLists.txt
│ ├── cub
│ └── example_block_scan_cum.cu
│ ├── opencv
│ ├── gettime.cc
│ ├── gettime.h
│ └── opencv.cu
│ ├── thrust
│ ├── gettime.cc
│ ├── gettime.h
│ ├── gputimer.h
│ └── thrust_example.cu
│ └── tiling
│ ├── a.exp
│ ├── gputimer.h
│ ├── tiling.cu
│ └── utils.h
├── Problem Sets
├── Problem Set 1
│ ├── CMakeLists.txt
│ ├── HW1.cpp
│ ├── Makefile
│ ├── cinque_terre.gold
│ ├── cinque_terre_small.jpg
│ ├── compare.cpp
│ ├── compare.h
│ ├── main.cpp
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── student_func.cu
│ ├── timer.h
│ └── utils.h
├── Problem Set 2
│ ├── CMakeLists.txt
│ ├── HW2.cpp
│ ├── Makefile
│ ├── cinque_terre.gold
│ ├── cinque_terre_small.jpg
│ ├── compare.cpp
│ ├── compare.h
│ ├── main.cpp
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── student_func.cu
│ ├── timer.h
│ └── utils.h
├── Problem Set 3
│ ├── CMakeLists.txt
│ ├── HW3.cu
│ ├── Makefile
│ ├── compare.cpp
│ ├── compare.h
│ ├── loadSaveImage.cpp
│ ├── loadSaveImage.h
│ ├── main.cpp
│ ├── memorial.exr
│ ├── memorial_large.exr
│ ├── memorial_png.gold
│ ├── memorial_png_large.gold
│ ├── memorial_raw.png
│ ├── memorial_raw_large.png
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── student_func.cu
│ ├── timer.h
│ └── utils.h
├── Problem Set 4
│ ├── CMakeLists.txt
│ ├── HW4.cu
│ ├── Makefile
│ ├── compare.cpp
│ ├── compare.h
│ ├── loadSaveImage.cpp
│ ├── loadSaveImage.h
│ ├── main.cpp
│ ├── red_eye_effect.gold
│ ├── red_eye_effect_5.jpg
│ ├── red_eye_effect_template_5.jpg
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── student_func.cu
│ ├── timer.h
│ └── utils.h
├── Problem Set 5
│ ├── CMakeLists.txt
│ ├── Makefile
│ ├── main.cu
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── student.cu
│ ├── timer.h
│ └── utils.h
└── Problem Set 6
│ ├── CMakeLists.txt
│ ├── HW6.cu
│ ├── Makefile
│ ├── blended.gold
│ ├── compare.cpp
│ ├── compare.h
│ ├── destination.png
│ ├── loadSaveImage.cpp
│ ├── loadSaveImage.h
│ ├── main.cpp
│ ├── reference_calc.cpp
│ ├── reference_calc.h
│ ├── source.png
│ ├── student_func.cu
│ ├── timer.h
│ └── utils.h
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Object files
2 | *.o
3 |
4 | # Libraries
5 | *.lib
6 | *.a
7 |
8 | # Shared objects (inc. Windows DLLs)
9 | *.dll
10 | *.so
11 | *.so.*
12 | *.dylib
13 |
14 | # Executables
15 | *.exe
16 | *.out
17 | *.app
18 |
19 | # OS X stuff
20 | .DS_Store
21 |
22 | build*
23 | bin
24 |
25 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # Top-level build script: locates OpenCV and CUDA, sets the flags shared by
9 | # every target, and adds each problem set / lesson directory to the build.
10 | #
11 | # 3.5 is the oldest minimum version that CMake 4.x still accepts. The policy
12 | # ceiling stays at 3.26 because the FindCUDA module used below is removed by
13 | # policy CMP0146, which was introduced in CMake 3.27.
14 | cmake_minimum_required(VERSION 3.5...3.26)
15 | project(cs344)
16 |
17 | find_package(OpenCV REQUIRED)
18 | find_package(CUDA REQUIRED)
19 |
20 | # Directory-scoped on purpose: the executables are created by the
21 | # subdirectories added below and inherit these settings from this scope.
22 | link_libraries(${OpenCV_LIBS})
23 | include_directories(${OpenCV_INCLUDE_DIRS})
24 |
25 | # Every executable is written to <source root>/bin (ignored by .gitignore).
26 | set(EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/bin/")
27 |
28 | if(CUDA_FOUND)
29 |   # compared to class settings, we let NVidia's FindCUDA CMake detect
30 |   # whether to build x64. We tell it to support most devices, though,
31 |   # to make sure more people can easily run class code without knowing
32 |   # about this compiler argument
33 |
34 |   # Commented out these lines, otherwise there will be some tricky errors
35 |   # set(CUDA_NVCC_FLAGS "
36 |   # -ccbin /usr/bin/clang;
37 |   # -gencode;arch=compute_30,code=sm_30;
38 |   # -gencode;arch=compute_35,code=sm_35;
39 |   # -gencode;arch=compute_35,code=compute_35;
40 |   # -gencode;arch=compute_20,code=sm_20;
41 |   # -gencode;arch=compute_11,code=sm_11;
42 |   # -gencode;arch=compute_12,code=sm_12;
43 |   # -gencode;arch=compute_13,code=sm_13;")
44 |
45 |   if(UNIX)
46 |     # add -Wextra compiler flag for gcc compilations.
47 |     # CUDA_NVCC_FLAGS is a list: one element per command-line argument.
48 |     list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wextra)
49 |
50 |     # -stdlib= is a Clang option, so only pass it to Clang/AppleClang, and
51 |     # append it so flags supplied by the user are not discarded.
52 |     if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
53 |       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++")
54 |     endif()
55 |   endif()
56 |
57 |   # add debugging to CUDA NVCC flags. For NVidia's NSight tools.
58 |   list(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
59 |
60 |   add_subdirectory("Problem Sets/Problem Set 1")
61 |   add_subdirectory("Problem Sets/Problem Set 2")
62 |   add_subdirectory("Problem Sets/Problem Set 3")
63 |   add_subdirectory("Problem Sets/Problem Set 4")
64 |   add_subdirectory("Problem Sets/Problem Set 5")
65 |   add_subdirectory("Problem Sets/Problem Set 6")
66 |
67 |   add_subdirectory("Lesson Code Snippets/Lesson 7 Code Snippets")
68 |   add_subdirectory("Lesson Code Snippets/Lesson 5 Code Snippets")
69 |   add_subdirectory("Lesson Code Snippets/Lesson 3 Code Snippets")
70 |   add_subdirectory("Lesson Code Snippets/Lesson 2 Code Snippets")
71 | else()
72 |   message("CUDA is not installed on this system.")
73 | endif()
74 |
--------------------------------------------------------------------------------
/Final/batcher/batcher.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "compare.h"
5 | #include "gputimer.h"
6 |
7 | // Exercise stub for a 64-element bitonic sort; see http://en.wikipedia.org/wiki/Bitonic_sort
8 | __global__ void batcherBitonicMergesort64(float * d_out, const float * d_in) // as written it only copies d_in to d_out through shared memory
9 | {
10 | // you are guaranteed this is called with <<<1, 64, 64*4>>>
11 | extern __shared__ float sdata[]; // dynamically sized shared buffer; its size is the third launch parameter
12 | int tid = threadIdx.x;
13 | sdata[tid] = d_in[tid]; // each thread stages one input element
14 | __syncthreads(); // wait until every thread has written its element to sdata
15 |
16 | for (int stage = 0; stage <= 5; stage++) // six stages (0..5) for the 64 elements
17 | {
18 | for (int substage = stage; substage >= 0; substage--) // substages count down from the current stage to 0
19 | {
20 | // TODO: the work for this (stage, substage) pair is left to the student
21 | }
22 | }
23 |
24 | d_out[tid] = sdata[tid]; // each thread writes one element of the result
25 | }
26 |
27 | int compareFloat (const void * a, const void * b) // qsort comparator: returns -1, 0 or 1 so floats sort in ascending order
28 | {
29 | if ( *(float*)a < *(float*)b ) return -1;
30 | if ( *(float*)a == *(float*)b ) return 0;
31 | if ( *(float*)a > *(float*)b ) return 1;
32 | return 0; // should never reach this (only possible when none of <, ==, > holds, i.e. a NaN operand)
33 | }
34 |
35 | int main(int argc, char **argv) // host driver: sorts random data on the CPU and on the GPU, then compares the two results
36 | {
37 | const int ARRAY_SIZE = 64;
38 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
39 |
40 | // generate the input array on the host
41 | float h_in[ARRAY_SIZE];
42 | float h_sorted[ARRAY_SIZE];
43 | float h_out[ARRAY_SIZE];
44 | for(int i = 0; i < ARRAY_SIZE; i++) {
45 | // generate random float in [0, 1]
46 | h_in[i] = (float)random()/(float)RAND_MAX;
47 | h_sorted[i] = h_in[i];
48 | }
49 | qsort(h_sorted, ARRAY_SIZE, sizeof(float), compareFloat); // reference result, sorted on the host
50 |
51 | // declare GPU memory pointers
52 | float * d_in, * d_out;
53 |
54 | // allocate GPU memory
55 | cudaMalloc((void **) &d_in, ARRAY_BYTES);
56 | cudaMalloc((void **) &d_out, ARRAY_BYTES);
57 |
58 | // transfer the input array to the GPU
59 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
60 |
61 | // launch the kernel (one block of ARRAY_SIZE threads, one float of shared memory per thread)
62 | GpuTimer timer;
63 | timer.Start();
64 | batcherBitonicMergesort64<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(float)>>>(d_out, d_in);
65 | timer.Stop();
66 |
67 | printf("Your code executed in %g ms\n", timer.Elapsed());
68 |
69 | // copy back the sorted array from GPU
70 | cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
71 |
72 | compare(h_out, h_sorted, ARRAY_SIZE); // prints the verdict; the returned failure flag is not used here
73 |
74 | // free GPU memory allocation
75 | cudaFree(d_in);
76 | cudaFree(d_out);
77 |
78 | return 0;
79 | }
80 |
--------------------------------------------------------------------------------
/Final/batcher/compare.h:
--------------------------------------------------------------------------------
1 | // Checks the GPU result h_out against the host-sorted reference h_sorted.
2 | // Prints one line for every index whose values differ, or a success message
3 | // when all ARRAY_SIZE elements match.
4 | // Returns 0 when the arrays are identical and 1 otherwise.
5 | // This header has no includes of its own; printf must already be declared.
6 | int compare(float *h_out, float *h_sorted, int ARRAY_SIZE)
7 | {
8 |     int failure = 0;
9 |     for(int i = 0; i < ARRAY_SIZE; i++) {
10 |         if (h_out[i] != h_sorted[i]) {
11 |             printf("Oops! Index %i is %f, should be %f\n",
12 |                    i, h_out[i], h_sorted[i]);
13 |             failure = 1;
14 |         }
15 |     }
16 |
17 |     if (failure == 0){
18 |         // terminate the line so the shell prompt does not end up glued to it
19 |         printf("Success! Your bitonic sort worked.\n");
20 |     }
21 |
22 |     return failure;
23 | }
--------------------------------------------------------------------------------
/Final/batcher/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer // thin wrapper around two CUDA events used to time GPU work
5 | {
6 | cudaEvent_t start; // event recorded by Start()
7 | cudaEvent_t stop; // event recorded by Stop()
8 |
9 | GpuTimer() // creates both events
10 | {
11 | cudaEventCreate(&start);
12 | cudaEventCreate(&stop);
13 | }
14 |
15 | ~GpuTimer() // destroys both events
16 | {
17 | cudaEventDestroy(start);
18 | cudaEventDestroy(stop);
19 | }
20 |
21 | void Start() // records the start event on stream 0
22 | {
23 | cudaEventRecord(start, 0);
24 | }
25 |
26 | void Stop() // records the stop event on stream 0
27 | {
28 | cudaEventRecord(stop, 0);
29 | }
30 |
31 | float Elapsed() // waits for the stop event, then returns the start-to-stop time (callers print it as milliseconds)
32 | {
33 | float elapsed;
34 | cudaEventSynchronize(stop); // block the host until the stop event has completed
35 | cudaEventElapsedTime(&elapsed, start, stop);
36 | return elapsed;
37 | }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Final/smooth/compare.h:
--------------------------------------------------------------------------------
1 | int compare(float* h_in, float* h_out, float* h_out_shared, float* h_cmp, int ARRAY_SIZE){ // checks both GPU outputs against the reference h_cmp; returns 0 if everything matches, 1 otherwise
2 | int failure = 0;
3 | for(int i = 0; i < ARRAY_SIZE; i++) {
4 | if (h_out[i] != h_cmp[i]) { // output of the reference kernel
5 | fprintf(stderr, "ERROR: h_in[%d] is %f, h_out[%d] is %f, h_cmp[%d] is %f\n",
6 | i, h_in[i], i, h_out[i], i, h_cmp[i]);
7 | failure = 1;
8 | }
9 | if (h_out_shared[i] != h_cmp[i]) { // output of the shared-memory kernel
10 | fprintf(stderr, "ERROR: h_in[%d] is %f, h_out_shared[%d] is %f, h_cmp[%d] is %f\n",
11 | i, h_in[i], i, h_out_shared[i], i, h_cmp[i]);
12 | failure = 1;
13 | }
14 | }
15 |
16 | if (failure == 0) // success is only reported when no element of either output differed
17 | {
18 | printf("Success! Your smooth code worked!\n");
19 | }
20 |
21 | return failure;
22 | }
--------------------------------------------------------------------------------
/Final/smooth/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer // thin wrapper around two CUDA events used to time GPU work
5 | {
6 | cudaEvent_t start; // event recorded by Start()
7 | cudaEvent_t stop; // event recorded by Stop()
8 |
9 | GpuTimer() // creates both events
10 | {
11 | cudaEventCreate(&start);
12 | cudaEventCreate(&stop);
13 | }
14 |
15 | ~GpuTimer() // destroys both events
16 | {
17 | cudaEventDestroy(start);
18 | cudaEventDestroy(stop);
19 | }
20 |
21 | void Start() // records the start event on stream 0
22 | {
23 | cudaEventRecord(start, 0);
24 | }
25 |
26 | void Stop() // records the stop event on stream 0
27 | {
28 | cudaEventRecord(stop, 0);
29 | }
30 |
31 | float Elapsed() // waits for the stop event, then returns the start-to-stop time (callers print it as milliseconds)
32 | {
33 | float elapsed;
34 | cudaEventSynchronize(stop); // block the host until the stop event has completed
35 | cudaEventElapsedTime(&elapsed, start, stop);
36 | return elapsed;
37 | }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Final/smooth/smooth.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "compare.h"
5 | #include "gputimer.h"
6 |
7 | // Reference: each thread writes one smoothed element, 0.25*left + 0.5*self + 0.25*right, reading v directly
8 | __global__ void smooth(float * v_new, const float * v) {
9 | int myIdx = threadIdx.x * gridDim.x + blockIdx.x; // NOTE(review): threads of one block are gridDim.x elements apart; every index in [0, numThreads) is still produced exactly once
10 | int numThreads = blockDim.x * gridDim.x;
11 | int myLeftIdx = (myIdx == 0) ? 0 : myIdx - 1; // the first element uses itself as its left neighbour
12 | int myRightIdx = (myIdx == (numThreads - 1)) ? numThreads - 1 : myIdx + 1; // the last element uses itself as its right neighbour
13 | float myElt = v[myIdx];
14 | float myLeftElt = v[myLeftIdx];
15 | float myRightElt = v[myRightIdx];
16 | v_new[myIdx] = 0.25f * myLeftElt + 0.5f * myElt + 0.25f * myRightElt;
17 | }
18 |
19 | // Your code: exercise stub for a version of smooth() that uses the shared buffer s[]
20 | __global__ void smooth_shared(float * v_new, const float * v) {
21 |     extern __shared__ float s[]; // dynamically sized; its size is the third launch parameter
22 |     // TODO: Fill in the rest of this function
23 |     // (a __global__ kernel returns void, so results must be written to v_new)
24 | }
25 |
26 | int main(int argc, char **argv)
27 | {
28 |     // Host driver: builds random input, computes the expected result on the CPU, runs both kernels and compares.
29 |     const int ARRAY_SIZE = 4096;
30 |     const int BLOCK_SIZE = 256;
31 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
32 |
33 |     // generate the input array on the host
34 |     float h_in[ARRAY_SIZE];
35 |     float h_cmp[ARRAY_SIZE];
36 |     float h_out[ARRAY_SIZE];
37 |     float h_out_shared[ARRAY_SIZE];
38 |     for(int i = 0; i < ARRAY_SIZE; i++) {
39 |         // generate random float in [0, 1]
40 |         h_in[i] = (float)random()/(float)RAND_MAX;
41 |     }
42 |     for(int i = 0; i < ARRAY_SIZE; i++) {
43 |         h_cmp[i] = (0.25f * h_in[(i == 0) ? 0 : i-1] +
44 |                     0.50f * h_in[i] +
45 |                     0.25f * h_in[(i == (ARRAY_SIZE - 1)) ? ARRAY_SIZE - 1 : i+1]);
46 |     }
47 |
48 |     // declare GPU memory pointers
49 |     float * d_in, * d_out, * d_out_shared;
50 |
51 |     // allocate GPU memory
52 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
53 |     cudaMalloc((void **) &d_out, ARRAY_BYTES);
54 |     cudaMalloc((void **) &d_out_shared, ARRAY_BYTES);
55 |
56 |     // transfer the input array to the GPU
57 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
58 |
59 |     // cudaEvent_t start, stop;
60 |     // cudaEventCreate(&start);
61 |     // cudaEventCreate(&stop);
62 |     // launch the kernels: one thread per element, BLOCK_SIZE threads per block
63 |     smooth<<<ARRAY_SIZE / BLOCK_SIZE, BLOCK_SIZE>>>(d_out, d_in);
64 |     GpuTimer timer;
65 |     timer.Start();
66 |     smooth_shared<<<ARRAY_SIZE / BLOCK_SIZE, BLOCK_SIZE, (BLOCK_SIZE + 2) * sizeof(float)>>>(d_out_shared, d_in); // NOTE(review): shared size assumes one float per thread plus a neighbour slot on each side - confirm against the kernel
67 |     timer.Stop();
68 |
69 |     printf("Your code executed in %g ms\n", timer.Elapsed());
70 |     // cudaEventSynchronize(stop);
71 |     // float elapsedTime;
72 |     // cudaEventElapsedTime(&elapsedTime, start, stop);
73 |
74 |     // copy back the result from GPU
75 |     cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
76 |     cudaMemcpy(h_out_shared, d_out_shared, ARRAY_BYTES, cudaMemcpyDeviceToHost);
77 |
78 |     // testing for correctness
79 |     compare(h_in, h_out, h_out_shared, h_cmp, ARRAY_SIZE);
80 |
81 |     // free GPU memory allocation
82 |     cudaFree(d_in);
83 |     cudaFree(d_out);
84 |     cudaFree(d_out_shared);
85 |
86 |     return 0;
87 | }
88 |
--------------------------------------------------------------------------------
/Final/warpreduce/part_a/compare.h:
--------------------------------------------------------------------------------
1 | // Compares the GPU result h_out_shared with the host-computed sum.
2 | // Prints a diagnostic to stderr on mismatch and a verdict to stdout.
3 | // Returns 0 when the two values agree and 1 otherwise.
4 | // This header has no includes of its own; printf/fprintf must already be declared.
5 | int compare(unsigned int h_out_shared, int sum){
6 |     int failure = 0;
7 |     // compare as unsigned on purpose instead of relying on the implicit conversion
8 |     if (h_out_shared != (unsigned int) sum) {
9 |         // %u matches the unsigned GPU value; the expected sum is a plain int
10 |         fprintf(stderr, "GPU shared sum %u does not match expected sum %d\n",
11 |                 h_out_shared, sum);
12 |         failure = 1;
13 |     }
14 |
15 |     if (failure == 0)
16 |     {
17 |         printf("Success! Your shared warp reduce worked.\n");
18 |     }
19 |     else{
20 |         printf("Error! Your shared reduce code's output did not match sum.\n");
21 |     }
22 |
23 |     return failure;
24 | }
--------------------------------------------------------------------------------
/Final/warpreduce/part_a/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer // thin wrapper around two CUDA events used to time GPU work
5 | {
6 | cudaEvent_t start; // event recorded by Start()
7 | cudaEvent_t stop; // event recorded by Stop()
8 |
9 | GpuTimer() // creates both events
10 | {
11 | cudaEventCreate(&start);
12 | cudaEventCreate(&stop);
13 | }
14 |
15 | ~GpuTimer() // destroys both events
16 | {
17 | cudaEventDestroy(start);
18 | cudaEventDestroy(stop);
19 | }
20 |
21 | void Start() // records the start event on stream 0
22 | {
23 | cudaEventRecord(start, 0);
24 | }
25 |
26 | void Stop() // records the stop event on stream 0
27 | {
28 | cudaEventRecord(stop, 0);
29 | }
30 |
31 | float Elapsed() // waits for the stop event, then returns the start-to-stop time (callers print it as milliseconds)
32 | {
33 | float elapsed;
34 | cudaEventSynchronize(stop); // block the host until the stop event has completed
35 | cudaEventElapsedTime(&elapsed, start, stop);
36 | return elapsed;
37 | }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Final/warpreduce/part_a/warpreduce.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "compare.h"
5 | #include "gputimer.h"
6 |
7 | // Subpart A:
8 | // Write step 1 as a kernel that operates on threads 0--31.
9 | // Assume that the input flags are 0 for false and 1 for true and are stored
10 | // in a local per-thread register called p (for predicate).
11 | //
12 | // You have access to 32 words of shared memory s[0:31], with s[0]
13 | // corresponding to thread 0 and s[31] corresponding to thread 31.
14 | // You may change the values of s[0:31]. Put the return sum in s[0].
15 | // Your code should execute no more than 5 warp-wide addition operations.
16 |
17 | __device__ unsigned int shared_reduce(unsigned int p, volatile unsigned int * s) { // exercise stub: as written it returns s[0] without ever writing to s
18 | // Assumes values in 'p' are either 1 or 0
19 | // Assumes s[0:31] are allocated
20 | // Sums p across warp, returning the result. Suggest you put
21 | // result in s[0] and return it
22 | // You may change any value in s
23 | // You should execute no more than 5 + operations (if you're doing
24 | // 31, you're doing it wrong)
25 | //
26 | // TODO: Fill in the rest of this function
27 |
28 | return s[0];
29 | }
30 |
31 | __global__ void reduce(unsigned int * d_out_shared, // receives the reduced value, written by thread 0 only
32 | const unsigned int * d_in) // one input flag per thread
33 | {
34 | extern __shared__ unsigned int s[]; // dynamically sized; its size is the third launch parameter
35 | int t = threadIdx.x;
36 | int p = d_in[t]; // this thread's flag
37 | unsigned int sr = shared_reduce(p, s);
38 | if (t == 0) // a single thread stores the result
39 | {
40 | *d_out_shared = sr;
41 | }
42 | }
43 |
44 | int main(int argc, char **argv) // host driver: builds 32 random 0/1 flags, sums them on CPU and GPU, then compares
45 | {
46 | const int ARRAY_SIZE = 32;
47 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(unsigned int);
48 |
49 | // generate the input array on the host
50 | unsigned int h_in[ARRAY_SIZE];
51 | unsigned int sum = 0;
52 | for(int i = 0; i < ARRAY_SIZE; i++) {
53 | // generate a random float in [0, 1] and turn it into a flag: 1 if it exceeds 0.5, else 0
54 | h_in[i] = (float)random()/(float)RAND_MAX > 0.5f ? 1 : 0;
55 | sum += h_in[i]; // expected result, accumulated on the host
56 | }
57 |
58 | // declare GPU memory pointers
59 | unsigned int * d_in, * d_out_shared;
60 |
61 | // allocate GPU memory
62 | cudaMalloc((void **) &d_in, ARRAY_BYTES);
63 | cudaMalloc((void **) &d_out_shared, sizeof(unsigned int));
64 |
65 | // transfer the input array to the GPU
66 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
67 |
68 | GpuTimer timer;
69 | timer.Start();
70 | // launch the kernel (one block of ARRAY_SIZE threads, one unsigned int of shared memory per thread)
71 | reduce<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(unsigned int)>>>
72 | (d_out_shared, d_in);
73 | timer.Stop();
74 |
75 | printf("Your code executed in %g ms\n", timer.Elapsed());
76 |
77 | unsigned int h_out_shared;
78 | // copy back the sum from GPU
79 | cudaMemcpy(&h_out_shared, d_out_shared, sizeof(unsigned int),
80 | cudaMemcpyDeviceToHost);
81 |
82 | compare(h_out_shared, sum); // prints the verdict; the returned failure flag is not used here
83 |
84 | // free GPU memory allocation
85 | cudaFree(d_in);
86 | cudaFree(d_out_shared);
87 | }
88 |
89 |
--------------------------------------------------------------------------------
/Final/warpreduce/part_b/compare.h:
--------------------------------------------------------------------------------
1 | // Compares the GPU result h_out_warp with the host-computed sum.
2 | // Prints a diagnostic to stderr on mismatch and a verdict to stdout.
3 | // Returns 0 when the two values agree and 1 otherwise.
4 | // This header has no includes of its own; printf/fprintf must already be declared.
5 | int compare(unsigned int h_out_warp, int sum){
6 |     int failure = 0;
7 |     // compare as unsigned on purpose instead of relying on the implicit conversion
8 |     if (h_out_warp != (unsigned int) sum) {
9 |         // %u matches the unsigned GPU value; the expected sum is a plain int
10 |         fprintf(stderr, "GPU warp sum %u does not match expected sum %d\n",
11 |                 h_out_warp, sum);
12 |         failure = 1;
13 |     }
14 |
15 |     if (failure == 0)
16 |     {
17 |         printf("Success! Your warp reduce worked.\n");
18 |     }
19 |     else{
20 |         printf("Error! Your warp reduce code's output did not match sum.\n");
21 |     }
22 |
23 |     return failure;
24 | }
--------------------------------------------------------------------------------
/Final/warpreduce/part_b/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer // thin wrapper around two CUDA events used to time GPU work
5 | {
6 | cudaEvent_t start; // event recorded by Start()
7 | cudaEvent_t stop; // event recorded by Stop()
8 |
9 | GpuTimer() // creates both events
10 | {
11 | cudaEventCreate(&start);
12 | cudaEventCreate(&stop);
13 | }
14 |
15 | ~GpuTimer() // destroys both events
16 | {
17 | cudaEventDestroy(start);
18 | cudaEventDestroy(stop);
19 | }
20 |
21 | void Start() // records the start event on stream 0
22 | {
23 | cudaEventRecord(start, 0);
24 | }
25 |
26 | void Stop() // records the stop event on stream 0
27 | {
28 | cudaEventRecord(stop, 0);
29 | }
30 |
31 | float Elapsed() // waits for the stop event, then returns the start-to-stop time (callers print it as milliseconds)
32 | {
33 | float elapsed;
34 | cudaEventSynchronize(stop); // block the host until the stop event has completed
35 | cudaEventElapsedTime(&elapsed, start, stop);
36 | return elapsed;
37 | }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Final/warpreduce/part_b/warpreduce.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "compare.h"
5 | #include "gputimer.h"
6 |
7 | // Subpart b:
8 | // Compute capability 2.0+ GPUs have support for 3 per-warp instructions.
9 | // Namely, these instructions are:
10 | //
11 | // int __popc(int x) Population Count: Returns the number of bits that are set
12 | // to 1 in the 32-bit integer x.
13 | //
14 | // int __clz(int x) Count Leading Zeros: Returns the number of consecutive zero
15 | // bits beginning at the most significant bit of the 32-bit integer x.
16 | //
17 | // int __ballot(int p) Returns a 32-bit integer in which bit k is set if and only
18 | // if the predicate p provided by the thread in lane k of the warp is non-zero.
19 | // NOTE: CUDA 9 and later replace __ballot(p) with __ballot_sync(mask, p).
20 | __device__ unsigned int warp_reduce(unsigned int p, volatile unsigned int * s) {
21 |     // Assumes values in 'p' are either 1 or 0
22 |     // Should not use 's'
23 |     // Sums p across warp, returning the result.
24 |     // You can do this without using the character '+' in your code at all
25 |     //
26 |     // TODO: Fill in the rest of this function
27 |     return 0; // placeholder so the stub has a defined return value; replace with the warp sum
28 | }
29 |
30 | __global__ void reduce(unsigned int * d_out_warp, // receives the reduced value, written by thread 0 only
31 | const unsigned int * d_in) // one input flag per thread
32 | {
33 | extern __shared__ unsigned int s[]; // dynamically sized; its size is the third launch parameter
34 | int t = threadIdx.x;
35 | int p = d_in[t]; // this thread's flag
36 |
37 | unsigned int wr = warp_reduce(p, s);
38 | if (t == 0) // a single thread stores the result
39 | {
40 | *d_out_warp = wr;
41 | }
42 | }
43 |
44 | int main(int argc, char **argv) // host driver: builds 32 random 0/1 flags, sums them on CPU and GPU, then compares
45 | {
46 | const int ARRAY_SIZE = 32;
47 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(unsigned int);
48 |
49 | // generate the input array on the host
50 | unsigned int h_in[ARRAY_SIZE];
51 | unsigned int sum = 0;
52 | for(int i = 0; i < ARRAY_SIZE; i++) {
53 | // generate a random float in [0, 1] and turn it into a flag: 1 if it exceeds 0.5, else 0
54 | h_in[i] = (float)random()/(float)RAND_MAX > 0.5f ? 1 : 0;
55 | sum += h_in[i]; // expected result, accumulated on the host
56 | }
57 |
58 | // declare GPU memory pointers
59 | unsigned int * d_in, * d_out_warp;
60 |
61 | // allocate GPU memory
62 | cudaMalloc((void **) &d_in, ARRAY_BYTES);
63 | cudaMalloc((void **) &d_out_warp, sizeof(unsigned int));
64 |
65 | // transfer the input array to the GPU
66 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
67 |
68 | GpuTimer timer;
69 | timer.Start();
70 | // launch the kernel (one block of ARRAY_SIZE threads, one unsigned int of shared memory per thread)
71 | reduce<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(unsigned int)>>>
72 | (d_out_warp, d_in);
73 | timer.Stop();
74 |
75 | printf("Your code executed in %g ms\n", timer.Elapsed());
76 |
77 | unsigned int h_out_warp;
78 | // copy back the sum from GPU
79 | cudaMemcpy(&h_out_warp, d_out_warp, sizeof(unsigned int),
80 | cudaMemcpyDeviceToHost);
81 |
82 | // compare your result against the expected reduce sum
83 | compare(h_out_warp, sum);
84 |
85 | // free GPU memory allocation
86 | cudaFree(d_in);
87 | cudaFree(d_out_warp);
88 |
89 | }
90 |
91 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # Lesson 2 snippets: one executable per listed .cu file.
9 |
10 | cuda_add_executable(Lesson2_hello_world hello.cu)
11 |
12 | cuda_add_executable(Lesson2_memory memory.cu)
13 |
14 | cuda_add_executable(Lesson2_atomics atomics.cu)
15 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/associative.cu:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | int main(int argc,char **argv) // prints the same three-term sum grouped two different ways to show that the results differ
4 | {
5 | printf("(%g + %g) + %g == %g\n%g + (%g + %g) == %g\n",
6 | 1.f, 1e99, -1e99, (1.f + 1e99)+ -1e99, // here the 1 is added to 1e99 first and is lost before the large terms cancel
7 | 1.f, 1e99, -1e99, 1.f + (1e99 + -1e99)); // here the large terms cancel first, so the 1 survives
8 | return 0;
9 | }
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/atomics.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include "gputimer.h"
8 |
9 | #define NUM_THREADS 1000000
10 | #define ARRAY_SIZE 100
11 |
12 | #define BLOCK_WIDTH 1000
13 |
14 | void print_array(int *array, int size) // prints the first `size` ints of `array` on one line, wrapped in braces
15 | {
16 | printf("{ ");
17 | for (int i = 0; i < size; i++) { printf("%d ", array[i]); }
18 | printf("}\n");
19 | }
20 |
21 | __global__ void increment_naive(int *g) // each thread adds 1 to one element of g using a plain read-modify-write
22 | {
23 | // which thread is this?
24 | int i = blockIdx.x * blockDim.x + threadIdx.x;
25 |
26 | // each thread to increment consecutive elements, wrapping at ARRAY_SIZE
27 | i = i % ARRAY_SIZE;
28 | g[i] = g[i] + 1; // not atomic: threads that map to the same element can overwrite each other's update
29 | }
30 |
31 | __global__ void increment_atomic(int *g) // same as increment_naive, but the increment is done with atomicAdd
32 | {
33 | // which thread is this?
34 | int i = blockIdx.x * blockDim.x + threadIdx.x;
35 |
36 | // each thread to increment consecutive elements, wrapping at ARRAY_SIZE
37 | i = i % ARRAY_SIZE;
38 | atomicAdd(& g[i], 1); // atomic read-modify-write on g[i]
39 | }
40 |
41 | int main(int argc,char **argv)
42 | {
43 |     GpuTimer timer;
44 |     printf("%d total threads in %d blocks writing into %d array elements\n",
45 |            NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
46 |
47 |     // declare and allocate host memory
48 |     int h_array[ARRAY_SIZE];
49 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
50 |
51 |     // declare, allocate, and zero out GPU memory
52 |     int * d_array;
53 |     cudaMalloc((void **) &d_array, ARRAY_BYTES);
54 |     cudaMemset((void *) d_array, 0, ARRAY_BYTES);
55 |
56 |     // launch the kernel - comment out one of these
57 |     // (NUM_THREADS / BLOCK_WIDTH blocks of BLOCK_WIDTH threads, matching the message printed above)
58 |     timer.Start(); //increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
59 |     increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
60 |     timer.Stop();
61 |
62 |     // copy back the array of sums from GPU and print
63 |     cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
64 |     print_array(h_array, ARRAY_SIZE);
65 |     printf("Time elapsed = %g ms\n", timer.Elapsed());
66 |
67 |     // free GPU memory allocation and exit
68 |     cudaFree(d_array);
69 |     return 0;
70 | }
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer // thin wrapper around two CUDA events used to time GPU work
5 | {
6 | cudaEvent_t start; // event recorded by Start()
7 | cudaEvent_t stop; // event recorded by Stop()
8 |
9 | GpuTimer() // creates both events
10 | {
11 | cudaEventCreate(&start);
12 | cudaEventCreate(&stop);
13 | }
14 |
15 | ~GpuTimer() // destroys both events
16 | {
17 | cudaEventDestroy(start);
18 | cudaEventDestroy(stop);
19 | }
20 |
21 | void Start() // records the start event on stream 0
22 | {
23 | cudaEventRecord(start, 0);
24 | }
25 |
26 | void Stop() // records the stop event on stream 0
27 | {
28 | cudaEventRecord(stop, 0);
29 | }
30 |
31 | float Elapsed() // waits for the stop event, then returns the start-to-stop time (callers print it as milliseconds)
32 | {
33 | float elapsed;
34 | cudaEventSynchronize(stop); // block the host until the stop event has completed
35 | cudaEventElapsedTime(&elapsed, start, stop);
36 | return elapsed;
37 | }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/hello.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 |
8 | #define NUM_BLOCKS 4
9 | #define BLOCK_WIDTH 4
10 |
11 | __global__ void hello() // every thread prints one line identifying itself by thread and block index
12 | {
13 | printf("Hello world! I'm thread %d in block %d\n", threadIdx.x, blockIdx.x);
14 | }
15 |
16 |
17 | int main(int argc,char **argv)
18 | {
19 |     // launch the kernel with NUM_BLOCKS blocks of BLOCK_WIDTH threads each
20 |     hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();
21 |
22 |     // force the printf()s to flush
23 |     cudaDeviceSynchronize();
24 |
25 |     printf("That's all!\n");
26 |
27 |     return 0;
28 | }
29 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/hello_blockIdx.cu:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | #define NUM_BLOCKS 16
4 | #define BLOCK_WIDTH 1
5 |
6 | __global__ void hello() // every thread prints one line containing its block index
7 | {
8 | printf("Hello world! I'm a thread in block %d\n", blockIdx.x);
9 | }
10 |
11 |
12 | int main(int argc,char **argv)
13 | {
14 |     // launch the kernel with NUM_BLOCKS blocks of BLOCK_WIDTH threads each
15 |     hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();
16 |
17 |     // force the printf()s to flush
18 |     cudaDeviceSynchronize();
19 |
20 |     printf("That's all!\n");
21 |
22 |     return 0;
23 | }
24 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/hello_threadIdx.cu:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | #define NUM_BLOCKS 1
4 | #define BLOCK_WIDTH 256
5 |
6 | __global__ void hello() // every thread prints one line containing its thread index
7 | {
8 | printf("Hello world! I'm thread %d\n", threadIdx.x);
9 | }
10 |
11 |
12 | int main(int argc,char **argv)
13 | {
14 |     // launch the kernel with NUM_BLOCKS blocks of BLOCK_WIDTH threads each
15 |     hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();
16 |
17 |     // force the printf()s to flush
18 |     cudaDeviceSynchronize();
19 |
20 |     printf("That's all!\n");
21 |
22 |     return 0;
23 | }
24 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/memory.cu:
--------------------------------------------------------------------------------
1 | // Using different memory spaces in CUDA
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | /**********************
10 | * using local memory *
11 | **********************/
12 |
13 | // a __device__ or __global__ function runs on the GPU
14 | __global__ void use_local_memory_GPU(float in) // demo kernel: copies its argument into a per-thread variable and does nothing else
15 | {
16 | float f; // variable "f" is in local memory and private to each thread
17 | f = in; // parameter "in" is in local memory and private to each thread
18 | // ... real code would presumably do other stuff here ...
19 | }
20 |
21 | /**********************
22 | * using global memory *
23 | **********************/
24 |
25 | // a __global__ function runs on the GPU & can be called from host
26 | __global__ void use_global_memory_GPU(float *array) // demo kernel: each thread writes twice its own thread index into its slot of array
27 | {
28 | // "array" is a pointer into global memory on the device
29 | array[threadIdx.x] = 2.0f * (float) threadIdx.x;
30 | }
31 |
32 | /**********************
33 | * using shared memory *
34 | **********************/
35 |
36 | // (for clarity, hardcoding 128 threads/elements and omitting out-of-bounds checks)
37 | __global__ void use_shared_memory_GPU(float *array)
38 | {
39 |     // local variables, private to each thread
40 |     int i, index = threadIdx.x;
41 |     float average, sum = 0.0f;
42 |
43 |     // __shared__ variables are visible to all threads in the thread block
44 |     // and have the same lifetime as the thread block
45 |     __shared__ float sh_arr[128];
46 |
47 |     // copy data from "array" in global memory to sh_arr in shared memory.
48 |     // here, each thread is responsible for copying a single element.
49 |     sh_arr[index] = array[index];
50 |
51 |     __syncthreads(); // ensure all the writes to shared memory have completed
52 |
53 |     // now, sh_arr is fully populated. Let's find the average of all previous elements
54 |     for (i=0; i<index; i++) { sum += sh_arr[i]; }
55 |     average = sum / (index + 1.0f);
56 |
57 |     // if array[index] is greater than the average of array[0..index-1], replace with average.
58 |     // since array[] is in global memory, this change will be seen by the host (and potentially
59 |     // other thread blocks, if any)
60 |     if (array[index] > average) { array[index] = average; }
61 |
62 |     // the following code has NO EFFECT: it modifies shared memory, but
63 |     // the resulting modified data is never copied back to global memory
64 |     // and vanishes when the thread block completes
65 |     sh_arr[index] = 3.14;
66 | }
67 |
68 | int main(int argc, char **argv)
69 | {
70 |     /*
71 |      * First, call a kernel that shows using local memory
72 |      */
73 |     use_local_memory_GPU<<<1, 128>>>(2.0f);
74 |
75 |     /*
76 |      * Next, call a kernel that shows using global memory
77 |      */
78 |     float h_arr[128]; // convention: h_ variables live on host
79 |     float *d_arr; // convention: d_ variables live on device (GPU global mem)
80 |
81 |     // allocate global memory on the device, place result in "d_arr"
82 |     cudaMalloc((void **) &d_arr, sizeof(float) * 128);
83 |     // now copy data from host memory "h_arr" to device memory "d_arr"
84 |     cudaMemcpy((void *)d_arr, (void *)h_arr, sizeof(float) * 128, cudaMemcpyHostToDevice);
85 |     // launch the kernel (1 block of 128 threads)
86 |     use_global_memory_GPU<<<1, 128>>>(d_arr); // modifies the contents of array at d_arr
87 |     // copy the modified array back to the host, overwriting contents of h_arr
88 |     cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);
89 |     // ... do other stuff ...
90 |
91 |     /*
92 |      * Next, call a kernel that shows using shared memory
93 |      */
94 |
95 |     // as before, pass in a pointer to data in global memory
96 |     use_shared_memory_GPU<<<1, 128>>>(d_arr);
97 |     // copy the modified array back to the host (destination h_arr is host memory, so the direction is device-to-host)
98 |     cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);
99 |     // ... do other stuff ...
100 |     return 0;
101 | }
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # one stand-alone executable per Lesson 3 snippet (cuda_add_executable is presumably provided by a parent CMakeLists.txt - verify)
9 |
10 | cuda_add_executable(Lesson3_histo histo.cu)
11 |
12 | cuda_add_executable(Lesson3_reduce reduce.cu)
13 |
14 | cuda_add_executable(Lesson3_reduce_minmax reduce_minmax.cu)
15 |
16 | cuda_add_executable(Lesson3_reduce_minmax_2 reduce_minmax_2.cu)
17 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/histo.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | int log2(int i) // floor of the base-2 logarithm for i >= 1; yields 0 for i == 0
5 | {
6 |     int exponent = 0;
7 |     for (i >>= 1; i != 0; i >>= 1) ++exponent;
8 |     return exponent;
9 | }
10 |
11 | int bit_reverse(int w, int bits) // mirrors the lowest "bits" bits of w; higher bits are dropped
12 | {
13 |     int mirrored = 0;
14 |     for (int pos = 0; pos < bits; pos++)
15 |     {
16 |         const int mask = 1 << pos;
17 |         mirrored |= ((w & mask) >> pos) << (bits - pos - 1);
18 |     }
19 |     return mirrored;
20 | }
21 |
22 | __global__ void naive_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
23 | {
24 |     const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x; // one input element per thread
25 |     const int item = d_in[globalIdx];
26 |     const int bin = item % BIN_COUNT;
27 |     d_bins[bin]++; // plain read-modify-write: threads hitting the same bin concurrently can lose increments
28 | }
29 |
30 | __global__ void simple_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
31 | {
32 |     const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x; // one input element per thread
33 |     const int item = d_in[globalIdx];
34 |     const int bin = item % BIN_COUNT;
35 |     atomicAdd(d_bins + bin, 1); // atomic increment, so concurrent updates of one bin are all counted
36 | }
37 |
38 |
39 | int main(int argc, char **argv)
40 | {
41 | int deviceCount;
42 | cudaGetDeviceCount(&deviceCount);
43 | if (deviceCount == 0) {
44 | fprintf(stderr, "error: no devices supporting CUDA.\n");
45 | exit(EXIT_FAILURE);
46 | }
47 | int dev = 0;
48 | cudaSetDevice(dev);
49 |
50 | cudaDeviceProp devProps;
51 | if (cudaGetDeviceProperties(&devProps, dev) == 0)
52 | {
53 | printf("Using device %d:\n", dev);
54 | printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
55 | devProps.name, (int)devProps.totalGlobalMem,
56 | (int)devProps.major, (int)devProps.minor,
57 | (int)devProps.clockRate);
58 | }
59 |
60 | const int ARRAY_SIZE = 65536;
61 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
62 | const int BIN_COUNT = 16;
63 | const int BIN_BYTES = BIN_COUNT * sizeof(int);
64 |
65 | // generate the input array on the host
66 | int h_in[ARRAY_SIZE];
67 | for(int i = 0; i < ARRAY_SIZE; i++) {
68 | h_in[i] = bit_reverse(i, log2(ARRAY_SIZE));
69 | }
70 | int h_bins[BIN_COUNT];
71 | for(int i = 0; i < BIN_COUNT; i++) {
72 | h_bins[i] = 0;
73 | }
74 |
75 | // declare GPU memory pointers
76 | int * d_in;
77 | int * d_bins;
78 |
79 | // allocate GPU memory
80 | cudaMalloc((void **) &d_in, ARRAY_BYTES);
81 | cudaMalloc((void **) &d_bins, BIN_BYTES);
82 |
83 | // transfer the arrays to the GPU
84 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
85 | cudaMemcpy(d_bins, h_bins, BIN_BYTES, cudaMemcpyHostToDevice);
86 |
87 | int whichKernel = 0;
88 | if (argc == 2) {
89 | whichKernel = atoi(argv[1]);
90 | }
91 |
92 | // launch the kernel
93 | switch(whichKernel) {
94 | case 0:
95 | printf("Running naive histo\n");
96 | naive_histo<<>>(d_bins, d_in, BIN_COUNT);
97 | break;
98 | case 1:
99 | printf("Running simple histo\n");
100 | simple_histo<<>>(d_bins, d_in, BIN_COUNT);
101 | break;
102 | default:
103 | fprintf(stderr, "error: ran no kernel\n");
104 | exit(EXIT_FAILURE);
105 | }
106 |
107 | // copy back the sum from GPU
108 | cudaMemcpy(h_bins, d_bins, BIN_BYTES, cudaMemcpyDeviceToHost);
109 |
110 | for(int i = 0; i < BIN_COUNT; i++) {
111 | printf("bin %d: count %d\n", i, h_bins[i]);
112 | }
113 |
114 | // free GPU memory allocation
115 | cudaFree(d_in);
116 | cudaFree(d_bins);
117 |
118 | return 0;
119 | }
120 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/reduce.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 |
5 | __global__ void global_reduce_kernel(float * d_out, float * d_in)
6 | {
7 |     const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
8 |     const int localIdx = threadIdx.x;
9 |
10 |     // pairwise tree reduction carried out in place on d_in, so the input is overwritten
11 |     for (unsigned int stride = blockDim.x >> 1; stride != 0; stride >>= 1)
12 |     {
13 |         if (localIdx < stride)
14 |         {
15 |             d_in[globalIdx] = d_in[globalIdx] + d_in[globalIdx + stride];
16 |         }
17 |         __syncthreads(); // finish this pass before the next, narrower one starts
18 |     }
19 |
20 |     // the block's first element now holds the block total; one thread publishes it
21 |     if (localIdx == 0)
22 |     {
23 |         d_out[blockIdx.x] = d_in[globalIdx];
24 |     }
25 | }
26 |
27 | __global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
28 | {
29 |     // dynamically sized scratch buffer; its byte size is the third launch-configuration argument
30 |     extern __shared__ float sdata[];
31 |
32 |     const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
33 |     const int localIdx = threadIdx.x;
34 |
35 |     // stage this block's slice of the input in shared memory
36 |     sdata[localIdx] = d_in[globalIdx];
37 |     __syncthreads(); // the whole slice must be staged before any thread reads another thread's slot
38 |
39 |     // pairwise tree reduction: half as many threads are active on each pass
40 |     for (unsigned int stride = blockDim.x >> 1; stride != 0; stride >>= 1)
41 |     {
42 |         if (localIdx < stride)
43 |         {
44 |             sdata[localIdx] = sdata[localIdx] + sdata[localIdx + stride];
45 |         }
46 |         __syncthreads(); // finish this pass before the next one starts
47 |     }
48 |
49 |     // slot 0 now holds the block total; one thread publishes it
50 |     if (localIdx == 0)
51 |     {
52 |         d_out[blockIdx.x] = sdata[0];
53 |     }
54 | }
55 |
56 | void reduce(float * d_out, float * d_intermediate, float * d_in,
57 |             int size, bool usesSharedMemory)
58 | {
59 |     // two-pass sum: pass 1 leaves one partial sum per block in d_intermediate, pass 2 folds those into d_out[0].
60 |     // assumes size is a multiple of maxThreadsPerBlock and not greater than maxThreadsPerBlock^2
61 |     const int maxThreadsPerBlock = 1024;
62 |     int threads = maxThreadsPerBlock;
63 |     int blocks = size / maxThreadsPerBlock;
64 |     if (usesSharedMemory)
65 |     {
66 |         shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
67 |             (d_intermediate, d_in);
68 |     }
69 |     else
70 |     {
71 |         global_reduce_kernel<<<blocks, threads>>>
72 |             (d_intermediate, d_in);
73 |     }
74 |     // now we're down to one block left, so reduce it
75 |     threads = blocks; // launch one thread for each block in prev step
76 |     blocks = 1;
77 |     if (usesSharedMemory)
78 |     {
79 |         shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
80 |             (d_out, d_intermediate);
81 |     }
82 |     else
83 |     {
84 |         global_reduce_kernel<<<blocks, threads>>>
85 |             (d_out, d_intermediate);
86 |     }
87 | }
88 |
89 | int main(int argc, char **argv)
90 | {
91 | int deviceCount;
92 | cudaGetDeviceCount(&deviceCount);
93 | if (deviceCount == 0) {
94 | fprintf(stderr, "error: no devices supporting CUDA.\n");
95 | exit(EXIT_FAILURE);
96 | }
97 | int dev = 0;
98 | cudaSetDevice(dev);
99 |
100 | cudaDeviceProp devProps;
101 | if (cudaGetDeviceProperties(&devProps, dev) == 0)
102 | {
103 | printf("Using device %d:\n", dev);
104 | printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
105 | devProps.name, (int)devProps.totalGlobalMem,
106 | (int)devProps.major, (int)devProps.minor,
107 | (int)devProps.clockRate);
108 | }
109 |
110 | const int ARRAY_SIZE = 1 << 16;
111 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
112 |
113 | // generate the input array on the host
114 | float h_in[ARRAY_SIZE];
115 | float sum = 0.0f;
116 | for(int i = 0; i < ARRAY_SIZE; i++) {
117 | // generate random float in [-1.0f, 1.0f]
118 | h_in[i] = -1.0f + (float)rand()/((float)RAND_MAX/2.0f);
119 | sum += h_in[i];
120 | }
121 |
122 | // declare GPU memory pointers
123 | float * d_in, * d_intermediate, * d_out;
124 |
125 | // allocate GPU memory
126 | cudaMalloc((void **) &d_in, ARRAY_BYTES);
127 | cudaMalloc((void **) &d_intermediate, ARRAY_BYTES); // overallocated
128 | cudaMalloc((void **) &d_out, sizeof(float));
129 |
130 | // transfer the input array to the GPU
131 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
132 |
133 | int whichKernel = 0;
134 | if (argc == 2) {
135 | whichKernel = atoi(argv[1]);
136 | }
137 |
138 | cudaEvent_t start, stop;
139 | cudaEventCreate(&start);
140 | cudaEventCreate(&stop);
141 | // launch the kernel
142 | switch(whichKernel) {
143 | case 0:
144 | printf("Running global reduce\n");
145 | cudaEventRecord(start, 0);
146 | for (int i = 0; i < 100; i++)
147 | {
148 | reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
149 | }
150 | cudaEventRecord(stop, 0);
151 | break;
152 | case 1:
153 | printf("Running reduce with shared mem\n");
154 | cudaEventRecord(start, 0);
155 | for (int i = 0; i < 100; i++)
156 | {
157 | reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
158 | }
159 | cudaEventRecord(stop, 0);
160 | break;
161 | default:
162 | fprintf(stderr, "error: ran no kernel\n");
163 | exit(EXIT_FAILURE);
164 | }
165 | cudaEventSynchronize(stop);
166 | float elapsedTime;
167 | cudaEventElapsedTime(&elapsedTime, start, stop);
168 | elapsedTime /= 100.0f; // 100 trials
169 |
170 | // copy back the sum from GPU
171 | float h_out;
172 | cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
173 |
174 | printf("average time elapsed: %f\n", elapsedTime);
175 |
176 | // free GPU memory allocation
177 | cudaFree(d_in);
178 | cudaFree(d_intermediate);
179 | cudaFree(d_out);
180 |
181 | return 0;
182 | }
183 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/reduce_minmax.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | __global__ void shmem_reduce_kernel(float * d_out, const float * const d_in, bool is_max)
8 | {
9 |     // dynamically sized scratch buffer; its byte size is the third launch-configuration argument
10 |     extern __shared__ float sdata[];
11 |
12 |     const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
13 |     const int localIdx = threadIdx.x;
14 |
15 |     // stage this block's slice of the input in shared memory
16 |     sdata[localIdx] = d_in[globalIdx];
17 |     __syncthreads(); // the whole slice must be staged before any thread reads another thread's slot
18 |
19 |     // pairwise tree reduction: half as many threads are active on each pass
20 |     for (unsigned int stride = blockDim.x >> 1; stride != 0; stride >>= 1)
21 |     {
22 |         if (localIdx < stride)
23 |         {
24 |             const float mine = sdata[localIdx];
25 |             const float partner = sdata[localIdx + stride];
26 |             // is_max selects which extreme of the pair survives
27 |             sdata[localIdx] = is_max ? max(mine, partner) : min(mine, partner);
28 |         }
29 |         __syncthreads(); // finish this pass before the next one starts
30 |     }
31 |
32 |     // slot 0 now holds the block's extreme value; one thread publishes it
33 |     if (localIdx == 0)
34 |     {
35 |         d_out[blockIdx.x] = sdata[0];
36 |     }
37 | }
38 |
39 | void reduce(float *min_logLum, float *max_logLum, const float* const d_logLuminance, int length)
40 | {
41 |     // two passes per extreme: m-thread blocks produce one partial result each, then a single block of "blocks" threads folds them
42 |     const int m = 1 << 10;
43 |     int blocks = ceil((float)length / m); // the kernel has no bounds check, so length should be a multiple of m
44 |     float *d_intermediate; // should not modify d_in
45 |     cudaMalloc(&d_intermediate, sizeof(float)* blocks); // one partial result per block, reused for max and then min
46 |
47 |     shmem_reduce_kernel<<<blocks, m, m * sizeof(float)>>>(d_intermediate, d_logLuminance, true);
48 |     shmem_reduce_kernel<<<1, blocks, blocks * sizeof(float)>>>(max_logLum, d_intermediate, true);
49 |
50 |     shmem_reduce_kernel<<<blocks, m, m * sizeof(float)>>>(d_intermediate, d_logLuminance, false);
51 |     shmem_reduce_kernel<<<1, blocks, blocks * sizeof(float)>>>(min_logLum, d_intermediate, false);
52 |
53 |     cudaFree(d_intermediate);
54 | }
55 |
56 | int main(int argc, char **argv)
57 | {
58 | int deviceCount;
59 | cudaGetDeviceCount(&deviceCount);
60 | if (deviceCount == 0) {
61 | fprintf(stderr, "error: no devices supporting CUDA.\n");
62 | exit(EXIT_FAILURE);
63 | }
64 | int dev = 0;
65 | cudaSetDevice(dev);
66 |
67 | cudaDeviceProp devProps;
68 | if (cudaGetDeviceProperties(&devProps, dev) == 0)
69 | {
70 | printf("Using device %d:\n", dev);
71 | printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
72 | devProps.name, (int)devProps.totalGlobalMem,
73 | (int)devProps.major, (int)devProps.minor,
74 | (int)devProps.clockRate);
75 | }
76 |
77 | const int ARRAY_SIZE = 1 << 16;
78 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
79 |
80 | // generate the input array on the host
81 | float h_in[ARRAY_SIZE];
82 | float sum = 0.0f;
83 | srand((unsigned)time(0));
84 | for(int i = 0; i < ARRAY_SIZE; i++) {
85 | // generate random float in [-1.0f, 1.0f]
86 | h_in[i] = -1.0f + (float)rand()/((float)RAND_MAX/2.0f);
87 | sum += h_in[i];
88 | }
89 |
90 | // declare GPU memory pointers
91 | float *d_in;
92 |
93 | // allocate GPU memory
94 | cudaMalloc((void **) &d_in, ARRAY_BYTES);
95 |
96 | // transfer the input array to the GPU
97 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
98 |
99 | // launch the kernel
100 | printf("Running reduce\n");
101 | float *d_min, *d_max;
102 | cudaMalloc((void **) &d_min, sizeof(float));
103 | cudaMalloc((void **) &d_max, sizeof(float));
104 | reduce(d_min, d_max, d_in, ARRAY_SIZE);
105 |
106 | // copy back the sum from GPU
107 | float h_min, h_max;
108 | cudaMemcpy(&h_min, d_min, sizeof(float), cudaMemcpyDeviceToHost);
109 | cudaMemcpy(&h_max, d_max, sizeof(float), cudaMemcpyDeviceToHost);
110 |
111 | printf("Max_GPU: %f Min_GPU: %f\n", h_max, h_min);
112 | h_max = h_in[0]; h_min = h_in[0];
113 | for (size_t i = 1; i < ARRAY_SIZE; ++i) {
114 | h_max = std::max(h_in[i], h_max);
115 | h_min = std::min(h_in[i], h_min);
116 | }
117 | printf("Max_CPU: %f Min_CPU: %f\n", h_max, h_min);
118 |
119 | // free GPU memory allocation
120 | cudaFree(d_in);
121 | cudaFree(d_min);
122 | cudaFree(d_max);
123 | return 0;
124 | }
125 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/reduce_minmax_2.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 |
6 | __global__ void shmem_reduce_kernel(float * d_out, const float * const d_in)
7 | {
8 |     // sdata is sized at launch: its byte size is the third launch-configuration argument
9 |     extern __shared__ float sdata[];
10 |
11 |     int myId = threadIdx.x + blockDim.x * blockIdx.x;
12 |     int tid = threadIdx.x;
13 |
14 |     // load shared mem from global mem
15 |     sdata[tid] = d_in[myId];
16 |     __syncthreads(); // make sure entire block is loaded!
17 |
18 |     // do reduction in shared mem
19 |     unsigned int s = blockDim.x / 2;
20 |     // step 1: split into two halves - the larger of each pair goes low, the smaller goes high
21 |     if (tid < s) {
22 |         float temp = sdata[tid];
23 |         sdata[tid] = max(temp, sdata[tid + s]);
24 |         sdata[tid + s] = min(temp, sdata[tid + s]);
25 |     }
26 |     __syncthreads(); // make sure all comparisons at one stage are done!
27 |
28 |     // step 2: reduce both halves side by side - max over the lower half, min over the upper half
29 |     for (s = s / 2; s > 0; s >>= 1)
30 |     {
31 |         if (tid < s) {
32 |             sdata[tid] = max(sdata[tid], sdata[tid + s]);
33 |         }
34 |         else if (tid >= blockDim.x / 2 && tid < blockDim.x / 2 + s) {
35 |             sdata[tid] = min(sdata[tid], sdata[tid + s]);
36 |         }
37 |         __syncthreads(); // make sure all comparisons at one stage are done!
38 |     }
39 |
40 |     // only thread 0 writes result for this block back to global mem
41 |     if (tid == 0)
42 |     {
43 |         d_out[blockIdx.x] = sdata[0];
44 |         d_out[gridDim.x + blockIdx.x] = sdata[blockDim.x / 2]; // minima follow the gridDim.x maxima (was blockDim.x, right only when both are equal)
45 |         //printf("%f %f\n", sdata[0], sdata[blockDim.x / 2]);
46 |     }
47 | }
48 |
49 | __global__ void shmem_reduce_finish_kernel(float *min_logLum,
50 |                                            float *max_logLum, const float * const d_in)
51 | {
52 |     // dynamically sized scratch buffer: first blockDim.x floats hold maxima, the next blockDim.x hold minima
53 |     extern __shared__ float sdata[];
54 |     const int localIdx = threadIdx.x;
55 |     float * const minima = sdata + blockDim.x;
56 |     // each thread stages one partial maximum and one partial minimum
57 |     sdata[localIdx] = d_in[localIdx];
58 |     minima[localIdx] = d_in[localIdx + blockDim.x];
59 |     __syncthreads(); // both halves must be staged before any thread reads another thread's slot
60 |
61 |     // reduce the two halves in lock-step: max over the first, min over the second
62 |     for (unsigned int stride = blockDim.x >> 1; stride != 0; stride >>= 1)
63 |     {
64 |         if (localIdx < stride) {
65 |             sdata[localIdx] = max(sdata[localIdx], sdata[localIdx + stride]);
66 |             minima[localIdx] = min(minima[localIdx], minima[localIdx + stride]);
67 |         }
68 |         __syncthreads(); // finish this pass before the next one starts
69 |     }
70 |
71 |     // the first slot of each half holds the final value; one thread publishes both
72 |     if (localIdx == 0)
73 |     {
74 |         *max_logLum = sdata[0];
75 |         *min_logLum = minima[0];
76 |     }
77 | }
78 |
79 | void reduce(float *min_logLum, float *max_logLum, const float* const d_logLuminance, int length)
80 | {
81 |     // pass 1 writes one max and one min per block into d_intermediate; pass 2 folds them with a single block
82 |     const int m = 1 << 6;
83 |     int blocks = ceil((float)length / m); // the kernel has no bounds check, so length should be a multiple of m
84 |     float *d_intermediate; // should not modify d_in
85 |     cudaMalloc(&d_intermediate, sizeof(float)* blocks * 2); // store max and min
86 |     shmem_reduce_kernel<<<blocks, m, m * sizeof(float)>>>(d_intermediate, d_logLuminance);
87 |     shmem_reduce_finish_kernel<<<1, blocks, 2 * blocks*sizeof(float)>>>(min_logLum, max_logLum, d_intermediate);
88 |     cudaFree(d_intermediate);
89 | }
90 |
91 | int main(int argc, char **argv)
92 | {
93 | int deviceCount;
94 | cudaGetDeviceCount(&deviceCount);
95 | if (deviceCount == 0) {
96 | fprintf(stderr, "error: no devices supporting CUDA.\n");
97 | exit(EXIT_FAILURE);
98 | }
99 | int dev = 0;
100 | cudaSetDevice(dev);
101 |
102 | cudaDeviceProp devProps;
103 | if (cudaGetDeviceProperties(&devProps, dev) == 0)
104 | {
105 | printf("Using device %d:\n", dev);
106 | printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
107 | devProps.name, (int)devProps.totalGlobalMem,
108 | (int)devProps.major, (int)devProps.minor,
109 | (int)devProps.clockRate);
110 | }
111 |
112 | const int ARRAY_SIZE = 1 << 12;
113 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
114 |
115 | // generate the input array on the host
116 | float h_in[ARRAY_SIZE];
117 | float sum = 0.0f;
118 | for(int i = 0; i < ARRAY_SIZE; i++) {
119 | // generate random float in [-1.0f, 1.0f]
120 | h_in[i] = -1.0f + (float)rand()/((float)RAND_MAX/2.0f);
121 | sum += h_in[i];
122 | }
123 |
124 | // declare GPU memory pointers
125 | float *d_in;
126 |
127 | // allocate GPU memory
128 | cudaMalloc((void **) &d_in, ARRAY_BYTES);
129 |
130 | // transfer the input array to the GPU
131 | cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
132 |
133 | // launch the kernel
134 | printf("Running reduce\n");
135 | float *d_min, *d_max;
136 | cudaMalloc((void **) &d_min, sizeof(float));
137 | cudaMalloc((void **) &d_max, sizeof(float));
138 | reduce(d_min, d_max, d_in, ARRAY_SIZE);
139 |
140 | // copy back the sum from GPU
141 | float h_min, h_max;
142 | cudaMemcpy(&h_min, d_min, sizeof(float), cudaMemcpyDeviceToHost);
143 | cudaMemcpy(&h_max, d_max, sizeof(float), cudaMemcpyDeviceToHost);
144 |
145 | printf("Max_GPU: %f Min_GPU: %f\n", h_max, h_min);
146 | h_max = h_in[0]; h_min = h_in[0];
147 | for (size_t i = 1; i < ARRAY_SIZE; ++i) {
148 | h_max = std::max(h_in[i], h_max);
149 | h_min = std::min(h_in[i], h_min);
150 | }
151 | printf("Max_CPU: %f Min_CPU: %f\n", h_max, h_min);
152 |
153 | // free GPU memory allocation
154 | cudaFree(d_in);
155 | cudaFree(d_min);
156 | cudaFree(d_max);
157 | return 0;
158 | }
159 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 5 Code Snippets/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # one executable per Lesson 5 snippet (cuda_add_executable is presumably provided by a parent CMakeLists.txt - verify)
9 |
10 | cuda_add_executable(Lesson5_deviceQuery deviceQuery_simplified.cpp)
11 |
12 | cuda_add_executable(Lesson5_transpose transpose.cu gputimer.h)
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 5 Code Snippets/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef GPU_TIMER_H__
2 | #define GPU_TIMER_H__
3 |
4 | #include
5 |
6 | struct GpuTimer
7 | {
8 |     cudaEvent_t start; // recorded by Start()
9 |     cudaEvent_t stop;  // recorded by Stop()
10 |
11 |     GpuTimer() // creates both events
12 |     {
13 |         cudaEventCreate(&start);
14 |         cudaEventCreate(&stop);
15 |     }
16 |
17 |     ~GpuTimer() // destroys both events
18 |     {
19 |         cudaEventDestroy(start);
20 |         cudaEventDestroy(stop);
21 |     }
22 |
23 |     void Start() // marks the beginning of the timed region on stream 0
24 |     {
25 |         cudaEventRecord(start, 0);
26 |     }
27 |
28 |     void Stop() // marks the end of the timed region on stream 0
29 |     {
30 |         cudaEventRecord(stop, 0);
31 |     }
32 |
33 |     float Elapsed() // waits for the stop event, then returns the start-to-stop time reported by CUDA
34 |     {
35 |         float interval;
36 |         cudaEventSynchronize(stop);
37 |         cudaEventElapsedTime(&interval, start, stop);
38 |         return interval;
39 |     }
40 | };
41 |
42 | #endif /* GPU_TIMER_H__ */
43 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # per example: glob its headers, name its sources, then declare the executable
9 |
10 | file(GLOB Lesson7_tiling_hdr tiling/*.hpp tiling/*.h)
11 | set(Lesson7_tiling_files tiling/tiling.cu)
12 | cuda_add_executable(Lesson7_tiling ${Lesson7_tiling_files} ${Lesson7_tiling_hdr})
13 |
14 | file(GLOB Lesson7_thrust_hdr thrust/*.h)
15 | set(Lesson7_thrust_files thrust/thrust_example.cu thrust/gettime.cc)
16 | cuda_add_executable(Lesson7_thrust ${Lesson7_thrust_files} ${Lesson7_thrust_hdr})
17 |
18 | file(GLOB Lesson7_opencv_hdr opencv/*.h)
19 | set(Lesson7_opencv_files opencv/opencv.cu opencv/gettime.cc)
20 | cuda_add_executable(Lesson7_opencv ${Lesson7_opencv_files} ${Lesson7_opencv_hdr})
21 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/cub/example_block_scan_cum.cu:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /******************************************************************************
30 | * Simple demonstration of cub::BlockScan
31 | *
32 | * Example compilation string:
33 | *
34 | * nvcc example_block_scan_sum.cu -gencode=arch=compute_20,code=\"sm_20,compute_20\" -o example_block_scan_sum
35 | *
36 | ******************************************************************************/
37 |
38 | // Ensure printing of CUDA runtime errors to console (define before including cub.h)
39 | #define CUB_STDERR
40 |
41 | #include
42 | #include
43 |
44 | #include
45 |
46 | using namespace cub;
47 |
48 | //---------------------------------------------------------------------
49 | // Globals, constants and typedefs
50 | //---------------------------------------------------------------------
51 |
52 | bool g_verbose = false;
53 | int g_iterations = 100;
54 |
55 |
56 | //---------------------------------------------------------------------
57 | // Kernels
58 | //---------------------------------------------------------------------
59 |
60 | /**
61 | * Simple kernel for performing a block-wide exclusive prefix sum over integers
62 | */
63 | template <
64 | int BLOCK_THREADS,
65 | int ITEMS_PER_THREAD>
66 | __global__ void BlockPrefixSumKernel(
67 | int *d_in, // Tile of input
68 | int *d_out, // Tile of output; slot [BLOCK_THREADS * ITEMS_PER_THREAD] additionally receives the aggregate
69 | clock_t *d_elapsed) // Elapsed cycle count of block scan
70 | {
71 | // Parameterize BlockScan type for our thread block (NOTE(review): template arguments look lost from this copy - confirm upstream)
72 | typedef BlockScan BlockScanT;
73 |
74 | // Shared memory
75 | __shared__ typename BlockScanT::SmemStorage smem_storage;
76 |
77 | // Per-thread tile data
78 | int data[ITEMS_PER_THREAD];
79 | BlockLoadVectorized(d_in, data);
80 |
81 | // Start cycle timer
82 | clock_t start = clock();
83 |
84 | // Compute exclusive prefix sum (input and output both use "data"; the total lands in "aggregate")
85 | int aggregate;
86 | BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate);
87 |
88 | // Stop cycle timer
89 | clock_t stop = clock();
90 |
91 | // Store output
92 | BlockStoreVectorized(d_out, data);
93 |
94 | // Store aggregate and elapsed clocks (thread 0 only)
95 | if (threadIdx.x == 0)
96 | {
97 | *d_elapsed = (start > stop) ? start - stop : stop - start; // absolute difference of the two clock readings
98 | d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
99 | }
100 | }
101 |
102 |
103 |
104 | //---------------------------------------------------------------------
105 | // Host utilities
106 | //---------------------------------------------------------------------
107 |
108 | /**
109 | * Initialize exclusive prefix sum problem (and solution).
110 | * Returns the aggregate
111 | */
112 | int Initialize(
113 |     int *h_in,
114 |     int *h_reference,
115 |     int num_elements)
116 | {
117 |     int running_total = 0; // sum of all inputs generated so far
118 |
119 |     for (int idx = 0; idx < num_elements; idx++)
120 |     {
121 |         h_in[idx] = idx % 17; // inputs cycle through 0..16
122 |
123 |         h_reference[idx] = running_total; // exclusive prefix: excludes h_in[idx] itself
124 |         running_total = running_total + h_in[idx];
125 |     }
126 |
127 |     return running_total;
128 | }
129 |
130 |
131 | /**
132 | * Test thread block scan
133 | */
134 | template <
135 |     int BLOCK_THREADS,
136 |     int ITEMS_PER_THREAD>
137 | void Test()
138 | {
139 |     const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
140 |
141 |     // Allocate host arrays (h_gpu has one extra slot for the aggregate)
142 |     int *h_in = new int[TILE_SIZE];
143 |     int *h_reference = new int[TILE_SIZE];
144 |     int *h_gpu = new int[TILE_SIZE + 1];
145 |
146 |     // Initialize problem and reference output on host
147 |     int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);
148 |
149 |     // Initialize device arrays
150 |     int *d_in = NULL;
151 |     int *d_out = NULL;
152 |     clock_t *d_elapsed = NULL;
153 |     cudaMalloc((void**)&d_in, sizeof(int) * TILE_SIZE);
154 |     cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1));
155 |     cudaMalloc((void**)&d_elapsed, sizeof(clock_t));
156 |
157 |     // Display input problem data
158 |     if (g_verbose)
159 |     {
160 |         printf("Input data: ");
161 |         for (int i = 0; i < TILE_SIZE; i++)
162 |             printf("%d, ", h_in[i]);
163 |         printf("\n\n");
164 |     }
165 |
166 |     // Copy problem to device
167 |     cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
168 |
169 |     printf("BlockScan %d items (%d threads, %d items per thread): ",
170 |         TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD);
171 |
172 |     // Run this several times and average the performance results
173 |     clock_t elapsed_scan_clocks = 0;
174 |     for (int i = 0; i < g_iterations; ++i)
175 |     {
176 |         // Run aggregate/prefix kernel (template arguments must be explicit: no function parameter lets them be deduced)
177 |         BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<1, BLOCK_THREADS>>>(
178 |             d_in,
179 |             d_out,
180 |             d_elapsed);
181 |
182 |         // Copy results from device
183 |         clock_t scan_clocks;
184 |         cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost);
185 |         cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost);
186 |         elapsed_scan_clocks += scan_clocks;
187 |     }
188 |
189 |     // Check scanned items (stop at the first mismatch)
190 |     bool correct = true;
191 |     for (int i = 0; i < TILE_SIZE; i++)
192 |     {
193 |         if (h_gpu[i] != h_reference[i])
194 |         {
195 |             printf("Incorrect result @ offset %d (%d != %d)\n",
196 |                 i, h_gpu[i], h_reference[i]);
197 |             correct = false;
198 |             break;
199 |         }
200 |     }
201 |
202 |     // Check total aggregate
203 |     if (h_gpu[TILE_SIZE] != h_aggregate)
204 |     {
205 |         printf("Incorrect aggregate (%d != %d)\n", h_gpu[TILE_SIZE], h_aggregate);
206 |         correct = false;
207 |     }
208 |     if (correct) printf("Correct!\n");
209 |
210 |     // Display results problem data
211 |     if (g_verbose)
212 |     {
213 |         printf("GPU output (reference output): ");
214 |         for (int i = 0; i < TILE_SIZE; i++)
215 |             printf("%d (%d), ", h_gpu[i], h_reference[i]);
216 |         printf("\n");
217 |         printf("GPU aggregate (reference aggregate): %d (%d)", h_gpu[TILE_SIZE], h_aggregate); // format had no %d, so neither value was printed
218 |         printf("\n\n");
219 |     }
220 |
221 |     // Display timing results
222 |     printf("Average clocks per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations);
223 |
224 |     // Cleanup
225 |     if (h_in) delete[] h_in;
226 |     if (h_reference) delete[] h_reference;
227 |     if (h_gpu) delete[] h_gpu;
228 |     if (d_in) cudaFree(d_in);
229 |     if (d_out) cudaFree(d_out);
230 |     if (d_elapsed) cudaFree(d_elapsed);
231 | }
232 |
233 |
234 | /**
235 | * Main
236 | */
237 | int main(int argc, char** argv)
238 | {
239 | // Display GPU name
240 | cudaDeviceProp props;
241 | cudaGetDeviceProperties(&props, 0);
242 | printf("Using device %s\n", props.name);
243 |
244 | /** Add tests here **/
245 |
246 | // Run tests
247 | Test<1024, 1>();
248 | Test<512, 2>();
249 | Test<256, 4>();
250 | Test<128, 8>();
251 | Test<64, 16>();
252 | Test<32, 32>();
253 | Test<16, 64>();
254 |
255 | /****/
256 |
257 | return 0;
258 | }
259 |
260 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/opencv/gettime.cc:
--------------------------------------------------------------------------------
1 | #define WIN32_LEAN_AND_MEAN
2 | #include
3 | #include // portable: uint64_t MSVC: __int64
4 | #include "gettime.h"
5 | // Windows stand-in for POSIX gettimeofday(): stores the current system time in *tp and returns 0. tzp is never read.
6 | int gettimeofday(struct timeval * tp, struct timezone * tzp)
7 | {
8 |     // Unix epoch expressed in FILETIME ticks. The correct constant has 9 trailing zeros; some broken copies have only 8.
9 |     static const uint64_t EPOCH = ((uint64_t) 116444736000000000ULL);
10 |
11 |     // Sample the clock once, then convert that sample to a FILETIME.
12 |     SYSTEMTIME now;
13 |     FILETIME stamp;
14 |     GetSystemTime( &now );
15 |     SystemTimeToFileTime( &now, &stamp );
16 |
17 |     // Join the two 32-bit halves of the FILETIME into one 64-bit tick count.
18 |     const uint64_t ticks = (((uint64_t)stamp.dwHighDateTime) << 32) + ((uint64_t)stamp.dwLowDateTime);
19 |
20 |     tp->tv_sec = (long) ((ticks - EPOCH) / 10000000L);
21 |     tp->tv_usec = (long) (now.wMilliseconds * 1000); // millisecond field scaled to microseconds
22 |     return 0;
23 | }
24 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/opencv/gettime.h:
--------------------------------------------------------------------------------
1 | #ifndef GETTIME_H
2 | #define GETTIME_H
3 |
4 | #include
5 |
6 | // MSVC defines this in winsock2.h!?
7 | /*struct timeval {
8 | long tv_sec;
9 | long tv_usec;
10 | };
11 | */
12 | int gettimeofday(struct timeval * tp, struct timezone * tzp); // defined in gettime.cc
13 | // NOTE(review): tic() is declared here, but the gettime.cc in this directory shows no definition for it — confirm it is unused or defined elsewhere.
14 | double tic();
15 |
16 | #endif
17 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/opencv/opencv.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include "gettime.h"
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | using namespace std;
12 | using namespace cv;
13 | using namespace cv::cuda;
14 |
15 | int main(int argc, char **argv) {
16 |     // Runs the CPU and CUDA Hough line-segment detectors on the same Canny edge map, prints timings, and shows both results.
17 |     cv::Mat src = cv::imread("IMAG0179_small.jpg", cv::IMREAD_GRAYSCALE);
18 |
19 |     if (!src.data) {
20 |         printf("failed opening jpg\n");
21 |         exit(1);
22 |     }
23 |     // Edge map that both detectors take as input.
24 |     Mat mask;
25 |     cv::Canny(src, mask, 100, 200, 3);
26 |
27 |     Mat dst_cpu;
28 |     cv::cvtColor(mask, dst_cpu, COLOR_GRAY2BGR);
29 |     Mat dst_gpu = dst_cpu.clone();
30 |     // CPU pass: each detected segment is one Vec4i (x1, y1, x2, y2), as consumed by the drawing loop below.
31 |     vector<Vec4i> lines_cpu;
32 |     {
33 |         const int64 start = getTickCount();
34 |
35 |         cv::HoughLinesP(mask, lines_cpu, 1, CV_PI / 180, 50, 60, 5);
36 |
37 |         const double timeSec = (getTickCount() - start) / getTickFrequency();
38 |         cout << "CPU Time : " << timeSec * 1000 << " ms" << endl;
39 |         cout << "CPU Found : " << lines_cpu.size() << endl;
40 |     }
41 |
42 |     for (size_t i = 0; i < lines_cpu.size(); ++i)
43 |     {
44 |         Vec4i l = lines_cpu[i];
45 |         line(dst_cpu, Point(l[0], l[1]), Point(l[2], l[3]), Scalar(0, 0, 255), 3, LINE_AA);
46 |     }
47 |     // GPU pass: note that the timed region includes creating the detector, not only detect().
48 |     GpuMat d_src(mask);
49 |     GpuMat d_lines;
50 |     {
51 |         const int64 start = getTickCount();
52 |
53 |         Ptr<cuda::HoughSegmentDetector> hough = cuda::createHoughSegmentDetector(1.0f, (float)(CV_PI / 180.0f), 60, 5);
54 |         hough->detect(d_src, d_lines);
55 |
56 |         const double timeSec = (getTickCount() - start) / getTickFrequency();
57 |         cout << "GPU Time : " << timeSec * 1000 << " ms" << endl;
58 |         cout << "GPU Found : " << d_lines.cols << endl;
59 |     }
60 |     vector<Vec4i> lines_gpu;
61 |     if (!d_lines.empty())
62 |     {
63 |         lines_gpu.resize(d_lines.cols);
64 |         Mat h_lines(1, d_lines.cols, CV_32SC4, &lines_gpu[0]); // header over lines_gpu's storage, so download() fills the vector
65 |         d_lines.download(h_lines);
66 |     }
67 |
68 |     for (size_t i = 0; i < lines_gpu.size(); ++i)
69 |     {
70 |         Vec4i l = lines_gpu[i];
71 |         line(dst_gpu, Point(l[0], l[1]), Point(l[2], l[3]), Scalar(0, 0, 255), 3, LINE_AA);
72 |     }
73 |     // Show the input and both overlays until a key is pressed.
74 |     imshow("source", src);
75 |     imshow("detected lines [CPU]", dst_cpu);
76 |     imshow("detected lines [GPU]", dst_gpu);
77 |     waitKey();
78 |
79 |     return 0;
80 | }
81 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gettime.cc:
--------------------------------------------------------------------------------
1 | #define WIN32_LEAN_AND_MEAN
2 | #include
3 | #include // portable: uint64_t MSVC: __int64
4 | #include "gettime.h"
5 | // Windows replacement for POSIX gettimeofday(): writes the current system time into *tp; tzp is ignored. Always returns 0.
6 | int gettimeofday(struct timeval * tp, struct timezone * tzp)
7 | {
8 |     // Note: the correct epoch constant has 9 trailing zeros; some broken versions only have 8.
9 |     static const uint64_t EPOCH = ((uint64_t) 116444736000000000ULL);
10 |
11 |     SYSTEMTIME sysNow;
12 |     GetSystemTime( &sysNow );
13 |
14 |     // The same sample, converted to a FILETIME so it can be read as a single 64-bit count.
15 |     FILETIME fileNow;
16 |     SystemTimeToFileTime( &sysNow, &fileNow );
17 |     const uint64_t high = ((uint64_t)fileNow.dwHighDateTime) << 32;
18 |     const uint64_t low = ((uint64_t)fileNow.dwLowDateTime);
19 |     const uint64_t ticks = high | low; // the halves occupy disjoint bits, so OR equals the original sum
20 |     tp->tv_sec = (long) ((ticks - EPOCH) / 10000000L);
21 |     tp->tv_usec = (long) (sysNow.wMilliseconds * 1000);
22 |     return 0;
23 | }
24 |
25 | /*double tic() {
26 | struct timeval t;
27 | gettimeofday(&t, NULL);
28 | return ((double)t.tv_sec * 1000 + ((double)t.tv_usec) / 1000.);
29 | }*/
30 | // Returns the current performance-counter reading converted to milliseconds.
31 | double tic() {
32 |     LARGE_INTEGER ticksPerSecond;
33 |     LARGE_INTEGER ticksNow;
34 |     QueryPerformanceFrequency(&ticksPerSecond);
35 |     QueryPerformanceCounter(&ticksNow);
36 |     const double milliseconds = (double)ticksNow.QuadPart * 1000. / ticksPerSecond.QuadPart; return milliseconds;
37 | }
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gettime.h:
--------------------------------------------------------------------------------
1 | #ifndef GETTIME_H
2 | #define GETTIME_H
3 |
4 | #include
5 |
6 | // MSVC defines this in winsock2.h!?
7 | /*struct timeval {
8 | long tv_sec;
9 | long tv_usec;
10 | };
11 | */
12 | int gettimeofday(struct timeval * tp, struct timezone * tzp); // fills *tp with the current system time; defined in gettime.cc
13 | double tic(); // performance-counter timestamp in milliseconds; defined in gettime.cc
14 |
15 | #endif
16 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 | // Times GPU work with a pair of CUDA events. Return codes of the CUDA calls are not checked.
4 | struct GpuTimer
5 | {
6 |     cudaEvent_t start; // recorded by Start()
7 |     cudaEvent_t stop;  // recorded by Stop()
8 |
9 |     // Creates both events.
10 |     GpuTimer() {
11 |         cudaEventCreate(&start);
12 |         cudaEventCreate(&stop);
13 |     }
14 |
15 |     // Destroys both events.
16 |     ~GpuTimer() {
17 |         cudaEventDestroy(start);
18 |         cudaEventDestroy(stop);
19 |     }
20 |
21 |     // Records the start event on stream 0.
22 |     void Start() {
23 |         cudaEventRecord(start, 0);
24 |     }
25 |
26 |     // Records the stop event on stream 0.
27 |     void Stop() {
28 |         cudaEventRecord(stop, 0);
29 |     }
30 |
31 |     // Blocks until the stop event has completed, then returns the start-to-stop time from cudaEventElapsedTime.
32 |     float Elapsed() {
33 |         float ms;
34 |         cudaEventSynchronize(stop);
35 |         cudaEventElapsedTime(&ms, start, stop);
36 |         return ms;
37 |     }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/thrust_example.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 | #include "gputimer.h"
11 | #include "gettime.h"
12 |
13 | int main(void)
14 | {
15 |     // generate N random numbers serially
16 |     int N = 1000000;
17 |     std::vector<int> h_vec(N);
18 |     std::generate(h_vec.begin(), h_vec.end(), rand);
19 |     std::vector<int> h_vec_std(h_vec);
20 |     // Sort one copy with thrust::sort and time it with tic() (milliseconds).
21 |     double t0 = tic();
22 |     thrust::sort(h_vec.begin(), h_vec.end());
23 |     std::cout << "thrust::sort took " << tic() - t0 << " ms." << std::endl;
24 |     // Sort the other copy with std::sort for comparison.
25 |     t0 = tic();
26 |     std::sort(h_vec_std.begin(), h_vec_std.end());
27 |     std::cout << "std::sort took " << tic() - t0 << " ms." << std::endl;
28 |     // Both sorts must produce the same sequence; exit with status 1 at the first mismatch.
29 |     for (int i = 0; i < N; i++) {
30 |         if (h_vec[i] != h_vec_std[i]) {
31 |             std::cout << i << " Not same!" << std::endl;
32 |             exit(1);
33 |         }
34 |     }
35 |
36 |     return 0;
37 | }
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/a.exp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/a.exp
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/gputimer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 | // Event-based GPU stopwatch: Start() and Stop() bracket the work, Elapsed() reads the interval. CUDA return codes are not checked.
4 | struct GpuTimer
5 | {
6 |     cudaEvent_t start; // beginning of the timed interval
7 |     cudaEvent_t stop;  // end of the timed interval
8 |
9 |     // Creates the two events.
10 |     GpuTimer() {
11 |         cudaEventCreate(&start);
12 |         cudaEventCreate(&stop);
13 |     }
14 |
15 |     // Destroys the two events.
16 |     ~GpuTimer() {
17 |         cudaEventDestroy(start);
18 |         cudaEventDestroy(stop);
19 |     }
20 |
21 |     // Records `start` on stream 0.
22 |     void Start() {
23 |         cudaEventRecord(start, 0);
24 |     }
25 |
26 |     // Records `stop` on stream 0.
27 |     void Stop() {
28 |         cudaEventRecord(stop, 0);
29 |     }
30 |
31 |     // Waits for `stop` to complete and returns the value written by cudaEventElapsedTime.
32 |     float Elapsed() {
33 |         float interval;
34 |         cudaEventSynchronize(stop);
35 |         cudaEventElapsedTime(&interval, start, stop);
36 |         return interval;
37 |     }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/tiling.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "gputimer.h"
5 | #include "utils.h"
6 |
7 | const int BLOCKSIZE = 128;
8 | const int NUMBLOCKS = 100; // set this to 1 or 2 for debugging
9 | const int N = BLOCKSIZE*NUMBLOCKS;
10 |
11 | /*
12 | * TODO: modify the foo and bar kernels to use tiling:
13 | * - copy the input data to shared memory
14 | * - perform the computation there
15 | * - copy the result back to global memory
16 | * - assume thread blocks of 128 threads
17 | * - handle intra-block boundaries correctly
18 | * You can ignore boundary conditions (we ignore the first 2 and last 2 elements)
19 | */
20 | __global__ void foo(float out[], float A[], float B[], float C[], float D[], float E[]){
21 |     // One thread per element: out[g] is the mean of the five inputs at the same index g.
22 |     const int g = blockIdx.x*blockDim.x + threadIdx.x;
23 |     const float total = A[g] + B[g] + C[g] + D[g] + E[g];
24 |     out[g] = total / 5.0f;
25 | }
26 |
27 | __global__ void bar(float out[], float in[])
28 | {
29 |     // Five-point mean centred on this thread's element, read directly from in[]. No bounds checks: g-2 and g+2 must be valid indices.
30 |     const int g = blockIdx.x*blockDim.x + threadIdx.x;
31 |     out[g] = (in[g-2] + in[g-1] + in[g] + in[g+1] + in[g+2]) / 5.0f;
32 | }
33 |
34 | __global__ void bar_tile(float out[], float in[])
35 | {
36 |     extern __shared__ float sh_din[]; // indexed up to blockDim.x + 3, so the launch must provide at least blockDim.x + 4 floats
37 |     const int lane = threadIdx.x;
38 |     const int g = lane + blockIdx.x*blockDim.x;
39 |     // Stage this block's elements at offset 2, leaving two slots on each side for the neighbours.
40 |     sh_din[lane + 2] = in[g];
41 |     if (lane == 0) {                     // first thread also fetches the two elements to the left
42 |         sh_din[lane] = in[g-2];
43 |         sh_din[lane+1] = in[g-1];
44 |     } else if (lane == blockDim.x - 1) { // last thread also fetches the two elements to the right
45 |         sh_din[lane + 3] = in[g+1];
46 |         sh_din[lane + 4] = in[g+2];
47 |     }
48 |     __syncthreads();
49 |     // Five-point mean taken entirely from the staged tile.
50 |     out[g] = (sh_din[lane] + sh_din[lane + 1] + sh_din[lane + 2] + sh_din[lane + 3] + sh_din[lane + 4]) / 5.0f;
51 | }
52 |
53 | __global__ void bar_tile_2(float out[], float in[])
54 | {
55 |     extern __shared__ float sh_din[];
56 |     const int lane = threadIdx.x;
57 |     const int g = lane + blockIdx.x*blockDim.x;
58 |     // Stage exactly one element per thread; neighbours outside the block are read from in[] instead.
59 |     sh_din[lane] = in[g];
60 |     __syncthreads();
61 |     // The first two and last two threads of the block take their out-of-block neighbours from in[].
62 |     float total;
63 |     if (lane == 0) {
64 |         total = in[g - 2] + in[g - 1] + sh_din[lane] + sh_din[lane + 1] + sh_din[lane + 2];
65 |     } else if (lane == 1) {
66 |         total = in[g - 2] + sh_din[lane - 1] + sh_din[lane] + sh_din[lane + 1] + sh_din[lane + 2];
67 |     } else if (lane == blockDim.x - 2) {
68 |         total = sh_din[lane - 2] + sh_din[lane - 1] + sh_din[lane] + sh_din[lane + 1] + in[g + 2];
69 |     } else if (lane == blockDim.x - 1) {
70 |         total = sh_din[lane - 2] + sh_din[lane - 1] + sh_din[lane] + in[g + 1] + in[g + 2];
71 |     } else {
72 |         total = sh_din[lane - 2] + sh_din[lane - 1] + sh_din[lane] + sh_din[lane + 1] + sh_din[lane + 2];
73 |     }
74 |     out[g] = total / 5.0f;
75 | }
76 |
77 | __global__ void bar_tile_3(float out[], float in[])
78 | {
79 |     extern __shared__ float sh_din[];
80 |     const int lane = threadIdx.x;
81 |     const int g = blockIdx.x * BLOCKSIZE + lane; // stride is the BLOCKSIZE constant, not blockDim.x
82 |     sh_din[lane] = in[g-2];                      // every thread stages the element two places to its left
83 |     __syncthreads();
84 |     if (lane >= blockDim.x-4) return;            // last four threads only load; presumably launched with BLOCKSIZE + 4 threads — TODO confirm
85 |     out[g] = (sh_din[lane] + sh_din[lane + 1] + sh_din[lane + 2] + sh_din[lane + 3] + sh_din[lane + 4]) / 5.0f;
86 | }
87 |
88 | void cpuFoo(float out[], float A[], float B[], float C[], float D[], float E[])
89 | {
90 | for (int i=0; i>>(d_fooOut, d_fooA, d_fooB, d_fooC, d_fooD, d_fooE);
147 | fooTimer.Stop();
148 |
149 | checkCudaErrors(cudaMemcpy(fooOut, d_fooOut, numBytes, cudaMemcpyDeviceToHost));
150 | printf("foo<<<>>>(): %g ms elapsed. Verifying solution...", fooTimer.Elapsed());
151 | compareArrays(ref_fooOut, fooOut, N);
152 |
153 | barTimer.Start();
154 | bar<<>>(d_barOut, d_barIn);
155 | //bar_tile << > >(d_barOut, d_barIn);
156 | //bar_tile_2 << > >(d_barOut, d_barIn);
157 | //bar_tile_3 << > >(d_barOut, d_barIn);
158 | barTimer.Stop();
159 |
160 | checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost));
161 | printf("bar<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed());
162 | compareArrays(ref_barOut, barOut, N);
163 |
164 | barTimer.Start();
165 | bar_tile << > >(d_barOut, d_barIn);
166 | barTimer.Stop();
167 |
168 | checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost));
169 | printf("bar_tile<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed());
170 | compareArrays(ref_barOut, barOut, N);
171 |
172 | barTimer.Start();
173 | bar_tile_2 << > >(d_barOut, d_barIn);
174 | barTimer.Stop();
175 |
176 | checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost));
177 | printf("bar_tile_2<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed());
178 | compareArrays(ref_barOut, barOut, N);
179 |
180 | barTimer.Start();
181 | bar_tile_3 << > >(d_barOut, d_barIn);
182 | barTimer.Stop();
183 |
184 | checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost));
185 | printf("bar_tile_3<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed());
186 | compareArrays(ref_barOut, barOut, N);
187 |
188 | checkCudaErrors(cudaFree(d_fooA));
189 | checkCudaErrors(cudaFree(d_fooB));
190 | checkCudaErrors(cudaFree(d_fooC));
191 | checkCudaErrors(cudaFree(d_fooD));
192 | checkCudaErrors(cudaFree(d_fooE));
193 | checkCudaErrors(cudaFree(d_barIn));
194 | checkCudaErrors(cudaFree(d_fooOut));
195 | checkCudaErrors(cudaFree(d_barOut));
196 | }
197 |
--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H__
2 | #define UTILS_H__
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | // error checking utility functions
14 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
15 |
16 | template
17 | void check(T err, const char* const func, const char* const file, const int line)
18 | {
19 | if (err != cudaSuccess) {
20 | fprintf(stderr, "CUDA error at: %s : %d\n", file,line);
21 | fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);;
22 | exit(1);
23 | }
24 | }
25 |
26 | void printArray(float in[], int N)
27 | {
28 | for (int i=0; i CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # Sources: headers and .cu files are globbed from this directory; host .cpp files are listed by name
9 | # (HW1.cpp is not listed because main.cpp pulls it in with #include).
10 | file(GLOB hdr *.hpp *.h)
11 | file(GLOB cu *.cu)
12 | set(HW1_files main.cpp reference_calc.cpp compare.cpp)
13 |
14 | cuda_add_executable(HW1 ${HW1_files} ${hdr} ${cu})
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/HW1.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "utils.h"
5 | #include
6 | #include
7 | #include
8 |
9 | cv::Mat imageRGBA;
10 | cv::Mat imageGrey;
11 |
12 | uchar4 *d_rgbaImage__;
13 | unsigned char *d_greyImage__;
14 | // Row and column counts of the global imageRGBA matrix.
15 | size_t numRows() { const size_t rows = imageRGBA.rows; return rows; }
16 | size_t numCols() { const size_t cols = imageRGBA.cols; return cols; }
17 |
18 | //return types are void since any internal error will be handled by quitting
19 | //no point in returning error codes...
20 | //returns a pointer to an RGBA version of the input image
21 | //and a pointer to the single channel grey-scale output
22 | //on both the host and device
23 | void preProcess(uchar4 **inputImage, unsigned char **greyImage,
24 |                 uchar4 **d_rgbaImage, unsigned char **d_greyImage,
25 |                 const std::string &filename) {
26 |     // Touch the CUDA runtime first so a context failure is reported before any other work.
27 |     checkCudaErrors(cudaFree(0));
28 |
29 |     // Read the file as a colour image; an empty matrix means the read failed.
30 |     cv::Mat bgr = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
31 |     if (bgr.empty()) {
32 |         std::cerr << "Couldn't open file: " << filename << std::endl;
33 |         exit(1);
34 |     }
35 |
36 |     // Host side: RGBA copy of the input plus a single-channel output of the same dimensions.
37 |     cv::cvtColor(bgr, imageRGBA, CV_BGR2RGBA);
38 |     imageGrey.create(bgr.rows, bgr.cols, CV_8UC1);
39 |
40 |     // The raw pointers handed out below require each matrix to be one contiguous buffer.
41 |     const bool contiguous = imageRGBA.isContinuous() && imageGrey.isContinuous();
42 |     if (!contiguous) {
43 |         std::cerr << "Images aren't continuous!! Exiting." << std::endl;
44 |         exit(1);
45 |     }
46 |
47 |     *inputImage = (uchar4 *)imageRGBA.ptr(0);
48 |     *greyImage = imageGrey.ptr(0);
49 |
50 |     // Device side: one uchar4 per pixel for the input and one byte per pixel for the output,
51 |     // with the output zero-filled before use.
52 |     const size_t numPixels = numRows() * numCols();
53 |     checkCudaErrors(cudaMalloc(d_rgbaImage, sizeof(uchar4) * numPixels));
54 |     checkCudaErrors(cudaMalloc(d_greyImage, sizeof(unsigned char) * numPixels));
55 |     checkCudaErrors(cudaMemset(*d_greyImage, 0, numPixels * sizeof(unsigned char))); //make sure no memory is left laying around
56 |
57 |     // Upload the RGBA pixels.
58 |     checkCudaErrors(cudaMemcpy(*d_rgbaImage, *inputImage, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
59 |     // Keep the device pointers in the file-level globals so cleanup() can free them.
60 |     d_rgbaImage__ = *d_rgbaImage;
61 |     d_greyImage__ = *d_greyImage;
62 | }
63 |
64 | void postProcess(const std::string& output_file, unsigned char* data_ptr) {
65 |     // View the caller's buffer as a numRows() x numCols() single-channel 8-bit image and write it to output_file.
66 |     const int rows = numRows();
67 |     const int cols = numCols();
68 |     cv::Mat view(rows, cols, CV_8UC1, (void*)data_ptr); cv::imwrite(output_file.c_str(), view);
69 | }
70 |
71 | void cleanup()
72 | {
73 |     // Free the two device buffers remembered by preProcess(); cudaFree return codes are ignored.
74 |     void* const deviceBuffers[2] = { d_rgbaImage__, d_greyImage__ };
75 |     for (int k = 0; k < 2; ++k) cudaFree(deviceBuffers[k]);
76 | }
77 |
78 | void generateReferenceImage(std::string input_filename, std::string output_filename)
79 | {
80 |     // Load input_filename as a greyscale image and write it straight back out as output_filename.
81 |     // Neither the read nor the write is checked for failure.
82 |     const cv::Mat grey = cv::imread(input_filename, CV_LOAD_IMAGE_GRAYSCALE);
83 |     cv::imwrite(output_filename, grey);
84 | }
85 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/Makefile:
--------------------------------------------------------------------------------
1 | NVCC=nvcc
2 |
3 | ###################################
4 | # These are the default install #
5 | # locations on most linux distros #
6 | ###################################
7 |
8 | OPENCV_LIBPATH=/usr/lib
9 | OPENCV_INCLUDEPATH=/usr/include
10 |
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 |
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 |
18 | # or if using MacPorts
19 |
20 | #OPENCV_LIBPATH=/opt/local/lib
21 | #OPENCV_INCLUDEPATH=/opt/local/include
22 |
23 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
24 | # CUDA headers for the sources compiled with g++; adjust to the installed toolkit.
25 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
26 |
27 | ######################################################
28 | # On Macs the default install locations are below #
29 | # ####################################################
30 |
31 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
32 | #CUDA_LIBPATH=/usr/local/cuda/lib
33 |
34 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
35 |
36 | GCC_OPTS=-O3 -Wall -Wextra -m64
37 | # Default goal: link the HW1 executable from the four object files.
38 | student: main.o student_func.o compare.o reference_calc.o Makefile
39 | 	$(NVCC) -o HW1 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
40 | # main.cpp includes HW1.cpp and the headers below, so those are its real prerequisites.
41 | main.o: main.cpp timer.h utils.h reference_calc.h compare.h HW1.cpp
42 | 	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) -I $(OPENCV_INCLUDEPATH)
43 |
44 | student_func.o: student_func.cu utils.h
45 | 	$(NVCC) -c student_func.cu $(NVCC_OPTS)
46 |
47 | compare.o: compare.cpp compare.h utils.h
48 | 	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
49 |
50 | reference_calc.o: reference_calc.cpp reference_calc.h
51 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
52 | .PHONY: clean
53 | clean:
54 | 	rm -f *.o *.png HW1
55 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/cinque_terre.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 1/cinque_terre.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/cinque_terre_small.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 1/cinque_terre_small.jpg
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/compare.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 |
5 | #include "utils.h"
6 | // Loads both images, writes a contrast-stretched difference image to HW1_differenceImage.png, then compares them element by element and prints PASS.
7 | void compareImages(std::string reference_filename, std::string test_filename,
8 |                    bool useEpsCheck, double perPixelError, double globalError)
9 | {
10 |     cv::Mat reference = cv::imread(reference_filename, -1);
11 |     cv::Mat test = cv::imread(test_filename, -1);
12 |     // Stop with a message instead of subtracting or dereferencing unreadable or mismatched images.
13 |     if (reference.empty() || test.empty()) {
14 |         std::cerr << "Couldn't open file: " << (reference.empty() ? reference_filename : test_filename) << std::endl;
15 |         exit(1);
16 |     }
17 |     if (reference.size() != test.size() || reference.type() != test.type()) {
18 |         std::cerr << "Reference and test images differ in size or type" << std::endl;
19 |         exit(1);
20 |     }
21 |
22 |     // Difference image, stretched to the full 0..255 range for visual inspection.
23 |     cv::Mat diff = abs(reference - test);
24 |     cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
25 |     double minVal, maxVal;
26 |     cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
27 |     if (maxVal > minVal) { // a constant difference (e.g. identical images) would otherwise divide by zero
28 |         diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
29 |     }
30 |     diff = diffSingleChannel.reshape(reference.channels(), 0);
31 |     cv::imwrite("HW1_differenceImage.png", diff);
32 |     //OK, now we can start comparing values... (the checkers call exit(1) on a mismatch)
33 |     unsigned char *referencePtr = reference.ptr(0);
34 |     unsigned char *testPtr = test.ptr(0);
35 |     const size_t numElem = reference.rows * reference.cols * reference.channels();
36 |     if (useEpsCheck) {
37 |         checkResultsEps(referencePtr, testPtr, numElem, perPixelError, globalError);
38 |     } else {
39 |         checkResultsExact(referencePtr, testPtr, numElem);
40 |     }
41 |     std::cout << "PASS" << std::endl;
42 | }
43 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef COMPARE_H__
2 | #define COMPARE_H__
3 | // Compares two image files; defined in compare.cpp. useEpsCheck selects the tolerance-based check, which uses perPixelError and globalError.
4 | void compareImages(std::string reference_filename, std::string test_filename,
5 | bool useEpsCheck, double perPixelError, double globalError);
6 |
7 | #endif
8 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/main.cpp:
--------------------------------------------------------------------------------
1 | //Udacity HW1 Solution
2 |
3 | #include
4 | #include "timer.h"
5 | #include "utils.h"
6 | #include
7 | #include
8 | #include "reference_calc.h"
9 | #include "compare.h"
10 | // Student-supplied conversion called from main() below; presumably defined in student_func.cu — verify.
11 | void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage,
12 | uchar4 * const d_rgbaImage,
13 | unsigned char* const d_greyImage,
14 | size_t numRows, size_t numCols);
15 |
16 | //include the definitions of the above functions for this homework
17 | #include "HW1.cpp"
18 |
19 | int main(int argc, char **argv) {
20 |     uchar4 *h_rgbaImage, *d_rgbaImage;
21 |     unsigned char *h_greyImage, *d_greyImage;
22 |
23 |     // Accepted forms: one, two or three file names, or three file names followed by both tolerances.
24 |     const bool validArgc = (argc >= 2 && argc <= 4) || argc == 6;
25 |     if (!validArgc) {
26 |         std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
27 |         exit(1);
28 |     }
29 |
30 |     // File names that were not given fall back to fixed defaults.
31 |     const std::string input_file = std::string(argv[1]);
32 |     const std::string output_file = (argc >= 3) ? std::string(argv[2]) : std::string("HW1_output.png");
33 |     const std::string reference_file = (argc >= 4) ? std::string(argv[3]) : std::string("HW1_reference.png");
34 |
35 |     // The tolerance-based comparison is used only when both tolerances are supplied.
36 |     const bool useEpsCheck = (argc == 6);
37 |     const double perPixelError = useEpsCheck ? atof(argv[4]) : 0.0;
38 |     const double globalError = useEpsCheck ? atof(argv[5]) : 0.0;
39 |
40 |     //load the image and give us our input and output pointers
41 |     preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);
42 |
43 |     // Time the students' code, then surface any CUDA error it left behind.
44 |     GpuTimer timer;
45 |     timer.Start();
46 |     your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
47 |     timer.Stop();
48 |     cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
49 |
50 |     // A negative printf result means stdout is unusable (probably closed by the student code).
51 |     if (printf("Your code ran in: %f msecs.\n", timer.Elapsed()) < 0) {
52 |         std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
53 |         exit(1);
54 |     }
55 |
56 |     // Copy the GPU result back and save it as the output image.
57 |     size_t numPixels = numRows()*numCols();
58 |     checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost));
59 |     postProcess(output_file, h_greyImage);
60 |
61 |     // Overwrite the same host buffer with the CPU reference result and save that as the reference image.
62 |     referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols());
63 |     postProcess(reference_file, h_greyImage);
64 |
65 |     // Compare the two files that were just written.
66 |     compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
67 |
68 |     cleanup();
69 |     return 0;
70 | }
94 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/reference_calc.cpp:
--------------------------------------------------------------------------------
1 | // for uchar4 struct
2 | #include
3 | // CPU reference: writes one grey value per pixel, computed as a fixed weighted sum of the pixel's x, y and z channels (w is not used).
4 | void referenceCalculation(const uchar4* const rgbaImage,
5 |                           unsigned char *const greyImage,
6 |                           size_t numRows,
7 |                           size_t numCols)
8 | {
9 |     const size_t numPixels = numRows * numCols;
10 |     for (size_t p = 0; p < numPixels; ++p) { // same visiting order as a row-by-row, column-by-column walk
11 |         const uchar4 px = rgbaImage[p];
12 |         // The float sum is converted to unsigned char by the assignment below.
13 |         const float channelSum = .299f * px.x + .587f * px.y + .114f * px.z;
14 |         greyImage[p] = channelSum;
15 |     }
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 | // CPU reference greyscale conversion; defined in reference_calc.cpp. Reads numRows * numCols pixels from rgbaImage and writes as many bytes to greyImage.
4 | void referenceCalculation(const uchar4* const rgbaImage,
5 | unsigned char *const greyImage,
6 | size_t numRows,
7 | size_t numCols);
8 |
9 | #endif
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/student_func.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 1/student_func.cu
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/timer.h:
--------------------------------------------------------------------------------
1 | #ifndef GPU_TIMER_H__
2 | #define GPU_TIMER_H__
3 |
4 | #include
5 | // Stopwatch built on two CUDA events; main.cpp prints Elapsed() as milliseconds. CUDA return codes are not checked.
6 | struct GpuTimer
7 | {
8 |     cudaEvent_t start; // recorded by Start()
9 |     cudaEvent_t stop;  // recorded by Stop()
10 |
11 |     // Creates both events.
12 |     GpuTimer() {
13 |         cudaEventCreate(&start);
14 |         cudaEventCreate(&stop);
15 |     }
16 |
17 |     // Destroys both events.
18 |     ~GpuTimer() {
19 |         cudaEventDestroy(start);
20 |         cudaEventDestroy(stop);
21 |     }
22 |
23 |     // Records the start event on stream 0.
24 |     void Start() {
25 |         cudaEventRecord(start, 0);
26 |     }
27 |
28 |     // Records the stop event on stream 0.
29 |     void Stop() {
30 |         cudaEventRecord(stop, 0);
31 |     }
32 |
33 |     // Waits for the stop event to complete, then returns the interval reported by cudaEventElapsedTime.
34 |     float Elapsed() {
35 |         float msecs;
36 |         cudaEventSynchronize(stop);
37 |         cudaEventElapsedTime(&msecs, start, stop);
38 |         return msecs;
39 |     }
40 | };
41 |
42 | #endif /* GPU_TIMER_H__ */
43 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H__
2 | #define UTILS_H__
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
14 |
15 | template
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 | if (err != cudaSuccess) {
18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 | exit(1);
21 | }
22 | }
23 | // Requires ref and gpu to match exactly over numElem elements; reports the first mismatch on stderr and exits with status 1.
24 | template<typename T>
25 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
26 |     //check that the GPU result matches the CPU result
27 |     for (size_t i = 0; i < numElem; ++i) {
28 |         if (ref[i] != gpu[i]) {
29 |             std::cerr << "Difference at pos " << i << std::endl;
30 |             //the + is magic to convert char to int without messing
31 |             //with other types
32 |             std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
33 |                 "\nGPU : " << +gpu[i] << std::endl;
34 |             exit(1);
35 |         }
36 |     }
37 | }
38 | // Tolerant comparison: exits with status 1 if any element differs by more than eps1, or if the fraction of elements with a non-zero difference exceeds eps2.
39 | template<typename T>
40 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
41 |     assert(eps1 >= 0 && eps2 >= 0);
42 |     // Counts elements that differ, but by no more than eps1.
43 |     unsigned numSmallDifferences = 0;
44 |     for (size_t i = 0; i < numElem; ++i) {
45 |         //subtract smaller from larger in case of unsigned types
46 |         T smaller = std::min(ref[i], gpu[i]);
47 |         T larger = std::max(ref[i], gpu[i]);
48 |         T diff = larger - smaller;
49 |         if (diff > 0 && diff <= eps1) {
50 |             numSmallDifferences++;
51 |         }
52 |         else if (diff > eps1) {
53 |             std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
54 |             std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
55 |                 "\nGPU : " << +gpu[i] << std::endl;
56 |             exit(1);
57 |         }
58 |     }
59 |     // This value is a fraction in [0, 1] despite its name; the messages below multiply by 100 to print a percentage.
60 |     double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
61 |     if (percentSmallDifferences > eps2) {
62 |         std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
63 |         std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
64 |         exit(1);
65 |     }
66 | }
67 |
68 | //Uses the autodesk method of image comparison
69 | //Note that the tolerance here is in PIXELS not a percentage of input pixels
70 | template<typename T>
71 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
72 | {
73 |     // An element is "bad" when it differs by more than variance; more than tolerance bad elements exits with status 1.
74 |     size_t numBadPixels = 0;
75 |     for (size_t i = 0; i < numElem; ++i) {
76 |         T smaller = std::min(ref[i], gpu[i]);
77 |         T larger = std::max(ref[i], gpu[i]);
78 |         T diff = larger - smaller;
79 |         if (diff > variance)
80 |             ++numBadPixels;
81 |     }
82 |
83 |     if (numBadPixels > tolerance) {
84 |         std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl;
85 |         exit(1);
86 |     }
87 | }
88 |
89 | #endif
90 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
8 | # Pick up the headers and CUDA sources that live next to this file.
9 | file(GLOB hdr *.hpp *.h)
10 | file(GLOB cu *.cu)
11 |
12 | # Host translation units are named explicitly (HW2.cpp is pulled in by main.cpp via #include).
13 | set(HW2_files main.cpp reference_calc.cpp compare.cpp)
14 | cuda_add_executable(HW2 ${HW2_files} ${hdr} ${cu})
15 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/HW2.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "utils.h"
5 | #include
6 | #include
7 | #include
8 |
9 | cv::Mat imageInputRGBA;
10 | cv::Mat imageOutputRGBA;
11 |
12 | uchar4 *d_inputImageRGBA__;
13 | uchar4 *d_outputImageRGBA__;
14 |
15 | float *h_filter__;
16 |
17 | size_t numRows() { return imageInputRGBA.rows; }
18 | size_t numCols() { return imageInputRGBA.cols; }
19 |
20 | //return types are void since any internal error will be handled by quitting
21 | //no point in returning error codes...
22 | //returns a pointer to an RGBA version of the input image
23 | //and a pointer to the single channel grey-scale output
24 | //on both the host and device
25 | void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA,
26 | uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA,
27 | unsigned char **d_redBlurred,
28 | unsigned char **d_greenBlurred,
29 | unsigned char **d_blueBlurred,
30 | float **h_filter, int *filterWidth,
31 | const std::string &filename) {
32 |
33 | //make sure the context initializes ok
34 | checkCudaErrors(cudaFree(0));
35 |
36 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
37 | if (image.empty()) {
38 | std::cerr << "Couldn't open file: " << filename << std::endl;
39 | exit(1);
40 | }
41 |
42 | cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);
43 |
44 | //allocate memory for the output
45 | imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);
46 |
47 | //This shouldn't ever happen given the way the images are created
48 | //at least based upon my limited understanding of OpenCV, but better to check
49 | if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) {
50 | std::cerr << "Images aren't continuous!! Exiting." << std::endl;
51 | exit(1);
52 | }
53 |
54 | *h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr(0);
55 | *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr(0);
56 |
57 | const size_t numPixels = numRows() * numCols();
58 | //allocate memory on the device for both input and output
59 | checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels));
60 | checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels));
61 | checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around
62 |
63 | //copy input array to the GPU
64 | checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
65 |
66 | d_inputImageRGBA__ = *d_inputImageRGBA;
67 | d_outputImageRGBA__ = *d_outputImageRGBA;
68 |
69 | //now create the filter that they will use
70 | const int blurKernelWidth = 9;
71 | const float blurKernelSigma = 2.;
72 |
73 | *filterWidth = blurKernelWidth;
74 |
75 | //create and fill the filter we will convolve with
76 | *h_filter = new float[blurKernelWidth * blurKernelWidth];
77 | h_filter__ = *h_filter;
78 |
79 | float filterSum = 0.f; //for normalization
80 |
81 | for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
82 | for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
83 | float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma));
84 | (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue;
85 | filterSum += filterValue;
86 | }
87 | }
88 |
89 | float normalizationFactor = 1.f / filterSum;
90 |
91 | for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
92 | for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
93 | (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor;
94 | }
95 | }
96 |
97 | //blurred
98 | checkCudaErrors(cudaMalloc(d_redBlurred, sizeof(unsigned char) * numPixels));
99 | checkCudaErrors(cudaMalloc(d_greenBlurred, sizeof(unsigned char) * numPixels));
100 | checkCudaErrors(cudaMalloc(d_blueBlurred, sizeof(unsigned char) * numPixels));
101 | checkCudaErrors(cudaMemset(*d_redBlurred, 0, sizeof(unsigned char) * numPixels));
102 | checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
103 | checkCudaErrors(cudaMemset(*d_blueBlurred, 0, sizeof(unsigned char) * numPixels));
104 | }
105 |
106 | void postProcess(const std::string& output_file, uchar4* data_ptr) {
107 | cv::Mat output(numRows(), numCols(), CV_8UC4, (void*)data_ptr);
108 |
109 | cv::Mat imageOutputBGR;
110 | cv::cvtColor(output, imageOutputBGR, CV_RGBA2BGR);
111 | //output the image
112 | cv::imwrite(output_file.c_str(), imageOutputBGR);
113 | }
114 |
115 | void cleanUp(void)
116 | {
117 | cudaFree(d_inputImageRGBA__);
118 | cudaFree(d_outputImageRGBA__);
119 | delete[] h_filter__;
120 | }
121 |
122 |
123 | // An unused bit of code showing how to accomplish this assignment using OpenCV. It is much faster
124 | // than the naive implementation in reference_calc.cpp.
125 | void generateReferenceImage(std::string input_file, std::string reference_file, int kernel_size)
126 | {
127 | cv::Mat input = cv::imread(input_file);
128 | // Create an identical image for the output as a placeholder
129 | cv::Mat reference = cv::imread(input_file);
130 | cv::GaussianBlur(input, reference, cv::Size2i(kernel_size, kernel_size),0);
131 | cv::imwrite(reference_file, reference);
132 | }
133 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/Makefile:
--------------------------------------------------------------------------------
1 | NVCC=nvcc
2 |
3 | ###################################
4 | # These are the default install #
5 | # locations on most linux distros #
6 | ###################################
7 |
8 | OPENCV_LIBPATH=/usr/lib
9 | OPENCV_INCLUDEPATH=/usr/include
10 |
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 |
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 |
18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
19 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
20 |
21 | ######################################################
22 | # On Macs the default install locations are below #
23 | # ####################################################
24 |
25 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
26 | #CUDA_LIBPATH=/usr/local/cuda/lib
27 |
28 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
29 |
30 | GCC_OPTS=-O3 -Wall -Wextra -m64
31 |
32 | student: main.o student_func.o compare.o reference_calc.o Makefile
33 | $(NVCC) -o HW2 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
34 |
35 | main.o: main.cpp timer.h utils.h HW2.cpp
36 | g++ -c main.cpp $(GCC_OPTS) -I $(OPENCV_INCLUDEPATH) -I $(CUDA_INCLUDEPATH)
37 |
38 | student_func.o: student_func.cu reference_calc.cpp utils.h
39 | nvcc -c student_func.cu $(NVCC_OPTS)
40 |
41 | compare.o: compare.cpp compare.h
42 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
43 |
44 | reference_calc.o: reference_calc.cpp reference_calc.h
45 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
46 |
47 | # Remove object files, generated PNG images and the HW2 binary produced by the student target.
48 | clean:
49 | 	rm -f *.o *.png HW2
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/cinque_terre.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 2/cinque_terre.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/cinque_terre_small.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 2/cinque_terre_small.jpg
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/compare.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 |
5 | #include "utils.h"
6 |
7 | //Loads the reference and test images, writes a contrast-stretched difference
8 | //image to HW2_differenceImage.png, then compares the raw bytes either exactly
9 | //or with the eps check. Prints PASS on success; any failure exits with status 1.
10 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
11 |                    double perPixelError, double globalError)
12 | {
13 |   cv::Mat reference = cv::imread(reference_filename, -1);
14 |   cv::Mat test = cv::imread(test_filename, -1);
15 |
16 |   //imread hands back an empty matrix when a file is missing or unreadable
17 |   if (reference.empty() || test.empty()) {
18 |     std::cerr << "Couldn't open file: " << (reference.empty() ? reference_filename : test_filename) << std::endl;
19 |     exit(1);
20 |   }
21 |
22 |   //the element-wise checks below walk both buffers using the reference's extent
23 |   if (reference.rows != test.rows || reference.cols != test.cols ||
24 |       reference.type() != test.type()) {
25 |     std::cerr << "Reference and test images differ in size or type" << std::endl;
26 |     exit(1);
27 |   }
28 |
29 |   cv::Mat diff = abs(reference - test);
30 |
31 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
32 |
33 |   double minVal, maxVal;
34 |
35 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
36 |
37 |   //now perform transform so that we bump values to the full range;
38 |   //a constant difference (e.g. identical images) has maxVal == minVal and
39 |   //would turn the scale factor into a division by zero, so leave it untouched
40 |   if (maxVal > minVal) {
41 |     diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
42 |   }
43 |
44 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
45 |
46 |   cv::imwrite("HW2_differenceImage.png", diff);
47 |   //OK, now we can start comparing values...
48 |   unsigned char *referencePtr = reference.ptr(0);
49 |   unsigned char *testPtr = test.ptr(0);
50 |
51 |   if (useEpsCheck) {
52 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
53 |   }
54 |   else
55 |   {
56 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
57 |   }
58 |
59 |   std::cout << "PASS" << std::endl;
60 |   return;
61 | }
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef COMPARE_H__
2 | #define COMPARE_H__
3 |
4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
5 | double perPixelError, double globalError);
6 |
7 | #endif
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/main.cpp:
--------------------------------------------------------------------------------
1 | //Udacity HW2 Driver
2 |
3 | #include
4 | #include "timer.h"
5 | #include "utils.h"
6 | #include
7 | #include
8 |
9 | #include "reference_calc.h"
10 | #include "compare.h"
11 |
12 | //include the definitions of the above functions for this homework
13 | #include "HW2.cpp"
14 |
15 |
16 | /******* DEFINED IN student_func.cu *********/
17 |
18 | void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_inputImageRGBA,
19 | uchar4* const d_outputImageRGBA,
20 | const size_t numRows, const size_t numCols,
21 | unsigned char *d_redBlurred,
22 | unsigned char *d_greenBlurred,
23 | unsigned char *d_blueBlurred,
24 | const int filterWidth);
25 |
26 | void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsImage,
27 | const float* const h_filter, const size_t filterWidth);
28 |
29 |
30 | /******* Begin main *********/
31 |
32 | int main(int argc, char **argv) {
33 | uchar4 *h_inputImageRGBA, *d_inputImageRGBA;
34 | uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
35 | unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;
36 |
37 | float *h_filter;
38 | int filterWidth;
39 |
40 | std::string input_file;
41 | std::string output_file;
42 | std::string reference_file;
43 | double perPixelError = 0.0;
44 | double globalError = 0.0;
45 | bool useEpsCheck = false;
46 | switch (argc)
47 | {
48 | case 2:
49 | input_file = std::string(argv[1]);
50 | output_file = "HW2_output.png";
51 | reference_file = "HW2_reference.png";
52 | break;
53 | case 3:
54 | input_file = std::string(argv[1]);
55 | output_file = std::string(argv[2]);
56 | reference_file = "HW2_reference.png";
57 | break;
58 | case 4:
59 | input_file = std::string(argv[1]);
60 | output_file = std::string(argv[2]);
61 | reference_file = std::string(argv[3]);
62 | break;
63 | case 6:
64 | useEpsCheck=true;
65 | input_file = std::string(argv[1]);
66 | output_file = std::string(argv[2]);
67 | reference_file = std::string(argv[3]);
68 | perPixelError = atof(argv[4]);
69 | globalError = atof(argv[5]);
70 | break;
71 | default:
72 | std::cerr << "Usage: ./HW2 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
73 | exit(1);
74 | }
75 | //load the image and give us our input and output pointers
76 | preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
77 | &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
78 | &h_filter, &filterWidth, input_file);
79 |
80 | allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
81 | GpuTimer timer;
82 | timer.Start();
83 | //call the students' code
84 | your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(),
85 | d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
86 | timer.Stop();
87 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
88 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
89 |
90 | if (err < 0) {
91 | //Couldn't print! Probably the student closed stdout - bad news
92 | std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
93 | exit(1);
94 | }
95 |
96 | //check results and output the blurred image
97 |
98 | size_t numPixels = numRows()*numCols();
99 | //copy the output back to the host
100 | checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));
101 |
102 | postProcess(output_file, h_outputImageRGBA);
103 |
104 | referenceCalculation(h_inputImageRGBA, h_outputImageRGBA,
105 | numRows(), numCols(),
106 | h_filter, filterWidth);
107 |
108 | postProcess(reference_file, h_outputImageRGBA);
109 |
110 | // Cheater easy way with OpenCV
111 | //generateReferenceImage(input_file, reference_file, filterWidth);
112 |
113 | compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
114 |
115 | checkCudaErrors(cudaFree(d_redBlurred));
116 | checkCudaErrors(cudaFree(d_greenBlurred));
117 | checkCudaErrors(cudaFree(d_blueBlurred));
118 |
119 | cleanUp();
120 |
121 | return 0;
122 | }
123 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/reference_calc.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | // for uchar4 struct
4 | #include
5 | //Convolves one image channel with a square, odd-width filter; reads outside the image are clamped to the border
6 | void channelConvolution(const unsigned char* const channel,
7 |                         unsigned char* const channelBlurred,
8 |                         const size_t numRows, const size_t numCols,
9 |                         const float *filter, const int filterWidth)
10 | {
11 |   //Dealing with an even width filter is trickier
12 |   assert(filterWidth % 2 == 1);
13 |
14 |   //For every pixel in the image
15 |   for (int r = 0; r < (int)numRows; ++r) {
16 |     for (int c = 0; c < (int)numCols; ++c) {
17 |       float result = 0.f;
18 |       //For every value in the filter around the pixel (c, r)
19 |       for (int filter_r = -filterWidth/2; filter_r <= filterWidth/2; ++filter_r) {
20 |         for (int filter_c = -filterWidth/2; filter_c <= filterWidth/2; ++filter_c) {
21 |           //Find the global image position for this filter position
22 |           //clamp to boundary of the image
23 |           int image_r = std::min(std::max(r + filter_r, 0), static_cast<int>(numRows - 1));
24 |           int image_c = std::min(std::max(c + filter_c, 0), static_cast<int>(numCols - 1));
25 |
26 |           float image_value = static_cast<float>(channel[image_r * numCols + image_c]);
27 |           float filter_value = filter[(filter_r + filterWidth/2) * filterWidth + filter_c + filterWidth/2];
28 |
29 |           result += image_value * filter_value;
30 |         }
31 |       }
32 |
33 |       channelBlurred[r * numCols + c] = result; //the float sum is converted to unsigned char on store
34 |     }
35 |   }
36 | }
37 |
38 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage,
39 | size_t numRows, size_t numCols,
40 | const float* const filter, const int filterWidth)
41 | {
42 | unsigned char *red = new unsigned char[numRows * numCols];
43 | unsigned char *blue = new unsigned char[numRows * numCols];
44 | unsigned char *green = new unsigned char[numRows * numCols];
45 |
46 | unsigned char *redBlurred = new unsigned char[numRows * numCols];
47 | unsigned char *blueBlurred = new unsigned char[numRows * numCols];
48 | unsigned char *greenBlurred = new unsigned char[numRows * numCols];
49 |
50 | //First we separate the incoming RGBA image into three separate channels
51 | //for Red, Green and Blue
52 | for (size_t i = 0; i < numRows * numCols; ++i) {
53 | uchar4 rgba = rgbaImage[i];
54 | red[i] = rgba.x;
55 | green[i] = rgba.y;
56 | blue[i] = rgba.z;
57 | }
58 |
59 | //Now we can do the convolution for each of the color channels
60 | channelConvolution(red, redBlurred, numRows, numCols, filter, filterWidth);
61 | channelConvolution(green, greenBlurred, numRows, numCols, filter, filterWidth);
62 | channelConvolution(blue, blueBlurred, numRows, numCols, filter, filterWidth);
63 |
64 | //now recombine into the output image - Alpha is 255 for no transparency
65 | for (size_t i = 0; i < numRows * numCols; ++i) {
66 | uchar4 rgba = make_uchar4(redBlurred[i], greenBlurred[i], blueBlurred[i], 255);
67 | outputImage[i] = rgba;
68 | }
69 |
70 | delete[] red;
71 | delete[] green;
72 | delete[] blue;
73 |
74 | delete[] redBlurred;
75 | delete[] greenBlurred;
76 | delete[] blueBlurred;
77 | }
78 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 |
4 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage,
5 | size_t numRows, size_t numCols,
6 | const float* const filter, const int filterWidth);
7 |
8 | #endif
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/student_func.cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 2/student_func.cu
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/timer.h:
--------------------------------------------------------------------------------
1 | #ifndef GPU_TIMER_H__
2 | #define GPU_TIMER_H__
3 |
4 | #include
5 |
6 | //Times GPU work with a pair of CUDA events: Start(), run the work, Stop(), then Elapsed().
7 | struct GpuTimer {
8 |   cudaEvent_t start;
9 |   cudaEvent_t stop;
10 |
11 |   //Creates both events.
12 |   GpuTimer() {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 |
17 |   //Destroys both events.
18 |   ~GpuTimer() {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 |
23 |   //Records the start event on stream 0.
24 |   void Start() {
25 |     cudaEventRecord(start, 0);
26 |   }
27 |
28 |   //Records the stop event on stream 0.
29 |   void Stop() {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 |
33 |   //Waits for the stop event, then returns the start-to-stop time in milliseconds.
34 |   float Elapsed() {
35 |     float duration;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&duration, start, stop);
38 |     return duration;
39 |   }
40 | };
41 |
42 | #endif /* GPU_TIMER_H__ */
43 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H__
2 | #define UTILS_H__
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | //Reports the failing call and its source location, then exits, when err is not cudaSuccess
14 | template<typename T>
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 |   if (err != cudaSuccess) {
17 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
18 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl; //func is the stringified expression passed by checkCudaErrors
19 |     exit(1);
20 |   }
21 | }
22 | //Exits (status 1) at the first position where ref and gpu hold different values
23 | template<typename T>
24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
25 |   //check that the GPU result matches the CPU result
26 |   for (size_t i = 0; i < numElem; ++i) {
27 |     if (ref[i] != gpu[i]) {
28 |       std::cerr << "Difference at pos " << i << std::endl;
29 |       //the + is magic to convert char to int without messing
30 |       //with other types
31 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
32 |         "\nGPU : " << +gpu[i] << std::endl;
33 |       exit(1);
34 |     }
35 |   }
36 | }
37 |
38 | //Exits (status 1) at the first element whose difference exceeds eps1, or when the
39 | //fraction of elements with a non-zero difference (<= eps1) is greater than eps2.
40 | template<typename T>
41 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
42 |   assert(eps1 >= 0 && eps2 >= 0);
43 |   unsigned numSmallDifferences = 0;
44 |   for (size_t i = 0; i < numElem; ++i) {
45 |     //subtract smaller from larger in case of unsigned types
46 |     T smaller = std::min(ref[i], gpu[i]);
47 |     T larger = std::max(ref[i], gpu[i]);
48 |     T diff = larger - smaller;
49 |     if (diff > 0 && diff <= eps1) {
50 |       numSmallDifferences++;
51 |     }
52 |     else if (diff > eps1) {
53 |       std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
54 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
55 |         "\nGPU : " << +gpu[i] << std::endl;
56 |       exit(1);
57 |     }
58 |   }
59 |   double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
60 |   if (percentSmallDifferences > eps2) {
61 |     std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
62 |     std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
63 |     exit(1);
64 |   }
65 | }
66 |
67 | //Uses the autodesk method of image comparison
68 | //Note that the tolerance here is in PIXELS, not a percentage of input pixels
69 | template<typename T>
70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
71 | {
72 |   //count the elements whose absolute difference is greater than variance
73 |   size_t numBadPixels = 0;
74 |   for (size_t i = 0; i < numElem; ++i) {
75 |     T smaller = std::min(ref[i], gpu[i]);
76 |     T larger = std::max(ref[i], gpu[i]);
77 |     T diff = larger - smaller;
78 |     if (diff > variance)
79 |       ++numBadPixels;
80 |   }
81 |
82 |   if (numBadPixels > tolerance) {
83 |     std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl;
84 |     exit(1);
85 |   }
86 | }
87 |
88 | #endif
89 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 | # Require CMake 2.8 or newer and locate the CUDA toolkit; REQUIRED stops configuration if it is missing.
8 | cmake_minimum_required(VERSION 2.8)
9 | find_package(CUDA QUIET REQUIRED)
10 |
11 | # NOTE(review): compare_files is not referenced again in this file.
12 | set(compare_files compare.cpp)
13 | set(HW3_files main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp)
14 | # Headers and CUDA sources are globbed from this directory.
15 | file(GLOB hdr *.hpp *.h)
16 | file(GLOB cu *.cu)
17 | cuda_add_executable(HW3 ${HW3_files} ${hdr} ${cu})
18 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/Makefile:
--------------------------------------------------------------------------------
1 | NVCC=nvcc
2 |
3 | ###################################
4 | # These are the default install #
5 | # locations on most linux distros #
6 | ###################################
7 |
8 | OPENCV_LIBPATH=/usr/lib
9 | OPENCV_INCLUDEPATH=/usr/include
10 |
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 |
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 |
18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
19 |
20 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
21 |
22 | ######################################################
23 | # On Macs the default install locations are below #
24 | # ####################################################
25 |
26 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
27 | #CUDA_LIBPATH=/usr/local/cuda/lib
28 |
29 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
30 |
31 | GCC_OPTS=-O3 -Wall -Wextra -m64
32 |
33 | student: main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o Makefile
34 | $(NVCC) -o HW3 main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
35 |
36 | main.o: main.cpp timer.h utils.h reference_calc.h compare.h
37 | g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
38 |
39 | HW3.o: HW3.cu loadSaveImage.h utils.h
40 | $(NVCC) -c HW3.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS)
41 |
42 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h
43 | g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
44 |
45 | compare.o: compare.cpp compare.h
46 | g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
47 |
48 | reference_calc.o: reference_calc.cpp reference_calc.h
49 | g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
50 |
51 | student_func.o: student_func.cu utils.h
52 | $(NVCC) -c student_func.cu $(NVCC_OPTS)
53 |
54 | # Remove object files, the HW3 binary, and every .exr file whose path does not contain "memorial".
55 | clean:
56 | 	rm -f *.o HW3
57 | 	find . -type f -name '*.exr' | grep -v memorial | xargs rm -f
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/compare.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include "utils.h"
3 |
4 | //Loads the reference and test images, writes a contrast-stretched difference
5 | //image to HW3_differenceImage.png, then compares the raw bytes either exactly
6 | //or with the eps check. Prints PASS on success; any failure exits with status 1.
7 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
8 |                    double perPixelError, double globalError)
9 | {
10 |   cv::Mat reference = cv::imread(reference_filename, -1);
11 |   cv::Mat test = cv::imread(test_filename, -1);
12 |
13 |   //imread hands back an empty matrix when a file is missing or unreadable
14 |   if (reference.empty() || test.empty()) {
15 |     std::cerr << "Couldn't open file: " << (reference.empty() ? reference_filename : test_filename) << std::endl;
16 |     exit(1);
17 |   }
18 |
19 |   //the element-wise checks below walk both buffers using the reference's extent
20 |   if (reference.rows != test.rows || reference.cols != test.cols ||
21 |       reference.type() != test.type()) {
22 |     std::cerr << "Reference and test images differ in size or type" << std::endl;
23 |     exit(1);
24 |   }
25 |
26 |   cv::Mat diff = abs(reference - test);
27 |
28 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
29 |
30 |   double minVal, maxVal;
31 |
32 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
33 |
34 |   //now perform transform so that we bump values to the full range;
35 |   //a constant difference (e.g. identical images) has maxVal == minVal and
36 |   //would turn the scale factor into a division by zero, so leave it untouched
37 |   if (maxVal > minVal) {
38 |     diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
39 |   }
40 |
41 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
42 |
43 |   cv::imwrite("HW3_differenceImage.png", diff);
44 |   //OK, now we can start comparing values...
45 |   unsigned char *referencePtr = reference.ptr(0);
46 |   unsigned char *testPtr = test.ptr(0);
47 |
48 |   if (useEpsCheck) {
49 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
50 |   }
51 |   else
52 |   {
53 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
54 |   }
55 |
56 |   std::cout << "PASS" << std::endl;
57 |   return;
58 | }
59 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef HW3_H__
2 | #define HW3_H__
3 |
4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
5 | double perPixelError, double globalError);
6 |
7 | #endif
8 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/loadSaveImage.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include "cuda_runtime.h"
7 |
8 | //Loads filename as a 3-channel float image and copies it into a new[]-allocated
9 | //buffer returned through imagePtr; numRows/numCols receive the image dimensions.
10 | //The caller becomes responsible for the returned pointer. In production code
11 | //this is a bad idea - we should use RAII to ensure the memory is freed.
12 | //DO NOT COPY THIS AND USE IN PRODUCTION CODE!!!
13 | void loadImageHDR(const std::string &filename,
14 |                   float **imagePtr,
15 |                   size_t *numRows, size_t *numCols)
16 | {
17 |   cv::Mat originImg = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);
18 |
19 |   //bail out before touching the pixel data if nothing was loaded
20 |   if (originImg.empty()) {
21 |     std::cerr << "Couldn't open file: " << filename << std::endl;
22 |     exit(1);
23 |   }
24 |
25 |   //convert the samples to 32-bit float unless the image is already CV_32FC3
26 |   cv::Mat image;
27 |   if (originImg.type() != CV_32FC3) {
28 |     originImg.convertTo(image, CV_32FC3);
29 |   } else {
30 |     image = originImg;
31 |   }
32 |
33 |   if (image.channels() != 3) {
34 |     std::cerr << "Image must be color!" << std::endl;
35 |     exit(1);
36 |   }
37 |
38 |   if (!image.isContinuous()) {
39 |     std::cerr << "Image isn't continuous!" << std::endl;
40 |     exit(1);
41 |   }
42 |
43 |   const size_t numValues = (size_t)image.rows * image.cols * image.channels();
44 |   *imagePtr = new float[numValues];
45 |   const float *cvPtr = image.ptr<float>(0); //continuity was checked above, so row 0 addresses the whole buffer
46 |   for (size_t i = 0; i < numValues; ++i)
47 |     (*imagePtr)[i] = cvPtr[i];
48 |   *numRows = image.rows;
49 |   *numCols = image.cols;
50 | }
51 |
52 | void loadImageRGBA(const std::string &filename,
53 | uchar4 **imagePtr,
54 | size_t *numRows, size_t *numCols)
55 | {
56 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
57 | if (image.empty()) {
58 | std::cerr << "Couldn't open file: " << filename << std::endl;
59 | exit(1);
60 | }
61 |
62 | if (image.channels() != 3) {
63 | std::cerr << "Image must be color!" << std::endl;
64 | exit(1);
65 | }
66 |
67 | if (!image.isContinuous()) {
68 | std::cerr << "Image isn't continuous!" << std::endl;
69 | exit(1);
70 | }
71 |
72 | cv::Mat imageRGBA;
73 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
74 |
75 | *imagePtr = new uchar4[image.rows * image.cols];
76 |
77 | unsigned char *cvPtr = imageRGBA.ptr(0);
78 | for (size_t i = 0; i < image.rows * image.cols; ++i) {
79 | (*imagePtr)[i].x = cvPtr[4 * i + 0];
80 | (*imagePtr)[i].y = cvPtr[4 * i + 1];
81 | (*imagePtr)[i].z = cvPtr[4 * i + 2];
82 | (*imagePtr)[i].w = cvPtr[4 * i + 3];
83 | }
84 |
85 | *numRows = image.rows;
86 | *numCols = image.cols;
87 | }
88 |
89 | void saveImageRGBA(const uchar4* const image,
90 | const size_t numRows, const size_t numCols,
91 | const std::string &output_file)
92 | {
93 | int sizes[2];
94 | sizes[0] = numRows;
95 | sizes[1] = numCols;
96 | cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image);
97 | cv::Mat imageOutputBGR;
98 | cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR);
99 | //output the image
100 | cv::imwrite(output_file.c_str(), imageOutputBGR);
101 | }
102 |
103 | //output an exr file
104 | //assumed to already be BGR
105 | void saveImageHDR(const float* const image,
106 | const size_t numRows, const size_t numCols,
107 | const std::string &output_file)
108 | {
109 | int sizes[2];
110 | sizes[0] = numRows;
111 | sizes[1] = numCols;
112 |
113 | cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image);
114 |
115 | imageHDR = imageHDR * 255;
116 |
117 | cv::imwrite(output_file.c_str(), imageHDR);
118 | }
119 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/loadSaveImage.h:
--------------------------------------------------------------------------------
1 | #ifndef LOADSAVEIMAGE_H__
2 | #define LOADSAVEIMAGE_H__
3 |
4 | #include
5 | #include //for uchar4
6 |
7 | void loadImageHDR(const std::string &filename,
8 | float **imagePtr,
9 | size_t *numRows, size_t *numCols);
10 |
11 | void loadImageRGBA(const std::string &filename,
12 | uchar4 **imagePtr,
13 | size_t *numRows, size_t *numCols);
14 |
15 | void saveImageRGBA(const uchar4* const image,
16 | const size_t numRows, const size_t numCols,
17 | const std::string &output_file);
18 |
19 | void saveImageHDR(const float* const image,
20 | const size_t numRows, const size_t numCols,
21 | const std::string &output_file);
22 |
23 | #endif
24 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/main.cpp:
--------------------------------------------------------------------------------
1 | //Udacity HW3 Driver
2 |
3 | #include
4 | #include "timer.h"
5 | #include "utils.h"
6 | #include
7 | #include
8 | #include
9 |
10 | #include "compare.h"
11 | #include "reference_calc.h"
12 |
13 | // Functions from HW3.cu
14 | void preProcess(float **d_luminance, unsigned int **d_cdf,
15 | size_t *numRows, size_t *numCols, unsigned int *numBins,
16 | const std::string& filename);
17 |
18 | void postProcess(const std::string& output_file, size_t numRows, size_t numCols,
19 | float min_logLum, float max_logLum);
20 |
21 | void cleanupGlobalMemory(void);
22 |
23 | // Function from student_func.cu
24 | void your_histogram_and_prefixsum(const float* const d_luminance,
25 | unsigned int* const d_cdf,
26 | float &min_logLum,
27 | float &max_logLum,
28 | const size_t numRows,
29 | const size_t numCols,
30 | const size_t numBins);
31 |
32 |
33 | int main(int argc, char **argv) {
34 | float *d_luminance;
35 | unsigned int *d_cdf;
36 |
37 | size_t numRows, numCols;
38 | unsigned int numBins;
39 |
40 | std::string input_file;
41 | std::string output_file;
42 | std::string reference_file;
43 | double perPixelError = 0.0;
44 | double globalError = 0.0;
45 | bool useEpsCheck = false;
46 |
47 | switch (argc)
48 | {
49 | case 2:
50 | input_file = std::string(argv[1]);
51 | output_file = "HW3_output.png";
52 | reference_file = "HW3_reference.png";
53 | break;
54 | case 3:
55 | input_file = std::string(argv[1]);
56 | output_file = std::string(argv[2]);
57 | reference_file = "HW3_reference.png";
58 | break;
59 | case 4:
60 | input_file = std::string(argv[1]);
61 | output_file = std::string(argv[2]);
62 | reference_file = std::string(argv[3]);
63 | break;
64 | case 6:
65 | useEpsCheck=true;
66 | input_file = std::string(argv[1]);
67 | output_file = std::string(argv[2]);
68 | reference_file = std::string(argv[3]);
69 | perPixelError = atof(argv[4]);
70 | globalError = atof(argv[5]);
71 | break;
72 | default:
73 | std::cerr << "Usage: ./HW3 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
74 | exit(1);
75 | }
76 | //load the image and give us our input and output pointers
77 | preProcess(&d_luminance, &d_cdf,
78 | &numRows, &numCols, &numBins, input_file);
79 |
80 | GpuTimer timer;
81 | float min_logLum, max_logLum;
82 | min_logLum = 0.f;
83 | max_logLum = 1.f;
84 | timer.Start();
85 | //call the students' code
86 | your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum,
87 | numRows, numCols, numBins);
88 | timer.Stop();
89 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
90 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
91 |
92 | if (err < 0) {
93 | //Couldn't print! Probably the student closed stdout - bad news
94 | std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
95 | exit(1);
96 | }
97 |
98 | float *h_luminance = (float *) malloc(sizeof(float)*numRows*numCols);
99 | unsigned int *h_cdf = (unsigned int *) malloc(sizeof(unsigned int)*numBins);
100 |
101 | checkCudaErrors(cudaMemcpy(h_luminance, d_luminance, numRows*numCols*sizeof(float), cudaMemcpyDeviceToHost));
102 |
103 | //check results and output the tone-mapped image
104 | postProcess(output_file, numRows, numCols, min_logLum, max_logLum);
105 |
106 | for (size_t i = 1; i < numCols * numRows; ++i) {
107 | min_logLum = std::min(h_luminance[i], min_logLum);
108 | max_logLum = std::max(h_luminance[i], max_logLum);
109 | }
110 |
111 | referenceCalculation(h_luminance, h_cdf, numRows, numCols, numBins, min_logLum, max_logLum);
112 |
113 | checkCudaErrors(cudaMemcpy(d_cdf, h_cdf, sizeof(unsigned int) * numBins, cudaMemcpyHostToDevice));
114 |
115 | //check results and output the tone-mapped image
116 | postProcess(reference_file, numRows, numCols, min_logLum, max_logLum);
117 |
118 | cleanupGlobalMemory();
119 |
120 | compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
121 |
122 | return 0;
123 | }
124 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial.exr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial.exr
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_large.exr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial_large.exr
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_png.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial_png.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_png_large.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial_png_large.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_raw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial_raw.png
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_raw_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 3/memorial_raw_large.png
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/reference_calc.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
// Host reference for HW3: finds the luminance range, histograms the image
// into numBins bins over that range and writes the exclusive prefix sum of
// the histogram into h_cdf.
//
//   h_logLuminance - numRows * numCols input values
//   h_cdf          - output array with room for numBins entries
//   logLumMin/Max  - receive the minimum / maximum input value; left
//                    unchanged when the image has no pixels
//
// A uniform image (range of zero) puts every pixel into bin 0.
void referenceCalculation(const float* const h_logLuminance, unsigned int* const h_cdf,
                          const size_t numRows, const size_t numCols, const size_t numBins,
                          float &logLumMin, float &logLumMax)
{
  const size_t numPixels = numRows * numCols;

  if (numPixels == 0) {
    // Nothing to measure: an empty image has an all-zero distribution.
    for (size_t i = 0; i < numBins; ++i) h_cdf[i] = 0;
    return;
  }

  logLumMin = h_logLuminance[0];
  logLumMax = h_logLuminance[0];

  //Step 1
  //first we find the minimum and maximum across the entire image
  for (size_t i = 1; i < numPixels; ++i) {
    logLumMin = std::min(h_logLuminance[i], logLumMin);
    logLumMax = std::max(h_logLuminance[i], logLumMax);
  }

  // Without bins there is no histogram or CDF to produce.
  if (numBins == 0) return;

  //Step 2
  const float logLumRange = logLumMax - logLumMin;

  //Step 3
  //next we use the now known range to compute
  //a histogram of numBins bins
  unsigned int *histo = new unsigned int[numBins]();  // value-initialised to 0

  for (size_t i = 0; i < numPixels; ++i) {
    unsigned int bin = 0;
    // Skip the division for a zero range: 0/0 yields NaN and converting NaN
    // to an unsigned integer is undefined.
    if (logLumRange > 0.f) {
      // The maximum value maps to numBins, so clamp it into the last bin.
      bin = std::min(static_cast<unsigned int>(numBins - 1),
                     static_cast<unsigned int>((h_logLuminance[i] - logLumMin) / logLumRange * numBins));
    }
    histo[bin]++;
  }

  //Step 4
  //finally we perform an exclusive scan (prefix sum)
  //on the histogram to get the cumulative distribution
  h_cdf[0] = 0;
  for (size_t i = 1; i < numBins; ++i) {
    h_cdf[i] = h_cdf[i - 1] + histo[i - 1];
  }

  delete[] histo;
}
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/reference_calc.h:
--------------------------------------------------------------------------------
#ifndef REFERENCE_H__
#define REFERENCE_H__

#include <cstddef>  // size_t, so the header can be included on its own

// Host reference implementation: computes the minimum and maximum of
// h_logLuminance (numRows * numCols values), returns them through
// logLumMin / logLumMax, and writes the exclusive prefix sum of a
// numBins-bin histogram of the values into h_cdf.
void referenceCalculation(const float* const h_logLuminance, unsigned int* const h_cdf,
                          const size_t numRows, const size_t numCols, const size_t numBins,
                          float &logLumMin, float &logLumMax);

#endif
9 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/timer.h:
--------------------------------------------------------------------------------
1 | #ifndef GPU_TIMER_H__
2 | #define GPU_TIMER_H__
3 |
4 | #include
5 |
6 | struct GpuTimer
7 | {
8 | cudaEvent_t start;
9 | cudaEvent_t stop;
10 |
11 | GpuTimer()
12 | {
13 | cudaEventCreate(&start);
14 | cudaEventCreate(&stop);
15 | }
16 |
17 | ~GpuTimer()
18 | {
19 | cudaEventDestroy(start);
20 | cudaEventDestroy(stop);
21 | }
22 |
23 | void Start()
24 | {
25 | cudaEventRecord(start, 0);
26 | }
27 |
28 | void Stop()
29 | {
30 | cudaEventRecord(stop, 0);
31 | }
32 |
33 | float Elapsed()
34 | {
35 | float elapsed;
36 | cudaEventSynchronize(stop);
37 | cudaEventElapsedTime(&elapsed, start, stop);
38 | return elapsed;
39 | }
40 | };
41 |
42 | #endif /* GPU_TIMER_H__ */
43 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H__
2 | #define UTILS_H__
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
14 |
15 | template
16 | void check(T err, const char* const func, const char* const file, const int line) {
17 | if (err != cudaSuccess) {
18 | std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 | std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 | exit(1);
21 | }
22 | }
23 |
// Verifies that two arrays of numElem values are element-wise identical.
// On the first mismatch the position and both values are printed to
// std::cerr and the program exits with status 1.
template<typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
  //check that the GPU result matches the CPU result
  for (size_t i = 0; i < numElem; ++i) {
    if (ref[i] != gpu[i]) {
      std::cerr << "Difference at pos " << i << std::endl;
      //the + is magic to convert char to int without messing
      //with other types
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
                 "\nGPU      : " << +gpu[i] << std::endl;
      exit(1);
    }
  }
}
38 |
// Tolerant comparison of two arrays of numElem values.
//
//   eps1 - largest per-element absolute difference that is accepted
//   eps2 - largest accepted fraction (0..1) of elements that differ at all
//
// Exits with status 1, after printing a diagnostic to std::cerr, when any
// element differs by more than eps1 or when the fraction of elements with a
// non-zero difference exceeds eps2. An empty input always passes.
template<typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
  assert(eps1 >= 0 && eps2 >= 0);
  unsigned numSmallDifferences = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger in case of unsigned types
    T smaller = std::min(ref[i], gpu[i]);
    T larger = std::max(ref[i], gpu[i]);
    T diff = larger - smaller;
    if (diff > 0 && diff <= eps1) {
      numSmallDifferences++;
    }
    else if (diff > eps1) {
      std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
        "\nGPU      : " << +gpu[i] << std::endl;
      exit(1);
    }
  }
  // Avoid 0/0 for an empty input; with no elements nothing can differ.
  double percentSmallDifferences = 0.0;
  if (numElem > 0) {
    percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
  }
  if (percentSmallDifferences > eps2) {
    std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
    std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
    exit(1);
  }
}
67 |
//Uses the autodesk method of image comparison
//Note that the tolerance here is in PIXELS not a percentage of input pixels
//
// Counts the elements whose absolute difference exceeds `variance`; if more
// than `tolerance` of them are found, prints a message to std::cerr and
// exits with status 1.
template<typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{

  size_t numBadPixels = 0;
  for (size_t i = 0; i < numElem; ++i) {
    // subtract smaller from larger so unsigned types do not wrap around
    T smaller = std::min(ref[i], gpu[i]);
    T larger = std::max(ref[i], gpu[i]);
    T diff = larger - smaller;
    if (diff > variance)
      ++numBadPixels;
  }

  if (numBadPixels > tolerance) {
    std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl;
    exit(1);
  }
}
88 |
89 | #endif
90 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | ############################################################################
2 | # CMakeLists.txt for OpenCV and CUDA.
3 | # 2012-02-07
4 | # Quan Tran Minh. edit by Johannes Kast, Michael Sarahan
5 | # quantm@unist.ac.kr kast.jo@googlemail.com msarahan@gmail.com
6 | ############################################################################
7 |
# Gather the headers and CUDA sources that live next to this file.
file(GLOB hdr *.hpp *.h)
file(GLOB cu *.cu)

# Host-side translation units are listed by name.
set(HW4_files
  main.cpp
  loadSaveImage.cpp
  reference_calc.cpp
  compare.cpp
)

# Build the HW4 executable from the host sources, headers and CUDA sources.
# NOTE(review): ${img} is not set anywhere in this file; it is passed through
# unchanged in case an enclosing CMakeLists.txt defines it.
cuda_add_executable(HW4 ${HW4_files} ${hdr} ${img} ${cu})
15 |
16 |
17 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/Makefile:
--------------------------------------------------------------------------------
NVCC=/usr/local/cuda-5.0/bin/nvcc
#NVCC=nvcc

###################################
# These are the default install   #
# locations on most linux distros #
###################################

OPENCV_LIBPATH=/usr/lib
OPENCV_INCLUDEPATH=/usr/include

###################################################
# On Macs the default install locations are below #
###################################################

#OPENCV_LIBPATH=/usr/local/lib
#OPENCV_INCLUDEPATH=/usr/local/include

OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui

CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
# CUDA_INCLUDEPATH=/usr/local/cuda/lib64/include
# CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
# CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include

######################################################
# On Macs the default install locations are below    #
######################################################

#CUDA_INCLUDEPATH=/usr/local/cuda/include
#CUDA_LIBPATH=/usr/local/cuda/lib
CUDA_LIBPATH=/usr/local/cuda-5.0/lib64

NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64

GCC_OPTS=-O3 -Wall -Wextra -m64

# Neither target names a file it produces, so mark both as phony: a stray
# file called "student" or "clean" must not stop the recipe from running.
.PHONY: student clean

# Links the HW4 executable from all object files.
student: main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o Makefile
	$(NVCC) -o HW4 main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)

main.o: main.cpp timer.h utils.h reference_calc.h
	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)

HW4.o: HW4.cu loadSaveImage.h utils.h
	$(NVCC) -c HW4.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS)

loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h
	g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)

compare.o: compare.cpp compare.h
	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)

reference_calc.o: reference_calc.cpp reference_calc.h
	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)

student_func.o: student_func.cu reference_calc.cpp utils.h
	$(NVCC) -c student_func.cu $(NVCC_OPTS)

# Removes object files, generated PNG images and the HW4 executable that the
# link rule above produces.
clean:
	rm -f *.o *.png HW4
61 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/compare.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include "utils.h"
3 |
4 |
5 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
6 | double perPixelError, double globalError)
7 | {
8 | cv::Mat reference = cv::imread(reference_filename, -1);
9 | cv::Mat test = cv::imread(test_filename, -1);
10 |
11 | cv::Mat diff = abs(reference - test);
12 |
13 | cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
14 |
15 | double minVal, maxVal;
16 |
17 | cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
18 |
19 | //now perform transform so that we bump values to the full range
20 |
21 | diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
22 |
23 | diff = diffSingleChannel.reshape(reference.channels(), 0);
24 |
25 | cv::imwrite("HW4_differenceImage.png", diff);
26 | //OK, now we can start comparing values...
27 | unsigned char *referencePtr = reference.ptr(0);
28 | unsigned char *testPtr = test.ptr(0);
29 |
30 | if (useEpsCheck) {
31 | checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
32 | }
33 | else
34 | {
35 | checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
36 | }
37 |
38 | std::cout << "PASS" << std::endl;
39 | return;
40 | }
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/compare.h:
--------------------------------------------------------------------------------
#ifndef HW4_H__
#define HW4_H__

#include <string>  // std::string, so the header can be included on its own

// Compares the image stored in test_filename against reference_filename.
// useEpsCheck selects the tolerant comparison, which uses perPixelError and
// globalError; otherwise the images must match exactly.
void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
                   double perPixelError, double globalError);

#endif
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/loadSaveImage.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include "cuda_runtime.h"
6 |
7 | //The caller becomes responsible for the returned pointer. This
8 | //is done in the interest of keeping this code as simple as possible.
9 | //In production code this is a bad idea - we should use RAII
10 | //to ensure the memory is freed. DO NOT COPY THIS AND USE IN PRODUCTION
11 | //CODE!!!
12 | void loadImageHDR(const std::string &filename,
13 | float **imagePtr,
14 | size_t *numRows, size_t *numCols)
15 | {
16 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);
17 | if (image.empty()) {
18 | std::cerr << "Couldn't open file: " << filename << std::endl;
19 | exit(1);
20 | }
21 |
22 | if (image.channels() != 3) {
23 | std::cerr << "Image must be color!" << std::endl;
24 | exit(1);
25 | }
26 |
27 | if (!image.isContinuous()) {
28 | std::cerr << "Image isn't continuous!" << std::endl;
29 | exit(1);
30 | }
31 |
32 | *imagePtr = new float[image.rows * image.cols * image.channels()];
33 |
34 | float *cvPtr = image.ptr(0);
35 | for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i)
36 | (*imagePtr)[i] = cvPtr[i];
37 |
38 | *numRows = image.rows;
39 | *numCols = image.cols;
40 | }
41 |
42 | void loadImageRGBA(const std::string &filename,
43 | uchar4 **imagePtr,
44 | size_t *numRows, size_t *numCols)
45 | {
46 | cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
47 | if (image.empty()) {
48 | std::cerr << "Couldn't open file: " << filename << std::endl;
49 | exit(1);
50 | }
51 |
52 | if (image.channels() != 3) {
53 | std::cerr << "Image must be color!" << std::endl;
54 | exit(1);
55 | }
56 |
57 | if (!image.isContinuous()) {
58 | std::cerr << "Image isn't continuous!" << std::endl;
59 | exit(1);
60 | }
61 |
62 | cv::Mat imageRGBA;
63 | cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
64 |
65 | *imagePtr = new uchar4[image.rows * image.cols];
66 |
67 | unsigned char *cvPtr = imageRGBA.ptr(0);
68 | for (size_t i = 0; i < image.rows * image.cols; ++i) {
69 | (*imagePtr)[i].x = cvPtr[4 * i + 0];
70 | (*imagePtr)[i].y = cvPtr[4 * i + 1];
71 | (*imagePtr)[i].z = cvPtr[4 * i + 2];
72 | (*imagePtr)[i].w = cvPtr[4 * i + 3];
73 | }
74 |
75 | *numRows = image.rows;
76 | *numCols = image.cols;
77 | }
78 |
79 | void saveImageRGBA(const uchar4* const image,
80 | const size_t numRows, const size_t numCols,
81 | const std::string &output_file)
82 | {
83 | int sizes[2];
84 | sizes[0] = numRows;
85 | sizes[1] = numCols;
86 | cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image);
87 | cv::Mat imageOutputBGR;
88 | cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR);
89 | //output the image
90 | cv::imwrite(output_file.c_str(), imageOutputBGR);
91 | }
92 |
93 | //output an exr file
94 | //assumed to already be BGR
95 | void saveImageHDR(const float* const image,
96 | const size_t numRows, const size_t numCols,
97 | const std::string &output_file)
98 | {
99 | int sizes[2];
100 | sizes[0] = numRows;
101 | sizes[1] = numCols;
102 |
103 | cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image);
104 |
105 | imageHDR = imageHDR * 255;
106 |
107 | cv::imwrite(output_file.c_str(), imageHDR);
108 | }
109 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/loadSaveImage.h:
--------------------------------------------------------------------------------
1 | #ifndef LOADSAVEIMAGE_H__
2 | #define LOADSAVEIMAGE_H__
3 |
4 | #include
5 | #include //for uchar4
6 |
7 | void loadImageHDR(const std::string &filename,
8 | float **imagePtr,
9 | size_t *numRows, size_t *numCols);
10 |
11 | void loadImageRGBA(const std::string &filename,
12 | uchar4 **imagePtr,
13 | size_t *numRows, size_t *numCols);
14 |
15 | void saveImageRGBA(const uchar4* const image,
16 | const size_t numRows, const size_t numCols,
17 | const std::string &output_file);
18 |
19 | void saveImageHDR(const float* const image,
20 | const size_t numRows, const size_t numCols,
21 | const std::string &output_file);
22 |
23 | #endif
24 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/main.cpp:
--------------------------------------------------------------------------------
1 | //Udacity HW4 Driver
2 |
3 | #include
4 | #include "timer.h"
5 | #include "utils.h"
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #include "compare.h"
12 | #include "reference_calc.h"
13 |
14 | void preProcess(unsigned int **inputVals,
15 | unsigned int **inputPos,
16 | unsigned int **outputVals,
17 | unsigned int **outputPos,
18 | size_t &numElems,
19 | const std::string& filename,
20 | const std::string& template_file);
21 |
22 | void postProcess(const unsigned int* const outputVals,
23 | const unsigned int* const outputPos,
24 | const size_t numElems,
25 | const std::string& output_file);
26 |
27 | void your_sort(unsigned int* const inputVals,
28 | unsigned int* const inputPos,
29 | unsigned int* const outputVals,
30 | unsigned int* const outputPos,
31 | const size_t numElems);
32 |
33 | void PrintFullPath(char * partialPath)
34 | {
35 | char full[_MAX_PATH];
36 | if (_fullpath(full, partialPath, _MAX_PATH) != NULL)
37 | printf("Full path is: %s\n", full);
38 | else
39 | printf("Invalid path\n");
40 | }
41 |
42 | int main(int argc, char **argv) {
43 | unsigned int *inputVals;
44 | unsigned int *inputPos;
45 | unsigned int *outputVals;
46 | unsigned int *outputPos;
47 |
48 | size_t numElems = 4;
49 | PrintFullPath(".\\");
50 | std::string input_file;
51 | std::string template_file;
52 | std::string output_file;
53 | std::string reference_file;
54 | double perPixelError = 0.0;
55 | double globalError = 0.0;
56 | bool useEpsCheck = false;
57 |
58 | switch (argc)
59 | {
60 | case 3:
61 | input_file = std::string(argv[1]);
62 | template_file = std::string(argv[2]);
63 | output_file = "HW4_output.png";
64 | break;
65 | case 4:
66 | input_file = std::string(argv[1]);
67 | template_file = std::string(argv[2]);
68 | output_file = std::string(argv[3]);
69 | break;
70 | default:
71 | std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl;
72 | exit(1);
73 | }
74 | //load the image and give us our input and output pointers
75 | preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems, input_file, template_file);
76 |
77 | /*
78 | // Use small array to Debug
79 | checkCudaErrors(cudaMalloc(&inputVals, sizeof(unsigned int)* numElems));
80 | checkCudaErrors(cudaMalloc(&inputPos, sizeof(unsigned int)* numElems));
81 | checkCudaErrors(cudaMalloc(&outputVals, sizeof(unsigned int)* numElems));
82 | checkCudaErrors(cudaMalloc(&outputPos, sizeof(unsigned int)* numElems));
83 | unsigned int ll[4] = { 0, 5, 2, 7 };
84 | thrust::host_vector h_v(ll, ll+4);
85 | printf("%d %d %d %d\n", h_v[0], h_v[1], h_v[2], h_v[3]);
86 | thrust::device_vector d_v = h_v;
87 | cudaMemcpy(inputVals, thrust::raw_pointer_cast(d_v.data()), sizeof(unsigned int)* numElems, cudaMemcpyDeviceToDevice);
88 | cudaMemcpy(inputPos, thrust::raw_pointer_cast(d_v.data()), sizeof(unsigned int)* numElems, cudaMemcpyDeviceToDevice);
89 | */
90 |
91 | GpuTimer timer;
92 | timer.Start();
93 |
94 | thrust::device_ptr d_inputVals(inputVals);
95 | thrust::device_ptr d_inputPos(inputPos);
96 |
97 | thrust::host_vector h_inputVals(d_inputVals,
98 | d_inputVals + numElems);
99 | thrust::host_vector h_inputPos(d_inputPos,
100 | d_inputPos + numElems);
101 |
102 | //call the students' code
103 | your_sort(inputVals, inputPos, outputVals, outputPos, numElems);
104 |
105 | timer.Stop();
106 | cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
107 | printf("\n");
108 | int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
109 |
110 | if (err < 0) {
111 | //Couldn't print! Probably the student closed stdout - bad news
112 | std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
113 | exit(1);
114 | }
115 |
116 | // TODO: something wrong with the function postProcess??
117 | //check results and output the red-eye corrected image
118 | //postProcess(outputVals, outputPos, numElems, output_file);
119 |
120 | // check code moved from HW4.cu
121 | /****************************************************************************
122 | * You can use the code below to help with debugging, but make sure to *
123 | * comment it out again before submitting your assignment for grading, *
124 | * otherwise this code will take too much time and make it seem like your *
125 | * GPU implementation isn't fast enough. *
126 | * *
127 | * This code MUST RUN BEFORE YOUR CODE in case you accidentally change *
128 | * the input values when implementing your radix sort. *
129 | * *
130 | * This code performs the reference radix sort on the host and compares your *
131 | * sorted values to the reference. *
132 | * *
133 | * Thrust containers are used for copying memory from the GPU *
134 | * ************************************************************************* */
135 | ;
136 |
137 | thrust::host_vector h_outputVals(numElems);
138 | thrust::host_vector h_outputPos(numElems);
139 |
140 | reference_calculation(&h_inputVals[0], &h_inputPos[0],
141 | &h_outputVals[0], &h_outputPos[0],
142 | numElems);
143 |
144 | //postProcess(valsPtr, posPtr, numElems, reference_file);
145 |
146 | //compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
147 |
148 | thrust::device_ptr d_outputVals(outputVals);
149 | thrust::device_ptr d_outputPos(outputPos);
150 |
151 | thrust::host_vector h_yourOutputVals(d_outputVals,
152 | d_outputVals + numElems);
153 | thrust::host_vector h_yourOutputPos(d_outputPos,
154 | d_outputPos + numElems);
155 |
156 | checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems);
157 | checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems);
158 |
159 | checkCudaErrors(cudaFree(inputVals));
160 | checkCudaErrors(cudaFree(inputPos));
161 | checkCudaErrors(cudaFree(outputVals));
162 | checkCudaErrors(cudaFree(outputPos));
163 |
164 | return 0;
165 | }
166 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 4/red_eye_effect.gold
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 4/red_eye_effect_5.jpg
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect_template_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wykvictor/cs344-cuda-udacity/d0ffd99c30f029a277f5299eedea8da887d5fad4/Problem Sets/Problem Set 4/red_eye_effect_template_5.jpg
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/reference_calc.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | // For memset
3 | #include
4 |
// Host reference: least-significant-bit-first radix sort of numElems
// (value, position) pairs in ascending value order. Equal values keep their
// original relative order.
//
//   inputVals / inputPos   - keys and their associated positions; both
//                            arrays are used as scratch space and are
//                            overwritten
//   outputVals / outputPos - receive the sorted keys and the positions that
//                            travelled with them
//
// The four arrays must not overlap and must each hold numElems entries.
void reference_calculation(unsigned int* inputVals,
                           unsigned int* inputPos,
                           unsigned int* outputVals,
                           unsigned int* outputPos,
                           const size_t numElems)
{
  // Unsigned throughout so that shifting the mask into the top bit is well
  // defined.
  const unsigned int numBits = 1;
  const unsigned int numBins = 1u << numBits;

  unsigned int binHistogram[numBins];
  unsigned int binScan[numBins];

  unsigned int *vals_src = inputVals;
  unsigned int *pos_src  = inputPos;

  unsigned int *vals_dst = outputVals;
  unsigned int *pos_dst  = outputPos;

  //a simple radix sort - one pass per group of numBits bits, so numBits
  //should divide the width of unsigned int evenly
  for (unsigned int i = 0; i < 8 * sizeof(unsigned int); i += numBits) {
    const unsigned int mask = (numBins - 1) << i;

    memset(binHistogram, 0, sizeof(binHistogram)); //zero out the bins
    memset(binScan, 0, sizeof(binScan));           //zero out the bins

    //perform histogram of data & mask into bins
    for (size_t j = 0; j < numElems; ++j) {
      unsigned int bin = (vals_src[j] & mask) >> i;
      binHistogram[bin]++;
    }

    //perform exclusive prefix sum (scan) on binHistogram to get starting
    //location for each bin
    for (unsigned int j = 1; j < numBins; ++j) {
      binScan[j] = binScan[j - 1] + binHistogram[j - 1];
    }

    //Gather everything into the correct location
    //need to move vals and positions
    for (size_t j = 0; j < numElems; ++j) {
      unsigned int bin = (vals_src[j] & mask) >> i;
      vals_dst[binScan[bin]] = vals_src[j];
      pos_dst[binScan[bin]]  = pos_src[j];
      binScan[bin]++;
    }

    //swap the buffers (pointers only)
    std::swap(vals_dst, vals_src);
    std::swap(pos_dst, pos_src);
  }

  //after the final swap the sorted data is in the *_src buffers; with an
  //even number of passes that is the input buffer, so copy it across
  if (vals_src != outputVals) {
    std::copy(vals_src, vals_src + numElems, outputVals);
    std::copy(pos_src, pos_src + numElems, outputPos);
  }
}
63 |
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 |
4 |
5 | //A simple un-optimized reference radix sort calculation (CPU, least-significant bits first)
6 | //Only deals with power-of-2 radices
7 | //Sorts the values in ascending order and moves every position along with its value.
8 | //The sorted result ends up in outputVals/outputPos; the input arrays are used as scratch space.
9 | void reference_calculation(unsigned int* inputVals, //values to sort; overwritten while sorting
10 | unsigned int* inputPos, //position paired with each value; overwritten while sorting
11 | unsigned int* outputVals, //receives the values in ascending order
12 | unsigned int* outputPos, //receives the positions, permuted like outputVals
13 | const size_t numElems); //number of elements in each of the four arrays
14 | #endif
--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/student_func.cu:
--------------------------------------------------------------------------------
1 | //Udacity HW 4
2 | //Radix Sorting
3 |
4 | #include "utils.h"
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | /* Red Eye Removal
12 | ===============
13 |
14 | For this assignment we are implementing red eye removal. This is
15 | accomplished by first creating a score for every pixel that tells us how
16 | likely it is to be a red eye pixel. We have already done this for you - you
17 | are receiving the scores and need to sort them in ascending order so that we
18 | know which pixels to alter to remove the red eye.
19 |
20 | Note: ascending order == smallest to largest
21 |
22 | Each score is associated with a position, when you sort the scores, you must
23 | also move the positions accordingly.
24 |
25 | Implementing Parallel Radix Sort with CUDA
26 | ==========================================
27 |
28 | The basic idea is to construct a histogram on each pass of how many of each
29 | "digit" there are. Then we scan this histogram so that we know where to put
30 | the output of each digit. For example, the first 1 must come after all the
31 | 0s so we have to know how many 0s there are to be able to start moving 1s
32 | into the correct position.
33 |
34 | 1) Histogram of the number of occurrences of each digit
35 | 2) Exclusive Prefix Sum of Histogram
36 | 3) Determine relative offset of each digit
37 | For example [0 0 1 1 0 0 1]
38 | -> [0 1 4 5 2 3 6]
39 | 4) Combine the results of steps 2 & 3 to determine the final
40 | output location for each element and move it there
41 |
42 | LSB Radix sort is an out-of-place sort and you will need to ping-pong values
43 | between the input and output buffers we have provided. Make sure the final
44 | sorted results end up in the output buffer! Hint: You may need to do a copy
45 | at the end.
46 |
47 | */
48 |
49 | //#define USE_THRUST
50 |
51 | // Debug helper: every thread prints the element of d_out selected by its
52 | // threadIdx.x, followed by a space. The block index is not used, so each
53 | // block of a multi-block launch prints the same leading elements.
54 | __global__ void print_kernel(unsigned int *d_out)
55 | {
56 |   // %u matches the unsigned element type; with %d any value above INT_MAX
57 |   // would be shown as a negative number
58 |   printf("%u ", d_out[threadIdx.x]);
59 | }
55 |
56 |
57 | // Two-bin histogram of a single bit: for each of the first numElems values
58 | // of d_in, adds one to d_out[0] when bit `shift` is clear and to d_out[1]
59 | // when it is set. One thread handles one element.
60 | // d_out is only ever incremented here, so it presumably has to be zeroed by
61 | // the caller before the launch (caller not visible here - confirm).
62 | __global__ void histo_kernel(unsigned int * d_out, unsigned int* const d_in,
63 |                              unsigned int shift, const unsigned int numElems)
64 | {
65 |   const unsigned int myId = threadIdx.x + blockDim.x * blockIdx.x;
66 |   if (myId >= numElems) return;  // this thread has no element to count
67 |   // extract the bit with unsigned arithmetic only; the previous mask
68 |   // 1 << shift was a signed shift that reaches the sign bit at shift == 31
69 |   const unsigned int bin = (d_in[myId] >> shift) & 1u;
70 |   atomicAdd(&d_out[bin], 1u);    // many threads hit the same two counters
71 | }
66 |
67 | // Single-block exclusive prefix sum, computed in place on d_in.
68 | //
69 | // Note: this is the step-doubling (Hillis-Steele style) inclusive scan done
70 | // in shared memory and shifted by one element on write-back - not the
71 | // work-efficient Blelloch scan.
72 | //
73 | // d_in     : values to scan; overwritten with the exclusive sums.
74 | // numBins  : bounds the doubling loop (d = 1, 2, 4, ... while d < numBins);
75 | //            it should be at least numElems for the scan to be complete.
76 | // numElems : number of elements scanned; threads with a larger index only
77 | //            take part in the barriers.
78 | //
79 | // Only threadIdx.x is used for indexing, so a single block must cover all
80 | // elements, and the launch must supply at least
81 | // numElems * sizeof(unsigned int) bytes of dynamic shared memory.
82 | __global__ void sumscan_kernel(unsigned int * d_in, const size_t numBins, const unsigned int numElems)
83 | {
84 |   // unsigned int instead of float: same size per element, but counts above
85 |   // 2^24 are no longer rounded
86 |   extern __shared__ unsigned int s_scan[];
87 |
88 |   const unsigned int myId = threadIdx.x;
89 |   // threads without an element stay alive so that every thread of the block
90 |   // reaches each __syncthreads() below
91 |   const bool active = myId < numElems;
92 |
93 |   if (active) s_scan[myId] = d_in[myId];
94 |   __syncthreads(); // make sure entire block is loaded!
95 |
96 |   for (unsigned int d = 1; d < numBins; d *= 2) {
97 |     // read the partner value first ...
98 |     unsigned int addend = 0;
99 |     if (active && myId >= d) addend = s_scan[myId - d];
100 |     __syncthreads(); // ... and only write once every thread has read,
101 |                      // otherwise a neighbour could see an updated value
102 |     if (active) s_scan[myId] += addend;
103 |     __syncthreads();
104 |   }
105 |
106 |   if (!active) return;
107 |   //inclusive->exclusive: element i receives the inclusive sum of element i-1
108 |   d_in[myId] = (myId == 0) ? 0u : s_scan[myId - 1];
109 | }
85 |
86 | // Builds the per-element flag array for bit `shift`: d_scan[i] is 1 when
87 | // that bit of d_in[i] is clear and 0 when it is set, for i < numElems.
88 | // One thread handles one element.
89 | // (Presumably the flags are prefix-summed afterwards so that every zero-bit
90 | // element learns its output slot - confirm against the caller.)
91 | __global__ void makescan_kernel(unsigned int * d_in, unsigned int *d_scan,
92 |                                 unsigned int shift, const unsigned int numElems)
93 | {
94 |   const unsigned int myId = threadIdx.x + blockDim.x * blockIdx.x;
95 |   if (myId >= numElems) return;  // this thread has no element
96 |   // unsigned bit test; the previous mask 1 << shift was a signed shift that
97 |   // reaches the sign bit at shift == 31
98 |   const unsigned int bit = (d_in[myId] >> shift) & 1u;
99 |   d_scan[myId] = bit ? 0u : 1u;
100 | }
94 |
95 | // Scatter step of one radix pass on bit `shift`: copies every value/position
96 | // pair from the input arrays to its destination slot in the output arrays.
97 | // One thread handles one element.
98 | //
99 | //  - bit clear: destination is d_scaned[i]
100 | //  - bit set  : destination is i + d_histogram[1] - d_scaned[i]
101 | //
102 | // These are the correct slots if d_scaned[i] is the number of zero-bit
103 | // elements before i and d_histogram[1] is the total number of zero-bit
104 | // elements (i.e. the histogram has already been exclusive-scanned).
105 | // NOTE(review): both depend on the caller, which is not visible here - confirm.
106 | __global__ void move_kernel(unsigned int* const d_inputVals,
107 |                             unsigned int* const d_inputPos,
108 |                             unsigned int* const d_outputVals,
109 |                             unsigned int* const d_outputPos,
110 |                             const unsigned int numElems,
111 |                             unsigned int* const d_histogram,
112 |                             unsigned int* const d_scaned,
113 |                             unsigned int shift)
114 | {
115 |   const unsigned int myId = threadIdx.x + blockDim.x * blockIdx.x;
116 |   if (myId >= numElems) return;  // this thread has no element
117 |
118 |   // Important!
119 |   // Algorithm described in 7.4 of http://wykvictor.github.io/2016/04/03/Cuda-2.html
120 |   // The bit is tested with unsigned arithmetic; the previous mask 1 << shift
121 |   // was a signed shift that reaches the sign bit at shift == 31.
122 |   const bool bitSet = ((d_inputVals[myId] >> shift) & 1u) != 0u;
123 |
124 |   // (myId - d_scaned[myId]) counts the one-bit elements in front of this one
125 |   const unsigned int des_id = bitSet
126 |       ? myId + d_histogram[1] - d_scaned[myId]
127 |       : d_scaned[myId];
128 |
129 |   d_outputVals[des_id] = d_inputVals[myId];
130 |   d_outputPos[des_id]  = d_inputPos[myId];
131 | }
118 |
119 | #ifdef USE_THRUST
120 | void your_sort(unsigned int* const d_inputVals,
121 | unsigned int* const d_inputPos,
122 | unsigned int* const d_outputVals,
123 | unsigned int* const d_outputPos,
124 | const size_t numElems)
125 | {
126 | // Thrust vectors wrapping raw GPU data
127 | thrust::device_ptr