├── .gitignore
├── CMakeLists.txt
├── Final
    ├── batcher
    │   ├── batcher.cu
    │   ├── compare.h
    │   └── gputimer.h
    ├── smooth
    │   ├── compare.h
    │   ├── gputimer.h
    │   └── smooth.cu
    └── warpreduce
    │   ├── part_a
    │       ├── compare.h
    │       ├── gputimer.h
    │       └── warpreduce.cu
    │   └── part_b
    │       ├── compare.h
    │       ├── gputimer.h
    │       └── warpreduce.cu
├── Lesson Code Snippets
    ├── Lesson 2 Code Snippets
    │   ├── associative.cu
    │   ├── atomics.cu
    │   ├── gputimer.h
    │   ├── hello_blockIdx.cu
    │   ├── hello_threadIdx.cu
    │   └── memory.cu
    ├── Lesson 3 Code Snippets
    │   ├── histo.cu
    │   └── reduce.cu
    ├── Lesson 5 Code Snippets
    │   ├── deviceQuery_simplified.cpp
    │   └── transpose.cu
    └── Lesson 7 Code Snippets
    │   ├── cub
    │       └── example_block_scan_cum.cu
    │   ├── thrust
    │       ├── gputimer.h
    │       └── thrust_example.cu
    │   └── tiling
    │       ├── gputimer.h
    │       ├── tiling.cu
    │       └── utils.h
├── Lesson Slides
    ├── CS344_Lesson1_Slides.pdf
    ├── CS344_Lesson2_Slides.pdf
    ├── CS344_Lesson3_Slides.pdf
    ├── CS344_Lesson4_Slides.pdf
    ├── CS344_Lesson5_Slides.pdf
    ├── CS344_Lesson6.1_Slides.pdf
    ├── CS344_Lesson6.2_Slides.pdf
    ├── CS344_Lesson7.1_Slides.pdf
    └── CS344_Lesson7.2_Slides.pdf
├── Problem Sets
    ├── Problem Set 1.zip
    ├── Problem Set 1
    │   ├── CMakeLists.txt
    │   ├── HW1.cpp
    │   ├── Makefile
    │   ├── cinque_terre.gold
    │   ├── cinque_terre_small.jpg
    │   ├── compare.cpp
    │   ├── compare.h
    │   ├── main.cpp
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── student_func.cu
    │   ├── timer.h
    │   └── utils.h
    ├── Problem Set 2.zip
    ├── Problem Set 2
    │   ├── CMakeLists.txt
    │   ├── HW2.cpp
    │   ├── Makefile
    │   ├── cinque_terre.gold
    │   ├── cinque_terre_small.jpg
    │   ├── compare.cpp
    │   ├── compare.h
    │   ├── main.cpp
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── student_func.cu
    │   ├── timer.h
    │   └── utils.h
    ├── Problem Set 3.zip
    ├── Problem Set 3
    │   ├── CMakeLists.txt
    │   ├── HW3.cu
    │   ├── Makefile
    │   ├── compare.cpp
    │   ├── compare.h
    │   ├── loadSaveImage.cpp
    │   ├── loadSaveImage.h
    │   ├── main.cpp
    │   ├── memorial.exr
    │   ├── memorial_large.exr
    │   ├── memorial_png.gold
    │   ├── memorial_png_large.gold
    │   ├── memorial_raw.png
    │   ├── memorial_raw_large.png
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── student_func.cu
    │   ├── timer.h
    │   └── utils.h
    ├── Problem Set 4.zip
    ├── Problem Set 4
    │   ├── CMakeLists.txt
    │   ├── HW4.cu
    │   ├── Makefile
    │   ├── compare.cpp
    │   ├── compare.h
    │   ├── loadSaveImage.cpp
    │   ├── loadSaveImage.h
    │   ├── main.cpp
    │   ├── red_eye_effect.gold
    │   ├── red_eye_effect_5.jpg
    │   ├── red_eye_effect_template_5.jpg
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── student_func.cu
    │   ├── timer.h
    │   └── utils.h
    ├── Problem Set 5.zip
    ├── Problem Set 5
    │   ├── CMakeLists.txt
    │   ├── Makefile
    │   ├── main.cu
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── student.cu
    │   ├── timer.h
    │   └── utils.h
    ├── Problem Set 6.zip
    └── Problem Set 6
    │   ├── CMakeLists.txt
    │   ├── HW6.cu
    │   ├── Makefile
    │   ├── blended.gold
    │   ├── compare.cpp
    │   ├── compare.h
    │   ├── destination.png
    │   ├── loadSaveImage.cpp
    │   ├── loadSaveImage.h
    │   ├── main.cpp
    │   ├── reference_calc.cpp
    │   ├── reference_calc.h
    │   ├── source.png
    │   ├── student_func.cu
    │   ├── timer.h
    │   └── utils.h
├── README.md
└── Student Contributions
    └── Notes
        ├── Unit3 Notes
            ├── NotesUnit3.pdf
            └── NotesUnit3Small.pdf
        └── Unit4 Notes
            ├── NotesUnit4.pdf
            └── NotesUnit4_Small.pdf


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Object files
 2 | *.o
 3 | 
 4 | # Libraries
 5 | *.lib
 6 | *.a
 7 | 
 8 | # Shared objects (inc. Windows DLLs)
 9 | *.dll
10 | *.so
11 | *.so.*
12 | *.dylib
13 | 
14 | # Executables
15 | *.exe
16 | *.out
17 | *.app
18 | 
19 | # OS X stuff
20 | .DS_Store
21 | 
22 | build
23 | bin
24 | 
25 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | cmake_minimum_required(VERSION 2.6 FATAL_ERROR)
 9 | project(cs344)
10 | 
11 | # Both packages are hard requirements; configuration stops if one is missing.
12 | find_package(OpenCV REQUIRED)
13 | find_package(CUDA REQUIRED)
14 | 
15 | # Directory-scoped on purpose: targets added after this point, including the
16 | # ones from the subdirectories below, link against the OpenCV libraries.
17 | link_libraries(${OpenCV_LIBS})
18 | 
19 | # Put every executable into <top-level source dir>/bin.
20 | set(EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/bin/")
21 | 
22 | if(CUDA_FOUND)
23 |   # compared to class settings, we let NVidia's FindCUDA CMake detect
24 |   # whether to build x64.  We tell it to support most devices, though,
25 |   # to make sure more people can easily run class code without knowing
26 |   # about this compiler argument.
27 |   #
28 |   # FindCUDA expects CUDA_NVCC_FLAGS to be a ;-separated list with one
29 |   # command-line argument per element, so the flags are written as a plain
30 |   # CMake list rather than as one multi-line string (which embedded
31 |   # newlines and spaces into the individual arguments).
32 |   set(CUDA_NVCC_FLAGS
33 |     -gencode arch=compute_30,code=sm_30
34 |     -gencode arch=compute_35,code=sm_35
35 |     -gencode arch=compute_35,code=compute_35
36 |     -gencode arch=compute_20,code=sm_20
37 |     -gencode arch=compute_11,code=sm_11
38 |     -gencode arch=compute_12,code=sm_12
39 |     -gencode arch=compute_13,code=sm_13
40 |   )
41 | 
42 |   # The hard-coded clang host compiler and -stdlib=libstdc++ only apply when
43 |   # the host compiler is clang; gcc does not generally accept -stdlib=...
44 |   # Append to CMAKE_CXX_FLAGS instead of overwriting it, so flags supplied
45 |   # by the user on the command line are kept.
46 |   if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
47 |     list(APPEND CUDA_NVCC_FLAGS -ccbin /usr/bin/clang)
48 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++")
49 |   endif()
50 | 
51 |   # add -Wextra compiler flag for gcc compilations
52 |   if(UNIX)
53 |     list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wextra)
54 |   endif()
55 | 
56 |   # add debugging to CUDA NVCC flags.  For NVidia's NSight tools.
57 |   list(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
58 | 
59 |   # The problem sets live in "Problem Sets/Problem Set N".  The explicit
60 |   # binary directory (HWN) keeps spaces out of the build-tree paths.
61 |   add_subdirectory("Problem Sets/Problem Set 1" HW1)
62 |   add_subdirectory("Problem Sets/Problem Set 2" HW2)
63 |   add_subdirectory("Problem Sets/Problem Set 3" HW3)
64 |   add_subdirectory("Problem Sets/Problem Set 4" HW4)
65 |   add_subdirectory("Problem Sets/Problem Set 5" HW5)
66 |   add_subdirectory("Problem Sets/Problem Set 6" HW6)
67 | else()
68 |   message("CUDA is not installed on this system.")
69 | endif()
70 | 


--------------------------------------------------------------------------------
/Final/batcher/batcher.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <cuda_runtime.h>
 4 | #include "compare.h"
 5 | #include "gputimer.h"
 6 | 
 7 | // http://en.wikipedia.org/wiki/Bitonic_sort
 8 | __global__ void batcherBitonicMergesort64(float * d_out, const float * d_in)
 9 | {
10 |     // you are guaranteed this is called with <<<1, 64, 64*4>>>
11 |     extern __shared__ float sdata[];
12 |     int tid  = threadIdx.x;
13 |     sdata[tid] = d_in[tid];
14 |     __syncthreads();
15 |     
16 |     for (int stage = 0; stage <= 5; stage++)
17 |     {
18 |         for (int substage = stage; substage >= 0; substage--)
19 |         {
20 |             // TODO
21 |         }
22 |     }
23 | 
24 |     d_out[tid] = sdata[tid];
25 | }
26 | 
27 | int compareFloat (const void * a, const void * b)
28 | {
29 |   const float lhs = *(float*)a;
30 |   const float rhs = *(float*)b;
31 |   // qsort comparator: -1 if lhs < rhs, 1 if lhs > rhs, 0 when equal
32 |   return (lhs > rhs) - (lhs < rhs);   // unordered operands also yield 0
33 | }
34 | 
35 | int main(int argc, char **argv)
36 | {
37 |     const int ARRAY_SIZE = 64;
38 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
39 | 
40 |     // generate the input array on the host
41 |     float h_in[ARRAY_SIZE];
42 |     float h_sorted[ARRAY_SIZE];
43 |     float h_out[ARRAY_SIZE];
44 |     for(int i = 0; i < ARRAY_SIZE; i++) {
45 |         // generate random float in [0, 1]
46 |         h_in[i] = (float)random()/(float)RAND_MAX;
47 |         h_sorted[i] = h_in[i];
48 |     }
49 |     qsort(h_sorted, ARRAY_SIZE, sizeof(float), compareFloat);
50 | 
51 |     // declare GPU memory pointers
52 |     float * d_in, * d_out;
53 | 
54 |     // allocate GPU memory
55 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
56 |     cudaMalloc((void **) &d_out, ARRAY_BYTES);
57 | 
58 |     // transfer the input array to the GPU
59 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
60 | 
61 |     // launch the kernel
62 |     GpuTimer timer;
63 |     timer.Start();
64 |     batcherBitonicMergesort64<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(float)>>>(d_out, d_in);
65 |     timer.Stop();
66 |     
67 |     printf("Your code executed in %g ms\n", timer.Elapsed());
68 |     
69 |     // copy back the sorted array from GPU
70 |     cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
71 | 
72 |     compare(h_out, h_sorted, ARRAY_SIZE);
73 |   
74 |     // free GPU memory allocation
75 |     cudaFree(d_in);
76 |     cudaFree(d_out);
77 |         
78 |     return 0;
79 | }
80 | 


--------------------------------------------------------------------------------
/Final/batcher/compare.h:
--------------------------------------------------------------------------------
 1 | // Checks the GPU output against the host-sorted reference array.
 2 | //
 3 | //   h_out      - array copied back from the device
 4 | //   h_sorted   - reference array (sorted on the host)
 5 | //   ARRAY_SIZE - number of elements compared in both arrays
 6 | //
 7 | // Prints one line per mismatching index, or a success message when every
 8 | // element is equal.  Returns 0 on a full match and 1 otherwise.
 9 | // Relies on the including file to provide printf (<stdio.h>).
10 | int compare(float *h_out, float *h_sorted, int ARRAY_SIZE)
11 | {
12 |     int failure = 0;
13 |     for(int i = 0; i < ARRAY_SIZE; i++) {
14 |         if (h_out[i] != h_sorted[i]) {
15 |             printf("Oops! Index %i is %f, should be %f\n",
16 |                    i, h_out[i], h_sorted[i]);
17 |             failure = 1;
18 |         }
19 |     }
20 | 
21 |     if (failure == 0){
22 |         // terminate the line, like the success messages of the other checkers
23 |         printf("Success! Your bitonic sort worked.\n");
24 |     }
25 | 
26 |     return failure;
27 | }


--------------------------------------------------------------------------------
/Final/batcher/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | struct GpuTimer
 5 | {
 6 |       cudaEvent_t start;
 7 |       cudaEvent_t stop;
 8 |  
 9 |       GpuTimer()
10 |       {
11 |             cudaEventCreate(&start);
12 |             cudaEventCreate(&stop);
13 |       }
14 |  
15 |       ~GpuTimer()
16 |       {
17 |             cudaEventDestroy(start);
18 |             cudaEventDestroy(stop);
19 |       }
20 |  
21 |       void Start()
22 |       {
23 |             cudaEventRecord(start, 0);
24 |       }
25 |  
26 |       void Stop()
27 |       {
28 |             cudaEventRecord(stop, 0);
29 |       }
30 |  
31 |       float Elapsed()
32 |       {
33 |             float elapsed;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&elapsed, start, stop);
36 |             return elapsed;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Final/smooth/compare.h:
--------------------------------------------------------------------------------
 1 | int compare(float* h_in, float* h_out, float* h_out_shared, float* h_cmp, int ARRAY_SIZE){
 2 |     int failure = 0;
 3 |     for(int i = 0; i < ARRAY_SIZE; i++) {
 4 |         if (h_out[i] != h_cmp[i]) {
 5 |             fprintf(stderr, "ERROR: h_in[%d] is %f, h_out[%d] is %f, h_cmp[%d] is %f\n",
 6 |                     i, h_in[i], i, h_out[i], i, h_cmp[i]);
 7 |             failure = 1;
 8 |         }
 9 |         if (h_out_shared[i] != h_cmp[i]) {
10 |             fprintf(stderr, "ERROR: h_in[%d] is %f, h_out_shared[%d] is %f, h_cmp[%d] is %f\n",
11 |                     i, h_in[i], i, h_out_shared[i], i, h_cmp[i]);
12 |             failure = 1;
13 |         }
14 |     }
15 | 
16 |     if (failure == 0)
17 |     {
18 |         printf("Success! Your smooth code worked!\n");
19 |     }
20 | 
21 |     return failure;
22 | }


--------------------------------------------------------------------------------
/Final/smooth/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | struct GpuTimer
 5 | {
 6 |       cudaEvent_t start;
 7 |       cudaEvent_t stop;
 8 |  
 9 |       GpuTimer()
10 |       {
11 |             cudaEventCreate(&start);
12 |             cudaEventCreate(&stop);
13 |       }
14 |  
15 |       ~GpuTimer()
16 |       {
17 |             cudaEventDestroy(start);
18 |             cudaEventDestroy(stop);
19 |       }
20 |  
21 |       void Start()
22 |       {
23 |             cudaEventRecord(start, 0);
24 |       }
25 |  
26 |       void Stop()
27 |       {
28 |             cudaEventRecord(stop, 0);
29 |       }
30 |  
31 |       float Elapsed()
32 |       {
33 |             float elapsed;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&elapsed, start, stop);
36 |             return elapsed;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Final/smooth/smooth.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <cuda_runtime.h>
 4 | #include "compare.h"
 5 | #include "gputimer.h"
 6 | 
 7 | // Reference
 8 | __global__ void smooth(float * v_new, const float * v) {
 9 |     int myIdx = threadIdx.x * gridDim.x + blockIdx.x;
10 |     int numThreads = blockDim.x * gridDim.x;
11 |     int myLeftIdx = (myIdx == 0) ? 0 : myIdx - 1;
12 |     int myRightIdx = (myIdx == (numThreads - 1)) ? numThreads - 1 : myIdx + 1;
13 |     float myElt = v[myIdx];
14 |     float myLeftElt = v[myLeftIdx];
15 |     float myRightElt = v[myRightIdx];
16 |     v_new[myIdx] = 0.25f * myLeftElt + 0.5f * myElt + 0.25f * myRightElt;
17 | }
18 | 
19 | // Your code
20 | __global__ void smooth_shared(float * v_new, const float * v) {
21 |     extern __shared__ float s[];  // dynamic shared memory, sized at launch
22 |     // TODO: Fill in the rest of this function and store the result in v_new.
23 |     // (A __global__ kernel returns void, so there is no value to return.)
24 | }
25 | 
26 | int main(int argc, char **argv)
27 | {
28 | 
29 |     const int ARRAY_SIZE = 4096;
30 |     const int BLOCK_SIZE = 256;
31 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
32 | 
33 |     // generate the input array on the host
34 |     float h_in[ARRAY_SIZE];
35 |     float h_cmp[ARRAY_SIZE];
36 |     float h_out[ARRAY_SIZE];
37 |     float h_out_shared[ARRAY_SIZE];
38 |     for(int i = 0; i < ARRAY_SIZE; i++) {
39 |         // generate random float in [0, 1]
40 |         h_in[i] = (float)random()/(float)RAND_MAX;
41 |     }
42 |     for(int i = 0; i < ARRAY_SIZE; i++) {
43 |         h_cmp[i] = (0.25f * h_in[(i == 0) ? 0 : i-1] +
44 |                     0.50f * h_in[i] +
45 |                     0.25f * h_in[(i == (ARRAY_SIZE - 1)) ? ARRAY_SIZE - 1 : i+1]);
46 |     }
47 | 
48 |     // declare GPU memory pointers
49 |     float * d_in, * d_out, * d_out_shared;
50 | 
51 |     // allocate GPU memory
52 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
53 |     cudaMalloc((void **) &d_out, ARRAY_BYTES);
54 |     cudaMalloc((void **) &d_out_shared, ARRAY_BYTES);
55 | 
56 |     // transfer the input array to the GPU
57 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
58 | 
59 |     // cudaEvent_t start, stop;
60 |     // cudaEventCreate(&start);
61 |     // cudaEventCreate(&stop);
62 |     // launch the kernel
63 |     smooth<<<ARRAY_SIZE / BLOCK_SIZE, BLOCK_SIZE>>>(d_out, d_in);
64 |     GpuTimer timer;
65 |     timer.Start();
66 |     smooth_shared<<<ARRAY_SIZE / BLOCK_SIZE, BLOCK_SIZE, (BLOCK_SIZE + 2) * sizeof(float)>>>(d_out_shared, d_in);
67 |     timer.Stop();
68 | 
69 |     printf("Your code executed in %g ms\n", timer.Elapsed());
70 |     // cudaEventSynchronize(stop);
71 |     // float elapsedTime;
72 |     // cudaEventElapsedTime(&elapsedTime, start, stop);    
73 | 
74 |     // copy back the result from GPU
75 |     cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
76 |     cudaMemcpy(h_out_shared, d_out_shared, ARRAY_BYTES, cudaMemcpyDeviceToHost);
77 | 
78 |     // testing for correctness
79 |     compare(h_in, h_out, h_out_shared, h_cmp, ARRAY_SIZE);
80 | 
81 |     // free GPU memory allocation
82 |     cudaFree(d_in);
83 |     cudaFree(d_out);
84 |     cudaFree(d_out_shared);
85 |         
86 |     return 0;
87 | }
88 | 


--------------------------------------------------------------------------------
/Final/warpreduce/part_a/compare.h:
--------------------------------------------------------------------------------
 1 | int compare(unsigned int h_out_shared, int sum){
 2 |  	int failure = 0;
 3 |     if (h_out_shared != sum) {
 4 |         fprintf(stderr, "GPU shared sum %d does not match expected sum %d\n", 
 5 |                 h_out_shared, sum);
 6 |         failure = 1;
 7 |     }
 8 | 
 9 |     if (failure == 0)
10 |     {
11 |         printf("Success! Your shared warp reduce worked.\n");
12 |     }
13 |     else{
14 |     	printf("Error! Your shared reduce code's output did not match sum.\n");	
15 |     }
16 | 
17 |     return failure;
18 | }


--------------------------------------------------------------------------------
/Final/warpreduce/part_a/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | struct GpuTimer
 5 | {
 6 |       cudaEvent_t start;
 7 |       cudaEvent_t stop;
 8 |  
 9 |       GpuTimer()
10 |       {
11 |             cudaEventCreate(&start);
12 |             cudaEventCreate(&stop);
13 |       }
14 |  
15 |       ~GpuTimer()
16 |       {
17 |             cudaEventDestroy(start);
18 |             cudaEventDestroy(stop);
19 |       }
20 |  
21 |       void Start()
22 |       {
23 |             cudaEventRecord(start, 0);
24 |       }
25 |  
26 |       void Stop()
27 |       {
28 |             cudaEventRecord(stop, 0);
29 |       }
30 |  
31 |       float Elapsed()
32 |       {
33 |             float elapsed;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&elapsed, start, stop);
36 |             return elapsed;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Final/warpreduce/part_a/warpreduce.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <cuda_runtime.h>
 4 | #include "compare.h"
 5 | #include "gputimer.h"
 6 | 
 7 | // Subpart A:
 8 | // Write step 1 as a kernel that operates on threads 0--31.
 9 | // Assume that the input flags are 0 for false and 1 for true and are stored
10 | // in a local per-thread register called p (for predicate).
11 | //
12 | // You have access to 32 words of shared memory s[0:31], with s[0]
13 | // corresponding to thread 0 and s[31] corresponding to thread 31.
14 | // You may change the values of s[0:31]. Put the return sum in s[0].
15 | // Your code should execute no more than 5 warp-wide addition operations.
16 | 
17 | __device__ unsigned int shared_reduce(unsigned int p, volatile unsigned int * s) {
18 |     // Assumes values in 'p' are either 1 or 0
19 |     // Assumes s[0:31] are allocated
20 |     // Sums p across warp, returning the result. Suggest you put
21 |     // result in s[0] and return it
22 |     // You may change any value in s
23 |     // You should execute no more than 5 + operations (if you're doing
24 |     // 31, you're doing it wrong)
25 |     //
26 |     // TODO: Fill in the rest of this function
27 | 
28 |     return s[0];
29 | }
30 | 
31 | __global__ void reduce(unsigned int * d_out_shared,
32 |                        const unsigned int * d_in)
33 | {
34 |     extern __shared__ unsigned int s[];
35 |     int t = threadIdx.x;
36 |     int p = d_in[t];
37 |     unsigned int sr = shared_reduce(p, s);
38 |     if (t == 0)
39 |     {
40 |         *d_out_shared = sr;
41 |     }
42 | }
43 | 
44 | int main(int argc, char **argv)
45 | {
46 |     const int ARRAY_SIZE = 32;
47 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(unsigned int);
48 | 
49 |     // generate the input array on the host
50 |     unsigned int h_in[ARRAY_SIZE];
51 |     unsigned int sum = 0;
52 |     for(int i = 0; i < ARRAY_SIZE; i++) {
53 |         // generate random float in [0, 1]
54 |         h_in[i] = (float)random()/(float)RAND_MAX > 0.5f ? 1 : 0;
55 |         sum += h_in[i];
56 |     }
57 | 
58 |     // declare GPU memory pointers
59 |     unsigned int * d_in, * d_out_shared;
60 | 
61 |     // allocate GPU memory
62 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
63 |     cudaMalloc((void **) &d_out_shared, sizeof(unsigned int));
64 | 
65 |     // transfer the input array to the GPU
66 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
67 | 
68 |     GpuTimer timer;
69 |     timer.Start();
70 |     // launch the kernel
71 |     reduce<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(unsigned int)>>>
72 |         (d_out_shared, d_in);
73 |     timer.Stop();
74 | 
75 |     printf("Your code executed in %g ms\n", timer.Elapsed());
76 | 
77 |     unsigned int h_out_shared;
78 |     // copy back the sum from GPU
79 |     cudaMemcpy(&h_out_shared, d_out_shared, sizeof(unsigned int), 
80 |                cudaMemcpyDeviceToHost);
81 |     
82 |     compare(h_out_shared, sum);
83 | 
84 |     // free GPU memory allocation
85 |     cudaFree(d_in);
86 |     cudaFree(d_out_shared);
87 | }
88 | 
89 | 


--------------------------------------------------------------------------------
/Final/warpreduce/part_b/compare.h:
--------------------------------------------------------------------------------
 1 | int compare(unsigned int h_out_warp, int sum){
 2 |  	int failure = 0;
 3 |     if (h_out_warp != sum) {
 4 |         fprintf(stderr, "GPU warp sum %d does not match expected sum %d\n", 
 5 |                 h_out_warp, sum);
 6 |         failure = 1;
 7 |     }
 8 | 
 9 |     if (failure == 0)
10 |     {
11 |         printf("Success! Your warp reduce worked.\n");
12 |     }
13 |     else{
14 |     	printf("Error! Your warp reduce code's output did not match sum.\n");	
15 |     }
16 | 
17 |     return failure;
18 | }


--------------------------------------------------------------------------------
/Final/warpreduce/part_b/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | struct GpuTimer
 5 | {
 6 |       cudaEvent_t start;
 7 |       cudaEvent_t stop;
 8 |  
 9 |       GpuTimer()
10 |       {
11 |             cudaEventCreate(&start);
12 |             cudaEventCreate(&stop);
13 |       }
14 |  
15 |       ~GpuTimer()
16 |       {
17 |             cudaEventDestroy(start);
18 |             cudaEventDestroy(stop);
19 |       }
20 |  
21 |       void Start()
22 |       {
23 |             cudaEventRecord(start, 0);
24 |       }
25 |  
26 |       void Stop()
27 |       {
28 |             cudaEventRecord(stop, 0);
29 |       }
30 |  
31 |       float Elapsed()
32 |       {
33 |             float elapsed;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&elapsed, start, stop);
36 |             return elapsed;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Final/warpreduce/part_b/warpreduce.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <cuda_runtime.h>
 4 | #include "compare.h"
 5 | #include "gputimer.h"
 6 | 
 7 | // Subpart b:
 8 | // Compute capability 2.0+ GPUs have support for 3 per-warp instructions.
 9 | // Namely, these instructions are:
10 | //
11 | // int __popc(int x) Population Count: Returns the number of bits that are set
12 | // to 1 in the 32-bit integer x.
13 | //
14 | // int __clz(int x) Count Leading Zeros: Returns the number of consecutive zero
15 | // bits beginning at the most significant bit of the 32-bit integer x.
16 | //
17 | // int __ballot(int p) Returns a 32-bit integer in which bit k is set if and only
18 | // if the predicate p provided by the thread in lane k of the warp is non-zero.
19 | 
20 | __device__ unsigned int warp_reduce(unsigned int p, volatile unsigned int * s) {
21 |     // Assumes values in 'p' are either 1 or 0
22 |     // Should not use 's'
23 |     // Sums p across warp, returning the result.
24 |     // You can do this without using the character '+' in your code at all
25 |     //
26 |     // TODO: Fill in the rest of this function
27 |     return 0;  // placeholder so the skeleton has a defined result; replace it
28 | }
29 | 
30 | __global__ void reduce(unsigned int * d_out_warp, 
31 |                        const unsigned int * d_in)
32 | {
33 |     extern __shared__ unsigned int s[];
34 |     int t = threadIdx.x;
35 |     int p = d_in[t];
36 | 
37 |     unsigned int wr = warp_reduce(p, s);
38 |     if (t == 0)
39 |     {
40 |         *d_out_warp = wr;
41 |     }
42 | }
43 | 
44 | int main(int argc, char **argv)
45 | {
46 |     const int ARRAY_SIZE = 32;
47 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(unsigned int);
48 | 
49 |     // generate the input array on the host
50 |     unsigned int h_in[ARRAY_SIZE];
51 |     unsigned int sum = 0;
52 |     for(int i = 0; i < ARRAY_SIZE; i++) {
53 |         // generate random float in [0, 1]
54 |         h_in[i] = (float)random()/(float)RAND_MAX > 0.5f ? 1 : 0;
55 |         sum += h_in[i];
56 |     }
57 | 
58 |     // declare GPU memory pointers
59 |     unsigned int * d_in, * d_out_warp;
60 | 
61 |     // allocate GPU memory
62 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
63 |     cudaMalloc((void **) &d_out_warp, sizeof(unsigned int));
64 | 
65 |     // transfer the input array to the GPU
66 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
67 | 
68 |     GpuTimer timer;
69 |     timer.Start();
70 |     // launch the kernel
71 |     reduce<<<1, ARRAY_SIZE, ARRAY_SIZE * sizeof(unsigned int)>>>
72 |         (d_out_warp, d_in);
73 |     timer.Stop();
74 | 
75 |     printf("Your code executed in %g ms\n", timer.Elapsed());  
76 | 
77 |     unsigned int h_out_warp;
78 |     // copy back the sum from GPU
79 |     cudaMemcpy(&h_out_warp, d_out_warp, sizeof(unsigned int), 
80 |                cudaMemcpyDeviceToHost);
81 | 
82 |     // compare your result against the expected reduce sum
83 |     compare(h_out_warp, sum);
84 | 
85 |     // free GPU memory allocation
86 |     cudaFree(d_in);
87 |     cudaFree(d_out_warp);
88 |         
89 | }
90 | 
91 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/associative.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | 
3 | int main(int argc,char **argv)
4 | {   
5 |     printf("(%g + %g) + %g == %g\n%g + (%g + %g) == %g\n", 
6 |         1.f, 1e99, -1e99, (1.f + 1e99)+ -1e99, 
7 |         1.f, 1e99, -1e99, 1.f + (1e99 + -1e99));
8 |     return 0;
9 | }


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/atomics.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "gputimer.h"
 3 | 
 4 | #define NUM_THREADS 1000000
 5 | #define ARRAY_SIZE  100
 6 | 
 7 | #define BLOCK_WIDTH 1000
 8 | 
 9 | void print_array(int *array, int size)
10 | {
11 |     printf("{ ");
12 |     for (int i = 0; i < size; i++)  { printf("%d ", array[i]); }
13 |     printf("}\n");
14 | }
15 | 
16 | __global__ void increment_naive(int *g)
17 | {
18 | 	// which thread is this?
19 | 	int i = blockIdx.x * blockDim.x + threadIdx.x; 
20 | 
21 | 	// each thread to increment consecutive elements, wrapping at ARRAY_SIZE
22 | 	i = i % ARRAY_SIZE;  
23 | 	g[i] = g[i] + 1;
24 | }
25 | 
26 | __global__ void increment_atomic(int *g)
27 | {
28 | 	// which thread is this?
29 | 	int i = blockIdx.x * blockDim.x + threadIdx.x; 
30 | 
31 | 	// each thread to increment consecutive elements, wrapping at ARRAY_SIZE
32 | 	i = i % ARRAY_SIZE;  
33 | 	atomicAdd(& g[i], 1);
34 | }
35 | 
36 | int main(int argc,char **argv)
37 | {   
38 |     GpuTimer timer;
39 |     printf("%d total threads in %d blocks writing into %d array elements\n",
40 |            NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
41 | 
42 |     // declare and allocate host memory
43 |     int h_array[ARRAY_SIZE];
44 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
45 |  
46 |     // declare, allocate, and zero out GPU memory
47 |     int * d_array;
48 |     cudaMalloc((void **) &d_array, ARRAY_BYTES);
49 |     cudaMemset((void *) d_array, 0, ARRAY_BYTES); 
50 | 
51 |     // launch the kernel - comment out one of these
52 |     timer.Start();
53 |     // increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
54 |     increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
55 |     timer.Stop();
56 |     
57 |     // copy back the array of sums from GPU and print
58 |     cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
59 |     print_array(h_array, ARRAY_SIZE);
60 |     printf("Time elapsed = %g ms\n", timer.Elapsed());
61 |  
62 |     // free GPU memory allocation and exit
63 |     cudaFree(d_array);
64 |     return 0;
65 | }


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | struct GpuTimer
 5 | {
 6 |       cudaEvent_t start;
 7 |       cudaEvent_t stop;
 8 |  
 9 |       GpuTimer()
10 |       {
11 |             cudaEventCreate(&start);
12 |             cudaEventCreate(&stop);
13 |       }
14 |  
15 |       ~GpuTimer()
16 |       {
17 |             cudaEventDestroy(start);
18 |             cudaEventDestroy(stop);
19 |       }
20 |  
21 |       void Start()
22 |       {
23 |             cudaEventRecord(start, 0);
24 |       }
25 |  
26 |       void Stop()
27 |       {
28 |             cudaEventRecord(stop, 0);
29 |       }
30 |  
31 |       float Elapsed()
32 |       {
33 |             float elapsed;
34 |             cudaEventSynchronize(stop);
35 |             cudaEventElapsedTime(&elapsed, start, stop);
36 |             return elapsed;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/hello_blockIdx.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define NUM_BLOCKS 16
 4 | #define BLOCK_WIDTH 1
 5 | 
 6 | __global__ void hello()
 7 | {
 8 |     printf("Hello world! I'm a thread in block %d\n", blockIdx.x);
 9 | }
10 | 
11 | 
12 | int main(int argc,char **argv)
13 | {
14 |     // launch the kernel
15 |     hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();
16 | 
17 |     // force the printf()s to flush
18 |     cudaDeviceSynchronize();
19 | 
20 |     printf("That's all!\n");
21 | 
22 |     return 0;
23 | }
24 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/hello_threadIdx.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define NUM_BLOCKS 1
 4 | #define BLOCK_WIDTH 256
 5 | 
 6 | __global__ void hello()
 7 | {
 8 |     printf("Hello world! I'm thread %d\n", threadIdx.x);
 9 | }
10 | 
11 | 
12 | int main(int argc,char **argv)
13 | {
14 |     // launch the kernel
15 |     hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();
16 | 
17 |     // force the printf()s to flush
18 |     cudaDeviceSynchronize();
19 | 
20 |     printf("That's all!\n");
21 | 
22 |     return 0;
23 | }
24 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 2 Code Snippets/memory.cu:
--------------------------------------------------------------------------------
 1 | // Using different memory spaces in CUDA
 2 | #include <stdio.h>
 3 | 
 4 | /**********************
 5 |  * using local memory *
 6 |  **********************/
 7 | 
 8 | // a __device__ or __global__ function runs on the GPU
 9 | __global__ void use_local_memory_GPU(float in)
10 | {
11 |     float f;    // variable "f" is in local memory and private to each thread
12 |     f = in;     // parameter "in" is in local memory and private to each thread
13 |     // ... real code would presumably do other stuff here ... 
14 | }
15 | 
16 | /**********************
17 |  * using global memory *
18 |  **********************/
19 | 
20 | // a __global__ function runs on the GPU & can be called from host
21 | __global__ void use_global_memory_GPU(float *array)
22 | {
23 |     // "array" is a pointer into global memory on the device
24 |     array[threadIdx.x] = 2.0f * (float) threadIdx.x;
25 | }
26 | 
27 | /**********************
28 |  * using shared memory *
29 |  **********************/
30 | 
31 | // (for clarity, hardcoding 128 threads/elements and omitting out-of-bounds checks)
32 | __global__ void use_shared_memory_GPU(float *array)
33 | {
34 |     // local variables, private to each thread
35 |     int i, index = threadIdx.x;
36 |     float average, sum = 0.0f;
37 | 
38 |     // __shared__ variables are visible to all threads in the thread block
39 |     // and have the same lifetime as the thread block
40 |     __shared__ float sh_arr[128];
41 | 
42 |     // copy data from "array" in global memory to sh_arr in shared memory.
43 |     // here, each thread is responsible for copying a single element.
44 |     sh_arr[index] = array[index];
45 | 
46 |     __syncthreads();    // ensure all the writes to shared memory have completed
47 | 
48 |     // now, sh_arr is fully populated. Let's find the average of all previous elements
49 |     for (i=0; i<index; i++) { sum += sh_arr[i]; }
50 |     average = sum / (index + 1.0f);
51 | 
52 |     // if array[index] is greater than the average of array[0..index-1], replace with average.
53 |     // since array[] is in global memory, this change will be seen by the host (and potentially 
54 |     // other thread blocks, if any)
55 |     if (array[index] > average) { array[index] = average; }
56 | 
57 |     // the following code has NO EFFECT: it modifies shared memory, but 
58 |     // the resulting modified data is never copied back to global memory
59 |     // and vanishes when the thread block completes
60 |     sh_arr[index] = 3.14;
61 | }
62 | 
63 | int main(int argc, char **argv)
64 | {
65 |     /*
66 |      * First, call a kernel that shows using local memory 
67 |      */
68 |     use_local_memory_GPU<<<1, 128>>>(2.0f);
69 | 
70 |     /*
71 |      * Next, call a kernel that shows using global memory
72 |      */
73 |     float h_arr[128];   // convention: h_ variables live on host
74 |     float *d_arr;       // convention: d_ variables live on device (GPU global mem)
75 | 
76 |     // allocate global memory on the device, place result in "d_arr"
77 |     cudaMalloc((void **) &d_arr, sizeof(float) * 128);
78 |     // now copy data from host memory "h_arr" to device memory "d_arr"
79 |     cudaMemcpy((void *)d_arr, (void *)h_arr, sizeof(float) * 128, cudaMemcpyHostToDevice);
80 |     // launch the kernel (1 block of 128 threads)
81 |     use_global_memory_GPU<<<1, 128>>>(d_arr);  // modifies the contents of array at d_arr
82 |     // copy the modified array back to the host, overwriting contents of h_arr
83 |     cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyDeviceToHost);
84 |     // ... do other stuff ...
85 | 
86 |     /*
87 |      * Next, call a kernel that shows using shared memory
88 |      */
89 | 
90 |     // as before, pass in a pointer to data in global memory
91 |     use_shared_memory_GPU<<<1, 128>>>(d_arr); 
92 |     // copy the modified array back to the host
93 |     cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * 128, cudaMemcpyHostToDevice);
94 |     // ... do other stuff ...
95 |     return 0;
96 | }


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/histo.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <cuda_runtime.h>
  3 | 
  4 | int log2(int i)
  5 | {
  6 |     int r = 0;
  7 |     while (i >>= 1) r++;
  8 |     return r;
  9 | }
 10 | 
 11 | int bit_reverse(int w, int bits)
 12 | {
 13 |     int r = 0;
 14 |     for (int i = 0; i < bits; i++)
 15 |     {
 16 |         int bit = (w & (1 << i)) >> i;
 17 |         r |= bit << (bits - i - 1);
 18 |     }
 19 |     return r;
 20 | }
 21 | 
 22 | __global__ void naive_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
 23 | {
 24 |     int myId = threadIdx.x + blockDim.x * blockIdx.x;
 25 |     int myItem = d_in[myId];
 26 |     int myBin = myItem % BIN_COUNT;
 27 |     d_bins[myBin]++;
 28 | }
 29 | 
 30 | __global__ void simple_histo(int *d_bins, const int *d_in, const int BIN_COUNT)
 31 | {
 32 |     int myId = threadIdx.x + blockDim.x * blockIdx.x;
 33 |     int myItem = d_in[myId];
 34 |     int myBin = myItem % BIN_COUNT;
 35 |     atomicAdd(&(d_bins[myBin]), 1);
 36 | }
 37 | 
 38 | 
 39 | int main(int argc, char **argv)
 40 | {
 41 |     int deviceCount;
 42 |     cudaGetDeviceCount(&deviceCount);
 43 |     if (deviceCount == 0) {
 44 |         fprintf(stderr, "error: no devices supporting CUDA.\n");
 45 |         exit(EXIT_FAILURE);
 46 |     }
 47 |     int dev = 0;
 48 |     cudaSetDevice(dev);
 49 | 
 50 |     cudaDeviceProp devProps;
 51 |     if (cudaGetDeviceProperties(&devProps, dev) == 0)
 52 |     {
 53 |         printf("Using device %d:\n", dev);
 54 |         printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
 55 |                devProps.name, (int)devProps.totalGlobalMem, 
 56 |                (int)devProps.major, (int)devProps.minor, 
 57 |                (int)devProps.clockRate);
 58 |     }
 59 | 
 60 |     const int ARRAY_SIZE = 65536;
 61 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
 62 |     const int BIN_COUNT = 16;
 63 |     const int BIN_BYTES = BIN_COUNT * sizeof(int);
 64 | 
 65 |     // generate the input array on the host
 66 |     int h_in[ARRAY_SIZE];
 67 |     for(int i = 0; i < ARRAY_SIZE; i++) {
 68 |         h_in[i] = bit_reverse(i, log2(ARRAY_SIZE));
 69 |     }
 70 |     int h_bins[BIN_COUNT];
 71 |     for(int i = 0; i < BIN_COUNT; i++) {
 72 |         h_bins[i] = 0;
 73 |     }
 74 | 
 75 |     // declare GPU memory pointers
 76 |     int * d_in;
 77 |     int * d_bins;
 78 | 
 79 |     // allocate GPU memory
 80 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
 81 |     cudaMalloc((void **) &d_bins, BIN_BYTES);
 82 | 
 83 |     // transfer the arrays to the GPU
 84 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
 85 |     cudaMemcpy(d_bins, h_bins, BIN_BYTES, cudaMemcpyHostToDevice); 
 86 | 
 87 |     int whichKernel = 0;
 88 |     if (argc == 2) {
 89 |         whichKernel = atoi(argv[1]);
 90 |     }
 91 |         
 92 |     // launch the kernel
 93 |     switch(whichKernel) {
 94 |     case 0:
 95 |         printf("Running naive histo\n");
 96 |         naive_histo<<<ARRAY_SIZE / 64, 64>>>(d_bins, d_in, BIN_COUNT);
 97 |         break;
 98 |     case 1:
 99 |         printf("Running simple histo\n");
100 |         simple_histo<<<ARRAY_SIZE / 64, 64>>>(d_bins, d_in, BIN_COUNT);
101 |         break;
102 |     default:
103 |         fprintf(stderr, "error: ran no kernel\n");
104 |         exit(EXIT_FAILURE);
105 |     }
106 | 
107 |     // copy back the sum from GPU
108 |     cudaMemcpy(h_bins, d_bins, BIN_BYTES, cudaMemcpyDeviceToHost);
109 | 
110 |     for(int i = 0; i < BIN_COUNT; i++) {
111 |         printf("bin %d: count %d\n", i, h_bins[i]);
112 |     }
113 | 
114 |     // free GPU memory allocation
115 |     cudaFree(d_in);
116 |     cudaFree(d_bins);
117 |         
118 |     return 0;
119 | }
120 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 3 Code Snippets/reduce.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <cuda_runtime.h>
  4 | 
  5 | __global__ void global_reduce_kernel(float * d_out, float * d_in)
  6 | {
  7 |     int myId = threadIdx.x + blockDim.x * blockIdx.x;
  8 |     int tid  = threadIdx.x;
  9 | 
 10 |     // do reduction in global mem
 11 |     for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
 12 |     {
 13 |         if (tid < s)
 14 |         {
 15 |             d_in[myId] += d_in[myId + s];
 16 |         }
 17 |         __syncthreads();        // make sure all adds at one stage are done!
 18 |     }
 19 | 
 20 |     // only thread 0 writes result for this block back to global mem
 21 |     if (tid == 0)
 22 |     {
 23 |         d_out[blockIdx.x] = d_in[myId];
 24 |     }
 25 | }
 26 | 
 27 | __global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
 28 | {
 29 |     // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
 30 |     extern __shared__ float sdata[];
 31 | 
 32 |     int myId = threadIdx.x + blockDim.x * blockIdx.x;
 33 |     int tid  = threadIdx.x;
 34 | 
 35 |     // load shared mem from global mem
 36 |     sdata[tid] = d_in[myId];
 37 |     __syncthreads();            // make sure entire block is loaded!
 38 | 
 39 |     // do reduction in shared mem
 40 |     for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
 41 |     {
 42 |         if (tid < s)
 43 |         {
 44 |             sdata[tid] += sdata[tid + s];
 45 |         }
 46 |         __syncthreads();        // make sure all adds at one stage are done!
 47 |     }
 48 | 
 49 |     // only thread 0 writes result for this block back to global mem
 50 |     if (tid == 0)
 51 |     {
 52 |         d_out[blockIdx.x] = sdata[0];
 53 |     }
 54 | }
 55 | 
 56 | void reduce(float * d_out, float * d_intermediate, float * d_in, 
 57 |             int size, bool usesSharedMemory)
 58 | {
 59 |     // assumes that size is not greater than maxThreadsPerBlock^2
 60 |     // and that size is a multiple of maxThreadsPerBlock
 61 |     const int maxThreadsPerBlock = 1024;
 62 |     int threads = maxThreadsPerBlock;
 63 |     int blocks = size / maxThreadsPerBlock;
 64 |     if (usesSharedMemory)
 65 |     {
 66 |         shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
 67 |             (d_intermediate, d_in);
 68 |     }
 69 |     else
 70 |     {
 71 |         global_reduce_kernel<<<blocks, threads>>>
 72 |             (d_intermediate, d_in);
 73 |     }
 74 |     // now we're down to one block left, so reduce it
 75 |     threads = blocks; // launch one thread for each block in prev step
 76 |     blocks = 1;
 77 |     if (usesSharedMemory)
 78 |     {
 79 |         shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
 80 |             (d_out, d_intermediate);
 81 |     }
 82 |     else
 83 |     {
 84 |         global_reduce_kernel<<<blocks, threads>>>
 85 |             (d_out, d_intermediate);
 86 |     }
 87 | }
 88 | 
 89 | int main(int argc, char **argv)
 90 | {
 91 |     int deviceCount;
 92 |     cudaGetDeviceCount(&deviceCount);
 93 |     if (deviceCount == 0) {
 94 |         fprintf(stderr, "error: no devices supporting CUDA.\n");
 95 |         exit(EXIT_FAILURE);
 96 |     }
 97 |     int dev = 0;
 98 |     cudaSetDevice(dev);
 99 | 
100 |     cudaDeviceProp devProps;
101 |     if (cudaGetDeviceProperties(&devProps, dev) == 0)
102 |     {
103 |         printf("Using device %d:\n", dev);
104 |         printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
105 |                devProps.name, (int)devProps.totalGlobalMem, 
106 |                (int)devProps.major, (int)devProps.minor, 
107 |                (int)devProps.clockRate);
108 |     }
109 | 
110 |     const int ARRAY_SIZE = 1 << 20;
111 |     const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
112 | 
113 |     // generate the input array on the host
114 |     float h_in[ARRAY_SIZE];
115 |     float sum = 0.0f;
116 |     for(int i = 0; i < ARRAY_SIZE; i++) {
117 |         // generate random float in [-1.0f, 1.0f]
118 |         h_in[i] = -1.0f + (float)random()/((float)RAND_MAX/2.0f);
119 |         sum += h_in[i];
120 |     }
121 | 
122 |     // declare GPU memory pointers
123 |     float * d_in, * d_intermediate, * d_out;
124 | 
125 |     // allocate GPU memory
126 |     cudaMalloc((void **) &d_in, ARRAY_BYTES);
127 |     cudaMalloc((void **) &d_intermediate, ARRAY_BYTES); // overallocated
128 |     cudaMalloc((void **) &d_out, sizeof(float));
129 | 
130 |     // transfer the input array to the GPU
131 |     cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice); 
132 | 
133 |     int whichKernel = 0;
134 |     if (argc == 2) {
135 |         whichKernel = atoi(argv[1]);
136 |     }
137 |         
138 |     cudaEvent_t start, stop;
139 |     cudaEventCreate(&start);
140 |     cudaEventCreate(&stop);
141 |     // launch the kernel
142 |     switch(whichKernel) {
143 |     case 0:
144 |         printf("Running global reduce\n");
145 |         cudaEventRecord(start, 0);
146 |         for (int i = 0; i < 100; i++)
147 |         {
148 |             reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
149 |         }
150 |         cudaEventRecord(stop, 0);
151 |         break;
152 |     case 1:
153 |         printf("Running reduce with shared mem\n");
154 |         cudaEventRecord(start, 0);
155 |         for (int i = 0; i < 100; i++)
156 |         {
157 |             reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
158 |         }
159 |         cudaEventRecord(stop, 0);
160 |         break;
161 |     default:
162 |         fprintf(stderr, "error: ran no kernel\n");
163 |         exit(EXIT_FAILURE);
164 |     }
165 |     cudaEventSynchronize(stop);
166 |     float elapsedTime;
167 |     cudaEventElapsedTime(&elapsedTime, start, stop);    
168 |     elapsedTime /= 100.0f;      // 100 trials
169 | 
170 |     // copy back the sum from GPU
171 |     float h_out;
172 |     cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
173 | 
174 |     printf("average time elapsed: %f\n", elapsedTime);
175 | 
176 |     // free GPU memory allocation
177 |     cudaFree(d_in);
178 |     cudaFree(d_intermediate);
179 |     cudaFree(d_out);
180 |         
181 |     return 0;
182 | }
183 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 5 Code Snippets/transpose.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include "gputimer.h"
  3 | 
  4 | const int N= 1024;		// matrix size is NxN
  5 | const int K= 32;				// tile size is KxK
  6 | 
  7 | // Utility functions: compare, print, and fill matrices
  8 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
  9 | 
 10 | template<typename T>
 11 | void check(T err, const char* const func, const char* const file, const int line)
 12 | {
 13 |   if (err != cudaSuccess) {
 14 |     fprintf(stderr, "CUDA error at: %s : %d\n", file,line);
 15 |     fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);;
 16 |     exit(1);
 17 |   }
 18 | }
 19 | 
 20 | int compare_matrices(float *gpu, float *ref)
 21 | {
 22 | 	int result = 0;
 23 | 
 24 | 	for(int j=0; j < N; j++)
 25 |     	for(int i=0; i < N; i++)
 26 |     		if (ref[i + j*N] != gpu[i + j*N])
 27 |     		{
 28 |     			// printf("reference(%d,%d) = %f but test(%d,%d) = %f\n",
 29 |     			// i,j,ref[i+j*N],i,j,test[i+j*N]);
 30 |     			result = 1;
 31 |     		}
 32 |     return result;
 33 | }
 34 | 
 35 | void print_matrix(float *mat)
 36 | {
 37 | 	for(int j=0; j < N; j++) 
 38 | 	{
 39 | 		for(int i=0; i < N; i++) { printf("%4.4g ", mat[i + j*N]); }
 40 | 		printf("\n");
 41 | 	}	
 42 | }
 43 | 
 44 | // fill a matrix with sequential numbers in the range 0..N-1
 45 | void fill_matrix(float *mat)
 46 | {
 47 | 	for(int j=0; j < N * N; j++)
 48 | 		mat[j] = (float) j;
 49 | }
 50 | 
 51 | 
 52 | 
 53 | void 
 54 | transpose_CPU(float in[], float out[])
 55 | {
 56 | 	for(int j=0; j < N; j++)
 57 |     	for(int i=0; i < N; i++)
 58 |       		out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
 59 | }
 60 | 
 61 | // to be launched on a single thread
 62 | __global__ void 
 63 | transpose_serial(float in[], float out[])
 64 | {
 65 | 	for(int j=0; j < N; j++)
 66 | 		for(int i=0; i < N; i++)
 67 | 			out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
 68 | }
 69 | 
 70 | // to be launched with one thread per row of output matrix
 71 | __global__ void 
 72 | transpose_parallel_per_row(float in[], float out[])
 73 | {
 74 | 	int i = threadIdx.x;
 75 | 
 76 | 	for(int j=0; j < N; j++)
 77 | 		out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
 78 | }
 79 | 
 80 | // to be launched with one thread per element, in KxK threadblocks
 81 | // thread (x,y) in grid writes element (i,j) of output matrix 
 82 | __global__ void 
 83 | transpose_parallel_per_element(float in[], float out[])
 84 | {
 85 | 	int i = blockIdx.x * K + threadIdx.x;
 86 | 	int j = blockIdx.y * K + threadIdx.y;
 87 | 
 88 | 	out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
 89 | }
 90 | 
 91 | // to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
 92 | // thread blocks read & write tiles, in coalesced fashion
 93 | // adjacent threads read adjacent input elements, write adjacent output elmts
 94 | __global__ void 
 95 | transpose_parallel_per_element_tiled(float in[], float out[])
 96 | {
 97 | 	// (i,j) locations of the tile corners for input & output matrices:
 98 | 	int in_corner_i  = blockIdx.x * K, in_corner_j  = blockIdx.y * K;
 99 | 	int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;
100 | 
101 | 	int x = threadIdx.x, y = threadIdx.y;
102 | 
103 | 	__shared__ float tile[K][K];
104 | 
105 | 	// coalesced read from global mem, TRANSPOSED write into shared mem:
106 | 	tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
107 | 	__syncthreads();
108 | 	// read from shared mem, coalesced write to global mem:
109 | 	out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
110 | }
111 | 
112 | // to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
113 | // thread blocks read & write tiles, in coalesced fashion
114 | // adjacent threads read adjacent input elements, write adjacent output elmts
115 | __global__ void 
116 | transpose_parallel_per_element_tiled16(float in[], float out[])
117 | {
118 | 	// (i,j) locations of the tile corners for input & output matrices:
119 | 	int in_corner_i  = blockIdx.x * 16, in_corner_j  = blockIdx.y * 16;
120 | 	int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;
121 | 
122 | 	int x = threadIdx.x, y = threadIdx.y;
123 | 
124 | 	__shared__ float tile[16][16];
125 | 
126 | 	// coalesced read from global mem, TRANSPOSED write into shared mem:
127 | 	tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
128 | 	__syncthreads();
129 | 	// read from shared mem, coalesced write to global mem:
130 | 	out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
131 | }
132 | 
133 | // to be launched with one thread per element, in KxK threadblocks
134 | // thread blocks read & write tiles, in coalesced fashion
135 | // shared memory array padded to avoid bank conflicts
136 | __global__ void 
137 | transpose_parallel_per_element_tiled_padded(float in[], float out[])
138 | {
139 | 	// (i,j) locations of the tile corners for input & output matrices:
140 | 	int in_corner_i  = blockIdx.x * K, in_corner_j  = blockIdx.y * K;
141 | 	int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;
142 | 
143 | 	int x = threadIdx.x, y = threadIdx.y;
144 | 
145 | 	__shared__ float tile[K][K+1];
146 | 
147 | 	// coalesced read from global mem, TRANSPOSED write into shared mem:
148 | 	tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
149 | 	__syncthreads();
150 | 	// read from shared mem, coalesced write to global mem:
151 | 	out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
152 | }
153 | 
154 | // to be launched with one thread per element, in KxK threadblocks
155 | // thread blocks read & write tiles, in coalesced fashion
156 | // shared memory array padded to avoid bank conflicts
157 | __global__ void 
158 | transpose_parallel_per_element_tiled_padded16(float in[], float out[])
159 | {
160 | 	// (i,j) locations of the tile corners for input & output matrices:
161 | 	int in_corner_i  = blockIdx.x * 16, in_corner_j  = blockIdx.y * 16;
162 | 	int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;
163 | 
164 | 	int x = threadIdx.x, y = threadIdx.y;
165 | 
166 | 	__shared__ float tile[16][16+1];
167 | 
168 | 	// coalesced read from global mem, TRANSPOSED write into shared mem:
169 | 	tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
170 | 	__syncthreads();
171 | 	// read from shared mem, coalesced write to global mem:
172 | 	out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
173 | }
174 | 
175 | int main(int argc, char **argv)
176 | {
177 | 	int numbytes = N * N * sizeof(float);
178 | 
179 | 	float *in = (float *) malloc(numbytes);
180 | 	float *out = (float *) malloc(numbytes);
181 | 	float *gold = (float *) malloc(numbytes);
182 | 
183 | 	fill_matrix(in);
184 | 	transpose_CPU(in, gold);
185 | 
186 | 	float *d_in, *d_out;
187 | 
188 | 	cudaMalloc(&d_in, numbytes);
189 | 	cudaMalloc(&d_out, numbytes);
190 | 	cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice);
191 | 
192 | 	GpuTimer timer;
193 | 
194 | /*  
195 |  * Now time each kernel and verify that it produces the correct result.
196 |  *
197 |  * To be really careful about benchmarking purposes, we should run every kernel once
198 |  * to "warm" the system and avoid any compilation or code-caching effects, then run 
199 |  * every kernel 10 or 100 times and average the timings to smooth out any variance. 
200 |  * But this makes for messy code and our goal is teaching, not detailed benchmarking.
201 |  */
202 | 
203 | 	timer.Start();
204 | 	transpose_serial<<<1,1>>>(d_in, d_out);
205 | 	timer.Stop();
206 | 	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
207 | 	printf("transpose_serial: %g ms.\nVerifying transpose...%s\n", 
208 | 	       timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
209 | 
210 | 	timer.Start();
211 | 	transpose_parallel_per_row<<<1,N>>>(d_in, d_out);
212 | 	timer.Stop();
213 | 	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
214 | 	printf("transpose_parallel_per_row: %g ms.\nVerifying transpose...%s\n", 
215 | 		   timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
216 | 
217 | 	dim3 blocks(N/K,N/K); // blocks per grid
218 | 	dim3 threads(K,K);	// threads per block
219 | 
220 | 	timer.Start();
221 | 	transpose_parallel_per_element<<<blocks,threads>>>(d_in, d_out);
222 | 	timer.Stop();
223 | 	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
224 | 	printf("transpose_parallel_per_element: %g ms.\nVerifying transpose...%s\n",
225 | 		   timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
226 | 
227 | 	timer.Start();
228 | 	transpose_parallel_per_element_tiled<<<blocks,threads>>>(d_in, d_out);
229 | 	timer.Stop();
230 | 	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
231 | 	printf("transpose_parallel_per_element_tiled %dx%d: %g ms.\nVerifying ...%s\n", 
232 | 		   K, K, timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
233 | 	
234 | 	dim3 blocks16x16(N/16,N/16); // blocks per grid
235 | 	dim3 threads16x16(16,16);	 // threads per block
236 | 
237 | 	timer.Start();
238 | 	transpose_parallel_per_element_tiled16<<<blocks16x16,threads16x16>>>(d_in, d_out);
239 | 	timer.Stop();
240 | 	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
241 | 	printf("transpose_parallel_per_element_tiled 16x16: %g ms.\nVerifying ...%s\n", 
242 | 		   timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
243 | 	
244 | 	timer.Start();
245 |  	transpose_parallel_per_element_tiled_padded16<<<blocks16x16,threads16x16>>>(d_in, d_out);
246 | 	timer.Stop();
247 | 	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
248 | 	printf("transpose_parallel_per_element_tiled_padded 16x16: %g ms.\nVerifying...%s\n", 
249 | 	       timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
250 | 
251 | 	cudaFree(d_in);
252 | 	cudaFree(d_out);
253 | }


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/cub/example_block_scan_cum.cu:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /******************************************************************************
 30 |  * Simple demonstration of cub::BlockScan
 31 |  *
 32 |  * Example compilation string:
 33 |  *
 34 |  * nvcc example_block_scan_sum.cu -gencode=arch=compute_20,code=\"sm_20,compute_20\" -o example_block_scan_sum
 35 |  *
 36 |  ******************************************************************************/
 37 | 
 38 | // Ensure printing of CUDA runtime errors to console (define before including cub.h)
 39 | #define CUB_STDERR
 40 | 
 41 | #include <stdio.h>
 42 | #include <iostream>
 43 | 
 44 | #include <cub/cub.cuh>
 45 | 
 46 | using namespace cub;
 47 | 
 48 | //---------------------------------------------------------------------
 49 | // Globals, constants and typedefs
 50 | //---------------------------------------------------------------------
 51 | 
 52 | bool g_verbose      = false;
 53 | int g_iterations    = 100;
 54 | 
 55 | 
 56 | //---------------------------------------------------------------------
 57 | // Kernels
 58 | //---------------------------------------------------------------------
 59 | 
 60 | /**
 61 |  * Simple kernel for performing a block-wide exclusive prefix sum over integers
 62 |  */
 63 | template <
 64 |     int         BLOCK_THREADS,
 65 |     int         ITEMS_PER_THREAD>
 66 | __global__ void BlockPrefixSumKernel(
 67 |     int         *d_in,          // Tile of input
 68 |     int         *d_out,         // Tile of output
 69 |     clock_t     *d_elapsed)     // Elapsed cycle count of block scan
 70 | {
 71 |     // Parameterize BlockScan type for our thread block
 72 |     typedef BlockScan<int, BLOCK_THREADS> BlockScanT;
 73 | 
 74 |     // Shared memory
 75 |     __shared__ typename BlockScanT::SmemStorage smem_storage;
 76 | 
 77 |     // Per-thread tile data
 78 |     int data[ITEMS_PER_THREAD];
 79 |     BlockLoadVectorized(d_in, data);
 80 | 
 81 |     // Start cycle timer
 82 |     clock_t start = clock();
 83 | 
 84 |     // Compute exclusive prefix sum
 85 |     int aggregate;
 86 |     BlockScanT::ExclusiveSum(smem_storage, data, data, aggregate);
 87 | 
 88 |     // Stop cycle timer
 89 |     clock_t stop = clock();
 90 | 
 91 |     // Store output
 92 |     BlockStoreVectorized(d_out, data);
 93 | 
 94 |     // Store aggregate and elapsed clocks
 95 |     if (threadIdx.x == 0)
 96 |     {
 97 |         *d_elapsed = (start > stop) ? start - stop : stop - start;
 98 |         d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
 99 |     }
100 | }
101 | 
102 | 
103 | 
104 | //---------------------------------------------------------------------
105 | // Host utilities
106 | //---------------------------------------------------------------------
107 | 
/**
 * Builds an exclusive prefix sum problem and its solution on the host.
 *
 * Fills h_in[i] with i % 17 and h_reference[i] with the sum of all earlier
 * inputs, for i in [0, num_elements).
 * Returns the aggregate, i.e. the sum of every input.
 */
int Initialize(
    int *h_in,
    int *h_reference,
    int num_elements)
{
    int running_total = 0;
    int i = 0;

    while (i < num_elements)
    {
        h_in[i] = i % 17;

        // exclusive: store the total before adding the current item
        h_reference[i] = running_total;
        running_total += h_in[i];

        ++i;
    }

    return running_total;
}
129 | 
130 | 
131 | /**
132 |  * Test thread block scan: runs the kernel g_iterations times on one tile of
133 |  * BLOCK_THREADS * ITEMS_PER_THREAD ints, verifies it and prints timing. */
134 | template <
135 |     int BLOCK_THREADS,
136 |     int ITEMS_PER_THREAD>
137 | void Test()
138 | {
139 |     const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
140 | 
141 |     // Allocate host arrays
142 |     int *h_in           = new int[TILE_SIZE];
143 |     int *h_reference    = new int[TILE_SIZE];
144 |     int *h_gpu          = new int[TILE_SIZE + 1];   // scan output plus one trailing slot for the aggregate
145 | 
146 |     // Initialize problem and reference output on host
147 |     int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);
148 | 
149 |     // Initialize device arrays
150 |     int *d_in           = NULL;
151 |     int *d_out          = NULL;
152 |     clock_t *d_elapsed  = NULL;
153 |     cudaMalloc((void**)&d_in,          sizeof(int) * TILE_SIZE);
154 |     cudaMalloc((void**)&d_out,         sizeof(int) * (TILE_SIZE + 1));
155 |     cudaMalloc((void**)&d_elapsed,     sizeof(clock_t));
156 | 
157 |     // Display input problem data
158 |     if (g_verbose)
159 |     {
160 |         printf("Input data: ");
161 |         for (int i = 0; i < TILE_SIZE; i++)
162 |             printf("%d, ", h_in[i]);
163 |         printf("\n\n");
164 |     }
165 | 
166 |     // Copy problem to device
167 |     cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
168 | 
169 |     printf("BlockScan %d items (%d threads, %d items per thread): ",
170 |         TILE_SIZE, BLOCK_THREADS, ITEMS_PER_THREAD);
171 | 
172 |     // Run this several times and average the performance results
173 |     clock_t elapsed_scan_clocks     = 0;
174 |     for (int i = 0; i < g_iterations; ++i)
175 |     {
176 |         // Run aggregate/prefix kernel (a single block of BLOCK_THREADS threads)
177 |         BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<1, BLOCK_THREADS>>>(
178 |             d_in,
179 |             d_out,
180 |             d_elapsed);
181 | 
182 |         // Copy results from device
183 |         clock_t scan_clocks;
184 |         cudaMemcpy(h_gpu, d_out, sizeof(int) * (TILE_SIZE + 1), cudaMemcpyDeviceToHost);
185 |         cudaMemcpy(&scan_clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost);
186 |         elapsed_scan_clocks += scan_clocks;
187 |     }
188 | 
189 |     // Check scanned items (only the first mismatch is reported)
190 |     bool correct = true;
191 |     for (int i = 0; i < TILE_SIZE; i++)
192 |     {
193 |         if (h_gpu[i] != h_reference[i])
194 |         {
195 |             printf("Incorrect result @ offset %d (%d != %d)\n",
196 |                 i, h_gpu[i], h_reference[i]);
197 |             correct = false;
198 |             break;
199 |         }
200 |     }
201 | 
202 |     // Check total aggregate
203 |     if (h_gpu[TILE_SIZE] != h_aggregate)
204 |     {
205 |         printf("Incorrect aggregate (%d != %d)\n", h_gpu[TILE_SIZE], h_aggregate);
206 |         correct = false;
207 |     }
208 |     if (correct) printf("Correct!\n");
209 | 
210 |     // Display results problem data
211 |     if (g_verbose)
212 |     {
213 |         printf("GPU output (reference output): ");
214 |         for (int i = 0; i < TILE_SIZE; i++)
215 |             printf("%d (%d), ", h_gpu[i], h_reference[i]);
216 |         printf("\n");
217 |         printf("GPU aggregate (reference aggregate): %d (%d)", h_gpu[TILE_SIZE], h_aggregate);
218 |         printf("\n\n");
219 |     }
220 | 
221 |     // Display timing results
222 |     printf("Average clocks per 32-bit int scanned: %.3f\n\n", float(elapsed_scan_clocks) / TILE_SIZE / g_iterations);
223 | 
224 |     // Cleanup
225 |     if (h_in) delete[] h_in;
226 |     if (h_reference) delete[] h_reference;
227 |     if (h_gpu) delete[] h_gpu;
228 |     if (d_in) cudaFree(d_in);
229 |     if (d_out) cudaFree(d_out);
230 |     if (d_elapsed) cudaFree(d_elapsed);
231 | }
232 | 
233 | 
234 | /**
235 |  * Entry point: prints the name of device 0, then runs every test configuration
236 |  */
237 | int main(int argc, char** argv)
238 | {
239 |     // Report which GPU (device 0) is being used
240 |     cudaDeviceProp device_props;
241 |     cudaGetDeviceProperties(&device_props, 0);
242 |     printf("Using device %s\n", device_props.name);
243 | 
244 | /** Add tests here **/
245 | 
246 |     // Each configuration scans a 1024-item tile with a different threads/items split
247 |     Test<1024, 1>();
248 |     Test<512, 2>();
249 |     Test<256, 4>();
250 |     Test<128, 8>();
251 |     Test<64, 16>();
252 |     Test<32, 32>();
253 |     Test<16, 64>();
254 | 
255 | /****/
256 | 
257 |     return 0;
258 | }
259 | 
260 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | struct GpuTimer
 5 | {
 6 |       cudaEvent_t start;      // recorded by Start()
 7 |       cudaEvent_t stop;       // recorded by Stop()
 8 |  
 9 |       GpuTimer()              // creates both events
10 |       {
11 |             cudaEventCreate(&start);
12 |             cudaEventCreate(&stop);
13 |       }
14 |  
15 |       ~GpuTimer()             // destroys both events
16 |       {
17 |             cudaEventDestroy(start);
18 |             cudaEventDestroy(stop);
19 |       }
20 |  
21 |       void Start()            // marks the beginning of the timed region on stream 0
22 |       {
23 |             cudaEventRecord(start, 0);
24 |       }
25 |  
26 |       void Stop()             // marks the end of the timed region on stream 0
27 |       {
28 |             cudaEventRecord(stop, 0);
29 |       }
30 |  
31 |       float Elapsed()         // time between Start() and Stop(), in milliseconds
32 |       {
33 |             cudaEventSynchronize(stop);   // block until the stop event has completed
34 |             float ms;
35 |             cudaEventElapsedTime(&ms, start, stop);
36 |             return ms;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/thrust/thrust_example.cu:
--------------------------------------------------------------------------------
 1 | #include <thrust/host_vector.h>
 2 | #include <thrust/device_vector.h>
 3 | #include <thrust/generate.h>
 4 | #include <thrust/sort.h>
 5 | #include <thrust/copy.h>
 6 | #include <algorithm>
 7 | #include <cstdlib>
 8 | 
 9 | #include "gputimer.h"
10 | 
11 | int main(void)
12 | {
13 |   // host side: N keys filled one by one from rand() (stored as char)
14 |   const int N = 1000000;
15 |   thrust::host_vector<char> h_vec(N);
16 |   std::generate(h_vec.begin(), h_vec.end(), rand);
17 | 
18 |   // device side: copy of the host keys
19 |   thrust::device_vector<char> d_vec(h_vec);
20 | 
21 |   // time only the device sort (846M keys per second on GeForce GTX 480)
22 |   GpuTimer sort_timer;
23 |   sort_timer.Start();
24 |   thrust::sort(d_vec.begin(), d_vec.end());
25 |   sort_timer.Stop();
26 | 
27 |   // bring the sorted keys back to the host
28 |   thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
29 |   const float elapsed_ms = sort_timer.Elapsed();
30 |   printf("Thrust sorted %d keys in %g ms\n", N, elapsed_ms);
31 |   return 0;
32 | }


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/gputimer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GPU_TIMER_H__
 2 | #define __GPU_TIMER_H__
 3 | 
 4 | struct GpuTimer
 5 | {
 6 |       cudaEvent_t start;      // recorded by Start()
 7 |       cudaEvent_t stop;       // recorded by Stop()
 8 |  
 9 |       GpuTimer()              // creates both events
10 |       {
11 |             cudaEventCreate(&start);
12 |             cudaEventCreate(&stop);
13 |       }
14 |  
15 |       ~GpuTimer()             // destroys both events
16 |       {
17 |             cudaEventDestroy(start);
18 |             cudaEventDestroy(stop);
19 |       }
20 |  
21 |       void Start()            // marks the beginning of the timed region on stream 0
22 |       {
23 |             cudaEventRecord(start, 0);
24 |       }
25 |  
26 |       void Stop()             // marks the end of the timed region on stream 0
27 |       {
28 |             cudaEventRecord(stop, 0);
29 |       }
30 |  
31 |       float Elapsed()         // time between Start() and Stop(), in milliseconds
32 |       {
33 |             cudaEventSynchronize(stop);   // block until the stop event has completed
34 |             float ms;
35 |             cudaEventElapsedTime(&ms, start, stop);
36 |             return ms;
37 |       }
38 | };
39 | 
40 | #endif  /* __GPU_TIMER_H__ */


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/tiling.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include "gputimer.h"
  3 | #include "utils.h"
  4 | 
  5 | const int BLOCKSIZE	= 128;
  6 | const int NUMBLOCKS = 1000;					// set this to 1 or 2 for debugging
  7 | const int N 		= BLOCKSIZE*NUMBLOCKS;
  8 | 
  9 | /* 
 10 |  * TODO: modify the foo and bar kernels to use tiling: 
 11 |  * 		 - copy the input data to shared memory
 12 |  *		 - perform the computation there
 13 |  *	     - copy the result back to global memory
 14 |  *		 - assume thread blocks of 128 threads
 15 |  *		 - handle intra-block boundaries correctly
 16 |  * You can ignore boundary conditions (we ignore the first 2 and last 2 elements)
 17 |  */
 18 | __global__ void foo(float out[], float A[], float B[], float C[], float D[], float E[]){
 19 | 	// one thread per element: gid is this thread's index into every array
 20 | 	const int gid = blockIdx.x*blockDim.x + threadIdx.x;
 21 | 	const float sum = A[gid] + B[gid] + C[gid] + D[gid] + E[gid];
 22 | 	out[gid] = sum / 5.0f;
 23 | }
 24 | 
 25 | __global__ void bar(float out[], float in[])	// out[i] = mean of in[i-2..i+2]
 26 | {
 27 | 	int i = threadIdx.x + blockIdx.x*blockDim.x; 
 28 | 	if (i < 2 || i >= N-2) return;	// boundary threads would read outside in[]; the host check ignores these outputs
 29 | 	out[i] = (in[i-2] + in[i-1] + in[i] + in[i+1] + in[i+2]) / 5.0f;
 30 | }
 31 | 
 32 | void cpuFoo(float out[], float A[], float B[], float C[], float D[], float E[])
 33 | {
 34 | 	// host reference for foo(): element-wise mean of the five inputs
 35 | 	for (int k = 0; k != N; ++k) {
 36 | 		out[k] = (A[k] + B[k] + C[k] + D[k] + E[k]) / 5.0f;
 37 | 	}
 38 | }
 39 | 
 40 | void cpuBar(float out[], float in[])
 41 | {
 42 | 	// host reference for bar(): 5-point mean centred on c; the first two
 43 | 	// and last two outputs are never written
 44 | 	for (int c = 2; c < N - 2; ++c) {
 45 | 		out[c] = (in[c-2] + in[c-1] + in[c] + in[c+1] + in[c+2]) / 5.0f;
 46 | 	}
 47 | }
 48 | 
 49 | int main(int argc, char **argv)
 50 | {
 51 | 	// declare and fill input arrays for foo() and bar(); host arrays are static because ten N-float buffers are too large to rely on fitting in the stack
 52 | 	static float fooA[N], fooB[N], fooC[N], fooD[N], fooE[N], barIn[N];
 53 | 	for (int i=0; i<N; i++) 
 54 | 	{
 55 | 		fooA[i] = i; 
 56 | 		fooB[i] = i+1;
 57 | 		fooC[i] = i+2;
 58 | 		fooD[i] = i+3;
 59 | 		fooE[i] = i+4;
 60 | 		barIn[i] = 2*i; 
 61 | 	}
 62 | 	// device arrays (every CUDA call is checked; a failure aborts with file/line)
 63 | 	int numBytes = N * sizeof(float);
 64 | 	float *d_fooA;	 	checkCudaErrors(cudaMalloc(&d_fooA, numBytes));
 65 | 	float *d_fooB; 		checkCudaErrors(cudaMalloc(&d_fooB, numBytes));
 66 | 	float *d_fooC;	 	checkCudaErrors(cudaMalloc(&d_fooC, numBytes));
 67 | 	float *d_fooD; 		checkCudaErrors(cudaMalloc(&d_fooD, numBytes));
 68 | 	float *d_fooE; 		checkCudaErrors(cudaMalloc(&d_fooE, numBytes));
 69 | 	float *d_barIn; 	checkCudaErrors(cudaMalloc(&d_barIn, numBytes));
 70 | 	checkCudaErrors(cudaMemcpy(d_fooA, fooA, numBytes, cudaMemcpyHostToDevice));
 71 | 	checkCudaErrors(cudaMemcpy(d_fooB, fooB, numBytes, cudaMemcpyHostToDevice));
 72 | 	checkCudaErrors(cudaMemcpy(d_fooC, fooC, numBytes, cudaMemcpyHostToDevice));
 73 | 	checkCudaErrors(cudaMemcpy(d_fooD, fooD, numBytes, cudaMemcpyHostToDevice));
 74 | 	checkCudaErrors(cudaMemcpy(d_fooE, fooE, numBytes, cudaMemcpyHostToDevice));
 75 | 	checkCudaErrors(cudaMemcpy(d_barIn, barIn, numBytes, cudaMemcpyHostToDevice));
 76 | 
 77 | 	// output arrays for host and device
 78 | 	static float fooOut[N], barOut[N]; float *d_fooOut, *d_barOut;
 79 | 	checkCudaErrors(cudaMalloc(&d_fooOut, numBytes));
 80 | 	checkCudaErrors(cudaMalloc(&d_barOut, numBytes));
 81 | 
 82 | 	// declare and compute reference solutions
 83 | 	static float ref_fooOut[N], ref_barOut[N]; 
 84 | 	cpuFoo(ref_fooOut, fooA, fooB, fooC, fooD, fooE);
 85 | 	cpuBar(ref_barOut, barIn);
 86 | 
 87 | 	// launch and time foo and bar
 88 | 	GpuTimer fooTimer, barTimer;
 89 | 	fooTimer.Start();
 90 | 	foo<<<N/BLOCKSIZE, BLOCKSIZE>>>(d_fooOut, d_fooA, d_fooB, d_fooC, d_fooD, d_fooE);
 91 | 	fooTimer.Stop();
 92 | 	
 93 | 	barTimer.Start();
 94 | 	bar<<<N/BLOCKSIZE, BLOCKSIZE>>>(d_barOut, d_barIn);
 95 | 	barTimer.Stop();
 96 | 
 97 | 	checkCudaErrors(cudaMemcpy(fooOut, d_fooOut, numBytes, cudaMemcpyDeviceToHost));
 98 | 	checkCudaErrors(cudaMemcpy(barOut, d_barOut, numBytes, cudaMemcpyDeviceToHost));
 99 | 	printf("foo<<<>>>(): %g ms elapsed. Verifying solution...", fooTimer.Elapsed());
100 | 	compareArrays(ref_fooOut, fooOut, N);
101 | 	printf("bar<<<>>>(): %g ms elapsed. Verifying solution...", barTimer.Elapsed());
102 | 	compareArrays(ref_barOut, barOut, N);
103 | }
104 | 


--------------------------------------------------------------------------------
/Lesson Code Snippets/Lesson 7 Code Snippets/tiling/utils.h:
--------------------------------------------------------------------------------
 1 | // error checking utility functions
 2 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
 3 | 
 4 | template<typename T>
 5 | void check(T err, const char* const func, const char* const file, const int line)
 6 | {
 7 |   if (err == cudaSuccess) return;   // nothing to report
 8 |   // print where the failing call was made and what went wrong, then quit
 9 |   fprintf(stderr, "CUDA error at: %s : %d\n", file,line);
10 |   fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);
11 |   exit(1);
12 | }
13 | 
14 | void printArray(float in[], int N)	// prints the first N values of in[] on one line
15 | {
16 | 	int k = 0;
17 | 	while (k < N) printf("%g ", in[k++]);
18 | 	printf("\n"); }
19 | 
20 | int compareArrays(float *ref, float *test, int N)
21 | {
22 | 	// scan the interior only; two elements at each end are not compared
23 | 	int bad = 2;
24 | 	while (bad < N-2 && ref[bad] == test[bad]) ++bad;
25 | 
26 | 	if (bad >= N-2)
27 | 	{
28 | 		printf("Verified!\n");
29 | 		return 0;
30 | 	}
31 | 	// report the first mismatch and dump both arrays
32 | 	printf("Error: solution does not match reference!\n");
33 | 	printf("first deviation at location %d\n", bad);
34 | 	printf("reference array:\n"); printArray(ref, N);
35 | 	printf("solution array:\n"); printArray(test, N);
36 | 	return 1; }


--------------------------------------------------------------------------------
/Lesson Slides/CS344_Lesson1_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Lesson Slides/CS344_Lesson1_Slides.pdf


--------------------------------------------------------------------------------
/Lesson Slides/CS344_Lesson2_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Lesson Slides/CS344_Lesson2_Slides.pdf


--------------------------------------------------------------------------------
/Lesson Slides/CS344_Lesson3_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Lesson Slides/CS344_Lesson3_Slides.pdf


--------------------------------------------------------------------------------
/Lesson Slides/CS344_Lesson4_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Lesson Slides/CS344_Lesson4_Slides.pdf


--------------------------------------------------------------------------------
/Lesson Slides/CS344_Lesson5_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Lesson Slides/CS344_Lesson5_Slides.pdf


--------------------------------------------------------------------------------
/Lesson Slides/CS344_Lesson6.1_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Lesson Slides/CS344_Lesson6.1_Slides.pdf


--------------------------------------------------------------------------------
/Lesson Slides/CS344_Lesson6.2_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Lesson Slides/CS344_Lesson6.2_Slides.pdf


--------------------------------------------------------------------------------
/Lesson Slides/CS344_Lesson7.1_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Lesson Slides/CS344_Lesson7.1_Slides.pdf


--------------------------------------------------------------------------------
/Lesson Slides/CS344_Lesson7.2_Slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Lesson Slides/CS344_Lesson7.2_Slides.pdf


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 1.zip


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # collect source files
 9 | 
10 | set(hdr compare.h reference_calc.h timer.h utils.h)  # listed explicitly (no glob) so added or removed files show up in diffs
11 | set(cu student_func.cu)
12 | set(HW1_files main.cpp reference_calc.cpp compare.cpp)  # HW1.cpp is #included by main.cpp, so it is not compiled separately
13 | 
14 | cuda_add_executable(HW1 ${HW1_files} ${hdr} ${cu})


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/HW1.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/core/core.hpp>
 2 | #include <opencv2/highgui/highgui.hpp>
 3 | #include <opencv2/opencv.hpp>
 4 | #include "utils.h"
 5 | #include <cuda.h>
 6 | #include <cuda_runtime.h>
 7 | #include <string>
 8 | 
 9 | cv::Mat imageRGBA;
10 | cv::Mat imageGrey;
11 | 
12 | uchar4        *d_rgbaImage__;
13 | unsigned char *d_greyImage__;
14 | 
15 | size_t numRows() { return static_cast<size_t>(imageRGBA.rows); } //row count of the global RGBA image
16 | size_t numCols() { return static_cast<size_t>(imageRGBA.cols); } //column count of the global RGBA image
17 | 
18 | //Loads `filename`, converts it to RGBA and allocates the grey-scale output.
19 | //Return type is void: any internal error is handled by quitting (exit(1)),
20 | //so there is no point in returning error codes.
21 | //Outputs: host pointers (*inputImage, *greyImage) into the global cv::Mat buffers
22 | //and device pointers (*d_rgbaImage, *d_greyImage) to freshly cudaMalloc'ed memory.
23 | void preProcess(uchar4 **inputImage, unsigned char **greyImage,
24 |                 uchar4 **d_rgbaImage, unsigned char **d_greyImage,
25 |                 const std::string &filename) {
26 |   //make sure the context initializes ok
27 |   checkCudaErrors(cudaFree(0));
28 | 
29 |   cv::Mat image;
30 |   image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
31 |   if (image.empty()) {
32 |     std::cerr << "Couldn't open file: " << filename << std::endl;
33 |     exit(1);
34 |   }
35 | 
36 |   cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
37 | 
38 |   //allocate memory for the output
39 |   imageGrey.create(image.rows, image.cols, CV_8UC1);
40 | 
41 |   //This shouldn't ever happen given the way the images are created
42 |   //at least based upon my limited understanding of OpenCV, but better to check
43 |   if (!imageRGBA.isContinuous() || !imageGrey.isContinuous()) {
44 |     std::cerr << "Images aren't continuous!! Exiting." << std::endl;
45 |     exit(1);
46 |   }
47 | 
48 |   *inputImage = (uchar4 *)imageRGBA.ptr<unsigned char>(0);
49 |   *greyImage  = imageGrey.ptr<unsigned char>(0);
50 | 
51 |   const size_t numPixels = numRows() * numCols();
52 |   //allocate memory on the device for both input and output
53 |   checkCudaErrors(cudaMalloc(d_rgbaImage, sizeof(uchar4) * numPixels));
54 |   checkCudaErrors(cudaMalloc(d_greyImage, sizeof(unsigned char) * numPixels));
55 |   checkCudaErrors(cudaMemset(*d_greyImage, 0, numPixels * sizeof(unsigned char))); //make sure no memory is left laying around
56 | 
57 |   //copy input array to the GPU
58 |   checkCudaErrors(cudaMemcpy(*d_rgbaImage, *inputImage, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
59 | 
60 |   d_rgbaImage__ = *d_rgbaImage; //remembered in globals so cleanup() can free them
61 |   d_greyImage__ = *d_greyImage;
62 | }
63 | 
64 | void postProcess(const std::string& output_file, unsigned char* data_ptr) {
65 |   //wrap the caller's buffer as a single-channel 8-bit image and write it out
66 |   const size_t rows = numRows();
67 |   const size_t cols = numCols();
68 |   cv::imwrite(output_file.c_str(), cv::Mat(rows, cols, CV_8UC1, (void*)data_ptr));
69 | }
70 | 
71 | void cleanup()
72 | {
73 |   //release the two device buffers recorded by preProcess()
74 |   cudaFree(d_greyImage__);
75 |   cudaFree(d_rgbaImage__);
76 | }
77 | 
78 | void generateReferenceImage(std::string input_filename, std::string output_filename)
79 | {
80 |   //load the input as a greyscale image via OpenCV
81 |   const cv::Mat greyscale = cv::imread(input_filename, CV_LOAD_IMAGE_GRAYSCALE);
82 |   //and store it unchanged under the output name
83 |   cv::imwrite(output_filename, greyscale);
84 | }
85 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=nvcc
 2 | 
 3 | ###################################
 4 | # These are the default install   #
 5 | # locations on most linux distros #
 6 | ###################################
 7 | 
 8 | OPENCV_LIBPATH=/usr/lib
 9 | OPENCV_INCLUDEPATH=/usr/include
10 | 
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 | 
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 | 
18 | # or if using MacPorts
19 | 
20 | #OPENCV_LIBPATH=/opt/local/lib
21 | #OPENCV_INCLUDEPATH=/opt/local/include
22 | 
23 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
24 | 
25 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
26 | 
27 | ######################################################
28 | # On Macs the default install locations are below    #
29 | # ####################################################
30 | 
31 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
32 | #CUDA_LIBPATH=/usr/local/cuda/lib
33 | 
34 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
35 | 
36 | GCC_OPTS=-O3 -Wall -Wextra -m64
37 | 
38 | student: main.o student_func.o compare.o reference_calc.o Makefile
39 | 	$(NVCC) -o HW1 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
40 | # main.cpp #includes HW1.cpp and the headers listed here
41 | main.o: main.cpp timer.h utils.h reference_calc.h compare.h HW1.cpp
42 | 	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH) -I $(OPENCV_INCLUDEPATH)
43 | 
44 | student_func.o: student_func.cu utils.h
45 | 	$(NVCC) -c student_func.cu $(NVCC_OPTS)
46 | 
47 | compare.o: compare.cpp compare.h utils.h
48 | 	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
49 | 
50 | reference_calc.o: reference_calc.cpp reference_calc.h
51 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
52 | # remove objects, generated images and the HW1 executable built by `student`
53 | clean:
54 | 	rm -f *.o *.png hw HW1
55 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/cinque_terre.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 1/cinque_terre.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/cinque_terre_small.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 1/cinque_terre_small.jpg


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/compare.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/core/core.hpp>
 2 | #include <opencv2/highgui/highgui.hpp>
 3 | #include <opencv2/opencv.hpp>
 4 | 
 5 | #include "utils.h"
 6 | 
 7 | void compareImages(std::string reference_filename, std::string test_filename, 
 8 |                    bool useEpsCheck, double perPixelError, double globalError)
 9 | {
10 |   //compare two image files: write a contrast-stretched difference image, then hand the raw bytes to the checkers
11 |   cv::Mat reference = cv::imread(reference_filename, -1);
12 |   cv::Mat test = cv::imread(test_filename, -1);
13 |   cv::Mat diff = abs(reference - test);
14 | 
15 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
16 | 
17 |   double minVal, maxVal;
18 | 
19 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
20 | 
21 |   //now perform transform so that we bump values to the full range;
22 |   //skipped when all differences are equal (e.g. identical images), which would divide by zero
23 |   if (maxVal > minVal) diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
24 | 
25 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
26 | 
27 |   cv::imwrite("HW1_differenceImage.png", diff);
28 |   //OK, now we can start comparing values...
29 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
30 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
31 | 
32 |   if (useEpsCheck) {
33 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
34 |   }
35 |   else
36 |   {
37 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
38 |   }
39 | 
40 |   std::cout << "PASS" << std::endl; //NOTE(review): printed whenever the checker returns; presumably it aborts on mismatch - confirm in utils.h
41 |   return;
42 | }
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef COMPARE_H__
2 | #define COMPARE_H__
3 | #include <string>   //std::string is used in the declaration below
4 | void compareImages(std::string reference_filename, std::string test_filename, 
5 |                    bool useEpsCheck, double perPixelError, double globalError);
6 | 
7 | #endif
8 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/main.cpp:
--------------------------------------------------------------------------------
 1 | //Udacity HW1 Solution
 2 | 
 3 | #include <iostream>
 4 | #include "timer.h"
 5 | #include "utils.h"
 6 | #include <string>
 7 | #include <stdio.h>
 8 | #include "reference_calc.h"
 9 | #include "compare.h"
10 | 
11 | void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, 
12 |                             uchar4 * const d_rgbaImage,
13 |                             unsigned char* const d_greyImage, 
14 |                             size_t numRows, size_t numCols);
15 | 
16 | //include the definitions of the above functions for this homework
17 | #include "HW1.cpp"
18 | 
19 | int main(int argc, char **argv) { //parses the command line, runs and times the student kernel, then checks it against the CPU reference
20 |   uchar4        *h_rgbaImage, *d_rgbaImage;
21 |   unsigned char *h_greyImage, *d_greyImage;
22 | 
23 |   std::string input_file;
24 |   std::string output_file;
25 |   std::string reference_file;
26 |   double perPixelError = 0.0;
27 |   double globalError   = 0.0;
28 |   bool useEpsCheck = false;
29 |   switch (argc) //argc includes the program name, so "case 2" means one user argument
30 |   {
31 | 	case 2:
32 | 	  input_file = std::string(argv[1]);
33 | 	  output_file = "HW1_output.png";
34 | 	  reference_file = "HW1_reference.png";
35 | 	  break;
36 | 	case 3:
37 | 	  input_file  = std::string(argv[1]);
38 |       output_file = std::string(argv[2]);
39 | 	  reference_file = "HW1_reference.png";
40 | 	  break;
41 | 	case 4:
42 | 	  input_file  = std::string(argv[1]);
43 |       output_file = std::string(argv[2]);
44 | 	  reference_file = std::string(argv[3]);
45 | 	  break;
46 | 	case 6: //both tolerances must be given together; this also switches on the epsilon check
47 | 	  useEpsCheck=true;
48 | 	  input_file  = std::string(argv[1]);
49 | 	  output_file = std::string(argv[2]);
50 | 	  reference_file = std::string(argv[3]);
51 | 	  perPixelError = atof(argv[4]);
52 |       globalError   = atof(argv[5]);
53 | 	  break;
54 | 	default:
55 |       std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
56 |       exit(1);
57 |   }
58 |   //load the image and give us our input and output pointers
59 |   preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);
60 | 
61 |   GpuTimer timer;
62 |   timer.Start();
63 |   //call the students' code
64 |   your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
65 |   timer.Stop();
66 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
67 | 
68 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
69 | 
70 |   if (err < 0) {
71 |     //Couldn't print! Probably the student closed stdout - bad news
72 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
73 |     exit(1);
74 |   }
75 | 
76 |   size_t numPixels = numRows()*numCols();
77 |   checkCudaErrors(cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, cudaMemcpyDeviceToHost));
78 | 
79 |   //write the GPU result to the output file
80 |   postProcess(output_file, h_greyImage);
81 | 
82 |   referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols()); //overwrites h_greyImage with the CPU reference result
83 | 
84 |   postProcess(reference_file, h_greyImage);
85 | 
86 |   //generateReferenceImage(input_file, reference_file);
87 |   compareImages(reference_file, output_file, useEpsCheck, perPixelError, 
88 |                 globalError);
89 | 
90 |   cleanup();
91 | 
92 |   return 0;
93 | }
94 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/reference_calc.cpp:
--------------------------------------------------------------------------------
 1 | // for uchar4 struct
 2 | #include <cuda_runtime.h>
 3 | 
 4 | void referenceCalculation(const uchar4* const rgbaImage,
 5 |                           unsigned char *const greyImage,
 6 |                           size_t numRows,
 7 |                           size_t numCols)
 8 | {
 9 |   //serial reference: weighted sum of the x, y and z channels for every pixel
10 |   const size_t pixelCount = numRows * numCols;
11 |   for (size_t p = 0; p < pixelCount; ++p) {
12 |     const uchar4 px = rgbaImage[p];
13 |     const float grey = .299f * px.x + .587f * px.y + .114f * px.z;
14 |     greyImage[p] = grey;   //implicit float -> unsigned char conversion
15 |   }
16 | }
17 | 
18 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 | #include <cuda_runtime.h>   // for uchar4 struct, so this header no longer depends on include order
4 | void referenceCalculation(const uchar4* const rgbaImage,
5 |                           unsigned char *const greyImage,
6 |                           size_t numRows,
7 |                           size_t numCols);
8 | 
9 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/student_func.cu:
--------------------------------------------------------------------------------
 1 | // Homework 1
 2 | // Color to Greyscale Conversion
 3 | 
 4 | //A common way to represent color images is known as RGBA - the color
 5 | //is specified by how much Red, Green and Blue is in it.
 6 | //The 'A' stands for Alpha and is used for transparency, it will be
 7 | //ignored in this homework.
 8 | 
 9 | //Each channel Red, Blue, Green and Alpha is represented by one byte.
10 | //Since we are using one byte for each color there are 256 different
11 | //possible values for each color.  This means we use 4 bytes per pixel.
12 | 
13 | //Greyscale images are represented by a single intensity value per pixel
14 | //which is one byte in size.
15 | 
16 | //To convert an image from color to grayscale one simple method is to
17 | //set the intensity to the average of the RGB channels.  But we will
18 | //use a more sophisticated method that takes into account how the eye 
19 | //perceives color and weights the channels unequally.
20 | 
21 | //The eye responds most strongly to green followed by red and then blue.
22 | //The NTSC (National Television System Committee) recommends the following
23 | //formula for color to greyscale conversion:
24 | 
25 | //I = .299f * R + .587f * G + .114f * B
26 | 
27 | //Notice the trailing f's on the numbers which indicate that they are 
28 | //single precision floating point constants and not double precision
29 | //constants.
30 | 
31 | //You should fill in the kernel as well as set the block and grid sizes
32 | //so that the entire image is processed.
33 | 
34 | #include "utils.h"
35 | 
36 | __global__
37 | void rgba_to_greyscale(const uchar4* const rgbaImage,
38 |                        unsigned char* const greyImage,
39 |                        int numRows, int numCols) //image dimensions; the host wrapper passes size_t values into these int parameters
40 | {
41 |   //TODO
42 |   //Fill in the kernel to convert from color to greyscale
43 |   //the mapping from components of a uchar4 to RGBA is:
44 |   // .x -> R ; .y -> G ; .z -> B ; .w -> A
45 |   //
46 |   //The output (greyImage) at each pixel should be the result of
47 |   //applying the formula: output = .299f * R + .587f * G + .114f * B;
48 |   //Note: We will be ignoring the alpha channel for this conversion
49 | 
50 |   //First create a mapping from the 2D block and grid locations
51 |   //to an absolute 2D location in the image, then use that to
52 |   //calculate a 1D offset
53 | }
54 | 
55 | void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
56 |                             unsigned char* const d_greyImage, size_t numRows, size_t numCols) //h_rgbaImage is not used by the launch below
57 | {
58 |   //You must fill in the correct sizes for the blockSize and gridSize
59 |   //currently only one block with one thread is being launched
60 |   const dim3 blockSize(1, 1, 1);  //TODO
61 |   const dim3 gridSize( 1, 1, 1);  //TODO
62 |   rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
63 |   
64 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); //wait for the kernel to finish, then pass the last CUDA error to the checker
65 | 
66 | }
67 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 1/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | template<typename T>
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 |   if (err != cudaSuccess) {
17 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
18 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
19 |     exit(1);
20 |   }
21 | }
22 | 
//Requires the GPU result to match the reference element for element.
//Prints the first mismatching position with both values and exits with
//status 1; returns normally when all numElem elements agree.
template<typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
  //advance to the first position where the two arrays disagree
  size_t pos = 0;
  while (pos < numElem && !(ref[pos] != gpu[pos])) {
    ++pos;
  }

  if (pos == numElem) {
    return;  //no difference found
  }

  std::cerr << "Difference at pos " << pos << std::endl;
  //unary + promotes char types to int so they print as numbers,
  //and leaves other types unchanged
  std::cerr << "Reference: " << std::setprecision(17) << +ref[pos] <<
             "\nGPU      : " << +gpu[pos] << std::endl;
  exit(1);
}
37 | 
//Compares the GPU result with the reference, allowing small deviations.
//  eps1 - largest allowed difference for a single element; the first element
//         exceeding it is reported and the program exits with status 1
//  eps2 - largest allowed fraction (0..1) of elements that differ at all;
//         exceeding it is reported and the program exits with status 1
//Returns normally when both limits are respected.
template<typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
  assert(eps1 >= 0 && eps2 >= 0);

  if (numElem == 0) {
    return;  //nothing to compare; also avoids dividing by zero below
  }

  size_t numSmallDifferences = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger in case of unsigned types
    const T smaller = std::min(ref[i], gpu[i]);
    const T larger = std::max(ref[i], gpu[i]);
    const T diff = larger - smaller;

    if (diff > eps1) {
      std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
        "\nGPU      : " << +gpu[i] << std::endl;
      exit(1);
    }
    if (diff > 0) {
      ++numSmallDifferences;  //differs, but within the per-element tolerance
    }
  }

  const double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
  if (percentSmallDifferences > eps2) {
    std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
    std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
    exit(1);
  }
}
66 | 
//Uses the autodesk method of image comparison.
//An element is "bad" when it differs from the reference by more than
//variance. Note that the tolerance here is in PIXELS (a count of bad
//elements), not a percentage of the input pixels. Exceeding it prints a
//message and exits with status 1.
template<typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
  size_t badCount = 0;
  const T* const refEnd = ref + numElem;
  for (const T *r = ref, *g = gpu; r != refEnd; ++r, ++g) {
    //larger minus smaller keeps the difference valid for unsigned types
    const T delta = std::max(*r, *g) - std::min(*r, *g);
    if (delta > variance) {
      ++badCount;
    }
  }

  if (badCount <= tolerance) {
    return;
  }

  std::cerr << "Too many bad pixels in the image." << badCount << "/" << tolerance << std::endl;
  exit(1);
}
87 | 
88 | #endif
89 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 2.zip


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # collect source files
 9 | 
10 | file( GLOB  hdr *.hpp *.h )
11 | file( GLOB  cu  *.cu)
12 | SET (HW2_files main.cpp reference_calc.cpp compare.cpp)
13 |     
14 | CUDA_ADD_EXECUTABLE(HW2 ${HW2_files} ${hdr} ${cu})
15 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/HW2.cpp:
--------------------------------------------------------------------------------
  1 | #include <opencv2/core/core.hpp>
  2 | #include <opencv2/highgui/highgui.hpp>
  3 | #include <opencv2/opencv.hpp>
  4 | #include "utils.h"
  5 | #include <cuda.h>
  6 | #include <cuda_runtime.h>
  7 | #include <string>
  8 | 
  9 | cv::Mat imageInputRGBA;
 10 | cv::Mat imageOutputRGBA;
 11 | 
 12 | uchar4 *d_inputImageRGBA__;
 13 | uchar4 *d_outputImageRGBA__;
 14 | 
 15 | float *h_filter__;
 16 | 
 17 | size_t numRows() { return imageInputRGBA.rows; }
 18 | size_t numCols() { return imageInputRGBA.cols; }
 19 | 
 20 | //return types are void since any internal error will be handled by quitting
 21 | //no point in returning error codes...
 22 | //returns a pointer to an RGBA version of the input image
 23 | //and a pointer to the single channel grey-scale output
 24 | //on both the host and device
 25 | void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA,
 26 |                 uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA,
 27 |                 unsigned char **d_redBlurred,
 28 |                 unsigned char **d_greenBlurred,
 29 |                 unsigned char **d_blueBlurred,
 30 |                 float **h_filter, int *filterWidth,
 31 |                 const std::string &filename) {
 32 | 
 33 |   //make sure the context initializes ok
 34 |   checkCudaErrors(cudaFree(0));
 35 | 
 36 |   cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
 37 |   if (image.empty()) {
 38 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 39 |     exit(1);
 40 |   }
 41 | 
 42 |   cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);
 43 | 
 44 |   //allocate memory for the output
 45 |   imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);
 46 | 
 47 |   //This shouldn't ever happen given the way the images are created
 48 |   //at least based upon my limited understanding of OpenCV, but better to check
 49 |   if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) {
 50 |     std::cerr << "Images aren't continuous!! Exiting." << std::endl;
 51 |     exit(1);
 52 |   }
 53 | 
 54 |   *h_inputImageRGBA  = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0);
 55 |   *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0);
 56 | 
 57 |   const size_t numPixels = numRows() * numCols();
 58 |   //allocate memory on the device for both input and output
 59 |   checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels));
 60 |   checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels));
 61 |   checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around
 62 | 
 63 |   //copy input array to the GPU
 64 |   checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));
 65 | 
 66 |   d_inputImageRGBA__  = *d_inputImageRGBA;
 67 |   d_outputImageRGBA__ = *d_outputImageRGBA;
 68 | 
 69 |   //now create the filter that they will use
 70 |   const int blurKernelWidth = 9;
 71 |   const float blurKernelSigma = 2.;
 72 | 
 73 |   *filterWidth = blurKernelWidth;
 74 | 
 75 |   //create and fill the filter we will convolve with
 76 |   *h_filter = new float[blurKernelWidth * blurKernelWidth];
 77 |   h_filter__ = *h_filter;
 78 | 
 79 |   float filterSum = 0.f; //for normalization
 80 | 
 81 |   for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
 82 |     for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
 83 |       float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma));
 84 |       (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue;
 85 |       filterSum += filterValue;
 86 |     }
 87 |   }
 88 | 
 89 |   float normalizationFactor = 1.f / filterSum;
 90 | 
 91 |   for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
 92 |     for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
 93 |       (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor;
 94 |     }
 95 |   }
 96 | 
 97 |   //blurred
 98 |   checkCudaErrors(cudaMalloc(d_redBlurred,    sizeof(unsigned char) * numPixels));
 99 |   checkCudaErrors(cudaMalloc(d_greenBlurred,  sizeof(unsigned char) * numPixels));
100 |   checkCudaErrors(cudaMalloc(d_blueBlurred,   sizeof(unsigned char) * numPixels));
101 |   checkCudaErrors(cudaMemset(*d_redBlurred,   0, sizeof(unsigned char) * numPixels));
102 |   checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
103 |   checkCudaErrors(cudaMemset(*d_blueBlurred,  0, sizeof(unsigned char) * numPixels));
104 | }
105 | 
106 | void postProcess(const std::string& output_file, uchar4* data_ptr) {
107 |   cv::Mat output(numRows(), numCols(), CV_8UC4, (void*)data_ptr);
108 | 
109 |   cv::Mat imageOutputBGR;
110 |   cv::cvtColor(output, imageOutputBGR, CV_RGBA2BGR);
111 |   //output the image
112 |   cv::imwrite(output_file.c_str(), imageOutputBGR);
113 | }
114 | 
115 | void cleanUp(void)
116 | {
117 |   cudaFree(d_inputImageRGBA__);
118 |   cudaFree(d_outputImageRGBA__);
119 |   delete[] h_filter__;
120 | }
121 | 
122 | 
123 | // An unused bit of code showing how to accomplish this assignment using OpenCV.  It is much faster 
124 | //    than the naive implementation in reference_calc.cpp.
125 | void generateReferenceImage(std::string input_file, std::string reference_file, int kernel_size)
126 | {
127 | 	cv::Mat input = cv::imread(input_file);
128 | 	// Create an identical image for the output as a placeholder
129 | 	cv::Mat reference = cv::imread(input_file);
130 | 	cv::GaussianBlur(input, reference, cv::Size2i(kernel_size, kernel_size),0);
131 | 	cv::imwrite(reference_file, reference);
132 | }
133 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=nvcc
 2 | 
 3 | ###################################
 4 | # These are the default install   #
 5 | # locations on most linux distros #
 6 | ###################################
 7 | 
 8 | OPENCV_LIBPATH=/usr/lib
 9 | OPENCV_INCLUDEPATH=/usr/include
10 | 
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 | 
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 | 
18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
19 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
20 | 
21 | ######################################################
22 | # On Macs the default install locations are below    #
23 | # ####################################################
24 | 
25 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
26 | #CUDA_LIBPATH=/usr/local/cuda/lib
27 | 
28 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
29 | 
30 | GCC_OPTS=-O3 -Wall -Wextra -m64
31 | 
32 | student: main.o student_func.o compare.o reference_calc.o Makefile
33 | 	$(NVCC) -o HW2 main.o student_func.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
34 | 
35 | main.o: main.cpp timer.h utils.h HW2.cpp
36 | 	g++ -c main.cpp $(GCC_OPTS) -I $(OPENCV_INCLUDEPATH) -I $(CUDA_INCLUDEPATH)
37 | 
38 | student_func.o: student_func.cu reference_calc.cpp utils.h
39 | 	nvcc -c student_func.cu $(NVCC_OPTS)
40 | 
41 | compare.o: compare.cpp compare.h
42 | 	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
43 | 
44 | reference_calc.o: reference_calc.cpp reference_calc.h
45 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
46 | 
47 | clean:
48 | 	rm -f *.o *.png hw
49 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/cinque_terre.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 2/cinque_terre.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/cinque_terre_small.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 2/cinque_terre_small.jpg


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/compare.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/core/core.hpp>
 2 | #include <opencv2/highgui/highgui.hpp>
 3 | #include <opencv2/opencv.hpp>
 4 | 
 5 | #include "utils.h"
 6 | 
 7 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
 8 | 				   double perPixelError, double globalError)
 9 | {
10 |   cv::Mat reference = cv::imread(reference_filename, -1);
11 |   cv::Mat test = cv::imread(test_filename, -1);
12 | 
13 |   cv::Mat diff = abs(reference - test);
14 | 
15 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
16 | 
17 |   double minVal, maxVal;
18 | 
19 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
20 | 
21 |   //now perform transform so that we bump values to the full range
22 | 
23 |   diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
24 | 
25 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
26 | 
27 |   cv::imwrite("HW2_differenceImage.png", diff);
28 |   //OK, now we can start comparing values...
29 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
30 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
31 | 
32 |   if (useEpsCheck) {
33 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
34 |   }
35 |   else
36 |   {
37 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
38 |   }
39 | 
40 |   std::cout << "PASS" << std::endl;
41 |   return;
42 | }


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/compare.h:
--------------------------------------------------------------------------------
#ifndef COMPARE_H__
#define COMPARE_H__

//std::string is used in the declaration below; include it here so the
//header does not depend on what the including file pulled in first
#include <string>

//Compares the image in test_filename against the one in reference_filename.
//  useEpsCheck   - selects the tolerant comparison instead of the exact one
//  perPixelError - per-element tolerance for the tolerant comparison
//  globalError   - global limit for the tolerant comparison
void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
				   double perPixelError, double globalError);

#endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/main.cpp:
--------------------------------------------------------------------------------
  1 | //Udacity HW2 Driver
  2 | 
  3 | #include <iostream>
  4 | #include "timer.h"
  5 | #include "utils.h"
  6 | #include <string>
  7 | #include <stdio.h>
  8 | 
  9 | #include "reference_calc.h"
 10 | #include "compare.h"
 11 | 
 12 | //include the definitions of the above functions for this homework
 13 | #include "HW2.cpp"
 14 | 
 15 | 
 16 | /*******  DEFINED IN student_func.cu *********/
 17 | 
 18 | void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_inputImageRGBA,
 19 |                         uchar4* const d_outputImageRGBA,
 20 |                         const size_t numRows, const size_t numCols,
 21 |                         unsigned char *d_redBlurred,
 22 |                         unsigned char *d_greenBlurred,
 23 |                         unsigned char *d_blueBlurred,
 24 |                         const int filterWidth);
 25 | 
 26 | void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsImage,
 27 |                                 const float* const h_filter, const size_t filterWidth);
 28 | 
 29 | 
 30 | /*******  Begin main *********/
 31 | 
 32 | int main(int argc, char **argv) {
 33 |   uchar4 *h_inputImageRGBA,  *d_inputImageRGBA;
 34 |   uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
 35 |   unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;
 36 | 
 37 |   float *h_filter;
 38 |   int    filterWidth;
 39 | 
 40 |   std::string input_file;
 41 |   std::string output_file;
 42 |   std::string reference_file;
 43 |   double perPixelError = 0.0;
 44 |   double globalError   = 0.0;
 45 |   bool useEpsCheck = false;
 46 |   switch (argc)
 47 |   {
 48 | 	case 2:
 49 | 	  input_file = std::string(argv[1]);
 50 | 	  output_file = "HW2_output.png";
 51 | 	  reference_file = "HW2_reference.png";
 52 | 	  break;
 53 | 	case 3:
 54 | 	  input_file  = std::string(argv[1]);
 55 |       output_file = std::string(argv[2]);
 56 | 	  reference_file = "HW2_reference.png";
 57 | 	  break;
 58 | 	case 4:
 59 | 	  input_file  = std::string(argv[1]);
 60 |       output_file = std::string(argv[2]);
 61 | 	  reference_file = std::string(argv[3]);
 62 | 	  break;
 63 | 	case 6:
 64 | 	  useEpsCheck=true;
 65 | 	  input_file  = std::string(argv[1]);
 66 | 	  output_file = std::string(argv[2]);
 67 | 	  reference_file = std::string(argv[3]);
 68 | 	  perPixelError = atof(argv[4]);
 69 |       globalError   = atof(argv[5]);
 70 | 	  break;
 71 | 	default:
 72 |       std::cerr << "Usage: ./HW2 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
 73 |       exit(1);
 74 |   }
 75 |   //load the image and give us our input and output pointers
 76 |   preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
 77 |              &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
 78 |              &h_filter, &filterWidth, input_file);
 79 | 
 80 |   allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
 81 |   GpuTimer timer;
 82 |   timer.Start();
 83 |   //call the students' code
 84 |   your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA, numRows(), numCols(),
 85 |                      d_redBlurred, d_greenBlurred, d_blueBlurred, filterWidth);
 86 |   timer.Stop();
 87 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 88 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
 89 | 
 90 |   if (err < 0) {
 91 |     //Couldn't print! Probably the student closed stdout - bad news
 92 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
 93 |     exit(1);
 94 |   }
 95 | 
 96 |   //check results and output the blurred image
 97 | 
 98 |   size_t numPixels = numRows()*numCols();
 99 |   //copy the output back to the host
100 |   checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost));
101 | 
102 |   postProcess(output_file, h_outputImageRGBA);
103 | 
104 |   referenceCalculation(h_inputImageRGBA, h_outputImageRGBA,
105 |                        numRows(), numCols(),
106 |                        h_filter, filterWidth);
107 | 
108 |   postProcess(reference_file, h_outputImageRGBA);
109 | 
110 |     //  Cheater easy way with OpenCV
111 |     //generateReferenceImage(input_file, reference_file, filterWidth);
112 | 
113 |   compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
114 | 
115 |   checkCudaErrors(cudaFree(d_redBlurred));
116 |   checkCudaErrors(cudaFree(d_greenBlurred));
117 |   checkCudaErrors(cudaFree(d_blueBlurred));
118 | 
119 |   cleanUp();
120 | 
121 |   return 0;
122 | }
123 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/reference_calc.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <cassert>
 3 | // for uchar4 struct
 4 | #include <cuda_runtime.h>
 5 | 
 6 | void channelConvolution(const unsigned char* const channel,
 7 |                         unsigned char* const channelBlurred,
 8 |                         const size_t numRows, const size_t numCols,
 9 |                         const float *filter, const int filterWidth)
10 | {
11 |   //Dealing with an even width filter is trickier
12 |   assert(filterWidth % 2 == 1);
13 | 
14 |   //For every pixel in the image
15 |   for (int r = 0; r < (int)numRows; ++r) {
16 |     for (int c = 0; c < (int)numCols; ++c) {
17 |       float result = 0.f;
18 |       //For every value in the filter around the pixel (c, r)
19 |       for (int filter_r = -filterWidth/2; filter_r <= filterWidth/2; ++filter_r) {
20 |         for (int filter_c = -filterWidth/2; filter_c <= filterWidth/2; ++filter_c) {
21 |           //Find the global image position for this filter position
22 |           //clamp to boundary of the image
23 | 		  int image_r = std::min(std::max(r + filter_r, 0), static_cast<int>(numRows - 1));
24 |           int image_c = std::min(std::max(c + filter_c, 0), static_cast<int>(numCols - 1));
25 | 
26 |           float image_value = static_cast<float>(channel[image_r * numCols + image_c]);
27 |           float filter_value = filter[(filter_r + filterWidth/2) * filterWidth + filter_c + filterWidth/2];
28 | 
29 |           result += image_value * filter_value;
30 |         }
31 |       }
32 | 
33 |       channelBlurred[r * numCols + c] = result;
34 |     }
35 |   }
36 | }
37 | 
38 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage,
39 |                           size_t numRows, size_t numCols,
40 |                           const float* const filter, const int filterWidth)
41 | {
42 |   unsigned char *red   = new unsigned char[numRows * numCols];
43 |   unsigned char *blue  = new unsigned char[numRows * numCols];
44 |   unsigned char *green = new unsigned char[numRows * numCols];
45 | 
46 |   unsigned char *redBlurred   = new unsigned char[numRows * numCols];
47 |   unsigned char *blueBlurred  = new unsigned char[numRows * numCols];
48 |   unsigned char *greenBlurred = new unsigned char[numRows * numCols];
49 | 
50 |   //First we separate the incoming RGBA image into three separate channels
51 |   //for Red, Green and Blue
52 |   for (size_t i = 0; i < numRows * numCols; ++i) {
53 |     uchar4 rgba = rgbaImage[i];
54 |     red[i]   = rgba.x;
55 |     green[i] = rgba.y;
56 |     blue[i]  = rgba.z;
57 |   }
58 | 
59 |   //Now we can do the convolution for each of the color channels
60 |   channelConvolution(red, redBlurred, numRows, numCols, filter, filterWidth);
61 |   channelConvolution(green, greenBlurred, numRows, numCols, filter, filterWidth);
62 |   channelConvolution(blue, blueBlurred, numRows, numCols, filter, filterWidth);
63 | 
64 |   //now recombine into the output image - Alpha is 255 for no transparency
65 |   for (size_t i = 0; i < numRows * numCols; ++i) {
66 |     uchar4 rgba = make_uchar4(redBlurred[i], greenBlurred[i], blueBlurred[i], 255);
67 |     outputImage[i] = rgba;
68 |   }
69 | 
70 |   delete[] red;
71 |   delete[] green;
72 |   delete[] blue;
73 | 
74 |   delete[] redBlurred;
75 |   delete[] greenBlurred;
76 |   delete[] blueBlurred;
77 | }
78 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/reference_calc.h:
--------------------------------------------------------------------------------
1 | #ifndef REFERENCE_H__
2 | #define REFERENCE_H__
3 | 
4 | void referenceCalculation(const uchar4* const rgbaImage, uchar4 *const outputImage,
5 |                           size_t numRows, size_t numCols,
6 |                           const float* const filter, const int filterWidth);
7 | 
8 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 2/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <algorithm>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | template<typename T>
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 |   if (err != cudaSuccess) {
17 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
18 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
19 |     exit(1);
20 |   }
21 | }
22 | 
//Requires the GPU result to match the reference element for element.
//Prints the first mismatching position with both values and exits with
//status 1; returns normally when all numElem elements agree.
template<typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
  //advance to the first position where the two arrays disagree
  size_t pos = 0;
  while (pos < numElem && !(ref[pos] != gpu[pos])) {
    ++pos;
  }

  if (pos == numElem) {
    return;  //no difference found
  }

  std::cerr << "Difference at pos " << pos << std::endl;
  //unary + promotes char types to int so they print as numbers,
  //and leaves other types unchanged
  std::cerr << "Reference: " << std::setprecision(17) << +ref[pos] <<
             "\nGPU      : " << +gpu[pos] << std::endl;
  exit(1);
}
37 | 
//Compares the GPU result with the reference, allowing small deviations.
//  eps1 - largest allowed difference for a single element; the first element
//         exceeding it is reported and the program exits with status 1
//  eps2 - largest allowed fraction (0..1) of elements that differ at all;
//         exceeding it is reported and the program exits with status 1
//Returns normally when both limits are respected.
template<typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
  assert(eps1 >= 0 && eps2 >= 0);

  if (numElem == 0) {
    return;  //nothing to compare; also avoids dividing by zero below
  }

  size_t numSmallDifferences = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger in case of unsigned types
    const T smaller = std::min(ref[i], gpu[i]);
    const T larger = std::max(ref[i], gpu[i]);
    const T diff = larger - smaller;

    if (diff > eps1) {
      std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
        "\nGPU      : " << +gpu[i] << std::endl;
      exit(1);
    }
    if (diff > 0) {
      ++numSmallDifferences;  //differs, but within the per-element tolerance
    }
  }

  const double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
  if (percentSmallDifferences > eps2) {
    std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
    std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
    exit(1);
  }
}
66 | 
//Uses the autodesk method of image comparison.
//An element is "bad" when it differs from the reference by more than
//variance. Note that the tolerance here is in PIXELS (a count of bad
//elements), not a percentage of the input pixels. Exceeding it prints a
//message and exits with status 1.
template<typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
  size_t badCount = 0;
  const T* const refEnd = ref + numElem;
  for (const T *r = ref, *g = gpu; r != refEnd; ++r, ++g) {
    //larger minus smaller keeps the difference valid for unsigned types
    const T delta = std::max(*r, *g) - std::min(*r, *g);
    if (delta > variance) {
      ++badCount;
    }
  }

  if (badCount <= tolerance) {
    return;
  }

  std::cerr << "Too many bad pixels in the image." << badCount << "/" << tolerance << std::endl;
  exit(1);
}
87 | 
88 | #endif
89 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3.zip


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | # minimum required cmake version
 8 | cmake_minimum_required(VERSION 2.8)
 9 | find_package(CUDA QUIET REQUIRED)
10 | 
11 | SET (compare_files compare.cpp)
12 | 
13 | file( GLOB  hdr *.hpp *.h )
14 | file( GLOB  cu  *.cu)
15 | SET (HW3_files main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp)
16 |     
17 | CUDA_ADD_EXECUTABLE(HW3 ${HW3_files} ${hdr} ${cu})
18 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=nvcc
 2 | 
 3 | ###################################
 4 | # These are the default install   #
 5 | # locations on most linux distros #
 6 | ###################################
 7 | 
 8 | OPENCV_LIBPATH=/usr/lib
 9 | OPENCV_INCLUDEPATH=/usr/include
10 | 
11 | ###################################################
12 | # On Macs the default install locations are below #
13 | ###################################################
14 | 
15 | #OPENCV_LIBPATH=/usr/local/lib
16 | #OPENCV_INCLUDEPATH=/usr/local/include
17 | 
18 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
19 | 
20 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
21 | 
22 | ######################################################
23 | # On Macs the default install locations are below    #
24 | # ####################################################
25 | 
26 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
27 | #CUDA_LIBPATH=/usr/local/cuda/lib
28 | 
29 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
30 | 
31 | GCC_OPTS=-O3 -Wall -Wextra -m64
32 | 
33 | student: main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o Makefile
34 | 	$(NVCC) -o HW3 main.o student_func.o HW3.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
35 | 
36 | main.o: main.cpp timer.h utils.h reference_calc.h compare.h
37 | 	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
38 | 
39 | HW3.o: HW3.cu loadSaveImage.h utils.h
40 | 	$(NVCC) -c HW3.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS)
41 | 
42 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h
43 | 	g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
44 | 
45 | compare.o: compare.cpp compare.h
46 | 	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
47 | 
48 | reference_calc.o: reference_calc.cpp reference_calc.h
49 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
50 | 
51 | student_func.o: student_func.cu utils.h
52 | 	$(NVCC) -c student_func.cu $(NVCC_OPTS)
53 | 
54 | clean:
55 | 	rm -f *.o hw
56 | 	find . -type f -name '*.exr' | grep -v memorial | xargs rm -f
57 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/compare.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/opencv.hpp>
 2 | #include "utils.h"
 3 | 
 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
 5 | 				   double perPixelError, double globalError)
 6 | {
 7 |   cv::Mat reference = cv::imread(reference_filename, -1);
 8 |   cv::Mat test = cv::imread(test_filename, -1);
 9 | 
10 |   cv::Mat diff = abs(reference - test);
11 | 
12 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
13 | 
14 |   double minVal, maxVal;
15 | 
16 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
17 | 
18 |   //now perform transform so that we bump values to the full range
19 | 
20 |   diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
21 | 
22 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
23 | 
24 |   cv::imwrite("HW3_differenceImage.png", diff);
25 |   //OK, now we can start comparing values...
26 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
27 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
28 | 
29 |   if (useEpsCheck) {
30 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
31 |   }
32 |   else
33 |   {
34 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
35 |   }
36 | 
37 |   std::cout << "PASS" << std::endl;
38 |   return;
39 | }
40 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/compare.h:
--------------------------------------------------------------------------------
#ifndef HW3_H__
#define HW3_H__

#include <string> //the declaration below uses std::string

// Compares the image file test_filename against reference_filename.
// With useEpsCheck == false the images must match exactly; otherwise
// perPixelError and globalError are handed to the tolerance-based check.
void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
                   double perPixelError, double globalError);

#endif
8 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/loadSaveImage.cpp:
--------------------------------------------------------------------------------
  1 | #include <opencv2/core/core.hpp>
  2 | #include <opencv2/highgui/highgui.hpp>
  3 | #include <opencv2/opencv.hpp>
  4 | #include <vector>
  5 | #include <stdio.h>
  6 | #include "cuda_runtime.h"
  7 | 
  8 | //The caller becomes responsible for the returned pointer. This
  9 | //is done in the interest of keeping this code as simple as possible.
 10 | //In production code this is a bad idea - we should use RAII
 11 | //to ensure the memory is freed.  DO NOT COPY THIS AND USE IN PRODUCTION
 12 | //CODE!!!
 13 | void loadImageHDR(const std::string &filename,
 14 |                   float **imagePtr,
 15 |                   size_t *numRows, size_t *numCols)
 16 | {
 17 |     cv::Mat originImg = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);
 18 | 
 19 |     cv::Mat image;
 20 | 
 21 |     if(originImg.type() != CV_32FC3){
 22 |       originImg.convertTo(image,CV_32FC3);
 23 |     } else{
 24 |       image = originImg;
 25 |     }
 26 | 
 27 |   if (image.empty()) {
 28 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 29 |     exit(1);
 30 |   }
 31 | 
 32 |   if (image.channels() != 3) {
 33 |     std::cerr << "Image must be color!" << std::endl;
 34 |     exit(1);
 35 |   }
 36 | 
 37 |   if (!image.isContinuous()) {
 38 |     std::cerr << "Image isn't continuous!" << std::endl;
 39 |     exit(1);
 40 |   }
 41 | 
 42 |   *imagePtr = new float[image.rows * image.cols * image.channels()];
 43 | 
 44 |   float *cvPtr = image.ptr<float>(0);
 45 |   for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i)
 46 |     (*imagePtr)[i] = cvPtr[i];
 47 | 
 48 |   *numRows = image.rows;
 49 |   *numCols = image.cols;
 50 | }
 51 | 
 52 | void loadImageRGBA(const std::string &filename,
 53 |                    uchar4 **imagePtr,
 54 |                    size_t *numRows, size_t *numCols)
 55 | {
 56 |   cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
 57 |   if (image.empty()) {
 58 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 59 |     exit(1);
 60 |   }
 61 | 
 62 |   if (image.channels() != 3) {
 63 |     std::cerr << "Image must be color!" << std::endl;
 64 |     exit(1);
 65 |   }
 66 | 
 67 |   if (!image.isContinuous()) {
 68 |     std::cerr << "Image isn't continuous!" << std::endl;
 69 |     exit(1);
 70 |   }
 71 | 
 72 |   cv::Mat imageRGBA;
 73 |   cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
 74 | 
 75 |   *imagePtr = new uchar4[image.rows * image.cols];
 76 | 
 77 |   unsigned char *cvPtr = imageRGBA.ptr<unsigned char>(0);
 78 |   for (size_t i = 0; i < image.rows * image.cols; ++i) {
 79 |     (*imagePtr)[i].x = cvPtr[4 * i + 0];
 80 |     (*imagePtr)[i].y = cvPtr[4 * i + 1];
 81 |     (*imagePtr)[i].z = cvPtr[4 * i + 2];
 82 |     (*imagePtr)[i].w = cvPtr[4 * i + 3];
 83 |   }
 84 | 
 85 |   *numRows = image.rows;
 86 |   *numCols = image.cols;
 87 | }
 88 | 
 89 | void saveImageRGBA(const uchar4* const image,
 90 |                    const size_t numRows, const size_t numCols,
 91 |                    const std::string &output_file)
 92 | {
 93 |   int sizes[2];
 94 |   sizes[0] = numRows;
 95 |   sizes[1] = numCols;
 96 |   cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image);
 97 |   cv::Mat imageOutputBGR;
 98 |   cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR);
 99 |   //output the image
100 |   cv::imwrite(output_file.c_str(), imageOutputBGR);
101 | }
102 | 
103 | //output an exr file
104 | //assumed to already be BGR
105 | void saveImageHDR(const float* const image,
106 |                   const size_t numRows, const size_t numCols,
107 |                   const std::string &output_file)
108 | {
109 |   int sizes[2];
110 |   sizes[0] = numRows;
111 |   sizes[1] = numCols;
112 | 
113 |   cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image);
114 | 
115 |   imageHDR = imageHDR * 255;
116 | 
117 |   cv::imwrite(output_file.c_str(), imageHDR);
118 | }
119 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/loadSaveImage.h:
--------------------------------------------------------------------------------
 1 | #ifndef LOADSAVEIMAGE_H__
 2 | #define LOADSAVEIMAGE_H__
 3 | 
 4 | #include <string>
 5 | #include <cuda_runtime.h> //for uchar4
 6 | 
 7 | void loadImageHDR(const std::string &filename,
 8 |                   float **imagePtr,
 9 |                   size_t *numRows, size_t *numCols);
10 | 
11 | void loadImageRGBA(const std::string &filename,
12 |                    uchar4 **imagePtr,
13 |                    size_t *numRows, size_t *numCols);
14 | 
15 | void saveImageRGBA(const uchar4* const image,
16 |                    const size_t numRows, const size_t numCols,
17 |                    const std::string &output_file);
18 | 
19 | void saveImageHDR(const float* const image,
20 |                   const size_t numRows, const size_t numCols,
21 |                   const std::string &output_file);
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/main.cpp:
--------------------------------------------------------------------------------
  1 | //Udacity HW3 Driver
  2 | 
  3 | #include <iostream>
  4 | #include "timer.h"
  5 | #include "utils.h"
  6 | #include <string>
  7 | #include <stdio.h>
  8 | #include <algorithm>
  9 | 
 10 | #include "compare.h"
 11 | #include "reference_calc.h"
 12 | 
 13 | // Functions from HW3.cu
 14 | void preProcess(float **d_luminance, unsigned int **d_cdf,
 15 |                 size_t *numRows, size_t *numCols, unsigned int *numBins,
 16 |                 const std::string& filename);
 17 | 
 18 | void postProcess(const std::string& output_file, size_t numRows, size_t numCols,
 19 |                  float min_logLum, float max_logLum);
 20 | 
 21 | void cleanupGlobalMemory(void);
 22 | 
 23 | // Function from student_func.cu
 24 | void your_histogram_and_prefixsum(const float* const d_luminance,
 25 |                                   unsigned int* const d_cdf,
 26 |                                   float &min_logLum,
 27 |                                   float &max_logLum,
 28 |                                   const size_t numRows,
 29 |                                   const size_t numCols,
 30 |                                   const size_t numBins);
 31 | 
 32 | 
 33 | int main(int argc, char **argv) {
 34 |   float *d_luminance;
 35 |   unsigned int *d_cdf;
 36 | 
 37 |   size_t numRows, numCols;
 38 |   unsigned int numBins;
 39 | 
 40 |   std::string input_file;
 41 |   std::string output_file;
 42 |   std::string reference_file;
 43 |   double perPixelError = 0.0;
 44 |   double globalError   = 0.0;
 45 |   bool useEpsCheck = false;
 46 | 
 47 |   switch (argc)
 48 |   {
 49 | 	case 2:
 50 | 	  input_file = std::string(argv[1]);
 51 | 	  output_file = "HW3_output.png";
 52 | 	  reference_file = "HW3_reference.png";
 53 | 	  break;
 54 | 	case 3:
 55 | 	  input_file  = std::string(argv[1]);
 56 |       output_file = std::string(argv[2]);
 57 | 	  reference_file = "HW3_reference.png";
 58 | 	  break;
 59 | 	case 4:
 60 | 	  input_file  = std::string(argv[1]);
 61 |       output_file = std::string(argv[2]);
 62 | 	  reference_file = std::string(argv[3]);
 63 | 	  break;
 64 | 	case 6:
 65 | 	  useEpsCheck=true;
 66 | 	  input_file  = std::string(argv[1]);
 67 | 	  output_file = std::string(argv[2]);
 68 | 	  reference_file = std::string(argv[3]);
 69 | 	  perPixelError = atof(argv[4]);
 70 |       globalError   = atof(argv[5]);
 71 | 	  break;
 72 | 	default:
 73 |       std::cerr << "Usage: ./HW3 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
 74 |       exit(1);
 75 |   }
 76 |   //load the image and give us our input and output pointers
 77 |   preProcess(&d_luminance, &d_cdf,
 78 |              &numRows, &numCols, &numBins, input_file);
 79 | 
 80 |   GpuTimer timer;
 81 |   float min_logLum, max_logLum;
 82 |   min_logLum = 0.f;
 83 |   max_logLum = 1.f;
 84 |   timer.Start();
 85 |   //call the students' code
 86 |   your_histogram_and_prefixsum(d_luminance, d_cdf, min_logLum, max_logLum,
 87 |                                numRows, numCols, numBins);
 88 |   timer.Stop();
 89 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 90 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
 91 | 
 92 |   if (err < 0) {
 93 |     //Couldn't print! Probably the student closed stdout - bad news
 94 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
 95 |     exit(1);
 96 |   }
 97 | 
 98 |   float *h_luminance = (float *) malloc(sizeof(float)*numRows*numCols);
 99 |   unsigned int *h_cdf = (unsigned int *) malloc(sizeof(unsigned int)*numBins);
100 | 
101 |   checkCudaErrors(cudaMemcpy(h_luminance, d_luminance, numRows*numCols*sizeof(float), cudaMemcpyDeviceToHost));
102 | 
103 |   //check results and output the tone-mapped image
104 |   postProcess(output_file, numRows, numCols, min_logLum, max_logLum);
105 | 
106 |   for (size_t i = 1; i < numCols * numRows; ++i) {
107 | 	min_logLum = std::min(h_luminance[i], min_logLum);
108 |     max_logLum = std::max(h_luminance[i], max_logLum);
109 |   }
110 | 
111 |   referenceCalculation(h_luminance, h_cdf, numRows, numCols, numBins, min_logLum, max_logLum);
112 | 
113 |   checkCudaErrors(cudaMemcpy(d_cdf, h_cdf, sizeof(unsigned int) * numBins, cudaMemcpyHostToDevice));
114 | 
115 |   //check results and output the tone-mapped image
116 |   postProcess(reference_file, numRows, numCols, min_logLum, max_logLum);
117 | 
118 |   cleanupGlobalMemory();
119 | 
120 |   compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
121 | 
122 |   return 0;
123 | }
124 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial.exr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial.exr


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_large.exr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_large.exr


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_png.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_png.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_png_large.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_png_large.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_raw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_raw.png


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/memorial_raw_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 3/memorial_raw_large.png


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/reference_calc.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <cassert>
 3 | 
 4 | void referenceCalculation(const float* const h_logLuminance, unsigned int* const h_cdf,
 5 |                           const size_t numRows, const size_t numCols, const size_t numBins, 
 6 | 						  float &logLumMin, float &logLumMax)
 7 | {
 8 |   logLumMin = h_logLuminance[0];
 9 |   logLumMax = h_logLuminance[0];
10 | 
11 |   //Step 1
12 |   //first we find the minimum and maximum across the entire image
13 |   for (size_t i = 1; i < numCols * numRows; ++i) {
14 |     logLumMin = std::min(h_logLuminance[i], logLumMin);
15 |     logLumMax = std::max(h_logLuminance[i], logLumMax);
16 |   }
17 | 
18 |   //Step 2
19 |   float logLumRange = logLumMax - logLumMin;
20 | 
21 |   //Step 3
22 |   //next we use the now known range to compute
23 |   //a histogram of numBins bins
24 |   unsigned int *histo = new unsigned int[numBins];
25 | 
26 |   for (size_t i = 0; i < numBins; ++i) histo[i] = 0;
27 | 
28 |   for (size_t i = 0; i < numCols * numRows; ++i) {
29 |     unsigned int bin = std::min(static_cast<unsigned int>(numBins - 1),
30 |                            static_cast<unsigned int>((h_logLuminance[i] - logLumMin) / logLumRange * numBins));
31 |     histo[bin]++;
32 |   }
33 | 
34 |   //Step 4
35 |   //finally we perform and exclusive scan (prefix sum)
36 |   //on the histogram to get the cumulative distribution
37 |   h_cdf[0] = 0;
38 |   for (size_t i = 1; i < numBins; ++i) {
39 |     h_cdf[i] = h_cdf[i - 1] + histo[i - 1];
40 |   }
41 | 
42 |   delete[] histo;
43 | }


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/reference_calc.h:
--------------------------------------------------------------------------------
#ifndef REFERENCE_H__
#define REFERENCE_H__

#include <stddef.h> //size_t

// CPU reference implementation: writes the exclusive prefix sum of the
// numBins-bin histogram of h_logLuminance (numRows * numCols values) to
// h_cdf and stores the smallest / largest input value in logLumMin /
// logLumMax.
void referenceCalculation(const float* const h_logLuminance, unsigned int* const h_cdf,
                          const size_t numRows, const size_t numCols, const size_t numBins,
                          float &logLumMin, float &logLumMax);

#endif
9 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/student_func.cu:
--------------------------------------------------------------------------------
  1 | /* Udacity Homework 3
  2 |    HDR Tone-mapping
  3 | 
  4 |   Background HDR
  5 |   ==============
  6 | 
  7 |   A High Dynamic Range (HDR) image contains a wider variation of intensity
  8 |   and color than is allowed by the RGB format with 1 byte per channel that we
  9 |   have used in the previous assignment.  
 10 | 
 11 |   To store this extra information we use single precision floating point for
 12 |   each channel.  This allows for an extremely wide range of intensity values.
 13 | 
 14 |   In the image for this assignment, the inside of a church with light coming in
 15 |   through stained glass windows, the raw input floating point values for the
 16 |   channels range from 0 to 275.  But the mean is .41 and 98% of the values are
 17 |   less than 3!  This means that certain areas (the windows) are extremely bright
 18 |   compared to everywhere else.  If we linearly map this [0-275] range into the
 19 |   [0-255] range that we have been using then most values will be mapped to zero!
 20 |   The only thing we will be able to see are the very brightest areas - the
 21 |   windows - everything else will appear pitch black.
 22 | 
 23 |   The problem is that although we have cameras capable of recording the wide
 24 |   range of intensity that exists in the real world our monitors are not capable
 25 |   of displaying them.  Our eyes are also quite capable of observing a much wider
 26 |   range of intensities than our image formats / monitors are capable of
 27 |   displaying.
 28 | 
 29 |   Tone-mapping is a process that transforms the intensities in the image so that
 30 |   the brightest values aren't nearly so far away from the mean.  That way when
 31 |   we transform the values into [0-255] we can actually see the entire image.
 32 |   There are many ways to perform this process and it is as much an art as a
 33 |   science - there is no single "right" answer.  In this homework we will
 34 |   implement one possible technique.
 35 | 
 36 |   Background Chrominance-Luminance
 37 |   ================================
 38 | 
 39 |   The RGB space that we have been using to represent images can be thought of as
 40 |   one possible set of axes spanning a three dimensional space of color.  We
 41 |   sometimes choose other axes to represent this space because they make certain
 42 |   operations more convenient.
 43 | 
 44 |   Another possible way of representing a color image is to separate the color
 45 |   information (chromaticity) from the brightness information.  There are
 46 |   multiple different methods for doing this - a common one during the analog
 47 |   television days was known as Chrominance-Luminance or YUV.
 48 | 
 49 |   We choose to represent the image in this way so that we can remap only the
 50 |   intensity channel and then recombine the new intensity values with the color
 51 |   information to form the final image.
 52 | 
 53 |   Old TV signals used to be transmitted in this way so that black & white
 54 |   televisions could display the luminance channel while color televisions would
 55 |   display all three of the channels.
 56 |   
 57 | 
 58 |   Tone-mapping
 59 |   ============
 60 | 
 61 |   In this assignment we are going to transform the luminance channel (actually
 62 |   the log of the luminance, but this is unimportant for the parts of the
 63 |   algorithm that you will be implementing) by compressing its range to [0, 1].
 64 |   To do this we need the cumulative distribution of the luminance values.
 65 | 
 66 |   Example
 67 |   -------
 68 | 
 69 |   input : [2 4 3 3 1 7 4 5 7 0 9 4 3 2]
 70 |   min / max / range: 0 / 9 / 9
 71 | 
 72 |   histo with 3 bins: [4 7 3]
 73 | 
 74 |   cdf (exclusive scan) : [0 4 11]
 75 | 
 76 | 
 77 |   Your task is to calculate this cumulative distribution by following these
 78 |   steps.
 79 | 
 80 | */
 81 | 
 82 | #include "utils.h"
 83 | 
// Student entry point, called once by the driver while the GPU timer runs.
//
//   d_logLuminance - input log-luminance values, numRows * numCols floats
//                    (device pointer, judging by the d_ prefix and the
//                    cudaMemcpy in the driver)
//   d_cdf          - output: cumulative distribution, numBins entries
//   min_logLum     - output: smallest input value
//   max_logLum     - output: largest input value
//   numRows/Cols   - image dimensions
//   numBins        - number of histogram bins
void your_histogram_and_prefixsum(const float* const d_logLuminance,
                                  unsigned int* const d_cdf,
                                  float &min_logLum,
                                  float &max_logLum,
                                  const size_t numRows,
                                  const size_t numCols,
                                  const size_t numBins)
{
  //TODO
  /*Here are the steps you need to implement
    1) find the minimum and maximum value in the input logLuminance channel
       store in min_logLum and max_logLum
    2) subtract them to find the range
    3) generate a histogram of all the values in the logLuminance channel using
       the formula: bin = (lum[i] - lumMin) / lumRange * numBins
       (the CPU reference clamps the result to numBins - 1, because the
       maximum value would otherwise land in bin numBins)
    4) Perform an exclusive scan (prefix sum) on the histogram to get
       the cumulative distribution of luminance values (this should go in the
       incoming d_cdf pointer which already has been allocated for you)       */


}
105 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 3/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | template<typename T>
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 |   if (err != cudaSuccess) {
17 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
18 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
19 |     exit(1);
20 |   }
21 | }
22 | 
// Requires the GPU result to match the CPU reference element for element.
// At the first mismatch both values are printed to std::cerr and the process
// exits with status 1; if the arrays agree the function just returns.
template<typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
  // locate the first position at which the two arrays disagree
  size_t pos = 0;
  while (pos < numElem && !(ref[pos] != gpu[pos]))
    ++pos;

  if (pos == numElem)
    return; // every element matched

  std::cerr << "Difference at pos " << pos << std::endl;
  //the + is magic to convert char to int without messing
  //with other types
  std::cerr << "Reference: " << std::setprecision(17) << +ref[pos] <<
             "\nGPU      : " << +gpu[pos] << std::endl;
  exit(1);
}
37 | 
// Tolerance-based comparison of the GPU result against the CPU reference.
//
//   ref, gpu - the two arrays, numElem elements each
//   eps1     - largest absolute difference allowed for any single element
//   eps2     - largest allowed FRACTION (0..1) of elements that differ at all
//
// Exits the process with status 1 (after printing a diagnostic to std::cerr)
// if any element differs by more than eps1, or if the fraction of elements
// with a non-zero difference exceeds eps2.  Returns normally otherwise.
template<typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
  assert(eps1 >= 0 && eps2 >= 0);

  // nothing to compare; also keeps the fraction below from becoming 0/0
  if (numElem == 0)
    return;

  size_t numSmallDifferences = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger in case of unsigned types
    const T smaller = std::min(ref[i], gpu[i]);
    const T larger = std::max(ref[i], gpu[i]);
    const T diff = larger - smaller;
    if (diff > eps1) {
      std::cerr << "Difference at pos " << i << " exceeds tolerance of " << eps1 << std::endl;
      //the + is magic to convert char to int without messing
      //with other types
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
        "\nGPU      : " << +gpu[i] << std::endl;
      exit(1);
    }
    else if (diff > 0) {
      numSmallDifferences++;
    }
  }

  const double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
  if (percentSmallDifferences > eps2) {
    std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
    std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
    exit(1);
  }
}
66 | 
//Uses the autodesk method of image comparison: counts the elements whose
//absolute difference exceeds `variance` and fails when there are too many.
//
//  ref, gpu  - the two arrays (reference result / GPU result), numElem elements each
//  variance  - largest per-element absolute difference that is still accepted
//  tolerance - maximum number of out-of-variance elements allowed.
//              Note that the tolerance is an absolute count of elements
//              (PIXELS), not a percentage of the input pixels.
//
//On failure a diagnostic is written to std::cerr and the process exits with
//status 1; otherwise the function just returns.
template<typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
  size_t numBadPixels = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger so that unsigned element types cannot wrap
    const T smaller = std::min(ref[i], gpu[i]);
    const T larger = std::max(ref[i], gpu[i]);
    const T diff = larger - smaller;
    if (diff > variance)
      ++numBadPixels;
  }

  if (numBadPixels > tolerance) {
    //separator added so the count no longer runs into the sentence
    std::cerr << "Too many bad pixels in the image: " << numBadPixels << "/" << tolerance << std::endl;
    exit(1);
  }
}
87 | 
88 | #endif
89 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4.zip


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # collect source files
 9 | 
# NOTE(review): this file neither calls cmake_minimum_required() nor locates
# CUDA/OpenCV itself; presumably it is only ever included from the parent
# CMakeLists.txt, which has to provide cuda_add_executable() - confirm.

# Every header and CUDA source in this directory is part of the target.
file(GLOB hdr *.hpp *.h)
file(GLOB cu *.cu)

# Host-side C++ sources.
set(HW4_files main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp)

# The former ${img} argument referred to a variable that is never set in this
# file (it expanded to nothing), so it has been dropped.
cuda_add_executable(HW4 ${HW4_files} ${hdr} ${cu})
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=/usr/local/cuda-5.0/bin/nvcc
 2 | #NVCC=nvcc
 3 | 
 4 | ###################################
 5 | # These are the default install   #
 6 | # locations on most linux distros #
 7 | ###################################
 8 | 
 9 | OPENCV_LIBPATH=/usr/lib
10 | OPENCV_INCLUDEPATH=/usr/include
11 | 
12 | ###################################################
13 | # On Macs the default install locations are below #
14 | ###################################################
15 | 
16 | #OPENCV_LIBPATH=/usr/local/lib
17 | #OPENCV_INCLUDEPATH=/usr/local/include
18 | 
19 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
20 | 
21 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
22 | # CUDA_INCLUDEPATH=/usr/local/cuda/lib64/include
23 | # CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
24 | # CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include
25 | 
26 | ######################################################
27 | # On Macs the default install locations are below    #
28 | # ####################################################
29 | 
30 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
31 | #CUDA_LIBPATH=/usr/local/cuda/lib
32 | CUDA_LIBPATH=/usr/local/cuda-5.0/lib64
33 | 
34 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
35 | 
36 | GCC_OPTS=-O3 -Wall -Wextra -m64
37 | 
38 | student: main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o Makefile
39 | 	$(NVCC) -o HW4 main.o student_func.o HW4.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
40 | 
41 | main.o: main.cpp timer.h utils.h reference_calc.h
42 | 	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
43 | 
44 | HW4.o: HW4.cu loadSaveImage.h utils.h
45 | 	$(NVCC) -c HW4.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS)
46 | 
47 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h
48 | 	g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
49 | 
50 | compare.o: compare.cpp compare.h
51 | 	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
52 | 
53 | reference_calc.o: reference_calc.cpp reference_calc.h
54 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
55 | 
56 | student_func.o: student_func.cu reference_calc.cpp utils.h
57 | 	$(NVCC) -c student_func.cu $(NVCC_OPTS)
58 | 
59 | clean:
60 | 	rm -f *.o *.png hw
61 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/compare.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/opencv.hpp>
 2 | #include "utils.h"
 3 | 
 4 | 
 5 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
 6 | 				   double perPixelError, double globalError)
 7 | {
 8 |   cv::Mat reference = cv::imread(reference_filename, -1);
 9 |   cv::Mat test = cv::imread(test_filename, -1);
10 | 
11 |   cv::Mat diff = abs(reference - test);
12 | 
13 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
14 | 
15 |   double minVal, maxVal;
16 | 
17 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
18 | 
19 |   //now perform transform so that we bump values to the full range
20 | 
21 |   diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
22 | 
23 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
24 | 
25 |   cv::imwrite("HW4_differenceImage.png", diff);
26 |   //OK, now we can start comparing values...
27 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
28 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
29 | 
30 |   if (useEpsCheck) {
31 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
32 |   }
33 |   else
34 |   {
35 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
36 |   }
37 | 
38 |   std::cout << "PASS" << std::endl;
39 |   return;
40 | }


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/compare.h:
--------------------------------------------------------------------------------
#ifndef HW4_H__
#define HW4_H__

#include <string> //std::string is part of the signature below

// Compares the image stored in test_filename against reference_filename.
// useEpsCheck selects a tolerance-based comparison (perPixelError and
// globalError are forwarded as the tolerances) instead of an exact one.
void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
				   double perPixelError, double globalError);

#endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/loadSaveImage.cpp:
--------------------------------------------------------------------------------
  1 | #include <opencv2/core/core.hpp>
  2 | #include <opencv2/highgui/highgui.hpp>
  3 | #include <opencv2/opencv.hpp>
  4 | #include <vector>
  5 | #include "cuda_runtime.h"
  6 | 
  7 | //The caller becomes responsible for the returned pointer. This
  8 | //is done in the interest of keeping this code as simple as possible.
  9 | //In production code this is a bad idea - we should use RAII
 10 | //to ensure the memory is freed.  DO NOT COPY THIS AND USE IN PRODUCTION
 11 | //CODE!!!
 12 | void loadImageHDR(const std::string &filename,
 13 |                   float **imagePtr,
 14 |                   size_t *numRows, size_t *numCols)
 15 | {
 16 |   cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);
 17 |   if (image.empty()) {
 18 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 19 |     exit(1);
 20 |   }
 21 | 
 22 |   if (image.channels() != 3) {
 23 |     std::cerr << "Image must be color!" << std::endl;
 24 |     exit(1);
 25 |   }
 26 | 
 27 |   if (!image.isContinuous()) {
 28 |     std::cerr << "Image isn't continuous!" << std::endl;
 29 |     exit(1);
 30 |   }
 31 | 
 32 |   *imagePtr = new float[image.rows * image.cols * image.channels()];
 33 | 
 34 |   float *cvPtr = image.ptr<float>(0);
 35 |   for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i)
 36 |     (*imagePtr)[i] = cvPtr[i];
 37 | 
 38 |   *numRows = image.rows;
 39 |   *numCols = image.cols;
 40 | }
 41 | 
 42 | void loadImageRGBA(const std::string &filename,
 43 |                    uchar4 **imagePtr,
 44 |                    size_t *numRows, size_t *numCols)
 45 | {
 46 |   cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
 47 |   if (image.empty()) {
 48 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 49 |     exit(1);
 50 |   }
 51 | 
 52 |   if (image.channels() != 3) {
 53 |     std::cerr << "Image must be color!" << std::endl;
 54 |     exit(1);
 55 |   }
 56 | 
 57 |   if (!image.isContinuous()) {
 58 |     std::cerr << "Image isn't continuous!" << std::endl;
 59 |     exit(1);
 60 |   }
 61 | 
 62 |   cv::Mat imageRGBA;
 63 |   cv::cvtColor(image, imageRGBA, CV_BGR2RGBA);
 64 | 
 65 |   *imagePtr = new uchar4[image.rows * image.cols];
 66 | 
 67 |   unsigned char *cvPtr = imageRGBA.ptr<unsigned char>(0);
 68 |   for (size_t i = 0; i < image.rows * image.cols; ++i) {
 69 |     (*imagePtr)[i].x = cvPtr[4 * i + 0];
 70 |     (*imagePtr)[i].y = cvPtr[4 * i + 1];
 71 |     (*imagePtr)[i].z = cvPtr[4 * i + 2];
 72 |     (*imagePtr)[i].w = cvPtr[4 * i + 3];
 73 |   }
 74 | 
 75 |   *numRows = image.rows;
 76 |   *numCols = image.cols;
 77 | }
 78 | 
 79 | void saveImageRGBA(const uchar4* const image,
 80 |                    const size_t numRows, const size_t numCols,
 81 |                    const std::string &output_file)
 82 | {
 83 |   int sizes[2];
 84 |   sizes[0] = numRows;
 85 |   sizes[1] = numCols;
 86 |   cv::Mat imageRGBA(2, sizes, CV_8UC4, (void *)image);
 87 |   cv::Mat imageOutputBGR;
 88 |   cv::cvtColor(imageRGBA, imageOutputBGR, CV_RGBA2BGR);
 89 |   //output the image
 90 |   cv::imwrite(output_file.c_str(), imageOutputBGR);
 91 | }
 92 | 
 93 | //output an exr file
 94 | //assumed to already be BGR
 95 | void saveImageHDR(const float* const image,
 96 |                   const size_t numRows, const size_t numCols,
 97 |                   const std::string &output_file)
 98 | {
 99 |   int sizes[2];
100 |   sizes[0] = numRows;
101 |   sizes[1] = numCols;
102 | 
103 |   cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image);
104 | 
105 |   imageHDR = imageHDR * 255;
106 | 
107 |   cv::imwrite(output_file.c_str(), imageHDR);
108 | }
109 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/loadSaveImage.h:
--------------------------------------------------------------------------------
 1 | #ifndef LOADSAVEIMAGE_H__
 2 | #define LOADSAVEIMAGE_H__
 3 | 
 4 | #include <string>
 5 | #include <cuda_runtime.h> //for uchar4
 6 | 
 7 | void loadImageHDR(const std::string &filename,
 8 |                   float **imagePtr,
 9 |                   size_t *numRows, size_t *numCols);
10 | 
11 | void loadImageRGBA(const std::string &filename,
12 |                    uchar4 **imagePtr,
13 |                    size_t *numRows, size_t *numCols);
14 | 
15 | void saveImageRGBA(const uchar4* const image,
16 |                    const size_t numRows, const size_t numCols,
17 |                    const std::string &output_file);
18 | 
19 | void saveImageHDR(const float* const image,
20 |                   const size_t numRows, const size_t numCols,
21 |                   const std::string &output_file);
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/main.cpp:
--------------------------------------------------------------------------------
  1 | //Udacity HW4 Driver
  2 | 
  3 | #include <iostream>
  4 | #include "timer.h"
  5 | #include "utils.h"
  6 | #include <string>
  7 | #include <stdio.h>
  8 | #include <thrust/host_vector.h>
  9 | #include <thrust/device_vector.h>
 10 | 
 11 | #include "compare.h"
 12 | #include "reference_calc.h"
 13 | 
 14 | void preProcess(unsigned int **inputVals,
 15 |                 unsigned int **inputPos,
 16 |                 unsigned int **outputVals,
 17 |                 unsigned int **outputPos,
 18 |                 size_t &numElems,
 19 |                 const std::string& filename,
 20 | 				const std::string& template_file);
 21 | 
 22 | void postProcess(const unsigned int* const outputVals,
 23 |                  const unsigned int* const outputPos,
 24 |                  const size_t numElems,
 25 |                  const std::string& output_file);
 26 | 
 27 | void your_sort(unsigned int* const inputVals,
 28 |                unsigned int* const inputPos,
 29 |                unsigned int* const outputVals,
 30 |                unsigned int* const outputPos,
 31 |                const size_t numElems);
 32 | 
 33 | int main(int argc, char **argv) {
 34 |   unsigned int *inputVals;
 35 |   unsigned int *inputPos;
 36 |   unsigned int *outputVals;
 37 |   unsigned int *outputPos;
 38 | 
 39 |   size_t numElems;
 40 | 
 41 |   std::string input_file;
 42 |   std::string template_file;
 43 |   std::string output_file;
 44 |   std::string reference_file;
 45 |   double perPixelError = 0.0;
 46 |   double globalError   = 0.0;
 47 |   bool useEpsCheck = false;
 48 | 
 49 |   switch (argc)
 50 |   {
 51 | 	case 3:
 52 | 	  input_file  = std::string(argv[1]);
 53 |       template_file = std::string(argv[2]);
 54 | 	  output_file = "HW4_output.png";
 55 | 	  break;
 56 | 	case 4:
 57 | 	  input_file  = std::string(argv[1]);
 58 |       template_file = std::string(argv[2]);
 59 | 	  output_file = std::string(argv[3]);
 60 | 	  break;
 61 | 	default:
 62 |           std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl;
 63 |           exit(1);
 64 |   }
 65 |   //load the image and give us our input and output pointers
 66 |   preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems, input_file, template_file);
 67 | 
 68 |   GpuTimer timer;
 69 |   timer.Start();
 70 | 
 71 |   //call the students' code
 72 |   your_sort(inputVals, inputPos, outputVals, outputPos, numElems);
 73 | 
 74 |   timer.Stop();
 75 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 76 |   printf("\n");
 77 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
 78 | 
 79 |   if (err < 0) {
 80 |     //Couldn't print! Probably the student closed stdout - bad news
 81 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
 82 |     exit(1);
 83 |   }
 84 | 
 85 |   //check results and output the red-eye corrected image
 86 |   postProcess(outputVals, outputPos, numElems, output_file);
 87 | 
 88 |   // check code moved from HW4.cu
 89 |   /****************************************************************************
 90 |   * You can use the code below to help with debugging, but make sure to       *
 91 |   * comment it out again before submitting your assignment for grading,       *
 92 |   * otherwise this code will take too much time and make it seem like your    *
 93 |   * GPU implementation isn't fast enough.                                     *
 94 |   *                                                                           *
 95 |   * This code MUST RUN BEFORE YOUR CODE in case you accidentally change       *
 96 |   * the input values when implementing your radix sort.                       *
 97 |   *                                                                           *
 98 |   * This code performs the reference radix sort on the host and compares your *
 99 |   * sorted values to the reference.                                           *
100 |   *                                                                           *
101 |   * Thrust containers are used for copying memory from the GPU                *
102 |   * ************************************************************************* */
103 |   thrust::device_ptr<unsigned int> d_inputVals(inputVals);
104 |   thrust::device_ptr<unsigned int> d_inputPos(inputPos);
105 | 
106 |   thrust::host_vector<unsigned int> h_inputVals(d_inputVals,
107 |                                                 d_inputVals+numElems);
108 |   thrust::host_vector<unsigned int> h_inputPos(d_inputPos,
109 |                                                d_inputPos + numElems);
110 | 
111 |   thrust::host_vector<unsigned int> h_outputVals(numElems);
112 |   thrust::host_vector<unsigned int> h_outputPos(numElems);
113 | 
114 |   reference_calculation(&h_inputVals[0], &h_inputPos[0],
115 | 						&h_outputVals[0], &h_outputPos[0],
116 | 						numElems);
117 | 
118 |   //postProcess(valsPtr, posPtr, numElems, reference_file);
119 | 
120 |   //compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
121 | 
122 |   thrust::device_ptr<unsigned int> d_outputVals(outputVals);
123 |   thrust::device_ptr<unsigned int> d_outputPos(outputPos);
124 | 
125 |   thrust::host_vector<unsigned int> h_yourOutputVals(d_outputVals,
126 |                                                      d_outputVals + numElems);
127 |   thrust::host_vector<unsigned int> h_yourOutputPos(d_outputPos,
128 |                                                     d_outputPos + numElems);
129 | 
130 |   checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems);
131 |   checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems);
132 | 
133 |   checkCudaErrors(cudaFree(inputVals));
134 |   checkCudaErrors(cudaFree(inputPos));
135 |   checkCudaErrors(cudaFree(outputVals));
136 |   checkCudaErrors(cudaFree(outputPos));
137 | 
138 |   return 0;
139 | }
140 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4/red_eye_effect.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4/red_eye_effect_5.jpg


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/red_eye_effect_template_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 4/red_eye_effect_template_5.jpg


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/reference_calc.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | // For memset
 3 | #include <cstring>
 4 | 
 5 | void reference_calculation(unsigned int* inputVals,
 6 |                            unsigned int* inputPos,
 7 |                            unsigned int* outputVals,
 8 |                            unsigned int* outputPos,
 9 |                            const size_t numElems)
10 | {
11 |   const int numBits = 1;
12 |   const int numBins = 1 << numBits;
13 | 
14 |   unsigned int *binHistogram = new unsigned int[numBins];
15 |   unsigned int *binScan      = new unsigned int[numBins];
16 | 
17 |   unsigned int *vals_src = inputVals;
18 |   unsigned int *pos_src  = inputPos;
19 | 
20 |   unsigned int *vals_dst = outputVals;
21 |   unsigned int *pos_dst  = outputPos;
22 | 
23 |   //a simple radix sort - only guaranteed to work for numBits that are multiples of 2
24 |   for (unsigned int i = 0; i < 8 * sizeof(unsigned int); i += numBits) {
25 |     unsigned int mask = (numBins - 1) << i;
26 | 
27 |     memset(binHistogram, 0, sizeof(unsigned int) * numBins); //zero out the bins
28 |     memset(binScan, 0, sizeof(unsigned int) * numBins); //zero out the bins
29 | 
30 |     //perform histogram of data & mask into bins
31 |     for (unsigned int j = 0; j < numElems; ++j) {
32 |       unsigned int bin = (vals_src[j] & mask) >> i;
33 |       binHistogram[bin]++;
34 |     }
35 | 
36 |     //perform exclusive prefix sum (scan) on binHistogram to get starting
37 |     //location for each bin
38 |     for (unsigned int j = 1; j < numBins; ++j) {
39 |       binScan[j] = binScan[j - 1] + binHistogram[j - 1];
40 |     }
41 | 
42 |     //Gather everything into the correct location
43 |     //need to move vals and positions
44 |     for (unsigned int j = 0; j < numElems; ++j) {
45 |       unsigned int bin = (vals_src[j] & mask) >> i;
46 |       vals_dst[binScan[bin]] = vals_src[j];
47 |       pos_dst[binScan[bin]]  = pos_src[j];
48 |       binScan[bin]++;
49 |     }
50 | 
51 |     //swap the buffers (pointers only)
52 |     std::swap(vals_dst, vals_src);
53 |     std::swap(pos_dst, pos_src);
54 |   }
55 | 
56 |   //we did an even number of iterations, need to copy from input buffer into output
57 |   std::copy(inputVals, inputVals + numElems, outputVals);
58 |   std::copy(inputPos, inputPos + numElems, outputPos);
59 | 
60 |   delete[] binHistogram;
61 |   delete[] binScan;
62 | }
63 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/reference_calc.h:
--------------------------------------------------------------------------------
 1 | #ifndef REFERENCE_H__
 2 | #define REFERENCE_H__
 3 | 
 4 | 
 5 | //A simple un-optimized reference radix sort calculation
 6 | //Only deals with power-of-2 radices
 7 | 
 8 | 
 9 | void reference_calculation(unsigned int* inputVals,
10 |                            unsigned int* inputPos,
11 |                            unsigned int* outputVals,
12 |                            unsigned int* outputPos,
13 |                            const size_t numElems);
14 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/student_func.cu:
--------------------------------------------------------------------------------
 1 | //Udacity HW 4
 2 | //Radix Sorting
 3 | 
 4 | #include "utils.h"
 5 | #include <thrust/host_vector.h>
 6 | 
 7 | /* Red Eye Removal
 8 |    ===============
 9 |    
10 |    For this assignment we are implementing red eye removal.  This is
11 |    accomplished by first creating a score for every pixel that tells us how
12 |    likely it is to be a red eye pixel.  We have already done this for you - you
13 |    are receiving the scores and need to sort them in ascending order so that we
14 |    know which pixels to alter to remove the red eye.
15 | 
16 |    Note: ascending order == smallest to largest
17 | 
18 |    Each score is associated with a position, when you sort the scores, you must
19 |    also move the positions accordingly.
20 | 
21 |    Implementing Parallel Radix Sort with CUDA
22 |    ==========================================
23 | 
24 |    The basic idea is to construct a histogram on each pass of how many of each
25 |    "digit" there are.   Then we scan this histogram so that we know where to put
26 |    the output of each digit.  For example, the first 1 must come after all the
27 |    0s so we have to know how many 0s there are to be able to start moving 1s
28 |    into the correct position.
29 | 
30 |    1) Histogram of the number of occurrences of each digit
31 |    2) Exclusive Prefix Sum of Histogram
32 |    3) Determine relative offset of each digit
33 |         For example [0 0 1 1 0 0 1]
34 |                 ->  [0 1 0 1 2 3 2]
35 |    4) Combine the results of steps 2 & 3 to determine the final
36 |       output location for each element and move it there
37 | 
38 |    LSB Radix sort is an out-of-place sort and you will need to ping-pong values
39 |    between the input and output buffers we have provided.  Make sure the final
40 |    sorted results end up in the output buffer!  Hint: You may need to do a copy
41 |    at the end.
42 | 
43 |  */
44 | 
45 | 
// Student entry point for the radix sort (currently an empty stub).
//
// The driver passes four buffers of numElems elements each: the values to
// sort, their positions, and the two buffers that must hold the sorted
// values and the correspondingly moved positions when this function returns.
void your_sort(unsigned int* const d_inputVals,
               unsigned int* const d_inputPos,
               unsigned int* const d_outputVals,
               unsigned int* const d_outputPos,
               const size_t numElems)
{
  //TODO
  //PUT YOUR SORT HERE
}
55 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;
 9 |   cudaEvent_t stop;
10 | 
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 | 
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 | 
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 | 
33 |   float Elapsed()
34 |   {
35 |     float elapsed;
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 4/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | template<typename T>
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 |   if (err != cudaSuccess) {
17 |     std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
18 |     std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
19 |     exit(1);
20 |   }
21 | }
22 | 
// Verifies that the first numElem entries of `gpu` equal those of `ref`.
// At the first mismatch the index and both values are printed to stderr and
// the process exits with status 1; otherwise the function simply returns.
template<typename T>
void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
  //locate the first differing position (numElem if there is none)
  size_t idx = 0;
  while (idx < numElem && !(ref[idx] != gpu[idx]))
    ++idx;

  if (idx == numElem)
    return;

  std::cerr << "Difference at pos " << idx << std::endl;
  //unary + promotes char types to int so they print as numbers, and
  //leaves other types alone
  std::cerr << "Reference: " << std::setprecision(17) << +ref[idx] <<
             "\nGPU      : " << +gpu[idx] << std::endl;
  exit(1);
}
37 | 
// Tolerance-based comparison of the first numElem entries of `ref` and `gpu`.
//
//   eps1 - largest absolute per-element difference that is tolerated; the
//          first element exceeding it is reported and the process exits
//          with status 1.
//   eps2 - largest tolerated fraction (0..1) of elements that differ at all
//          (by at most eps1); exceeding it exits with status 1.
//
// Both tolerances must be non-negative (checked with assert). An empty
// range always passes.
template<typename T>
void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
  assert(eps1 >= 0 && eps2 >= 0);

  if (numElem == 0)
    return; //nothing to compare; also avoids forming 0/0 below

  unsigned numSmallDifferences = 0;
  for (size_t i = 0; i < numElem; ++i) {
    //subtract smaller from larger in case of unsigned types
    const T diff = (ref[i] < gpu[i]) ? (gpu[i] - ref[i]) : (ref[i] - gpu[i]);
    if (diff > 0 && diff <= eps1) {
      numSmallDifferences++;
    }
    else if (diff > eps1) {
      std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
      std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
        "\nGPU      : " << +gpu[i] << std::endl;
      exit(1);
    }
  }

  //fraction (not percent) of elements with a small, non-zero difference
  double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
  if (percentSmallDifferences > eps2) {
    std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
    std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
    exit(1);
  }
}
66 | 
//Uses the autodesk method of image comparison
//Note that the tolerance here is in PIXELS, not a percentage of input pixels
//
//An element counts as "bad" when |ref - gpu| exceeds `variance`. If more
//than `tolerance` elements are bad, the counts are printed to stderr and the
//process exits with status 1.
template<typename T>
void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
{
  size_t numBadPixels = 0;

  for (size_t idx = 0; idx != numElem; ++idx) {
    //larger minus smaller, so unsigned element types cannot wrap around
    const T lo = std::min(ref[idx], gpu[idx]);
    const T hi = std::max(ref[idx], gpu[idx]);
    const T delta = hi - lo;
    numBadPixels += (delta > variance) ? 1 : 0;
  }

  if (numBadPixels <= tolerance)
    return;

  std::cerr << "Too many bad pixels in the image." << numBadPixels << "/" << tolerance << std::endl;
  exit(1);
}
87 | 
88 | #endif
89 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 5.zip


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # collect source files
 9 | 
# Headers are globbed only so that they show up next to the sources in
# generated IDE projects.
file(GLOB hdr *.hpp *.h)

# Translation units that make up the HW5 executable.
set(HW5_files
  main.cu
  student.cu
  reference_calc.cpp
)

cuda_add_executable(HW5 ${HW5_files} ${hdr})
15 | 
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=nvcc
 2 | NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
 3 | 
 4 | histo: main.cu reference_calc.o student.o Makefile
 5 | 	nvcc -o HW5 main.cu reference_calc.o student.o $(NVCC_OPTS)
 6 | 
 7 | student.o: student.cu
 8 | 	nvcc -c student.cu $(NVCC_OPTS)
 9 | 
10 | reference_calc.o: reference_calc.cpp reference_calc.h
11 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
12 | 
13 | clean:
14 | 	rm -f *.o hw *.bin
15 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/main.cu:
--------------------------------------------------------------------------------
  1 | #include <cstdlib>
  2 | #include <iostream>
  3 | #include <cstdio>
  4 | #include <fstream>
  5 | #include "utils.h"
  6 | #include "timer.h"
  7 | #include <cstdio>
  8 | #if defined(_WIN16) || defined(_WIN32) || defined(_WIN64)
  9 | #include <Windows.h>
 10 | #else
 11 | #include <sys/time.h>
 12 | #endif
 13 | 
 14 | #include <thrust/random/linear_congruential_engine.h>
 15 | #include <thrust/random/normal_distribution.h>
 16 | #include <thrust/random/uniform_int_distribution.h>
 17 | 
 18 | #include "reference_calc.h"
 19 | 
 20 | void computeHistogram(const unsigned int *const d_vals,
 21 |                       unsigned int* const d_histo,
 22 |                       const unsigned int numBins,
 23 |                       const unsigned int numElems);
 24 | 
 25 | int main(void)
 26 | {
 27 |   const unsigned int numBins = 1024;
 28 |   const unsigned int numElems = 10000 * numBins;
 29 |   const float stddev = 100.f;
 30 | 
 31 |   unsigned int *vals = new unsigned int[numElems];
 32 |   unsigned int *h_vals = new unsigned int[numElems];
 33 |   unsigned int *h_studentHisto = new unsigned int[numBins];
 34 |   unsigned int *h_refHisto = new unsigned int[numBins];
 35 | 
 36 | #if defined(_WIN16) || defined(_WIN32) || defined(_WIN64)
 37 |   srand(GetTickCount());
 38 | #else
 39 |   timeval tv;
 40 |   gettimeofday(&tv, NULL);
 41 | 
 42 |   srand(tv.tv_usec);
 43 | #endif
 44 | 
 45 |   //make the mean unpredictable, but close enough to the middle
 46 |   //so that timings are unaffected
 47 |   unsigned int mean = rand() % 100 + 462;
 48 | 
 49 |   //Output mean so that grading can happen with the same inputs
 50 |   std::cout << mean << std::endl;
 51 | 
 52 |   thrust::minstd_rand rng;
 53 | 
 54 |   thrust::random::experimental::normal_distribution<float> normalDist((float)mean, stddev);
 55 | 
 56 |   // Generate the random values
 57 |   for (size_t i = 0; i < numElems; ++i) {
 58 |     vals[i] = std::min((unsigned int) std::max((int)normalDist(rng), 0), numBins - 1);
 59 |   }
 60 | 
 61 |   unsigned int *d_vals, *d_histo;
 62 | 
 63 |   GpuTimer timer;
 64 | 
 65 |   checkCudaErrors(cudaMalloc(&d_vals,    sizeof(unsigned int) * numElems));
 66 |   checkCudaErrors(cudaMalloc(&d_histo,   sizeof(unsigned int) * numBins));
 67 |   checkCudaErrors(cudaMemset(d_histo, 0, sizeof(unsigned int) * numBins));
 68 | 
 69 |   checkCudaErrors(cudaMemcpy(d_vals, vals, sizeof(unsigned int) * numElems, cudaMemcpyHostToDevice));
 70 | 
 71 |   timer.Start();
 72 |   computeHistogram(d_vals, d_histo, numBins, numElems);
 73 |   timer.Stop();
 74 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
 75 | 
 76 |   if (err < 0) {
 77 |     //Couldn't print! Probably the student closed stdout - bad news
 78 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
 79 |     exit(1);
 80 |   }
 81 | 
 82 |   // copy the student-computed histogram back to the host
 83 |   checkCudaErrors(cudaMemcpy(h_studentHisto, d_histo, sizeof(unsigned int) * numBins, cudaMemcpyDeviceToHost));
 84 | 
 85 |   //generate reference for the given mean
 86 |   reference_calculation(vals, h_refHisto, numBins, numElems);
 87 | 
 88 |   //Now do the comparison
 89 |   checkResultsExact(h_refHisto, h_studentHisto, numBins);
 90 | 
 91 |   delete[] h_vals;
 92 |   delete[] h_refHisto;
 93 |   delete[] h_studentHisto;
 94 | 
 95 |   cudaFree(d_vals);
 96 |   cudaFree(d_histo);
 97 | 
 98 |   return 0;
 99 | }
100 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/reference_calc.cpp:
--------------------------------------------------------------------------------
 1 | #include <cstdlib>
 2 | //Reference Histogram calculation
 3 | 
 4 | void reference_calculation(const unsigned int* const vals,
 5 |                            unsigned int* const histo,
 6 |                            const size_t numBins,
 7 |                            const size_t numElems)
 8 | 
 9 | {
10 |   //zero out bins
11 |   for (size_t i = 0; i < numBins; ++i)
12 |     histo[i] = 0;
13 | 
14 |   //go through vals and increment appropriate bin
15 |   for (size_t i = 0; i < numElems; ++i)
16 |     histo[vals[i]]++;
17 | }
18 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/reference_calc.h:
--------------------------------------------------------------------------------
 1 | #ifndef REFERENCE_H__
 2 | #define REFERENCE_H__
 3 | 
 4 | //Reference Histogram calculation
 5 | 
 6 | void reference_calculation(const unsigned int* const vals,
 7 |                            unsigned int* const histo,
 8 |                            const size_t numBins,
 9 |                            const size_t numElems);
10 | 
11 | #endif


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/student.cu:
--------------------------------------------------------------------------------
 1 | /* Udacity HW5
 2 |    Histogramming for Speed
 3 | 
 4 |    The goal of this assignment is compute a histogram
 5 |    as fast as possible.  We have simplified the problem as much as
 6 |    possible to allow you to focus solely on the histogramming algorithm.
 7 | 
 8 |    The input values that you need to histogram are already the exact
 9 |    bins that need to be updated.  This is unlike in HW3 where you needed
10 |    to compute the range of the data and then do:
11 |    bin = (val - valMin) / valRange to determine the bin.
12 | 
13 |    Here the bin is just:
14 |    bin = val
15 | 
16 |    so the serial histogram calculation looks like:
17 |    for (i = 0; i < numElems; ++i)
18 |      histo[val[i]]++;
19 | 
20 |    That's it!  Your job is to make it run as fast as possible!
21 | 
22 |    The values are normally distributed - you may take
23 |    advantage of this fact in your implementation.
24 | 
25 | */
26 | 
27 | 
28 | #include "utils.h"
29 | 
30 | __global__
31 | void yourHisto(const unsigned int* const vals, //INPUT
32 |                unsigned int* const histo,      //OUTPUT
33 |                int numVals)
34 | {
35 |   //TODO fill in this kernel to calculate the histogram
36 |   //as quickly as possible
37 | 
38 |   //Although we provide only one kernel skeleton,
39 |   //feel free to use more if it will help you
40 |   //write faster code
41 | }
42 | 
43 | void computeHistogram(const unsigned int* const d_vals, //INPUT
44 |                       unsigned int* const d_histo,      //OUTPUT
45 |                       const unsigned int numBins,
46 |                       const unsigned int numElems)
47 | {
48 |   //TODO Launch the yourHisto kernel
49 | 
50 |   //if you want to use/launch more than one kernel,
51 |   //feel free
52 |   //wait for all launched GPU work to finish, then abort if CUDA recorded an error
53 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
54 | }
55 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | //Measures elapsed GPU time between Start() and Stop() with a pair of CUDA events.
 6 | struct GpuTimer
 7 | {
 8 |   cudaEvent_t start;  //recorded by Start()
 9 |   cudaEvent_t stop;   //recorded by Stop()
10 |   //both events are created with the timer and destroyed with it
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 | 
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 |   //marks the beginning of the timed region on stream 0
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 |   //marks the end of the timed region on stream 0
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 |   //blocks until the stop event has completed, then returns the time between the two events
33 |   float Elapsed()
34 |   {
35 |     float elapsed = 0.f; //defined result even if cudaEventElapsedTime fails and writes nothing
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 5/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | template<typename T>
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 |   if (err == cudaSuccess) return; //the call succeeded, nothing to report
17 |   //say where the failing call was made, what CUDA reports, and which expression failed
18 |   std::cerr << "CUDA error at: " << file << ":" << line << std::endl;
19 |   std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
20 |   exit(1);
21 | }
22 | 
23 | template<typename T>
24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
25 |   //walk both arrays in lockstep; the first mismatch is reported and ends the program
26 |   for (size_t pos = 0; pos != numElem; ++pos) {
27 |     if (ref[pos] == gpu[pos])
28 |       continue;
29 |     std::cerr << "Difference at pos " << pos << std::endl;
30 |     //unary + promotes char-sized values so they print as numbers;
31 |     //other arithmetic types print unchanged
32 |     std::cerr << "Reference: " << std::setprecision(17) << +ref[pos]
33 |               << "\nGPU      : " << +gpu[pos] << std::endl;
34 |     exit(1);
35 |   }
36 | }
37 | //Tolerance check: exits if any element differs by more than eps1, or if the fraction of differing elements exceeds eps2.
38 | template<typename T>
39 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
40 |   assert(eps1 >= 0 && eps2 >= 0);
41 |   //number of elements that differ, but by no more than eps1
42 |   unsigned numSmallDifferences = 0;
43 |   for (size_t i = 0; i < numElem; ++i) {
44 |     //subtract smaller from larger in case of unsigned types
45 |     T smaller = std::min(ref[i], gpu[i]);
46 |     T larger = std::max(ref[i], gpu[i]);
47 |     T diff = larger - smaller;
48 |     if (diff > eps1) {
49 |       //one element beyond the per-element tolerance fails the whole comparison
50 |       std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
51 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
52 |         "\nGPU      : " << +gpu[i] << std::endl;
53 |       exit(1);
54 |     }
55 |     if (diff > 0) {
56 |       numSmallDifferences++;
57 |     }
58 |   }
59 |   double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;
60 |   if (percentSmallDifferences > eps2) {
61 |     std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
62 |     std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
63 |     exit(1);
64 |   }
65 | }
66 | 
67 | //Uses the autodesk method of image comparison
68 | //Note that the tolerance here is in PIXELS not a percentage of input pixels
69 | template<typename T>
70 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
71 | {
72 |   //an element counts as "bad" when ref and gpu differ by more than variance
73 |   size_t numBadPixels = 0;
74 |   for (size_t i = 0; i < numElem; ++i) {
75 |     T smaller = std::min(ref[i], gpu[i]);
76 |     T larger = std::max(ref[i], gpu[i]);
77 |     T diff = larger - smaller;
78 |     if (diff > variance)
79 |       ++numBadPixels;
80 |   }
81 |   //exit with an error once more than tolerance elements are bad
82 |   if (numBadPixels > tolerance) {
83 |     std::cerr << "Too many bad pixels in the image. " << numBadPixels << "/" << tolerance << std::endl;
84 |     exit(1);
85 |   }
86 | }
87 | 
88 | #endif
89 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 6.zip


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ############################################################################
 2 | # <summary> CMakeLists.txt for OpenCV and CUDA. </summary>
 3 | # <date>    2012-02-07          </date>
 4 | # <author>  Quan Tran Minh. edit by Johannes Kast, Michael Sarahan </author>
 5 | # <email>   quantm@unist.ac.kr  kast.jo@googlemail.com msarahan@gmail.com</email>
 6 | ############################################################################
 7 | 
 8 | # gather the headers that sit next to this file so they are listed with the target
 9 | 
10 | file(GLOB hdr *.hpp *.h)
11 | # sources that make up the HW6 executable
12 | set(HW6_files student_func.cu HW6.cu main.cpp loadSaveImage.cpp reference_calc.cpp compare.cpp)
13 | # build HW6 from the sources above plus the globbed headers
14 | cuda_add_executable(HW6 ${HW6_files} ${hdr})
15 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/HW6.cu:
--------------------------------------------------------------------------------
 1 | #include "utils.h"
 2 | #include <cuda.h>
 3 | #include <cuda_runtime.h>
 4 | #include <string>
 5 | #include <iostream>
 6 | 
 7 | #include "loadSaveImage.h"
 8 | #include <stdio.h>
 9 | 
10 | 
11 | //return types are void since any internal error will be handled by quitting
12 | //no point in returning error codes...
13 | void preProcess( uchar4 **sourceImg,
14 |                  size_t &numRows,  size_t &numCols,
15 |                  uchar4 **destImg,
16 |                  uchar4 **blendedImg, const std::string& source_filename,
17 |                  const std::string& dest_filename){
18 |   //Loads both input images and allocates the output buffer. All three arrays
19 |   //handed back through the pointers are new[]-allocated and owned by the caller.
20 |   checkCudaErrors(cudaFree(0)); //touch the runtime first so a bad CUDA context fails right here
21 | 
22 |   size_t numRowsSource, numColsSource;
23 |   loadImageRGBA(source_filename, sourceImg, &numRowsSource, &numColsSource);
24 |   size_t numRowsDest, numColsDest;
25 |   loadImageRGBA(dest_filename, destImg, &numRowsDest, &numColsDest);
26 |   //the output buffer is sized from the source, so both images must match in size
27 |   assert(numRowsSource == numRowsDest);
28 |   assert(numColsSource == numColsDest);
29 | 
30 |   numCols = numColsSource;
31 |   numRows = numRowsSource;
32 | 
33 |   //one output pixel per input pixel
34 |   *blendedImg = new uchar4[numRowsSource * numColsSource];
35 | }
36 | 
37 | void postProcess(const uchar4* const blendedImg,
38 |                  const size_t numRowsDest, const size_t numColsDest,
39 |                  const std::string& output_file)
40 | {
41 |   //nothing to compute here: the image is simply written to output_file
42 |   saveImageRGBA(blendedImg, numRowsDest, numColsDest, output_file);
43 | }
44 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC=/usr/local/cuda-5.0/bin/nvcc
 2 | #NVCC=nvcc
 3 | 
 4 | ###################################
 5 | # These are the default install   #
 6 | # locations on most linux distros #
 7 | ###################################
 8 | 
 9 | OPENCV_LIBPATH=/usr/lib
10 | OPENCV_INCLUDEPATH=/usr/include
11 | 
12 | ###################################################
13 | # On Macs the default install locations are below #
14 | ###################################################
15 | 
16 | #OPENCV_LIBPATH=/usr/local/lib
17 | #OPENCV_INCLUDEPATH=/usr/local/include
18 | 
19 | OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
20 | 
21 | CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
22 | # CUDA_INCLUDEPATH=/usr/local/cuda/lib64/include
23 | # CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
24 | # CUDA_INCLUDEPATH=/Developer/NVIDIA/CUDA-5.0/include
25 | 
26 | ######################################################
27 | # On Macs the default install locations are below    #
28 | # ####################################################
29 | 
30 | #CUDA_INCLUDEPATH=/usr/local/cuda/include
31 | #CUDA_LIBPATH=/usr/local/cuda/lib
32 | CUDA_LIBPATH=/usr/local/cuda-5.0/lib64
33 | 
34 | #no warnings otherwise thrust explodes output
35 | 
36 | NVCC_OPTS=-O3 -arch=sm_20 -m64
37 | 
38 | GCC_OPTS=-O3 -m64
39 | 
40 | student: main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o Makefile
41 | 	$(NVCC) -o HW6 main.o student_func.o HW6.o loadSaveImage.o compare.o reference_calc.o -L $(OPENCV_LIBPATH) $(OPENCV_LIBS) $(NVCC_OPTS)
42 | 
43 | main.o: main.cpp timer.h utils.h
44 | 	g++ -c main.cpp $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
45 | 
46 | HW6.o: HW6.cu loadSaveImage.h utils.h
47 | 	$(NVCC) -c HW6.cu -I $(OPENCV_INCLUDEPATH) $(NVCC_OPTS)
48 | 
49 | loadSaveImage.o: loadSaveImage.cpp loadSaveImage.h
50 | 	g++ -c loadSaveImage.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
51 | 
52 | student_func.o: student_func.cu reference_calc.cpp utils.h
53 | 	$(NVCC) -c student_func.cu $(NVCC_OPTS)
54 | 
55 | compare.o: compare.cpp compare.h
56 | 	g++ -c compare.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
57 | 
58 | reference_calc.o: reference_calc.cpp reference_calc.h
59 | 	g++ -c reference_calc.cpp -I $(OPENCV_INCLUDEPATH) $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
60 | # remove object files, the HW6 binary, and every PNG except source.png / destination.png
61 | clean:
62 | 	rm -f *.o HW6
63 | 	find . -type f -name '*.png' | grep -v source.png | grep -v destination.png | xargs rm -f
64 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/blended.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 6/blended.gold


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/compare.cpp:
--------------------------------------------------------------------------------
 1 | #include <opencv2/opencv.hpp>
 2 | #include "utils.h"
 3 | //Loads both images, writes HW6_differenceImage.png, then compares the pixel data exactly or within the given tolerances; prints PASS when the check returns.
 4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
 5 | 				   double perPixelError, double globalError)
 6 | {
 7 |   cv::Mat reference = cv::imread(reference_filename, -1);
 8 |   cv::Mat test = cv::imread(test_filename, -1);
 9 |   cv::Mat diff;
10 |   cv::absdiff(reference, test, diff);
11 |   //absdiff keeps differences in both directions; a plain subtraction of unsigned data saturates negatives to 0
12 |   cv::Mat diffSingleChannel = diff.reshape(1, 0); //convert to 1 channel, same # rows
13 | 
14 |   double minVal, maxVal;
15 | 
16 |   cv::minMaxLoc(diffSingleChannel, &minVal, &maxVal, NULL, NULL); //NULL because we don't care about location
17 | 
18 |   //stretch the differences to the full 0-255 range; skipped when every difference
19 |   //is the same (e.g. identical images), where the scale factor would be 255/0
20 |   if (maxVal > minVal)
21 |     diffSingleChannel = (diffSingleChannel - minVal) * (255. / (maxVal - minVal));
22 |   diff = diffSingleChannel.reshape(reference.channels(), 0);
23 | 
24 |   cv::imwrite("HW6_differenceImage.png", diff);
25 |   //OK, now we can start comparing values...
26 |   unsigned char *referencePtr = reference.ptr<unsigned char>(0);
27 |   unsigned char *testPtr = test.ptr<unsigned char>(0);
28 | 
29 |   if (useEpsCheck) {
30 |     checkResultsEps(referencePtr, testPtr, reference.rows * reference.cols * reference.channels(), perPixelError, globalError);
31 |   }
32 |   else
33 |   {
34 |     checkResultsExact(referencePtr, testPtr, reference.rows * reference.cols * reference.channels());
35 |   }
36 | 
37 |   std::cout << "PASS" << std::endl;
38 |   return;
39 | }
40 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/compare.h:
--------------------------------------------------------------------------------
1 | #ifndef HW3_H__
2 | #define HW3_H__
3 | #include <string> //compareImages takes std::string parameters, so the header must not rely on its includer
4 | void compareImages(std::string reference_filename, std::string test_filename, bool useEpsCheck,
5 | 				   double perPixelError, double globalError);
6 | 
7 | #endif
8 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/destination.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 6/destination.png


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/loadSaveImage.cpp:
--------------------------------------------------------------------------------
  1 | #include <opencv2/core/core.hpp>
  2 | #include <opencv2/highgui/highgui.hpp>
  3 | #include <opencv2/opencv.hpp>
  4 | #include <vector>
  5 | #include "cuda_runtime.h"
  6 | 
  7 | //The caller becomes responsible for the returned pointer. This
  8 | //is done in the interest of keeping this code as simple as possible.
  9 | //In production code this is a bad idea - we should use RAII
 10 | //to ensure the memory is freed.  DO NOT COPY THIS AND USE IN PRODUCTION
 11 | //CODE!!!
 12 | void loadImageHDR(const std::string &filename,
 13 |                   float **imagePtr,
 14 |                   size_t *numRows, size_t *numCols)
 15 | {
 16 |   cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR | CV_LOAD_IMAGE_ANYDEPTH);
 17 |   if (image.empty()) {
 18 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 19 |     exit(1);
 20 |   }
 21 |   //every validation failure below prints a message and exits with status 1
 22 |   if (image.channels() != 3) {
 23 |     std::cerr << "Image must be color!" << std::endl;
 24 |     exit(1);
 25 |   }
 26 |   //the flat copy loop further down indexes from the pointer to row 0, so the pixel data must be continuous
 27 |   if (!image.isContinuous()) {
 28 |     std::cerr << "Image isn't continuous!" << std::endl;
 29 |     exit(1);
 30 |   }
 31 |   //allocated with new[]; ownership passes to the caller through imagePtr
 32 |   *imagePtr = new float[image.rows * image.cols * image.channels()];
 33 |   //NOTE(review): pixels are read as float without checking image.depth() - confirm inputs are always 32-bit float
 34 |   float *cvPtr = image.ptr<float>(0);
 35 |   for (size_t i = 0; i < image.rows * image.cols * image.channels(); ++i)
 36 |     (*imagePtr)[i] = cvPtr[i];
 37 |   //report the image dimensions back to the caller
 38 |   *numRows = image.rows;
 39 |   *numCols = image.cols;
 40 | }
 41 | 
 42 | void loadImageGrey(const std::string &filename,
 43 |                    unsigned char **imagePtr,
 44 |                    size_t *numRows, size_t *numCols)
 45 | {
 46 |   //Reads filename in greyscale mode and hands back a new[]-allocated copy of
 47 |   //its pixels plus its dimensions; any failure prints a message and exits.
 48 |   const cv::Mat grey = cv::imread(filename.c_str(), CV_LOAD_IMAGE_GRAYSCALE);
 49 |   if (grey.empty()) {
 50 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 51 |     exit(1);
 52 |   }
 53 |   if (grey.channels() != 1) {
 54 |     std::cerr << "Image must be greyscale!" << std::endl;
 55 |     exit(1);
 56 |   }
 57 |   if (!grey.isContinuous()) {
 58 |     std::cerr << "Image isn't continuous!" << std::endl;
 59 |     exit(1);
 60 |   }
 61 | 
 62 |   //copy the pixels out so they outlive the cv::Mat
 63 |   const size_t numPixels = grey.rows * grey.cols;
 64 |   unsigned char *pixels = new unsigned char[numPixels];
 65 |   const unsigned char *cvPtr = grey.ptr<unsigned char>(0);
 66 |   for (size_t i = 0; i != numPixels; ++i)
 67 |     pixels[i] = cvPtr[i];
 68 |   *imagePtr = pixels;
 69 |   *numRows = grey.rows;
 70 |   *numCols = grey.cols;
 71 | }
 72 | void loadImageRGBA(const std::string &filename,
 73 |                    uchar4 **imagePtr,
 74 |                    size_t *numRows, size_t *numCols)
 75 | {
 76 |   //Reads filename as a 3-channel colour image, converts it to RGBA and hands
 77 |   //back a new[]-allocated uchar4 array plus the dimensions; failures exit.
 78 |   const cv::Mat bgr = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
 79 |   if (bgr.empty()) {
 80 |     std::cerr << "Couldn't open file: " << filename << std::endl;
 81 |     exit(1);
 82 |   }
 83 |   if (bgr.channels() != 3) {
 84 |     std::cerr << "Image must be color!" << std::endl;
 85 |     exit(1);
 86 |   }
 87 |   if (!bgr.isContinuous()) {
 88 |     std::cerr << "Image isn't continuous!" << std::endl;
 89 |     exit(1);
 90 |   }
 91 | 
 92 |   cv::Mat rgba;
 93 |   cv::cvtColor(bgr, rgba, CV_BGR2RGBA);
 94 |   const size_t numPixels = bgr.rows * bgr.cols;
 95 |   uchar4 *pixels = new uchar4[numPixels];
 96 |   //the converted image stores four bytes per pixel, copied into x, y, z, w in order
 97 |   const unsigned char *src = rgba.ptr<unsigned char>(0);
 98 |   for (size_t i = 0; i != numPixels; ++i, src += 4) {
 99 |     pixels[i].x = src[0];
100 |     pixels[i].y = src[1];
101 |     pixels[i].z = src[2];
102 |     pixels[i].w = src[3];
103 |   }
104 |   *imagePtr = pixels;
105 |   *numRows = bgr.rows;
106 |   *numCols = bgr.cols;
107 | }
108 | 
109 | void saveImageRGBA(const uchar4* const image,
110 |                    const size_t numRows, const size_t numCols,
111 |                    const std::string &output_file)
112 | {
113 |   //Writes a numRows x numCols RGBA pixel array to output_file after converting it to BGR.
114 |   //The first cv::Mat is constructed directly on the caller's buffer.
115 |   int dims[2] = { (int)numRows, (int)numCols };
116 |   cv::Mat rgbaView(2, dims, CV_8UC4, (void *)image);
117 | 
118 |   cv::Mat bgr;
119 |   cv::cvtColor(rgbaView, bgr, CV_RGBA2BGR);
120 |   cv::imwrite(output_file.c_str(), bgr);
121 | }
122 | 
123 | //output an exr file
124 | //assumed to already be BGR
125 | void saveImageHDR(const float* const image,
126 |                   const size_t numRows, const size_t numCols,
127 |                   const std::string &output_file)
128 | {
129 |   int sizes[2];
130 |   sizes[0] = numRows;
131 |   sizes[1] = numCols;
132 |   //imageHDR is constructed directly on the caller's buffer
133 |   cv::Mat imageHDR(2, sizes, CV_32FC3, (void *)image);
134 |   //scale into a separate Mat: assigning the product back to imageHDR would overwrite the caller's const data
135 |   cv::Mat scaledHDR = imageHDR * 255;
136 | 
137 |   cv::imwrite(output_file.c_str(), scaledHDR);
138 | }
139 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/loadSaveImage.h:
--------------------------------------------------------------------------------
 1 | #ifndef LOADSAVEIMAGE_H__
 2 | #define LOADSAVEIMAGE_H__
 3 | 
 4 | #include <string>
 5 | #include <cuda_runtime.h> //for uchar4
 6 | 
 7 | void loadImageHDR(const std::string &filename,
 8 |                   float **imagePtr,
 9 |                   size_t *numRows, size_t *numCols);
10 | 
11 | void loadImageRGBA(const std::string &filename,
12 |                    uchar4 **imagePtr,
13 |                    size_t *numRows, size_t *numCols);
14 | 
15 | void loadImageGrey(const std::string &filename,
16 |                    unsigned char **imagePtr,
17 |                    size_t *numRows, size_t *numCols);
18 | 
19 | void saveImageRGBA(const uchar4* const image,
20 |                    const size_t numRows, const size_t numCols,
21 |                    const std::string &output_file);
22 | 
23 | void saveImageHDR(const float* const image,
24 |                   const size_t numRows, const size_t numCols,
25 |                   const std::string &output_file);
26 | 
27 | #endif
28 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/main.cpp:
--------------------------------------------------------------------------------
  1 | //Udacity HW6 Driver
  2 | 
  3 | #include <iostream>
  4 | #include "timer.h"
  5 | #include "utils.h"
  6 | #include <string>
  7 | #include <stdio.h>
  8 | 
  9 | #include <opencv2/core/core.hpp>
 10 | #include <opencv2/highgui/highgui.hpp>
 11 | #include <opencv2/opencv.hpp>
 12 | 
 13 | #include "reference_calc.h"
 14 | #include "compare.h"
 15 | 
 16 | void preProcess( uchar4 **sourceImg, size_t &numRowsSource,  size_t &numColsSource,
 17 |                  uchar4 **destImg,
 18 |                  uchar4 **blendedImg, const std::string& source_filename,
 19 |                  const std::string& dest_filename);
 20 | 
 21 | void postProcess(const uchar4* const blendedImg,
 22 |                  const size_t numRowsDest, const size_t numColsDest,
 23 |                  const std::string& output_file);
 24 | 
 25 | void your_blend(const uchar4* const sourceImg,
 26 |                 const size_t numRowsSource, const size_t numColsSource,
 27 |                 const uchar4* const destImg,
 28 |                 uchar4* const blendedImg);
 29 | 
 30 | int main(int argc, char **argv) {
 31 |   uchar4 *h_sourceImg, *h_destImg, *h_blendedImg;
 32 |   size_t numRowsSource, numColsSource;
 33 | 
 34 |   std::string input_source_file;
 35 |   std::string input_dest_file;
 36 |   std::string output_file;
 37 | 
 38 |   std::string reference_file;
 39 |   double perPixelError = 0.0;
 40 |   double globalError   = 0.0;
 41 |   bool useEpsCheck = false;
 42 | 
 43 |   switch (argc)
 44 |   {
 45 |   	case 3:
 46 |   	  input_source_file  = std::string(argv[1]);
 47 |   	  input_dest_file = std::string(argv[2]);
 48 |       output_file = "HW6_output.png";
 49 |   	  reference_file = "HW6_reference.png";
 50 |   	  break;
 51 |   	case 4:
 52 |   	  input_source_file  = std::string(argv[1]);
 53 |   	  input_dest_file = std::string(argv[2]);
 54 |       output_file = std::string(argv[3]);
 55 |   	  reference_file = "HW6_reference.png";
 56 |   	  break;
 57 |   	case 5:
 58 |   	  input_source_file  = std::string(argv[1]);
 59 |   	  input_dest_file = std::string(argv[2]);
 60 |   	  output_file = std::string(argv[3]);
 61 |   	  reference_file = std::string(argv[4]);
 62 |   	  break;
 63 |   	case 7:
 64 |   	  useEpsCheck=true;
 65 |   	  input_source_file  = std::string(argv[1]);
 66 |   	  input_dest_file = std::string(argv[2]);
 67 |   	  output_file = std::string(argv[3]);
 68 |   	  reference_file = std::string(argv[4]);
 69 |   	  perPixelError = atof(argv[5]);
 70 |       globalError   = atof(argv[6]);
 71 |   	  break;
 72 |   	default:
 73 |         std::cerr << "Usage: ./HW6 input_source_file input_dest_filename [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
 74 |         exit(1);
 75 |     }
 76 | 
 77 |   //load the image and give us our input and output pointers
 78 |   preProcess(&h_sourceImg, numRowsSource, numColsSource,
 79 |              &h_destImg,
 80 |              &h_blendedImg, input_source_file, input_dest_file);
 81 | 
 82 |   GpuTimer timer;
 83 |   timer.Start();
 84 | 
 85 |   //call the students' code
 86 |   your_blend(h_sourceImg, numRowsSource, numColsSource,
 87 |              h_destImg,
 88 |              h_blendedImg);
 89 | 
 90 |   timer.Stop();
 91 |   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 92 |   int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
 93 |   printf("\n");
 94 |   if (err < 0) {
 95 |     //Couldn't print! Probably the student closed stdout - bad news
 96 |     std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
 97 |     exit(1);
 98 |   }
 99 | 
100 |   //save the student's blended image to output_file
101 |   postProcess(h_blendedImg, numRowsSource, numColsSource, output_file);
102 | 
103 |   // calculate the reference image
104 |   uchar4* h_reference = new uchar4[numRowsSource*numColsSource];
105 |   reference_calc(h_sourceImg, numRowsSource, numColsSource,
106 |                    h_destImg, h_reference);
107 | 
108 |   // save the reference image
109 |   postProcess(h_reference, numRowsSource, numColsSource, reference_file);
110 | 
111 |   compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);
112 | 
113 |   delete[] h_reference;
114 |   delete[] h_destImg;
115 |   delete[] h_sourceImg;
116 |   delete[] h_blendedImg;
117 |   return 0;
118 | }
119 | 
120 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/reference_calc.cpp:
--------------------------------------------------------------------------------
  1 | //Udacity HW 6
  2 | //Poisson Blending Reference Calculation
  3 | 
  4 | #include "utils.h"
  5 | #include <thrust/host_vector.h>
  6 | 
  7 | //Performs one iteration of the solver: writes f_next for every listed interior pixel from the previous values in f
  8 | void computeIteration(const unsigned char* const dstImg,
  9 |                       const unsigned char* const strictInteriorPixels,
 10 |                       const unsigned char* const borderPixels,
 11 |                       const std::vector<uint2>& interiorPixelList,
 12 |                       const size_t numColsSource,
 13 |                       const float* const f,
 14 |                       const float* const g,
 15 |                       float* const f_next)
 16 | {
 17 |   //borderPixels is accepted for interface compatibility but not read here;
 18 |   //an empty interiorPixelList simply makes this call a no-op
 19 |   for (size_t i = 0; i < interiorPixelList.size(); ++i) {
 20 |     float blendedSum = 0.f;
 21 |     float borderSum  = 0.f;
 22 | 
 23 |     uint2 coord = interiorPixelList[i];
 24 | 
 25 |     unsigned int offset = coord.x * numColsSource + coord.y;
 26 | 
 27 |     //process all 4 neighbor pixels
 28 |     //for each pixel if it is an interior pixel
 29 |     //then we add the previous f, otherwise if it is a
 30 |     //border pixel then we add the value of the destination
 31 |     //image at the border.  These border values are our boundary
 32 |     //conditions.
 33 |     if (strictInteriorPixels[offset - 1]) {
 34 |       blendedSum += f[offset - 1];
 35 |     }
 36 |     else {
 37 |       borderSum += dstImg[offset - 1];
 38 |     }
 39 | 
 40 |     if (strictInteriorPixels[offset + 1]) {
 41 |       blendedSum += f[offset + 1];
 42 |     }
 43 |     else {
 44 |       borderSum += dstImg[offset + 1];
 45 |     }
 46 | 
 47 |     if (strictInteriorPixels[offset - numColsSource]) {
 48 |       blendedSum += f[offset - numColsSource];
 49 |     }
 50 |     else {
 51 |       borderSum += dstImg[offset - numColsSource];
 52 |     }
 53 | 
 54 |     if (strictInteriorPixels[offset + numColsSource]) {
 55 |       blendedSum += f[offset + numColsSource];
 56 |     }
 57 |     else {
 58 |       borderSum += dstImg[offset + numColsSource];
 59 |     }
 60 | 
 61 |     float f_next_val = (blendedSum + borderSum + g[offset]) / 4.f;
 62 | 
 63 |     f_next[offset] = std::min(255.f, std::max(0.f, f_next_val)); //clip to [0, 255]
 64 |   }
 65 | 
 66 | }
 67 | 
 68 | //For every listed interior pixel, g is four times the source value minus its four
 69 | //direct neighbours. It uses only the source channel, so it is computed once up front.
 70 | void computeG(const unsigned char* const channel,
 71 |               float* const g,
 72 |               const size_t numColsSource,
 73 |               const std::vector<uint2>& interiorPixelList)
 74 | {
 75 |   typedef std::vector<uint2>::const_iterator PixelIt;
 76 |   for (PixelIt px = interiorPixelList.begin(); px != interiorPixelList.end(); ++px) {
 77 |     const unsigned int center = px->x * numColsSource + px->y; //x is the row, y the column
 78 |     const float horizontal = (float)channel[center - 1] + (float)channel[center + 1];
 79 |     const float vertical = (float)channel[center + numColsSource] + (float)channel[center - numColsSource];
 80 | 
 81 |     float sum = 4.f * channel[center];
 82 |     sum -= horizontal;
 83 |     sum -= vertical;
 84 |     g[center] = sum;
 85 |   }
 86 | }
 87 | 
 88 | void reference_calc(const uchar4* const h_sourceImg,
 89 |                     const size_t numRowsSource, const size_t numColsSource,
 90 |                     const uchar4* const h_destImg,
 91 |                     uchar4* const h_blendedImg){
 92 | 
 93 |   //we need to create a list of border pixels and interior pixels
 94 |   //this is a conceptually simple implementation, not a particularly efficient one...
 95 | 
 96 |   //first create mask
 97 |   size_t srcSize = numRowsSource * numColsSource;
 98 |   unsigned char* mask = new unsigned char[srcSize];
 99 | 
100 |   for (int i = 0; i < srcSize; ++i) {
101 |     mask[i] = (h_sourceImg[i].x + h_sourceImg[i].y + h_sourceImg[i].z < 3 * 255) ? 1 : 0;
102 |   }
103 | 
104 |   //next compute strictly interior pixels and border pixels
105 |   unsigned char *borderPixels = new unsigned char[srcSize];
106 |   unsigned char *strictInteriorPixels = new unsigned char[srcSize];
107 | 
108 |   std::vector<uint2> interiorPixelList;
109 | 
110 |   //the source region in the homework isn't near an image boundary, so we can
111 |   //simplify the conditionals a little...
112 |   for (size_t r = 1; r < numRowsSource - 1; ++r) {
113 |     for (size_t c = 1; c < numColsSource - 1; ++c) {
114 |       if (mask[r * numColsSource + c]) {
115 |         if (mask[(r -1) * numColsSource + c] && mask[(r + 1) * numColsSource + c] &&
116 |             mask[r * numColsSource + c - 1] && mask[r * numColsSource + c + 1]) {
117 |           strictInteriorPixels[r * numColsSource + c] = 1;
118 |           borderPixels[r * numColsSource + c] = 0;
119 |           interiorPixelList.push_back(make_uint2(r, c));
120 |         }
121 |         else {
122 |           strictInteriorPixels[r * numColsSource + c] = 0;
123 |           borderPixels[r * numColsSource + c] = 1;
124 |         }
125 |       }
126 |       else {
127 |           strictInteriorPixels[r * numColsSource + c] = 0;
128 |           borderPixels[r * numColsSource + c] = 0;
129 | 
130 |       }
131 |     }
132 |   }
133 | 
134 |   //split the source and destination images into their respective
135 |   //channels
136 |   unsigned char* red_src   = new unsigned char[srcSize];
137 |   unsigned char* blue_src  = new unsigned char[srcSize];
138 |   unsigned char* green_src = new unsigned char[srcSize];
139 | 
140 |   for (int i = 0; i < srcSize; ++i) {
141 |     red_src[i]   = h_sourceImg[i].x;
142 |     blue_src[i]  = h_sourceImg[i].y;
143 |     green_src[i] = h_sourceImg[i].z;
144 |   }
145 | 
146 |   unsigned char* red_dst   = new unsigned char[srcSize];
147 |   unsigned char* blue_dst  = new unsigned char[srcSize];
148 |   unsigned char* green_dst = new unsigned char[srcSize];
149 | 
150 |   for (int i = 0; i < srcSize; ++i) {
151 |     red_dst[i]   = h_destImg[i].x;
152 |     blue_dst[i]  = h_destImg[i].y;
153 |     green_dst[i] = h_destImg[i].z;
154 |   }
155 | 
156 |   //next we'll precompute the g term - it never changes, no need to recompute every iteration
157 |   float *g_red   = new float[srcSize];
158 |   float *g_blue  = new float[srcSize];
159 |   float *g_green = new float[srcSize];
160 | 
161 |   memset(g_red,   0, srcSize * sizeof(float));
162 |   memset(g_blue,  0, srcSize * sizeof(float));
163 |   memset(g_green, 0, srcSize * sizeof(float));
164 | 
165 |   computeG(red_src,   g_red,   numColsSource, interiorPixelList);
166 |   computeG(blue_src,  g_blue,  numColsSource, interiorPixelList);
167 |   computeG(green_src, g_green, numColsSource, interiorPixelList);
168 | 
169 |   //for each color channel we'll need two buffers and we'll ping-pong between them
170 |   float *blendedValsRed_1 = new float[srcSize];
171 |   float *blendedValsRed_2 = new float[srcSize];
172 | 
173 |   float *blendedValsBlue_1 = new float[srcSize];
174 |   float *blendedValsBlue_2 = new float[srcSize];
175 | 
176 |   float *blendedValsGreen_1 = new float[srcSize];
177 |   float *blendedValsGreen_2 = new float[srcSize];
178 | 
179 |   //IC is the source image, copy over
180 |   for (size_t i = 0; i < srcSize; ++i) {
181 |     blendedValsRed_1[i] = red_src[i];
182 |     blendedValsRed_2[i] = red_src[i];
183 |     blendedValsBlue_1[i] = blue_src[i];
184 |     blendedValsBlue_2[i] = blue_src[i];
185 |     blendedValsGreen_1[i] = green_src[i];
186 |     blendedValsGreen_2[i] = green_src[i];
187 |   }
188 | 
189 |   //Perform the solve on each color channel
190 |   const size_t numIterations = 800;
191 |   for (size_t i = 0; i < numIterations; ++i) {
192 |     computeIteration(red_dst, strictInteriorPixels, borderPixels,
193 |                      interiorPixelList, numColsSource, blendedValsRed_1, g_red,
194 |                      blendedValsRed_2);
195 | 
196 |     std::swap(blendedValsRed_1, blendedValsRed_2);
197 |   }
198 | 
199 |   for (size_t i = 0; i < numIterations; ++i) {
200 |     computeIteration(blue_dst, strictInteriorPixels, borderPixels,
201 |                      interiorPixelList, numColsSource, blendedValsBlue_1, g_blue,
202 |                      blendedValsBlue_2);
203 | 
204 |     std::swap(blendedValsBlue_1, blendedValsBlue_2);
205 |   }
206 | 
207 |   for (size_t i = 0; i < numIterations; ++i) {
208 |     computeIteration(green_dst, strictInteriorPixels, borderPixels,
209 |                      interiorPixelList, numColsSource, blendedValsGreen_1, g_green,
210 |                      blendedValsGreen_2);
211 | 
212 |     std::swap(blendedValsGreen_1, blendedValsGreen_2);
213 |   }
214 |   std::swap(blendedValsRed_1,   blendedValsRed_2);   //put output into _2
215 |   std::swap(blendedValsBlue_1,  blendedValsBlue_2);  //put output into _2
216 |   std::swap(blendedValsGreen_1, blendedValsGreen_2); //put output into _2
217 | 
218 |   //copy the destination image to the output
219 |   memcpy(h_blendedImg, h_destImg, sizeof(uchar4) * srcSize);
220 | 
221 |   //copy computed values for the interior into the output
222 |   for (size_t i = 0; i < interiorPixelList.size(); ++i) {
223 |     uint2 coord = interiorPixelList[i];
224 | 
225 |     unsigned int offset = coord.x * numColsSource + coord.y;
226 | 
227 |     h_blendedImg[offset].x = blendedValsRed_2[offset];
228 |     h_blendedImg[offset].y = blendedValsBlue_2[offset];
229 |     h_blendedImg[offset].z = blendedValsGreen_2[offset];
230 |   }
231 | 
232 |   //wow, we allocated a lot of memory!
233 |   delete[] mask;
234 |   delete[] blendedValsRed_1;
235 |   delete[] blendedValsRed_2;
236 |   delete[] blendedValsBlue_1;
237 |   delete[] blendedValsBlue_2;
238 |   delete[] blendedValsGreen_1;
239 |   delete[] blendedValsGreen_2;
240 |   delete[] g_red;
241 |   delete[] g_blue;
242 |   delete[] g_green;
243 |   delete[] red_src;
244 |   delete[] red_dst;
245 |   delete[] blue_src;
246 |   delete[] blue_dst;
247 |   delete[] green_src;
248 |   delete[] green_dst;
249 |   delete[] borderPixels;
250 |   delete[] strictInteriorPixels;
251 | }
252 | 
253 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/reference_calc.h:
--------------------------------------------------------------------------------
 1 | #ifndef REFERENCE_H__
 2 | #define REFERENCE_H__
 3 | // Reference blend; the parameter list mirrors your_blend() in student_func.cu. Presumably the CPU baseline the GPU output is compared against -- confirm in the caller.
 4 | void reference_calc(const uchar4* const h_sourceImg,  //IN
 5 |                     const size_t numRowsSource, const size_t numColsSource,  //IN
 6 |                     const uchar4* const h_destImg,  //IN
 7 |                       uchar4* const h_blendedImg);  //OUT
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/source.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Problem Sets/Problem Set 6/source.png


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/student_func.cu:
--------------------------------------------------------------------------------
  1 | //Udacity HW 6
  2 | //Poisson Blending
  3 | 
  4 | /* Background
  5 |    ==========
  6 | 
  7 |    The goal for this assignment is to take one image (the source) and
  8 |    paste it into another image (the destination) attempting to match the
  9 |    two images so that the pasting is non-obvious. This is
 10 |    known as a "seamless clone".
 11 | 
 12 |    The basic ideas are as follows:
 13 | 
 14 |    1) Figure out the interior and border of the source image
 15 |    2) Use the values of the border pixels in the destination image 
 16 |       as boundary conditions for solving a Poisson equation that tells
 17 |       us how to blend the images.
 18 |    
 19 |       No pixels from the destination except pixels on the border
 20 |       are used to compute the match.
 21 | 
 22 |    Solving the Poisson Equation
 23 |    ============================
 24 | 
 25 |    There are multiple ways to solve this equation - we choose an iterative
 26 |    method - specifically the Jacobi method. Iterative methods start with
 27 |    a guess of the solution and then iterate to try and improve the guess
 28 |    until it stops changing.  If the problem was well-suited for the method
 29 |    then it will stop and where it stops will be the solution.
 30 | 
 31 |    The Jacobi method is the simplest iterative method and converges slowly - 
 32 |    that is we need a lot of iterations to get to the answer, but it is the
 33 |    easiest method to write.
 34 | 
 35 |    Jacobi Iterations
 36 |    =================
 37 | 
 38 |    Our initial guess is going to be the source image itself.  This is a pretty
 39 |    good guess for what the blended image will look like and it means that
 40 |    we won't have to do as many iterations compared to if we had started far
 41 |    from the final solution.
 42 | 
 43 |    ImageGuess_prev (Floating point)
 44 |    ImageGuess_next (Floating point)
 45 | 
 46 |    DestinationImg
 47 |    SourceImg
 48 | 
 49 |    Follow these steps to implement one iteration:
 50 | 
 51 |    1) For every pixel p in the interior, compute two sums over the four neighboring pixels:
 52 |       Sum1: If the neighbor is in the interior then += ImageGuess_prev[neighbor]
 53 |              else if the neighbor is on the border then += DestinationImg[neighbor]
 54 | 
 55 |       Sum2: += SourceImg[p] - SourceImg[neighbor]   (for all four neighbors)
 56 | 
 57 |    2) Calculate the new pixel value:
 58 |       float newVal= (Sum1 + Sum2) / 4.f  <------ Notice that the result is FLOATING POINT
 59 |       ImageGuess_next[p] = min(255, max(0, newVal)); //clamp to [0, 255]
 60 | 
 61 | 
 62 |     In this assignment we will do 800 iterations.
 63 |    */
 64 | 
 65 | 
 66 | 
 67 | #include "utils.h"
 68 | #include <thrust/host_vector.h>
 69 | 
 70 | void your_blend(const uchar4* const h_sourceImg,  //IN
 71 |                 const size_t numRowsSource, const size_t numColsSource, //IN
 72 |                 const uchar4* const h_destImg, //IN
 73 |                 uchar4* const h_blendedImg) //OUT
 74 | {
 75 |   //Intentionally empty stub: the comment below lists the steps to implement here.
 76 |   /* To recap, here are the steps you need to implement
 77 |   
 78 |      1) Compute a mask of the pixels from the source image to be copied
 79 |         The pixels that shouldn't be copied are completely white, they
 80 |         have R=255, G=255, B=255.  Any other pixels SHOULD be copied.
 81 | 
 82 |      2) Compute the interior and border regions of the mask.  An interior
 83 |         pixel has all 4 neighbors also inside the mask.  A border pixel is
 84 |         in the mask itself, but has at least one neighbor that isn't.
 85 | 
 86 |      3) Separate out the incoming image into three separate channels
 87 | 
 88 |      4) Create two float(!) buffers for each color channel that will
 89 |         act as our guesses.  Initialize them to the respective color
 90 |         channel of the source image since that will act as our initial guess.
 91 | 
 92 |      5) For each color channel perform the Jacobi iteration described 
 93 |         above 800 times.
 94 | 
 95 |      6) Create the output image by replacing all the interior pixels
 96 |         in the destination image with the result of the Jacobi iterations.
 97 |         Just cast the floating point values to unsigned chars since we have
 98 |         already made sure to clamp them to the correct range.
 99 | 
100 |       Since this is the final assignment we provide little boilerplate code to
101 |       help you.  Notice that all the input/output pointers are HOST pointers.
102 | 
103 |       You will have to allocate all of your own GPU memory and perform your own
104 |       memcopies to get data in and out of the GPU memory.
105 | 
106 |       Remember to wrap all of your calls with checkCudaErrors() to catch
107 |       anything that might go wrong.  After each kernel call do:
108 | 
109 |       cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
110 | 
111 |       to catch any errors that happened while executing the kernel.
112 |   */
113 | }
114 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_TIMER_H__
 2 | #define GPU_TIMER_H__
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | struct GpuTimer  // Measures elapsed GPU time between two CUDA events.
 7 | {
 8 |   cudaEvent_t start;  // recorded by Start()
 9 |   cudaEvent_t stop;   // recorded by Stop()
10 |   // Creates both events; the CUDA return codes are not checked here.
11 |   GpuTimer()
12 |   {
13 |     cudaEventCreate(&start);
14 |     cudaEventCreate(&stop);
15 |   }
16 |   // Destroys both events.
17 |   ~GpuTimer()
18 |   {
19 |     cudaEventDestroy(start);
20 |     cudaEventDestroy(stop);
21 |   }
22 |   // Records the start event (second argument 0 selects stream 0).
23 |   void Start()
24 |   {
25 |     cudaEventRecord(start, 0);
26 |   }
27 |   // Records the stop event (second argument 0 selects stream 0).
28 |   void Stop()
29 |   {
30 |     cudaEventRecord(stop, 0);
31 |   }
32 |   // Waits for the stop event, then returns the start-to-stop time as reported by cudaEventElapsedTime (milliseconds per the CUDA docs); returns 0 if that query does not write a result.
33 |   float Elapsed()
34 |   {
35 |     float elapsed = 0.0f;  // initialized so a failed cudaEventElapsedTime() cannot make us return an indeterminate value
36 |     cudaEventSynchronize(stop);
37 |     cudaEventElapsedTime(&elapsed, start, stop);
38 |     return elapsed;
39 |   }
40 | };
41 | 
42 | #endif  /* GPU_TIMER_H__ */
43 | 


--------------------------------------------------------------------------------
/Problem Sets/Problem Set 6/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H__
 2 | #define UTILS_H__
 3 | 
 4 | #include <iostream>
 5 | #include <iomanip>
 6 | #include <cuda.h>
 7 | #include <cuda_runtime.h>
 8 | #include <cuda_runtime_api.h>
 9 | #include <cassert>
10 | #include <cmath>
11 | 
12 | #define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
13 | 
14 | template<typename T>
15 | void check(T err, const char* const func, const char* const file, const int line) {
16 |   //Reports a failed CUDA call (file:line, error string, call text) on stderr and exits with status 1.
17 |   if (err == cudaSuccess) return;  //success: nothing to report
18 |   std::cerr << "CUDA error at: " << file << ":" << line << std::endl
19 |             << cudaGetErrorString(err) << " " << func << std::endl;
20 |   exit(1);
21 | }
22 | 
23 | template<typename T>
24 | void checkResultsExact(const T* const ref, const T* const gpu, size_t numElem) {
25 |   //Requires ref and gpu to agree element-for-element over numElem entries; on the
26 |   //first mismatch prints its position and both values, then exits with status 1.
27 |   size_t pos = 0;
28 |   //advance to the first differing element; operator!= is the only comparison applied to T
29 |   while (pos < numElem && !(ref[pos] != gpu[pos])) ++pos;
30 |   if (pos == numElem) return;  //every element matched
31 |   std::cerr << "Difference at pos " << pos << std::endl;
32 |   //unary + promotes char-sized values so they print as numbers, leaving other types alone
33 |   std::cerr << "Reference: " << std::setprecision(17) << +ref[pos] <<
34 |              "\nGPU      : " << +gpu[pos] << std::endl;
35 |   exit(1);
36 | }
37 | 
38 | template<typename T>
39 | void checkResultsEps(const T* const ref, const T* const gpu, size_t numElem, double eps1, double eps2) {
40 |   //Tolerant comparison of ref against gpu. Exits with status 1 if any element differs by
41 |   //more than eps1, or if the fraction of elements with a non-zero difference exceeds eps2.
42 |   assert(eps1 >= 0 && eps2 >= 0);
43 |   unsigned numSmallDifferences = 0;
44 |   for (size_t i = 0; i < numElem; ++i) {
45 |     //subtract smaller from larger in case of unsigned types
46 |     T smaller = std::min(ref[i], gpu[i]);
47 |     T larger = std::max(ref[i], gpu[i]);
48 |     T diff = larger - smaller;
49 |     if (diff > 0 && diff <= eps1) {
50 |       numSmallDifferences++;
51 |     }
52 |     else if (diff > eps1) {
53 |       std::cerr << "Difference at pos " << +i << " exceeds tolerance of " << eps1 << std::endl;
54 |       std::cerr << "Reference: " << std::setprecision(17) << +ref[i] <<
55 |         "\nGPU      : " << +gpu[i] << std::endl;
56 |       exit(1);
57 |     }
58 |   }
59 |   double percentSmallDifferences = (double)numSmallDifferences / (double)numElem;  //a fraction; scaled by 100 only when printed
60 |   if (percentSmallDifferences > eps2) {
61 |     std::cerr << "Total percentage of non-zero pixel difference between the two images exceeds " << 100.0 * eps2 << "%" << std::endl;
62 |     std::cerr << "Percentage of non-zero pixel differences: " << 100.0 * percentSmallDifferences << "%" << std::endl;
63 |     exit(1);
64 |   }
65 | }
66 | 
67 | //Image comparison using the autodesk method: counts the elements whose difference
68 | //(larger minus smaller) exceeds variance and exits with status 1 when that count is
69 | //above tolerance. Note that tolerance is a number of PIXELS, not a percentage of the input.
70 | template<typename T>
71 | void checkResultsAutodesk(const T* const ref, const T* const gpu, size_t numElem, double variance, size_t tolerance)
72 | {
73 |   size_t badCount = 0;
74 |   for (size_t idx = 0; idx < numElem; ++idx) {
75 |     const T lo = std::min(ref[idx], gpu[idx]);
76 |     const T hi = std::max(ref[idx], gpu[idx]);
77 |     //larger minus smaller keeps the subtraction non-negative
78 |     const T delta = hi - lo;
79 |     badCount += (delta > variance) ? 1 : 0;
80 |   }
81 | 
82 |   if (badCount <= tolerance) return;  //within budget
83 |   //note: the message prints the count directly after the period, with no separating space
84 |   std::cerr << "Too many bad pixels in the image." << badCount << "/" << tolerance << std::endl;
85 |   exit(1);
86 | }
87 | 
88 | #endif
89 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Archival Note
 2 | This repository is deprecated; therefore, we are going to archive it.
 3 | However, learners will be able to fork it to their personal GitHub account but cannot submit PRs to this repository. If you have any issues or suggestions to make, feel free to:
 4 | - Utilize the https://knowledge.udacity.com/ forum to seek help on content-specific issues.
 5 | - Submit a support ticket along with the link to your forked repository if you are blocked for other reasons. Here are the links for [retail consumers](https://udacity.zendesk.com/hc/en-us/requests/new) and [enterprise learners](https://udacityenterprise.zendesk.com/hc/en-us/requests/new?ticket_form_id=360000279131). 
 6 | 
 7 | cs344
 8 | =====
 9 | 
10 | Introduction to Parallel Programming class code
11 | 
12 | # Building on OS X
13 | 
14 | These instructions are for OS X 10.9 "Mavericks".
15 | 
16 | * Step 1. Build and install OpenCV. The best way to do this is with
17 | Homebrew. However, you must slightly alter the Homebrew OpenCV
18 | installation; you must build it with libstdc++ (instead of the default
19 | libc++) so that it will properly link against the nVidia CUDA dev kit. 
20 | [This entry in the Udacity discussion forums](http://forums.udacity.com/questions/100132476/cuda-55-opencv-247-os-x-maverick-it-doesnt-work) describes exactly how to build a compatible OpenCV.
21 | 
22 | * Step 2. You can now create 10.9-compatible makefiles, which will allow you to
23 | build and run your homework on your own machine:
24 | ```
25 | mkdir build
26 | cd build
27 | cmake ..
28 | make
29 | ```
30 | 
31 | 


--------------------------------------------------------------------------------
/Student Contributions/Notes/Unit3 Notes/NotesUnit3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Student Contributions/Notes/Unit3 Notes/NotesUnit3.pdf


--------------------------------------------------------------------------------
/Student Contributions/Notes/Unit3 Notes/NotesUnit3Small.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Student Contributions/Notes/Unit3 Notes/NotesUnit3Small.pdf


--------------------------------------------------------------------------------
/Student Contributions/Notes/Unit4 Notes/NotesUnit4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Student Contributions/Notes/Unit4 Notes/NotesUnit4.pdf


--------------------------------------------------------------------------------
/Student Contributions/Notes/Unit4 Notes/NotesUnit4_Small.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/udacity/cs344/853c0827fe8d645d10d22418de94cd57444fce21/Student Contributions/Notes/Unit4 Notes/NotesUnit4_Small.pdf


--------------------------------------------------------------------------------