├── README.md
├── lab0
    ├── lab0.pdf
    ├── lab0.tex
    └── main.cu
├── lab1
    ├── lab1.cu
    ├── lab1.h
    ├── lab1.pdf
    ├── lab1.tex
    └── main.cu
├── lab2
    ├── counting.cu
    ├── counting.h
    ├── lab2.pdf
    ├── lab2.tex
    ├── main.cu
    ├── part-i-hint.tex
    └── part-ii-hint.pdf
├── lab3
    ├── lab3.cu
    ├── lab3.h
    ├── lab3.pdf
    ├── lab3.tex
    ├── lab3_test.zip
    ├── main.cu
    ├── pgm.cpp
    └── pgm.h
└── utils
    ├── SyncedMemory.h
    └── Timer.h


/README.md:
--------------------------------------------------------------------------------
 1 | # 2017, 2018
 2 | ## Lab 0
 3 | Please make sure your version is ahead of bf721d8.
 4 | 
 5 | ## Lab 1
 6 | Please make sure your version is ahead of b44998b.
 7 | 
 8 | ## Lab 2
 9 | Please make sure your version is ahead of 7d72796.
10 | 
11 | ## Lab 3
12 | Please make sure your version is ahead of 678fdca.
13 | 
14 | # 2016
15 | In principle we do not forbid you from peeking at the homework of 2016
16 | , but keep in mind that we may revise the homework as the course progresses.
17 | 
18 | ## Lab 1
19 | Please make sure your version is ahead of bf64ada.
20 | 
21 | ## Lab 2
22 | Please make sure your version is ahead of 6af5f1c.
23 | 
24 | ## Lab 3
25 | Please make sure your version is ahead of 827f7e7.
26 | 
27 | We also provide a testcase which is included in lab3/lab3\_test.zip.
28 | 
29 | # Git
30 | If you are not familiar with Git, [this](http://backlogtool.com/git-guide/tw/intro/intro1_1.html) is the tutorial.
31 | 
32 | # General submission guidelines
33 | 
34 | ## Repo URL change is forbidden (40% penalty)
35 | "git push --force" is useful since TA only clones your code right after the deadline.
36 | 
37 | ## Always run cuda-memcheck (50% penalty)
38 | `cuda-memcheck` is handy and useful, and you should always check your program with it.
39 | 
40 | ## Be sure to put the file at correct path (40% penalty)
41 | If we ask you to submit /lab1/hello.cu, do not submit /Lab1/hello.cu, /hello.cu, or /homework/lab1/hello.cu.
42 | 
43 | ## Others
44 | * Git conflict (20% penalty)
45 | * Add large binary files (20% penalty)
46 | * Miss the deadline (10% penalty/day)
47 | 


--------------------------------------------------------------------------------
/lab0/lab0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnjohnlin/GPGPU_Programming_2016S/40c8f1ddd275cd6908271e847390f5078f6ce0c8/lab0/lab0.pdf


--------------------------------------------------------------------------------
/lab0/lab0.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[12pt,a4paper]{article}
  2 | 
  3 | % \newcommand*{\TypeChinese}{} % Chinese support
  4 | \newcommand*{\AdvancedDocument}{} % include code and math
  5 | \newcommand*{\WithHeader}{}
  6 | 
  7 | % basic packages
  8 | \usepackage[margin=2cm]{geometry}
  9 | \usepackage{graphicx,subfigure,indentfirst,hyperref,colortbl,caption,cite,color,xcolor}
 10 | \hypersetup{colorlinks=true,urlcolor=blue,linkcolor=blue}
 11 | 
 12 | \ifdefined\AdvancedDocument
 13 | 	% minted is better than listing.
 14 | 	\usepackage{minted}
 15 | 	% it requires minted of a newer version.
 16 | 	\setminted{linenos=true, frame=lines, framesep=2mm}
 17 | 	\usepackage{amsmath,amssymb,bm}
 18 | \fi
 19 | 
 20 | \ifdefined\TypeChinese
 21 | 	\usepackage{xeCJK,fontspec}
 22 | 	\XeTeXlinebreaklocale "zh"
 23 | 	\XeTeXlinebreakskip = 0pt plus 1pt
 24 | 	\setmainfont{KaiGen Gothic TW}
 25 | 	\setCJKmainfont{KaiGen Gothic TW}
 26 | 	\setmonofont{Droid Sans Mono}
 27 | 	\renewcommand{\baselinestretch}{1.3}
 28 | \fi
 29 | 
 30 | \ifdefined\WithHeader
 31 | 	\usepackage{fancyhdr}
 32 | 	\fancypagestyle{plain}{
 33 | 		\fancyhf{}
 34 | 		\chead{GPU Programming 2017 Spring \textbar ~CSIE Department, National Taiwan University}
 35 | 		\cfoot{\thepage}
 36 | 		\rfoot{GPGPU Assignment \#0}
 37 | 	}
 38 | 	\pagestyle{plain}
 39 | 	\renewcommand{\headrulewidth}{1pt}
 40 | 	\renewcommand{\footrulewidth}{2pt}
 41 | \fi
 42 | 
 43 | \newcommand{\figref}[1]{Figure \ref{Fig:#1}.}
 44 | \newcommand{\tabref}[1]{Table \ref{Tab:#1}.}
 45 | % \graphicspath{{fig/}}
 46 | 
 47 | \begin{document}
 48 | \title{GPGPU Assignment \#0}
 49 | \author{TA: Yu Sheng Lin \and Instructor: Wei Chao Chen}
 50 | \maketitle
 51 | 
 52 | \section{Goals}
 53 | 
 54 | You have to
 55 | 
 56 | \begin{enumerate}
 57 | \item Get your OS/IDE/editor configured and be ready to write CUDA code (How? Google is your friend).
 58 | \item Become familiar with CUDA syntax (You should have learnt some of it during the first lecture).
 59 | \end{enumerate}
 60 | 
 61 | \section{Requirements}
 62 | 
 63 | In this assignment you have to draw something in text by CUDA.
 64 | We have provided a skeleton and some utility classes (\verb+SyncedMemory<T>+ and \verb+MemoryBuffer<T>+).
 65 | 
 66 | We allocate a buffer of size $40\times 12$ but one linebreak is required for each line
 67 | , so the actual drawing area is $39\times 12$ including the boundary, which consists of colons.
 68 | 
 69 | Here we show a possible output.
 70 | \begin{listing}
 71 | \begin{minted}{text}
 72 | :::::::::::::::::::::::::::::::::::::::
 73 | :                                     :
 74 | :                                     :
 75 | :                                     :
 76 | :                                     :
 77 | :                 ####          <|    :
 78 | :               ######           |    :
 79 | :             ########           |    :
 80 | :           ##########           |    :
 81 | :         ############           |    :
 82 | :       ##############           #    :
 83 | :::::::::::::::::::::::::::::::::::::::
 84 | \end{minted}
 85 | \caption{The famous scene in Nintendo Super Mario.}
 86 | \end{listing}
 87 | 
 88 | \section{Submission}
 89 | \begin{itemize}
 90 | \item The submission deadline is the midnight on 3/1 Wed. (namely before 3/2).
 91 | \item You will be officially registered to this course only if you complete and submit a working solution in time.
 92 | \item We will clone your code through Git using script; you may continue to revise your code before the deadline, but we will use the last revision before the deadline.
 93 | \item Use a non-public Git repository such as Bitbucket, and make sure that your code can be cloned by these accounts: \url{https://bitbucket.org/johnjohnlys/} or \url{https://github.com/johnjohnlin}.
 94 | \item Please fill your information and git repo in \href{https://goo.gl/forms/1R7p6QRMlrnKuImu1}{this form}. TA will test your Git URL one day before the deadline, so you can modify your URL in time if it doesn't work.
 95 | \item You should complete this homework by yourself. Do not plagiarize, and do not facilitate plagiarism.  Make sure your Git repository is not accessible by your classmates.
 96 | \end{itemize}
 97 | 
 98 | Please keep the directory structure of the repo we have provided. For this assignment, we will only judge \verb+lab0/main.cu+.
 99 | 
100 | \section{Hints}
101 | 
102 | \begin{itemize}
103 | \item \verb+"SyncedMemory.h"+ is under the directory \verb+utils/+.
104 | \item C++11 is required throughout all of the assignments. AFAIK, Visual Studio 2013 and g++ 4.8 or later are recommended.
105 | \item Do not spend much time on optimization or fancy functionalities. This assignment is just for qualification and will not be used for grading.
106 | \end{itemize}
107 | 
108 | \end{document}
109 | 


--------------------------------------------------------------------------------
/lab0/main.cu:
--------------------------------------------------------------------------------
 1 | #include <cstdio>
 2 | #include <cstdlib>
 3 | #include "SyncedMemory.h"
 4 | 
 5 | #define CHECK { /* Synchronize with the device; on a CUDA error print file:line and the error text, then abort. */\
 6 | 	auto e = cudaDeviceSynchronize(); /* status reported once the device has been synchronized */\
 7 | 	if (e != cudaSuccess) {\
 8 | 		printf("At " __FILE__ ":%d, %s\n", __LINE__, cudaGetErrorString(e));\
 9 | 		abort(); /* no recovery is attempted */\
10 | 	}\
11 | }
12 | 
13 | const int W = 40;
14 | const int H = 12;
15 | 
16 | __global__ void Draw(char *frame) {
17 | 	// TODO: draw more complex things here
18 | 	// Do not just submit the original file provided by the TA!
19 | 	// Each thread owns one cell (col, row) of the row-major W x H text frame.
20 | 	const int col = blockIdx.x * blockDim.x + threadIdx.x;
21 | 	const int row = blockIdx.y * blockDim.y + threadIdx.y;
22 | 	// Threads that fall outside the frame have nothing to write.
23 | 	if (row >= H or col >= W) return;
24 | 	char cell = ' '; // interior cells stay blank
25 | 	if (col == W-1) {
26 | 		// Last column ends the text line; the very last cell terminates the string.
27 | 		cell = (row == H-1) ? '\0' : '\n';
28 | 	} else if (row == 0 or row == H-1 or col == 0 or col == W-2) {
29 | 		cell = ':'; // one-character border around the drawing area
30 | 	}
31 | 	frame[row*W+col] = cell;
32 | }
33 | 
34 | int main(int argc, char **argv)
35 | {
36 | 	// Storage for the W x H text frame and the synced handle created from it.
37 | 	MemoryBuffer<char> frame_storage(W*H);
38 | 	auto synced_frame = frame_storage.CreateSync(W*H);
39 | 	CHECK;
40 | 
41 | 	// 16x12 threads per block; enough blocks to cover the whole frame.
42 | 	const dim3 block_dim(16, 12);
43 | 	const dim3 grid_dim((W-1)/16+1, (H-1)/12+1);
44 | 	char *device_frame = synced_frame.get_gpu_wo();
45 | 	Draw<<<grid_dim, block_dim>>>(device_frame);
46 | 	CHECK;
47 | 
48 | 	// The kernel writes a terminating '\0' in the last cell, so the frame prints as a C string.
49 | 	puts(synced_frame.get_cpu_ro());
50 | 	CHECK;
51 | 	return 0;
52 | }


--------------------------------------------------------------------------------
/lab1/lab1.cu:
--------------------------------------------------------------------------------
 1 | #include "lab1.h"
 2 | static const unsigned W = 640;
 3 | static const unsigned H = 480;
 4 | static const unsigned NFRAME = 240;
 5 | 
 6 | struct Lab1VideoGenerator::Impl { // private state of the generator, kept out of lab1.h
 7 | 	int t = 0; // frame counter: read and then incremented by every Generate() call
 8 | };
 9 | 
10 | Lab1VideoGenerator::Lab1VideoGenerator(): impl(new Impl) { // allocates the private state owned by the impl member
11 | }
12 | 
13 | Lab1VideoGenerator::~Lab1VideoGenerator() {} // defined in this file, after Impl has been defined
14 | 
15 | void Lab1VideoGenerator::get_info(Lab1VideoInfo &info) { // fills info with frame size, frame count and frame rate
16 | 	info.w = W;
17 | 	info.h = H;
18 | 	info.n_frame = NFRAME;
19 | 	// The frame rate is the ratio fps_n/fps_d, here 24/1 = 24 FPS.
20 | 	info.fps_n = 24;
21 | 	info.fps_d = 1;
22 | }
23 | 
24 | 
25 | // Fills the buffer yuv with the next frame: the Y plane first, then the chroma data.
26 | void Lab1VideoGenerator::Generate(uint8_t *yuv) {
27 | 	const unsigned luma_bytes = W*H;
28 | 	const unsigned chroma_bytes = W*H/2;
29 | 	// Uniform brightness that grows with the frame counter.
30 | 	cudaMemset(yuv, (impl->t)*255/NFRAME, luma_bytes);
31 | 	// 128 in all chroma bytes, i.e. no color (grayscale).
32 | 	cudaMemset(yuv + luma_bytes, 128, chroma_bytes);
33 | 	impl->t += 1;
34 | }
35 | 


--------------------------------------------------------------------------------
/lab1/lab1.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cstdint>
 3 | #include <memory>
 4 | using std::unique_ptr;
 5 | 
 6 | struct Lab1VideoInfo { // video parameters filled in by Lab1VideoGenerator::get_info()
 7 | 	unsigned w, h, n_frame; // frame width, frame height, number of frames
 8 | 	unsigned fps_n, fps_d; // frame rate expressed as the ratio fps_n/fps_d
 9 | };
10 | 
11 | class Lab1VideoGenerator { // frame generator whose state is hidden behind the opaque Impl
12 | 	struct Impl; // only declared here; the definition is not part of this header
13 | 	unique_ptr<Impl> impl; // owns the private state
14 | public:
15 | 	Lab1VideoGenerator();
16 | 	~Lab1VideoGenerator();
17 | 	void get_info(Lab1VideoInfo &info); // reports the video parameters through info
18 | 	void Generate(uint8_t *yuv); // writes one frame into yuv
19 | };
20 | 


--------------------------------------------------------------------------------
/lab1/lab1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnjohnlin/GPGPU_Programming_2016S/40c8f1ddd275cd6908271e847390f5078f6ce0c8/lab1/lab1.pdf


--------------------------------------------------------------------------------
/lab1/lab1.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[12pt,a4paper]{article}
  2 | 
  3 | % \newcommand*{\TypeChinese}{} % Chinese support
  4 | \newcommand*{\AdvancedDocument}{} % include code and math
  5 | \newcommand*{\WithHeader}{}
  6 | 
  7 | % basic packages
  8 | \usepackage[margin=2cm,headheight=15pt]{geometry}
  9 | \usepackage{graphicx,subfigure,indentfirst,hyperref,colortbl,caption,cite,color,xcolor}
 10 | \hypersetup{colorlinks=true,urlcolor=blue,linkcolor=blue}
 11 | 
 12 | \ifdefined\AdvancedDocument
 13 | 	% minted is better than listing.
 14 | 	\usepackage{minted}
 15 | 	% it requires minted of a newer version.
 16 | 	\setminted{linenos=true, frame=lines, framesep=2mm}	
 17 | 	\usepackage{amsmath,amssymb,bm}
 18 | \fi
 19 | 
 20 | \ifdefined\TypeChinese
 21 | 	\usepackage{xeCJK,fontspec}
 22 | 	\XeTeXlinebreaklocale "zh"
 23 | 	\XeTeXlinebreakskip = 0pt plus 1pt
 24 | 	\setmainfont{KaiGen Gothic TW}
 25 | 	\setCJKmainfont{KaiGen Gothic TW}
 26 | 	\setmonofont{Droid Sans Mono}
 27 | 	\renewcommand{\baselinestretch}{1.3}
 28 | \fi
 29 | 
 30 | \ifdefined\WithHeader
 31 | 	\usepackage{fancyhdr}
 32 | 	\fancypagestyle{plain}{
 33 | 		\fancyhf{}
 34 | 		\chead{GPU Programming 2018 Spring \textbar ~CSIE Department, National Taiwan University}
 35 | 		\cfoot{\thepage}
 36 | 		\rfoot{GPGPU Assignment \#1}
 37 | 	}
 38 | 	\pagestyle{plain}
 39 | 	\renewcommand{\headrulewidth}{1pt}
 40 | 	\renewcommand{\footrulewidth}{2pt}
 41 | \fi
 42 | 
 43 | \newcommand{\figref}[1]{Figure \ref{Fig:#1}.}
 44 | \newcommand{\tabref}[1]{Table \ref{Tab:#1}.}
 45 | \newcommand{\lstref}[1]{Listing \ref{Lst:#1}.}
 46 | \usepackage{xcolor}
 47 | % \graphicspath{{fig/}}
 48 | 
 49 | \begin{document}
 50 | \title{GPGPU Assignment \#1}
 51 | \author{Author: Yu Sheng Lin \and Instructor: Wei Chao Chen}
 52 | \maketitle
 53 | 
 54 | \section{Goals}
 55 | 
 56 | Make a video with CUDA in this assignment.  Be creative -- this is an open-ended assignment.
 57 | 
 58 | \section{Requirements}
 59 | 
 60 | The requirements are very loose.  You just need to render a series of frames with CUDA, with as little input data as possible.
 61 | For example, you can try to render Perlin Noise, Simplex Noise, or Stable Fluid.
 62 | Be more artistic and try to render fireworks, ocean or even a short movie.
 63 | Google ``Demoscene'' and see some extreme examples about what people can do with very small executables.
 64 | 
 65 | \subsection{Video Format}
 66 | 
 67 | In this lab we will first call \verb+void get_info(Lab1VideoInfo &info)+
 68 | to get the height $H$, width $W$, FPS
 69 | $N/D$ \footnote{We use a ratio to represent the FPS because we are using the y4m format. If you want 24 FPS, you shall return $N = 24, D = 1$.}
 70 | and number of frames $N_f$ of your video.
 71 | Then we will call your \verb+void Generate(uint8_t *yuv)+ $N_f$ times to get all the frames for your video.
 72 | 
 73 | Instead of storing RGB values directly,
 74 | please use the YUV color space, which is the commonly used color space for video codecs.
 75 | 
 76 | \begin{equation}
 77 | \left\lbrace
 78 | \begin{aligned}
 79 | Y &= +0.299R+0.587G+0.114B\\
 80 | U &= -0.169R-0.331G+0.500B+128\\
 81 | V &= +0.500R-0.419G-0.081B+128
 82 | \end{aligned}
 83 | \right.
 84 | \end{equation}
 85 | 
 86 | We also subsample U and V channels both horizontally and vertically,
 87 | that is, 
 88 | $\dfrac{W}{2}*\dfrac{H}{2}$\footnote{$W$ and $H$ must both be even.}.
 89 | The YUV channels are stored sequentially so the total size of a frame is $1.5WH$.
 90 | 
 91 | \subsection{Sample code}
 92 | The sample code (\lstref{sample}) generates a grayscale video
 93 | which is black initially and gradually becomes lighter.
 94 | ``Grayscale'' means $R=G=B$, so the YUV is:
 95 | 
 96 | \begin{equation}
 97 | \left\lbrace
 98 | \begin{aligned}
 99 | Y &= +0.299R+0.587R+0.114R = R\\
100 | U &= -0.169R-0.331R+0.500R+128 = 128\\
101 | V &= +0.500R-0.419R-0.081R+128 = 128
102 | \end{aligned}
103 | \right.
104 | \end{equation}
105 | 
106 | The Y channel is $W*H$,
107 | so we first calculate the brightness according to the frame index
108 | then fill it into the device pointer.
109 | 
110 | The U and V channels are $\dfrac{W}{2}*\dfrac{H}{2}$ each,
111 | so we fill 128 to the following memory space.
112 | Finally, we increase the frame index by one.
113 | 
114 | \begin{listing}
115 | \begin{minted}{c}
116 | void Lab1VideoGenerator::Generate(uint8_t *yuv) {
117 |    cudaMemset(yuv, (impl->t)*255/NFRAME, W*H);
118 |    cudaMemset(yuv+W*H, 128, W*H/2);
119 |    ++(impl->t);
120 | }
121 | \end{minted}
122 | \caption{Sample code explanation}\label{Lst:sample}
123 | \end{listing}
124 | 
125 | The output file is of the y4m raw video format (it can be very large!)
126 | and you can use software such as
127 | FFmpeg or Avconv to convert it to other compressed formats (\lstref{avconv}).
128 | \begin{listing}
129 | \begin{minted}{bash}
130 | avconv -i result.y4m output.mkv
131 | \end{minted}
132 | \caption{Converting the raw y4m output with Avconv}\label{Lst:avconv}
133 | \end{listing}
134 | 
135 | Hints: We do not allow you to modify the header,
136 | and if you don't know how to achieve that,
137 | please read \href{https://stackoverflow.com/questions/8972588/is-the-pimpl-idiom-really-used-in-practice}{Pimpl Idiom} or \href{https://en.wikipedia.org/wiki/Opaque_pointer}{Opaque Pointer}.
138 | 
139 | \section{Submission}
140 | \begin{itemize}
141 | \item The submission deadline is 2018/4/10 23:59.
142 | \item Apart from submitting your code in time,
143 | you must also post the link on the course Facebook group
144 | {\color{red}AND} fill the Facebook link in
145 | \href{https://goo.gl/forms/iDL8aVlFyKXIRr2k1}{this form}.
146 | As there can be delays with Facebook posts, we won't be super strict about the deadline, but please do make sure to complete the spreadsheet on time.
147 | \item We won't run your code but you still have to submit it in time. The TA will still clone your code and look into it if it becomes necessary. (We expelled a few students owing to plagiarism last year.)
148 | \item Do not add ANY video in your Git HISTORY, or you will suffer 20\% penalty. We recommend you to use the \verb+.gitignore+ file.
149 | \item Your grade for this assignment is determined by both "techniques" and "aesthetics". You may also promote your work on Facebook and we will consider giving you bonus points based on your post popularity.
150 | \item For those who don't know how to get the link for a video, just right click on the video.
151 | \item You can add music or text to the video, but we do not allow post-processing of any other form; otherwise you will suffer a 10\% penalty.
152 | \end{itemize}
153 | \end{document}
154 | 


--------------------------------------------------------------------------------
/lab1/main.cu:
--------------------------------------------------------------------------------
 1 | #include <cstdio>
 2 | #include <cstdint>
 3 | #include <cstdlib>
 4 | #include "SyncedMemory.h"
 5 | #include "lab1.h"
 6 | using namespace std;
 7 | 
 8 | #define CHECK { /* Synchronize with the device; on a CUDA error print file:line and the error text, then abort. */\
 9 | 	auto e = cudaDeviceSynchronize(); /* status reported once the device has been synchronized */\
10 | 	if (e != cudaSuccess) {\
11 | 		printf("At " __FILE__ ":%d, %s\n", __LINE__, cudaGetErrorString(e));\
12 | 		abort(); /* no recovery is attempted */\
13 | 	}\
14 | }
15 | 
16 | // Queries the generator for the video parameters, then writes every frame it
17 | // produces to result.y4m. Aborts on invalid parameters or on file errors.
18 | int main(int argc, char **argv)
19 | {
20 | 	Lab1VideoGenerator g;
21 | 	Lab1VideoInfo i;
22 | 
23 | 	g.get_info(i);
24 | 	if (i.w == 0 or i.h == 0 or i.n_frame == 0 or i.fps_n == 0 or i.fps_d == 0) {
25 | 		puts("Cannot be zero");
26 | 		abort();
27 | 	} else if (i.w%2 != 0 or i.h%2 != 0) {
28 | 		puts("Only even frame size is supported");
29 | 		abort();
30 | 	}
31 | 	// w*h bytes of Y followed by w*h/2 bytes of chroma.
32 | 	unsigned FRAME_SIZE = i.w*i.h*3/2;
33 | 	MemoryBuffer<uint8_t> frameb(FRAME_SIZE);
34 | 	auto frames = frameb.CreateSync(FRAME_SIZE);
35 | 	FILE *fp = fopen("result.y4m", "wb");
36 | 	if (fp == nullptr) {
37 | 		// Without this check the fprintf below would be handed a null FILE*.
38 | 		perror("result.y4m");
39 | 		abort();
40 | 	}
41 | 	// The info fields are unsigned, so they are printed with %u.
42 | 	fprintf(fp, "YUV4MPEG2 W%u H%u F%u:%u Ip A1:1 C420\n", i.w, i.h, i.fps_n, i.fps_d);
43 | 
44 | 	for (unsigned j = 0; j < i.n_frame; ++j) {
45 | 		fputs("FRAME\n", fp);
46 | 		g.Generate(frames.get_gpu_wo());
47 | 		// A short write (e.g. disk full) would silently truncate the video.
48 | 		if (fwrite(frames.get_cpu_ro(), sizeof(uint8_t), FRAME_SIZE, fp) != FRAME_SIZE) {
49 | 			perror("result.y4m");
50 | 			abort();
51 | 		}
52 | 	}
53 | 
54 | 	fclose(fp);
55 | 	return 0;
56 | }
57 | 


--------------------------------------------------------------------------------
/lab2/counting.cu:
--------------------------------------------------------------------------------
 1 | #include "counting.h"
 2 | #include <cstdio>
 3 | #include <cassert>
 4 | #include <thrust/scan.h>
 5 | #include <thrust/transform.h>
 6 | #include <thrust/functional.h>
 7 | #include <thrust/device_ptr.h>
 8 | #include <thrust/execution_policy.h>
 9 | 
10 | __device__ __host__ int CeilDiv(int a, int b) { return (a-1)/b + 1; } // a/b rounded up for a >= 1 and b >= 1; NOTE: a == 0 yields 1 (not 0) when b > 1, because (0-1)/b truncates to 0
11 | __device__ __host__ int CeilAlign(int a, int b) { return CeilDiv(a, b) * b; } // CeilDiv(a, b) multiplied by b: for a >= 1 and b >= 1, the smallest multiple of b that is >= a
12 | 
13 | void CountPosition1(const char *text, int *pos, int text_size)
14 | { // Empty in this file: the part I implementation goes here. text_size is the number of elements in text and pos.
15 | }
16 | 
17 | void CountPosition2(const char *text, int *pos, int text_size)
18 | { // Empty in this file: the part II implementation goes here. text_size is the number of elements in text and pos.
19 | }
20 | 


--------------------------------------------------------------------------------
/lab2/counting.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | void CountPosition1(const char *text, int *pos, int text_size);
3 | void CountPosition2(const char *text, int *pos, int text_size);


--------------------------------------------------------------------------------
/lab2/lab2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnjohnlin/GPGPU_Programming_2016S/40c8f1ddd275cd6908271e847390f5078f6ce0c8/lab2/lab2.pdf


--------------------------------------------------------------------------------
/lab2/lab2.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[12pt,a4paper]{article}
  2 | 
  3 | % \newcommand*{\TypeChinese}{} % Chinese support
  4 | \newcommand*{\AdvancedDocument}{} % include code and math
  5 | \newcommand*{\WithHeader}{}
  6 | 
  7 | % basic packages
  8 | \usepackage[margin=2cm,headheight=15pt]{geometry}
  9 | \usepackage{graphicx,subfigure,indentfirst,hyperref,colortbl,caption,cite,color,xcolor}
 10 | \hypersetup{colorlinks=true,urlcolor=blue,linkcolor=blue}
 11 | % for handling underscores
 12 | \usepackage{lmodern,relsize}
 13 | \usepackage[T1]{fontenc}
 14 | \renewcommand{\_}{\textscale{.5}{\textunderscore}}
 15 | 
 16 | \ifdefined\AdvancedDocument
 17 | 	% minted is better than listing.
 18 | 	\usepackage{minted}
 19 | 	% it requires minted of a newer version.
 20 | 	\setminted{linenos=true, frame=lines, framesep=2mm}	
 21 | 	\usepackage{amsmath,amssymb,bm}
 22 | \fi
 23 | 
 24 | \ifdefined\TypeChinese
 25 | 	\usepackage{xeCJK,fontspec}
 26 | 	\XeTeXlinebreaklocale "zh"
 27 | 	\XeTeXlinebreakskip = 0pt plus 1pt
 28 | 	\setmainfont{KaiGen Gothic TW}
 29 | 	\setCJKmainfont{KaiGen Gothic TW}
 30 | 	\setmonofont{Droid Sans Mono}
 31 | 	\renewcommand{\baselinestretch}{1.3}
 32 | \fi
 33 | 
 34 | \ifdefined\WithHeader
 35 | 	\usepackage{fancyhdr}
 36 | 	\fancypagestyle{plain}{
 37 | 		\fancyhf{}
 38 | 		\chead{GPU Programming 2018 Spring \textbar ~Dep. of CSIE, NTU}
 39 | 		\cfoot{\thepage}
 40 | 		\rfoot{Programming Assignment \#2}
 41 | 	}
 42 | 	\pagestyle{plain}
 43 | 	\renewcommand{\headrulewidth}{1pt}
 44 | 	\renewcommand{\footrulewidth}{2pt}
 45 | \fi
 46 | 
 47 | \newcommand{\figref}[1]{Figure \ref{Fig:#1}.}
 48 | \newcommand{\tabref}[1]{Table \ref{Tab:#1}.}
 49 | \newcommand{\lstref}[1]{Listing \ref{Lst:#1}.}
 50 | % \graphicspath{{fig/}}
 51 | 
 52 | \begin{document}
 53 | \title{Programming Assignment \#2}
 54 | \author{Author: Yu Sheng Lin \and Instructor: Wei Chao Chen}
 55 | \maketitle
 56 | 
 57 | \section{Goals}
 58 | 
 59 | You have to
 60 | 
 61 | \begin{enumerate}
 62 | \item Learn how to use the Thrust library.
 63 | \item Learn how to implement some parallel algorithms on GPU.
 64 | \end{enumerate}
 65 | 
 66 | in this assignment.
 67 | 
 68 | \section{Requirements}
 69 | 
 70 | This is a two-part fixed assignment.
 71 | You are asked to implement the same functionality in both parts.
 72 | However, in part I you are only allowed to use existing libraries
 73 | while in part II the efficiency of your code is also considered during grading.
 74 | 
 75 | In this assignment you are required to count the position
 76 | of each character inside the word it belongs to.
 77 | \lstref{count} provides a sample input and output.
 78 | 
 79 | \begin{listing}[ht]
 80 | \begin{minted}{text}
 81 | gpu qq  a hello   sonoda (input)
 82 | 123012001012345000123456 (output)
 83 | \end{minted}
 84 | \caption{Example: Count the Position in Words.}\label{Lst:count}
 85 | \end{listing}
 86 | 
 87 | You may easily come up with an $O(n)$ sequential algorithm
 88 | and an $O(nk)$ parallel algorithm,
 89 | where $n$ is the length of the input and $k$ is the maximum length of a word.
 90 | 
 91 | The input is generated in a pseudo-random manner,
 92 | while we will use a different random seed during grading.
 93 | You can assume that $k=500,~n\approx 4\times10^7$ and
 94 | the input only contains characters \verb+[a-z]+
 95 | and we use linebreak \verb+'\n'+ as the spaces.
 96 | 
 97 | You have to implement the functions whose signatures are shown in \lstref{count_cpp}
 98 | All pointers are device pointers and \verb+text_size+ is $n$.
 99 | It should also be noticed that \verb+gridDim.x+ cannot exceed $2^{17}=131072$
100 | if you don't use \verb|-arch sm_30| (or higher)
101 | \footnote{We assume that you have a GPU newer than Kepler architecture.} compile flag.
102 | 
103 | \begin{listing}[ht]
104 | \begin{minted}{cpp}
105 | void CountPosition1(const char *text, int *pos, int text_size);
106 | void CountPosition2(const char *text, int *pos, int text_size);
107 | \end{minted}
108 | \caption{The function signatures of part I and part II.}\label{Lst:count_cpp}
109 | \end{listing}
110 | 
111 | \subsection{Part I: Using the Thrust Library (40pts)}\label{thrust}
112 | 
113 | Thrust is a useful API including many common parallel computing patterns,
114 | and in this part you should only include
115 | \begin{itemize}
116 | \item Declaration of native and \verb+thrust::*+ types,
117 | \item A few \verb+struct+s with call operator and
118 | \item \verb+thrust::*+ and native CUDA functions like \verb+cudaFree+
119 |       (namely \verb+__global__+ functions are not allowed)
120 | \end{itemize}
121 | in \verb+CountPosition1+.
122 | 
123 | Here are some hints:
124 | \begin{itemize}
125 | \item You will need the document \url{https://thrust.github.io/doc/modules.html}
126 | \item I have already included some necessary headers.
127 | \item If you write more than 10 lines, then you are probably wrong.
128 | \end{itemize}
129 | 
130 | \subsection{Part II: Implement Your Own Kernel (60pts+15pts bonus)}
131 | 
132 | In part II, you have to implement the same functionality from scratch,
133 | and {\color{red}{no external API and library is allowed}}.
134 | We provide some hints about the $O(n\ln k)$ algorithm in a separate PDF
135 | while it's not the only solution,
136 | and you can decide whether to read them by yourself.
137 | 
138 | According to our experience, when $k=500$, $O(n\ln k)$ and $O(nk)$
139 | might have comparable speed. To achieve the best efficiency and
140 | outperform your classmates, we also challenge you to implement both and
141 | compare them if possible.
142 | According to the execution time of your program,
143 | at most 15pts bonus will be given.
144 | 
145 | \section{Submission}
146 | 
147 | \begin{itemize}
148 | \item The submission deadline is 2018/5/10 23:59 (Thur.).
149 | \item The efficiency of part I will NOT be considered during grading, but if you
150 | break the rules listed in part \ref{thrust}, you will get 0pt.
151 | \item The efficiency of part II will be considered during grading, and pure CPU implementations will be disqualified.
152 | \item You can only modify \verb+lab2/counting.cu+, and we will only copy this file from your repo.
153 | \item The compile flags are \verb|--std=c++11 -O2 -arch sm_50|, and the environment is ArchLinux with CUDA 9.1. Although we will test your code on a Linux machine with one GTX 970, you should not use platform dependent libraries.
154 | \item Do not hack the answer by inspecting the memory or modifying the stack. This is a course about GPGPU programming, not assembly programming.
155 | \item Please also refer to assignment \#0 for more details.
156 | \end{itemize}
157 | 
158 | \end{document}
159 | 


--------------------------------------------------------------------------------
/lab2/main.cu:
--------------------------------------------------------------------------------
  1 | #include <random>
  2 | #include <vector>
  3 | #include <tuple>
  4 | #include <cstdio>
  5 | #include <cstdlib>
  6 | #include <functional>
  7 | #include <algorithm>
  8 | #include "SyncedMemory.h"
  9 | #include "Timer.h"
 10 | #include "counting.h"
 11 | using namespace std;
 12 | 
 13 | #define CHECK { /* Synchronize with the device; on a CUDA error print file:line and the error text, then abort. */\
 14 | 	auto e = cudaDeviceSynchronize(); /* status reported once the device has been synchronized */\
 15 | 	if (e != cudaSuccess) {\
 16 | 		printf("At " __FILE__ ":%d, %s\n", __LINE__, cudaGetErrorString(e));\
 17 | 		abort(); /* no recovery is attempted */\
 18 | 	}\
 19 | }
 20 | 
 21 | template <typename Engine>
 22 | tuple<vector<char>, vector<int>> GenerateTestCase(Engine &eng, const int N) { // returns (text, expected positions); text holds at least N characters
 23 | 	poisson_distribution<int> pd(14.0); // base word length before the -5 shift below
 24 | 	bernoulli_distribution bd(0.1); // true with probability 0.1: add the long-word bonus
 25 | 	uniform_int_distribution<int> id1(1, 20); // bonus factor, multiplied by 20 below
 26 | 	uniform_int_distribution<int> id2(1, 5); // length of a separator run
 27 | 	uniform_int_distribution<int> id3('a', 'z'); // letters of a word
 28 | 	tuple<vector<char>, vector<int>> ret;
 29 | 	auto &text = get<0>(ret);
 30 | 	auto &pos = get<1>(ret);
 31 | 	auto gen_rand_word_len = [&] () -> int {
 32 | 		return max(1, min(500, pd(eng) - 5 + (bd(eng) ? id1(eng)*20 : 0))); // clamped to [1, 500]; NOTE(review): pd(eng) and bd(eng) are operands of the same '+', so the language does not fix which one draws from eng first
 33 | 	};
 34 | 	auto gen_rand_space_len = [&] () -> int {
 35 | 		return id2(eng);
 36 | 	};
 37 | 	auto gen_rand_char = [&] () {
 38 | 		return id3(eng);
 39 | 	};
 40 | 	auto AddWord = [&] () { // appends one word; pos receives 1, 2, ..., n for its characters
 41 | 		int n = gen_rand_word_len();
 42 | 		for (int i = 0; i < n; ++i) {
 43 | 			text.push_back(gen_rand_char());
 44 | 			pos.push_back(i+1);
 45 | 		}
 46 | 	};
 47 | 	auto AddSpace = [&] () { // appends a run of '\n' separators; pos receives 0 for each of them
 48 | 		int n = gen_rand_space_len();
 49 | 		for (int i = 0; i < n; ++i) {
 50 | 			text.push_back('\n');
 51 | 			pos.push_back(0);
 52 | 		}
 53 | 	};
 54 | 
 55 | 	AddWord();
 56 | 	while (text.size() < N) { // each iteration ends with a word, so the text never ends with a separator
 57 | 		AddSpace();
 58 | 		AddWord();
 59 | 	}
 60 | 	return ret;
 61 | }
 62 | 
 63 | // Runs and times one CountPosition implementation, then compares its output with golden.
 64 | void TestRoutine(
 65 | 	SyncedMemory<int>& yours_sync, SyncedMemory<char>& text_sync,
 66 | 	const int n, const int part, const int *golden
 67 | ) {
 68 | 	Timer timer_count_position;
 69 | 	// Clear the output buffer on the device before the timed section.
 70 | 	int *out_gpu = yours_sync.get_gpu_wo();
 71 | 	cudaMemset(out_gpu, 0, sizeof(int)*n);
 72 | 
 73 | 	// part 1 selects CountPosition1; any other value selects CountPosition2.
 74 | 	auto count_position = (part == 1) ? CountPosition1 : CountPosition2;
 75 | 
 76 | 	// Timed section: the call plus the device synchronization done by CHECK.
 77 | 	timer_count_position.Start();
 78 | 	count_position(text_sync.get_gpu_ro(), out_gpu, n);
 79 | 	CHECK;
 80 | 	timer_count_position.Pause();
 81 | 
 82 | 	// Index of the first element differing from golden; n when all agree.
 83 | 	const int *out_cpu = yours_sync.get_cpu_ro();
 84 | 	const int first_diff = mismatch(golden, golden+n, out_cpu).first - golden;
 85 | 
 86 | 	printf_timer(timer_count_position);
 87 | 	if (first_diff == n) {
 88 | 		printf("Part %d AC\n", part);
 89 | 	} else {
 90 | 		printf("Part %d WA\n", part);
 91 | 	}
 92 | }
 93 | #define KB <<10
 94 | #define MB <<20
 95 | // Builds a fixed-seed random test case and runs TestRoutine for part 2, then part 1.
 96 | int main(int argc, char **argv)
 97 | {
 98 | 	// Initialize random text
 99 | 	default_random_engine engine(12345);
100 | 	auto text_pos_head = GenerateTestCase(engine, 40 MB);
101 | 	vector<char> &text = get<0>(text_pos_head);
102 | 	vector<int> &pos = get<1>(text_pos_head);
103 | 
104 | 	// Prepare buffers
105 | 	int n = text.size();
106 | 	char *text_gpu = nullptr;
107 | 	const auto alloc_status = cudaMalloc(&text_gpu, sizeof(char)*n);
108 | 	if (alloc_status != cudaSuccess) {
109 | 		// Fail early instead of handing an invalid device pointer to the code under test.
110 | 		printf("cudaMalloc failed: %s\n", cudaGetErrorString(alloc_status));
111 | 		abort();
112 | 	}
113 | 	SyncedMemory<char> text_sync(text.data(), text_gpu, n);
114 | 	text_sync.get_cpu_wo(); // touch the cpu data
115 | 	MemoryBuffer<int> yours1_buf(n);
116 | 	MemoryBuffer<int> yours2_buf(n);
117 | 	auto yours1_mb = yours1_buf.CreateSync(n);
118 | 	auto yours2_mb = yours2_buf.CreateSync(n);
119 | 
120 | 	// We test 2 in first to prevent cheating
121 | 	// (each part writes into the buffer that carries its own number).
122 | 	TestRoutine(yours2_mb, text_sync, n, 2, pos.data());
123 | 	TestRoutine(yours1_mb, text_sync, n, 1, pos.data());
124 | 
125 | 	cudaFree(text_gpu);
126 | 	return 0;
127 | }
128 | 


--------------------------------------------------------------------------------
/lab2/part-i-hint.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[12pt,a4paper]{article}
  2 | 
  3 | % \newcommand*{\TypeChinese}{} % Chinese support
  4 | \newcommand*{\AdvancedDocument}{} % include code and math
  5 | \newcommand*{\WithHeader}{}
  6 | 
  7 | % basic packages
  8 | \usepackage[margin=2cm]{geometry}
  9 | \usepackage{graphicx,subfigure,indentfirst,hyperref,colortbl,caption,cite,color,xcolor,tikz,}
 10 | \hypersetup{colorlinks=true,urlcolor=blue,linkcolor=blue}
 11 | 
 12 | \ifdefined\AdvancedDocument
 13 | 	% minted is better than listing.
 14 | 	\usepackage{minted}
 15 | 	% it requires minted of a newer version.
 16 | 	\setminted{linenos=true, frame=lines, framesep=2mm}	
 17 | 	\usepackage{amsmath,amssymb,bm}
 18 | \fi
 19 | 
 20 | \ifdefined\TypeChinese
 21 | 	\usepackage{xeCJK,fontspec}
 22 | 	\XeTeXlinebreaklocale "zh"
 23 | 	\XeTeXlinebreakskip = 0pt plus 1pt
 24 | 	\setmainfont{KaiGen Gothic TW}
 25 | 	\setCJKmainfont{KaiGen Gothic TW}
 26 | 	\setmonofont{Droid Sans Mono}
 27 | 	\renewcommand{\baselinestretch}{1.3}
 28 | \fi
 29 | 
 30 | \ifdefined\WithHeader
 31 | 	\usepackage{fancyhdr}
 32 | 	\fancypagestyle{plain}{
 33 | 		\fancyhf{}
 34 | 		\chead{GPU Programming 2016 Spring \textbar ~CSIE Department, National Taiwan University}
 35 | 		\cfoot{\thepage}
 36 | 		\rfoot{GPGPU Assignment \#1 Hint}
 37 | 	}
 38 | 	\pagestyle{plain}
 39 | 	\renewcommand{\headrulewidth}{1pt}
 40 | 	\renewcommand{\footrulewidth}{2pt}
 41 | \fi
 42 | 
 43 | \newcommand{\figref}[1]{Figure \ref{Fig:#1}.}
 44 | \newcommand{\tabref}[1]{Table \ref{Tab:#1}.}
 45 | \newcommand{\lstref}[1]{Listing \ref{Lst:#1}.}
 46 | % \graphicspath{{fig/}}
 47 | 
 48 | \begin{document}
 49 | 
 50 | Have you heard of the ``binary indexed tree''?
 51 | 
 52 | \begin{itemize}
 53 | \item \url{https://leetcode.com/problems/range-sum-query-mutable/}
 54 | \item \url{http://www.csie.ntnu.edu.tw/~u91029/Sequence.html#8}
 55 | \end{itemize}
 56 | 
 57 | \vspace{3cm}
 58 | More hints in the next page $\downarrow$.
 59 | 
 60 | \pagebreak
 61 | 
 62 | \begin{figure}[H]
 63 | \centering
 64 | \begin{tikzpicture}[scale=1]
 65 | \draw (0,0) rectangle (8,4);
 66 | \foreach \y in {1,...,3}
 67 | 	\draw (0,\y) -- (8,\y);
 68 | \foreach \h/\n/\start in {1/4/1,2/2/2,3/1/4} {
 69 | 	\foreach \x in {1,...,4} {
 70 | 		\draw (\start*\x*2-\start,0) -- (\start*\x*2-\start,\h);	
 71 | 		\ifnum \x=\n
 72 | 			\breakforeach
 73 | 		\fi
 74 | 	}
 75 | }
 76 | \node at (0.5,0.5) {1};
 77 | \node at (1.5,0.5) {1};
 78 | \node at (2.5,0.5) {1};
 79 | \node at (3.5,0.5) {1};
 80 | \node at (4.5,0.5) {1};
 81 | \node at (5.5,0.5) {0};
 82 | \node at (6.5,0.5) {1};
 83 | \node at (7.5,0.5) {0};
 84 | 
 85 | \node at (1,1.5) {2};
 86 | \node at (3,1.5) {2};
 87 | \node at (5,1.5) {0};
 88 | \node at (7,1.5) {0};
 89 | 
 90 | \node at (2,2.5) {4};
 91 | \node at (6,2.5) {0};
 92 | 
 93 | \node at (4,3.5) {0};
 94 | \end{tikzpicture}
 95 | \caption{The binary indexed tree}
 96 | \end{figure}
 97 | 
 98 | The upper level is:
 99 | \begin{itemize}
100 | \item Addition of the lower 2 levels if they are both non-zero.
101 | \item 0, otherwise.
102 | \end{itemize}
103 | 
104 | \vspace{3cm}
105 | More hints in the next page $\downarrow$.
106 | 
107 | \pagebreak
108 | 
109 | \begin{figure}[H]
110 | \centering
111 | \begin{tikzpicture}[scale=1]
112 | \draw (0,0) rectangle (8,4);
113 | \foreach \y in {1,...,3}
114 | 	\draw (0,\y) -- (8,\y);
115 | \foreach \h/\n/\start in {1/4/1,2/2/2,3/1/4} {
116 | 	\foreach \x in {1,...,4} {
117 | 		\draw (\start*\x*2-\start,0) -- (\start*\x*2-\start,\h);
118 | 		\ifnum \x=\n
119 | 			\breakforeach
120 | 		\fi
121 | 	}
122 | }
123 | \node at (0.5,0.5) {0};
124 | \node at (1.5,0.5) {\textcolor{red}{1}};
125 | \node at (2.5,0.5) {1};
126 | \node at (3.5,0.5) {1};
127 | \node at (4.5,0.5) {\textcolor{red}{1}};
128 | \node at (5.5,0.5) {0};
129 | \node at (6.5,0.5) {1};
130 | \node at (7.5,0.5) {0};
131 | 
132 | \node at (1,1.5) {0};
133 | \node at (3,1.5) {\textcolor{red}{2}};
134 | \node at (5,1.5) {0};
135 | \node at (7,1.5) {0};
136 | 
137 | \node at (2,2.5) {0};
138 | \node at (6,2.5) {0};
139 | 
140 | \node at (4,3.5) {0};
141 | \draw[line width=2pt] (5,0) -- (5,1); \node at (5.2,1) {\textcolor{gray}{0}};
142 | \draw[line width=2pt] (4.1,0) -- (4.1,2); \node at (4.3,2) {\textcolor{gray}{1}};
143 | \draw[line width=2pt] (4,0) -- (4,3); \node at (4.2,3) {\textcolor{gray}{2}};
144 | \draw[line width=2pt,dotted] (0,0) -- (0,4); \node at (0.2,4) {\textcolor{gray}{3}};
145 | \draw[line width=2pt] (2,0) -- (2,2); \node at (2.2,2) {\textcolor{gray}{4}};
146 | \draw[line width=2pt] (1,0) -- (1,1); \node at (1.2,1) {\textcolor{gray}{5}};
147 | \end{tikzpicture}
148 | \caption{The binary indexed tree}
149 | \end{figure}
150 | 
151 | \begin{table}[H]
152 | \centering
153 | \begin{tabular}{llll}
154 | \hline
155 | \hline
156 | Check & Result & Length & Loop invariant \\
157 | \hline
158 | Try to add 1 to align to 2 & OK      & 1 & Align to 2 \\
159 | Try to add 2 to align to 4 & Already & 1 & Align to 4 \\
160 | Try to add 4 to align to 8 & Fail    & 1 & Align to 4, less than 4 more 1's\\
161 | Try to add 2               & OK      & 3 & Less than 2 more 1's\\
162 | Try to add 1               & OK      & 4 & Less than 1 more 1's (done)\\
163 | \hline
164 | \hline
165 | \end{tabular}
166 | \caption{The Algorithm}
167 | \end{table}
168 | 
169 | We traverse the tree bottom-up then top-down.
170 | This algorithm can handle at most $(2^h-1)$ 1's
171 | where $h$ is the height of the tree.
172 | So you have to choose $h = 9$ to handle $k = 500$.
173 | 
174 | Note that the ``OK'', ``Already'' and initial states are drawn as bold solid lines, while ``Fail'' is drawn as a bold dotted line.
175 | 
176 | \vspace{3cm}
177 | 
178 | One more example in the next page $\downarrow$.
179 | \pagebreak
180 | 
181 | \begin{figure}[H]
182 | \centering
183 | \begin{tikzpicture}[scale=1]
184 | \draw (0,0) rectangle (8,4);
185 | \foreach \y in {1,...,3}
186 | 	\draw (0,\y) -- (8,\y);
187 | \foreach \h/\n/\start in {1/4/1,2/2/2,3/1/4} {
188 | 	\foreach \x in {1,...,4} {
189 | 		\draw (\start*\x*2-\start,0) -- (\start*\x*2-\start,\h);
190 | 		\ifnum \x=\n
191 | 			\breakforeach
192 | 		\fi
193 | 	}
194 | }
195 | \node at (0.5,0.5) {1};
196 | \node at (1.5,0.5) {1};
197 | \node at (2.5,0.5) {1};
198 | \node at (3.5,0.5) {1};
199 | \node at (4.5,0.5) {0};
200 | \node at (5.5,0.5) {\textcolor{red}{1}};
201 | \node at (6.5,0.5) {\textcolor{red}{1}};
202 | \node at (7.5,0.5) {1};
203 | 
204 | \node at (1,1.5) {2};
205 | \node at (3,1.5) {2};
206 | \node at (5,1.5) {0};
207 | \node at (7,1.5) {2};
208 | 
209 | \node at (2,2.5) {4};
210 | \node at (6,2.5) {0};
211 | 
212 | \node at (4,3.5) {0};
213 | \draw[line width=2pt] (7,0) -- (7,1); \node at (7.2,1) {\textcolor{gray}{0}};
214 | \draw[line width=2pt] (6,0) -- (6,2); \node at (6.2,2) {\textcolor{gray}{1}};
215 | \draw[line width=2pt] (5,0) -- (5,1); \node at (5.2,1) {\textcolor{gray}{3}};
216 | \draw[line width=2pt, dotted] (4,0) -- (4,3); \node at (4.2,3) {\textcolor{gray}{2}};
217 | \end{tikzpicture}
218 | \caption{The binary indexed tree}
219 | \end{figure}
220 | 
221 | \begin{table}[H]
222 | \centering
223 | \begin{tabular}{llll}
224 | \hline
225 | \hline
226 | Check & Result & Length & Loop invariant \\
227 | \hline
228 | Try to add 1 to align to 2 & OK      & 1 & Align to 2 \\
229 | Try to add 2 to align to 4 & Fail    & 1 & Align to 2, less than 2 more 1's\\
230 | Try to add 1               & OK      & 2 & Less than 1 more 1's (done)\\
231 | \hline
232 | \hline
233 | \end{tabular}
234 | \caption{The Algorithm}
235 | \end{table}
236 | 
237 | In this example we ``Fail'' at the lower level.
238 | 
239 | \vspace{3cm}
240 | Sadly I have no more hints for you.
241 | 
242 | \end{document}
243 | 


--------------------------------------------------------------------------------
/lab2/part-ii-hint.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnjohnlin/GPGPU_Programming_2016S/40c8f1ddd275cd6908271e847390f5078f6ce0c8/lab2/part-ii-hint.pdf


--------------------------------------------------------------------------------
/lab3/lab3.cu:
--------------------------------------------------------------------------------
 1 | #include "lab3.h"
 2 | #include <cstdio>
 3 | 
 4 | __device__ __host__ int CeilDiv(int a, int b) { const int full = (a-1)/b; return full + 1; } // ceil(a/b) for a >= 1, b >= 1
 5 | __device__ __host__ int CeilAlign(int a, int b) { const int chunks = CeilDiv(a, b); return chunks * b; } // a rounded up to a multiple of b
 6 | 
 7 | __global__ void SimpleClone(
 8 | 	const float *background,
 9 | 	const float *target,
10 | 	const float *mask,
11 | 	float *output,
12 | 	const int wb, const int hb, const int wt, const int ht,
13 | 	const int oy, const int ox
14 | )
15 | {
16 | 	// One thread per target pixel (xt, yt); `background` is not read by this kernel.
17 | 	const int xt = blockIdx.x * blockDim.x + threadIdx.x;
18 | 	const int yt = blockIdx.y * blockDim.y + threadIdx.y;
19 | 	if (yt >= ht or xt >= wt) { return; } // thread lies outside the target image
20 | 	const int curt = wt*yt+xt;
21 | 	if (not (mask[curt] > 127.0f)) { return; } // pixel is not selected by the mask
22 | 	// Position of this pixel in the background/output image after applying the offset.
23 | 	const int xb = ox+xt, yb = oy+yt;
24 | 	if (yb < 0 or yb >= hb or xb < 0 or xb >= wb) { return; } // would land outside the output
25 | 	const int curb = wb*yb+xb;
26 | 	// Copy the three interleaved channel values of the pixel.
27 | 	for (int c = 0; c < 3; ++c) { output[curb*3+c] = target[curt*3+c]; }
28 | }
29 | 
30 | void PoissonImageCloning(
31 | 	const float *background,
32 | 	const float *target,
33 | 	const float *mask,
34 | 	float *output,
35 | 	const int wb, const int hb, const int wt, const int ht,
36 | 	const int oy, const int ox
37 | )
38 | {
39 | 	// Sample implementation: start from the background, then paste the masked target pixels over it.
40 | 	const size_t canvas_bytes = wb*hb*sizeof(float)*3;
41 | 	cudaMemcpy(output, background, canvas_bytes, cudaMemcpyDeviceToDevice);
42 | 	const dim3 block(32,16), grid(CeilDiv(wt,32), CeilDiv(ht,16)); // enough 32x16 blocks to cover the target
43 | 	SimpleClone<<<grid, block>>>(background, target, mask, output, wb, hb, wt, ht, oy, ox);
44 | }
45 | 


--------------------------------------------------------------------------------
/lab3/lab3.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | void PoissonImageCloning( // clones the masked part of `target` into a copy of `background`, written to `output`
 4 | 	const float *background, // wb*hb pixels, 3 interleaved floats each; used as a device pointer in lab3.cu
 5 | 	const float *target,     // wt*ht pixels, 3 interleaved floats each
 6 | 	const float *mask,       // wt*ht floats; the sample implementation treats values > 127.0f as selected
 7 | 	float *output,           // same size and layout as background
 8 | 	const int wb, const int hb, const int wt, const int ht,
 9 | 	const int oy, const int ox // row / column offset added to target coordinates to get background coordinates
10 | );
11 | 


--------------------------------------------------------------------------------
/lab3/lab3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnjohnlin/GPGPU_Programming_2016S/40c8f1ddd275cd6908271e847390f5078f6ce0c8/lab3/lab3.pdf


--------------------------------------------------------------------------------
/lab3/lab3.tex:
--------------------------------------------------------------------------------
  1 | \documentclass[12pt,a4paper]{article}
  2 | 
  3 | % \newcommand*{\TypeChinese}{} % Chinese support
  4 | \newcommand*{\AdvancedDocument}{} % include code and math
  5 | \newcommand*{\WithHeader}{}
  6 | 
  7 | % basic packages
  8 | \usepackage[margin=2cm]{geometry}
  9 | \usepackage{graphicx,subfigure,indentfirst,hyperref,colortbl,caption,cite,color,xcolor,tikz}
 10 | \hypersetup{colorlinks=true,urlcolor=blue,linkcolor=blue}
 11 | 
 12 | \ifdefined\AdvancedDocument
 13 | 	% minted is better than listing.
 14 | 	\usepackage{minted}
 15 | 	% it requires minted of a newer version.
 16 | 	\setminted{linenos=true, frame=lines, framesep=2mm}	
 17 | 	\usepackage{amsmath,amssymb,bm}
 18 | \fi
 19 | 
 20 | \ifdefined\TypeChinese
 21 | 	\usepackage{xeCJK,fontspec}
 22 | 	\XeTeXlinebreaklocale "zh"
 23 | 	\XeTeXlinebreakskip = 0pt plus 1pt
 24 | 	\setmainfont{KaiGen Gothic TW}
 25 | 	\setCJKmainfont{KaiGen Gothic TW}
 26 | 	\setmonofont{Droid Sans Mono}
 27 | 	\renewcommand{\baselinestretch}{1.3}
 28 | \fi
 29 | 
 30 | \ifdefined\WithHeader
 31 | 	\usepackage{fancyhdr}
 32 | 	\fancypagestyle{plain}{
 33 | 		\fancyhf{}
 34 | 		\chead{GPU Programming 2018 Spring \textbar ~CSIE Department, National Taiwan University}
 35 | 		\cfoot{\thepage}
 36 | 		\rfoot{GPGPU Assignment \#3}
 37 | 	}
 38 | 	\pagestyle{plain}
 39 | 	\renewcommand{\headrulewidth}{1pt}
 40 | 	\renewcommand{\footrulewidth}{2pt}
 41 | \fi
 42 | 
 43 | \newcommand{\figref}[1]{Figure \ref{fig:#1}}
 44 | \newcommand{\tabref}[1]{Table \ref{tab:#1}}
 45 | \newcommand{\equref}[1]{Equation \ref{equ:#1}}
 46 | \newcommand{\lstref}[1]{Listing \ref{lst:#1}}
 47 | \graphicspath{{jpg/}}
 48 | 
 49 | \begin{document}
 50 | \title{GPGPU Assignment \#3}
 51 | \author{Author: Yu-Sheng Lin \and Instructor: Wei-Chao Chen}
 52 | \maketitle
 53 | 
 54 | \section{Goals}
 55 | 
 56 | Implement Poisson Editing for image cloning on the GPUs.  Go fancy and implement faster algorithms to gain the bonus points.
 57 | 
 58 | \section{Description}
 59 | 
 60 | \subsection{Image Cloning}
 61 | 
 62 | \figref{img_clone} shows input examples for image cloning algorithms, where
 63 | (a) is the background, (b) is the target image, and (c) is a boolean mask.
 64 | You may perform naive image cloning such as \figref{img_clone_example} by 
 65 | copying and pasting the image from the target image
 66 | to the background image according to the binary mask.
 67 | This algorithm is provided in the sample codes.
 68 | 
 69 | Obviously there is room for improvement over this naive algorithm.
 70 | In this assignment, we ask you to implement Poisson Editing, 
 71 | a computationally intensive but effective algorithm that is designed to seamlessly blend the background
 72 | and target images by means of differential domain operations.
 73 | The algorithm is described in more detail later in this document, and 
 74 | we encourage you to read the original paper \textit{Poisson Image Editing} from SIGGRAPH 2003 by P. Perez et al.
 75 | 
 76 | \subsection{Function Signature}
 77 | 
 78 | The function signature for this assignment is \lstref{sig}.
 79 | \verb+background+ and \verb+output+ are
 80 | interlaced RGB images of size $W_b\times H_b$ and
 81 | range from $0$ to $255$.
 82 | \verb+target+ is the same except that its size is $W_t\times H_t$.
 83 | \verb+mask+ is the mask that contains only one color channel and
 84 | we use \verb+0.0f/255.0f+ to represent false/true.
 85 | 
 86 | \begin{listing}[ht]
 87 | \begin{minted}{cpp}
 88 | void PoissonImageCloning(
 89 |     const float *background,
 90 |     const float *target,
 91 |     const float *mask,
 92 |     float *output,
 93 |     const int wb, const int hb, const int wt, const int ht,
 94 |     const int oy, const int ox
 95 | );
 96 | \end{minted}
 97 | \caption{Function signature}\label{lst:sig}
 98 | \end{listing}
 99 | 
100 | We will also assign the offset $O_y, O_x$,
101 | which means the offset of the target image
102 | in the background image (from the top-left corner),
103 | and we will test your program using these two commands.
104 | 
105 | \begin{listing}[ht]
106 | \begin{minted}{bash}
107 | ./a.out img_background.ppm img_target.ppm img_mask.pgm 130 600 output.ppm
108 | ./a.out img_background.ppm img_target.ppm img_mask.pgm 130 900 output.ppm
109 | \end{minted}
110 | \caption{Execute your code}\label{lst:exe}
111 | \end{listing}
112 | 
113 | We use the very popular PGM/PPM image format,
114 | which can be edited by many image processing programs.
115 | You can generate new test cases if you wish.
116 | 
117 | \subsection{Poisson Editing}
118 | 
119 | \begin{figure}
120 | \centering
121 | \subfigure[The background image $W_b\times H_b$.]{\includegraphics[width=0.6\textwidth]{img_background.jpg}}\\
122 | \subfigure[The target image which will be pasted to the background, $W_t\times H_t$.]{\includegraphics[width=0.35\textwidth]{img_target.jpg}}
123 | \subfigure[The mask $W_t\times H_t$.]{\includegraphics[width=0.35\textwidth]{img_mask.jpg}}
124 | \caption{The input images of this assignment.}\label{fig:img_clone}
125 | \end{figure}
126 | 
127 | \begin{figure}
128 | \centering
129 | \includegraphics[width=0.6\textwidth]{output.jpg}
130 | \caption{A very naive and ineffective image cloning output.}\label{fig:img_clone_example}
131 | \end{figure}
132 | 
133 | \begin{figure}
134 | \centering
135 | \subfigure[The values to be solved.]{
136 | \begin{tikzpicture}[scale=1]
137 | \coordinate (C) at (2,2);
138 | \coordinate (N) at (2,4);
139 | \coordinate (W) at (0,2);
140 | \coordinate (S) at (2,0);
141 | \coordinate (E) at (4,2);
142 | \draw (C) -- (W);
143 | \draw (C) -- (N);
144 | \draw (C) -- (W);
145 | \draw (C) -- (S);
146 | \draw (C) -- (E);
147 | \node[circle,draw,fill=white] at (C){$C_b$};
148 | \node[circle,draw,fill=lightgray] at (N){$N_b$};
149 | \node[circle,draw,fill=lightgray] at (W){$W_b$};
150 | \node[circle,draw,fill=white] at (S){$S_b$};
151 | \node[circle,draw,fill=white] at (E){$E_b$};
152 | \end{tikzpicture}
153 | }
154 | \hspace{1.5cm}
155 | \subfigure[The corresponding target image.]{
156 | \begin{tikzpicture}[scale=1]
157 | \coordinate (C) at (2,2);
158 | \coordinate (N) at (2,4);
159 | \coordinate (W) at (0,2);
160 | \coordinate (S) at (2,0);
161 | \coordinate (E) at (4,2);
162 | \draw (C) -- (W);
163 | \draw (C) -- (N);
164 | \draw (C) -- (W);
165 | \draw (C) -- (S);
166 | \draw (C) -- (E);
167 | \node[circle,draw,fill=white] at (C){$C_t$};
168 | \node[circle,draw,fill=white] at (N){$N_t$};
169 | \node[circle,draw,fill=white] at (W){$W_t$};
170 | \node[circle,draw,fill=white] at (S){$S_t$};
171 | \node[circle,draw,fill=white] at (E){$E_t$};
172 | \end{tikzpicture}
173 | }
174 | \caption{The gray nodes are the boundary (the black pixels in the mask).}\label{fig:linear_sys}
175 | \end{figure}
176 | 
177 | If you wish to bypass the mathematical details in the original paper, 
178 | you may proceed by implementing this 4-neighbor linear system in
179 | \equref{linear_sys} (also refer to \figref{linear_sys}).
180 | 
181 | \begin{equation}
182 | 4C_b - (S_b+E_b) = 4C_t-(N_t+W_t+S_t+E_t) + (N_b+W_b).
183 | \label{equ:linear_sys}
184 | \end{equation}
185 | 
186 | With the \textbf{Jacobi Iteration}, the iteration step is in \equref{jacob},
187 | where $C_b'$ is the value of the next step and $S_b, E_b$ are the values of the current step.
188 | You may also notice that $C_b'$ is independent of $C_b$
189 | but only depends on its neighbors.
190 | 
191 | \begin{equation}
192 | C_b' =
193 | \frac{1}{4} \left[
194 | 	\underbrace{4C_t-(N_t+W_t+S_t+E_t) + (N_b+W_b)}_\text{Fixed during iterations}
195 | 	+ \underbrace{(S_b+E_b)}_\text{Current value}
196 | \right]
197 | \label{equ:jacob}
198 | \end{equation}
199 | 
200 | It is your job to generalize and figure out the equation for (1) when the locations of gray points change, and (2) when the point has fewer than four neighbors.
201 | 
202 | \subsection{Acceleration}
203 | 
204 | This part is counted as bonus.
205 | To qualify for the bonus, you would also need to write a short report about your speed up and implementation.
206 | You may, for example, compare convergence against the number of iterations or execution time.
207 | We describe a few possible speed-up mechanisms, and our conjectures about how much you may gain from implementing these suggestions.
208 | 
209 | First, you may observe that
210 | the time for a value to propagate from the left to the right of the image
211 | is proportional to the width of the image.
212 | Therefore it may require the square of image sizes to actually converge to a proper solution.
213 | A naive implementation would require thousands of iterations to converge,
214 | which is very impractical.
215 | \figref{convergence} shows results with TA's baseline implementation.
216 | As you can see, it takes 20000 iterations to converge.
217 | 
218 | A cheaper solution is to use a hierarchical method.
219 | Start by solving the problem at a lower resolution, upsample, and then solve it at a higher resolution.
220 | You could do this at $1/8$x, $1/4$x, $1/2$x and $1$x scales with the nearest-neighbor upsampling algorithm, for example.
221 | Note that the number of iterations after each scale promotion would be less than solving the complete problem, because your lower-resolution solutions would look reasonably similar to the higher resolution ones already.
222 | 
223 | \begin{figure}
224 | \centering
225 | \subfigure[2 iterations]{\includegraphics[width=0.45\textwidth]{2.jpg}}
226 | \subfigure[20 iterations]{\includegraphics[width=0.45\textwidth]{20.jpg}}\\
227 | \subfigure[200 iterations]{\includegraphics[width=0.45\textwidth]{200.jpg}}
228 | \subfigure[2000 iterations]{\includegraphics[width=0.45\textwidth]{2000.jpg}}\\
229 | \subfigure[6000 iterations]{\includegraphics[width=0.45\textwidth]{6000.jpg}}
230 | \subfigure[20000 iterations]{\includegraphics[width=0.45\textwidth]{20000.jpg}}
231 | \caption{The convergence with Jacobi Iteration.}\label{fig:convergence}
232 | \end{figure}
233 | 
234 | You may also try the \textbf{successive over-relaxation method (SOR)},
235 | which changes the iteration steps to \equref{sor},
236 | 
237 | \begin{equation}
238 | C_{b,SOR}' = \omega C_b + (1-\omega) C_{b,SOR}.
239 | \label{equ:sor}
240 | \end{equation}
241 | 
242 | SOR is just an interpolation/extrapolation between
243 | current values and the values of the next iteration.
244 | Sadly, since the linear system is not \textbf{diagonally dominant},
245 | the SOR Jacobi iteration diverges.
246 | (In fact, even na\"ive Jacobi iteration is not guaranteed to converge.)
247 | However, some students of 2017 accidentally found that the following
248 | alternative formula works perfectly in this assignment.
249 | 
250 | \begin{equation}
251 | C_{b,SOR}'' = \omega C_b' + (1-\omega) C_{b,SOR}.
252 | \label{equ:sor_modify}
253 | \end{equation}
254 | 
255 | The new formula extrapolates against the $C_b$ two steps before,
256 | instead of one, and this can still be implemented without a third buffer.
257 | We have confirmed that $\omega = 1.9$ works fine.
258 | 
259 | \section{Grading}
260 | \begin{enumerate}
261 | \item 100 points, when you finish the baseline implementation (Jacobi, no acceleration).
262 | \item Up to 20 bonus points for SOR Jacobi.
263 | \item Up to 40 bonus points for hierarchical Jacobi.
264 | \item Up to 50 bonus points if you can tell why Equation \ref{equ:sor_modify} works magically.
265 | \item Up to 50 extra bonus points for any other speed-up implementation.
266 | \end{enumerate}
267 | 
268 | \section{Submission}
269 | \begin{itemize}
270 | 	\item The submission deadline is 2018/05/31 23:59 (Fri.).
271 | \item Please submit the result in lossless PNG format \verb+lab3/results/***.png+.
272 | \item Submit your source code \verb+lab3/lab3.cu+. Again, you should only modify this file in the homework.
273 | \item We will test your code with the command listed in \lstref{exe}.
274 | \item If you implement hierarchical, SOR, or any other speed-up algorithm, you need to submit a report \verb+lab3/report.pdf+ to be considered for the bonus.
275 | \end{itemize}
276 | 
277 | \section{Hint}
278 | \lstref{hint} contains part of TA's baseline code. Feel free to use it.
279 | 
280 | \pagebreak
281 | 
282 | \begin{listing}
283 | \begin{minted}{cpp}
284 | void PoissonImageCloning(
285 |    const float *background,
286 |    const float *target,
287 |    const float *mask,
288 |    float *output,
289 |    const int wb, const int hb, const int wt, const int ht,
290 |    const int oy, const int ox
291 | ) {
292 |    // set up
293 |    float *fixed, *buf1, *buf2;
294 |    cudaMalloc(&fixed, 3*wt*ht*sizeof(float));
295 |    cudaMalloc(&buf1, 3*wt*ht*sizeof(float));
296 |    cudaMalloc(&buf2, 3*wt*ht*sizeof(float));
297 | 
298 |    // initialize the iteration
299 |    dim3 gdim(CeilDiv(wt,32), CeilDiv(ht,16)), bdim(32,16);
300 |    CalculateFixed<<<gdim, bdim>>>(
301 |       background, target, mask, fixed,
302 |       wb, hb, wt, ht, oy, ox
303 |    );
304 |    cudaMemcpy(buf1, target, sizeof(float)*3*wt*ht, cudaMemcpyDeviceToDevice);
305 | 
306 |    // iterate
307 |    for (int i = 0; i < 10000; ++i) {
308 |       PoissonImageCloningIteration<<<gdim, bdim>>>(
309 |          fixed, mask, buf1, buf2, wt, ht
310 |       );
311 |       PoissonImageCloningIteration<<<gdim, bdim>>>(
312 |          fixed, mask, buf2, buf1, wt, ht
313 |       );
314 |    }
315 | 
316 |    // copy the image back
317 |    cudaMemcpy(output, background, wb*hb*sizeof(float)*3, cudaMemcpyDeviceToDevice);
318 |    SimpleClone<<<gdim, bdim>>>(
319 |       background, buf1, mask, output,
320 |       wb, hb, wt, ht, oy, ox
321 |    );
322 | 
323 |    // clean up
324 |    cudaFree(fixed);
325 |    cudaFree(buf1);
326 |    cudaFree(buf2);
327 | }
328 | \end{minted}
329 | \caption{Hint}\label{lst:hint}
330 | \end{listing}
331 | 
332 | \end{document}
333 | 


--------------------------------------------------------------------------------
/lab3/lab3_test.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnjohnlin/GPGPU_Programming_2016S/40c8f1ddd275cd6908271e847390f5078f6ce0c8/lab3/lab3_test.zip


--------------------------------------------------------------------------------
/lab3/main.cu:
--------------------------------------------------------------------------------
 1 | #include <cstdio>
 2 | #include <cstdint>
 3 | #include <cstdlib>
 4 | #include <algorithm>
 5 | #include "SyncedMemory.h"
 6 | #include "pgm.h"
 7 | #include "lab3.h"
 8 | using namespace std;
 9 | 
10 | #define CHECK { /* waits via cudaDeviceSynchronize(); on a non-success result prints file:line and the error string, then aborts */ \
11 | 	auto e = cudaDeviceSynchronize();\
12 | 	if (e != cudaSuccess) {\
13 | 		printf("At " __FILE__ ":%d, %s\n", __LINE__, cudaGetErrorString(e));\
14 | 		abort();\
15 | 	}\
16 | }
17 | 
18 | int main(int argc, char **argv)
19 | { // Loads background/target/mask, runs PoissonImageCloning on the GPU and writes the result as a PPM.
20 | 	if (argc != 7) {
21 | 		printf("Usage: %s <background> <target> <mask> <offset y> <offset x> <output>\n", argv[0]);
22 | 		abort();
23 | 	}
24 | 	bool sucb, suct, sucm;
25 | 	int wb, hb, cb, wt, ht, ct, wm, hm, cm; // width, height, channel count of background / target / mask
26 | 	auto imgb = ReadNetpbm(wb, hb, cb, sucb, argv[1]);
27 | 	auto imgt = ReadNetpbm(wt, ht, ct, suct, argv[2]);
28 | 	auto imgm = ReadNetpbm(wm, hm, cm, sucm, argv[3]);
29 | 	if (not (sucb and suct and sucm)) {
30 | 		puts("Something wrong with reading the input image files.");
31 | 		abort();
32 | 	}
33 | 	if (wt != wm or ht != hm) {
34 | 		puts("The mask and target image must have the same size.");
35 | 		abort();
36 | 	}
37 | 	if (cm != 1) {
38 | 		puts("The mask image must be mono-colored.");
39 | 		abort();
40 | 	}
41 | 	if (cb != 3 or ct != 3) {
42 | 		puts("The background and target image must be colored.");
43 | 		abort();
44 | 	}
45 | 	const int oy = atoi(argv[4]), ox = atoi(argv[5]); // y offset comes first; the usage text above used to list x first
46 | 
47 | 	const int SIZEB = wb*hb*3;
48 | 	const int SIZET = wt*ht*3;
49 | 	const int SIZEM = wm*hm;
50 | 	MemoryBuffer<float> background(SIZEB), target(SIZET), mask(SIZEM), output(SIZEB);
51 | 	auto background_s = background.CreateSync(SIZEB);
52 | 	auto target_s = target.CreateSync(SIZET);
53 | 	auto mask_s = mask.CreateSync(SIZEM);
54 | 	auto output_s = output.CreateSync(SIZEB);
55 | 
56 | 	float *background_cpu = background_s.get_cpu_wo();
57 | 	float *target_cpu = target_s.get_cpu_wo();
58 | 	float *mask_cpu = mask_s.get_cpu_wo();
59 | 	copy(imgb.get(), imgb.get()+SIZEB, background_cpu); // widens each uint8_t sample to float
60 | 	copy(imgt.get(), imgt.get()+SIZET, target_cpu);
61 | 	copy(imgm.get(), imgm.get()+SIZEM, mask_cpu);
62 | 
63 | 	PoissonImageCloning(
64 | 		background_s.get_gpu_ro(), // get_gpu_ro() uploads the host data written above
65 | 		target_s.get_gpu_ro(),
66 | 		mask_s.get_gpu_ro(),
67 | 		output_s.get_gpu_wo(),
68 | 		wb, hb, wt, ht, oy, ox
69 | 	);
70 | 
71 | 	unique_ptr<uint8_t[]> o(new uint8_t[SIZEB]);
72 | 	const float *o_cpu = output_s.get_cpu_ro(); // downloads the result from the device
73 | 	transform(o_cpu, o_cpu+SIZEB, o.get(), [](float f) -> uint8_t { return max(min(int(f+0.5f), 255), 0); }); // round, clamp to [0,255]
74 | 	WritePPM(o.get(), wb, hb, argv[6]);
75 | 	return 0;
76 | }
77 | 


--------------------------------------------------------------------------------
/lab3/pgm.cpp:
--------------------------------------------------------------------------------
 1 | #include "pgm.h"
 2 | 
 3 | inline int peekc(FILE *fp)
 4 | {
 5 | 	const int next = getc(fp); // read the next character (or EOF) ...
 6 | 	ungetc(next, fp);          // ... and push it straight back onto the stream
 7 | 	return next;
 8 | }
 9 | 
10 | unique_ptr<uint8_t[]> ReadNetpbm(int &width, int &height, int &num_channel, bool &success, const char *filename)
11 | {
12 | 	// Reads a binary PGM ("P5", 1 channel) or PPM ("P6", 3 channels) file whose maxval is 255.
13 | 	// On success sets `success` to true and returns width*height*num_channel bytes.
14 | 	// On any failure `success` is false and the returned pointer must not be used.
15 | 	success = false;
16 | 	unique_ptr<uint8_t[]> pixels;
17 | 	// RAII handle so the file is closed on every return path (it used to be leaked).
18 | 	unique_ptr<FILE, int(*)(FILE*)> fp(fopen(filename, "rb"), &fclose);
19 | 	if (not fp) { return pixels; }
20 | 
21 | 	char magic[3];
22 | 	if (fread(magic, 1, 3, fp.get()) != 3) { return pixels; } // short file: magic[] would be read uninitialized
23 | 	if (magic[0] != 'P' or magic[2] != '\n' or (magic[1] != '5' and magic[1] != '6')) { return pixels; }
24 | 	num_channel = magic[1] == '5' ? 1 : 3;
25 | 
26 | 	// Skip '#' comment lines; stop at EOF so an unterminated comment cannot loop forever.
27 | 	while (peekc(fp.get()) == '#') {
28 | 		for (int c = getc(fp.get()); c != '\n' and c != EOF; c = getc(fp.get())) {}
29 | 	}
30 | 
31 | 	int norm;
32 | 	// No trailing whitespace in the format: it would also swallow leading pixel bytes that happen
33 | 	// to be whitespace (9-13, 32). Exactly one separator byte after maxval is consumed below.
34 | 	if (fscanf(fp.get(), "%d %d %d", &width, &height, &norm) != 3 or norm != 255) { return pixels; }
35 | 	if (getc(fp.get()) == EOF or width <= 0 or height <= 0) { return pixels; }
36 | 
37 | 	const size_t nbytes = size_t(width)*height*num_channel;
38 | 	pixels.reset(new uint8_t[nbytes]);
39 | 	if (fread(pixels.get(), 1, nbytes, fp.get()) != nbytes) { return pixels; }
40 | 	success = true;
41 | 	return pixels;
42 | }
43 | 
44 | void WriteNetpbm(uint8_t* i, const int width, const int height, const int num_channel, const char *filename, const char *magic)
45 | { // Writes the text header (magic, "width height", 255) followed by width*height*num_channel raw bytes from `i`.
46 | 	FILE *fp = fopen(filename, "wb");
47 | 	if (not fp) { return; } // cannot open the file: write nothing instead of passing a null FILE* to fprintf
48 | 	fprintf(fp, "%s\n%d %d\n255\n", magic, width, height);
49 | 	fwrite(i, 1, size_t(width)*height*num_channel, fp);
50 | 	fclose(fp); // flush and release the handle; it was never closed before
51 | }
52 | 
53 | void WritePGM(uint8_t* i, const int width, const int height, const char *filename)
54 | {
55 | 	WriteNetpbm(i, width, height, 1, filename, "P5"); // one channel per pixel, "P5" magic
56 | }
57 | 
58 | void WritePPM(uint8_t* i, const int width, const int height, const char *filename)
59 | {
60 | 	WriteNetpbm(i, width, height, 3, filename, "P6"); // three channels per pixel, "P6" magic
61 | }
62 | 


--------------------------------------------------------------------------------
/lab3/pgm.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <memory>
 3 | #include <cstdint>
 4 | using std::unique_ptr;
 5 | 
 6 | unique_ptr<uint8_t[]> ReadNetpbm(int &width, int &height, int &num_channel, bool &success, const char *filename); // reads a "P5"/"P6" file; check `success` before using the result
 7 | void WriteNetpbm(uint8_t* i, const int width, const int height, const int num_channel, const char *filename, const char *magic); // writes a header with `magic`, then width*height*num_channel bytes from `i`
 8 | void WritePGM(uint8_t* i, const int width, const int height, const char *filename); // WriteNetpbm with 1 channel and magic "P5"
 9 | void WritePPM(uint8_t* i, const int width, const int height, const char *filename); // WriteNetpbm with 3 channels and magic "P6"
10 | 
11 | 


--------------------------------------------------------------------------------
/utils/SyncedMemory.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <cstddef>
  3 | #include <cassert>
  4 | 
  5 | template <typename T>
  6 | class SyncedMemory {
  7 | 	// Non-owning pair of host/device arrays of n_ elements plus a flag recording which side is newer.
  8 | 	// The get_* accessors below call cudaMemcpy only when the requested side is stale.
  9 | 	enum MemoryState { GPU_IS_NEW, CPU_IS_NEW, SYNCED } state_ = SYNCED;
 10 | 	T *cpu_, *gpu_;
 11 | 	size_t n_; // element count, not bytes
 12 | 
 13 | 	// Copies the newer side over the stale one; does nothing when already SYNCED.
 14 | 	void Sync()
 15 | 	{
 16 | 		switch (state_) {
 17 | 			case CPU_IS_NEW:
 18 | 				cudaMemcpy(gpu_, cpu_, sizeof(T)*n_, cudaMemcpyHostToDevice);
 19 | 				break;
 20 | 			case GPU_IS_NEW:
 21 | 				cudaMemcpy(cpu_, gpu_, sizeof(T)*n_, cudaMemcpyDeviceToHost);
 22 | 				break;
 23 | 			case SYNCED:
 24 | 				break;
 25 | 		}
 26 | 		state_ = SYNCED;
 27 | 	}
 28 | 
 29 | 	// Takes over src's pointers, size and sync state; src is left empty.
 30 | 	void MoveFrom(SyncedMemory &src) noexcept
 31 | 	{
 32 | 		cpu_ = src.cpu_; gpu_ = src.gpu_; n_ = src.n_; state_ = src.state_;
 33 | 		src.cpu_ = src.gpu_ = nullptr; src.n_ = 0; src.state_ = SYNCED;
 34 | 	}
 35 | public:
 36 | 	// Rebinds the view to other arrays; the sync state flag is left as it was.
 37 | 	void Reset(T* cpu, T *gpu, const size_t n) { cpu_ = cpu; gpu_ = gpu; n_ = n; }
 38 | 	SyncedMemory() { Reset(nullptr, nullptr, 0); }
 39 | 	SyncedMemory(T* cpu, T *gpu, const size_t n) { Reset(cpu, gpu, n); }
 40 | 	SyncedMemory& operator=(const SyncedMemory&) = delete;
 41 | 	SyncedMemory(const SyncedMemory&) = delete;
 42 | 	// Moves carry n_ and state_ as well: the move constructor used to leave n_ uninitialized
 43 | 	// (so a later Sync() copied an indeterminate byte count) and both moves dropped the state.
 44 | 	SyncedMemory& operator=(SyncedMemory &&mem) noexcept { if (this != &mem) { MoveFrom(mem); } return *this; }
 45 | 	SyncedMemory(SyncedMemory &&mem) noexcept { MoveFrom(mem); }
 46 | 
 47 | 	// *_wo: the caller will overwrite that side, so it is only marked as newest (no copy).
 48 | 	// *_ro: that side is brought up to date first if the other side is newer.
 49 | 	// *_rw: brought up to date, then marked as newest.
 50 | 	T *get_cpu_wo() { state_ = CPU_IS_NEW; return cpu_; }
 51 | 	T *get_gpu_wo() { state_ = GPU_IS_NEW; return gpu_; }
 52 | 	const T* get_cpu_ro() { if (state_ != CPU_IS_NEW) { Sync(); } return cpu_; }
 53 | 	const T* get_gpu_ro() { if (state_ != GPU_IS_NEW) { Sync(); } return gpu_; }
 54 | 	T *get_cpu_rw() { get_cpu_ro(); return get_cpu_wo(); }
 55 | 	T *get_gpu_rw() { get_gpu_ro(); return get_gpu_wo(); }
 56 | };
 57 | 
 58 | template <typename T>
 59 | class MemoryBuffer {
 60 | 	// Owns one host array (new[]) and one device array (cudaMalloc) with the same element count.
 61 | 	// Both pointers are null while the buffer is empty.
 62 | 	T *cpu_ = nullptr, *gpu_ = nullptr;
 63 | 
 64 | 	// Allocates n elements on both sides; n == 0 leaves the buffer empty.
 65 | 	void Alloc(const size_t n)
 66 | 	{
 67 | 		if (n == 0) { return; }
 68 | 		cpu_ = new T[n];
 69 | 		cudaMalloc(&gpu_, sizeof(T)*n);
 70 | 	}
 71 | public:
 72 | 	// Releases both arrays and resets the pointers to null.
 73 | 	// The assert documents the invariant that both sides are allocated or both are empty.
 74 | 	void Free()
 75 | 	{
 76 | 		assert((gpu_ == nullptr) == (cpu_ == nullptr));
 77 | 		delete[] cpu_;
 78 | 		cudaFree(gpu_);
 79 | 		cpu_ = gpu_ = nullptr;
 80 | 	}
 81 | 
 82 | 	// Discards the current contents and allocates n fresh elements.
 83 | 	void Realloc(const size_t n) { Free(); Alloc(n); }
 84 | 
 85 | 	// Allocates n elements on each side (none by default); the destructor releases them.
 86 | 	MemoryBuffer(const size_t n = 0) { Alloc(n); }
 87 | 	~MemoryBuffer() { Free(); }
 88 | 
 89 | 	// Copying is disabled; moving transfers ownership and leaves the source empty.
 90 | 	MemoryBuffer& operator=(const MemoryBuffer&) = delete;
 91 | 	MemoryBuffer(const MemoryBuffer&) = delete;
 92 | 	MemoryBuffer& operator=(MemoryBuffer &&buf)
 93 | 	{
 94 | 		if (this != &buf) {
 95 | 			Free(); // release what we own first; overwriting the pointers used to leak both arrays
 96 | 			this->cpu_ = buf.cpu_;
 97 | 			this->gpu_ = buf.gpu_;
 98 | 			buf.cpu_ = buf.gpu_ = nullptr;
 99 | 		}
100 | 		return *this;
101 | 	}
102 | 	MemoryBuffer(MemoryBuffer &&buf)
103 | 	{
104 | 		this->cpu_ = buf.cpu_;
105 | 		this->gpu_ = buf.gpu_;
106 | 		buf.cpu_ = buf.gpu_ = nullptr;
107 | 	}
108 | 
109 | 	// Returns a non-owning SyncedMemory view of n elements starting at element `offset`.
110 | 	// No bounds check is made here, and this buffer keeps ownership of the arrays.
111 | 	SyncedMemory<T> CreateSync(const size_t n, const size_t offset = 0) {
112 | 		return SyncedMemory<T>(cpu_+offset, gpu_+offset, n);
113 | 	}
114 | };
115 | 


--------------------------------------------------------------------------------
/utils/Timer.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <chrono>
 3 | 
 4 | class Timer {
 5 | 	typedef std::chrono::time_point<std::chrono::high_resolution_clock> Clock;
 6 | 	long long count;
 7 | 	bool running;
 8 | 	Clock prev_start_;
 9 | 	Clock Now() {
10 | 		return std::chrono::high_resolution_clock::now();
11 | 	}
12 | public:
13 | 	void Start() {
14 | 		running = true;
15 | 		prev_start_ = Now();
16 | 	}
17 | 	void Pause() {
18 | 		if (running) {
19 | 			running = false;
20 | 			auto diff = Now() - prev_start_;
21 | 			count += std::chrono::duration_cast<std::chrono::microseconds>(diff).count();
22 | 		}
23 | 	}
24 | 	void Reset() {
25 | 		running = false;
26 | 		count = 0;
27 | 	}
28 | 	long long get_count() {
29 | 		return count;
30 | 	}
31 | 	Timer() {Reset();}
32 | };
33 | 
// Prints "Timer <expression text>: <count>us" for the given timer expression.
// The argument is parenthesized so that expressions such as `*ptr` bind
// correctly before `.get_count()` is applied. The expansion keeps its trailing
// semicolon, so call sites written with or without their own `;` both work.
#define printf_timer(timer) printf("Timer " #timer ": %lldus\n", (timer).get_count());
35 | 


--------------------------------------------------------------------------------