├── image
    ├── GPU-CPU.jpg
    ├── block-index.png
    ├── block-thread.jpg
    ├── thread-index.png
    ├── cuda-c-programming.png
    └── cuda-memory-structure.jpg
├── src
    ├── chapter01
    │   ├── hello
    │   ├── hello.cu
    │   └── README.md
    ├── chapter02
    │   ├── sumArraysOnGPU
    │   ├── sumArraysOnGPU1
    │   ├── sumArraysOnHost
    │   ├── sumMatrixOnGPU-2D-grid-2D-block
    │   ├── sumArraysOnHost.c
    │   ├── sumArraysOnGPU1.cu
    │   ├── sumArraysOnGPU.cu
    │   ├── sumMatrixOnGPU-2D-grid-2D-block.cu
    │   └── README.md
    ├── chapter03
    │   └── README.md
    └── common.h
├── .gitignore
├── LICENSE
└── README.md


/image/GPU-CPU.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HolyChen/cuda-tutorial/HEAD/image/GPU-CPU.jpg


--------------------------------------------------------------------------------
/src/chapter01/hello:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HolyChen/cuda-tutorial/HEAD/src/chapter01/hello


--------------------------------------------------------------------------------
/image/block-index.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HolyChen/cuda-tutorial/HEAD/image/block-index.png


--------------------------------------------------------------------------------
/image/block-thread.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HolyChen/cuda-tutorial/HEAD/image/block-thread.jpg


--------------------------------------------------------------------------------
/image/thread-index.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HolyChen/cuda-tutorial/HEAD/image/thread-index.png


--------------------------------------------------------------------------------
/image/cuda-c-programming.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HolyChen/cuda-tutorial/HEAD/image/cuda-c-programming.png


--------------------------------------------------------------------------------
/src/chapter02/sumArraysOnGPU:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HolyChen/cuda-tutorial/HEAD/src/chapter02/sumArraysOnGPU


--------------------------------------------------------------------------------
/src/chapter02/sumArraysOnGPU1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HolyChen/cuda-tutorial/HEAD/src/chapter02/sumArraysOnGPU1


--------------------------------------------------------------------------------
/src/chapter02/sumArraysOnHost:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HolyChen/cuda-tutorial/HEAD/src/chapter02/sumArraysOnHost


--------------------------------------------------------------------------------
/image/cuda-memory-structure.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HolyChen/cuda-tutorial/HEAD/image/cuda-memory-structure.jpg


--------------------------------------------------------------------------------
/src/chapter02/sumMatrixOnGPU-2D-grid-2D-block:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HolyChen/cuda-tutorial/HEAD/src/chapter02/sumMatrixOnGPU-2D-grid-2D-block


--------------------------------------------------------------------------------
/src/chapter03/README.md:
--------------------------------------------------------------------------------
 1 | # 第 3 章 CUDA 执行模型
 2 | 
 3 | 
 4 | 目录
 5 | * [3.1. 线程束的概念](#31-线程束的概念)
 6 | 
 7 | [TOC]
 8 | 
 9 | 启动内核时，从软件的角度我们会觉得所有的线程似乎都是同时在并行运行的。但是从硬件执行的角度来看，不是所有的线程在物理上都可以同时运行，这是因为计算资源有限，特别是寄存器和共享内存更是稀缺资源。因此，线程块里的不同线程可能会以不同的速度前进。
10 | 


--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
 1 | #include <sys/time.h>
 2 | // #include <stdio.h>
 3 | 
 4 | #ifndef _COMMON_H
 5 | #define _COMMON_H
 6 | 
 /**
  * Returns the current wall-clock time in seconds as a double, combining the
  * whole seconds and the microseconds reported by gettimeofday().
  *
  * Subtract two readings to obtain an elapsed interval in seconds.
  */
 inline double cpuSecond()
 {
     struct timeval tp;
     /* The timezone argument of gettimeofday() is obsolete, so a null pointer
        is passed. The status code is deliberately ignored. */
     (void)gettimeofday(&tp, 0);
     return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
 }
14 | 
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/src/chapter01/hello.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | // __global__ 告诉编译器这个函数将会从cpu中调用，然后在gpu上运行
 4 | __global__ void helloFromGPU (void) 
 5 | {
 6 |     printf("Hello World from GPU!\n");
 7 | }
 8 | 
 9 | int main(void)
10 | {
11 |     // hello from cpu
12 |     printf("Hello World from CPU!\n");
13 | 
14 |     helloFromGPU <<<1, 10>>>();
15 |     // <<<1, 10>> 表示从主线程到设备端代码调用10个线程
16 |     cudaDeviceReset();
17 |     return 0;
18 | }
19 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Build and Release Folders
 2 | bin-debug/
 3 | bin-release/
 4 | [Oo]bj/
 5 | [Bb]in/
 6 | 
 7 | # Other files and folders
 8 | .settings/
 9 | 
10 | # Executables
11 | *.swf
12 | *.air
13 | *.ipa
14 | *.apk
15 | 
16 | # Project files, i.e. `.project`, `.actionScriptProperties` and `.flexProperties`
17 | # should NOT be excluded as they contain compiler settings and other important
18 | # information for Eclipse / Flash Builder.
19 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 YangYun
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/chapter02/sumArraysOnHost.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <string.h>
 3 | #include <stdio.h>
 4 | #include <time.h>
 5 | 
 /*
  * Element-wise addition on the host: writes A[i] + B[i] into C[i] for the
  * first N elements. Nothing is written when N is zero or negative.
  */
 void sumArraysOnHost(float *A, float *B, float *C, const int N){
     const float *lhs = A;
     const float *rhs = B;
     float *out = C;

     for (int remaining = N; remaining > 0; --remaining){
         *out++ = *lhs++ + *rhs++;
     }
 }
11 | 
12 | 
13 | void initialData(float *ip, int size){
14 |     // generate different seed for random number
15 |     time_t t;
16 |     srand((unsigned int) time(&t));
17 | 
18 |     for (int i=0; i<size; i++){
19 |         ip[i] = (float)(rand() & 0xFF)/10.0f;
20 |     }
21 | }
22 | 
/*
 * Writes the first N elements of array to stdout on one line, each value
 * preceded by a space and formatted with %f, followed by a newline.
 */
void print(float *array, const int N){
    const float *cursor = array;
    for (int left = N; left > 0; --left){
        printf(" %f", *cursor);
        ++cursor;
    }
    printf("\n");
}
29 | 
/*
 * Host-only demo: allocates three small float arrays, fills two of them with
 * random data, adds them element-wise into the third and prints all three.
 *
 * Returns 0 on success, 1 if a host allocation fails.
 */
int main(){
    int nElem = 4;
    size_t nBytes = nElem * sizeof(float);

    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *h_C = (float *)malloc(nBytes);
    /* Bail out before any array is touched if an allocation failed;
       free(NULL) is a no-op, so releasing all three is safe. */
    if (h_A == NULL || h_B == NULL || h_C == NULL){
        fprintf(stderr, "failed to allocate host arrays\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return 1;
    }

    initialData(h_A, nElem);
    initialData(h_B, nElem);
    print(h_A, nElem);
    print(h_B, nElem);

    sumArraysOnHost(h_A, h_B, h_C, nElem);
    print(h_C, nElem);

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}
51 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | |Author|YunYang1994|
 3 | |---|---
 4 | |E-mail|dreameryangyun@sjtu.edu.cn
 5 | 
 6 | [<img src="image/cuda-c-programming.png" alt="logo" height="300" align="right" />](https://book.douban.com/subject/27108836/)
 7 | 
 8 | # 《CUDA C 编程权威指南》
 9 | 
10 | ![](https://img.shields.io/badge/version-v2-green.svg)
11 | [![](https://img.shields.io/badge/language-%E7%AE%80%E4%BD%93%E4%B8%AD%E6%96%87-red.svg)](./README.md)
12 | [![](https://img.shields.io/badge/chat-%E4%BA%A4%E6%B5%81-667ed5.svg)](./assets/community.md) 
13 | 
14 | > 正在学习和使用，敬请期待。
15 | 
16 | ## 内容简介
17 | 
18 | CUDA (Compute Unified Device Architecture, 统一计算设备架构) 是NVIDIA提出的并行计算架构， 结合了CPU和GPU的优点，主要用来处理密集型及并行计算。CPU和GPU是两个独立的处理器，通过单个计算节点的 PCI-Express总线相连。**GPU 用来提高计算密集型应用程序中并行程序段的执行速度， CPU则负责管理设备端的资源。** CUDA编程的独特优势在于开放的架构特性可以使得程序员在功能强大的硬件平台上充分挖掘其并行性，既满足了计算密集型的程序的需要，又实现了程序的易读性及便捷性。
19 | ### CUDA编程平台
20 | CUDA是一种通用的并行计算平台和编程模型，是在C语言上扩展的。借助于CUDA，你可以像编写C语言程序一样实现并行算法。你可以在NVIDIA的GPU平台上用CUDA为多种系统编写应用程序，范围从嵌入式设备、平板电脑、笔记本电脑、台式机工作站到HPC集群。在CUDA编程平台中，GPU并不是一个独立运行的计算平台，而需要与CPU协同工作，可以看成是CPU的协处理器，因此当我们在说GPU并行计算时，其实是指的基于CPU+GPU的异构计算架构。在异构计算架构中，GPU与CPU通过PCIe总线连接在一起来协同工作，CPU所在位置称为主机端（host），而GPU所在位置称为设备端（device），如下图所示。
21 | 
22 | <div align=center><img src="image/GPU-CPU.jpg" alt="logo" height="200"></div>
23 | 
24 | ### CUDA内存结构
25 | 
26 | <img src="image/cuda-memory-structure.jpg" alt="logo" height="400" align="right" />
27 | 
28 | 在CUDA的并行运行环境中，涉及到**线程块**和**线程**的概念。**每个线程有自己的私有本地内存（Local Memory），而每个线程块包含共享内存（Shared Memory）**，可以被线程块中所有线程共享，其生命周期与线程块一致。而且，所有的线程都可以访问全局内存（Global Memory），还可以访问一些只读内存块：常量内存（Constant Memory）和纹理内存（Texture Memory）。此外，**我们可以通过寄存器（register）或常量内存来进行参数的传递**。如果使用寄存器来传参，那么每个线程用一个寄存器来传递一个参数。
29 | 
30 | 
31 | ## 目录结构
32 | - [**第 1 章 基于CUDA的异构并行计算**](./src/chapter01/README.md)
33 | - [**第 2 章 CUDA 编程模型**](./src/chapter02/README.md)
34 | - [**第 3 章 CUDA 执行模型**](./src/chapter03/README.md)
35 | 


--------------------------------------------------------------------------------
/src/chapter02/sumArraysOnGPU1.cu:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <string.h>
 3 | #include <stdio.h>
 4 | #include <time.h>
 5 | 
 6 | __global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N){
 7 | 
 8 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 9 |     if (idx < N) C[idx] = A[idx] + B[idx]; // 检查是否越界
10 |     printf("%f + %f = %f Caculated On GPU: block %d thread %d\n", 
11 |             A[idx], B[idx], C[idx], blockIdx.x, threadIdx.x);
12 | }
13 | 
/*
 * Fills ip[0..size-1] with pseudo-random values of the form k/10 where k is
 * in 0..255, i.e. values between 0.0 and 25.5.
 *
 * The generator is seeded only on the first call. Re-seeding with time() on
 * every call restarts the same sequence whenever two calls fall within the
 * same second, which made vectors A and B come out identical.
 */
void initialData(float *ip, int size){
    static int seeded = 0;
    if (!seeded){
        srand((unsigned int) time(NULL));
        seeded = 1;
    }

    for (int i=0; i<size; i++){
        ip[i] = (float)(rand() & 0xFF)/10.0f;
    }
}
23 | 
/*
 * Writes the first N elements of array to stdout on one line, each value
 * preceded by a space and formatted with %f, followed by a newline.
 */
void print(float *array, const int N){
    const float *cursor = array;
    for (int left = N; left > 0; --left){
        printf(" %f", *cursor);
        ++cursor;
    }
    printf("\n");
}
30 | 
31 | int main(){
32 |     int nElem = 6;
33 |     size_t nBytes = nElem * sizeof(float);
34 |     float *h_A, *h_B;
35 | 
36 |     h_A = (float *)malloc(nBytes);
37 |     h_B = (float *)malloc(nBytes);
38 | 
39 |     initialData(h_A, nElem);
40 |     initialData(h_B, nElem);
41 |     
42 |     printf("向量 A:");
43 |     print(h_A, nElem);
44 |     printf("向量 B:");
45 |     print(h_B, nElem);
46 | 
47 |     float *d_A, *d_B, *d_C;
48 |     cudaMalloc((float**)&d_A, nBytes);
49 |     cudaMalloc((float**)&d_B, nBytes);
50 |     cudaMalloc((float**)&d_C, nBytes);
51 |     
52 |     cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
53 |     cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
54 |    
55 |     printf("向量 C 的每个元素计算过程:\n");
56 |     dim3 block(2);
57 |     dim3 thread(3);
58 |     sumArraysOnGPU <<< block, thread >>>(d_A, d_B, d_C, nElem); // 异步计算
59 | 
60 |     free(h_A);
61 |     free(h_B);
62 | 
63 |     cudaFree(d_A);
64 |     cudaFree(d_B);
65 |     cudaFree(d_C);
66 |     return 0;
67 | }
68 | 


--------------------------------------------------------------------------------
/src/chapter02/sumArraysOnGPU.cu:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <string.h>
 3 | #include <stdio.h>
 4 | #include <time.h>
 5 | 
 6 | __global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N){
 7 | 
 8 |     printf("Caculating On GPU\n");
 9 |     for (int idx=0; idx<N; idx++){
10 |         C[idx] = A[idx] + B[idx];
11 |     }
12 | }
13 | 
14 | 
/*
 * Fills ip[0..size-1] with pseudo-random values of the form k/10 where k is
 * in 0..255, i.e. values between 0.0 and 25.5.
 *
 * The generator is seeded only on the first call. Re-seeding with time() on
 * every call restarts the same sequence whenever two calls fall within the
 * same second, which made both input arrays come out identical.
 */
void initialData(float *ip, int size){
    static int seeded = 0;
    if (!seeded){
        srand((unsigned int) time(NULL));
        seeded = 1;
    }

    for (int i=0; i<size; i++){
        ip[i] = (float)(rand() & 0xFF)/10.0f;
    }
}
24 | 
25 | 
/*
 * Writes the first N elements of array to stdout on one line, each value
 * preceded by a space and formatted with %f, followed by a newline.
 */
void print(float *array, const int N){
    const float *cursor = array;
    for (int left = N; left > 0; --left){
        printf(" %f", *cursor);
        ++cursor;
    }
    printf("\n");
}
32 | 
33 | int main(){
34 |     int nElem = 4;
35 |     size_t nBytes = nElem * sizeof(float);
36 |     float *h_A, *h_B, *h_C;
37 | 
38 |     printf("malloc memory on Host\n");
39 |     h_A = (float *)malloc(nBytes);
40 |     h_B = (float *)malloc(nBytes);
41 |     h_C = (float *)malloc(nBytes);
42 | 
43 |     printf("initialize data on Host\n");
44 |     initialData(h_A, nElem);
45 |     initialData(h_B, nElem);
46 | 
47 |     print(h_A, nElem);
48 |     print(h_B, nElem);
49 | 
50 |     printf("malloc memory on GPU\n");
51 |     float *d_A, *d_B, *d_C;
52 |     cudaMalloc((float**)&d_A, nBytes);
53 |     cudaMalloc((float**)&d_B, nBytes);
54 |     cudaMalloc((float**)&d_C, nBytes);
55 |     
56 |     printf("copying inputs from Host to Device\n");
57 |     cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
58 |     cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
59 | 
60 |     sumArraysOnGPU <<<1, 1>>>(d_A, d_B, d_C, nElem); // 异步计算
61 |     printf("copying output from Device to Host\n");
62 |     cudaMemcpy(h_C, d_C, nBytes, cudaMemcpyDeviceToHost);
63 |     print(h_C, nElem);
64 | 
65 |     free(h_A);
66 |     free(h_B);
67 |     free(h_C);
68 | 
69 |     cudaFree(d_A);
70 |     cudaFree(d_B);
71 |     cudaFree(d_C);
72 | 
73 |     return 0;
74 | }
75 | 
76 | 
77 | 
78 | 


--------------------------------------------------------------------------------
/src/chapter01/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # 第 1 章 基于CUDA的异构并行计算
 3 | 
 4 | 从计算的角度来看，并行计算可以被定义为同时使用多个计算资源来执行并发计算， 大的问题可以被分解成很多小问题，然后在不同计算资源上分别并行处理这些小问题。
 5 | CUDA是一种通用的异构并行计算平台和编程模型，你可以利用CUDA平台像在CPU上那样使用GPU来进行计算。
 6 | 
 7 | [TOC]
 8 | 
 9 | **编译环境**：本代码将使用`nvcc`编译器来编译，你可以使用以下命令来检查CUDA是否正确安装:
10 | 
11 | ```bash
12 | $ which nvcc
13 | /usr/local/cuda-8.0/bin/nvcc  # cuda-8.0 版本
14 | ```
15 | 
16 | ###  用GPU输出 Hello World
17 | 不妨先写一个cuda C程序，命名为`helloFromGPU`，用它来输出字符串 “Hello World from GPU！” 
18 | ```cpp
19 | ========================== 代码清单 1-1 Hello World from GPU (hello.cu) ==========================
20 | // hello.cu
21 | #include <stdio.h>
22 | 
23 | __global__ void helloFromGPU (void) 
24 | {
25 |     printf("Hello World from GPU!\n");
26 | }
27 | 
28 | int main(void)
29 | {
30 |     // hello from cpu
31 |     printf("Hello World from CPU!\n");
32 | 
33 |     helloFromGPU <<<1, 10>>>();
34 |     cudaDeviceReset();
35 |     return 0;
36 | }
37 | ```
38 | 在linux终端下使用以下命令进行编译[`hello.cu`](https://github.com/YunYang1994/cuda-tutorial/blob/master/src/chapter01/hello.cu)，然后执行程序得到
39 | ```bash
40 | $ nvcc -arch sm_20 hello.cu -o hello
41 | $ ./hello
42 | Hello World from CPU!
43 | Hello World from GPU!
44 | Hello World from GPU!
45 | Hello World from GPU!
46 | Hello World from GPU!
47 | Hello World from GPU!
48 | Hello World from GPU!
49 | Hello World from GPU!
50 | Hello World from GPU!
51 | Hello World from GPU!
52 | Hello World from GPU!
53 | ```
54 | 在上面的代码中，`cudaDeviceReset`表示重置当前线程所关联过的当前设备的所有资源；修饰符`__global__`告诉编译器这是一个内核函数，它将从CPU中调用，然后在GPU上执行，在CPU上通过下面的代码启动内核函数
55 | 
56 | ```cpp
57 | helloFromGPU <<<1, 10>>>();
58 | ```
59 | 
60 | > 三重尖号意味着从主线程到设备端代码的调用。1和10分别表示有1个块区域和10个线程，后续会作相关介绍。
61 | 
62 | 
63 | ###  CUDA 编程结构
64 | 
65 | 一个典型的 CUDA 编程结构应该包括下面5个主要的步骤：
66 | 
67 | - **1. 分配GPU内存**
68 | 
69 | - **2. 从CPU内存中拷贝数据到GPU内存中去**
70 | 
71 | - **3. 调用CUDA 内核函数来完成程序指定的运算**
72 | 
73 | - **4. 将数据从GPU中 拷回CPU内存**
74 | 
75 | - **5. 释放GPU 内存空间**
76 | 
77 | 在上述代码中， 你只看到了第三步: 调用内核。
78 | 
79 | 下一章: [**CUDA 编程模型**](https://github.com/YunYang1994/cuda-tutorial/blob/master/src/chapter02/README.md)
80 | 
81 | 


--------------------------------------------------------------------------------
/src/chapter02/sumMatrixOnGPU-2D-grid-2D-block.cu:
--------------------------------------------------------------------------------
  #include "../common.h"
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <cuda_runtime.h>
  5 | 
  /*
   * Fills ip[0..size-1] so that element i holds i plus a pseudo-random offset
   * in 0..9 drawn from rand(). The generator is not seeded here.
   */
  void initialInt(int *ip, int size){
      int pos = 0;
      while (pos < size){
          const int offset = rand() % 10;
          ip[pos] = pos + offset;
          ++pos;
      }
  }
 11 | 
 /*
  * Prints a header line with the two dimensions, then the matrix stored
  * row by row in C: ny lines of nx values each, every value formatted with
  * %3d, followed by a blank line.
  */
 void printMatrix(int *C, const int nx, const int ny){
     printf("\nMatrix: (%d.%d)\n", nx, ny);

     for (int row = 0; row < ny; ++row){
         for (int col = 0; col < nx; ++col){
             // Rows are contiguous, so row `row` starts nx * row elements in.
             const size_t offset = (size_t)row * (size_t)nx + (size_t)col;
             printf("%3d", C[offset]);
         }
         printf("\n");
     }
     printf("\n");
 }
 24 | 
 25 | __global__ void sumMatrixOnGPU2D(int *MatA, int *MatB, int *MatC, int nx, int ny){
 26 |     unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
 27 |     unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
 28 |     unsigned int idx = iy * nx + ix;
 29 |     
 30 |     if(ix < nx && iy < ny)
 31 |         MatC[idx] = MatA[idx] + MatB[idx];
 32 | }
 33 | 
 /*
  * Host reference implementation: adds two nx-by-ny matrices stored row by
  * row, writing MatA + MatB into MatC element by element.
  */
 void sumMatrixOnHost(int *MatA, int *MatB, int *MatC, int nx, int ny){
     for (int row = 0; row < ny; ++row){
         for (int col = 0; col < nx; ++col){
             // Flattened position of (col, row) in the row-major arrays.
             const size_t k = (size_t)row * (size_t)nx + (size_t)col;
             MatC[k] = MatA[k] + MatB[k];
         }
     }
 }
 46 | 
 /*
  * Compares the first nxy elements of MatC and h_C. Returns true when they
  * are all equal; on the first difference it prints the index and both
  * values and returns false.
  */
 bool checkResult(int *MatC, int *h_C, int nxy){
     int pos = 0;
     while (pos < nxy){
         const int got = MatC[pos];
         const int want = h_C[pos];
         if (got != want){
             printf("Matc[%d]: %d != h_C[%d]: %d\n", pos, got, pos, want);
             return false;
         }
         ++pos;
     }
     return true;
 }
 56 | 
 57 | 
 58 | int main(){
 59 | 
 60 |     /* int nx = 1<<13; */
 61 |     /* int ny = 1<<13; */
 62 |     int nx = 10240;
 63 |     int ny = 1024;
 64 | 
 65 |     int nxy = nx * ny;
 66 |     int nBytes = nxy * sizeof(int);
 67 | 
 68 |     int *h_A, *h_B, *h_C, *h_MatC;
 69 |     h_A = (int *)malloc(nBytes);
 70 |     h_B = (int *)malloc(nBytes);
 71 |     h_C = (int *)malloc(nBytes);
 72 |     h_MatC = (int *)malloc(nBytes);
 73 | 
 74 |     initialInt(h_A, nxy);
 75 |     /* printMatrix(h_A, nx, ny); */
 76 |     initialInt(h_B, nxy);
 77 |     /* printMatrix(h_B, nx, ny); */
 78 | 
 79 |     memset(h_C, 0, nBytes);
 80 |     memset(h_MatC, 0, nBytes);
 81 | 
 82 |     double iStart = cpuSecond();
 83 |     sumMatrixOnHost(h_A, h_B, h_C, nx, ny);
 84 |     double iElaps = cpuSecond() - iStart;
 85 |     /* printMatrix(h_C, nx, ny); */
 86 |     printf("sumMatrixOnHost elapsed %f sec\n", iElaps);
 87 |     
 88 | 
 89 |     int *d_MatA, *d_MatB, *d_MatC;
 90 |     cudaMalloc((void **)&d_MatA, nBytes);
 91 |     cudaMalloc((void **)&d_MatB, nBytes);
 92 |     cudaMalloc((void **)&d_MatC, nBytes);
 93 |     
 94 |     // transfer data from host to Device
 95 |     cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
 96 |     cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);
 97 | 
 98 |     int dimx = 32;
 99 |     int dimy = 32;
100 |     dim3 block(dimx, dimy);
101 |     dim3 grid((nx+block.x-1)/block.x, (ny+block.y-1)/block.y);
102 | 
103 |     iStart = cpuSecond();
104 |     sumMatrixOnGPU2D <<< grid, block >>>(d_MatA, d_MatB, d_MatC, nx, ny);
105 |     cudaDeviceSynchronize();
106 |     iElaps = cpuSecond() - iStart;
107 |     printf("sumMatrixOnGPU2D <<<(%d, %d), (%d, %d)>>> elapsed %f sec\n",
108 |             grid.x, grid.y, block.x, block.y, iElaps);
109 | 
110 |     cudaMemcpy(h_MatC, d_MatC, nBytes, cudaMemcpyDeviceToHost);
111 |     if (checkResult(h_MatC, h_C, nxy)){
112 |         printf("Arrays match!\n");
113 |     }
114 |     else{
115 |         printf("Arrays don't match!\n");
116 |     }
117 |     
118 |     free(h_A); free(h_B); free(h_C);
119 |     cudaFree(d_MatA); cudaFree(d_MatB); cudaFree(d_MatC);
120 | 
121 |     cudaDeviceReset();
122 |     return 0;
123 | }
124 | 
125 | 


--------------------------------------------------------------------------------
/src/chapter02/README.md:
--------------------------------------------------------------------------------
  1 | # 第 2 章 CUDA 编程模型
  2 | 
  3 | 
  4 | 目录
  5 | * [2.1. 内存管理](#21-内存管理)
  6 | * [2.2. 编写核函数](#22-编写核函数)
  7 | * [2.3. 矩阵运算](#23-矩阵运算)
  8 | 
  9 | [TOC]
 10 | 
 11 | 在CUDA编程模型中，会包含多个CPU和GPU，每个GPU和CPU的内存都由一条PCI-Express总线隔开。为了清楚地指明不同的物理内存空间， 我们需要约定
 12 | 
 13 | - **主机: CPU及内存， 其变量以h_为前缀**
 14 | 
 15 | - **设备: GPU及内存， 其变量以d_为前缀**
 16 | 
 17 | 现在，重要的是应学会如何为主机和设备分配内存以及在CPU和GPU之间拷贝共享数据。
 18 | 
 19 | ### 2.1 内存管理
 20 | **CUDA编程模型假设系统是由一个主机(CPU)和设备(GPU)组成的， 而且各自拥有独立的内存，并且核函数是在设备上运行的**。为了使你拥有充分的控制权并且使系统达到最佳性能，CUDA运行时负责分配与释放设备内存，并且在主机内存与设备内存之间传输数据。
 21 | 
 22 | 下面，我们将通过一个简单的两个数组相加的例子来学习如何在主机和设备之间进行数据传输。
 23 | ####  2.1.1 在CPU上运算
 24 | 
 25 | ```cpp
 26 | ========================== 代码清单 2-1 sumArraysOnHost.c ==========================
 27 | // sumArraysOnHost.c
 28 | #include <stdlib.h>
 29 | #include <string.h>
 30 | #include <stdio.h>
 31 | #include <time.h>
 32 | // 在主机上执行相加运算
 33 | void sumArraysOnHost(float *A, float *B, float *C, const int N){
 34 |     for (int idx=0; idx<N; idx++){
 35 |         C[idx] = A[idx] + B[idx];
 36 |     }
 37 | }
 38 | // 初始化数据值
 39 | void initialData(float *ip, int size){
 40 |     // generate different seed for random number
 41 |     time_t t;
 42 |     srand((unsigned int) time(&t));
 43 | 
 44 |     for (int i=0; i<size; i++){
 45 |         ip[i] = (float)(rand() & 0xFF)/10.0f;
 46 |     }
 47 | }
 48 | // 打印数组
 49 | void print(float *array, const int N){
 50 |     for (int idx=0; idx<N; idx++){
 51 |         printf(" %f", array[idx]);
 52 |     }
 53 |     printf("\n");
 54 | }
 55 | 
 56 | int main(){
 57 |     int nElem = 4;
 58 |     size_t nBytes = nElem * sizeof(float);
 59 |     float *h_A, *h_B, *h_C;
 60 |     h_A = (float *)malloc(nBytes);
 61 |     h_B = (float *)malloc(nBytes);
 62 |     h_C = (float *)malloc(nBytes);
 63 | 
 64 |     initialData(h_A, nElem);
 65 |     initialData(h_B, nElem);
 66 |     print(h_A, nElem);
 67 |     print(h_B, nElem);
 68 | 
 69 |     sumArraysOnHost(h_A, h_B, h_C, nElem);
 70 |     print(h_C, nElem);
 71 |     free(h_A);
 72 |     free(h_B);
 73 |     free(h_C);
 74 |     
 75 |     return 0;
 76 | }
 77 | ```
 78 | 这是一个纯C语言编写的程序，请见[`sumArraysOnHost.c`](https://github.com/YunYang1994/cuda-tutorial/blob/master/src/chapter02/sumArraysOnHost.c)，可以使用像下面这样用`nvcc`进行编译
 79 | ```bash
 80 | $ nvcc -Xcompiler -std=c99 sumArraysOnHost.c -o sumArraysOnHost
 81 | $ ./sumArraysOnHost
 82 |  22.400000 24.799999 20.000000 14.200000
 83 |  22.400000 24.799999 20.000000 14.200000
 84 |  44.799999 49.599998 40.000000 28.400000
 85 | ```
 86 | 在上述命令行中，需要注意:
 87 | 
 88 | > `-Xcompiler` 用于把紧随其后的选项直接传递给主机端的 C/C++ 编译器
 89 | 
 90 | > `-std=c99` 指的是将按照C99标准进行编译
 91 | 
 92 | ####  2.1.2 在GPU上运算
 93 | 现在， 你可以在GPU上修改代码来进行数组加法运算， 用`cudaMalloc`在GPU上申请内存。
 94 | ```bashrc
 95 | float *d_A, *d_B, *d_C;             //先定义数值指针，用来存放地址
 96 | cudaMalloc((float**)&d_A, nBytes);  //申请n个字节的内存后，返回地址
 97 | cudaMalloc((float**)&d_B, nBytes);
 98 | cudaMalloc((float**)&d_C, nBytes);
 99 | ```
100 | 然后使用`cudaMemcpy`函数把数据从**主机内存拷贝到GPU的全局内存中**，参数`cudaMemcpyHostToDevice`指定了数据的拷贝方向。
101 | ```bashrc
102 | cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
103 | cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
104 | ```
105 | 当数据被转移到**GPU的全局内存后**，主机端调用核函数在GPU上进行数组求和运算。**一旦内核被调用，控制权立刻被传回主机，这样的话，内核与
106 | 主机是异步执行的。**
107 | ```bashrc
108 | __global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N){...}
109 | ```
110 | 当内核在GPU上完成了对所有数组元素的处理后，其结果将通过```cudaMemcpy```函数复制回到CPU内存中去。
111 | ```bashrc
112 | cudaMemcpy(h_C, d_C, nBytes, cudaMemcpyDeviceToHost); # cudaMemcpyDeviceToHost, GPU-->CPU
113 | ```
114 | 最后，一定别忘了调用`cudaFree`函数来释放GPU的内存。
115 | ```bashrc
116 | cudaFree(d_A);
117 | cudaFree(d_B);
118 | cudaFree(d_C);
119 | ```
120 | 关于在GPU上进行数组相加运算，详细代码请见[`sumArraysOnGPU.cu`](https://github.com/YunYang1994/cuda-tutorial/blob/master/src/chapter02/sumArraysOnGPU.cu)，现在使用以下命令来编译和执行
121 | ```bashrc
122 | $ nvcc -arch=sm_20 sumArraysOnGPU.cu -o sumArraysOnGPU
123 | $ ./sumArraysOnGPU
124 | malloc memory on Host
125 | initialize data on Host
126 |  8.900000 23.000000 14.100000 12.200000
127 |  8.900000 23.000000 14.100000 12.200000
128 | malloc memory on GPU
129 | copying inputs from Host to Device
130 | copying output from Device to Host
131 | Caculating On GPU
132 |  17.799999 46.000000 28.200001 24.400000
133 | ```
134 | 
135 | ### 2.2 编写核函数
136 | 
137 | 不妨先来介绍下核函数、块和线程的概念。核函数是在设备端执行的代码，它描述的是在GPU上运行计算的任务，它的调用形式具体表现为
138 | ```bashrc
139 | kernel_name <<<block, thread>>>(argument list);
140 | ```
141 | `argument list`是实参列表，`block`是指网格维度，表示启动块的数目；`thread`表示的是块的维度，也就是每个块中线程的数目。每个线程的坐标以`blockIdx`和`threadIdx`来表示，因此我们可以得到总线程数量为`block*thread`。例如，在下图中有4096个线程块，因此网格维度gridDim=4096；每个块中有256个线程，因此块维度blockDim=256，因此一共有4096*256个线程。
142 | <div align=center><img src="https://github.com/YunYang1994/cuda-tutorial/blob/master/image/block-thread.jpg" alt="logo" height="200"></div>
143 | 
144 | 当核函数被调用时，许多不同的CUDA线程并行执行同一个计算任务，以下用`__global__`声明定义核函数:
145 | ```bashrc
146 | __global__ void kernel_name(argument list); // 核函数必须要有一个void返回类型
147 | ```
148 | 下表总结了 CUDA C 程序中函数类型的限定符。函数限定符将指定一个函数在主机上执行还是在设备上执行，以及可被主机调用还是被设备调用。
149 | 
150 | | 限定符  | 执行 | 调用 | 备注 |
151 | | ---------- | -----------| ---------- | -----------|
152 | | `__global__`   | 在设备端执行   | 可从主机、设备端调用 | 必须有一个void返回类型 |
153 | | `__device__`   | 在设备端执行   | 仅能从设备端调用 | |
154 | | `__host__`  | 在主机端执行 | 仅能从主机端上调用 | 可以省略不写 |
155 | 
156 | 考虑一个简单的例子：将两个大小为6的向量**A**和**B**相加。由于每个元素的相加过程不存在相关性，现在考虑使用两个块，每个块包含3个线程来计算该过程。因此，**每个线程的计算就是每个元素的相加过程**。在代码[`sumArraysOnGPU.cu`](https://github.com/YunYang1994/cuda-tutorial/blob/master/src/chapter02/sumArraysOnGPU.cu)的基础上，我们需要
157 | 
158 | #### 1. 定义块和线程
159 | ```cpp
160 | dim3 block(2);
161 | dim3 thread(3);
162 | ```
163 | #### 2. 定义核函数
164 | 在这里，每个线程都将调用同一个核函数。因此可以考虑基于给定块索引和线程索引来计算全局数据访问的唯一索引:
165 | ```cpp
166 | __global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N){
167 | 
168 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
169 |     if (idx < N) C[idx] = A[idx] + B[idx]; // 检查是否越界
170 |     printf("%f + %f = %f Caculated On GPU: block %d thread %d\n", 
171 |              A[idx], B[idx], C[idx], blockIdx.x, threadIdx.x);
172 | }
173 | ```
174 | #### 3. 执行和编译
175 | 完整代码见[`sumArraysOnGPU1.cu`](https://github.com/YunYang1994/cuda-tutorial/blob/master/src/chapter02/sumArraysOnGPU1.cu)，最终通过以下命令编译执行，得到
176 | ```bashrc
177 | $  nvcc -arch=sm_20 sumArraysOnGPU1.cu -o sumArraysOnGPU1
178 | $ ./sumArraysOnGPU1
179 | 向量 A: 20.400000 25.299999 1.000000 12.300000 17.700001 18.299999
180 | 向量 B: 20.400000 25.299999 1.000000 12.300000 17.700001 18.299999
181 | 向量 C 的每个元素计算过程:
182 | 20.400000 + 20.400000 = 40.799999 Caculated On GPU: block 0 thread 0
183 | 25.299999 + 25.299999 = 50.599998 Caculated On GPU: block 0 thread 1
184 | 1.000000 + 1.000000 = 2.000000 Caculated On GPU: block 0 thread 2
185 | 12.300000 + 12.300000 = 24.600000 Caculated On GPU: block 1 thread 0
186 | 17.700001 + 17.700001 = 35.400002 Caculated On GPU: block 1 thread 1
187 | 18.299999 + 18.299999 = 36.599998 Caculated On GPU: block 1 thread 2
188 | ```
189 | ### 2.3 矩阵运算
190 | 
191 | ####  2.3.1 矩阵索引
192 | 在一个二维矩阵加法的核函数中，一个线程通常被分配一个数据元素来处理。首先要完成的任务是如何使用块和线程索引从全局内存中访问指定的数据。一般来说，其步骤如下
193 | 
194 | - **第一步，可以用以下公式把线程和块索引映射到矩阵坐标上，称为坐标索引:**
195 | ```bashrc
196 | ix = threadIdx.x + blockIdx.x * blockDim.x
197 | iy = threadIdx.y + blockIdx.y * blockDim.y
198 | 
199 | 坐标索引: (ix, iy)
200 | ```
201 | <div align=center><img src="https://github.com/YunYang1994/cuda-tutorial/blob/master/image/thread-index.png" alt="logo" height="250"></div>
202 | 
203 | 
204 | - **第二步，可以用以下公式把矩阵坐标映射到全局内存的索引/存储单元上，称为全局索引:**
205 | ```bashrc
206 | idx = iy*nx + ix // nx 表示在x维度上元素个数, 对于(4,4)矩阵而言, nx=4
207 | 
208 | 全局索引: idx
209 | ```
210 | 
211 | 例如,对于维度为(4,4)的矩阵而言, 
212 | ```bashrc
213 |                                 0,  1,  2,  3,
214 |                                 4,  5,  6,  7,
215 |                                 8,  9,  10, 11,
216 |                                 12, 13, 14, 15,
217 |                                 
218 | 那么，对于元素'9' --> 坐标索引 (1,2)， 全局内存索引为 idx = 2*4 + 1 = 9
219 | ```
220 | 再比如，对于一个(6,8)维度的矩阵而言，假如划分为6个块，每个块有8个线程，那么就如下图所示
221 | 
222 | <div align=center><img src="https://github.com/YunYang1994/cuda-tutorial/blob/master/image/block-index.png" alt="logo" height="250"></div>
223 | 
224 | 从上图中也可以看出: `block`也有两个维度，分别是`blockIdx.x`和`blockIdx.y`，即`block`的索引坐标表示为(`blockIdx.x`,`blockIdx.y`)。类似地，在每个`block`中，线程`thread`也有两个维度。
225 | 
226 | ####  2.3.2 求和运算
227 | 在本小节中，我们将对二维矩阵的求和运算作并行处理。由于矩阵是二维的，不妨考虑使用一个二维网格和二维块来编写一个矩阵加法的核函数。
228 | 
229 | ```cpp
230 | __global__ void sumMatrixOnGPU2D(int *MatA, int *MatB, int *MatC, int nx, int ny){
231 |     unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
232 |     unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
233 |     unsigned int idx = iy * nx + ix;    // 矩阵的存储都是通过一维数组的形式存储，因此我们需要计算全局索引位置
234 |     
235 |     if(ix < nx && iy < ny)
236 |         MatC[idx] = MatA[idx] + MatB[idx];
237 | }
238 | ```
239 | 
240 | 然后我们通过以下命令编译和执行文件[`sumMatrixOnGPU-2D-grid-2D-block.cu`](https://github.com/YunYang1994/cuda-tutorial/blob/master/src/chapter02/sumMatrixOnGPU-2D-grid-2D-block.cu)
241 | ```bashrc
242 | $ nvcc -arch=sm_20 sumMatrixOnGPU-2D-grid-2D-block.cu -o sumMatrixOnGPU-2D-grid-2D-block
243 | $ ./sumMatrixOnGPU-2D-grid-2D-block
244 | 
245 | Matrix: (4.4)
246 |   3  7  9  8
247 |   7 10 12  9
248 |  17 10 12 18
249 |  12 22 17 21
250 | 
251 | 
252 | Matrix: (4.4)
253 |   0  7  4  9
254 |   5 13 13 16
255 |  10  9 12 14
256 |  19 18 23 17
257 | 
258 | sumMatrixOnHost elapsed 0.000001 sec
259 | sumMatrixOnGPU2D <<<(1, 1), (32, 32)>>> elapsed 0.000021 sec
260 | 
261 | Matrix: (4.4)
262 |   3 14 13 17
263 |  12 23 25 25
264 |  27 19 24 32
265 |  31 40 40 38
266 | 
267 | Arrays match!
268 | ```
269 | 当我们把矩阵的维度渐渐增大时，GPU和CPU的运行时间差异就很明显了
270 | 
271 | | 矩阵维度 | 4 x 4 | 16 x 16 | 256 x 256 | 512 x 512 | 1024 x 1024 | 2048 x 2048 | 8192 x 8192 |
272 | | ---------- | -----------| ---------- | -----------| ---------- | -----------| ---------- | -----------|
273 | | `CPU`   | 0.000001 s  | 0.000004 s | 0.000206 s | 0.000797 s | 0.003192 s | 0.012736 s | 0.051285 s |
274 | | `GPU`   | 0.000021 s  | 0.000021 s | 0.000113 s | 0.000449 s | 0.000923 s | 0.001989 s | 0.006241 s |
275 | 
276 | 上一章: [**基于CUDA的异构并行计算**](https://github.com/YunYang1994/cuda-tutorial/blob/master/src/chapter01/README.md)<br>
277 | 下一章: [**CUDA执行模型**](https://github.com/YunYang1994/cuda-tutorial/blob/master/src/chapter03/README.md)
278 | 
279 | 
280 | 
281 | 
282 | 
283 | 
284 | 


--------------------------------------------------------------------------------