├── .gitignore
├── CUDA编程指南5.0水印版.pdf
├── README.md
├── image.jpg
├── tutorials_eight.cu
├── tutorials_eleven.cu
├── tutorials_first.cu
├── tutorials_four.cu
├── tutorials_nine.cu
├── tutorials_seven.cu
├── tutorials_six.cu
├── tutorials_ten.cu
├── tutorials_three.cu
└── tutorials_yolo_part_postprocess.cu

/.gitignore:
--------------------------------------------------------------------------------
1 | # Repo-specific GitIgnore ----------------------------------------------------------------------------------------------
2 | 
3 | cuda_tutorials/x64/
4 | x64/
5 | cuda_tutorials.vc*
6 | tutorials_yolo_postprocess.cu
--------------------------------------------------------------------------------
/CUDA编程指南5.0水印版.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tangjunjun966/cuda-tutorial-master/f7a6f766bb2dce2c9fa806aca316bda362699cc7/CUDA编程指南5.0水印版.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # CUDA Tutorial Contents
3 | 
4 | #### Chapter 1: Pointers
5 | #### Chapter 2: How CUDA Works
6 | #### Chapter 3: Configuring the CUDA Compiler Environment
7 | #### Chapter 4: Kernel Function Basics
8 | #### Chapter 5: Kernel Indexing (index)
9 | #### Chapter 6: Kernel Matrix Computation in Practice
10 | #### Chapter 7: Advanced Kernel Practice
11 | #### Chapter 8: CUDA Memory Usage and Performance Optimization
12 | #### Chapter 9: CUDA Atomics (atomic) in Practice
13 | #### Chapter 10: CUDA Streams (stream) in Practice
14 | #### Chapter 11: A CUDA NMS Operator in Practice
15 | #### Chapter 12: YOLO Deployment in Practice
16 | #### Chapter 13: CUDA-based YOLO Deployment in Practice
17 | # Background
18 | As artificial intelligence develops and competition among practitioners intensifies, many companies now treat C++ deployment of deep-learning algorithms as a basic skill. On the many ARM-based, resource-constrained devices, getting more speed and meeting tighter latency requirements means handing matrix-style computation over to CUDA. At the same time, the market offers tutorials and blog posts of very uneven quality, or courses at steep prices, which leave readers (especially beginners) confused and unable to get started with CUDA programming and apply it to real projects.
19 | Therefore, drawing on my hands-on engineering experience, I will publish a CUDA tutorial series in this column to help readers (including complete beginners) put CUDA to work in engineering and master CUDA programming. Following this series, you will go all the way from environment setup to writing CUDA kernel functions, from kernels to memory optimization, from memory optimization to deep-learning operator development (e.g., NMS), and from operator optimization to model deployment (using the YOLO family as the baseline). Most importantly, the tutorials are short and to the point, pairing CUDA theory with practical examples and accompanying code that you can run directly. The idea is to cover the necessary CUDA theory, skip non-essential complexity, and get you building CUDA applications; the deeper theory can come later, once that foundation is in place.
20 | # What the Tutorials Cover
21 | 
22 | Chapters 1 to 3 look at the role of pointers in CUDA functions, how CUDA works, and environment configuration;
23 | 
24 | Chapter 4 is a first look at writing CUDA functions (__global__, __device__, __host__, etc.) as a simple entry point;
25 | 
26 | Chapter 5 explores different grid and block configurations and how to compute a kernel's index, which later chapters rely on for all kinds of computation;
27 | 
28 | Chapters 6 and 7 move from simple to advanced kernel matrix computation, examine in depth how grid, block, and thread indexing shapes kernel code, and work through several practical examples (e.g., a kernel that converts an image between color spaces);
29 | 
30 | Chapter 8 covers CUDA memory (texture memory, constant memory, global memory, and so on), their allocation mechanisms, and practical usage (code included), using the different memory types to optimize CUDA performance;
31 | 
32 | Chapter 9 covers CUDA atomic (atomic) operations with practical applications (e.g., obtaining self-incrementing indices);
33 | 
34 | Chapter 10 covers CUDA streams (stream) with practical examples (e.g., multi-stream operation);
35 | 
36 | Chapters 11 to 13 cover deploying the YOLO algorithm with TensorRT. We first give a generic TensorRT YOLO deployment whose pre- and post-processing run in C++ on the host, then write CUDA kernels for that pre- and post-processing, so the data no longer has to be copied between the GPU and the host; everything runs on the GPU, which improves performance.
37 | 
38 | ```For now, this is the full scope of the course. If there are topics you would like to see covered, leave a comment and we will update the material as circumstances allow.```
39 | 
40 | 
41 | ### In response to readers' requests: I have already shared most of the code, which you are free to study on your own; detailed code walkthroughs and the theory series are behind the link below. Creating this took real effort, so viewing the written content or a small part of the code requires a small fee. Thanks for understanding, and please give it a thumbs-up.
42 | ## CSDN tutorial link: [http://t.csdn.cn/J4KZj](http://t.csdnimg.cn/CD5IG)
43 | ## Open-source code for the YOLO deployment chapters: https://github.com/tangjunjun966/yolov5-tensorrt-onnx-master/tree/master
44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 |
--------------------------------------------------------------------------------
/image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tangjunjun966/cuda-tutorial-master/f7a6f766bb2dce2c9fa806aca316bda362699cc7/image.jpg
--------------------------------------------------------------------------------
/tutorials_eight.cu:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | /*
4 | 
5 | What sets this tutorial series apart: a coherent, strictly ordered sequence with source code and a strong practical focus.
6 | 
7 | Only the core CUDA processing code is kept inside the paid tutorial and is not open-sourced here; a great deal of CUDA tutorial code has already been open-sourced, and this series took a lot of work.
8 | 
9 | For that reason, the core code (the YOLO deployment CUDA code) and the full written tutorial require a small fee. Thanks for understanding.
10 | 
11 | I can promise that after working through the tutorials carefully, CUDA programming will pose no difficulty.
12 | 
13 | Details: http://t.csdn.cn/NaCZ5
14 | 
15 | @Description : memory usage
16 | @Author : tangjun
17 | @Date :
18 | */
19 | 
20 | 
21 | #include <iostream>
22 | #include <cuda_runtime.h>
23 | #include "opencv2/highgui.hpp" // actually lives under /usr/include
24 | #include "opencv2/opencv.hpp"
25 | #include "device_launch_parameters.h"
26 | #include <chrono>
27 | #include <string>
28 | #include <cmath>
29 | using namespace cv;
30 | using namespace std;
31 | 32 | 33 | 
34 | #define checkRuntime(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__)
35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 
46 | int inquire_GPU_info() {
47 |     int deviceCount;
48 |     cudaGetDeviceCount(&deviceCount);
49 | 
50 |     int dev;
51 |     for (dev = 0; dev < deviceCount; dev++)
52 |     {
53 |         int driver_version(0), runtime_version(0);
54 |         cudaDeviceProp deviceProp;
55 |         cudaGetDeviceProperties(&deviceProp, dev);
56 |         if (dev == 0)
57 |             if (deviceProp.minor == 9999 && deviceProp.major == 9999) // 9999.9999 means no CUDA-capable device
58 |                 printf("\n");
59 |         printf("\nDevice%d:\"%s\"\n", dev, deviceProp.name);
60 |         cudaDriverGetVersion(&driver_version);
61 |         printf("CUDA driver version: %d.%d\n", driver_version / 1000, (driver_version % 1000) / 10);
62 |         cudaRuntimeGetVersion(&runtime_version);
63 |         printf("CUDA runtime version: %d.%d\n", runtime_version / 1000, (runtime_version % 1000) / 10);
64 |         printf("Compute capability: %d.%d\n", deviceProp.major, deviceProp.minor);
65 |         printf("Total Global Memory: %zu M\n", deviceProp.totalGlobalMem / (1024 * 1024)); // size_t fields use %zu
66 |         printf("Number of SMs: %d\n", deviceProp.multiProcessorCount);
67 |         printf("Constant Memory: %zu K\n", deviceProp.totalConstMem / 1024);
68 |         printf("Shared Memory per block: %zu K\n", deviceProp.sharedMemPerBlock / 1024);
69 |         printf("Registers per block: %d\n", deviceProp.regsPerBlock);
70 |         printf("Warp size: %d\n", deviceProp.warpSize);
71 |         printf("Max threads per SM: %d\n", deviceProp.maxThreadsPerMultiProcessor);
72 |         printf("Max threads per block: %d\n", deviceProp.maxThreadsPerBlock);
73 |         printf("Max dimensions of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
74 |         printf("Max dimensions of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
75 |         printf("Maximum memory pitch: %zu bytes\n", deviceProp.memPitch);
76 |         printf("Texture alignment: %zu bytes\n", deviceProp.texturePitchAlignment);
77 |         printf("Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
78 |         printf("Memory Clock rate: %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
79 |         printf("Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
80 |     }
81 | 
82 |     return 0;
83 | }
84 | 85 | 
86 | bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line)
87 | {
88 |     if (code != cudaSuccess)
89 |     {
90 |         const char* err_name = cudaGetErrorName(code);
91 |         const char* err_message = cudaGetErrorString(code);
92 |         printf("runtime error %s:%d %s failed. 
\n code = %s, message = %s\n", file, line, op, err_name, err_message); 93 | return false; 94 | } 95 | return true; 96 | } 97 | 98 | __global__ void show_value(float* v) { 99 | int idx = threadIdx.x; 100 | if (idx < 10) { 101 | printf("value_%d: \t%.2f\n", idx, v[idx]); //只读取前10个数 102 | } 103 | } 104 | 105 | int memory_appy1() 106 | { 107 | 108 | int device_id = 0; 109 | checkRuntime(cudaSetDevice(device_id)); 110 | std::cout << "设置gpu id为:\t" << device_id << std::endl; 111 | 112 | 113 | std::cout << "设置全局内存" << std::endl; 114 | float* memory_device = nullptr; // Global Memory 115 | cudaMalloc((void**)&memory_device, 100 * sizeof(float)); // pointer to device 116 | 117 | 118 | std::cout << "设置new(malloc)可分页内存" << std::endl; 119 | float* memory_host = new float[100]; // Pageable Memory 120 | for (int i = 0; i < 100; i++) { memory_host[i] = i * 100; } 121 | checkRuntime(cudaMemcpy(memory_device, memory_host, sizeof(float) * 100, cudaMemcpyHostToDevice)); // 返回的地址是开辟的device地址,存放在memory_device 122 | show_value << > > (memory_device); 123 | 124 | 125 | 126 | 127 | std::cout << "设置页锁定内存" << std::endl; 128 | float* memory_page_locked = nullptr; // Pinned Memory 129 | checkRuntime(cudaMallocHost((void**)&memory_page_locked, 100 * sizeof(float))); // 返回的地址是被开辟的pin memory的地址,存放在memory_page_locked 130 | checkRuntime(cudaMemcpy(memory_page_locked, memory_device, sizeof(float) * 100, cudaMemcpyDeviceToHost)); // 将其返回host内存 131 | 132 | 133 | 134 | 135 | //printf("%f\n", memory_page_locked[2]); 136 | checkRuntime(cudaFreeHost(memory_page_locked)); 137 | delete[] memory_host; 138 | checkRuntime(cudaFree(memory_device)); 139 | 140 | return 0; 141 | } 142 | 143 | 144 | 145 | __device__ float dev_array[10]; 146 | __global__ void my_device_function(float* ptr) { 147 | // Use the device variable 148 | int idx = threadIdx.x; 149 | 150 | // Do something with the value 151 | ptr[idx] = dev_array[idx] + 0.9f; 152 | 153 | } 154 | 155 | void memory_appy2() { 156 | // Allocate memory on the device 157 | float* dev_ptr; 158 | cudaMalloc((void**)&dev_ptr, 10 * sizeof(float)); 159 | 160 | // Copy data from host to device 161 | float host_array[10] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f }; 162 | cudaMemcpyToSymbol(dev_array, host_array, 10 * sizeof(float)); //赋值 163 | 164 | // Call a device function that uses the device variable 165 | my_device_function << <1, 10 >> > (dev_ptr); 166 | 167 | float* host_ptr = nullptr; 168 | cudaMallocHost((void**)&host_ptr, sizeof(int) * 10); 169 | cudaMemcpy(host_ptr, dev_ptr, sizeof(float) * 10, cudaMemcpyDeviceToHost); 170 | 171 | for (int i = 0; i < 10; i++) { std::cout << host_ptr[i] << endl; } 172 | 173 | // Free memory on the device 174 | cudaFree(dev_ptr); 175 | } 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | __global__ void static_sharedMemKernel(int* input, int* output) 184 | { 185 | // 定义共享内存 186 | __shared__ int sharedMem[1024]; 187 | int tid = threadIdx.x; 188 | int i = blockIdx.x * blockDim.x + threadIdx.x; 189 | // 将数据从全局内存拷贝到共享内存 190 | sharedMem[tid] = input[i]; 191 | // 等待所有线程都将数据拷贝到共享内存中 192 | __syncthreads(); 193 | // 对共享内存中的数据进行处理 194 | sharedMem[tid] *= 2; 195 | sharedMem[tid] = sharedMem[tid] + 30; 196 | // 等待所有线程都完成数据处理 197 | __syncthreads(); 198 | // 将结果从共享内存拷贝回全局内存 199 | output[i] = sharedMem[tid]; 200 | } 201 | 202 | 203 | __global__ void dynamic_sharedMemKernel(int* input, int* output) 204 | { 205 | // 定义共享内存 206 | extern __shared__ int sharedMem[]; //使用extern非常重要 207 | int tid = threadIdx.x; 208 | int i = blockIdx.x * blockDim.x + 
threadIdx.x; 209 | // 将数据从全局内存拷贝到共享内存 210 | sharedMem[tid] = input[i]; 211 | // 等待所有线程都将数据拷贝到共享内存中 212 | __syncthreads(); 213 | // 对共享内存中的数据进行处理 214 | sharedMem[tid] *= 2; 215 | sharedMem[tid] = sharedMem[tid] + 30; 216 | // 等待所有线程都完成数据处理 217 | __syncthreads(); 218 | // 将结果从共享内存拷贝回全局内存 219 | output[i] = sharedMem[tid]; 220 | } 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | int memory_appy3() 229 | { 230 | const int n = 9900; 231 | const int thrid = 1024; 232 | int input[n]; 233 | int output[n]; 234 | int* d_input, * d_output; 235 | for (int i = 0; i < n; i++) { input[i] = i; } 236 | // 分配设备内存 237 | cudaMalloc((void**)&d_input, n * sizeof(int)); 238 | cudaMalloc((void**)&d_output, n * sizeof(int)); 239 | // 将输入数据拷贝到设备内存中 240 | cudaMemcpy(d_input, input, n * sizeof(int), cudaMemcpyHostToDevice); 241 | 242 | // 调用核函数 243 | unsigned int grid = (n + thrid - 1) / thrid; //列写给x 244 | dim3 gridperblock(grid); 245 | 246 | auto start = std::chrono::system_clock::now(); //时间函数 247 | static_sharedMemKernel << > > (d_input, d_output); 248 | auto end = std::chrono::system_clock::now(); 249 | float time = std::chrono::duration_cast(end - start).count(); 250 | cout << "\n计算时间:" << time << endl; 251 | 252 | // 将结果拷贝回主机内存中 253 | cudaMemcpy(output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost); 254 | std::cout << "输出静态共享内存结果output前5位数:" << endl; 255 | for (int i = 0; i < 5; i++) { cout << output[i] << endl; } 256 | 257 | dynamic_sharedMemKernel << > > (d_input, d_output); 258 | cudaMemcpy(output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost); 259 | std::cout << "输出动态共享内存结果output前5位数:" << endl; 260 | for (int i = 0; i < 5; i++) { cout << output[i] << endl; } 261 | 262 | // 释放设备内存 263 | cudaFree(d_input); 264 | cudaFree(d_output); 265 | return 0; 266 | } 267 | 268 | 269 | 270 | 271 | //声明纹理,用来绑定纹理,其实也就是个纹理标识 272 | texture texone; 273 | 274 | 275 | //核心代码,在gpu端执行的kernel, 276 | __global__ void Textureone(unsigned int* listTarget, int size) 277 | { 278 | unsigned int texvalue = 0; 279 | int index = blockIdx.x * blockDim.x + threadIdx.x; //通过线程ID得到数组下标 280 | if (index < size) 281 | texvalue = tex1Dfetch(texone, index) * 100; //通过索引获得纹理值再乘100 282 | listTarget[index] = texvalue; 283 | } 284 | 285 | 286 | void memory_appy4_one() 287 | { 288 | const int _length = 100; 289 | unsigned int* listSource = new unsigned int[_length]; 290 | unsigned int* listTarget = new unsigned int[_length]; 291 | 292 | //赋值 293 | for (int i = 0; i < _length; i++) { listSource[i] = i; } 294 | 295 | unsigned int* dev_Source; 296 | unsigned int* dev_Target; 297 | 298 | //在设备上申请显存空间 299 | cudaMalloc((void**)&dev_Source, _length * sizeof(unsigned int)); 300 | cudaMalloc((void**)&dev_Target, _length * sizeof(unsigned int)); 301 | //将host端的数据拷贝到device端 302 | cudaMemcpy(dev_Source, listSource, _length * sizeof(unsigned int), cudaMemcpyHostToDevice); 303 | 304 | 305 | //绑定纹理,绑定的纹理标识对应的数据 306 | cudaBindTexture(0, texone, dev_Source); 307 | 308 | //调用kernel 309 | Textureone << < ceil(_length / 10), 10 >> > (dev_Target, _length); 310 | 311 | //将结果拷贝到host端 ☆host就是CPU 312 | cudaMemcpy(listTarget, dev_Target, _length * sizeof(unsigned int), cudaMemcpyDeviceToHost); 313 | 314 | //取消绑定 315 | cudaUnbindTexture(texone); 316 | 317 | //释放内存空间 318 | cudaFree(dev_Source); 319 | cudaFree(dev_Target); 320 | 321 | 322 | cout << "原始数据: " << endl; 323 | for (int i = 0; i < _length; i++) { cout << listSource[i] << " "; } 324 | 325 | cout << endl << endl << "运算结果: " << endl; 326 | for (int i = 0; i < _length; i++) { cout << listTarget[i] << " "; } 327 | 
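/*
   Compatibility note: memory_appy4_one above, and memory_appy4_two / memory_appy5 below,
   use the legacy texture *reference* API (global texture<...> variables, cudaBindTexture,
   tex1Dfetch/tex2D, cudaUnbindTexture). That API was deprecated in CUDA 11 and removed in
   CUDA 12, so these samples are expected to build only on older toolkits. On CUDA 12+ the
   replacement is a per-resource texture *object*; a minimal sketch for the 1D case, reusing
   this function's dev_Source and _length purely for illustration:

       cudaResourceDesc resDesc = {};
       resDesc.resType = cudaResourceTypeLinear;
       resDesc.res.linear.devPtr = dev_Source;
       resDesc.res.linear.desc = cudaCreateChannelDesc<unsigned int>();
       resDesc.res.linear.sizeInBytes = _length * sizeof(unsigned int);

       cudaTextureDesc texDesc = {};
       texDesc.readMode = cudaReadModeElementType;

       cudaTextureObject_t tex = 0;
       cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);
       // the kernel then takes the cudaTextureObject_t as a parameter
       // and reads with tex1Dfetch<unsigned int>(tex, index)
*/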
getchar(); 328 | } 329 | 330 | 331 | 332 | 333 | 334 | 335 | texture textwo; 336 | 337 | 338 | __global__ void my_kernel(uchar3* output, int width, int height) 339 | { 340 | uchar3 img_v; 341 | int x = blockIdx.x * blockDim.x + threadIdx.x; 342 | int y = blockIdx.y * blockDim.y + threadIdx.y; 343 | 344 | //float u = (float)x / (float)width; 345 | //float v = (float)y / (float)height; 346 | 347 | uchar4 value = tex2D(textwo, x, y); 348 | 349 | uchar4 swapped_value = make_uchar4(value.z, value.y, value.x, value.w); 350 | img_v.x = value.x; 351 | img_v.y = value.y; 352 | img_v.z = value.z; 353 | 354 | //printf("\n%.2f", (float)value.y); 355 | output[x + y * width] = img_v; 356 | 357 | } 358 | 359 | 360 | void show(Mat img, string name = "image") { 361 | cv::imshow(name, img); 362 | cv::waitKey(1000); 363 | cv::destroyAllWindows(); 364 | 365 | } 366 | 367 | int memory_appy4_two() 368 | { 369 | // 读取图像数据 370 | cv::Mat image = cv::imread("image.jpg", cv::IMREAD_COLOR); 371 | show(image, "img_ori"); 372 | 373 | 374 | 375 | // 申请二维纹理内存 376 | cudaArray* cuArray; 377 | cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); 378 | cudaMallocArray(&cuArray, &channelDesc, image.cols, image.rows); 379 | cudaMemcpy2DToArray(cuArray, 0, 0, image.data, image.step, image.cols * sizeof(uchar4), image.rows, cudaMemcpyHostToDevice); 380 | 381 | // 绑定纹理对象和二维纹理内存 382 | cudaBindTextureToArray(textwo, cuArray); 383 | 384 | // 调用核函数 385 | dim3 block(16, 16); 386 | dim3 grid((image.cols + block.x - 1) / block.x, (image.rows + block.y - 1) / block.y); 387 | 388 | cv::Mat outimg = Mat(Size(image.cols, image.rows), CV_8UC1); 389 | uchar3* output; 390 | cudaMallocHost(&output, image.cols * image.rows * sizeof(uchar3)); 391 | 392 | //cudaMallocHost(&outimg.data, image.cols * image.rows * sizeof(uchar3)); 393 | my_kernel << > > (output, image.cols, image.rows); 394 | 395 | //cudaMemcpy(outimg.data, output, image.cols * image.rows * sizeof(uchar4), cudaMemcpyDeviceToHost); 396 | 397 | // 解绑纹理对象和二维纹理内存 398 | cudaUnbindTexture(textwo); 399 | 400 | 401 | 402 | 403 | //show(outimg,"outimg"); 404 | 405 | // 释放内存 406 | cudaFree(output); 407 | cudaFreeArray(cuArray); 408 | cout << "ok" << endl; 409 | return 0; 410 | } 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | texture texRef; 422 | 423 | __global__ void kernel(float* output, int width, int height) 424 | { 425 | int x = blockIdx.x * blockDim.x + threadIdx.x; 426 | int y = blockIdx.y * blockDim.y + threadIdx.y; 427 | 428 | if (x < width && y < height) { output[y * width + x] = tex2D(texRef, x, y); } 429 | } 430 | 431 | int memory_appy5() 432 | { 433 | int width = 512; 434 | int height = 512; 435 | int size = width * height * sizeof(float); 436 | 437 | float* input = (float*)malloc(size); 438 | float* output = (float*)malloc(size); 439 | 440 | // 初始化输入数据 441 | for (int i = 0; i < width * height; i++) { input[i] = (float)i; } 442 | 443 | // 定义CUDA数组 444 | cudaArray* cuArray; 445 | cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); 446 | cudaMallocArray(&cuArray, &channelDesc, width, height); 447 | 448 | // 将输入数据拷贝到CUDA数组中 449 | cudaMemcpyToArray(cuArray, 0, 0, input, size, cudaMemcpyHostToDevice); 450 | 451 | // 设置纹理内存参数 452 | texRef.addressMode[0] = cudaAddressModeWrap; 453 | texRef.addressMode[1] = cudaAddressModeWrap; 454 | texRef.filterMode = cudaFilterModeLinear; 455 | texRef.normalized = false; 456 | 457 | // 绑定纹理内存到CUDA数组 458 | cudaBindTextureToArray(texRef, cuArray); 459 | 460 | // 定义CUDA核函数的线程块和线程格 461 | dim3 block(16, 16); 462 | dim3 
grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y); 463 | 464 | // 调用CUDA核函数 465 | kernel << > > (output, width, height); 466 | 467 | // 将输出数据从设备拷贝到主机 468 | cudaMemcpy(output, output, size, cudaMemcpyDeviceToHost); 469 | 470 | // 输出结果 471 | for (int i = 0; i < width * height; i++) 472 | { 473 | printf("%f ", output[i]); 474 | } 475 | 476 | // 解绑纹理内存 477 | cudaUnbindTexture(texRef); 478 | 479 | // 释放CUDA数组和内存 480 | cudaFreeArray(cuArray); 481 | free(input); 482 | free(output); 483 | 484 | return 0; 485 | } 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | void main_eight() { 498 | inquire_GPU_info(); 499 | memory_appy1(); 500 | memory_appy2(); 501 | memory_appy3(); 502 | memory_appy4_one(); 503 | memory_appy4_two(); 504 | 505 | 506 | 507 | } 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | -------------------------------------------------------------------------------- /tutorials_eleven.cu: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | /*! 5 | 6 | 7 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 8 | 9 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 10 | 11 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 12 | 13 | 可以保证,认真学完教程,cuda编程毫无压力。 14 | 15 | 详情请链接:http://t.csdn.cn/NaCZ5 16 | 17 | 18 | @Description : cuda nms计算方法 19 | @Author : tangjun 20 | @Date : 2023-08-07 21 | */ 22 | 23 | 24 | 25 | 26 | #include 27 | #include 28 | #include "opencv2/highgui.hpp" //实际上在/usr/include下 29 | #include "opencv2/opencv.hpp" 30 | #include "device_launch_parameters.h" 31 | #include 32 | #include 33 | #include 34 | using namespace cv; 35 | using namespace std; 36 | 37 | 38 | 39 | 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | 46 | using namespace std; 47 | 48 | // 定义矩形框的结构体 49 | struct nms_box { 50 | float x1, y1, x2, y2; 51 | float score; 52 | int cls_id; 53 | }; 54 | 55 | // 定义CUDA核函数,用于计算两个矩形框之间的IOU值 56 | __device__ float iou(nms_box a, nms_box b) 57 | { 58 | float x1 = fmaxf(a.x1, b.x1); 59 | float y1 = fmaxf(a.y1, b.y1); 60 | float x2 = fminf(a.x2, b.x2); 61 | float y2 = fminf(a.y2, b.y2); 62 | float intersection = fmaxf(0.0f, x2 - x1) * fmaxf(0.0f, y2 - y1); 63 | float area_a = (a.x2 - a.x1) * (a.y2 - a.y1); 64 | float area_b = (b.x2 - b.x1) * (b.y2 - b.y1); 65 | float union_ = area_a + area_b - intersection; 66 | return intersection / union_; 67 | } 68 | 69 | // 定义CUDA核函数,用于执行NMS算法 70 | __global__ void nms_kernel(nms_box* boxes, int* indices, int* num_indices, float nms_thr) 71 | { 72 | /* 73 | boxes:输入nms信息,为结构体 74 | indices:输入为列表序列,记录所有box,如[0,1,2,3,4,5,...],后续将不需要会变成-1。 75 | num_indices:记录有多少个box数量 76 | float nms_thr:nms的阈值,实际为iou阈值 77 | */ 78 | 79 | int i = blockIdx.x * blockDim.x + threadIdx.x; 80 | if (i >= *num_indices) { return; } 81 | 82 | int index = indices[i]; 83 | 84 | if (index == -1) { return; } 85 | 86 | nms_box box = boxes[index]; 87 | 88 | 89 | for (int j = i + 1; j < *num_indices; j++) { 90 | int other_index = indices[j]; 91 | if (other_index == -1) { continue; } 92 | 93 | nms_box other_box = boxes[other_index]; 94 | float iou_value = iou(box, other_box); 95 | printf("iou value:%f\n", iou_value); 96 | if (iou_value > nms_thr) { indices[j] = -1; } 97 | 98 | } 99 | } 100 | 101 | vector nms(vector boxes, float threshold) 102 | { 103 | int num_boxes = boxes.size(); 104 | 105 | // 将矩形框转换为CUDA中的Box结构体 106 | nms_box* d_boxes = nullptr; 107 | cudaMalloc(&d_boxes, num_boxes * sizeof(nms_box)); 108 | cudaMemcpy(d_boxes, boxes.data(), num_boxes * sizeof(nms_box), 
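/* std::vector<nms_box> stores its elements contiguously, so copying
   num_boxes * sizeof(nms_box) raw bytes from boxes.data() is safe */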
cudaMemcpyHostToDevice); 109 | 110 | 111 | 112 | 113 | // 创建一个索引数组,用于标记哪些矩形框应该被保留 114 | int* d_indices; 115 | cudaMallocHost(&d_indices, num_boxes * sizeof(int)); 116 | for (int i = 0; i < num_boxes; i++) { d_indices[i] = i; } 117 | 118 | 119 | 120 | // 在CUDA设备上执行NMS算法 121 | int num_indices = num_boxes; 122 | int* d_num_indices = nullptr; 123 | cudaMalloc(&d_num_indices, sizeof(int)); 124 | cudaMemcpy(d_num_indices, &num_indices, sizeof(int), cudaMemcpyHostToDevice); 125 | 126 | 127 | 128 | 129 | 130 | int blockSize = 256; 131 | int numBlocks = (num_boxes + blockSize - 1) / blockSize; 132 | nms_kernel << > > (d_boxes, d_indices, d_num_indices, threshold); 133 | // 134 | 135 | 136 | 137 | 138 | // 将保留的矩形框复制回主机端 139 | cudaMemcpy(&num_indices, d_num_indices, sizeof(int), cudaMemcpyDeviceToHost); 140 | 141 | 142 | 143 | int* h_indices = new int[num_indices]; 144 | 145 | cudaMemcpy(h_indices, d_indices, num_indices * sizeof(int), cudaMemcpyDeviceToHost); 146 | 147 | std::cout << "打印需要保存box的索引值:" << endl; 148 | for (int i = 0; i < num_indices; i++) { 149 | std::cout << "keep indices:" << h_indices[i] << endl; 150 | } 151 | 152 | 153 | vector kept_boxes(num_indices); 154 | for (int i = 0; i < num_indices; i++) { 155 | if (h_indices[i] > -1) { 156 | kept_boxes[i] = boxes[h_indices[i]]; 157 | } 158 | } 159 | 160 | 161 | // 释放内存 162 | cudaFree(d_boxes); 163 | cudaFree(d_indices); 164 | cudaFree(d_num_indices); 165 | delete[] h_indices; 166 | 167 | return kept_boxes; 168 | } 169 | 170 | int main_eleven() 171 | { 172 | // 创建一组矩形框 173 | vector boxes = { 174 | {367.0, 38.0, 677.0, 318.0, 0.9,1}, 175 | {502.0, 38.0, 731.0, 318.0, 0.8,2}, 176 | {303.0, 378.0, 831.0, 1071.0, 0.8,2}, 177 | {193.0, 435.0, 831.0, 931.0, 0.7,3}, 178 | {1039.0, 147.0, 1471.0, 557.0, 0.6,4}, 179 | {1339, 1.0,1571.0, 209.0, 0.5,5} 180 | }; 181 | 182 | 183 | 184 | 185 | 186 | // 执行NMS算法 187 | vector kept_boxes = nms(boxes, 0.2); 188 | 189 | 190 | 191 | // 输出结果 192 | cv::Mat image = cv::imread("image.jpg"); 193 | 194 | for (nms_box box : kept_boxes) { 195 | cout << box.x1 << ", " << box.y1 << ", " << box.x2 << ", " << box.y2 << ", " << box.score << endl; 196 | 197 | cv::Point p1(box.x1, box.y1); 198 | cv::Point p2(box.x2, box.y2); 199 | cv::rectangle(image, p1, p2, cv::Scalar(0, 255, 0), 4, 1, 0);//矩形的两个顶点,两个顶点都包括在矩形内部 200 | } 201 | 202 | cv::resize(image, image, cv::Size(600, 400), 0, 0, cv::INTER_NEAREST); 203 | 204 | cv::imshow("www", image); 205 | cv::waitKey(100000); 206 | cv::destroyAllWindows(); 207 | 208 | return 0; 209 | } 210 | 211 | 212 | 213 | 214 | /* 215 | 216 | 在这个示例中,我们定义了一个名为 iou 的函数,用于计算两个矩形框之间的 IOU(交并比)。然后,我们定义了一个名为 nms_kernel 的核函数,用于执行 NMS 算法。在 nms_kernel 中,我们首先获取当前线程的索引 tid,并获取该线程对应的矩形框 box。然后,我们遍历所有矩形框,并计算当前矩形框与其他矩形框之间的 IOU 值。如果 IOU 值大于阈值 iou_threshold,则将该矩形框标记为不保留。最后,我们将结果存储在 indices 中。 217 | 218 | 在 nms 函数中,我们计算适当的块和网格大小,并调用 nms_kernel 核函数。 219 | 220 | 请注意,这个示例只是一个简单的实现,并且可能不适用于所有情况。在实际应用中,您可能需要根据具体情况进行修改和优化。 221 | */ 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | -------------------------------------------------------------------------------- /tutorials_first.cu: -------------------------------------------------------------------------------- 1 | /*! 
2 | 3 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 4 | 5 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 6 | 7 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 8 | 9 | 可以保证,认真学完教程,cuda编程毫无压力。 10 | 11 | 详情请链接:http://t.csdn.cn/NaCZ5 12 | 13 | 14 | 15 | 16 | @Description : 指针回顾 17 | @Author : tangjun 18 | @Date : 19 | */ 20 | 21 | 22 | #include 23 | #include 24 | 25 | using namespace std; 26 | 27 | 28 | 29 | 30 | /*************************************第一节-指针篇**********************************************/ 31 | void Print_Pointers(int* ptr, int N) { 32 | for (int i = 0; i < N; i++) 33 | { 34 | std::cout << "order:\t" << i << "\tptr_value:\t" << *ptr << "\tphysical address:" << ptr << std::endl; 35 | ptr++; 36 | } 37 | } 38 | 39 | void pointer_1() { 40 | /* 探索指针赋值方法 */ 41 | const int N = 6; 42 | int arr[N]; 43 | for (int i = 0; i < N; i++) arr[i] = i + 1; //数组赋值 44 | //指针第一种赋值方法 45 | int* ptr = nullptr; 46 | ptr = arr; 47 | //指针第二种赋值方法 48 | int* ptr2 = arr; 49 | 50 | std::cout << "output ptr1 " << std::endl; 51 | Print_Pointers(ptr, N); 52 | std::cout << "\n\noutput ptr2 " << std::endl; 53 | Print_Pointers(ptr2, N); 54 | 55 | //单独变量赋值 56 | int a = 20; 57 | int* p = &a; 58 | std::cout << "\n\noutput p value: \t" << *p << "\tphysical address:\t" << p << std::endl; 59 | 60 | } 61 | 62 | void pointer_2() { 63 | const int N = 6; 64 | int arr[N]; 65 | for (int i = 0; i < N; i++) arr[i] = i + 1; //数组赋值 66 | int* ptr = arr; //构建指针 67 | for (int i = 0; i < 5; i++) 68 | { 69 | std::cout << "ptr_value_" << i << ":\t" << *ptr << std::endl;; 70 | ptr++; 71 | } 72 | } 73 | 74 | 75 | void pointer_3() { 76 | int num = 4; 77 | int* p = # 78 | cout << "*p:\t" << *p << "\t p address:\t" << p << "\tnum value:\t" << num << "\tnum address:\t" << num << endl; 79 | 80 | *p = *p + 20; //通过指针更改地址的值 81 | cout << "*p:\t" << *p << "\t p address:\t" << p << "\tnum value:\t" << num << "\tnum address:\t" << num << endl; 82 | num = 30; //更改变量值 83 | cout << "*p:\t" << *p << "\t p address:\t" << p << "\tnum value:\t" << num << "\tnum address:\t" << num << endl; 84 | 85 | 86 | } 87 | 88 | void pointer_4() { 89 | int num = 4; 90 | int* p1 = # 91 | //指针的指针第一种赋值方法 92 | int** p2 = &p1; 93 | //指针的指针第二种赋值方法 94 | int** p3; 95 | p3 = &p1; 96 | 97 | cout << "num value:\t" << num << "\t num address:\t" << &num << endl; 98 | cout << "p1 value:\t" << *p1 << "\t p1 address:\t" << p1 << endl; 99 | cout << "p2 value:\t" << *p2 << "\t p2 address:\t" << p2 << endl; 100 | cout << "p3 value:\t" << *p3 << "\t p3 address:\t" << p3 << endl; 101 | 102 | cout << "p2 value:\t" << **p2 << "\t p2 address:\t" << *p2 << endl; 103 | } 104 | 105 | void main_first() { 106 | pointer_1(); 107 | pointer_2(); 108 | pointer_3(); 109 | pointer_4(); 110 | 111 | 112 | } 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /tutorials_four.cu: -------------------------------------------------------------------------------- 1 |  2 | 3 | /*! 
4 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 5 | 6 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 7 | 8 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 9 | 10 | 可以保证,认真学完教程,cuda编程毫无压力。 11 | 12 | 详情请链接:http://t.csdn.cn/NaCZ5 13 | 14 | 15 | 16 | @Description : CUDA函数基础篇 17 | @Author : tangjun 18 | @Date : 19 | */ 20 | 21 | 22 | 23 | #include 24 | #include 25 | #include "device_launch_parameters.h" 26 | #include 27 | #include 28 | 29 | using namespace std; 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | /*************************************第四节-CUDA函数基础篇**********************************************/ 38 | 39 | float sigmoid_host(float x) { 40 | float y = 1 / (1 + exp(-x)); 41 | return y; 42 | } 43 | 44 | __device__ float sigmoid(float x) { 45 | float y = 1 / (1 + exp(-x)); 46 | //float y = sigmoid_host(x); 47 | return y; 48 | } 49 | 50 | __global__ void test_kernel(float* a, float* c) { 51 | 52 | int idx = threadIdx.x; 53 | c[idx] = sigmoid(a[idx]); //正确方式 54 | //c[idx] = sigmoid_host(a[idx]);//绝对错误,无法调用,即:global函数无法调用host函数,只能调用devices函数 55 | 56 | 57 | } 58 | 59 | void Print_dim(float* ptr, int N) { 60 | for (int i = 0; i < N; i++) 61 | { 62 | std::cout << "value:\t" << ptr[i] << std::endl; 63 | 64 | } 65 | } 66 | 67 | void init_variables_float(float* a, int m, int n) { 68 | 69 | //初始化变量 70 | std::cout << "value of a:" << endl; 71 | for (int i = 0; i < m; i++) { 72 | for (int j = 0; j < n; j++) { 73 | a[i * n + j] = rand() / 4089; 74 | std::cout << "\t" << a[i * n + j]; 75 | 76 | } 77 | std::cout << "\n"; 78 | } 79 | 80 | 81 | } 82 | 83 | void global2device() { 84 | const int m = 4; 85 | const int n = 2; 86 | //分配host内存 87 | float* a, * c; 88 | cudaMallocHost((void**)&a, sizeof(float) * m * n); 89 | cudaMallocHost((void**)&c, sizeof(float) * m * n); 90 | //变量初始化 91 | init_variables_float(a, m, n); 92 | // 分配gpu内存并将host值复制到gpu变量中 93 | float* g_a; 94 | cudaMalloc((void**)&g_a, sizeof(float) * m * n); 95 | cudaMemcpy(g_a, a, sizeof(float) * m * n, cudaMemcpyHostToDevice); 96 | float* g_c; 97 | cudaMalloc((void**)&g_c, sizeof(float) * m * n); 98 | test_kernel << > > (g_a, g_c); 99 | cudaMemcpy(c, g_c, sizeof(float) * m * n, cudaMemcpyDeviceToHost); 100 | Print_dim(c, m * n); 101 | 102 | } 103 | 104 | __device__ __host__ float sigmoid_device_host(float x) { 105 | float y = 1 / (1 + exp(-x)); 106 | return y; 107 | } 108 | 109 | void host2device() { 110 | float y = sigmoid_device_host(1.25); 111 | std::cout << y << endl; 112 | std::cout << "success:host calling device+host " << endl; 113 | //以下执行失败 114 | try { 115 | float y = sigmoid_host(1.25); 116 | throw std::runtime_error("error: fail"); 117 | } 118 | catch (std::runtime_error err) { 119 | std::cout << "fail:host calling device" << endl; 120 | 121 | } 122 | 123 | } 124 | 125 | 126 | void main_four() { 127 | 128 | global2device();//host<--global<--device 129 | host2device(); 130 | 131 | 132 | 133 | } 134 | 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /tutorials_nine.cu: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | /*! 
5 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 6 | 7 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 8 | 9 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 10 | 11 | 可以保证,认真学完教程,cuda编程毫无压力。 12 | 13 | 详情请链接:http://t.csdn.cn/NaCZ5 14 | 15 | 16 | @Description : 原子操作篇 17 | @Author : tangjun 18 | @Date : 2023-8-21 19 | */ 20 | 21 | 22 | #include 23 | #include 24 | #include "opencv2/highgui.hpp" //实际上在/usr/include下 25 | #include "opencv2/opencv.hpp" 26 | #include "device_launch_parameters.h" 27 | #include 28 | #include 29 | #include 30 | using namespace cv; 31 | using namespace std; 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | extern "C" __global__ void kernel_func_error(int* counter, int* data_0) 43 | { 44 | // 计算线程号 45 | unsigned int block_index = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; 46 | unsigned int thread_index = block_index * blockDim.x * blockDim.y * blockDim.z + \ 47 | threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; 48 | 49 | // 统计结果 50 | int value = data_0[thread_index]; 51 | //printf("%d\n", value); 52 | counter[value] ++; 53 | } 54 | 55 | extern "C" __global__ void kernel_func_correct(int* counter, int* data_0) 56 | { 57 | // 计算线程号 58 | unsigned int block_index = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; 59 | unsigned int thread_index = block_index * blockDim.x * blockDim.y * blockDim.z + \ 60 | threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; 61 | 62 | // 统计结果 63 | int value = data_0[thread_index]; 64 | atomicAdd(&counter[value], 1); 65 | } 66 | 67 | 68 | 69 | 70 | int atomic_apply1() { 71 | 72 | const int N = 32; 73 | int* gpu_buffer; 74 | int* host_data = new int[N]; 75 | 76 | for (int i = 0; i < N; i++) { 77 | if (i%2==0) { 78 | host_data[i] = 1; 79 | } 80 | else { 81 | host_data[i] = 0; 82 | } 83 | 84 | } 85 | 86 | std::cout << "打印输入数据" << endl; 87 | for (int i = 0; i < N; i++) { std::cout << host_data[i] << "\t"; } 88 | 89 | 90 | cudaMalloc((void**)&gpu_buffer, N * sizeof(int)); 91 | 92 | 93 | cudaMemcpy(gpu_buffer, host_data, N * sizeof(int), cudaMemcpyHostToDevice); 94 | 95 | int* count = nullptr; 96 | cudaMalloc((void**)&count, 2 * sizeof(int)); 97 | int* host_count = nullptr; 98 | cudaMallocHost((void**)&host_count,2*sizeof(int)); 99 | host_count[0] = 0; 100 | host_count[1] = 0; 101 | cudaMemcpy(count, host_count, 2 * sizeof(int), cudaMemcpyHostToDevice); 102 | 103 | 104 | //kernel_func_error << > > (count, gpu_buffer); 105 | //kernel_func_correct << > > (count, gpu_buffer); 106 | 107 | 108 | auto T0 = std::chrono::system_clock::now(); //时间函数 109 | int num = 10000; 110 | int num_k = 60; 111 | 112 | for (int k = 0; k < num_k; k++) { 113 | for (int j = 0; j < num; j++) { 114 | //cudaMemcpy(count, host_count, 2 * sizeof(int), cudaMemcpyHostToDevice); 115 | kernel_func_error << > > (count, gpu_buffer); 116 | //kernel_func_correct << > > (count, gpu_buffer); 117 | 118 | } 119 | 120 | } 121 | 122 | auto T1 = std::chrono::system_clock::now(); 123 | float time_kernel = std::chrono::duration_cast(T1 - T0).count(); 124 | 125 | std::cout << "\n\n推理时间:\t " << time_kernel/ num_k << "ms\n\n" << endl; 126 | 127 | cudaMemcpy(host_count, count, 2 * sizeof(int), cudaMemcpyDeviceToHost); 128 | 129 | std::cout << "\n打印输出结果\n\n" <<"说明:\t若为偶数,0与1数量相等,否则相差1个数\n\n" << "0的计数量:" << host_count[0]<<"\n1的计数量:"<< host_count[1]<< endl; 130 | 131 | 132 | 133 | 134 | 135 | return 0; 136 | } 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | //十进制转二进制 145 | 146 | 147 | void 
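/* prints bits 16..0 of val, most significant bit first; a small helper for
   visualizing the bitwise values used by the atomic examples in this file */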
printbinary(const unsigned int val) 148 | { 149 | for (int i = 16; i >= 0; i--) 150 | { 151 | if (val & (1 << i)) 152 | cout << "1"; 153 | else 154 | cout << "0"; 155 | } 156 | } 157 | 158 | int atomic_apply2() 159 | { 160 | int a = 6; 161 | int b = 4; 162 | std::cout << "\n打印变量a的二进制\n" << a << ":\t"; 163 | printbinary(a); 164 | std::cout << "\n打印变量b的二进制\n" < 50) { 205 | return; 206 | } 207 | gpu_output[index] = data[index] * 100; 208 | 209 | 210 | } 211 | 212 | 213 | 214 | 215 | int atomic_apply3() { 216 | 217 | const int N = 50; 218 | int* gpu_buffer[2]; 219 | 220 | int* host_data = new int[N]; 221 | 222 | 223 | for (int i = 0; i < N; i++) { host_data[i] = i + 6; } 224 | std::cout << "\n打印初始化" << endl; 225 | for (int i = 0; i < N; i++) { std::cout << host_data[i] << "\t"; } 226 | 227 | 228 | 229 | cudaMalloc((void**)&gpu_buffer[0], N * sizeof(int)); 230 | 231 | cudaMalloc((void**)&gpu_buffer[1], N * sizeof(int)); 232 | 233 | 234 | cudaMemcpy(gpu_buffer[0], host_data, N * sizeof(int), cudaMemcpyHostToDevice); 235 | kernel << <1, N >> > (gpu_buffer[0], gpu_buffer[1]); 236 | 237 | 238 | int* cpu_output = new int[N]; 239 | 240 | cudaMemcpy(cpu_output, gpu_buffer[1], N * sizeof(int), cudaMemcpyDeviceToHost); 241 | 242 | std::cout << "\n打印输出结果" << endl; 243 | for (int i = 0; i < N; i++) { std::cout << cpu_output[i] << "\t"; } 244 | 245 | 246 | 247 | return 0; 248 | } 249 | 250 | 251 | 252 | 253 | 254 | __global__ void kernel2(int* data, int* gpu_output, int N) { 255 | 256 | int count = data[0]; 257 | //printf("count:%d\n", count); 258 | 259 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 260 | 261 | if ((tid + 1) % 2 != 0) { return; } 262 | 263 | 264 | int index = atomicAdd(gpu_output, 1); 265 | 266 | //printf("index:%d\n ", (index)); 267 | 268 | if (index >= N / 2) return; 269 | 270 | gpu_output[index] = data[tid]; 271 | 272 | 273 | } 274 | 275 | 276 | int atomic_apply4() { 277 | 278 | 279 | const int N = 50; //使用偶数验证 280 | int* gpu_buffer[2]; 281 | 282 | int* host_data = new int[N]; 283 | 284 | 285 | for (int i = 0; i < N; i++) { host_data[i] = i + 1; } 286 | std::cout << "\n打印初始化" << endl; 287 | for (int i = 0; i < N; i++) { std::cout << host_data[i] << "\t"; } 288 | 289 | 290 | 291 | cudaMalloc((void**)&gpu_buffer[0], N * sizeof(int)); 292 | 293 | cudaMalloc((void**)&gpu_buffer[1], (N / 2) * sizeof(int)); 294 | 295 | 296 | cudaMemcpy(gpu_buffer[0], host_data, N * sizeof(int), cudaMemcpyHostToDevice); 297 | cudaStream_t stream; 298 | cudaStreamCreate(&stream); //stream初始化 299 | 300 | //cudaMemsetAsync(gpu_buffer[1], 0, sizeof(int) * N/2,stream); 301 | kernel2 << <1, N, 0, stream >> > (gpu_buffer[0], gpu_buffer[1], N); 302 | 303 | 304 | int* cpu_output = new int[N / 2]; 305 | 306 | cudaMemcpy(cpu_output, gpu_buffer[1], N / 2 * sizeof(int), cudaMemcpyDeviceToHost); 307 | 308 | std::cout << "\n打印输出结果" << endl; 309 | for (int i = 0; i < N / 2; i++) { std::cout << cpu_output[i] << "\t"; } 310 | 311 | 312 | 313 | return 0; 314 | } 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | //使用原子操作,计数 323 | 324 | __global__ void countValues(float* list, int* count, int n) 325 | { 326 | int i = blockIdx.x * blockDim.x + threadIdx.x; 327 | if (i < n) { 328 | atomicAdd(count, 1); 329 | 330 | } 331 | } 332 | 333 | int atomic_apply5() 334 | { 335 | // 假设你已经有一个包含n个数值的列表list 336 | 337 | const int n = 32; 338 | 339 | float* d_list = nullptr; 340 | 341 | 342 | 343 | int count = 0; 344 | cudaMallocHost((void**)&d_list, n * sizeof(float)); 345 | for (int i = 0; i < n; i++) { d_list[i] = i + 1; } 346 | std::cout << 
"d_list:" << endl; 347 | for (int i = 0; i < n; i++) { std::cout << d_list[i] << "\t"; } 348 | 349 | 350 | int* d_count = nullptr; 351 | cudaMalloc((void**)&d_count, sizeof(int)); 352 | 353 | cudaMemcpy(d_count, &count, sizeof(int), cudaMemcpyHostToDevice); 354 | 355 | // 定义块和线程的数量 356 | int blockSize = 256; 357 | int numBlocks = (n + blockSize - 1) / blockSize; 358 | 359 | // 调用核函数 360 | countValues << > > (d_list, d_count, n); 361 | 362 | // 将计数器的值从设备端复制回主机端 363 | cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDeviceToHost); 364 | 365 | // 输出结果 366 | printf("Number of non-zero values: %d\n", count); 367 | 368 | // 释放内存 369 | cudaFree(d_list); 370 | cudaFree(d_count); 371 | 372 | return 0; 373 | } 374 | 375 | 376 | 377 | 378 | __global__ void kernel(int* data) { 379 | int tid = threadIdx.x; 380 | atomicAnd(&data[tid], 0x0F); 381 | } 382 | 383 | int atomic_apply6() { 384 | int data[16] = { 0 }; 385 | int* d_data; 386 | cudaMalloc(&d_data, sizeof(int) * 16); 387 | cudaMemcpy(d_data, data, sizeof(int) * 16, cudaMemcpyHostToDevice); 388 | 389 | kernel << <1, 16 >> > (d_data); 390 | 391 | cudaMemcpy(data, d_data, sizeof(int) * 16, cudaMemcpyDeviceToHost); 392 | cudaFree(d_data); 393 | 394 | for (int i = 0; i < 16; ++i) { 395 | printf("%d ", data[i]); 396 | } 397 | printf("\n"); 398 | 399 | return 0; 400 | } 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | int main_nine() { 410 | 411 | 412 | //atomic_apply1(); 413 | atomic_apply2(); 414 | //atomic_apply3(); 415 | //atomic_apply4(); 416 | //atomic_apply5(); 417 | //atomic_apply6(); 418 | return 0; 419 | } 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | -------------------------------------------------------------------------------- /tutorials_seven.cu: -------------------------------------------------------------------------------- 1 | /*! 
2 | 3 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 4 | 5 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 6 | 7 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 8 | 9 | 可以保证,认真学完教程,cuda编程毫无压力。 10 | 11 | 详情请链接:http://t.csdn.cn/NaCZ5 12 | 13 | 14 | @Description : CUDA kenel计算应用示例篇 15 | @Author : tangjun 16 | @Date : 17 | */ 18 | 19 | 20 | 21 | 22 | 23 | #include 24 | #include 25 | #include "opencv2/highgui.hpp" //实际上在/usr/include下 26 | #include "opencv2/opencv.hpp" 27 | #include "device_launch_parameters.h" 28 | #include 29 | #include 30 | using namespace cv; 31 | using namespace std; 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | __global__ void hello_from_gpu() 40 | { 41 | const int blockid = blockIdx.x; 42 | const int threadid = threadIdx.x; 43 | printf("block index %d and thread idex %d!\n", blockid, threadid); 44 | } 45 | int kernel_apply1(void) 46 | { 47 | hello_from_gpu << <6, 5 >> > (); 48 | cudaDeviceSynchronize(); 49 | return 0; 50 | } 51 | 52 | 53 | 54 | __global__ void VecAdd1(int* A, int* B, int* C) 55 | { 56 | int i = threadIdx.x; 57 | C[i] = A[i] + B[i]; 58 | } 59 | 60 | int kernel_apply2() 61 | { 62 | int m = 8; 63 | int* a, * b, * c; 64 | //分配host内存 65 | cudaMallocHost((void**)&a, sizeof(int) * m); 66 | cudaMallocHost((void**)&b, sizeof(int) * m); 67 | cudaMallocHost((void**)&c, sizeof(int) * m); 68 | 69 | std::cout << "value of a:" << endl; 70 | for (int i = 0; i < m; i++) { 71 | a[i] = rand() % 256; 72 | std::cout << a[i] << "\t"; 73 | } 74 | std::cout << "\nvalue of b:" << endl; 75 | for (int i = 0; i < m; i++) { 76 | b[i] = rand() % 260; 77 | std::cout << b[i] << "\t"; 78 | } 79 | 80 | int* g_a, * g_b, * g_c; 81 | //分配gpu内存 82 | cudaMalloc((void**)&g_a, sizeof(int) * m); 83 | cudaMalloc((void**)&g_b, sizeof(int) * m); 84 | cudaMalloc((void**)&g_c, sizeof(int) * m); 85 | // 赋值 86 | cudaMemcpy(g_a, a, sizeof(int) * m, cudaMemcpyHostToDevice); 87 | cudaMemcpy(g_b, b, sizeof(int) * m, cudaMemcpyHostToDevice); 88 | 89 | dim3 dimGrid(1); 90 | dim3 dimBlock(m); 91 | 92 | //应用grid只有x方向一个block,block只有x方向m个third 93 | VecAdd1 << > > (g_a, g_b, g_c); 94 | //VecAdd1 << > > (g_a, g_b, g_c); 95 | //VecAdd1 << <1, m >> > (g_a, g_b, g_c); 96 | 97 | //将g_c赋值给c 98 | cudaMemcpy(c, g_c, sizeof(int) * m, cudaMemcpyDeviceToHost); 99 | //打印 100 | std::cout << "\nvalue of c:" << endl; 101 | for (int i = 0; i < m; i++) { 102 | std::cout << c[i] << "\t"; 103 | } 104 | 105 | 106 | 107 | 108 | 109 | //释放内存 110 | cudaFree(g_a); 111 | cudaFree(g_b); 112 | cudaFree(g_c); 113 | cudaFreeHost(a); 114 | cudaFreeHost(b); 115 | cudaFreeHost(c); 116 | 117 | return 0; 118 | } 119 | 120 | 121 | 122 | 123 | __global__ void MatAdd2(int A[8], int B[8], int C[8]) 124 | { 125 | int i = threadIdx.x; 126 | C[i] = A[i] + B[i]; 127 | printf("\ni=%i", i); 128 | 129 | //std::cout <<"核函数:" << std::endl; 130 | } 131 | 132 | 133 | int kernel_apply3() 134 | { 135 | const int m = 8; 136 | int a[m], b[m], c[m]; 137 | //int *a, *b, *c; 138 | //int* a, * b, c[m]; 139 | 140 | //分配host内存 141 | cudaMallocHost((void**)&a, sizeof(int) * m * m); 142 | cudaMallocHost((void**)&b, sizeof(int) * m); 143 | cudaMallocHost((void**)&c, sizeof(int) * m); 144 | 145 | std::cout << "value of a:" << endl; 146 | for (int i = 0; i < m; i++) { 147 | a[i] = rand() % 69; 148 | std::cout << a[i] << "\t"; 149 | } 150 | 151 | std::cout << "value of b:" << endl; 152 | for (int j = 0; j < m; j++) { 153 | b[j] = rand() % 25; 154 | std::cout << b[j] << "\t"; 155 | } 156 | 157 | int* g_a, * g_b, * g_c; 158 | 159 | //分配gpu内存 160 | cudaMalloc((void**)&g_a, sizeof(int) * m * m); 
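/* Caution: a, b and c were declared above as fixed-size stack arrays, so the
   cudaMallocHost calls on (void**)&a etc. overwrite the start of those arrays
   with the pinned-buffer pointers, and the pinned buffers themselves are never
   used or freed. The sizes are also inconsistent: g_a is allocated m * m ints
   here, but only m ints are copied into it below. Declaring a, b and c as int*
   and letting cudaMallocHost allocate them (as kernel_apply2 does) would make
   this example consistent. */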
161 | cudaMalloc((void**)&g_b, sizeof(int) * m); 162 | cudaMalloc((void**)&g_c, sizeof(int) * m); 163 | // 赋值 164 | cudaMemcpy(g_a, a, sizeof(int) * m, cudaMemcpyHostToDevice); 165 | cudaMemcpy(g_b, b, sizeof(int) * m, cudaMemcpyHostToDevice); 166 | 167 | 168 | MatAdd2 << <1, m >> > (g_a, g_b, g_c); 169 | //cudaDeviceSynchronize(); 170 | 171 | cudaMemcpy(c, g_c, sizeof(int) * m, cudaMemcpyDeviceToHost); 172 | 173 | std::cout << "value of c:" << endl; 174 | for (int j = 0; j < m; j++) { 175 | std::cout << c[j] << "\t"; 176 | } 177 | return 0; 178 | } 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | //用于CV读取图片BGR通道将其改为RGB方法 191 | __global__ void rgb2grayincuda(uchar3* const d_in, unsigned char* const d_out, 192 | uint imgheight, uint imgwidth) 193 | { 194 | const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; //w 195 | const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; //h 196 | 197 | if (idx < imgwidth && idy < imgheight) //有的线程会跑到图像外面去,不执行即可 198 | { 199 | uchar3 rgb = d_in[idy * imgwidth + idx]; 200 | d_out[idy * imgwidth + idx] = 0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z; 201 | 202 | } 203 | } 204 | 205 | 206 | void show_img(Mat img) { 207 | cv::imshow("Image", img); 208 | cv::waitKey(1000); 209 | cv::destroyAllWindows(); 210 | 211 | } 212 | 213 | 214 | void kernel_apply4() { 215 | Mat srcImage = imread("image.jpg"); 216 | show_img(srcImage); 217 | 218 | const uint imgheight = srcImage.rows; 219 | const uint imgwidth = srcImage.cols; 220 | 221 | Mat grayImage(imgheight, imgwidth, CV_8UC1, Scalar(0)); 222 | 223 | 224 | 225 | uchar3* d_in; //向量类型,3个uchar 226 | unsigned char* d_out; 227 | 228 | cudaMalloc((void**)&d_in, imgheight * imgwidth * sizeof(uchar3)); 229 | cudaMalloc((void**)&d_out, imgheight * imgwidth * sizeof(unsigned char)); 230 | 231 | 232 | cudaMemcpy(d_in, srcImage.data, imgheight * imgwidth * sizeof(uchar3), cudaMemcpyHostToDevice); 233 | 234 | //说明:(imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x表示x方向 235 | dim3 threadsPerBlock(32, 32); 236 | dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y); 237 | 238 | 239 | //启动内核 240 | rgb2grayincuda << > > (d_in, d_out, imgheight, imgwidth); 241 | 242 | //执行内核是一个异步操作,因此需要同步以测量准确时间 243 | cudaDeviceSynchronize(); 244 | 245 | 246 | 247 | 248 | //拷贝回来数据 249 | cudaMemcpy(grayImage.data, d_out, imgheight * imgwidth * sizeof(unsigned char), cudaMemcpyDeviceToHost); 250 | 251 | //释放显存 252 | cudaFree(d_in); 253 | cudaFree(d_out); 254 | 255 | imshow("grayImage", grayImage); 256 | cv::waitKey(1000); 257 | cv::destroyAllWindows(); 258 | 259 | 260 | } 261 | 262 | 263 | 264 | 265 | typedef struct { 266 | int width; 267 | int height; 268 | float* elements; 269 | 270 | }Matrix; 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | void main_seven() 281 | { 282 | 283 | //kernel_apply1(); 284 | //kernel_apply2(); 285 | //kernel_apply2(); 286 | kernel_apply4(); 287 | 288 | 289 | 290 | } 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | -------------------------------------------------------------------------------- /tutorials_six.cu: -------------------------------------------------------------------------------- 1 |  2 | /*! 
3 | 4 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 5 | 6 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 7 | 8 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 9 | 10 | 可以保证,认真学完教程,cuda编程毫无压力。 11 | 12 | 详情请链接:http://t.csdn.cn/NaCZ5 13 | 14 | 15 | 16 | @Description : CUDA矩阵的加减乘除 17 | @Author : tangjun 18 | @Date : 19 | */ 20 | 21 | 22 | 23 | #include 24 | #include 25 | #include "opencv2/highgui.hpp" //实际上在/usr/include下 26 | #include "opencv2/opencv.hpp" 27 | #include "device_launch_parameters.h" 28 | #include 29 | #include 30 | using namespace cv; 31 | using namespace std; 32 | 33 | 34 | 35 | 36 | 37 | void Print_2dim(int* ptr, int m, int n) { 38 | std::cout << "result:\n"; 39 | for (int i = 0; i < m; i++) { 40 | for (int j = 0; j < n; j++) { 41 | std::cout << "\t" << ptr[i * n + j]; 42 | } 43 | std::cout << "\n"; 44 | } 45 | } 46 | 47 | __global__ void gpu_matrix_plus_thread(int* a, int* b, int* c) 48 | { 49 | //方法一:通过id方式计算 50 | //grid为2维度,block为2维度,使用公式id=blocksize * blockid + threadid 51 | int blocksize = blockDim.x * blockDim.y; 52 | int blockid = gridDim.x * blockIdx.y + blockIdx.x; 53 | int threadid = blockDim.x * threadIdx.y + threadIdx.x; 54 | int id = blocksize * blockid + threadid; 55 | 56 | c[id] = a[id] + b[id]; 57 | 58 | } 59 | 60 | 61 | __global__ void gpu_matrix_plus1(int* a, int* b, int* c, int m, int n) 62 | { //方法二:通过row与col的方式计算-->通过变换列给出id 63 | int row = blockIdx.y * blockDim.y + threadIdx.y; 64 | int col = blockIdx.x * blockDim.x + threadIdx.x; 65 | c[row * n + col] = a[row * n + col] + b[row * n + col]; 66 | } 67 | 68 | 69 | __global__ void gpu_matrix_plus2(int* a, int* b, int* c, int m, int n) 70 | { //方法三:通过row与col的方式计算-->通过变换行给出id 71 | int row = blockIdx.y * blockDim.y + threadIdx.y; 72 | int col = blockIdx.x * blockDim.x + threadIdx.x; 73 | c[row + col * m] = a[row + col * m] + b[row + col * m]; 74 | } 75 | 76 | 77 | void init_variables(int* a, int* b, int m, int n) { 78 | 79 | //初始化变量 80 | std::cout << "value of a:" << endl; 81 | for (int i = 0; i < m; i++) { 82 | for (int j = 0; j < n; j++) { 83 | a[i * n + j] = rand() % 256; 84 | std::cout << "\t" << a[i * n + j]; 85 | 86 | } 87 | std::cout << "\n"; 88 | } 89 | std::cout << "value of b:" << endl; 90 | for (int i = 0; i < m; i++) { 91 | for (int j = 0; j < n; j++) { 92 | b[i * n + j] = rand() % 256; 93 | std::cout << "\t" << b[i * n + j]; 94 | } 95 | std::cout << "\n"; 96 | } 97 | std::cout << "value of a+b:" << endl; 98 | for (int i = 0; i < m; i++) { 99 | for (int j = 0; j < n; j++) { 100 | 101 | std::cout << "\t" << a[i * n + j] + b[i * n + j]; 102 | } 103 | std::cout << "\n"; 104 | } 105 | 106 | 107 | } 108 | 109 | int kernel_plus() 110 | { 111 | /* 112 | matrix a[m,n], matrix b[m,n] 113 | a[m,n]+b[m,n]=[m,n] 114 | */ 115 | 116 | const int BLOCK_SIZE = 2; 117 | int m = 8; //行 118 | int n = 10; //列 119 | int* a, * b; 120 | //分配host内存 121 | cudaMallocHost((void**)&a, sizeof(int) * m * n); 122 | cudaMallocHost((void**)&b, sizeof(int) * m * n); 123 | 124 | init_variables(a, b, m, n);//随机初始化变量 125 | 126 | int* g_a, * g_b; 127 | //分配gpu内存 128 | cudaMalloc((void**)&g_a, sizeof(int) * m * n); 129 | cudaMalloc((void**)&g_b, sizeof(int) * m * n); 130 | cudaMemcpy(g_a, a, sizeof(int) * m * n, cudaMemcpyHostToDevice); 131 | cudaMemcpy(g_b, b, sizeof(int) * m * n, cudaMemcpyHostToDevice); 132 | 133 | unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE; //行写给y 134 | unsigned int grid_cols = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; //列写给x 135 | dim3 dimGrid(grid_cols, grid_rows); 136 | dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); 137 
| 138 | 139 | std::cout << "gridDIM.x:" << grid_cols << "\tgridDIM.y:" << grid_rows << endl; 140 | std::cout << "blockDIM.x:" << BLOCK_SIZE << "\tblockDIM.y:" << BLOCK_SIZE << endl; 141 | 142 | 143 | 144 | 145 | int* c1, * g_c; 146 | cudaMalloc((void**)&g_c, sizeof(int) * m * n); 147 | cudaMallocHost((void**)&c1, sizeof(int) * m * n); 148 | gpu_matrix_plus_thread << > > (g_a, g_b, g_c); 149 | cudaMemcpy(c1, g_c, sizeof(int) * m * n, cudaMemcpyDeviceToHost); 150 | Print_2dim(c1, m, n); 151 | 152 | 153 | int* c2, * g_c2; 154 | cudaMallocHost((void**)&c2, sizeof(int) * m * n); 155 | cudaMalloc((void**)&g_c2, sizeof(int) * m * n); 156 | gpu_matrix_plus1 << > > (g_a, g_b, g_c2, m, n); 157 | cudaMemcpy(c2, g_c2, sizeof(int) * m * n, cudaMemcpyDeviceToHost); //将device端转host端 158 | Print_2dim(c2, m, n); 159 | 160 | int* c3, * g_c3; 161 | cudaMallocHost((void**)&c3, sizeof(int) * m * n); 162 | cudaMalloc((void**)&g_c3, sizeof(int) * m * n); 163 | gpu_matrix_plus2 << > > (g_a, g_b, g_c3, m, n); 164 | cudaMemcpy(c3, g_c3, sizeof(int) * m * n, cudaMemcpyDeviceToHost); //将device端转host端 165 | Print_2dim(c3, m, n); 166 | 167 | //释放内存 168 | cudaFree(g_a); 169 | cudaFree(g_b); 170 | cudaFree(g_c); 171 | cudaFreeHost(a); 172 | cudaFreeHost(b); 173 | cudaFreeHost(c1); 174 | 175 | return 0; 176 | } 177 | 178 | 179 | __global__ void gpu_matrix_mult(int* a, int* b, int* c, int m, int n, int k) 180 | { 181 | int row = blockIdx.y * blockDim.y + threadIdx.y; // 行线程 y 182 | int col = blockIdx.x * blockDim.x + threadIdx.x; // 列线程 x 183 | int sum = 0; 184 | if (col < k && row < m) { 185 | for (int i = 0; i < n; i++) { 186 | sum += a[row * n + i] * b[i * k + col]; //看出row与col不动的方式计算 187 | } 188 | c[row * k + col] = sum; 189 | } 190 | } 191 | 192 | __global__ void gpu_matrix_multiply_thread(int* a, int* b, int* c, int m, int n, int k) 193 | { 194 | // [m*n]* [n*k]<---m n 195 | //方法一:通过id方式计算 196 | //grid为2维度,block为2维度,使用公式id=blocksize * blockid + threadid 197 | int blocksize = blockDim.x * blockDim.y; 198 | int blockid = gridDim.x * blockIdx.y + blockIdx.x; 199 | int threadid = blockDim.x * threadIdx.y + threadIdx.x; 200 | int id = blocksize * blockid + threadid; 201 | 202 | int row = id / k; 203 | int col = id % k; 204 | int sum = 0; 205 | 206 | for (int i = 0; i < n; i++) { 207 | sum += a[row * n + i] * b[i * k + col]; 208 | 209 | } 210 | c[row * k + col] = sum; 211 | 212 | 213 | } 214 | 215 | int kernel_multiply() 216 | { 217 | /* 218 | matrix a[m,n], matrix b[n,k] 219 | a[m,n]*b[n,k]=[m,k] 220 | */ 221 | 222 | const int BLOCK_SIZE = 2; 223 | int m = 8; //行 224 | int n = 4; //中间变量 225 | int k = 10; //列 226 | int* a, * b; 227 | 228 | // 初始化 a与b 229 | cudaMallocHost((void**)&a, sizeof(int) * m * n); 230 | cudaMallocHost((void**)&b, sizeof(int) * n * k); 231 | 232 | std::cout << "value of a:" << endl; 233 | for (int i = 0; i < m; i++) { 234 | for (int j = 0; j < n; j++) { 235 | a[i * n + j] = rand() % 6; 236 | std::cout << "\t" << a[i * n + j]; 237 | } 238 | std::cout << "\n"; 239 | } 240 | 241 | std::cout << "value of b:" << endl; 242 | for (int i = 0; i < n; i++) { 243 | for (int j = 0; j < k; j++) { 244 | b[i * k + j] = rand() % 10; 245 | std::cout << "\t" << b[i * k + j]; 246 | } 247 | std::cout << "\n"; 248 | } 249 | 250 | //a*b相乘 251 | std::cout << "value of a*b:" << endl; 252 | for (int i = 0; i < m; i++) { 253 | for (int j = 0; j < k; j++) { 254 | int tmp = 0; 255 | for (int h = 0; h < n; h++) { 256 | tmp += a[i * n + h] * b[h * k + j]; 257 | } 258 | //c[i * k + j] = tmp; 259 | std::cout << "\t" << tmp; 260 | } 
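/* one row of the CPU reference product is finished; the GPU kernels below
   (gpu_matrix_mult and gpu_matrix_multiply_thread) should reproduce these values */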
261 | std::cout << "\n"; 262 | } 263 | 264 | 265 | 266 | 267 | int* g_a, * g_b; 268 | 269 | cudaMalloc((void**)&g_a, sizeof(int) * m * n); 270 | cudaMalloc((void**)&g_b, sizeof(int) * n * k); 271 | cudaMemcpy(g_a, a, sizeof(int) * m * n, cudaMemcpyHostToDevice); 272 | cudaMemcpy(g_b, b, sizeof(int) * k * n, cudaMemcpyHostToDevice); 273 | unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE; //行写给y 274 | unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE; //列写给x 275 | dim3 dimGrid(grid_cols, grid_rows); 276 | dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); 277 | std::cout << "gridDIM.x:" << grid_cols << "\tgridDIM.y:" << grid_rows << endl; 278 | std::cout << "blockDIM.x:" << BLOCK_SIZE << "\tblockDIM.y:" << BLOCK_SIZE << endl; 279 | 280 | // 使用row与col计算 281 | int* c1, * g_c; 282 | cudaMalloc((void**)&g_c, sizeof(int) * m * k); 283 | cudaMallocHost((void**)&c1, sizeof(int) * m * k); 284 | gpu_matrix_mult << > > (g_a, g_b, g_c, m, n, k); 285 | cudaMemcpy(c1, g_c, sizeof(int) * m * k, cudaMemcpyDeviceToHost); 286 | Print_2dim(c1, m, k); 287 | cudaFree(g_c); 288 | cudaFreeHost(c1); 289 | 290 | //使用id计算 291 | int* c2, * g_c2; 292 | cudaMalloc((void**)&g_c2, sizeof(int) * m * k); 293 | cudaMallocHost((void**)&c2, sizeof(int) * m * k); 294 | gpu_matrix_multiply_thread << > > (g_a, g_b, g_c2, m, n, k); 295 | cudaMemcpy(c2, g_c2, sizeof(int) * m * k, cudaMemcpyDeviceToHost); 296 | Print_2dim(c2, m, k); 297 | cudaFree(g_c2); 298 | cudaFreeHost(c2); 299 | 300 | //释放内存 301 | cudaFree(g_a); 302 | cudaFree(g_b); 303 | cudaFreeHost(a); 304 | cudaFreeHost(b); 305 | 306 | 307 | return 0; 308 | } 309 | 310 | void main_six() { 311 | kernel_plus(); 312 | kernel_multiply(); 313 | 314 | } 315 | 316 | 317 | 318 | 319 | -------------------------------------------------------------------------------- /tutorials_ten.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | 3 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 4 | 5 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 6 | 7 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 8 | 9 | 可以保证,认真学完教程,cuda编程毫无压力。 10 | 11 | 详情请链接:http://t.csdn.cn/NaCZ5 12 | 13 | 14 | 15 | 16 | @Description : stream 17 | @Author : tangjun 18 | @Date : 2023-08-23 19 | */ 20 | 21 | 22 | 23 | 24 | #include "cuda_runtime.h" 25 | #include 26 | #include 27 | #include 28 | using namespace std; 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | __global__ void kernel_one(int* a, int* b, int* c) 37 | { 38 | int threadID = blockIdx.x * blockDim.x + threadIdx.x; 39 | //printf("threadID:%d\n", threadID); 40 | 41 | c[threadID] = a[threadID] + b[threadID]; 42 | 43 | } 44 | 45 | int stream_apply1() 46 | { 47 | int N = 32; 48 | const int FULL_DATA_SIZE = N * 2; 49 | //获取设备属性 50 | cudaDeviceProp prop; 51 | int deviceID; 52 | cudaGetDevice(&deviceID); 53 | cudaGetDeviceProperties(&prop, deviceID); 54 | //检查设备是否支持重叠功能 55 | if (!prop.deviceOverlap) 56 | { 57 | printf("No device will handle overlaps. 
so no speed up from stream.\n"); 58 | return 0; 59 | } 60 | 61 | //启动计时器 62 | cudaEvent_t start, stop; 63 | float elapsedTime; 64 | cudaEventCreate(&start); 65 | cudaEventCreate(&stop); 66 | cudaEventRecord(start, 0); 67 | 68 | //创建一个CUDA流 69 | cudaStream_t stream; 70 | cudaStreamCreate(&stream); 71 | 72 | int* host_a, * host_b, * host_c; 73 | int* dev_a, * dev_b, * dev_c; 74 | 75 | //在GPU上分配内存 76 | cudaMalloc((void**)&dev_a, N * sizeof(int)); 77 | cudaMalloc((void**)&dev_b, N * sizeof(int)); 78 | cudaMalloc((void**)&dev_c, N * sizeof(int)); 79 | 80 | //在CPU上分配页锁定内存 81 | cudaHostAlloc((void**)&host_a, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault); 82 | cudaHostAlloc((void**)&host_b, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault); 83 | cudaHostAlloc((void**)&host_c, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault); 84 | 85 | //主机上的内存赋值 86 | for (int i = 0; i < FULL_DATA_SIZE; i++) 87 | { 88 | host_a[i] = i; 89 | host_b[i] = 10000 * i; 90 | } 91 | 92 | for (int i = 0; i < FULL_DATA_SIZE; i += N) 93 | { 94 | cudaMemcpyAsync(dev_a, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream); 95 | cudaMemcpyAsync(dev_b, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream); 96 | 97 | 98 | kernel_one << > > (dev_a, dev_b, dev_c); 99 | 100 | cudaMemcpyAsync(host_c + i, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost, stream); 101 | } 102 | 103 | // wait until gpu execution finish 104 | cudaStreamSynchronize(stream); 105 | 106 | cudaEventRecord(stop, 0); 107 | cudaEventSynchronize(stop); 108 | cudaEventElapsedTime(&elapsedTime, start, stop); 109 | 110 | std::cout << "消耗时间: " << elapsedTime << std::endl; 111 | 112 | 113 | 114 | cout << "输入数据host_a" << endl; 115 | for (int i = 0; i < FULL_DATA_SIZE; i++) { std::cout << host_a[i] << "\t"; } 116 | cout << "\n输入数据host_b" << endl; 117 | for (int i = 0; i < FULL_DATA_SIZE; i++) { std::cout << host_b[i] << "\t"; } 118 | 119 | cout << "\n输出结果host_c" << endl; 120 | for (int i = 0; i < FULL_DATA_SIZE; i++) {std::cout << host_c[i] << "\t"; } 121 | 122 | getchar(); 123 | 124 | // free stream and mem 125 | cudaFreeHost(host_a); 126 | cudaFreeHost(host_b); 127 | cudaFreeHost(host_c); 128 | 129 | cudaFree(dev_a); 130 | cudaFree(dev_b); 131 | cudaFree(dev_c); 132 | 133 | cudaStreamDestroy(stream); 134 | return 0; 135 | } 136 | 137 | int stream_apply2() 138 | { 139 | const int NS = 4; 140 | const int ND = 32; 141 | 142 | //创建CUDA流与初始化 143 | cudaStream_t streams[NS]; 144 | for (int i = 0; i < NS; i++) { cudaStreamCreate(&streams[i]); } 145 | 146 | 147 | int* host_a, * host_b, * host_c; 148 | int* dev_a, * dev_b, * dev_c; 149 | 150 | //在GPU上分配内存 151 | //cudaMalloc((void**)&dev_a, ND * sizeof(int)); 152 | //cudaMalloc((void**)&dev_b, ND * sizeof(int)); 153 | //cudaMalloc((void**)&dev_c, ND * sizeof(int)); 154 | 155 | 156 | cudaMalloc((void**)&dev_a, ND * NS * sizeof(int)); 157 | cudaMalloc((void**)&dev_b, ND * NS * sizeof(int)); 158 | cudaMalloc((void**)&dev_c, ND * NS * sizeof(int)); 159 | 160 | //在CPU上分配页锁定内存 161 | cudaHostAlloc((void**)&host_a, ND*NS * sizeof(int), cudaHostAllocDefault); 162 | cudaHostAlloc((void**)&host_b, ND*NS * sizeof(int), cudaHostAllocDefault); 163 | cudaHostAlloc((void**)&host_c, ND*NS * sizeof(int), cudaHostAllocDefault); 164 | 165 | //主机上的内存赋值 166 | for (int i = 0; i < ND * NS; i++) { 167 | host_a[i] = i; 168 | host_b[i] = 10000 * i; } 169 | 170 | for (int i = 0; i < NS; i++) { 171 | cudaMemcpyAsync(dev_a + i * ND, host_a + i * ND, ND * sizeof(int), cudaMemcpyHostToDevice, streams[i]); 172 | cudaMemcpyAsync(dev_b + i * 
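/* per-stream offset: each stream owns its own ND-element slice of dev_a, dev_b and dev_c,
   unlike stream_apply3 below, where all streams share one device buffer and can race */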
int stream_apply3()
{
    const int NS = 4;   // number of streams
    const int ND = 32;  // elements each stream is responsible for

    cudaStream_t streams[NS];                                        // create several cuda streams
    for (int i = 0; i < NS; i++) { cudaStreamCreate(&streams[i]); }  // initialize each stream

    int* host_a, * host_b, * host_c;  // host-side variables
    int* dev_a, * dev_b, * dev_c;     // gpu-side variables

    // allocate device memory; note only ND elements are allocated and the
    // same buffers are reused by every stream (compare with stream_apply2)
    cudaMalloc((void**)&dev_a, ND * sizeof(int));
    cudaMalloc((void**)&dev_b, ND * sizeof(int));
    cudaMalloc((void**)&dev_c, ND * sizeof(int));

    // allocate page-locked host memory; cudaHostAlloc is required for async copies
    cudaHostAlloc((void**)&host_a, ND * NS * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&host_b, ND * NS * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&host_c, ND * NS * sizeof(int), cudaHostAllocDefault);

    // fill the host buffers
    for (int i = 0; i < ND * NS; i++) {
        host_a[i] = i;
        host_b[i] = 10000 * i;
    }
    // loop over the streams, queueing the copies and the kernel on each one;
    // caution: because every stream shares dev_a/dev_b/dev_c, concurrent streams
    // can race on these buffers -- stream_apply2 avoids this with per-stream offsets
    for (int i = 0; i < NS; i++) {
        cudaMemcpyAsync(dev_a, host_a + i * ND, ND * sizeof(int), cudaMemcpyHostToDevice, streams[i]);
        cudaMemcpyAsync(dev_b, host_b + i * ND, ND * sizeof(int), cudaMemcpyHostToDevice, streams[i]);
        // launch configuration reconstructed (assumption): one block of ND threads per stream
        kernel_one<<<1, ND, 0, streams[i]>>>(dev_a, dev_b, dev_c);
        cudaMemcpyAsync(host_c + i * ND, dev_c, ND * sizeof(int), cudaMemcpyDeviceToHost, streams[i]);
    }

    // wait until all async gpu work finishes before the cpu touches the results
    cudaDeviceSynchronize();

    // print the results
    cout << "input data host_a" << endl;
    for (int i = 0; i < ND * NS; i++) { std::cout << host_a[i] << "\t"; }
    cout << "\ninput data host_b" << endl;
    for (int i = 0; i < ND * NS; i++) { std::cout << host_b[i] << "\t"; }
    cout << "\noutput host_c" << endl;
    for (int i = 0; i < ND * NS; i++) { std::cout << host_c[i] << "\t"; }

    // free streams and memory
    cudaFreeHost(host_a);
    cudaFreeHost(host_b);
    cudaFreeHost(host_c);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    for (int i = 0; i < NS; i++) { cudaStreamDestroy(streams[i]); }
    return 0;
}

int main_ten() {

    //stream_apply1();
    stream_apply2();
    stream_apply3();

    return 0;
}
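/* Editor's sketch (assumption, not part of the original file): stream_apply3 reuses one
   set of device buffers across all four streams, so copies and kernels issued on different
   streams can race on dev_a/dev_b/dev_c. Besides the per-stream offsets of stream_apply2,
   another common fix is to give each stream private device buffers: */
int stream_apply3_fixed()
{
    const int NS = 4;
    const int ND = 32;
    cudaStream_t streams[NS];
    int* dev_a[NS], * dev_b[NS], * dev_c[NS];  // one buffer set per stream
    int* host_a, * host_b, * host_c;

    cudaHostAlloc((void**)&host_a, ND * NS * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&host_b, ND * NS * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&host_c, ND * NS * sizeof(int), cudaHostAllocDefault);
    for (int i = 0; i < ND * NS; i++) { host_a[i] = i; host_b[i] = 10000 * i; }

    for (int i = 0; i < NS; i++) {
        cudaStreamCreate(&streams[i]);
        cudaMalloc((void**)&dev_a[i], ND * sizeof(int));
        cudaMalloc((void**)&dev_b[i], ND * sizeof(int));
        cudaMalloc((void**)&dev_c[i], ND * sizeof(int));
    }
    for (int i = 0; i < NS; i++) {
        cudaMemcpyAsync(dev_a[i], host_a + i * ND, ND * sizeof(int), cudaMemcpyHostToDevice, streams[i]);
        cudaMemcpyAsync(dev_b[i], host_b + i * ND, ND * sizeof(int), cudaMemcpyHostToDevice, streams[i]);
        kernel_one<<<1, ND, 0, streams[i]>>>(dev_a[i], dev_b[i], dev_c[i]);  // same assumed launch shape as above
        cudaMemcpyAsync(host_c + i * ND, dev_c[i], ND * sizeof(int), cudaMemcpyDeviceToHost, streams[i]);
    }
    cudaDeviceSynchronize();  // no stream ever touches another stream's buffers

    for (int i = 0; i < NS; i++) {
        cudaFree(dev_a[i]); cudaFree(dev_b[i]); cudaFree(dev_c[i]);
        cudaStreamDestroy(streams[i]);
    }
    cudaFreeHost(host_a); cudaFreeHost(host_b); cudaFreeHost(host_c);
    return 0;
}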
--------------------------------------------------------------------------------
/tutorials_three.cu:
--------------------------------------------------------------------------------

/*!
My tutorials' advantage: a logically ordered series with source code and a strong practical focus.

Only the core cuda processing code is withheld inside the tutorial; I have already open-sourced a great deal of cuda tutorial code and put a lot of sweat into this series.

Therefore the core code (the yolo-deployment cuda code) and the full written tutorial require a small compensation -- thank you for understanding.

I can promise that if you study the tutorials carefully, cuda programming will be no trouble at all.

Details: http://t.csdn.cn/NaCZ5

@Description : CUDA compiler environment configuration
@Author : tangjun
@Date :
*/

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

int check_cuda_main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    int dev;
    for (dev = 0; dev < deviceCount; dev++)
    {
        int driver_version(0), runtime_version(0);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        if (dev == 0)
            if (deviceProp.minor == 9999 && deviceProp.major == 9999)  // 9999.9999 marks a device without CUDA support
                printf("\n");
        printf("\nDevice%d:\"%s\"\n", dev, deviceProp.name);
        cudaDriverGetVersion(&driver_version);
        printf("CUDA driver version: %d.%d\n", driver_version / 1000, (driver_version % 1000) / 10);
        cudaRuntimeGetVersion(&runtime_version);
        printf("CUDA runtime version: %d.%d\n", runtime_version / 1000, (runtime_version % 1000) / 10);
        printf("Device compute capability: %d.%d\n", deviceProp.major, deviceProp.minor);
        printf("Total amount of Global Memory: %zu bytes\n", deviceProp.totalGlobalMem);
        printf("Number of SMs: %d\n", deviceProp.multiProcessorCount);
        printf("Total amount of Constant Memory: %zu bytes\n", deviceProp.totalConstMem);
        printf("Total amount of Shared Memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf("Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("Warp size: %d\n", deviceProp.warpSize);
        printf("Maximum number of threads per SM: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf("Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf("Maximum size of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("Maximum size of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("Maximum memory pitch: %zu bytes\n", deviceProp.memPitch);
        printf("Texture alignment: %zu bytes\n", deviceProp.texturePitchAlignment);
        printf("Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
        printf("Memory Clock rate: %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
    }

    return 0;
}

void main_three() {

    check_cuda_main();

}
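/* Editor's sketch (assumption, not part of the original file): the same
   cudaGetDeviceProperties query shown above can also drive device selection on
   multi-GPU machines, e.g. picking the GPU with the most SMs before any CUDA work: */
int pick_device_with_most_sms() {
    int count = 0;
    cudaGetDeviceCount(&count);
    int best = 0, best_sms = -1;
    for (int dev = 0; dev < count; dev++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        if (prop.multiProcessorCount > best_sms) {  // SM count as a crude speed proxy
            best_sms = prop.multiProcessorCount;
            best = dev;
        }
    }
    cudaSetDevice(best);  // later CUDA calls on this thread target this device
    return best;
}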
--------------------------------------------------------------------------------
/tutorials_yolo_part_postprocess.cu:
--------------------------------------------------------------------------------

/*!
My tutorials' advantage: a logically ordered series with source code and a strong practical focus.

Only the core cuda processing code is withheld inside the tutorial; I have already open-sourced a great deal of cuda tutorial code and put a lot of sweat into this series.

Therefore the core code (the yolo-deployment cuda code) and the full written tutorial require a small compensation -- thank you for understanding.

I can promise that if you study the tutorials carefully, cuda programming will be no trouble at all.

Details: http://t.csdn.cn/NaCZ5

@Description : simulates the complete CUDA post-processing of the yolo output --> the final box, conf and cls_id; this part only provides the skeleton.
@Author : tangjun
@Date : 2023-08-10
*/

#include <iostream>
#include <cuda_runtime.h>
#include "opencv2/highgui.hpp" // actually lives under /usr/include
#include "opencv2/opencv.hpp"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
#include <cmath>
#include <vector>
#include <algorithm>
using namespace cv;
using namespace std;

struct nms_box {
    float x1, y1, x2, y2;  // top-left and bottom-right corners
    float score;           // confidence
    int cls_id;            // class index
};

void imitate_yolo_part_postprocess() {
    /*
    This code simulates the yolo post-processing pipeline.
    */

    float conf_thr = 0.3;  // confidence threshold
    float nms_thr = 0.1;   // boxes with iou > nms_thr are suppressed
    const int max_object = 6;

    int cls_num = 3;

    int anchor_output_num = 21;                     // number of candidate rows the model outputs, analogous to 25200
    int N_obj = anchor_output_num * (cls_num + 5);  // the model outputs x,y,w,h + conf + cls_num scores per row

    /*********************** simulate the yolo output ***********************/
    float* input_data = nullptr;
    cudaMallocHost((void**)&input_data, sizeof(float) * N_obj);

    // fill with random values
    for (int i = 0; i < N_obj; i++) {
        //input_data[i] = (float)(i+1) ;
        float value = rand() / float(RAND_MAX);
        input_data[i] = round(value * 10000) / 10000;
    }
    // overwrite the first six rows with hand-made boxes
    for (int i = 0; i < 6; i++) {
        int idx = i * (cls_num + 5);
        if (idx == 0) {
            input_data[idx] = 367.0;
            input_data[idx + 1] = 38.0;
            input_data[idx + 2] = 677.0;
            input_data[idx + 3] = 318.0;
            input_data[idx + 4] = 1.0;
        }
        else if (idx == 1 * (cls_num + 5)) {
            input_data[idx] = 502.0;
            input_data[idx + 1] = 38.0;
            input_data[idx + 2] = 731.0;
            input_data[idx + 3] = 318.0;
            input_data[idx + 4] = 1.0;
        }
        else if (idx == 2 * (cls_num + 5)) {
            input_data[idx] = 303.0;
            input_data[idx + 1] = 377.0;
            input_data[idx + 2] = 831.0;
            input_data[idx + 3] = 1071.0;
            input_data[idx + 4] = 1.0;
        }
        else if (idx == 3 * (cls_num + 5)) {
            input_data[idx] = 193.0;
            input_data[idx + 1] = 435.0;
            input_data[idx + 2] = 831.0;
            input_data[idx + 3] = 931.0;
            input_data[idx + 4] = 1.0;
        }
        else if (idx == 4 * (cls_num + 5)) {
            input_data[idx] = 1039.0;
            input_data[idx + 1] = 147.0;
            input_data[idx + 2] = 1471.0;
            input_data[idx + 3] = 557.0;
            input_data[idx + 4] = 1.0;
        }
        else if (idx == 5 * (cls_num + 5)) {
            input_data[idx] = 1339.0;
            input_data[idx + 1] = 1.0;
            input_data[idx + 2] = 1571.0;
            input_data[idx + 3] = 209.0;
            input_data[idx + 4] = 1.0;
        }
    }

    // print the raw data
    std::cout << "raw data after assignment:" << endl;
    for (int i = 0; i < N_obj; i++) {
        if (i % (cls_num + 5) == 0) { std::cout << endl; }
        std::cout << input_data[i] << "\t\t";
    }

    float* gpu_input = nullptr;
    cudaMalloc((void**)&gpu_input, sizeof(float) * N_obj);
    cudaMemcpy(gpu_input, input_data, sizeof(float) * N_obj, cudaMemcpyHostToDevice);

    /*********************** end: simulate the yolo output ***********************/
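    /* Editor's sketch (assumption; the author deliberately does not open-source
       decode_yolo_kernel): a kernel matching the commented launch further below would
       typically scan each (cls_num + 5)-float row, keep rows whose confidence passes
       conf_thr, pick the best class, and append [x1,y1,x2,y2,conf,cls_id] to the output
       through an atomic counter:

    __global__ void decode_yolo_kernel(float* input, float* output, int max_object,
                                       int cls_num, float conf_thr, int* count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per candidate row
        // NOTE: real code must also guard idx against the row count (e.g. anchor_output_num)
        float* row = input + idx * (cls_num + 5);
        float conf = row[4];
        if (conf < conf_thr) return;
        int best_cls = 0;
        float best_score = row[5];
        for (int c = 1; c < cls_num; c++)
            if (row[5 + c] > best_score) { best_score = row[5 + c]; best_cls = c; }
        int slot = atomicAdd(count, 1);                   // reserve an output slot
        if (slot >= max_object) return;
        float* out = output + slot * 6;
        out[0] = row[0]; out[1] = row[1];                 // x1, y1 (corner format, as in the mock data)
        out[2] = row[2]; out[3] = row[3];                 // x2, y2
        out[4] = conf;   out[5] = (float)best_cls;
    }
    */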
    /*********************** cuda variables, memory allocation and initialization ***********************/

    float* gpu_output = nullptr;
    cudaMalloc((void**)&gpu_output, sizeof(float) * max_object * 6);  // holds the processed yolo output, laid out as [max_object, [x1,y1,x2,y2,conf,cls_id]]

    nms_box* d_boxes = nullptr;
    cudaMalloc(&d_boxes, anchor_output_num * sizeof(nms_box));        // device copy of gpu_output repacked into the nms_box struct format
    nms_box* h_boxes = nullptr;
    cudaMallocHost(&h_boxes, anchor_output_num * sizeof(nms_box));    // host-side counterpart

    int* h_nms_indices_init;
    cudaMallocHost(&h_nms_indices_init, max_object * sizeof(int));    // constant template used to (re)initialize the nms indices
    for (int i = 0; i < max_object; i++) { h_nms_indices_init[i] = i; }

    int* d_nms_indices;
    cudaMalloc(&d_nms_indices, max_object * sizeof(int));    // indices after gpu nms: -1 = suppressed, > -1 = kept; must be seeded from h_nms_indices_init
    int* h_nms_indices;
    cudaMallocHost(&h_nms_indices, max_object * sizeof(int)); // host copy of d_nms_indices, used to decide which boxes survive nms

    int h_count = 0;              // host-side count of valid objects in gpu_output, copied back from d_count
    int* d_count = nullptr;
    cudaMalloc((void**)&d_count, sizeof(int));  // device-side count of valid objects, maintained with atomic operations

    /*********************** end: cuda variables, memory allocation and initialization ***********************/

    const int block = 32;

    /************************************ start the cuda computation ************************************/

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    h_count = 0;
    cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);  // reset the valid-object counters d_count and h_count
    int grid = (anchor_output_num + block - 1) / block;
    // launch the kernel that decodes the yolo output (withheld by the author)
    //decode_yolo_kernel<<<grid, block, 0, stream>>>(gpu_input, gpu_output, max_object, cls_num, conf_thr, d_count);

    cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
    if (h_count > max_object) { h_count = max_object; }

    /************ print the decoded model output --> low-confidence rows dropped, class already chosen ************/

    float* host_decode = nullptr;  // host buffer for the gpu result
    cudaMallocHost((void**)&host_decode, sizeof(float) * max_object * 6);
    cudaMemcpy(host_decode, gpu_output, sizeof(float) * max_object * 6, cudaMemcpyDeviceToHost);
    std::cout << "\n\ndecoded output - gpu_output\n" << endl;
    if (h_count == 0) { std::cout << "\nno detections" << endl; }
    for (int i = 0; i < h_count; i++) {
        int idx = i * 6;
        std::cout << "x1:" << host_decode[idx] << "\ty1:" << host_decode[idx + 1] << "\tx2:" << host_decode[idx + 2]
                  << "\ty2:" << host_decode[idx + 3] << "\tconf:" << host_decode[idx + 4] << "\tclass_id:" << host_decode[idx + 5] << endl;
    }
    /**************************************************************************************************/

    // next, the data produced by decode_yolo_kernel is repacked on the gpu into the nms_box layout
    int grid_max = (max_object + block - 1) / block;
    //data_format_convert<<<grid_max, block, 0, stream>>>(d_boxes, gpu_output, h_count);  // gpu_output rows are [x1,y1,x2,y2,conf,cls_id]
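    /* Editor's sketch (assumption; data_format_convert is likewise withheld): the
       comments above imply it only repacks each 6-float row of gpu_output into an
       nms_box struct, one thread per detection:

    __global__ void data_format_convert(nms_box* boxes, float* decoded, int count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= count) return;                 // only h_count rows are valid
        float* row = decoded + idx * 6;
        boxes[idx].x1 = row[0];  boxes[idx].y1 = row[1];
        boxes[idx].x2 = row[2];  boxes[idx].y2 = row[3];
        boxes[idx].score = row[4];
        boxes[idx].cls_id = (int)row[5];
    }
    */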
    /************ copy the nms_box-formatted data back to the host and print it ************/
    nms_box* h_boxes_format = nullptr;
    cudaMallocHost(&h_boxes_format, anchor_output_num * sizeof(nms_box));
    cudaMemcpy(h_boxes_format, d_boxes, anchor_output_num * sizeof(nms_box), cudaMemcpyDeviceToHost);
    std::cout << "\n\nformat-converted output - h_boxes_format\n" << endl;
    if (h_count == 0) { std::cout << "\nno detections" << endl; }
    for (int i = 0; i < h_count; i++) {
        nms_box bb = h_boxes_format[i];
        std::cout << "x1:" << bb.x1 << "\ty1:" << bb.y1 << "\tx2:" << bb.x2 << "\ty2:" << bb.y2 << "\tconf:" << bb.score << "\tclass_id:" << bb.cls_id << endl;
    }
    /****************************************************************************************/

    cudaMemcpy(d_nms_indices, h_nms_indices_init, max_object * sizeof(int), cudaMemcpyHostToDevice);  // seed the nms indices --> important

    /************ inspect d_nms_indices before nms ************/
    int* d_nms_indices_visual = nullptr;
    cudaMallocHost(&d_nms_indices_visual, max_object * sizeof(int));
    cudaMemcpy(d_nms_indices_visual, d_nms_indices, max_object * sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "\n\nd_nms_indices:\n" << endl;
    for (int i = 0; i < max_object; i++) { std::cout << "\t" << d_nms_indices_visual[i] << endl; }
    /**********************************************************/

    // run nms on the converted data, again on the gpu (kernel withheld by the author;
    // the <<<...>>> launch configuration shown here is a reconstruction and an assumption)
    //nms_yolo_kernel<<<grid_max, block, 0, stream>>>(d_boxes, d_nms_indices, h_count, nms_thr);

    /******** copy the gpu results to the host and collect the final output in keep_boxes ********/
    cudaMemcpy(h_boxes, d_boxes, anchor_output_num * sizeof(nms_box), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_nms_indices, d_nms_indices, max_object * sizeof(int), cudaMemcpyDeviceToHost);  // indices after nms

    vector<nms_box> keep_boxes;  // collect only the boxes that survive nms
    for (int i = 0; i < h_count; i++) {
        if (h_nms_indices[i] > -1) {
            keep_boxes.push_back(h_boxes[i]);
        }
    }

    /************ inspect d_nms_indices after nms ************/
    std::cout << "after nms: box indices; -1 means the obj was suppressed, > -1 means it was kept" << endl;
    for (int i = 0; i < max_object; i++) { std::cout << h_nms_indices[i] << "\t"; }
    /**********************************************************/

    /************ draw the results on an arbitrary background image ************/
    cv::Mat image = cv::imread("image.jpg");

    for (nms_box box : keep_boxes) {
        cv::Point p1(box.x1, box.y1);
        cv::Point p2(box.x2, box.y2);
        cv::rectangle(image, p1, p2, cv::Scalar(0, 255, 0), 4, 1, 0);  // both corner points lie inside the rectangle
    }

    cv::resize(image, image, cv::Size(600, 400), 0, 0, cv::INTER_NEAREST);
    cv::imshow("www", image);
    cv::waitKey(100000);
    cv::destroyAllWindows();
    /***************************************************************************/
}

int main() {

    imitate_yolo_part_postprocess();

    return 0;
}
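/* Editor's sketch (assumption; the author's nms_yolo_kernel is withheld): a simple
   O(n^2) NMS consistent with the index convention used above -- a kept box keeps its
   index, a suppressed box has its entry set to -1. Each thread checks whether its box
   is dominated by a higher-scoring box of the same class with too much overlap: */

__device__ float iou_of(const nms_box& a, const nms_box& b)
{
    float x1 = fmaxf(a.x1, b.x1), y1 = fmaxf(a.y1, b.y1);
    float x2 = fminf(a.x2, b.x2), y2 = fminf(a.y2, b.y2);
    float inter = fmaxf(0.0f, x2 - x1) * fmaxf(0.0f, y2 - y1);
    float area_a = (a.x2 - a.x1) * (a.y2 - a.y1);
    float area_b = (b.x2 - b.x1) * (b.y2 - b.y1);
    return inter / (area_a + area_b - inter + 1e-6f);  // small epsilon avoids division by zero
}

__global__ void nms_yolo_kernel(nms_box* boxes, int* indices, int count, float nms_thr)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= count) return;
    nms_box self = boxes[idx];
    for (int j = 0; j < count; j++) {
        if (j == idx) continue;
        nms_box other = boxes[j];
        if (other.cls_id != self.cls_id) continue;  // nms is applied per class
        bool dominated = other.score > self.score ||
                         (other.score == self.score && j < idx);  // deterministic tie-break
        if (dominated && iou_of(self, other) > nms_thr)
            indices[idx] = -1;  // suppress this box: iou > nms_thr, per the threshold comment above
    }
}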
--------------------------------------------------------------------------------