├── .gitignore
├── CUDA编程指南5.0水印版.pdf
├── README.md
├── image.jpg
├── tutorials_eight.cu
├── tutorials_eleven.cu
├── tutorials_first.cu
├── tutorials_four.cu
├── tutorials_nine.cu
├── tutorials_seven.cu
├── tutorials_six.cu
├── tutorials_ten.cu
├── tutorials_three.cu
└── tutorials_yolo_part_postprocess.cu

/.gitignore:
--------------------------------------------------------------------------------
1 | # Repo-specific GitIgnore ----------------------------------------------------------------------------------------------
2 | 
3 | cuda_tutorials/x64/
4 | x64/
5 | cuda_tutorials.vc*
6 | tutorials_yolo_postprocess.cu
--------------------------------------------------------------------------------
/CUDA编程指南5.0水印版.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tangjunjun966/cuda-tutorial-master/f7a6f766bb2dce2c9fa806aca316bda362699cc7/CUDA编程指南5.0水印版.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # CUDA Tutorial Contents
3 | 
4 | #### Chapter 1: Pointers
5 | #### Chapter 2: How CUDA Works
6 | #### Chapter 3: Configuring the CUDA Compiler Environment
7 | #### Chapter 4: Kernel Function Basics
8 | #### Chapter 5: Kernel Indexing (index)
9 | #### Chapter 6: Kernel Matrix Computation in Practice
10 | #### Chapter 7: Advanced Kernel Practice
11 | #### Chapter 8: CUDA Memory Usage and Performance Optimization
12 | #### Chapter 9: CUDA Atomics (atomic) in Practice
13 | #### Chapter 10: CUDA Streams (stream) in Practice
14 | #### Chapter 11: A CUDA NMS Operator in Practice
15 | #### Chapter 12: YOLO Deployment in Practice
16 | #### Chapter 13: CUDA-based YOLO Deployment in Practice
17 | # Background
18 | As artificial intelligence develops and competition among practitioners intensifies, many companies now treat C++ deployment of deep-learning algorithms as a basic skill. On the many ARM-based, resource-constrained devices, getting more speed and meeting tighter latency requirements means handing matrix-style computation over to CUDA. At the same time, the market offers tutorials and blog posts of very uneven quality, or courses at steep prices, which leave readers (especially beginners) confused and unable to get started with CUDA programming and apply it to real projects.
19 | Therefore, drawing on my hands-on engineering experience, I will publish a CUDA tutorial series in this column to help readers (including complete beginners) put CUDA to work in engineering and master CUDA programming. Following this series, you will go all the way from environment setup to writing CUDA kernel functions, from kernels to memory optimization, from memory optimization to deep-learning operator development (e.g., NMS), and from operator optimization to model deployment (using the YOLO family as the baseline). Most importantly, the tutorials are short and to the point, pairing CUDA theory with practical examples and accompanying code that you can run directly. The idea is to cover the necessary CUDA theory, skip non-essential complexity, and get you building CUDA applications; the deeper theory can come later, once that foundation is in place.
20 | # What the Tutorials Cover
21 | 
22 | Chapters 1 to 3 look at the role of pointers in CUDA functions, how CUDA works, and environment configuration;
23 | 
24 | Chapter 4 is a first look at writing CUDA functions (__global__, __device__, __host__, etc.) as a simple entry point;
25 | 
26 | Chapter 5 explores different grid and block configurations and how to compute a kernel's index, which later chapters rely on for all kinds of computation;
27 | 
28 | Chapters 6 and 7 move from simple to advanced kernel matrix computation, examine in depth how grid, block, and thread indexing shapes kernel code, and work through several practical examples (e.g., a kernel that converts an image between color spaces);
29 | 
30 | Chapter 8 covers CUDA memory (texture memory, constant memory, global memory, and so on), their allocation mechanisms, and practical usage (code included), using the different memory types to optimize CUDA performance;
31 | 
32 | Chapter 9 covers CUDA atomic (atomic) operations with practical applications (e.g., obtaining self-incrementing indices);
33 | 
34 | Chapter 10 covers CUDA streams (stream) with practical examples (e.g., multi-stream operation);
35 | 
36 | Chapters 11 to 13 cover deploying the YOLO algorithm with TensorRT. We first give a generic TensorRT YOLO deployment whose pre- and post-processing run in C++ on the host, then write CUDA kernels for that pre- and post-processing, so the data no longer has to be copied between the GPU and the host; everything runs on the GPU, which improves performance.
37 | 
38 | ```For now, this is the full scope of the course. If there are topics you would like to see covered, leave a comment and we will update the material as circumstances allow.```
39 | 
40 | 
41 | ### In response to readers' requests: I have already shared most of the code, which you are free to study on your own; detailed code walkthroughs and the theory series are behind the link below. Creating this took real effort, so viewing the written content or a small part of the code requires a small fee. Thanks for understanding, and please give it a thumbs-up.
42 | ## CSDN tutorial link: [http://t.csdn.cn/J4KZj](http://t.csdnimg.cn/CD5IG)
43 | ## Open-source code for the YOLO deployment chapters: https://github.com/tangjunjun966/yolov5-tensorrt-onnx-master/tree/master
44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 |
--------------------------------------------------------------------------------
/image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tangjunjun966/cuda-tutorial-master/f7a6f766bb2dce2c9fa806aca316bda362699cc7/image.jpg
--------------------------------------------------------------------------------
/tutorials_eight.cu:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | /*
4 | 
5 | What sets this tutorial series apart: a coherent, strictly ordered sequence with source code and a strong practical focus.
6 | 
7 | Only the core CUDA processing code is kept inside the paid tutorial and is not open-sourced here; a great deal of CUDA tutorial code has already been open-sourced, and this series took a lot of work.
8 | 
9 | For that reason, the core code (the YOLO deployment CUDA code) and the full written tutorial require a small fee. Thanks for understanding.
10 | 
11 | I can promise that after working through the tutorials carefully, CUDA programming will pose no difficulty.
12 | 
13 | Details: http://t.csdn.cn/NaCZ5
14 | 
15 | @Description : memory usage
16 | @Author : tangjun
17 | @Date :
18 | */
19 | 
20 | 
21 | #include <iostream>
22 | #include <cuda_runtime.h>
23 | #include "opencv2/highgui.hpp" // actually lives under /usr/include
24 | #include "opencv2/opencv.hpp"
25 | #include "device_launch_parameters.h"
26 | #include <chrono>
27 | #include <string>
28 | #include <cmath>
29 | using namespace cv;
30 | using namespace std;
31 | 32 | 33 | 
34 | #define checkRuntime(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__)
35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 
46 | int inquire_GPU_info() {
47 |     int deviceCount;
48 |     cudaGetDeviceCount(&deviceCount);
49 | 
50 |     int dev;
51 |     for (dev = 0; dev < deviceCount; dev++)
52 |     {
53 |         int driver_version(0), runtime_version(0);
54 |         cudaDeviceProp deviceProp;
55 |         cudaGetDeviceProperties(&deviceProp, dev);
56 |         if (dev == 0)
57 |             if (deviceProp.minor == 9999 && deviceProp.major == 9999) // 9999.9999 means no CUDA-capable device
58 |                 printf("\n");
59 |         printf("\nDevice%d:\"%s\"\n", dev, deviceProp.name);
60 |         cudaDriverGetVersion(&driver_version);
61 |         printf("CUDA driver version: %d.%d\n", driver_version / 1000, (driver_version % 1000) / 10);
62 |         cudaRuntimeGetVersion(&runtime_version);
63 |         printf("CUDA runtime version: %d.%d\n", runtime_version / 1000, (runtime_version % 1000) / 10);
64 |         printf("Compute capability: %d.%d\n", deviceProp.major, deviceProp.minor);
65 |         printf("Total Global Memory: %zu M\n", deviceProp.totalGlobalMem / (1024 * 1024)); // size_t fields use %zu
66 |         printf("Number of SMs: %d\n", deviceProp.multiProcessorCount);
67 |         printf("Constant Memory: %zu K\n", deviceProp.totalConstMem / 1024);
68 |         printf("Shared Memory per block: %zu K\n", deviceProp.sharedMemPerBlock / 1024);
69 |         printf("Registers per block: %d\n", deviceProp.regsPerBlock);
70 |         printf("Warp size: %d\n", deviceProp.warpSize);
71 |         printf("Max threads per SM: %d\n", deviceProp.maxThreadsPerMultiProcessor);
72 |         printf("Max threads per block: %d\n", deviceProp.maxThreadsPerBlock);
73 |         printf("Max dimensions of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
74 |         printf("Max dimensions of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
75 |         printf("Maximum memory pitch: %zu bytes\n", deviceProp.memPitch);
76 |         printf("Texture alignment: %zu bytes\n", deviceProp.texturePitchAlignment);
77 |         printf("Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
78 |         printf("Memory Clock rate: %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
79 |         printf("Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
80 |     }
81 | 
82 |     return 0;
83 | }
84 | 85 | 
86 | bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line)
87 | {
88 |     if (code != cudaSuccess)
89 |     {
90 |         const char* err_name = cudaGetErrorName(code);
91 |         const char* err_message = cudaGetErrorString(code);
92 |         printf("runtime error %s:%d %s failed. 
\n code = %s, message = %s\n", file, line, op, err_name, err_message); 93 | return false; 94 | } 95 | return true; 96 | } 97 | 98 | __global__ void show_value(float* v) { 99 | int idx = threadIdx.x; 100 | if (idx < 10) { 101 | printf("value_%d: \t%.2f\n", idx, v[idx]); //只读取前10个数 102 | } 103 | } 104 | 105 | int memory_appy1() 106 | { 107 | 108 | int device_id = 0; 109 | checkRuntime(cudaSetDevice(device_id)); 110 | std::cout << "设置gpu id为:\t" << device_id << std::endl; 111 | 112 | 113 | std::cout << "设置全局内存" << std::endl; 114 | float* memory_device = nullptr; // Global Memory 115 | cudaMalloc((void**)&memory_device, 100 * sizeof(float)); // pointer to device 116 | 117 | 118 | std::cout << "设置new(malloc)可分页内存" << std::endl; 119 | float* memory_host = new float[100]; // Pageable Memory 120 | for (int i = 0; i < 100; i++) { memory_host[i] = i * 100; } 121 | checkRuntime(cudaMemcpy(memory_device, memory_host, sizeof(float) * 100, cudaMemcpyHostToDevice)); // 返回的地址是开辟的device地址,存放在memory_device 122 | show_value << > > (memory_device); 123 | 124 | 125 | 126 | 127 | std::cout << "设置页锁定内存" << std::endl; 128 | float* memory_page_locked = nullptr; // Pinned Memory 129 | checkRuntime(cudaMallocHost((void**)&memory_page_locked, 100 * sizeof(float))); // 返回的地址是被开辟的pin memory的地址,存放在memory_page_locked 130 | checkRuntime(cudaMemcpy(memory_page_locked, memory_device, sizeof(float) * 100, cudaMemcpyDeviceToHost)); // 将其返回host内存 131 | 132 | 133 | 134 | 135 | //printf("%f\n", memory_page_locked[2]); 136 | checkRuntime(cudaFreeHost(memory_page_locked)); 137 | delete[] memory_host; 138 | checkRuntime(cudaFree(memory_device)); 139 | 140 | return 0; 141 | } 142 | 143 | 144 | 145 | __device__ float dev_array[10]; 146 | __global__ void my_device_function(float* ptr) { 147 | // Use the device variable 148 | int idx = threadIdx.x; 149 | 150 | // Do something with the value 151 | ptr[idx] = dev_array[idx] + 0.9f; 152 | 153 | } 154 | 155 | void memory_appy2() { 156 | // Allocate memory on the device 157 | float* dev_ptr; 158 | cudaMalloc((void**)&dev_ptr, 10 * sizeof(float)); 159 | 160 | // Copy data from host to device 161 | float host_array[10] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f }; 162 | cudaMemcpyToSymbol(dev_array, host_array, 10 * sizeof(float)); //赋值 163 | 164 | // Call a device function that uses the device variable 165 | my_device_function << <1, 10 >> > (dev_ptr); 166 | 167 | float* host_ptr = nullptr; 168 | cudaMallocHost((void**)&host_ptr, sizeof(int) * 10); 169 | cudaMemcpy(host_ptr, dev_ptr, sizeof(float) * 10, cudaMemcpyDeviceToHost); 170 | 171 | for (int i = 0; i < 10; i++) { std::cout << host_ptr[i] << endl; } 172 | 173 | // Free memory on the device 174 | cudaFree(dev_ptr); 175 | } 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | __global__ void static_sharedMemKernel(int* input, int* output) 184 | { 185 | // 定义共享内存 186 | __shared__ int sharedMem[1024]; 187 | int tid = threadIdx.x; 188 | int i = blockIdx.x * blockDim.x + threadIdx.x; 189 | // 将数据从全局内存拷贝到共享内存 190 | sharedMem[tid] = input[i]; 191 | // 等待所有线程都将数据拷贝到共享内存中 192 | __syncthreads(); 193 | // 对共享内存中的数据进行处理 194 | sharedMem[tid] *= 2; 195 | sharedMem[tid] = sharedMem[tid] + 30; 196 | // 等待所有线程都完成数据处理 197 | __syncthreads(); 198 | // 将结果从共享内存拷贝回全局内存 199 | output[i] = sharedMem[tid]; 200 | } 201 | 202 | 203 | __global__ void dynamic_sharedMemKernel(int* input, int* output) 204 | { 205 | // 定义共享内存 206 | extern __shared__ int sharedMem[]; //使用extern非常重要 207 | int tid = threadIdx.x; 208 | int i = blockIdx.x * blockDim.x + 
threadIdx.x; 209 | // 将数据从全局内存拷贝到共享内存 210 | sharedMem[tid] = input[i]; 211 | // 等待所有线程都将数据拷贝到共享内存中 212 | __syncthreads(); 213 | // 对共享内存中的数据进行处理 214 | sharedMem[tid] *= 2; 215 | sharedMem[tid] = sharedMem[tid] + 30; 216 | // 等待所有线程都完成数据处理 217 | __syncthreads(); 218 | // 将结果从共享内存拷贝回全局内存 219 | output[i] = sharedMem[tid]; 220 | } 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | int memory_appy3() 229 | { 230 | const int n = 9900; 231 | const int thrid = 1024; 232 | int input[n]; 233 | int output[n]; 234 | int* d_input, * d_output; 235 | for (int i = 0; i < n; i++) { input[i] = i; } 236 | // 分配设备内存 237 | cudaMalloc((void**)&d_input, n * sizeof(int)); 238 | cudaMalloc((void**)&d_output, n * sizeof(int)); 239 | // 将输入数据拷贝到设备内存中 240 | cudaMemcpy(d_input, input, n * sizeof(int), cudaMemcpyHostToDevice); 241 | 242 | // 调用核函数 243 | unsigned int grid = (n + thrid - 1) / thrid; //列写给x 244 | dim3 gridperblock(grid); 245 | 246 | auto start = std::chrono::system_clock::now(); //时间函数 247 | static_sharedMemKernel << > > (d_input, d_output); 248 | auto end = std::chrono::system_clock::now(); 249 | float time = std::chrono::duration_cast(end - start).count(); 250 | cout << "\n计算时间:" << time << endl; 251 | 252 | // 将结果拷贝回主机内存中 253 | cudaMemcpy(output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost); 254 | std::cout << "输出静态共享内存结果output前5位数:" << endl; 255 | for (int i = 0; i < 5; i++) { cout << output[i] << endl; } 256 | 257 | dynamic_sharedMemKernel << > > (d_input, d_output); 258 | cudaMemcpy(output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost); 259 | std::cout << "输出动态共享内存结果output前5位数:" << endl; 260 | for (int i = 0; i < 5; i++) { cout << output[i] << endl; } 261 | 262 | // 释放设备内存 263 | cudaFree(d_input); 264 | cudaFree(d_output); 265 | return 0; 266 | } 267 | 268 | 269 | 270 | 271 | //声明纹理,用来绑定纹理,其实也就是个纹理标识 272 | texture texone; 273 | 274 | 275 | //核心代码,在gpu端执行的kernel, 276 | __global__ void Textureone(unsigned int* listTarget, int size) 277 | { 278 | unsigned int texvalue = 0; 279 | int index = blockIdx.x * blockDim.x + threadIdx.x; //通过线程ID得到数组下标 280 | if (index < size) 281 | texvalue = tex1Dfetch(texone, index) * 100; //通过索引获得纹理值再乘100 282 | listTarget[index] = texvalue; 283 | } 284 | 285 | 286 | void memory_appy4_one() 287 | { 288 | const int _length = 100; 289 | unsigned int* listSource = new unsigned int[_length]; 290 | unsigned int* listTarget = new unsigned int[_length]; 291 | 292 | //赋值 293 | for (int i = 0; i < _length; i++) { listSource[i] = i; } 294 | 295 | unsigned int* dev_Source; 296 | unsigned int* dev_Target; 297 | 298 | //在设备上申请显存空间 299 | cudaMalloc((void**)&dev_Source, _length * sizeof(unsigned int)); 300 | cudaMalloc((void**)&dev_Target, _length * sizeof(unsigned int)); 301 | //将host端的数据拷贝到device端 302 | cudaMemcpy(dev_Source, listSource, _length * sizeof(unsigned int), cudaMemcpyHostToDevice); 303 | 304 | 305 | //绑定纹理,绑定的纹理标识对应的数据 306 | cudaBindTexture(0, texone, dev_Source); 307 | 308 | //调用kernel 309 | Textureone << < ceil(_length / 10), 10 >> > (dev_Target, _length); 310 | 311 | //将结果拷贝到host端 ☆host就是CPU 312 | cudaMemcpy(listTarget, dev_Target, _length * sizeof(unsigned int), cudaMemcpyDeviceToHost); 313 | 314 | //取消绑定 315 | cudaUnbindTexture(texone); 316 | 317 | //释放内存空间 318 | cudaFree(dev_Source); 319 | cudaFree(dev_Target); 320 | 321 | 322 | cout << "原始数据: " << endl; 323 | for (int i = 0; i < _length; i++) { cout << listSource[i] << " "; } 324 | 325 | cout << endl << endl << "运算结果: " << endl; 326 | for (int i = 0; i < _length; i++) { cout << listTarget[i] << " "; } 327 | 
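/*
   Compatibility note: memory_appy4_one above, and memory_appy4_two / memory_appy5 below,
   use the legacy texture *reference* API (global texture<...> variables, cudaBindTexture,
   tex1Dfetch/tex2D, cudaUnbindTexture). That API was deprecated in CUDA 11 and removed in
   CUDA 12, so these samples are expected to build only on older toolkits. On CUDA 12+ the
   replacement is a per-resource texture *object*; a minimal sketch for the 1D case, reusing
   this function's dev_Source and _length purely for illustration:

       cudaResourceDesc resDesc = {};
       resDesc.resType = cudaResourceTypeLinear;
       resDesc.res.linear.devPtr = dev_Source;
       resDesc.res.linear.desc = cudaCreateChannelDesc<unsigned int>();
       resDesc.res.linear.sizeInBytes = _length * sizeof(unsigned int);

       cudaTextureDesc texDesc = {};
       texDesc.readMode = cudaReadModeElementType;

       cudaTextureObject_t tex = 0;
       cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);
       // the kernel then takes the cudaTextureObject_t as a parameter
       // and reads with tex1Dfetch<unsigned int>(tex, index)
*/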
getchar(); 328 | } 329 | 330 | 331 | 332 | 333 | 334 | 335 | texture textwo; 336 | 337 | 338 | __global__ void my_kernel(uchar3* output, int width, int height) 339 | { 340 | uchar3 img_v; 341 | int x = blockIdx.x * blockDim.x + threadIdx.x; 342 | int y = blockIdx.y * blockDim.y + threadIdx.y; 343 | 344 | //float u = (float)x / (float)width; 345 | //float v = (float)y / (float)height; 346 | 347 | uchar4 value = tex2D(textwo, x, y); 348 | 349 | uchar4 swapped_value = make_uchar4(value.z, value.y, value.x, value.w); 350 | img_v.x = value.x; 351 | img_v.y = value.y; 352 | img_v.z = value.z; 353 | 354 | //printf("\n%.2f", (float)value.y); 355 | output[x + y * width] = img_v; 356 | 357 | } 358 | 359 | 360 | void show(Mat img, string name = "image") { 361 | cv::imshow(name, img); 362 | cv::waitKey(1000); 363 | cv::destroyAllWindows(); 364 | 365 | } 366 | 367 | int memory_appy4_two() 368 | { 369 | // 读取图像数据 370 | cv::Mat image = cv::imread("image.jpg", cv::IMREAD_COLOR); 371 | show(image, "img_ori"); 372 | 373 | 374 | 375 | // 申请二维纹理内存 376 | cudaArray* cuArray; 377 | cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); 378 | cudaMallocArray(&cuArray, &channelDesc, image.cols, image.rows); 379 | cudaMemcpy2DToArray(cuArray, 0, 0, image.data, image.step, image.cols * sizeof(uchar4), image.rows, cudaMemcpyHostToDevice); 380 | 381 | // 绑定纹理对象和二维纹理内存 382 | cudaBindTextureToArray(textwo, cuArray); 383 | 384 | // 调用核函数 385 | dim3 block(16, 16); 386 | dim3 grid((image.cols + block.x - 1) / block.x, (image.rows + block.y - 1) / block.y); 387 | 388 | cv::Mat outimg = Mat(Size(image.cols, image.rows), CV_8UC1); 389 | uchar3* output; 390 | cudaMallocHost(&output, image.cols * image.rows * sizeof(uchar3)); 391 | 392 | //cudaMallocHost(&outimg.data, image.cols * image.rows * sizeof(uchar3)); 393 | my_kernel << > > (output, image.cols, image.rows); 394 | 395 | //cudaMemcpy(outimg.data, output, image.cols * image.rows * sizeof(uchar4), cudaMemcpyDeviceToHost); 396 | 397 | // 解绑纹理对象和二维纹理内存 398 | cudaUnbindTexture(textwo); 399 | 400 | 401 | 402 | 403 | //show(outimg,"outimg"); 404 | 405 | // 释放内存 406 | cudaFree(output); 407 | cudaFreeArray(cuArray); 408 | cout << "ok" << endl; 409 | return 0; 410 | } 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | texture texRef; 422 | 423 | __global__ void kernel(float* output, int width, int height) 424 | { 425 | int x = blockIdx.x * blockDim.x + threadIdx.x; 426 | int y = blockIdx.y * blockDim.y + threadIdx.y; 427 | 428 | if (x < width && y < height) { output[y * width + x] = tex2D(texRef, x, y); } 429 | } 430 | 431 | int memory_appy5() 432 | { 433 | int width = 512; 434 | int height = 512; 435 | int size = width * height * sizeof(float); 436 | 437 | float* input = (float*)malloc(size); 438 | float* output = (float*)malloc(size); 439 | 440 | // 初始化输入数据 441 | for (int i = 0; i < width * height; i++) { input[i] = (float)i; } 442 | 443 | // 定义CUDA数组 444 | cudaArray* cuArray; 445 | cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); 446 | cudaMallocArray(&cuArray, &channelDesc, width, height); 447 | 448 | // 将输入数据拷贝到CUDA数组中 449 | cudaMemcpyToArray(cuArray, 0, 0, input, size, cudaMemcpyHostToDevice); 450 | 451 | // 设置纹理内存参数 452 | texRef.addressMode[0] = cudaAddressModeWrap; 453 | texRef.addressMode[1] = cudaAddressModeWrap; 454 | texRef.filterMode = cudaFilterModeLinear; 455 | texRef.normalized = false; 456 | 457 | // 绑定纹理内存到CUDA数组 458 | cudaBindTextureToArray(texRef, cuArray); 459 | 460 | // 定义CUDA核函数的线程块和线程格 461 | dim3 block(16, 16); 462 | dim3 
grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y); 463 | 464 | // 调用CUDA核函数 465 | kernel << > > (output, width, height); 466 | 467 | // 将输出数据从设备拷贝到主机 468 | cudaMemcpy(output, output, size, cudaMemcpyDeviceToHost); 469 | 470 | // 输出结果 471 | for (int i = 0; i < width * height; i++) 472 | { 473 | printf("%f ", output[i]); 474 | } 475 | 476 | // 解绑纹理内存 477 | cudaUnbindTexture(texRef); 478 | 479 | // 释放CUDA数组和内存 480 | cudaFreeArray(cuArray); 481 | free(input); 482 | free(output); 483 | 484 | return 0; 485 | } 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | void main_eight() { 498 | inquire_GPU_info(); 499 | memory_appy1(); 500 | memory_appy2(); 501 | memory_appy3(); 502 | memory_appy4_one(); 503 | memory_appy4_two(); 504 | 505 | 506 | 507 | } 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | -------------------------------------------------------------------------------- /tutorials_eleven.cu: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | /*! 5 | 6 | 7 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 8 | 9 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 10 | 11 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 12 | 13 | 可以保证,认真学完教程,cuda编程毫无压力。 14 | 15 | 详情请链接:http://t.csdn.cn/NaCZ5 16 | 17 | 18 | @Description : cuda nms计算方法 19 | @Author : tangjun 20 | @Date : 2023-08-07 21 | */ 22 | 23 | 24 | 25 | 26 | #include 27 | #include 28 | #include "opencv2/highgui.hpp" //实际上在/usr/include下 29 | #include "opencv2/opencv.hpp" 30 | #include "device_launch_parameters.h" 31 | #include 32 | #include 33 | #include 34 | using namespace cv; 35 | using namespace std; 36 | 37 | 38 | 39 | 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | 46 | using namespace std; 47 | 48 | // 定义矩形框的结构体 49 | struct nms_box { 50 | float x1, y1, x2, y2; 51 | float score; 52 | int cls_id; 53 | }; 54 | 55 | // 定义CUDA核函数,用于计算两个矩形框之间的IOU值 56 | __device__ float iou(nms_box a, nms_box b) 57 | { 58 | float x1 = fmaxf(a.x1, b.x1); 59 | float y1 = fmaxf(a.y1, b.y1); 60 | float x2 = fminf(a.x2, b.x2); 61 | float y2 = fminf(a.y2, b.y2); 62 | float intersection = fmaxf(0.0f, x2 - x1) * fmaxf(0.0f, y2 - y1); 63 | float area_a = (a.x2 - a.x1) * (a.y2 - a.y1); 64 | float area_b = (b.x2 - b.x1) * (b.y2 - b.y1); 65 | float union_ = area_a + area_b - intersection; 66 | return intersection / union_; 67 | } 68 | 69 | // 定义CUDA核函数,用于执行NMS算法 70 | __global__ void nms_kernel(nms_box* boxes, int* indices, int* num_indices, float nms_thr) 71 | { 72 | /* 73 | boxes:输入nms信息,为结构体 74 | indices:输入为列表序列,记录所有box,如[0,1,2,3,4,5,...],后续将不需要会变成-1。 75 | num_indices:记录有多少个box数量 76 | float nms_thr:nms的阈值,实际为iou阈值 77 | */ 78 | 79 | int i = blockIdx.x * blockDim.x + threadIdx.x; 80 | if (i >= *num_indices) { return; } 81 | 82 | int index = indices[i]; 83 | 84 | if (index == -1) { return; } 85 | 86 | nms_box box = boxes[index]; 87 | 88 | 89 | for (int j = i + 1; j < *num_indices; j++) { 90 | int other_index = indices[j]; 91 | if (other_index == -1) { continue; } 92 | 93 | nms_box other_box = boxes[other_index]; 94 | float iou_value = iou(box, other_box); 95 | printf("iou value:%f\n", iou_value); 96 | if (iou_value > nms_thr) { indices[j] = -1; } 97 | 98 | } 99 | } 100 | 101 | vector nms(vector boxes, float threshold) 102 | { 103 | int num_boxes = boxes.size(); 104 | 105 | // 将矩形框转换为CUDA中的Box结构体 106 | nms_box* d_boxes = nullptr; 107 | cudaMalloc(&d_boxes, num_boxes * sizeof(nms_box)); 108 | cudaMemcpy(d_boxes, boxes.data(), num_boxes * sizeof(nms_box), 
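/* std::vector<nms_box> stores its elements contiguously, so copying
   num_boxes * sizeof(nms_box) raw bytes from boxes.data() is safe */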
cudaMemcpyHostToDevice); 109 | 110 | 111 | 112 | 113 | // 创建一个索引数组,用于标记哪些矩形框应该被保留 114 | int* d_indices; 115 | cudaMallocHost(&d_indices, num_boxes * sizeof(int)); 116 | for (int i = 0; i < num_boxes; i++) { d_indices[i] = i; } 117 | 118 | 119 | 120 | // 在CUDA设备上执行NMS算法 121 | int num_indices = num_boxes; 122 | int* d_num_indices = nullptr; 123 | cudaMalloc(&d_num_indices, sizeof(int)); 124 | cudaMemcpy(d_num_indices, &num_indices, sizeof(int), cudaMemcpyHostToDevice); 125 | 126 | 127 | 128 | 129 | 130 | int blockSize = 256; 131 | int numBlocks = (num_boxes + blockSize - 1) / blockSize; 132 | nms_kernel << > > (d_boxes, d_indices, d_num_indices, threshold); 133 | // 134 | 135 | 136 | 137 | 138 | // 将保留的矩形框复制回主机端 139 | cudaMemcpy(&num_indices, d_num_indices, sizeof(int), cudaMemcpyDeviceToHost); 140 | 141 | 142 | 143 | int* h_indices = new int[num_indices]; 144 | 145 | cudaMemcpy(h_indices, d_indices, num_indices * sizeof(int), cudaMemcpyDeviceToHost); 146 | 147 | std::cout << "打印需要保存box的索引值:" << endl; 148 | for (int i = 0; i < num_indices; i++) { 149 | std::cout << "keep indices:" << h_indices[i] << endl; 150 | } 151 | 152 | 153 | vector kept_boxes(num_indices); 154 | for (int i = 0; i < num_indices; i++) { 155 | if (h_indices[i] > -1) { 156 | kept_boxes[i] = boxes[h_indices[i]]; 157 | } 158 | } 159 | 160 | 161 | // 释放内存 162 | cudaFree(d_boxes); 163 | cudaFree(d_indices); 164 | cudaFree(d_num_indices); 165 | delete[] h_indices; 166 | 167 | return kept_boxes; 168 | } 169 | 170 | int main_eleven() 171 | { 172 | // 创建一组矩形框 173 | vector boxes = { 174 | {367.0, 38.0, 677.0, 318.0, 0.9,1}, 175 | {502.0, 38.0, 731.0, 318.0, 0.8,2}, 176 | {303.0, 378.0, 831.0, 1071.0, 0.8,2}, 177 | {193.0, 435.0, 831.0, 931.0, 0.7,3}, 178 | {1039.0, 147.0, 1471.0, 557.0, 0.6,4}, 179 | {1339, 1.0,1571.0, 209.0, 0.5,5} 180 | }; 181 | 182 | 183 | 184 | 185 | 186 | // 执行NMS算法 187 | vector kept_boxes = nms(boxes, 0.2); 188 | 189 | 190 | 191 | // 输出结果 192 | cv::Mat image = cv::imread("image.jpg"); 193 | 194 | for (nms_box box : kept_boxes) { 195 | cout << box.x1 << ", " << box.y1 << ", " << box.x2 << ", " << box.y2 << ", " << box.score << endl; 196 | 197 | cv::Point p1(box.x1, box.y1); 198 | cv::Point p2(box.x2, box.y2); 199 | cv::rectangle(image, p1, p2, cv::Scalar(0, 255, 0), 4, 1, 0);//矩形的两个顶点,两个顶点都包括在矩形内部 200 | } 201 | 202 | cv::resize(image, image, cv::Size(600, 400), 0, 0, cv::INTER_NEAREST); 203 | 204 | cv::imshow("www", image); 205 | cv::waitKey(100000); 206 | cv::destroyAllWindows(); 207 | 208 | return 0; 209 | } 210 | 211 | 212 | 213 | 214 | /* 215 | 216 | 在这个示例中,我们定义了一个名为 iou 的函数,用于计算两个矩形框之间的 IOU(交并比)。然后,我们定义了一个名为 nms_kernel 的核函数,用于执行 NMS 算法。在 nms_kernel 中,我们首先获取当前线程的索引 tid,并获取该线程对应的矩形框 box。然后,我们遍历所有矩形框,并计算当前矩形框与其他矩形框之间的 IOU 值。如果 IOU 值大于阈值 iou_threshold,则将该矩形框标记为不保留。最后,我们将结果存储在 indices 中。 217 | 218 | 在 nms 函数中,我们计算适当的块和网格大小,并调用 nms_kernel 核函数。 219 | 220 | 请注意,这个示例只是一个简单的实现,并且可能不适用于所有情况。在实际应用中,您可能需要根据具体情况进行修改和优化。 221 | */ 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | -------------------------------------------------------------------------------- /tutorials_first.cu: -------------------------------------------------------------------------------- 1 | /*! 
2 | 3 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 4 | 5 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 6 | 7 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 8 | 9 | 可以保证,认真学完教程,cuda编程毫无压力。 10 | 11 | 详情请链接:http://t.csdn.cn/NaCZ5 12 | 13 | 14 | 15 | 16 | @Description : 指针回顾 17 | @Author : tangjun 18 | @Date : 19 | */ 20 | 21 | 22 | #include 23 | #include 24 | 25 | using namespace std; 26 | 27 | 28 | 29 | 30 | /*************************************第一节-指针篇**********************************************/ 31 | void Print_Pointers(int* ptr, int N) { 32 | for (int i = 0; i < N; i++) 33 | { 34 | std::cout << "order:\t" << i << "\tptr_value:\t" << *ptr << "\tphysical address:" << ptr << std::endl; 35 | ptr++; 36 | } 37 | } 38 | 39 | void pointer_1() { 40 | /* 探索指针赋值方法 */ 41 | const int N = 6; 42 | int arr[N]; 43 | for (int i = 0; i < N; i++) arr[i] = i + 1; //数组赋值 44 | //指针第一种赋值方法 45 | int* ptr = nullptr; 46 | ptr = arr; 47 | //指针第二种赋值方法 48 | int* ptr2 = arr; 49 | 50 | std::cout << "output ptr1 " << std::endl; 51 | Print_Pointers(ptr, N); 52 | std::cout << "\n\noutput ptr2 " << std::endl; 53 | Print_Pointers(ptr2, N); 54 | 55 | //单独变量赋值 56 | int a = 20; 57 | int* p = &a; 58 | std::cout << "\n\noutput p value: \t" << *p << "\tphysical address:\t" << p << std::endl; 59 | 60 | } 61 | 62 | void pointer_2() { 63 | const int N = 6; 64 | int arr[N]; 65 | for (int i = 0; i < N; i++) arr[i] = i + 1; //数组赋值 66 | int* ptr = arr; //构建指针 67 | for (int i = 0; i < 5; i++) 68 | { 69 | std::cout << "ptr_value_" << i << ":\t" << *ptr << std::endl;; 70 | ptr++; 71 | } 72 | } 73 | 74 | 75 | void pointer_3() { 76 | int num = 4; 77 | int* p = # 78 | cout << "*p:\t" << *p << "\t p address:\t" << p << "\tnum value:\t" << num << "\tnum address:\t" << num << endl; 79 | 80 | *p = *p + 20; //通过指针更改地址的值 81 | cout << "*p:\t" << *p << "\t p address:\t" << p << "\tnum value:\t" << num << "\tnum address:\t" << num << endl; 82 | num = 30; //更改变量值 83 | cout << "*p:\t" << *p << "\t p address:\t" << p << "\tnum value:\t" << num << "\tnum address:\t" << num << endl; 84 | 85 | 86 | } 87 | 88 | void pointer_4() { 89 | int num = 4; 90 | int* p1 = # 91 | //指针的指针第一种赋值方法 92 | int** p2 = &p1; 93 | //指针的指针第二种赋值方法 94 | int** p3; 95 | p3 = &p1; 96 | 97 | cout << "num value:\t" << num << "\t num address:\t" << &num << endl; 98 | cout << "p1 value:\t" << *p1 << "\t p1 address:\t" << p1 << endl; 99 | cout << "p2 value:\t" << *p2 << "\t p2 address:\t" << p2 << endl; 100 | cout << "p3 value:\t" << *p3 << "\t p3 address:\t" << p3 << endl; 101 | 102 | cout << "p2 value:\t" << **p2 << "\t p2 address:\t" << *p2 << endl; 103 | } 104 | 105 | void main_first() { 106 | pointer_1(); 107 | pointer_2(); 108 | pointer_3(); 109 | pointer_4(); 110 | 111 | 112 | } 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /tutorials_four.cu: -------------------------------------------------------------------------------- 1 |  2 | 3 | /*! 
4 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 5 | 6 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 7 | 8 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 9 | 10 | 可以保证,认真学完教程,cuda编程毫无压力。 11 | 12 | 详情请链接:http://t.csdn.cn/NaCZ5 13 | 14 | 15 | 16 | @Description : CUDA函数基础篇 17 | @Author : tangjun 18 | @Date : 19 | */ 20 | 21 | 22 | 23 | #include 24 | #include 25 | #include "device_launch_parameters.h" 26 | #include 27 | #include 28 | 29 | using namespace std; 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | /*************************************第四节-CUDA函数基础篇**********************************************/ 38 | 39 | float sigmoid_host(float x) { 40 | float y = 1 / (1 + exp(-x)); 41 | return y; 42 | } 43 | 44 | __device__ float sigmoid(float x) { 45 | float y = 1 / (1 + exp(-x)); 46 | //float y = sigmoid_host(x); 47 | return y; 48 | } 49 | 50 | __global__ void test_kernel(float* a, float* c) { 51 | 52 | int idx = threadIdx.x; 53 | c[idx] = sigmoid(a[idx]); //正确方式 54 | //c[idx] = sigmoid_host(a[idx]);//绝对错误,无法调用,即:global函数无法调用host函数,只能调用devices函数 55 | 56 | 57 | } 58 | 59 | void Print_dim(float* ptr, int N) { 60 | for (int i = 0; i < N; i++) 61 | { 62 | std::cout << "value:\t" << ptr[i] << std::endl; 63 | 64 | } 65 | } 66 | 67 | void init_variables_float(float* a, int m, int n) { 68 | 69 | //初始化变量 70 | std::cout << "value of a:" << endl; 71 | for (int i = 0; i < m; i++) { 72 | for (int j = 0; j < n; j++) { 73 | a[i * n + j] = rand() / 4089; 74 | std::cout << "\t" << a[i * n + j]; 75 | 76 | } 77 | std::cout << "\n"; 78 | } 79 | 80 | 81 | } 82 | 83 | void global2device() { 84 | const int m = 4; 85 | const int n = 2; 86 | //分配host内存 87 | float* a, * c; 88 | cudaMallocHost((void**)&a, sizeof(float) * m * n); 89 | cudaMallocHost((void**)&c, sizeof(float) * m * n); 90 | //变量初始化 91 | init_variables_float(a, m, n); 92 | // 分配gpu内存并将host值复制到gpu变量中 93 | float* g_a; 94 | cudaMalloc((void**)&g_a, sizeof(float) * m * n); 95 | cudaMemcpy(g_a, a, sizeof(float) * m * n, cudaMemcpyHostToDevice); 96 | float* g_c; 97 | cudaMalloc((void**)&g_c, sizeof(float) * m * n); 98 | test_kernel << > > (g_a, g_c); 99 | cudaMemcpy(c, g_c, sizeof(float) * m * n, cudaMemcpyDeviceToHost); 100 | Print_dim(c, m * n); 101 | 102 | } 103 | 104 | __device__ __host__ float sigmoid_device_host(float x) { 105 | float y = 1 / (1 + exp(-x)); 106 | return y; 107 | } 108 | 109 | void host2device() { 110 | float y = sigmoid_device_host(1.25); 111 | std::cout << y << endl; 112 | std::cout << "success:host calling device+host " << endl; 113 | //以下执行失败 114 | try { 115 | float y = sigmoid_host(1.25); 116 | throw std::runtime_error("error: fail"); 117 | } 118 | catch (std::runtime_error err) { 119 | std::cout << "fail:host calling device" << endl; 120 | 121 | } 122 | 123 | } 124 | 125 | 126 | void main_four() { 127 | 128 | global2device();//host<--global<--device 129 | host2device(); 130 | 131 | 132 | 133 | } 134 | 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /tutorials_nine.cu: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | /*! 
5 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 6 | 7 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 8 | 9 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 10 | 11 | 可以保证,认真学完教程,cuda编程毫无压力。 12 | 13 | 详情请链接:http://t.csdn.cn/NaCZ5 14 | 15 | 16 | @Description : 原子操作篇 17 | @Author : tangjun 18 | @Date : 2023-8-21 19 | */ 20 | 21 | 22 | #include 23 | #include 24 | #include "opencv2/highgui.hpp" //实际上在/usr/include下 25 | #include "opencv2/opencv.hpp" 26 | #include "device_launch_parameters.h" 27 | #include 28 | #include 29 | #include 30 | using namespace cv; 31 | using namespace std; 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | extern "C" __global__ void kernel_func_error(int* counter, int* data_0) 43 | { 44 | // 计算线程号 45 | unsigned int block_index = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; 46 | unsigned int thread_index = block_index * blockDim.x * blockDim.y * blockDim.z + \ 47 | threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; 48 | 49 | // 统计结果 50 | int value = data_0[thread_index]; 51 | //printf("%d\n", value); 52 | counter[value] ++; 53 | } 54 | 55 | extern "C" __global__ void kernel_func_correct(int* counter, int* data_0) 56 | { 57 | // 计算线程号 58 | unsigned int block_index = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; 59 | unsigned int thread_index = block_index * blockDim.x * blockDim.y * blockDim.z + \ 60 | threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y; 61 | 62 | // 统计结果 63 | int value = data_0[thread_index]; 64 | atomicAdd(&counter[value], 1); 65 | } 66 | 67 | 68 | 69 | 70 | int atomic_apply1() { 71 | 72 | const int N = 32; 73 | int* gpu_buffer; 74 | int* host_data = new int[N]; 75 | 76 | for (int i = 0; i < N; i++) { 77 | if (i%2==0) { 78 | host_data[i] = 1; 79 | } 80 | else { 81 | host_data[i] = 0; 82 | } 83 | 84 | } 85 | 86 | std::cout << "打印输入数据" << endl; 87 | for (int i = 0; i < N; i++) { std::cout << host_data[i] << "\t"; } 88 | 89 | 90 | cudaMalloc((void**)&gpu_buffer, N * sizeof(int)); 91 | 92 | 93 | cudaMemcpy(gpu_buffer, host_data, N * sizeof(int), cudaMemcpyHostToDevice); 94 | 95 | int* count = nullptr; 96 | cudaMalloc((void**)&count, 2 * sizeof(int)); 97 | int* host_count = nullptr; 98 | cudaMallocHost((void**)&host_count,2*sizeof(int)); 99 | host_count[0] = 0; 100 | host_count[1] = 0; 101 | cudaMemcpy(count, host_count, 2 * sizeof(int), cudaMemcpyHostToDevice); 102 | 103 | 104 | //kernel_func_error << > > (count, gpu_buffer); 105 | //kernel_func_correct << > > (count, gpu_buffer); 106 | 107 | 108 | auto T0 = std::chrono::system_clock::now(); //时间函数 109 | int num = 10000; 110 | int num_k = 60; 111 | 112 | for (int k = 0; k < num_k; k++) { 113 | for (int j = 0; j < num; j++) { 114 | //cudaMemcpy(count, host_count, 2 * sizeof(int), cudaMemcpyHostToDevice); 115 | kernel_func_error << > > (count, gpu_buffer); 116 | //kernel_func_correct << > > (count, gpu_buffer); 117 | 118 | } 119 | 120 | } 121 | 122 | auto T1 = std::chrono::system_clock::now(); 123 | float time_kernel = std::chrono::duration_cast(T1 - T0).count(); 124 | 125 | std::cout << "\n\n推理时间:\t " << time_kernel/ num_k << "ms\n\n" << endl; 126 | 127 | cudaMemcpy(host_count, count, 2 * sizeof(int), cudaMemcpyDeviceToHost); 128 | 129 | std::cout << "\n打印输出结果\n\n" <<"说明:\t若为偶数,0与1数量相等,否则相差1个数\n\n" << "0的计数量:" << host_count[0]<<"\n1的计数量:"<< host_count[1]<< endl; 130 | 131 | 132 | 133 | 134 | 135 | return 0; 136 | } 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | //十进制转二进制 145 | 146 | 147 | void 
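/* prints bits 16..0 of val, most significant bit first; a small helper for
   visualizing the bitwise values used by the atomic examples in this file */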
printbinary(const unsigned int val) 148 | { 149 | for (int i = 16; i >= 0; i--) 150 | { 151 | if (val & (1 << i)) 152 | cout << "1"; 153 | else 154 | cout << "0"; 155 | } 156 | } 157 | 158 | int atomic_apply2() 159 | { 160 | int a = 6; 161 | int b = 4; 162 | std::cout << "\n打印变量a的二进制\n" << a << ":\t"; 163 | printbinary(a); 164 | std::cout << "\n打印变量b的二进制\n" < 50) { 205 | return; 206 | } 207 | gpu_output[index] = data[index] * 100; 208 | 209 | 210 | } 211 | 212 | 213 | 214 | 215 | int atomic_apply3() { 216 | 217 | const int N = 50; 218 | int* gpu_buffer[2]; 219 | 220 | int* host_data = new int[N]; 221 | 222 | 223 | for (int i = 0; i < N; i++) { host_data[i] = i + 6; } 224 | std::cout << "\n打印初始化" << endl; 225 | for (int i = 0; i < N; i++) { std::cout << host_data[i] << "\t"; } 226 | 227 | 228 | 229 | cudaMalloc((void**)&gpu_buffer[0], N * sizeof(int)); 230 | 231 | cudaMalloc((void**)&gpu_buffer[1], N * sizeof(int)); 232 | 233 | 234 | cudaMemcpy(gpu_buffer[0], host_data, N * sizeof(int), cudaMemcpyHostToDevice); 235 | kernel << <1, N >> > (gpu_buffer[0], gpu_buffer[1]); 236 | 237 | 238 | int* cpu_output = new int[N]; 239 | 240 | cudaMemcpy(cpu_output, gpu_buffer[1], N * sizeof(int), cudaMemcpyDeviceToHost); 241 | 242 | std::cout << "\n打印输出结果" << endl; 243 | for (int i = 0; i < N; i++) { std::cout << cpu_output[i] << "\t"; } 244 | 245 | 246 | 247 | return 0; 248 | } 249 | 250 | 251 | 252 | 253 | 254 | __global__ void kernel2(int* data, int* gpu_output, int N) { 255 | 256 | int count = data[0]; 257 | //printf("count:%d\n", count); 258 | 259 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 260 | 261 | if ((tid + 1) % 2 != 0) { return; } 262 | 263 | 264 | int index = atomicAdd(gpu_output, 1); 265 | 266 | //printf("index:%d\n ", (index)); 267 | 268 | if (index >= N / 2) return; 269 | 270 | gpu_output[index] = data[tid]; 271 | 272 | 273 | } 274 | 275 | 276 | int atomic_apply4() { 277 | 278 | 279 | const int N = 50; //使用偶数验证 280 | int* gpu_buffer[2]; 281 | 282 | int* host_data = new int[N]; 283 | 284 | 285 | for (int i = 0; i < N; i++) { host_data[i] = i + 1; } 286 | std::cout << "\n打印初始化" << endl; 287 | for (int i = 0; i < N; i++) { std::cout << host_data[i] << "\t"; } 288 | 289 | 290 | 291 | cudaMalloc((void**)&gpu_buffer[0], N * sizeof(int)); 292 | 293 | cudaMalloc((void**)&gpu_buffer[1], (N / 2) * sizeof(int)); 294 | 295 | 296 | cudaMemcpy(gpu_buffer[0], host_data, N * sizeof(int), cudaMemcpyHostToDevice); 297 | cudaStream_t stream; 298 | cudaStreamCreate(&stream); //stream初始化 299 | 300 | //cudaMemsetAsync(gpu_buffer[1], 0, sizeof(int) * N/2,stream); 301 | kernel2 << <1, N, 0, stream >> > (gpu_buffer[0], gpu_buffer[1], N); 302 | 303 | 304 | int* cpu_output = new int[N / 2]; 305 | 306 | cudaMemcpy(cpu_output, gpu_buffer[1], N / 2 * sizeof(int), cudaMemcpyDeviceToHost); 307 | 308 | std::cout << "\n打印输出结果" << endl; 309 | for (int i = 0; i < N / 2; i++) { std::cout << cpu_output[i] << "\t"; } 310 | 311 | 312 | 313 | return 0; 314 | } 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | //使用原子操作,计数 323 | 324 | __global__ void countValues(float* list, int* count, int n) 325 | { 326 | int i = blockIdx.x * blockDim.x + threadIdx.x; 327 | if (i < n) { 328 | atomicAdd(count, 1); 329 | 330 | } 331 | } 332 | 333 | int atomic_apply5() 334 | { 335 | // 假设你已经有一个包含n个数值的列表list 336 | 337 | const int n = 32; 338 | 339 | float* d_list = nullptr; 340 | 341 | 342 | 343 | int count = 0; 344 | cudaMallocHost((void**)&d_list, n * sizeof(float)); 345 | for (int i = 0; i < n; i++) { d_list[i] = i + 1; } 346 | std::cout << 
"d_list:" << endl; 347 | for (int i = 0; i < n; i++) { std::cout << d_list[i] << "\t"; } 348 | 349 | 350 | int* d_count = nullptr; 351 | cudaMalloc((void**)&d_count, sizeof(int)); 352 | 353 | cudaMemcpy(d_count, &count, sizeof(int), cudaMemcpyHostToDevice); 354 | 355 | // 定义块和线程的数量 356 | int blockSize = 256; 357 | int numBlocks = (n + blockSize - 1) / blockSize; 358 | 359 | // 调用核函数 360 | countValues << > > (d_list, d_count, n); 361 | 362 | // 将计数器的值从设备端复制回主机端 363 | cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDeviceToHost); 364 | 365 | // 输出结果 366 | printf("Number of non-zero values: %d\n", count); 367 | 368 | // 释放内存 369 | cudaFree(d_list); 370 | cudaFree(d_count); 371 | 372 | return 0; 373 | } 374 | 375 | 376 | 377 | 378 | __global__ void kernel(int* data) { 379 | int tid = threadIdx.x; 380 | atomicAnd(&data[tid], 0x0F); 381 | } 382 | 383 | int atomic_apply6() { 384 | int data[16] = { 0 }; 385 | int* d_data; 386 | cudaMalloc(&d_data, sizeof(int) * 16); 387 | cudaMemcpy(d_data, data, sizeof(int) * 16, cudaMemcpyHostToDevice); 388 | 389 | kernel << <1, 16 >> > (d_data); 390 | 391 | cudaMemcpy(data, d_data, sizeof(int) * 16, cudaMemcpyDeviceToHost); 392 | cudaFree(d_data); 393 | 394 | for (int i = 0; i < 16; ++i) { 395 | printf("%d ", data[i]); 396 | } 397 | printf("\n"); 398 | 399 | return 0; 400 | } 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | int main_nine() { 410 | 411 | 412 | //atomic_apply1(); 413 | atomic_apply2(); 414 | //atomic_apply3(); 415 | //atomic_apply4(); 416 | //atomic_apply5(); 417 | //atomic_apply6(); 418 | return 0; 419 | } 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | -------------------------------------------------------------------------------- /tutorials_seven.cu: -------------------------------------------------------------------------------- 1 | /*! 
2 | 3 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 4 | 5 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 6 | 7 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 8 | 9 | 可以保证,认真学完教程,cuda编程毫无压力。 10 | 11 | 详情请链接:http://t.csdn.cn/NaCZ5 12 | 13 | 14 | @Description : CUDA kenel计算应用示例篇 15 | @Author : tangjun 16 | @Date : 17 | */ 18 | 19 | 20 | 21 | 22 | 23 | #include 24 | #include 25 | #include "opencv2/highgui.hpp" //实际上在/usr/include下 26 | #include "opencv2/opencv.hpp" 27 | #include "device_launch_parameters.h" 28 | #include 29 | #include 30 | using namespace cv; 31 | using namespace std; 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | __global__ void hello_from_gpu() 40 | { 41 | const int blockid = blockIdx.x; 42 | const int threadid = threadIdx.x; 43 | printf("block index %d and thread idex %d!\n", blockid, threadid); 44 | } 45 | int kernel_apply1(void) 46 | { 47 | hello_from_gpu << <6, 5 >> > (); 48 | cudaDeviceSynchronize(); 49 | return 0; 50 | } 51 | 52 | 53 | 54 | __global__ void VecAdd1(int* A, int* B, int* C) 55 | { 56 | int i = threadIdx.x; 57 | C[i] = A[i] + B[i]; 58 | } 59 | 60 | int kernel_apply2() 61 | { 62 | int m = 8; 63 | int* a, * b, * c; 64 | //分配host内存 65 | cudaMallocHost((void**)&a, sizeof(int) * m); 66 | cudaMallocHost((void**)&b, sizeof(int) * m); 67 | cudaMallocHost((void**)&c, sizeof(int) * m); 68 | 69 | std::cout << "value of a:" << endl; 70 | for (int i = 0; i < m; i++) { 71 | a[i] = rand() % 256; 72 | std::cout << a[i] << "\t"; 73 | } 74 | std::cout << "\nvalue of b:" << endl; 75 | for (int i = 0; i < m; i++) { 76 | b[i] = rand() % 260; 77 | std::cout << b[i] << "\t"; 78 | } 79 | 80 | int* g_a, * g_b, * g_c; 81 | //分配gpu内存 82 | cudaMalloc((void**)&g_a, sizeof(int) * m); 83 | cudaMalloc((void**)&g_b, sizeof(int) * m); 84 | cudaMalloc((void**)&g_c, sizeof(int) * m); 85 | // 赋值 86 | cudaMemcpy(g_a, a, sizeof(int) * m, cudaMemcpyHostToDevice); 87 | cudaMemcpy(g_b, b, sizeof(int) * m, cudaMemcpyHostToDevice); 88 | 89 | dim3 dimGrid(1); 90 | dim3 dimBlock(m); 91 | 92 | //应用grid只有x方向一个block,block只有x方向m个third 93 | VecAdd1 << > > (g_a, g_b, g_c); 94 | //VecAdd1 << > > (g_a, g_b, g_c); 95 | //VecAdd1 << <1, m >> > (g_a, g_b, g_c); 96 | 97 | //将g_c赋值给c 98 | cudaMemcpy(c, g_c, sizeof(int) * m, cudaMemcpyDeviceToHost); 99 | //打印 100 | std::cout << "\nvalue of c:" << endl; 101 | for (int i = 0; i < m; i++) { 102 | std::cout << c[i] << "\t"; 103 | } 104 | 105 | 106 | 107 | 108 | 109 | //释放内存 110 | cudaFree(g_a); 111 | cudaFree(g_b); 112 | cudaFree(g_c); 113 | cudaFreeHost(a); 114 | cudaFreeHost(b); 115 | cudaFreeHost(c); 116 | 117 | return 0; 118 | } 119 | 120 | 121 | 122 | 123 | __global__ void MatAdd2(int A[8], int B[8], int C[8]) 124 | { 125 | int i = threadIdx.x; 126 | C[i] = A[i] + B[i]; 127 | printf("\ni=%i", i); 128 | 129 | //std::cout <<"核函数:" << std::endl; 130 | } 131 | 132 | 133 | int kernel_apply3() 134 | { 135 | const int m = 8; 136 | int a[m], b[m], c[m]; 137 | //int *a, *b, *c; 138 | //int* a, * b, c[m]; 139 | 140 | //分配host内存 141 | cudaMallocHost((void**)&a, sizeof(int) * m * m); 142 | cudaMallocHost((void**)&b, sizeof(int) * m); 143 | cudaMallocHost((void**)&c, sizeof(int) * m); 144 | 145 | std::cout << "value of a:" << endl; 146 | for (int i = 0; i < m; i++) { 147 | a[i] = rand() % 69; 148 | std::cout << a[i] << "\t"; 149 | } 150 | 151 | std::cout << "value of b:" << endl; 152 | for (int j = 0; j < m; j++) { 153 | b[j] = rand() % 25; 154 | std::cout << b[j] << "\t"; 155 | } 156 | 157 | int* g_a, * g_b, * g_c; 158 | 159 | //分配gpu内存 160 | cudaMalloc((void**)&g_a, sizeof(int) * m * m); 
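/* Caution: a, b and c were declared above as fixed-size stack arrays, so the
   cudaMallocHost calls on (void**)&a etc. overwrite the start of those arrays
   with the pinned-buffer pointers, and the pinned buffers themselves are never
   used or freed. The sizes are also inconsistent: g_a is allocated m * m ints
   here, but only m ints are copied into it below. Declaring a, b and c as int*
   and letting cudaMallocHost allocate them (as kernel_apply2 does) would make
   this example consistent. */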
161 | cudaMalloc((void**)&g_b, sizeof(int) * m); 162 | cudaMalloc((void**)&g_c, sizeof(int) * m); 163 | // 赋值 164 | cudaMemcpy(g_a, a, sizeof(int) * m, cudaMemcpyHostToDevice); 165 | cudaMemcpy(g_b, b, sizeof(int) * m, cudaMemcpyHostToDevice); 166 | 167 | 168 | MatAdd2 << <1, m >> > (g_a, g_b, g_c); 169 | //cudaDeviceSynchronize(); 170 | 171 | cudaMemcpy(c, g_c, sizeof(int) * m, cudaMemcpyDeviceToHost); 172 | 173 | std::cout << "value of c:" << endl; 174 | for (int j = 0; j < m; j++) { 175 | std::cout << c[j] << "\t"; 176 | } 177 | return 0; 178 | } 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | //用于CV读取图片BGR通道将其改为RGB方法 191 | __global__ void rgb2grayincuda(uchar3* const d_in, unsigned char* const d_out, 192 | uint imgheight, uint imgwidth) 193 | { 194 | const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; //w 195 | const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; //h 196 | 197 | if (idx < imgwidth && idy < imgheight) //有的线程会跑到图像外面去,不执行即可 198 | { 199 | uchar3 rgb = d_in[idy * imgwidth + idx]; 200 | d_out[idy * imgwidth + idx] = 0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z; 201 | 202 | } 203 | } 204 | 205 | 206 | void show_img(Mat img) { 207 | cv::imshow("Image", img); 208 | cv::waitKey(1000); 209 | cv::destroyAllWindows(); 210 | 211 | } 212 | 213 | 214 | void kernel_apply4() { 215 | Mat srcImage = imread("image.jpg"); 216 | show_img(srcImage); 217 | 218 | const uint imgheight = srcImage.rows; 219 | const uint imgwidth = srcImage.cols; 220 | 221 | Mat grayImage(imgheight, imgwidth, CV_8UC1, Scalar(0)); 222 | 223 | 224 | 225 | uchar3* d_in; //向量类型,3个uchar 226 | unsigned char* d_out; 227 | 228 | cudaMalloc((void**)&d_in, imgheight * imgwidth * sizeof(uchar3)); 229 | cudaMalloc((void**)&d_out, imgheight * imgwidth * sizeof(unsigned char)); 230 | 231 | 232 | cudaMemcpy(d_in, srcImage.data, imgheight * imgwidth * sizeof(uchar3), cudaMemcpyHostToDevice); 233 | 234 | //说明:(imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x表示x方向 235 | dim3 threadsPerBlock(32, 32); 236 | dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x, (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y); 237 | 238 | 239 | //启动内核 240 | rgb2grayincuda << > > (d_in, d_out, imgheight, imgwidth); 241 | 242 | //执行内核是一个异步操作,因此需要同步以测量准确时间 243 | cudaDeviceSynchronize(); 244 | 245 | 246 | 247 | 248 | //拷贝回来数据 249 | cudaMemcpy(grayImage.data, d_out, imgheight * imgwidth * sizeof(unsigned char), cudaMemcpyDeviceToHost); 250 | 251 | //释放显存 252 | cudaFree(d_in); 253 | cudaFree(d_out); 254 | 255 | imshow("grayImage", grayImage); 256 | cv::waitKey(1000); 257 | cv::destroyAllWindows(); 258 | 259 | 260 | } 261 | 262 | 263 | 264 | 265 | typedef struct { 266 | int width; 267 | int height; 268 | float* elements; 269 | 270 | }Matrix; 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | void main_seven() 281 | { 282 | 283 | //kernel_apply1(); 284 | //kernel_apply2(); 285 | //kernel_apply2(); 286 | kernel_apply4(); 287 | 288 | 289 | 290 | } 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | -------------------------------------------------------------------------------- /tutorials_six.cu: -------------------------------------------------------------------------------- 1 |  2 | /*! 
3 | 4 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 5 | 6 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 7 | 8 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 9 | 10 | 可以保证,认真学完教程,cuda编程毫无压力。 11 | 12 | 详情请链接:http://t.csdn.cn/NaCZ5 13 | 14 | 15 | 16 | @Description : CUDA矩阵的加减乘除 17 | @Author : tangjun 18 | @Date : 19 | */ 20 | 21 | 22 | 23 | #include 24 | #include 25 | #include "opencv2/highgui.hpp" //实际上在/usr/include下 26 | #include "opencv2/opencv.hpp" 27 | #include "device_launch_parameters.h" 28 | #include 29 | #include 30 | using namespace cv; 31 | using namespace std; 32 | 33 | 34 | 35 | 36 | 37 | void Print_2dim(int* ptr, int m, int n) { 38 | std::cout << "result:\n"; 39 | for (int i = 0; i < m; i++) { 40 | for (int j = 0; j < n; j++) { 41 | std::cout << "\t" << ptr[i * n + j]; 42 | } 43 | std::cout << "\n"; 44 | } 45 | } 46 | 47 | __global__ void gpu_matrix_plus_thread(int* a, int* b, int* c) 48 | { 49 | //方法一:通过id方式计算 50 | //grid为2维度,block为2维度,使用公式id=blocksize * blockid + threadid 51 | int blocksize = blockDim.x * blockDim.y; 52 | int blockid = gridDim.x * blockIdx.y + blockIdx.x; 53 | int threadid = blockDim.x * threadIdx.y + threadIdx.x; 54 | int id = blocksize * blockid + threadid; 55 | 56 | c[id] = a[id] + b[id]; 57 | 58 | } 59 | 60 | 61 | __global__ void gpu_matrix_plus1(int* a, int* b, int* c, int m, int n) 62 | { //方法二:通过row与col的方式计算-->通过变换列给出id 63 | int row = blockIdx.y * blockDim.y + threadIdx.y; 64 | int col = blockIdx.x * blockDim.x + threadIdx.x; 65 | c[row * n + col] = a[row * n + col] + b[row * n + col]; 66 | } 67 | 68 | 69 | __global__ void gpu_matrix_plus2(int* a, int* b, int* c, int m, int n) 70 | { //方法三:通过row与col的方式计算-->通过变换行给出id 71 | int row = blockIdx.y * blockDim.y + threadIdx.y; 72 | int col = blockIdx.x * blockDim.x + threadIdx.x; 73 | c[row + col * m] = a[row + col * m] + b[row + col * m]; 74 | } 75 | 76 | 77 | void init_variables(int* a, int* b, int m, int n) { 78 | 79 | //初始化变量 80 | std::cout << "value of a:" << endl; 81 | for (int i = 0; i < m; i++) { 82 | for (int j = 0; j < n; j++) { 83 | a[i * n + j] = rand() % 256; 84 | std::cout << "\t" << a[i * n + j]; 85 | 86 | } 87 | std::cout << "\n"; 88 | } 89 | std::cout << "value of b:" << endl; 90 | for (int i = 0; i < m; i++) { 91 | for (int j = 0; j < n; j++) { 92 | b[i * n + j] = rand() % 256; 93 | std::cout << "\t" << b[i * n + j]; 94 | } 95 | std::cout << "\n"; 96 | } 97 | std::cout << "value of a+b:" << endl; 98 | for (int i = 0; i < m; i++) { 99 | for (int j = 0; j < n; j++) { 100 | 101 | std::cout << "\t" << a[i * n + j] + b[i * n + j]; 102 | } 103 | std::cout << "\n"; 104 | } 105 | 106 | 107 | } 108 | 109 | int kernel_plus() 110 | { 111 | /* 112 | matrix a[m,n], matrix b[m,n] 113 | a[m,n]+b[m,n]=[m,n] 114 | */ 115 | 116 | const int BLOCK_SIZE = 2; 117 | int m = 8; //行 118 | int n = 10; //列 119 | int* a, * b; 120 | //分配host内存 121 | cudaMallocHost((void**)&a, sizeof(int) * m * n); 122 | cudaMallocHost((void**)&b, sizeof(int) * m * n); 123 | 124 | init_variables(a, b, m, n);//随机初始化变量 125 | 126 | int* g_a, * g_b; 127 | //分配gpu内存 128 | cudaMalloc((void**)&g_a, sizeof(int) * m * n); 129 | cudaMalloc((void**)&g_b, sizeof(int) * m * n); 130 | cudaMemcpy(g_a, a, sizeof(int) * m * n, cudaMemcpyHostToDevice); 131 | cudaMemcpy(g_b, b, sizeof(int) * m * n, cudaMemcpyHostToDevice); 132 | 133 | unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE; //行写给y 134 | unsigned int grid_cols = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; //列写给x 135 | dim3 dimGrid(grid_cols, grid_rows); 136 | dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); 137 
| 138 | 139 | std::cout << "gridDIM.x:" << grid_cols << "\tgridDIM.y:" << grid_rows << endl; 140 | std::cout << "blockDIM.x:" << BLOCK_SIZE << "\tblockDIM.y:" << BLOCK_SIZE << endl; 141 | 142 | 143 | 144 | 145 | int* c1, * g_c; 146 | cudaMalloc((void**)&g_c, sizeof(int) * m * n); 147 | cudaMallocHost((void**)&c1, sizeof(int) * m * n); 148 | gpu_matrix_plus_thread << > > (g_a, g_b, g_c); 149 | cudaMemcpy(c1, g_c, sizeof(int) * m * n, cudaMemcpyDeviceToHost); 150 | Print_2dim(c1, m, n); 151 | 152 | 153 | int* c2, * g_c2; 154 | cudaMallocHost((void**)&c2, sizeof(int) * m * n); 155 | cudaMalloc((void**)&g_c2, sizeof(int) * m * n); 156 | gpu_matrix_plus1 << > > (g_a, g_b, g_c2, m, n); 157 | cudaMemcpy(c2, g_c2, sizeof(int) * m * n, cudaMemcpyDeviceToHost); //将device端转host端 158 | Print_2dim(c2, m, n); 159 | 160 | int* c3, * g_c3; 161 | cudaMallocHost((void**)&c3, sizeof(int) * m * n); 162 | cudaMalloc((void**)&g_c3, sizeof(int) * m * n); 163 | gpu_matrix_plus2 << > > (g_a, g_b, g_c3, m, n); 164 | cudaMemcpy(c3, g_c3, sizeof(int) * m * n, cudaMemcpyDeviceToHost); //将device端转host端 165 | Print_2dim(c3, m, n); 166 | 167 | //释放内存 168 | cudaFree(g_a); 169 | cudaFree(g_b); 170 | cudaFree(g_c); 171 | cudaFreeHost(a); 172 | cudaFreeHost(b); 173 | cudaFreeHost(c1); 174 | 175 | return 0; 176 | } 177 | 178 | 179 | __global__ void gpu_matrix_mult(int* a, int* b, int* c, int m, int n, int k) 180 | { 181 | int row = blockIdx.y * blockDim.y + threadIdx.y; // 行线程 y 182 | int col = blockIdx.x * blockDim.x + threadIdx.x; // 列线程 x 183 | int sum = 0; 184 | if (col < k && row < m) { 185 | for (int i = 0; i < n; i++) { 186 | sum += a[row * n + i] * b[i * k + col]; //看出row与col不动的方式计算 187 | } 188 | c[row * k + col] = sum; 189 | } 190 | } 191 | 192 | __global__ void gpu_matrix_multiply_thread(int* a, int* b, int* c, int m, int n, int k) 193 | { 194 | // [m*n]* [n*k]<---m n 195 | //方法一:通过id方式计算 196 | //grid为2维度,block为2维度,使用公式id=blocksize * blockid + threadid 197 | int blocksize = blockDim.x * blockDim.y; 198 | int blockid = gridDim.x * blockIdx.y + blockIdx.x; 199 | int threadid = blockDim.x * threadIdx.y + threadIdx.x; 200 | int id = blocksize * blockid + threadid; 201 | 202 | int row = id / k; 203 | int col = id % k; 204 | int sum = 0; 205 | 206 | for (int i = 0; i < n; i++) { 207 | sum += a[row * n + i] * b[i * k + col]; 208 | 209 | } 210 | c[row * k + col] = sum; 211 | 212 | 213 | } 214 | 215 | int kernel_multiply() 216 | { 217 | /* 218 | matrix a[m,n], matrix b[n,k] 219 | a[m,n]*b[n,k]=[m,k] 220 | */ 221 | 222 | const int BLOCK_SIZE = 2; 223 | int m = 8; //行 224 | int n = 4; //中间变量 225 | int k = 10; //列 226 | int* a, * b; 227 | 228 | // 初始化 a与b 229 | cudaMallocHost((void**)&a, sizeof(int) * m * n); 230 | cudaMallocHost((void**)&b, sizeof(int) * n * k); 231 | 232 | std::cout << "value of a:" << endl; 233 | for (int i = 0; i < m; i++) { 234 | for (int j = 0; j < n; j++) { 235 | a[i * n + j] = rand() % 6; 236 | std::cout << "\t" << a[i * n + j]; 237 | } 238 | std::cout << "\n"; 239 | } 240 | 241 | std::cout << "value of b:" << endl; 242 | for (int i = 0; i < n; i++) { 243 | for (int j = 0; j < k; j++) { 244 | b[i * k + j] = rand() % 10; 245 | std::cout << "\t" << b[i * k + j]; 246 | } 247 | std::cout << "\n"; 248 | } 249 | 250 | //a*b相乘 251 | std::cout << "value of a*b:" << endl; 252 | for (int i = 0; i < m; i++) { 253 | for (int j = 0; j < k; j++) { 254 | int tmp = 0; 255 | for (int h = 0; h < n; h++) { 256 | tmp += a[i * n + h] * b[h * k + j]; 257 | } 258 | //c[i * k + j] = tmp; 259 | std::cout << "\t" << tmp; 260 | } 
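/* one row of the CPU reference product is finished; the GPU kernels below
   (gpu_matrix_mult and gpu_matrix_multiply_thread) should reproduce these values */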
261 | std::cout << "\n"; 262 | } 263 | 264 | 265 | 266 | 267 | int* g_a, * g_b; 268 | 269 | cudaMalloc((void**)&g_a, sizeof(int) * m * n); 270 | cudaMalloc((void**)&g_b, sizeof(int) * n * k); 271 | cudaMemcpy(g_a, a, sizeof(int) * m * n, cudaMemcpyHostToDevice); 272 | cudaMemcpy(g_b, b, sizeof(int) * k * n, cudaMemcpyHostToDevice); 273 | unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE; //行写给y 274 | unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE; //列写给x 275 | dim3 dimGrid(grid_cols, grid_rows); 276 | dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); 277 | std::cout << "gridDIM.x:" << grid_cols << "\tgridDIM.y:" << grid_rows << endl; 278 | std::cout << "blockDIM.x:" << BLOCK_SIZE << "\tblockDIM.y:" << BLOCK_SIZE << endl; 279 | 280 | // 使用row与col计算 281 | int* c1, * g_c; 282 | cudaMalloc((void**)&g_c, sizeof(int) * m * k); 283 | cudaMallocHost((void**)&c1, sizeof(int) * m * k); 284 | gpu_matrix_mult << > > (g_a, g_b, g_c, m, n, k); 285 | cudaMemcpy(c1, g_c, sizeof(int) * m * k, cudaMemcpyDeviceToHost); 286 | Print_2dim(c1, m, k); 287 | cudaFree(g_c); 288 | cudaFreeHost(c1); 289 | 290 | //使用id计算 291 | int* c2, * g_c2; 292 | cudaMalloc((void**)&g_c2, sizeof(int) * m * k); 293 | cudaMallocHost((void**)&c2, sizeof(int) * m * k); 294 | gpu_matrix_multiply_thread << > > (g_a, g_b, g_c2, m, n, k); 295 | cudaMemcpy(c2, g_c2, sizeof(int) * m * k, cudaMemcpyDeviceToHost); 296 | Print_2dim(c2, m, k); 297 | cudaFree(g_c2); 298 | cudaFreeHost(c2); 299 | 300 | //释放内存 301 | cudaFree(g_a); 302 | cudaFree(g_b); 303 | cudaFreeHost(a); 304 | cudaFreeHost(b); 305 | 306 | 307 | return 0; 308 | } 309 | 310 | void main_six() { 311 | kernel_plus(); 312 | kernel_multiply(); 313 | 314 | } 315 | 316 | 317 | 318 | 319 | -------------------------------------------------------------------------------- /tutorials_ten.cu: -------------------------------------------------------------------------------- 1 | /*! 2 | 3 | 我的教程优势:一系列且强逻辑顺序教程,附有源码,实战性很强。 4 | 5 | 只有核心cuda处理代码,隐藏在教程中,我将不开源,毕竟已开源很多cuda教程代码,也为本次教程付出很多汗水。 6 | 7 | 因此,核心代码于yolo部署cuda代码和整个文字解释教程需要有一定补偿,望理解。 8 | 9 | 可以保证,认真学完教程,cuda编程毫无压力。 10 | 11 | 详情请链接:http://t.csdn.cn/NaCZ5 12 | 13 | 14 | 15 | 16 | @Description : stream 17 | @Author : tangjun 18 | @Date : 2023-08-23 19 | */ 20 | 21 | 22 | 23 | 24 | #include "cuda_runtime.h" 25 | #include 26 | #include 27 | #include 28 | using namespace std; 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | __global__ void kernel_one(int* a, int* b, int* c) 37 | { 38 | int threadID = blockIdx.x * blockDim.x + threadIdx.x; 39 | //printf("threadID:%d\n", threadID); 40 | 41 | c[threadID] = a[threadID] + b[threadID]; 42 | 43 | } 44 | 45 | int stream_apply1() 46 | { 47 | int N = 32; 48 | const int FULL_DATA_SIZE = N * 2; 49 | //获取设备属性 50 | cudaDeviceProp prop; 51 | int deviceID; 52 | cudaGetDevice(&deviceID); 53 | cudaGetDeviceProperties(&prop, deviceID); 54 | //检查设备是否支持重叠功能 55 | if (!prop.deviceOverlap) 56 | { 57 | printf("No device will handle overlaps. 
so no speed up from stream.\n"); 58 | return 0; 59 | } 60 | 61 | //启动计时器 62 | cudaEvent_t start, stop; 63 | float elapsedTime; 64 | cudaEventCreate(&start); 65 | cudaEventCreate(&stop); 66 | cudaEventRecord(start, 0); 67 | 68 | //创建一个CUDA流 69 | cudaStream_t stream; 70 | cudaStreamCreate(&stream); 71 | 72 | int* host_a, * host_b, * host_c; 73 | int* dev_a, * dev_b, * dev_c; 74 | 75 | //在GPU上分配内存 76 | cudaMalloc((void**)&dev_a, N * sizeof(int)); 77 | cudaMalloc((void**)&dev_b, N * sizeof(int)); 78 | cudaMalloc((void**)&dev_c, N * sizeof(int)); 79 | 80 | //在CPU上分配页锁定内存 81 | cudaHostAlloc((void**)&host_a, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault); 82 | cudaHostAlloc((void**)&host_b, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault); 83 | cudaHostAlloc((void**)&host_c, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault); 84 | 85 | //主机上的内存赋值 86 | for (int i = 0; i < FULL_DATA_SIZE; i++) 87 | { 88 | host_a[i] = i; 89 | host_b[i] = 10000 * i; 90 | } 91 | 92 | for (int i = 0; i < FULL_DATA_SIZE; i += N) 93 | { 94 | cudaMemcpyAsync(dev_a, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream); 95 | cudaMemcpyAsync(dev_b, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream); 96 | 97 | 98 | kernel_one << > > (dev_a, dev_b, dev_c); 99 | 100 | cudaMemcpyAsync(host_c + i, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost, stream); 101 | } 102 | 103 | // wait until gpu execution finish 104 | cudaStreamSynchronize(stream); 105 | 106 | cudaEventRecord(stop, 0); 107 | cudaEventSynchronize(stop); 108 | cudaEventElapsedTime(&elapsedTime, start, stop); 109 | 110 | std::cout << "消耗时间: " << elapsedTime << std::endl; 111 | 112 | 113 | 114 | cout << "输入数据host_a" << endl; 115 | for (int i = 0; i < FULL_DATA_SIZE; i++) { std::cout << host_a[i] << "\t"; } 116 | cout << "\n输入数据host_b" << endl; 117 | for (int i = 0; i < FULL_DATA_SIZE; i++) { std::cout << host_b[i] << "\t"; } 118 | 119 | cout << "\n输出结果host_c" << endl; 120 | for (int i = 0; i < FULL_DATA_SIZE; i++) {std::cout << host_c[i] << "\t"; } 121 | 122 | getchar(); 123 | 124 | // free stream and mem 125 | cudaFreeHost(host_a); 126 | cudaFreeHost(host_b); 127 | cudaFreeHost(host_c); 128 | 129 | cudaFree(dev_a); 130 | cudaFree(dev_b); 131 | cudaFree(dev_c); 132 | 133 | cudaStreamDestroy(stream); 134 | return 0; 135 | } 136 | 137 | int stream_apply2() 138 | { 139 | const int NS = 4; 140 | const int ND = 32; 141 | 142 | //创建CUDA流与初始化 143 | cudaStream_t streams[NS]; 144 | for (int i = 0; i < NS; i++) { cudaStreamCreate(&streams[i]); } 145 | 146 | 147 | int* host_a, * host_b, * host_c; 148 | int* dev_a, * dev_b, * dev_c; 149 | 150 | //在GPU上分配内存 151 | //cudaMalloc((void**)&dev_a, ND * sizeof(int)); 152 | //cudaMalloc((void**)&dev_b, ND * sizeof(int)); 153 | //cudaMalloc((void**)&dev_c, ND * sizeof(int)); 154 | 155 | 156 | cudaMalloc((void**)&dev_a, ND * NS * sizeof(int)); 157 | cudaMalloc((void**)&dev_b, ND * NS * sizeof(int)); 158 | cudaMalloc((void**)&dev_c, ND * NS * sizeof(int)); 159 | 160 | //在CPU上分配页锁定内存 161 | cudaHostAlloc((void**)&host_a, ND*NS * sizeof(int), cudaHostAllocDefault); 162 | cudaHostAlloc((void**)&host_b, ND*NS * sizeof(int), cudaHostAllocDefault); 163 | cudaHostAlloc((void**)&host_c, ND*NS * sizeof(int), cudaHostAllocDefault); 164 | 165 | //主机上的内存赋值 166 | for (int i = 0; i < ND * NS; i++) { 167 | host_a[i] = i; 168 | host_b[i] = 10000 * i; } 169 | 170 | for (int i = 0; i < NS; i++) { 171 | cudaMemcpyAsync(dev_a + i * ND, host_a + i * ND, ND * sizeof(int), cudaMemcpyHostToDevice, streams[i]); 172 | cudaMemcpyAsync(dev_b + i * 
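/* per-stream offset: each stream owns its own ND-element slice of dev_a, dev_b and dev_c,
   unlike stream_apply3 below, where all streams share one device buffer and can race */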
int stream_apply3()
{
    const int NS = 4;   // number of streams
    const int ND = 32;  // elements each stream is responsible for

    cudaStream_t streams[NS];                                        // create several cuda streams
    for (int i = 0; i < NS; i++) { cudaStreamCreate(&streams[i]); }  // initialize each stream

    int* host_a, * host_b, * host_c;  // host-side variables
    int* dev_a, * dev_b, * dev_c;     // gpu-side variables

    // allocate device memory; note only ND elements are allocated and the
    // same buffers are reused by every stream (compare with stream_apply2)
    cudaMalloc((void**)&dev_a, ND * sizeof(int));
    cudaMalloc((void**)&dev_b, ND * sizeof(int));
    cudaMalloc((void**)&dev_c, ND * sizeof(int));

    // allocate page-locked host memory; cudaHostAlloc is required for async copies
    cudaHostAlloc((void**)&host_a, ND * NS * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&host_b, ND * NS * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&host_c, ND * NS * sizeof(int), cudaHostAllocDefault);

    // fill the host buffers
    for (int i = 0; i < ND * NS; i++) {
        host_a[i] = i;
        host_b[i] = 10000 * i;
    }
    // loop over the streams, queueing the copies and the kernel on each one;
    // caution: because every stream shares dev_a/dev_b/dev_c, concurrent streams
    // can race on these buffers -- stream_apply2 avoids this with per-stream offsets
    for (int i = 0; i < NS; i++) {
        cudaMemcpyAsync(dev_a, host_a + i * ND, ND * sizeof(int), cudaMemcpyHostToDevice, streams[i]);
        cudaMemcpyAsync(dev_b, host_b + i * ND, ND * sizeof(int), cudaMemcpyHostToDevice, streams[i]);
        // launch configuration reconstructed (assumption): one block of ND threads per stream
        kernel_one<<<1, ND, 0, streams[i]>>>(dev_a, dev_b, dev_c);
        cudaMemcpyAsync(host_c + i * ND, dev_c, ND * sizeof(int), cudaMemcpyDeviceToHost, streams[i]);
    }

    // wait until all async gpu work finishes before the cpu touches the results
    cudaDeviceSynchronize();

    // print the results
    cout << "input data host_a" << endl;
    for (int i = 0; i < ND * NS; i++) { std::cout << host_a[i] << "\t"; }
    cout << "\ninput data host_b" << endl;
    for (int i = 0; i < ND * NS; i++) { std::cout << host_b[i] << "\t"; }
    cout << "\noutput host_c" << endl;
    for (int i = 0; i < ND * NS; i++) { std::cout << host_c[i] << "\t"; }

    // free streams and memory
    cudaFreeHost(host_a);
    cudaFreeHost(host_b);
    cudaFreeHost(host_c);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    for (int i = 0; i < NS; i++) { cudaStreamDestroy(streams[i]); }
    return 0;
}

int main_ten() {

    //stream_apply1();
    stream_apply2();
    stream_apply3();

    return 0;
}
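/* Editor's sketch (assumption, not part of the original file): stream_apply3 reuses one
   set of device buffers across all four streams, so copies and kernels issued on different
   streams can race on dev_a/dev_b/dev_c. Besides the per-stream offsets of stream_apply2,
   another common fix is to give each stream private device buffers: */
int stream_apply3_fixed()
{
    const int NS = 4;
    const int ND = 32;
    cudaStream_t streams[NS];
    int* dev_a[NS], * dev_b[NS], * dev_c[NS];  // one buffer set per stream
    int* host_a, * host_b, * host_c;

    cudaHostAlloc((void**)&host_a, ND * NS * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&host_b, ND * NS * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&host_c, ND * NS * sizeof(int), cudaHostAllocDefault);
    for (int i = 0; i < ND * NS; i++) { host_a[i] = i; host_b[i] = 10000 * i; }

    for (int i = 0; i < NS; i++) {
        cudaStreamCreate(&streams[i]);
        cudaMalloc((void**)&dev_a[i], ND * sizeof(int));
        cudaMalloc((void**)&dev_b[i], ND * sizeof(int));
        cudaMalloc((void**)&dev_c[i], ND * sizeof(int));
    }
    for (int i = 0; i < NS; i++) {
        cudaMemcpyAsync(dev_a[i], host_a + i * ND, ND * sizeof(int), cudaMemcpyHostToDevice, streams[i]);
        cudaMemcpyAsync(dev_b[i], host_b + i * ND, ND * sizeof(int), cudaMemcpyHostToDevice, streams[i]);
        kernel_one<<<1, ND, 0, streams[i]>>>(dev_a[i], dev_b[i], dev_c[i]);  // same assumed launch shape as above
        cudaMemcpyAsync(host_c + i * ND, dev_c[i], ND * sizeof(int), cudaMemcpyDeviceToHost, streams[i]);
    }
    cudaDeviceSynchronize();  // no stream ever touches another stream's buffers

    for (int i = 0; i < NS; i++) {
        cudaFree(dev_a[i]); cudaFree(dev_b[i]); cudaFree(dev_c[i]);
        cudaStreamDestroy(streams[i]);
    }
    cudaFreeHost(host_a); cudaFreeHost(host_b); cudaFreeHost(host_c);
    return 0;
}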
--------------------------------------------------------------------------------
/tutorials_three.cu:
--------------------------------------------------------------------------------

/*!
My tutorials' advantage: a logically ordered series with source code and a strong practical focus.

Only the core cuda processing code is withheld inside the tutorial; I have already open-sourced a great deal of cuda tutorial code and put a lot of sweat into this series.

Therefore the core code (the yolo-deployment cuda code) and the full written tutorial require a small compensation -- thank you for understanding.

I can promise that if you study the tutorials carefully, cuda programming will be no trouble at all.

Details: http://t.csdn.cn/NaCZ5

@Description : CUDA compiler environment configuration
@Author : tangjun
@Date :
*/

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

int check_cuda_main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    int dev;
    for (dev = 0; dev < deviceCount; dev++)
    {
        int driver_version(0), runtime_version(0);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        if (dev == 0)
            if (deviceProp.minor == 9999 && deviceProp.major == 9999)  // 9999.9999 marks a device without CUDA support
                printf("\n");
        printf("\nDevice%d:\"%s\"\n", dev, deviceProp.name);
        cudaDriverGetVersion(&driver_version);
        printf("CUDA driver version: %d.%d\n", driver_version / 1000, (driver_version % 1000) / 10);
        cudaRuntimeGetVersion(&runtime_version);
        printf("CUDA runtime version: %d.%d\n", runtime_version / 1000, (runtime_version % 1000) / 10);
        printf("Device compute capability: %d.%d\n", deviceProp.major, deviceProp.minor);
        printf("Total amount of Global Memory: %zu bytes\n", deviceProp.totalGlobalMem);
        printf("Number of SMs: %d\n", deviceProp.multiProcessorCount);
        printf("Total amount of Constant Memory: %zu bytes\n", deviceProp.totalConstMem);
        printf("Total amount of Shared Memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf("Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("Warp size: %d\n", deviceProp.warpSize);
        printf("Maximum number of threads per SM: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf("Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf("Maximum size of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("Maximum size of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("Maximum memory pitch: %zu bytes\n", deviceProp.memPitch);
        printf("Texture alignment: %zu bytes\n", deviceProp.texturePitchAlignment);
        printf("Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
        printf("Memory Clock rate: %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
    }

    return 0;
}

void main_three() {

    check_cuda_main();

}
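/* Editor's sketch (assumption, not part of the original file): the same
   cudaGetDeviceProperties query shown above can also drive device selection on
   multi-GPU machines, e.g. picking the GPU with the most SMs before any CUDA work: */
int pick_device_with_most_sms() {
    int count = 0;
    cudaGetDeviceCount(&count);
    int best = 0, best_sms = -1;
    for (int dev = 0; dev < count; dev++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        if (prop.multiProcessorCount > best_sms) {  // SM count as a crude speed proxy
            best_sms = prop.multiProcessorCount;
            best = dev;
        }
    }
    cudaSetDevice(best);  // later CUDA calls on this thread target this device
    return best;
}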
--------------------------------------------------------------------------------
/tutorials_yolo_part_postprocess.cu:
--------------------------------------------------------------------------------

/*!
My tutorials' advantage: a logically ordered series with source code and a strong practical focus.

Only the core cuda processing code is withheld inside the tutorial; I have already open-sourced a great deal of cuda tutorial code and put a lot of sweat into this series.

Therefore the core code (the yolo-deployment cuda code) and the full written tutorial require a small compensation -- thank you for understanding.

I can promise that if you study the tutorials carefully, cuda programming will be no trouble at all.

Details: http://t.csdn.cn/NaCZ5

@Description : simulates the complete CUDA post-processing of the yolo output --> the final box, conf and cls_id; this part only provides the skeleton.
@Author : tangjun
@Date : 2023-08-10
*/

#include <iostream>
#include <cuda_runtime.h>
#include "opencv2/highgui.hpp" // actually lives under /usr/include
#include "opencv2/opencv.hpp"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
#include <cmath>
#include <vector>
#include <algorithm>
using namespace cv;
using namespace std;

struct nms_box {
    float x1, y1, x2, y2;  // top-left and bottom-right corners
    float score;           // confidence
    int cls_id;            // class index
};

void imitate_yolo_part_postprocess() {
    /*
    This code simulates the yolo post-processing pipeline.
    */

    float conf_thr = 0.3;  // confidence threshold
    float nms_thr = 0.1;   // boxes with iou > nms_thr are suppressed
    const int max_object = 6;

    int cls_num = 3;

    int anchor_output_num = 21;                     // number of candidate rows the model outputs, analogous to 25200
    int N_obj = anchor_output_num * (cls_num + 5);  // the model outputs x,y,w,h + conf + cls_num scores per row

    /*********************** simulate the yolo output ***********************/
    float* input_data = nullptr;
    cudaMallocHost((void**)&input_data, sizeof(float) * N_obj);

    // fill with random values
    for (int i = 0; i < N_obj; i++) {
        //input_data[i] = (float)(i+1) ;
        float value = rand() / float(RAND_MAX);
        input_data[i] = round(value * 10000) / 10000;
    }
    // overwrite the first six rows with hand-made boxes
    for (int i = 0; i < 6; i++) {
        int idx = i * (cls_num + 5);
        if (idx == 0) {
            input_data[idx] = 367.0;
            input_data[idx + 1] = 38.0;
            input_data[idx + 2] = 677.0;
            input_data[idx + 3] = 318.0;
            input_data[idx + 4] = 1.0;
        }
        else if (idx == 1 * (cls_num + 5)) {
            input_data[idx] = 502.0;
            input_data[idx + 1] = 38.0;
            input_data[idx + 2] = 731.0;
            input_data[idx + 3] = 318.0;
            input_data[idx + 4] = 1.0;
        }
        else if (idx == 2 * (cls_num + 5)) {
            input_data[idx] = 303.0;
            input_data[idx + 1] = 377.0;
            input_data[idx + 2] = 831.0;
            input_data[idx + 3] = 1071.0;
            input_data[idx + 4] = 1.0;
        }
        else if (idx == 3 * (cls_num + 5)) {
            input_data[idx] = 193.0;
            input_data[idx + 1] = 435.0;
            input_data[idx + 2] = 831.0;
            input_data[idx + 3] = 931.0;
            input_data[idx + 4] = 1.0;
        }
        else if (idx == 4 * (cls_num + 5)) {
            input_data[idx] = 1039.0;
            input_data[idx + 1] = 147.0;
            input_data[idx + 2] = 1471.0;
            input_data[idx + 3] = 557.0;
            input_data[idx + 4] = 1.0;
        }
        else if (idx == 5 * (cls_num + 5)) {
            input_data[idx] = 1339.0;
            input_data[idx + 1] = 1.0;
            input_data[idx + 2] = 1571.0;
            input_data[idx + 3] = 209.0;
            input_data[idx + 4] = 1.0;
        }
    }

    // print the raw data
    std::cout << "raw data after assignment:" << endl;
    for (int i = 0; i < N_obj; i++) {
        if (i % (cls_num + 5) == 0) { std::cout << endl; }
        std::cout << input_data[i] << "\t\t";
    }

    float* gpu_input = nullptr;
    cudaMalloc((void**)&gpu_input, sizeof(float) * N_obj);
    cudaMemcpy(gpu_input, input_data, sizeof(float) * N_obj, cudaMemcpyHostToDevice);

    /*********************** end: simulate the yolo output ***********************/
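    /* Editor's sketch (assumption; the author deliberately does not open-source
       decode_yolo_kernel): a kernel matching the commented launch further below would
       typically scan each (cls_num + 5)-float row, keep rows whose confidence passes
       conf_thr, pick the best class, and append [x1,y1,x2,y2,conf,cls_id] to the output
       through an atomic counter:

    __global__ void decode_yolo_kernel(float* input, float* output, int max_object,
                                       int cls_num, float conf_thr, int* count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per candidate row
        // NOTE: real code must also guard idx against the row count (e.g. anchor_output_num)
        float* row = input + idx * (cls_num + 5);
        float conf = row[4];
        if (conf < conf_thr) return;
        int best_cls = 0;
        float best_score = row[5];
        for (int c = 1; c < cls_num; c++)
            if (row[5 + c] > best_score) { best_score = row[5 + c]; best_cls = c; }
        int slot = atomicAdd(count, 1);                   // reserve an output slot
        if (slot >= max_object) return;
        float* out = output + slot * 6;
        out[0] = row[0]; out[1] = row[1];                 // x1, y1 (corner format, as in the mock data)
        out[2] = row[2]; out[3] = row[3];                 // x2, y2
        out[4] = conf;   out[5] = (float)best_cls;
    }
    */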
    /*********************** cuda variables, memory allocation and initialization ***********************/

    float* gpu_output = nullptr;
    cudaMalloc((void**)&gpu_output, sizeof(float) * max_object * 6);  // holds the processed yolo output, laid out as [max_object, [x1,y1,x2,y2,conf,cls_id]]

    nms_box* d_boxes = nullptr;
    cudaMalloc(&d_boxes, anchor_output_num * sizeof(nms_box));        // device copy of gpu_output repacked into the nms_box struct format
    nms_box* h_boxes = nullptr;
    cudaMallocHost(&h_boxes, anchor_output_num * sizeof(nms_box));    // host-side counterpart

    int* h_nms_indices_init;
    cudaMallocHost(&h_nms_indices_init, max_object * sizeof(int));    // constant template used to (re)initialize the nms indices
    for (int i = 0; i < max_object; i++) { h_nms_indices_init[i] = i; }

    int* d_nms_indices;
    cudaMalloc(&d_nms_indices, max_object * sizeof(int));    // indices after gpu nms: -1 = suppressed, > -1 = kept; must be seeded from h_nms_indices_init
    int* h_nms_indices;
    cudaMallocHost(&h_nms_indices, max_object * sizeof(int)); // host copy of d_nms_indices, used to decide which boxes survive nms

    int h_count = 0;              // host-side count of valid objects in gpu_output, copied back from d_count
    int* d_count = nullptr;
    cudaMalloc((void**)&d_count, sizeof(int));  // device-side count of valid objects, maintained with atomic operations

    /*********************** end: cuda variables, memory allocation and initialization ***********************/

    const int block = 32;

    /************************************ start the cuda computation ************************************/

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    h_count = 0;
    cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice);  // reset the valid-object counters d_count and h_count
    int grid = (anchor_output_num + block - 1) / block;
    // launch the kernel that decodes the yolo output (withheld by the author)
    //decode_yolo_kernel<<<grid, block, 0, stream>>>(gpu_input, gpu_output, max_object, cls_num, conf_thr, d_count);

    cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
    if (h_count > max_object) { h_count = max_object; }

    /************ print the decoded model output --> low-confidence rows dropped, class already chosen ************/

    float* host_decode = nullptr;  // host buffer for the gpu result
    cudaMallocHost((void**)&host_decode, sizeof(float) * max_object * 6);
    cudaMemcpy(host_decode, gpu_output, sizeof(float) * max_object * 6, cudaMemcpyDeviceToHost);
    std::cout << "\n\ndecoded output - gpu_output\n" << endl;
    if (h_count == 0) { std::cout << "\nno detections" << endl; }
    for (int i = 0; i < h_count; i++) {
        int idx = i * 6;
        std::cout << "x1:" << host_decode[idx] << "\ty1:" << host_decode[idx + 1] << "\tx2:" << host_decode[idx + 2]
                  << "\ty2:" << host_decode[idx + 3] << "\tconf:" << host_decode[idx + 4] << "\tclass_id:" << host_decode[idx + 5] << endl;
    }
    /**************************************************************************************************/

    // next, the data produced by decode_yolo_kernel is repacked on the gpu into the nms_box layout
    int grid_max = (max_object + block - 1) / block;
    //data_format_convert<<<grid_max, block, 0, stream>>>(d_boxes, gpu_output, h_count);  // gpu_output rows are [x1,y1,x2,y2,conf,cls_id]
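    /* Editor's sketch (assumption; data_format_convert is likewise withheld): the
       comments above imply it only repacks each 6-float row of gpu_output into an
       nms_box struct, one thread per detection:

    __global__ void data_format_convert(nms_box* boxes, float* decoded, int count)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= count) return;                 // only h_count rows are valid
        float* row = decoded + idx * 6;
        boxes[idx].x1 = row[0];  boxes[idx].y1 = row[1];
        boxes[idx].x2 = row[2];  boxes[idx].y2 = row[3];
        boxes[idx].score = row[4];
        boxes[idx].cls_id = (int)row[5];
    }
    */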
    /************ copy the nms_box-formatted data back to the host and print it ************/
    nms_box* h_boxes_format = nullptr;
    cudaMallocHost(&h_boxes_format, anchor_output_num * sizeof(nms_box));
    cudaMemcpy(h_boxes_format, d_boxes, anchor_output_num * sizeof(nms_box), cudaMemcpyDeviceToHost);
    std::cout << "\n\nformat-converted output - h_boxes_format\n" << endl;
    if (h_count == 0) { std::cout << "\nno detections" << endl; }
    for (int i = 0; i < h_count; i++) {
        nms_box bb = h_boxes_format[i];
        std::cout << "x1:" << bb.x1 << "\ty1:" << bb.y1 << "\tx2:" << bb.x2 << "\ty2:" << bb.y2 << "\tconf:" << bb.score << "\tclass_id:" << bb.cls_id << endl;
    }
    /****************************************************************************************/

    cudaMemcpy(d_nms_indices, h_nms_indices_init, max_object * sizeof(int), cudaMemcpyHostToDevice);  // seed the nms indices --> important

    /************ inspect d_nms_indices before nms ************/
    int* d_nms_indices_visual = nullptr;
    cudaMallocHost(&d_nms_indices_visual, max_object * sizeof(int));
    cudaMemcpy(d_nms_indices_visual, d_nms_indices, max_object * sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "\n\nd_nms_indices:\n" << endl;
    for (int i = 0; i < max_object; i++) { std::cout << "\t" << d_nms_indices_visual[i] << endl; }
    /**********************************************************/

    // run nms on the converted data, again on the gpu (kernel withheld by the author;
    // the <<<...>>> launch configuration shown here is a reconstruction and an assumption)
    //nms_yolo_kernel<<<grid_max, block, 0, stream>>>(d_boxes, d_nms_indices, h_count, nms_thr);

    /******** copy the gpu results to the host and collect the final output in keep_boxes ********/
    cudaMemcpy(h_boxes, d_boxes, anchor_output_num * sizeof(nms_box), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_nms_indices, d_nms_indices, max_object * sizeof(int), cudaMemcpyDeviceToHost);  // indices after nms

    vector<nms_box> keep_boxes;  // collect only the boxes that survive nms
    for (int i = 0; i < h_count; i++) {
        if (h_nms_indices[i] > -1) {
            keep_boxes.push_back(h_boxes[i]);
        }
    }

    /************ inspect d_nms_indices after nms ************/
    std::cout << "after nms: box indices; -1 means the obj was suppressed, > -1 means it was kept" << endl;
    for (int i = 0; i < max_object; i++) { std::cout << h_nms_indices[i] << "\t"; }
    /**********************************************************/

    /************ draw the results on an arbitrary background image ************/
    cv::Mat image = cv::imread("image.jpg");

    for (nms_box box : keep_boxes) {
        cv::Point p1(box.x1, box.y1);
        cv::Point p2(box.x2, box.y2);
        cv::rectangle(image, p1, p2, cv::Scalar(0, 255, 0), 4, 1, 0);  // both corner points lie inside the rectangle
    }

    cv::resize(image, image, cv::Size(600, 400), 0, 0, cv::INTER_NEAREST);
    cv::imshow("www", image);
    cv::waitKey(100000);
    cv::destroyAllWindows();
    /***************************************************************************/
}

int main() {

    imitate_yolo_part_postprocess();

    return 0;
}
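/* Editor's sketch (assumption; the author's nms_yolo_kernel is withheld): a simple
   O(n^2) NMS consistent with the index convention used above -- a kept box keeps its
   index, a suppressed box has its entry set to -1. Each thread checks whether its box
   is dominated by a higher-scoring box of the same class with too much overlap: */

__device__ float iou_of(const nms_box& a, const nms_box& b)
{
    float x1 = fmaxf(a.x1, b.x1), y1 = fmaxf(a.y1, b.y1);
    float x2 = fminf(a.x2, b.x2), y2 = fminf(a.y2, b.y2);
    float inter = fmaxf(0.0f, x2 - x1) * fmaxf(0.0f, y2 - y1);
    float area_a = (a.x2 - a.x1) * (a.y2 - a.y1);
    float area_b = (b.x2 - b.x1) * (b.y2 - b.y1);
    return inter / (area_a + area_b - inter + 1e-6f);  // small epsilon avoids division by zero
}

__global__ void nms_yolo_kernel(nms_box* boxes, int* indices, int count, float nms_thr)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= count) return;
    nms_box self = boxes[idx];
    for (int j = 0; j < count; j++) {
        if (j == idx) continue;
        nms_box other = boxes[j];
        if (other.cls_id != self.cls_id) continue;  // nms is applied per class
        bool dominated = other.score > self.score ||
                         (other.score == self.score && j < idx);  // deterministic tie-break
        if (dominated && iou_of(self, other) > nms_thr)
            indices[idx] = -1;  // suppress this box: iou > nms_thr, per the threshold comment above
    }
}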
--------------------------------------------------------------------------------