├── README.md ├── cuda ├── Transformer Encoder.cu ├── cu_managed_Matrixmultiplication.cu ├── cu_vectorAdd.cu └── cuda.sh ├── dpcpp ├── dpcpp_sobel.cpp ├── dpcpp_templatematching.cpp └── dpcpp_vectorAdd.cpp ├── hip └── hip_vectorAdd.cpp ├── opencl ├── OpenCL_Mixer.cpp ├── OpenCL_vectorAdd.c ├── a.cl └── opencl.sh ├── openmp ├── OpenMP-Matrix_Vector_Multiplication.c ├── OpenMP-matrix_multiplication.cpp ├── OpenMP-simple_instances.c └── openmp.sh └── pthread ├── PThread-matrix_multiplication.c ├── PThread-simple_instances.c ├── PThread-synchronization.c └── pthread.sh /README.md: -------------------------------------------------------------------------------- 1 | # Parallel Computing Lab 2 | 3 | 4 | This is a collection of experiments in multi-threaded and parallel computing, together with a small project implemented in several parallel programming models: Pthread, OpenMP, CUDA, HIP, OpenCL and DPC++. 5 | 6 | 7 | 8 | ## OpenMP 9 | 10 | OpenMP-simple_instances.c is a simple OpenMP example. 11 | 12 | OpenMP-Matrix_Vector_Multiplication.c is an OpenMP example of matrix-vector multiplication. 13 | 14 | OpenMP-matrix_multiplication.cpp is an OpenMP example of matrix multiplication. 
15 | 16 | For specific instructions, see 17 | https://blog.csdn.net/qq_46009046/article/details/133587081 18 | 19 | ## PThread 20 | 21 | PThread-simple_instances.c is a simple PThread example. 22 | 23 | PThread-synchronization.c is a PThread synchronization example. 24 | 25 | PThread-matrix_multiplication.c is a PThread matrix multiplication example. 26 | 27 | For specific instructions, see 28 | https://blog.csdn.net/qq_46009046/article/details/133587081 29 | 30 | ## CUDA 31 | 32 | cu_vectorAdd.cu is a simple CUDA vector addition example. 33 | 34 | cu_managed_Matrixmultiplication.cu is a matrix multiplication implemented with CUDA unified (managed) memory. 35 | 36 | Transformer Encoder.cu is a CUDA-based implementation of a Transformer Encoder block. 37 | 38 | For specific instructions, see 39 | https://blog.csdn.net/qq_46009046/article/details/133753993 40 | https://blog.csdn.net/qq_46009046/article/details/133797554 41 | https://blog.csdn.net/qq_46009046/article/details/134020656 42 | 43 | ## OpenCL 44 | 45 | OpenCL_vectorAdd.c is a simple OpenCL vector addition example. 46 | 47 | OpenCL_Mixer.cpp is a matrix multiplication implementation using OpenCL. 48 | 49 | a.cl is the kernel function for the OpenCL matrix multiplication. 50 | 51 | For specific instructions, see 52 | https://blog.csdn.net/qq_46009046/article/details/133777178 53 | 54 | ## HIP 55 | 56 | hip_vectorAdd.cpp is a simple HIP vector addition example. 57 | 58 | For specific instructions, see 59 | https://blog.csdn.net/qq_46009046/article/details/133583217 60 | 61 | ## DPC++ 62 | 63 | dpcpp_vectorAdd.cpp is a simple DPC++ vector addition example. 64 | 65 | dpcpp_templatematching.cpp is a DPC++ template matching example. 66 | 67 | dpcpp_sobel.cpp is a DPC++ Sobel filter example. 
68 | 69 | For specific instructions, see 70 | https://blog.csdn.net/qq_46009046/article/details/123306679 71 | -------------------------------------------------------------------------------- /cuda/Transformer Encoder.cu: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include "cuda_runtime.h" 4 | #include "device_launch_parameters.h" 5 | #include 6 | // cuda API error checking 7 | #define CUDA_CHECK(err) \ 8 | do { \ 9 | cudaError_t err_ = (err); \ 10 | if (err_ != cudaSuccess) { \ 11 | std::printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__); \ 12 | } \ 13 | } while (0) 14 | 15 | // cublas API error checking 16 | #define CUBLAS_CHECK(err) \ 17 | do { \ 18 | cublasStatus_t err_ = (err); \ 19 | if (err_ != CUBLAS_STATUS_SUCCESS) { \ 20 | std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__); \ 21 | } \ 22 | } while (0) 23 | 24 | 25 | #define batch_count 8 26 | #define M 197 27 | #define N 768 28 | 29 | using data_type = float; 30 | //定义输入和用于residual结构的张量 31 | std::vector> tensor(batch_count, std::vector(M* N)); 32 | std::vector> tensor_copy(batch_count, std::vector(M* N)); 33 | //定义三个liner层的参数 34 | std::vector> w1(batch_count, std::vector(768 * 2304)); 35 | std::vector> b1(batch_count, std::vector(197 * 2304)); 36 | std::vector> w2(batch_count, std::vector(768 * 3072)); 37 | std::vector> b2(batch_count, std::vector(197 * 3072)); 38 | std::vector> w3(batch_count, std::vector(3072 * 768)); 39 | std::vector> b3(batch_count, std::vector(197 * 768)); 40 | //定义QKV及其中间结果 41 | std::vector> Q(batch_count * 8, std::vector(197 * 96)); 42 | std::vector> K(batch_count * 8, std::vector(197 * 96)); 43 | std::vector> V(batch_count * 8, std::vector(197 * 96)); 44 | std::vector> QK(batch_count * 8, std::vector(197 * 197)); 45 | 46 | //**********************************initial******************************************* 47 | //初始化tensor数据 48 | void tensor_initial(int input_dim, int output_dim) 49 | { 50 | 
const int m = input_dim; 51 | const int n = output_dim; 52 | for (int b = 0; b < batch_count; b++) 53 | for (int i = 0; i < m; i++) { 54 | for (int j = 0; j < n; j++) { 55 | tensor[b][j * m + i] = (float)(rand() % 101) / 101; 56 | } 57 | } 58 | } 59 | //初始化liner层w和b数据 60 | void liner_initial() { 61 | //第一个liner层(B*197*768————B*197*2304) 62 | const int m1 = 197; 63 | const int k1 = 768; 64 | const int n1 = 2304; 65 | for (int b = 0; b < batch_count; b++) 66 | for (int i = 0; i < k1; i++) { 67 | for (int j = 0; j < n1; j++) { 68 | w1[b][j * k1 + i] = (float)(rand() % 101) / 101; 69 | } 70 | } 71 | for (int b = 0; b < batch_count; b++) 72 | for (int i = 0; i < m1; i++) { 73 | for (int j = 0; j < n1; j++) { 74 | b1[b][j * m1 + i] = (float)(rand() % 101) / 101; 75 | } 76 | } 77 | //第二个liner层(B*197*768————B*197*3072) 78 | const int m2 = 197; 79 | const int k2 = 768; 80 | const int n2 = 3072; 81 | for (int b = 0; b < batch_count; b++) 82 | for (int i = 0; i < k2; i++) { 83 | for (int j = 0; j < n2; j++) { 84 | w2[b][j * k2 + i] = (float)(rand() % 101) / 101; 85 | } 86 | } 87 | for (int b = 0; b < batch_count; b++) 88 | for (int i = 0; i < m2; i++) { 89 | for (int j = 0; j < n2; j++) { 90 | b2[b][j * m2 + i] = (float)(rand() % 101) / 101; 91 | } 92 | } 93 | //第三个liner层(B*197*3072————B*197*768) 94 | const int m3 = 197; 95 | const int k3 = 3072; 96 | const int n3 = 768; 97 | for (int b = 0; b < batch_count; b++) 98 | for (int i = 0; i < k3; i++) { 99 | for (int j = 0; j < n3; j++) { 100 | w3[b][j * k3 + i] = (float)(rand() % 101) / 101; 101 | } 102 | } 103 | for (int b = 0; b < batch_count; b++) 104 | for (int i = 0; i < m3; i++) { 105 | for (int j = 0; j < n3; j++) { 106 | b3[b][j * m3 + i] = (float)(rand() % 101) / 101; 107 | } 108 | } 109 | } 110 | //初始化liner的w和b 111 | void qkv_initial(int input_dim, int output_dim) { 112 | const int m1 = input_dim; 113 | const int n1 = output_dim; 114 | const int m_batch_count = batch_count * 8; 115 | for (int b = 0; b < m_batch_count; 
b++) 116 | for (int i = 0; i < m1; i++) { 117 | for (int j = 0; j < n1; j++) { 118 | Q[b][j * m1 + i] = (float)(rand() % 101) / 101; 119 | K[b][j * m1 + i] = (float)(rand() % 101) / 101; 120 | V[b][j * m1 + i] = (float)(rand() % 101) / 101; 121 | } 122 | } 123 | } 124 | //******************************MultiHeadAttention************************************ 125 | //计算b1=tensor*w1+b1,得到的b1(B×197×2304)为结果 126 | void liner_1(int input_dim, int output_dim) { 127 | const int m = 197; 128 | const int n = output_dim; 129 | const int k = input_dim; 130 | const int lda = m; 131 | const int ldb = k; 132 | const int ldc = m; 133 | 134 | const data_type alpha = 1.0; 135 | const data_type beta = 1.0; 136 | 137 | cublasHandle_t cublasH = NULL; 138 | cudaStream_t stream = NULL; 139 | 140 | data_type** d_A_array = nullptr; 141 | data_type** d_B_array = nullptr; 142 | data_type** d_C_array = nullptr; 143 | 144 | std::vector d_A(batch_count, nullptr); 145 | std::vector d_B(batch_count, nullptr); 146 | std::vector d_C(batch_count, nullptr); 147 | 148 | cublasOperation_t transa = CUBLAS_OP_N; 149 | cublasOperation_t transb = CUBLAS_OP_N; 150 | 151 | /* step 1: create cublas handle, bind a stream */ 152 | CUBLAS_CHECK(cublasCreate(&cublasH)); 153 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 154 | CUBLAS_CHECK(cublasSetStream(cublasH, stream)); 155 | 156 | /* step 2: copy data to device */ 157 | for (int i = 0; i < batch_count; i++) { 158 | CUDA_CHECK( 159 | cudaMalloc(reinterpret_cast(&d_A[i]), sizeof(data_type) * tensor[i].size())); 160 | CUDA_CHECK( 161 | cudaMalloc(reinterpret_cast(&d_B[i]), sizeof(data_type) * w1[i].size())); 162 | CUDA_CHECK( 163 | cudaMalloc(reinterpret_cast(&d_C[i]), sizeof(data_type) * b1[i].size())); 164 | } 165 | 166 | CUDA_CHECK( 167 | cudaMalloc(reinterpret_cast(&d_A_array), sizeof(data_type*) * batch_count)); 168 | CUDA_CHECK( 169 | cudaMalloc(reinterpret_cast(&d_B_array), sizeof(data_type*) * batch_count)); 170 | CUDA_CHECK( 171 
| cudaMalloc(reinterpret_cast(&d_C_array), sizeof(data_type*) * batch_count)); 172 | 173 | for (int i = 0; i < batch_count; i++) { 174 | CUDA_CHECK(cudaMemcpyAsync(d_A[i], tensor[i].data(), sizeof(data_type) * tensor[i].size(), 175 | cudaMemcpyHostToDevice, stream)); 176 | CUDA_CHECK(cudaMemcpyAsync(d_B[i], w1[i].data(), sizeof(data_type) * w1[i].size(), 177 | cudaMemcpyHostToDevice, stream)); 178 | CUDA_CHECK(cudaMemcpyAsync(d_C[i], b1[i].data(), sizeof(data_type) * b1[i].size(), 179 | cudaMemcpyHostToDevice, stream)); 180 | } 181 | 182 | CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type*) * batch_count, 183 | cudaMemcpyHostToDevice, stream)); 184 | CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type*) * batch_count, 185 | cudaMemcpyHostToDevice, stream)); 186 | CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type*) * batch_count, 187 | cudaMemcpyHostToDevice, stream)); 188 | 189 | /* step 3: compute */ 190 | cublasSgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda, 191 | d_B_array, ldb, &beta, d_C_array, ldc, batch_count); 192 | 193 | /* step 4: copy data to host */ 194 | for (int i = 0; i < batch_count; i++) { 195 | CUDA_CHECK(cudaMemcpyAsync(b1[i].data(), d_C[i], sizeof(data_type) * b1[i].size(), 196 | cudaMemcpyDeviceToHost, stream)); 197 | } 198 | /* free resources */ 199 | CUDA_CHECK(cudaFree(d_A_array)); 200 | CUDA_CHECK(cudaFree(d_B_array)); 201 | CUDA_CHECK(cudaFree(d_C_array)); 202 | for (int i = 0; i < batch_count; i++) { 203 | CUDA_CHECK(cudaFree(d_A[i])); 204 | CUDA_CHECK(cudaFree(d_B[i])); 205 | CUDA_CHECK(cudaFree(d_C[i])); 206 | } 207 | 208 | CUBLAS_CHECK(cublasDestroy(cublasH)); 209 | 210 | CUDA_CHECK(cudaStreamDestroy(stream)); 211 | 212 | //CUDA_CHECK(cudaDeviceReset()); 213 | 214 | } 215 | //将b1(B×197×2304)划分得到qkv(B×8×197×96) 216 | void Permute_1() { 217 | 218 | } 219 | //qk=(q@transpose.k)×1/sqrt(Dk) 220 | void multiplication_1() { 221 | cublasHandle_t cublasH = NULL; 222 | 
cudaStream_t stream = NULL; 223 | 224 | const int m = 197; 225 | const int n = 197; 226 | const int k = 96; 227 | const int lda = m; 228 | const int ldb = n; //k转置了 229 | const int ldc = m; 230 | const int m_batch_count = batch_count*8; 231 | 232 | const data_type alpha = 1 / pow(24,-0.5); //放缩系数 233 | const data_type beta = 0; 234 | 235 | data_type** d_A_array = nullptr; 236 | data_type** d_B_array = nullptr; 237 | data_type** d_C_array = nullptr; 238 | 239 | std::vector d_A(m_batch_count, nullptr); 240 | std::vector d_B(m_batch_count, nullptr); 241 | std::vector d_C(m_batch_count, nullptr); 242 | 243 | cublasOperation_t transa = CUBLAS_OP_N; 244 | cublasOperation_t transb = CUBLAS_OP_T; //k转置 245 | 246 | /* step 1: create cublas handle, bind a stream */ 247 | CUBLAS_CHECK(cublasCreate(&cublasH)); 248 | 249 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 250 | CUBLAS_CHECK(cublasSetStream(cublasH, stream)); 251 | 252 | /* step 2: copy data to device */ 253 | for (int i = 0; i < m_batch_count; i++) { 254 | CUDA_CHECK( 255 | cudaMalloc(reinterpret_cast(&d_A[i]), sizeof(data_type) * Q[i].size())); 256 | CUDA_CHECK( 257 | cudaMalloc(reinterpret_cast(&d_B[i]), sizeof(data_type) * K[i].size())); 258 | CUDA_CHECK( 259 | cudaMalloc(reinterpret_cast(&d_C[i]), sizeof(data_type) * QK[i].size())); 260 | } 261 | 262 | CUDA_CHECK( 263 | cudaMalloc(reinterpret_cast(&d_A_array), sizeof(data_type*) * batch_count)); 264 | CUDA_CHECK( 265 | cudaMalloc(reinterpret_cast(&d_B_array), sizeof(data_type*) * batch_count)); 266 | CUDA_CHECK( 267 | cudaMalloc(reinterpret_cast(&d_C_array), sizeof(data_type*) * batch_count)); 268 | 269 | for (int i = 0; i < m_batch_count; i++) { 270 | CUDA_CHECK(cudaMemcpyAsync(d_A[i], Q[i].data(), sizeof(data_type) * Q[i].size(), 271 | cudaMemcpyHostToDevice, stream)); 272 | CUDA_CHECK(cudaMemcpyAsync(d_B[i], K[i].data(), sizeof(data_type) * K[i].size(), 273 | cudaMemcpyHostToDevice, stream)); 274 | CUDA_CHECK(cudaMemcpyAsync(d_C[i], 
QK[i].data(), sizeof(data_type) * QK[i].size(), 275 | cudaMemcpyHostToDevice, stream)); 276 | } 277 | 278 | CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type*) * batch_count, 279 | cudaMemcpyHostToDevice, stream)); 280 | CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type*) * batch_count, 281 | cudaMemcpyHostToDevice, stream)); 282 | CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type*) * batch_count, 283 | cudaMemcpyHostToDevice, stream)); 284 | /* step 3: compute */ 285 | cublasSgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda, 286 | d_B_array, ldb, &beta, d_C_array, ldc, batch_count); 287 | 288 | /* step 4: copy data to host */ 289 | for (int i = 0; i < m_batch_count; i++) { 290 | CUDA_CHECK(cudaMemcpyAsync(QK[i].data(), d_C[i], sizeof(data_type) * QK[i].size(), 291 | cudaMemcpyDeviceToHost, stream)); 292 | } 293 | 294 | CUDA_CHECK(cudaStreamSynchronize(stream)); 295 | /* free resources */ 296 | CUDA_CHECK(cudaFree(d_A_array)); 297 | CUDA_CHECK(cudaFree(d_B_array)); 298 | CUDA_CHECK(cudaFree(d_C_array)); 299 | for (int i = 0; i < m_batch_count; i++) { 300 | CUDA_CHECK(cudaFree(d_A[i])); 301 | CUDA_CHECK(cudaFree(d_B[i])); 302 | CUDA_CHECK(cudaFree(d_C[i])); 303 | } 304 | 305 | CUBLAS_CHECK(cublasDestroy(cublasH)); 306 | 307 | CUDA_CHECK(cudaStreamDestroy(stream)); 308 | 309 | //CUDA_CHECK(cudaDeviceReset()); 310 | } 311 | //对qk进行softmax操作 312 | void softmax() { 313 | const int m = 197; 314 | const int n = 197; 315 | const int count = batch_count * 8 * 197; 316 | float s[count] = {}; 317 | //归约 318 | int t = 0; 319 | for (int b = 0; b < batch_count*8; b++) 320 | for (int i = 0; i < m; i++) { 321 | for (int j = 0; j < n; j++) { 322 | s[t]= s[t]+ exp(QK[b][j * m + i]); 323 | } 324 | t++; 325 | } 326 | //求softmax 327 | t = 0; 328 | for (int b = 0; b < batch_count * 8; b++) 329 | for (int i = 0; i < m; i++) { 330 | for (int j = 0; j < n; j++) { 331 | QK[b][j * m + i] = exp(QK[b][j * m + i]) / s[t]; 
332 | } 333 | t++; 334 | } 335 | } 336 | //计算qk@v,将结果存到q中 337 | void multiplication_2() { 338 | cublasHandle_t cublasH = NULL; 339 | cudaStream_t stream = NULL; 340 | 341 | const int m = 197; 342 | const int n = 96; 343 | const int k = 197; 344 | const int lda = m; 345 | const int ldb = k; 346 | const int ldc = m; 347 | const int m_batch_count = batch_count * 8; 348 | 349 | const data_type alpha = 1.0; 350 | const data_type beta = 0; 351 | 352 | data_type** d_A_array = nullptr; 353 | data_type** d_B_array = nullptr; 354 | data_type** d_C_array = nullptr; 355 | 356 | std::vector d_A(m_batch_count, nullptr); 357 | std::vector d_B(m_batch_count, nullptr); 358 | std::vector d_C(m_batch_count, nullptr); 359 | 360 | cublasOperation_t transa = CUBLAS_OP_N; 361 | cublasOperation_t transb = CUBLAS_OP_N; 362 | 363 | /* step 1: create cublas handle, bind a stream */ 364 | CUBLAS_CHECK(cublasCreate(&cublasH)); 365 | 366 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 367 | CUBLAS_CHECK(cublasSetStream(cublasH, stream)); 368 | 369 | /* step 2: copy data to device */ 370 | for (int i = 0; i < m_batch_count; i++) { 371 | CUDA_CHECK( 372 | cudaMalloc(reinterpret_cast(&d_A[i]), sizeof(data_type) * QK[i].size())); 373 | CUDA_CHECK( 374 | cudaMalloc(reinterpret_cast(&d_B[i]), sizeof(data_type) * V[i].size())); 375 | CUDA_CHECK( 376 | cudaMalloc(reinterpret_cast(&d_C[i]), sizeof(data_type) * Q[i].size())); 377 | } 378 | 379 | CUDA_CHECK( 380 | cudaMalloc(reinterpret_cast(&d_A_array), sizeof(data_type*) * batch_count)); 381 | CUDA_CHECK( 382 | cudaMalloc(reinterpret_cast(&d_B_array), sizeof(data_type*) * batch_count)); 383 | CUDA_CHECK( 384 | cudaMalloc(reinterpret_cast(&d_C_array), sizeof(data_type*) * batch_count)); 385 | 386 | for (int i = 0; i < m_batch_count; i++) { 387 | CUDA_CHECK(cudaMemcpyAsync(d_A[i], QK[i].data(), sizeof(data_type) * QK[i].size(), 388 | cudaMemcpyHostToDevice, stream)); 389 | CUDA_CHECK(cudaMemcpyAsync(d_B[i], V[i].data(), 
sizeof(data_type) * V[i].size(), 390 | cudaMemcpyHostToDevice, stream)); 391 | CUDA_CHECK(cudaMemcpyAsync(d_C[i], Q[i].data(), sizeof(data_type) * Q[i].size(), 392 | cudaMemcpyHostToDevice, stream)); 393 | } 394 | 395 | CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type*) * batch_count, 396 | cudaMemcpyHostToDevice, stream)); 397 | CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type*) * batch_count, 398 | cudaMemcpyHostToDevice, stream)); 399 | CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type*) * batch_count, 400 | cudaMemcpyHostToDevice, stream)); 401 | /* step 3: compute */ 402 | cublasSgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda, 403 | d_B_array, ldb, &beta, d_C_array, ldc, batch_count); 404 | 405 | /* step 4: copy data to host */ 406 | for (int i = 0; i < m_batch_count; i++) { 407 | CUDA_CHECK(cudaMemcpyAsync(Q[i].data(), d_C[i], sizeof(data_type) * Q[i].size(), 408 | cudaMemcpyDeviceToHost, stream)); 409 | } 410 | 411 | CUDA_CHECK(cudaStreamSynchronize(stream)); 412 | /* free resources */ 413 | CUDA_CHECK(cudaFree(d_A_array)); 414 | CUDA_CHECK(cudaFree(d_B_array)); 415 | CUDA_CHECK(cudaFree(d_C_array)); 416 | for (int i = 0; i < m_batch_count; i++) { 417 | CUDA_CHECK(cudaFree(d_A[i])); 418 | CUDA_CHECK(cudaFree(d_B[i])); 419 | CUDA_CHECK(cudaFree(d_C[i])); 420 | } 421 | 422 | CUBLAS_CHECK(cublasDestroy(cublasH)); 423 | 424 | CUDA_CHECK(cudaStreamDestroy(stream)); 425 | 426 | //CUDA_CHECK(cudaDeviceReset()); 427 | } 428 | //将q(B×8×197×96)转化为B×197×768赋值给tensor 429 | void Permute_2() { 430 | 431 | } 432 | //*************************************MLP****************************************** 433 | //计算b2=tensor*w2+b2,得到的b2(B×197×3072)为结果 434 | void liner_2(int input_dim, int output_dim) { 435 | const int m = 197; 436 | const int n = output_dim; 437 | const int k = input_dim; 438 | const int lda = m; 439 | const int ldb = k; 440 | const int ldc = m; 441 | 442 | const data_type alpha = 1.0; 443 
| const data_type beta = 1.0; 444 | 445 | cublasHandle_t cublasH = NULL; 446 | cudaStream_t stream = NULL; 447 | 448 | data_type** d_A_array = nullptr; 449 | data_type** d_B_array = nullptr; 450 | data_type** d_C_array = nullptr; 451 | 452 | std::vector d_A(batch_count, nullptr); 453 | std::vector d_B(batch_count, nullptr); 454 | std::vector d_C(batch_count, nullptr); 455 | 456 | cublasOperation_t transa = CUBLAS_OP_N; 457 | cublasOperation_t transb = CUBLAS_OP_N; 458 | 459 | /* step 1: create cublas handle, bind a stream */ 460 | CUBLAS_CHECK(cublasCreate(&cublasH)); 461 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 462 | CUBLAS_CHECK(cublasSetStream(cublasH, stream)); 463 | 464 | /* step 2: copy data to device */ 465 | for (int i = 0; i < batch_count; i++) { 466 | CUDA_CHECK( 467 | cudaMalloc(reinterpret_cast(&d_A[i]), sizeof(data_type) * tensor[i].size())); 468 | CUDA_CHECK( 469 | cudaMalloc(reinterpret_cast(&d_B[i]), sizeof(data_type) * w2[i].size())); 470 | CUDA_CHECK( 471 | cudaMalloc(reinterpret_cast(&d_C[i]), sizeof(data_type) * b2[i].size())); 472 | } 473 | 474 | CUDA_CHECK( 475 | cudaMalloc(reinterpret_cast(&d_A_array), sizeof(data_type*) * batch_count)); 476 | CUDA_CHECK( 477 | cudaMalloc(reinterpret_cast(&d_B_array), sizeof(data_type*) * batch_count)); 478 | CUDA_CHECK( 479 | cudaMalloc(reinterpret_cast(&d_C_array), sizeof(data_type*) * batch_count)); 480 | 481 | for (int i = 0; i < batch_count; i++) { 482 | CUDA_CHECK(cudaMemcpyAsync(d_A[i], tensor[i].data(), sizeof(data_type) * tensor[i].size(), 483 | cudaMemcpyHostToDevice, stream)); 484 | CUDA_CHECK(cudaMemcpyAsync(d_B[i], w2[i].data(), sizeof(data_type) * w2[i].size(), 485 | cudaMemcpyHostToDevice, stream)); 486 | CUDA_CHECK(cudaMemcpyAsync(d_C[i], b2[i].data(), sizeof(data_type) * b2[i].size(), 487 | cudaMemcpyHostToDevice, stream)); 488 | } 489 | 490 | CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type*) * batch_count, 491 | cudaMemcpyHostToDevice, 
stream)); 492 | CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type*) * batch_count, 493 | cudaMemcpyHostToDevice, stream)); 494 | CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type*) * batch_count, 495 | cudaMemcpyHostToDevice, stream)); 496 | 497 | /* step 3: compute */ 498 | cublasSgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda, 499 | d_B_array, ldb, &beta, d_C_array, ldc, batch_count); 500 | 501 | /* step 4: copy data to host */ 502 | for (int i = 0; i < batch_count; i++) { 503 | CUDA_CHECK(cudaMemcpyAsync(b2[i].data(), d_C[i], sizeof(data_type) * b2[i].size(), 504 | cudaMemcpyDeviceToHost, stream)); 505 | } 506 | /* free resources */ 507 | CUDA_CHECK(cudaFree(d_A_array)); 508 | CUDA_CHECK(cudaFree(d_B_array)); 509 | CUDA_CHECK(cudaFree(d_C_array)); 510 | for (int i = 0; i < batch_count; i++) { 511 | CUDA_CHECK(cudaFree(d_A[i])); 512 | CUDA_CHECK(cudaFree(d_B[i])); 513 | CUDA_CHECK(cudaFree(d_C[i])); 514 | } 515 | 516 | CUBLAS_CHECK(cublasDestroy(cublasH)); 517 | 518 | CUDA_CHECK(cudaStreamDestroy(stream)); 519 | 520 | //CUDA_CHECK(cudaDeviceReset()); 521 | } 522 | //对b2进行GELU操作 523 | __global__ void gelu(float* x, int n) 524 | { 525 | int ix = threadIdx.x + blockDim.x * blockIdx.x; 526 | if (ix < n) 527 | x[ix] = 0.5 * x[ix] * (1 + tanh(sqrt(2 / 3.1415926) + 0.004715 * pow(x[ix], 3))); 528 | 529 | } 530 | void GELU() { 531 | 532 | std::vector d_A(batch_count, nullptr); 533 | 534 | cudaStream_t stream = NULL; 535 | 536 | /* copy data to device */ 537 | for (int i = 0; i < batch_count; i++) { 538 | CUDA_CHECK( 539 | cudaMalloc(reinterpret_cast(&d_A[i]), sizeof(data_type) * b2[i].size())); 540 | CUDA_CHECK( 541 | cudaMemcpyAsync(d_A[i], b2[i].data(), sizeof(data_type) * QK[i].size(), cudaMemcpyHostToDevice, stream)); 542 | 543 | gelu << <608, 1024 >> > (d_A[i], QK[i].size()); 544 | 545 | CUDA_CHECK(cudaMemcpyAsync(b2[i].data(), d_A[i], sizeof(data_type) * b1[i].size(), 546 | cudaMemcpyDeviceToHost, stream)); 
547 | } 548 | 549 | CUDA_CHECK(cudaStreamSynchronize(stream)); 550 | 551 | /* free resources */ 552 | for (int i = 0; i < batch_count; i++) { 553 | CUDA_CHECK(cudaFree(d_A[i])); 554 | } 555 | //CUDA_CHECK(cudaStreamDestroy(stream)); 556 | 557 | //CUDA_CHECK(cudaDeviceReset()); 558 | } 559 | //计算b3=b2*w3+b3,得到的b3(B×197×768)为结果 560 | void liner_3(int input_dim, int output_dim) { 561 | const int m = 197; 562 | const int n = output_dim; 563 | const int k = input_dim; 564 | const int lda = m; 565 | const int ldb = k; 566 | const int ldc = m; 567 | 568 | const data_type alpha = 1.0; 569 | const data_type beta = 1.0; 570 | 571 | cublasHandle_t cublasH = NULL; 572 | cudaStream_t stream = NULL; 573 | 574 | data_type** d_A_array = nullptr; 575 | data_type** d_B_array = nullptr; 576 | data_type** d_C_array = nullptr; 577 | 578 | std::vector d_A(batch_count, nullptr); 579 | std::vector d_B(batch_count, nullptr); 580 | std::vector d_C(batch_count, nullptr); 581 | 582 | cublasOperation_t transa = CUBLAS_OP_N; 583 | cublasOperation_t transb = CUBLAS_OP_N; 584 | 585 | /* step 1: create cublas handle, bind a stream */ 586 | CUBLAS_CHECK(cublasCreate(&cublasH)); 587 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 588 | CUBLAS_CHECK(cublasSetStream(cublasH, stream)); 589 | 590 | /* step 2: copy data to device */ 591 | for (int i = 0; i < batch_count; i++) { 592 | CUDA_CHECK( 593 | cudaMalloc(reinterpret_cast(&d_A[i]), sizeof(data_type) * b2[i].size())); 594 | CUDA_CHECK( 595 | cudaMalloc(reinterpret_cast(&d_B[i]), sizeof(data_type) * w3[i].size())); 596 | CUDA_CHECK( 597 | cudaMalloc(reinterpret_cast(&d_C[i]), sizeof(data_type) * b3[i].size())); 598 | } 599 | 600 | CUDA_CHECK( 601 | cudaMalloc(reinterpret_cast(&d_A_array), sizeof(data_type*) * batch_count)); 602 | CUDA_CHECK( 603 | cudaMalloc(reinterpret_cast(&d_B_array), sizeof(data_type*) * batch_count)); 604 | CUDA_CHECK( 605 | cudaMalloc(reinterpret_cast(&d_C_array), sizeof(data_type*) * batch_count)); 
606 | 607 | for (int i = 0; i < batch_count; i++) { 608 | CUDA_CHECK(cudaMemcpyAsync(d_A[i], b2[i].data(), sizeof(data_type) * b2[i].size(), 609 | cudaMemcpyHostToDevice, stream)); 610 | CUDA_CHECK(cudaMemcpyAsync(d_B[i], w3[i].data(), sizeof(data_type) * w3[i].size(), 611 | cudaMemcpyHostToDevice, stream)); 612 | CUDA_CHECK(cudaMemcpyAsync(d_C[i], b3[i].data(), sizeof(data_type) * b3[i].size(), 613 | cudaMemcpyHostToDevice, stream)); 614 | } 615 | 616 | CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type*) * batch_count, 617 | cudaMemcpyHostToDevice, stream)); 618 | CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type*) * batch_count, 619 | cudaMemcpyHostToDevice, stream)); 620 | CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type*) * batch_count, 621 | cudaMemcpyHostToDevice, stream)); 622 | 623 | /* step 3: compute */ 624 | cublasSgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda, 625 | d_B_array, ldb, &beta, d_C_array, ldc, batch_count); 626 | 627 | /* step 4: copy data to host */ 628 | for (int i = 0; i < batch_count; i++) { 629 | CUDA_CHECK(cudaMemcpyAsync(b3[i].data(), d_C[i], sizeof(data_type) * b3[i].size(), 630 | cudaMemcpyDeviceToHost, stream)); 631 | } 632 | /* free resources */ 633 | CUDA_CHECK(cudaFree(d_A_array)); 634 | CUDA_CHECK(cudaFree(d_B_array)); 635 | CUDA_CHECK(cudaFree(d_C_array)); 636 | for (int i = 0; i < batch_count; i++) { 637 | CUDA_CHECK(cudaFree(d_A[i])); 638 | CUDA_CHECK(cudaFree(d_B[i])); 639 | CUDA_CHECK(cudaFree(d_C[i])); 640 | } 641 | 642 | CUBLAS_CHECK(cublasDestroy(cublasH)); 643 | 644 | CUDA_CHECK(cudaStreamDestroy(stream)); 645 | 646 | // CUDA_CHECK(cudaDeviceReset()); 647 | 648 | } 649 | //*******************************Transformer Encoder************************************** 650 | //对输入的tensor进行LN处理 651 | __global__ void ss(float* x, int n, float avg) { 652 | int ix = threadIdx.x + blockDim.x * blockIdx.x; 653 | if (ix < n) 654 | x[ix] = pow(x[ix] - avg, 2); 
655 | } 656 | __global__ void ln(float* x, int n, float avg, float S) { 657 | int ix = threadIdx.x + blockDim.x * blockIdx.x; 658 | if (ix < n) 659 | x[ix] = (x[ix]-avg)/sqrt(S+ 1e-5); 660 | } 661 | void LayerNorm() { 662 | std::vector d_A(batch_count, nullptr); 663 | std::vector d_A_copy(batch_count, nullptr); 664 | float sum = 0.0, S=0.0; 665 | cudaStream_t stream = NULL; 666 | 667 | cublasHandle_t handle; 668 | cublasCreate(&handle); 669 | 670 | /* copy data to device */ 671 | 672 | for (int i = 0; i < batch_count; i++) { 673 | sum = 0.0; 674 | CUDA_CHECK( 675 | cudaMalloc(reinterpret_cast(&d_A[i]), sizeof(data_type) * tensor[i].size())); 676 | CUDA_CHECK( 677 | cudaMalloc(reinterpret_cast(&d_A_copy[i]), sizeof(data_type) * tensor[i].size())); 678 | CUDA_CHECK( 679 | cudaMemcpyAsync(d_A[i], tensor[i].data(), sizeof(data_type) * tensor[i].size(), cudaMemcpyHostToDevice, stream)); 680 | CUDA_CHECK( 681 | cudaMemcpyAsync(d_A_copy[i], tensor[i].data(), sizeof(data_type) * tensor[i].size(), cudaMemcpyHostToDevice, stream)); 682 | //先求和 683 | cublasSasum(handle, 197 * 768, d_A[i], 1, &sum); 684 | //求方差 685 | ss << <160, 1024 >> > (d_A_copy[i], 197 * 768, sum / (197 * 768)); 686 | cublasSasum(handle, 197 * 768, d_A_copy[i], 1, &S); 687 | //求LN 688 | ln << <160, 1024 >> > (d_A[i], 197 * 768, sum / (197 * 768), S ); 689 | //将结果赋值给tensor 690 | CUDA_CHECK(cudaMemcpyAsync(tensor[i].data(), d_A[i], sizeof(data_type) * tensor[i].size(), 691 | cudaMemcpyDeviceToHost, stream)); 692 | } 693 | 694 | CUDA_CHECK(cudaStreamSynchronize(stream)); 695 | 696 | /* free resources */ 697 | for (int i = 0; i < batch_count; i++) { 698 | CUDA_CHECK(cudaFree(d_A[i])); 699 | CUDA_CHECK(cudaFree(d_A_copy[i])); 700 | } 701 | 702 | CUBLAS_CHECK(cublasDestroy(handle)); 703 | 704 | //CUDA_CHECK(cudaStreamDestroy(stream)); 705 | 706 | //CUDA_CHECK(cudaDeviceReset()); 707 | } 708 | //MultiHeadAttention block 709 | void MultiHeadAttention() { 710 | //计算b1=tensor*w1+b1,得到的b1(B×197×2304)为结果 711 | 
liner_1(768, 2304); 712 | //将b1(B×197*2304)划分得到qkv(3×B×8×197×96) 713 | Permute_1(); 714 | //qk=(q@transpose.k)*1/sqrt(Dk) 715 | multiplication_1(); 716 | //对qk进行softmax操作 717 | softmax(); 718 | //计算qk@v,将结果存到q中 719 | multiplication_2(); 720 | //将q(B×8×197×96)转化为B×197×768赋值给tensor 721 | Permute_2(); 722 | } 723 | //MLP block 724 | void MLP() { 725 | //计算b2=tensor*w2+b2,得到的b2(B×197×3072)为结果 726 | liner_2(768,3072); 727 | //对b2进行GELU操作 728 | GELU(); 729 | //计算b3=b2*w3+b3,得到的b3(B×197×768)为结果 730 | liner_3(3072,768); 731 | //将结果赋值给tensor 732 | tensor = b3; 733 | } 734 | //residual结构(tensor=tensor+tensor_copy) 735 | void tensor_add() { 736 | //先化成一维vector 737 | std::vector A(batch_count * 197 * 768); 738 | std::vector B(batch_count * 197 * 768); 739 | const int m = 197; 740 | const int n = 768; 741 | 742 | int t1 = 0; 743 | for (int b = 0; b < batch_count; b++) 744 | for (int i = 0; i < m; i++) { 745 | for (int j = 0; j < n; j++) { 746 | A[t1] = tensor[b][j * m + i]; 747 | B[t1] = tensor_copy[b][j * m + i]; 748 | t1++; 749 | } 750 | } 751 | 752 | cublasHandle_t cublasH = NULL; 753 | cudaStream_t stream = NULL; 754 | 755 | const data_type alpha = 1.0; 756 | const int incx = 1; 757 | const int incy = 1; 758 | 759 | data_type* d_A = nullptr; 760 | data_type* d_B = nullptr; 761 | 762 | /* step 1: create cublas handle, bind a stream */ 763 | CUBLAS_CHECK(cublasCreate(&cublasH)); 764 | 765 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 766 | CUBLAS_CHECK(cublasSetStream(cublasH, stream)); 767 | 768 | /* step 2: copy data to device */ 769 | CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_A), sizeof(data_type) * A.size())); 770 | CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_B), sizeof(data_type) * B.size())); 771 | 772 | CUDA_CHECK(cudaMemcpyAsync(d_A, A.data(), sizeof(data_type) * A.size(), cudaMemcpyHostToDevice, 773 | stream)); 774 | CUDA_CHECK(cudaMemcpyAsync(d_B, B.data(), sizeof(data_type) * B.size(), cudaMemcpyHostToDevice, 775 | stream)); 776 | 777 | /* 
step 3: compute */ 778 | CUBLAS_CHECK(cublasSaxpy(cublasH, A.size(), &alpha, d_A, incx, d_B, incy)); 779 | 780 | /* step 4: copy data to host */ 781 | CUDA_CHECK(cudaMemcpyAsync(B.data(), d_B, sizeof(data_type) * B.size(), cudaMemcpyDeviceToHost, 782 | stream)); 783 | 784 | CUDA_CHECK(cudaStreamSynchronize(stream)); 785 | 786 | /* free resources */ 787 | CUDA_CHECK(cudaFree(d_A)); 788 | CUDA_CHECK(cudaFree(d_B)); 789 | 790 | CUBLAS_CHECK(cublasDestroy(cublasH)); 791 | CUDA_CHECK(cudaStreamDestroy(stream)); 792 | //CUDA_CHECK(cudaDeviceReset()); 793 | 794 | //将B赋值给tensor 795 | int t2 = 0; 796 | for (int b = 0; b < batch_count; b++) 797 | for (int i = 0; i < m; i++) { 798 | for (int j = 0; j < n; j++) { 799 | tensor[b][j * m + i]=B[t2]; 800 | t2++; 801 | } 802 | } 803 | } 804 | 805 | //**********************************main()************************************************ 806 | int main(int argc, char** argv) 807 | { 808 | cudaEvent_t start, stop; 809 | cudaEventCreate(&start); 810 | cudaEventCreate(&stop); 811 | 812 | //tensor初始化 813 | tensor_initial(M,N); 814 | //tensor_copy初始化(用于residual结构) 815 | tensor_copy = tensor; 816 | //for(int b = 0; b < batch_count; b++)for (int i = 0; i < M; i++) {for (int j = 0; j < N; j++)std::cout << tensor[b][i * N + j] << " ";std::cout << std::endl;} 817 | //初始化liner的w和b 818 | liner_initial(); 819 | //初始化q,k,v 820 | qkv_initial(197, 96); 821 | 822 | /* GPU warm up */ 823 | for(int i=0;i<100;i++) 824 | LayerNorm(); 825 | 826 | /* compute */ 827 | cudaEventRecord(start, 0); 828 | 829 | LayerNorm(); 830 | //MultiHeadAttention block 831 | MultiHeadAttention(); 832 | //tensor=tensor+tensor_copy 833 | tensor_add(); 834 | tensor_copy = tensor; 835 | 836 | LayerNorm(); 837 | //MLP block 838 | MLP(); 839 | //tensor=tensor+tensor_copy 840 | tensor_add(); 841 | 842 | cudaEventRecord(stop, 0); 843 | cudaEventSynchronize(stop); 844 | 845 | float elapsedTime; 846 | cudaEventElapsedTime(&elapsedTime, start, stop); 847 | printf("Transformer Encoder 
// Integer matrix multiplication C = A * B on unified (managed) memory,
// computed once on the GPU and once on the CPU, timed, and cross-checked.
//
// NOTE(review): the header names of the original #include lines were lost in
// extraction; the set below is what this translation unit actually needs.
#include <chrono>
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define BLOCK_SIZE 16

// Abort with file/line context when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Largest matrix dimension the statically sized managed arrays can hold.
const int MAXSUN = 1000;
__managed__ int a[MAXSUN * MAXSUN];
__managed__ int b[MAXSUN * MAXSUN];
__managed__ int c_gpu[MAXSUN * MAXSUN];
__managed__ int c_cpu[MAXSUN * MAXSUN];

// Computes c (m x k) = a (m x n) * b (n x k), all row-major.
// Launch layout: 2D grid of 2D blocks; x indexes columns of c, y indexes rows.
// One thread produces one element of c; threads outside the matrix do nothing.
__global__ void gpu_matrix_mult(int* a, int* b, int* c, int m, int n, int k)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int sum = 0;
    if (col < k && row < m)
    {
        for (int i = 0; i < n; i++)
        {
            sum += a[row * n + i] * b[i * k + col];
        }
        c[row * k + col] = sum;
    }
}

// CPU reference with the same row-major layout as gpu_matrix_mult.
void cpu_matrix_mult(int* a, int* b, int* h_result, int m, int n, int k) {
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            int tmp = 0;
            for (int h = 0; h < n; ++h)
            {
                tmp += a[i * n + h] * b[h * k + j];
            }
            h_result[i * k + j] = tmp;
        }
    }
}

int main(int argc, char const* argv[])
{
    (void)argc;
    (void)argv;

    const int m = 200;
    const int n = 200;
    const int k = 200;
    static_assert(m <= MAXSUN && n <= MAXSUN && k <= MAXSUN,
                  "matrix dimensions exceed the managed array capacity");

    cudaEvent_t start, stop_gpu;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop_gpu));

    // Initialise matrix A with values in [1, 1024].
    // The original expression `0 * rand() % 1024 + 1` always evaluated to 1,
    // which made the GPU/CPU comparison far less meaningful.
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            a[i * n + j] = rand() % 1024 + 1;
        }
    }
    // Initialise matrix B with values in [1, 1024].
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < k; ++j) {
            b[i * k + j] = rand() % 1024 + 1;
        }
    }

    // Ceil-divide so partial tiles at the right/bottom edges are covered.
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    CUDA_CHECK(cudaEventRecord(start));
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(a, b, c_gpu, m, n, k);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors
    CUDA_CHECK(cudaEventRecord(stop_gpu));
    // Also acts as the barrier that makes c_gpu safe to read on the host and
    // surfaces any fault raised while the kernel was running.
    CUDA_CHECK(cudaEventSynchronize(stop_gpu));

    float elapsed_time_gpu = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_gpu, start, stop_gpu));

    // Host work is timed with a host clock rather than GPU events.
    const auto cpu_begin = std::chrono::steady_clock::now();
    cpu_matrix_mult(a, b, c_cpu, m, n, k);
    const auto cpu_end = std::chrono::steady_clock::now();
    const double elapsed_time_cpu =
        std::chrono::duration<double, std::milli>(cpu_end - cpu_begin).count();

    printf("GPU Time = %g ms.\n", elapsed_time_gpu);
    printf("CPU Time = %g ms.\n", elapsed_time_cpu);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop_gpu));

    // Check that the GPU result equals the CPU result (exact: integer data).
    int ok = 1;
    for (int i = 0; i < m && ok; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            if (c_gpu[i * k + j] != c_cpu[i * k + j])
            {
                ok = 0;
                break;
            }
        }
    }

    if (ok)
    {
        printf("Pass!!!\n");
    }
    else
    {
        printf("Error!!!\n");
    }

    return ok ? 0 : EXIT_FAILURE;
}
/**
 * CUDA Kernel Device code
 *
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
 * and advances by the total number of launched threads, so the result is
 * complete for any grid size, including grids smaller than numElements.
 * Indices are computed in size_t so that blockIdx.x * blockDim.x cannot wrap.
 * A non-positive numElements makes the kernel a no-op.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    const size_t count  = numElements > 0 ? (size_t)numElements : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < count;
         i += stride)
    {
        C[i] = A[i] + B[i];
    }
}
code %s)!\n", cudaGetErrorString(err)); 86 | exit(EXIT_FAILURE); 87 | } 88 | 89 | // Allocate the device input vector B 90 | float *d_B = NULL; 91 | err = cudaMalloc((void **)&d_B, size); 92 | 93 | if (err != cudaSuccess) 94 | { 95 | fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err)); 96 | exit(EXIT_FAILURE); 97 | } 98 | 99 | // Allocate the device output vector C 100 | float *d_C = NULL; 101 | err = cudaMalloc((void **)&d_C, size); 102 | 103 | if (err != cudaSuccess) 104 | { 105 | fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err)); 106 | exit(EXIT_FAILURE); 107 | } 108 | 109 | // Copy the host input vectors A and B in host memory to the device input vectors in 110 | // device memory 111 | printf("Copy input data from the host memory to the CUDA device\n"); 112 | err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); 113 | 114 | if (err != cudaSuccess) 115 | { 116 | fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err)); 117 | exit(EXIT_FAILURE); 118 | } 119 | 120 | err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); 121 | 122 | if (err != cudaSuccess) 123 | { 124 | fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err)); 125 | exit(EXIT_FAILURE); 126 | } 127 | 128 | // Launch the Vector Add CUDA Kernel 129 | int threadsPerBlock = 256; 130 | int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; 131 | printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); 132 | vectorAdd<<>>(d_A, d_B, d_C, numElements); 133 | err = cudaGetLastError(); 134 | 135 | if (err != cudaSuccess) 136 | { 137 | fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err)); 138 | exit(EXIT_FAILURE); 139 | } 140 | 141 | // Copy the device result vector in device memory to the host result vector 142 | 
// in host memory. 143 | printf("Copy output data from the CUDA device to the host memory\n"); 144 | err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); 145 | 146 | if (err != cudaSuccess) 147 | { 148 | fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err)); 149 | exit(EXIT_FAILURE); 150 | } 151 | 152 | // Verify that the result vector is correct 153 | for (int i = 0; i < numElements; ++i) 154 | { 155 | if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) 156 | { 157 | fprintf(stderr, "Result verification failed at element %d!\n", i); 158 | exit(EXIT_FAILURE); 159 | } 160 | } 161 | 162 | printf("Test PASSED\n"); 163 | 164 | // Free device global memory 165 | err = cudaFree(d_A); 166 | 167 | if (err != cudaSuccess) 168 | { 169 | fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err)); 170 | exit(EXIT_FAILURE); 171 | } 172 | 173 | err = cudaFree(d_B); 174 | 175 | if (err != cudaSuccess) 176 | { 177 | fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err)); 178 | exit(EXIT_FAILURE); 179 | } 180 | 181 | err = cudaFree(d_C); 182 | 183 | if (err != cudaSuccess) 184 | { 185 | fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err)); 186 | exit(EXIT_FAILURE); 187 | } 188 | 189 | // Free host memory 190 | free(h_A); 191 | free(h_B); 192 | free(h_C); 193 | 194 | // Reset the device and exit 195 | // cudaDeviceReset causes the driver to clean up all state. While 196 | // not mandatory in normal operation, it is good practice. It is also 197 | // needed to ensure correct operation when the application is being 198 | // profiled. Calling cudaDeviceReset causes all profile data to be 199 | // flushed before the application exits 200 | err = cudaDeviceReset(); 201 | 202 | if (err != cudaSuccess) 203 | { 204 | fprintf(stderr, "Failed to deinitialize the device! 
error=%s\n", cudaGetErrorString(err)); 205 | exit(EXIT_FAILURE); 206 | } 207 | 208 | printf("Done\n"); 209 | return 0; 210 | } 211 | 212 | -------------------------------------------------------------------------------- /cuda/cuda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nvcc cu_vectorAdd.cu -o cu_vectorAdd 4 | 5 | nvcc cu_managed_Matrixmultiplication.cu -o cu_managed_Matrixmultiplication 6 | 7 | nvcc Transformer_Encoder.cu -o Transformer_Encoder -lcublas 8 | -------------------------------------------------------------------------------- /dpcpp/dpcpp_sobel.cpp: -------------------------------------------------------------------------------- 1 | //============================================================== 2 | // Copyright © 2019 Intel Corporation 3 | // 4 | // SPDX-License-Identifier: MIT 5 | // ============================================================= 6 | #include 7 | #include 8 | #include 9 | #include "CL/sycl.hpp" 10 | #include "device_selector.hpp" 11 | 12 | // dpc_common.hpp can be found in the dev-utilities include folder. 13 | // e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp 14 | #include "dpc_common.hpp" 15 | 16 | // stb/*.h files can be found in the dev-utilities include folder. 17 | // e.g., $ONEAPI_ROOT/dev-utilities//include/stb/*.h 18 | #define STB_IMAGE_IMPLEMENTATION 19 | #include "stb/stb_image.h" 20 | #define STB_IMAGE_WRITE_IMPLEMENTATION 21 | #include "stb/stb_image_write.h" 22 | 23 | using namespace std; 24 | using namespace sycl; 25 | using namespace chrono; 26 | 27 | // Few useful acronyms. 
28 | constexpr auto sycl_read = access::mode::read; 29 | constexpr auto sycl_write = access::mode::write; 30 | constexpr auto sycl_global_buffer = access::target::global_buffer; 31 | 32 | static void ReportTime(const string &msg, event e) { 33 | cl_ulong time_start = 34 | e.get_profiling_info(); 35 | 36 | cl_ulong time_end = 37 | e.get_profiling_info(); 38 | 39 | double elapsed = (time_end - time_start) / 1e6; 40 | cout << msg << elapsed << " milliseconds\n"; 41 | } 42 | 43 | // SYCL does not need any special mark-up for functions which are called from 44 | // SYCL kernel and defined in the same compilation unit. SYCL compiler must be 45 | // able to find the full call graph automatically. 46 | // always_inline as calls are expensive on Gen GPU. 47 | // Notes: 48 | // - coeffs can be declared outside of the function, but still must be constant 49 | // - SYCL compiler will automatically deduce the address space for the two 50 | // pointers; sycl::multi_ptr specialization for particular address space 51 | // can used for more control 52 | __attribute__((always_inline)) static void ApplyFilter(uint8_t *src_image, 53 | uint8_t *dst_image, 54 | int i) { 55 | i *= 3; 56 | float temp; 57 | temp = (0.393f * src_image[i]) + (0.769f * src_image[i + 1]) + 58 | (0.189f * src_image[i + 2]); 59 | dst_image[i] = temp > 255 ? 255 : temp; 60 | temp = (0.349f * src_image[i]) + (0.686f * src_image[i + 1]) + 61 | (0.168f * src_image[i + 2]); 62 | dst_image[i + 1] = temp > 255 ? 255 : temp; 63 | temp = (0.272f * src_image[i]) + (0.534f * src_image[i + 1]) + 64 | (0.131f * src_image[i + 2]); 65 | dst_image[i + 2] = temp > 255 ? 
// Parallel Sobel filter: computes the edge response for the single pixel at
// flat index `i` of a single-channel, row-major image of `w` x `h` pixels.
//
//   src_image  input image, w * h bytes
//   dst_image  output image, w * h bytes; only dst_image[i] may be written
//   i          flat pixel index (row * w + column)
//   w, h       image width and height in pixels
//
// Only interior pixels (1 <= row < h-1, 1 <= column < w-1) are processed,
// the same region the serial ApplySFilter loops over. Border pixels are left
// untouched. The previous guard (`i > w`) let last-row pixels read past the
// end of the buffer and let first/last-column pixels pick up neighbours from
// adjacent rows.
__attribute__((always_inline)) static void ApplySSFilter(uint8_t *src_image,
                                                         uint8_t *dst_image,
                                                         int i,
                                                         int w,
                                                         int h) {
  if (w < 3 || h < 3 || i < 0) {
    return;  // no interior pixels, or invalid index
  }

  const int row = i / w;
  const int col = i % w;
  if (row < 1 || row >= h - 1 || col < 1 || col >= w - 1) {
    return;  // border pixel or index beyond the image
  }

  // Pointers to the centre pixel's position in the rows above, at and below.
  const uint8_t *up = src_image + (i - w);
  const uint8_t *mid = src_image + i;
  const uint8_t *down = src_image + (i + w);

  // Vertical gradient: bottom row minus top row, weights 1-2-1.
  const int Gy = (down[-1] * 1 + down[0] * 2 + down[1] * 1) -
                 (up[-1] * 1 + up[0] * 2 + up[1] * 1);
  // Horizontal gradient: right column minus left column, weights 1-2-1.
  const int Gx = (up[1] * 1 + mid[1] * 2 + down[1] * 1) -
                 (up[-1] * 1 + mid[-1] * 2 + down[-1] * 1);

  const float temp = (abs(Gx) + abs(Gy)) / 2.0f;
  // Responses above 200 are saturated to white.
  dst_image[i] = temp > 200 ? 255 : temp;
}
img_width * img_height; 127 | 128 | // allocating memory for output images 129 | uint8_t *image_gray = new uint8_t[img_size]; 130 | uint8_t *image_ref = new uint8_t[img_size]; 131 | uint8_t *image_exp1 = new uint8_t[img_size]; 132 | 133 | memset(image_gray, 0, img_size * sizeof(uint8_t)); 134 | memset(image_ref, 0, img_size * sizeof(uint8_t)); 135 | memset(image_exp1, 0, img_size * sizeof(uint8_t)); 136 | 137 | //gray灰度化 138 | for (int p=0;p() 160 | << "\n"; 161 | 162 | // Create SYCL buffer representing source data . 163 | // By default, this buffers will be created with global_buffer access 164 | // target, which means the buffer "projection" to the device (actual 165 | // device memory chunk allocated or mapped on the device to reflect 166 | // buffer's data) will belong to the SYCL global address space - this 167 | // is what host data usually maps to. Other address spaces are: 168 | // private, local and constant. 169 | // Notes: 170 | // - access type (read/write) is not specified when creating a buffer - 171 | // this is done when actual accessor is created 172 | // - there can be multiple accessors to the same buffer in multiple command 173 | // groups 174 | // - 'image' pointer was passed to the constructor, so this host memory 175 | // will be used for "host projection", no allocation will happen on host 176 | buffer image_buf(image_gray, range(img_size)); 177 | 178 | // This is the output buffer device writes to 179 | buffer image_buf_exp1(image_exp1, range(img_size)); 180 | cout << "Submitting lambda kernel...\n"; 181 | 182 | // Submit a command group for execution. Returns immediately, not waiting 183 | // for command group completion. 184 | e1 = q.submit([&](auto &h) { 185 | // This lambda defines a "command group" - a set of commands for the 186 | // device sharing some state and executed in-order - i.e. creation of 187 | // accessors may lead to on-device memory allocation, only after that 188 | // the kernel will be enqueued. 
189 | // A command group can contain at most one parallel_for, single_task or 190 | // parallel_for_workgroup construct. 191 | accessor image_acc(image_buf, h, read_only); 192 | accessor image_exp_acc(image_buf_exp1, h, write_only); 193 | 194 | // This is the simplest form cl::sycl::handler::parallel_for - 195 | // - it specifies "flat" 1D ND range(num_pixels), runtime will select 196 | // local size 197 | // - kernel lambda accepts single cl::sycl::id argument, which has very 198 | // limited API; see the spec for more complex forms 199 | // the lambda parameter of the parallel_for is the kernel, which 200 | // actually executes on device 201 | 202 | 203 | h.parallel_for(range<1>(num_pixels), [=](auto i) { 204 | ApplySSFilter(image_acc.get_pointer(), image_exp_acc.get_pointer(), i,img_width,img_height); 205 | }); 206 | }); 207 | q.wait_and_throw(); 208 | 209 | }catch (sycl::exception e) { 210 | // This catches only synchronous exceptions that happened in current thread 211 | // during execution. The asynchronous exceptions caused by execution of the 212 | // command group are caught by the asynchronous exception handler 213 | // registered. Synchronous exceptions are usually those which are thrown 214 | // from the SYCL runtime code, such as on invalid constructor arguments. An 215 | // example of asynchronous exceptions is error occurred during execution of 216 | // a kernel. Make sure sycl::exception is caught, not std::exception. 
217 | cout << "SYCL exception caught: " << e.what() << "\n"; 218 | return 1; 219 | } 220 | 221 | // report execution times: 222 | ReportTime("Lambda kernel time: ", e1); 223 | 224 | // get reference result 225 | // 计时开始 226 | auto start = system_clock::now(); 227 | ApplySFilter(image_gray, image_ref, img_width, img_height); 228 | auto end = system_clock::now(); 229 | auto duration = duration_cast(end - start); 230 | cout << "Serial time: " << double(duration.count()) << " milliseconds\n"; 231 | 232 | stbi_write_png("sobel.png", img_width, img_height, 1, image_ref, 233 | img_width*1); 234 | stbi_write_png("sobel_lambda.png", img_width, img_height, 1, 235 | image_exp1, img_width * 1); 236 | 237 | stbi_image_free(image); 238 | delete[] image_ref; 239 | 240 | cout << "Successfully applied to image! \n"; 241 | return 0; 242 | } 243 | -------------------------------------------------------------------------------- /dpcpp/dpcpp_templatematching.cpp: -------------------------------------------------------------------------------- 1 | %%writefile lab/gpu_sample.cpp 2 | #include 3 | #include 4 | #include 5 | #include "CL/sycl.hpp" 6 | //#include "device_selector.hpp" 7 | // dpc_common.hpp can be found in the dev-utilities include folder. 8 | // e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp 9 | #include "dpc_common.hpp" 10 | 11 | // stb/*.h files can be found in the dev-utilities include folder. 
// Template-matching score for one candidate position.
//
// Treats (i, j) as the row/column of the template's top-left corner inside
// the image I (Iw x Ih, row-major) and writes the sum of squared differences
// between the Tw x Th template T and the image patch it covers into
// result[i * Iw + j]. Positions where the template would extend past the
// bottom or right edge of the image are skipped and leave `result` untouched.
__attribute__((always_inline)) static void ApplyFilter(uint8_t *I,
                                                       uint8_t *T,
                                                       float *result,
                                                       int i,
                                                       int j,
                                                       int Iw,
                                                       int Ih,
                                                       int Tw,
                                                       int Th) {
  const bool row_fits = i < Ih - Th + 1;
  const bool col_fits = j < Iw - Tw + 1;

  if (row_fits && col_fits) {
    float ssd = 0.0f;
    for (int ty = 0; ty < Th; ++ty) {
      // Start of the patch row in the image and of the matching template row.
      const uint8_t *image_row = I + (i + ty) * Iw + j;
      const uint8_t *templ_row = T + ty * Tw;
      for (int tx = 0; tx < Tw; ++tx) {
        const float delta = image_row[tx] - templ_row[tx];
        ssd += delta * delta;
      }
    }
    result[i * Iw + j] = ssd;
  }
}
| } 76 | cout << "Loaded src image with a width of " << src_img_width << ", a height of " 77 | << src_img_height << " and " << src_channels << " channels\n"; 78 | 79 | // loading the template image 80 | int template_img_width, template_img_height, template_channels; 81 | // 加载图片 模板图片 82 | uint8_t *template_image = stbi_load("./tmp_template_img.jpg", &template_img_width, &template_img_height, &template_channels, 1); 83 | if (template_image == NULL) { 84 | cout << "Error in loading the image\n"; 85 | exit(1); 86 | } 87 | cout << "Loaded template image with a width of " << template_img_width << ", a height of " 88 | << template_img_height << " and " << template_channels << " channels\n"; 89 | 90 | if (src_img_width < template_img_width || src_img_height < template_img_height) { 91 | cout << "Error: The template is larger than the picture\n"; 92 | exit(1); 93 | } 94 | 95 | 96 | // 分配的结果内存 97 | size_t num_counts = src_img_height * src_img_width; 98 | size_t src_size = src_img_height * src_img_width; 99 | size_t template_size = template_img_width * template_img_height; 100 | // 分配输出图像的内存 101 | // allocating memory for output images 102 | float *result = new float[num_counts]; 103 | // 初始化 104 | // memset(image_ref, 0, num_counts * sizeof(float)); 105 | 106 | // Create a device selector which rates available devices in the preferred 107 | // order for the runtime to select the highest rated device 108 | // Note: This is only to illustrate the usage of a custom device selector. 109 | // default_selector can be used if no customization is required. 
110 | // 选择合适的设备 111 | 112 | //device_selector sel; 113 | 114 | // Using these events to time command group execution 115 | event e1, e2; 116 | 117 | // Wrap main SYCL API calls into a try/catch to diagnose potential errors 118 | try { 119 | // Create a command queue using the device selector and request profiling 120 | // 选择最适合的设备 121 | auto prop_list = property_list{property::queue::enable_profiling()}; 122 | queue q(default_selector{}, dpc_common::exception_handler, prop_list); 123 | 124 | // See what device was actually selected for this queue. 125 | cout << "Running on " << q.get_device().get_info() 126 | << "\n"; 127 | // 源图像buffer 128 | buffer src_image_buf(src_image, range(src_size)); 129 | // 模板图像buffer 130 | // This is the output buffer device writes to 131 | buffer template_image_buf(template_image, range(template_size)); 132 | // 结果的buffer 133 | buffer result_buf(result, range(num_counts)); 134 | cout << "Submitting lambda kernel...\n"; 135 | 136 | // Submit a command group for execution. Returns immediately, not waiting 137 | // for command group completion. 
138 | // 得到输出的比较之后的结果 139 | e1 = q.submit([&](auto &h) { 140 | accessor src_image_acc(src_image_buf, h, read_only); 141 | accessor template_image_acc(template_image_buf, h, read_only); 142 | accessor result_acc(result_buf, h, write_only); 143 | // 使用二维线程数 144 | h.parallel_for(range<2>{(size_t)src_img_height, (size_t)src_img_width}, [=](id<2> index) { 145 | // 内核程序执行 146 | ApplyFilter(src_image_acc.get_pointer(), template_image_acc.get_pointer(), result_acc.get_pointer(), index[0], index[1], src_img_width, src_img_height, template_img_width, template_img_height); 147 | }); 148 | }); 149 | q.wait_and_throw(); 150 | } catch (sycl::exception e) { 151 | cout << "SYCL exception caught: " << e.what() << "\n"; 152 | return 1; 153 | } 154 | 155 | // report execution times: 156 | ReportTime("Lambda kernel time: ", e1); 157 | // cout << result[0] << " " << result[1]; 158 | 159 | // 得到匹配位置的最小值 160 | int x,y; 161 | float minresult = result[0]; 162 | for (int i = 0; i < src_img_height - template_img_height + 1; i++) { 163 | for (int j = 0; j < src_img_width - template_img_width + 1; j++) { 164 | if (minresult > result[i * src_img_width + j]) { 165 | y = i; 166 | x = j; 167 | minresult = result[i * src_img_width + j]; 168 | } 169 | } 170 | } 171 | 172 | int x1 = x; 173 | int x2 = x + template_img_width - 1; 174 | int y1 = y; 175 | int y2 = y + template_img_height - 1; 176 | 177 | cout << x1 << " " << x2 << " " << y1 << " " << y2 << " "; 178 | 179 | // 对图片进行保存 180 | 181 | // 先标记两条横线 182 | for (int i = x1; i <= x2; i++) { 183 | src_image[y1 * src_img_width + i] = 0; 184 | src_image[y2 * src_img_width + i] = 0; 185 | } 186 | for (int i = y1 + 1; i < y2; i++) { 187 | src_image[i * src_img_width + x1] = 0; 188 | src_image[i * src_img_width + x2] = 0; 189 | } 190 | 191 | stbi_write_png("sepia_ref.png", src_img_width, src_img_height, src_channels, src_image, src_img_width * src_channels); 192 | return 0; 193 | } 194 | 
-------------------------------------------------------------------------------- /dpcpp/dpcpp_vectorAdd.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace sycl; 3 | static const size_t numElements = 50000; 4 | void work(queue &q) { 5 | std::cout << "Device : " 6 | << q.get_device().get_info() 7 | << std::endl; 8 | float vector1[numElements] , vector2[numElements] , vector3[numElements]; 9 | auto R = range(numElements); 10 | for (int i = 0; i < numElements; ++i) { 11 | vector1[i] = rand()/(float)RAND_MAX; 12 | vector2[i] = rand()/(float)RAND_MAX; 13 | } 14 | //2.创建vector1、vector2、vector3向量的SYCL缓冲区; 15 | buffer vector1_buffer(vector1,R); 16 | buffer vector2_buffer(vector2,R); 17 | buffer vector3_buffer(vector3,R); 18 | 19 | //3.向Device提交工作(定义了访问缓冲区内存的accessor;) 20 | q.submit([&](handler &h) { 21 | accessor v1_accessor (vector1_buffer,h,read_only); 22 | accessor v2_accessor (vector2_buffer,h,read_only); 23 | accessor v3_accessor (vector3_buffer,h); 24 | //4. 调用oneAPI的核函数在Device上完成指定的运算; 25 | h.parallel_for (range<1>(numElements), [=](id<1> index) { 26 | //核函数部分,若单独写一个函数,直接使用函数名(参数表)调用即可 27 | if (index < numElements) 28 | v3_accessor [index] = v1_accessor [index] + v2_accessor [index]; 29 | }); 30 | }).wait(); //排队等待 31 | // 5. 将SYCL缓冲区的数据读到Host端,检查误差 32 | host_accessor h_c(vector3_buffer,read_only); 33 | for (int i = 0; i < numElements; ++i) { 34 | if (fabs(vector1[0] + vector2[0] - vector3[0] ) > 1e-8 ) { 35 | fprintf(stderr, "Result verification failed at element %d!\n", i); 36 | exit(EXIT_FAILURE); 37 | } 38 | } 39 | } 40 | int main() { 41 | try { 42 | queue q; 43 | work(q); 44 | } catch (exception e) { 45 | std::cerr << "Exception: " << e.what() << std::endl; 46 | std::terminate(); 47 | } catch (...) 
{ 48 | std::cerr << "Unknown exception" << std::endl; 49 | std::terminate(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /hip/hip_vectorAdd.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | __global__ void vectorAdd(float *d_A,float *d_B,float *d_C,int numElements) 7 | { 8 | int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; 9 | if(i 1e-8) 57 | { 58 | fprintf(stderr, "Result verification failed at element %d!\n", i); 59 | exit(EXIT_FAILURE); 60 | } 61 | } 62 | 63 | //5.释放内存 64 | hipFree(d_A); 65 | hipFree(d_B); 66 | hipFree(d_C); 67 | free(h_A); 68 | free(h_B); 69 | free(h_C); 70 | 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- /opencl/OpenCL_Mixer.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace std; 11 | 12 | const int t = 1000; 13 | 14 | 15 | const int heightA = t; 16 | const int widthB = t; 17 | const int midle = t; 18 | 19 | //const int heightB = 3; 20 | 21 | //一、 选择OpenCL平台并创建一个上下文 22 | cl_context CreateContext() 23 | { 24 | cl_int errNum; 25 | cl_uint numPlatforms; 26 | cl_platform_id firstPlatformId; 27 | cl_context context = NULL; 28 | 29 | //选择可用的平台中的第一个 30 | errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms); 31 | if (errNum != CL_SUCCESS || numPlatforms <= 0) 32 | { 33 | std::cerr << "Failed to find any OpenCL platforms." 
<< std::endl; 34 | return NULL; 35 | } 36 | 37 | //创建一个OpenCL上下文环境 38 | cl_context_properties contextProperties[] = 39 | { 40 | CL_CONTEXT_PLATFORM, 41 | (cl_context_properties)firstPlatformId, 42 | 0 43 | }; 44 | context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU, 45 | NULL, NULL, &errNum); 46 | 47 | return context; 48 | } 49 | 50 | 51 | //二、 创建设备并创建命令队列 52 | cl_command_queue CreateCommandQueue(cl_context context, cl_device_id* device) 53 | { 54 | cl_int errNum; 55 | cl_device_id* devices; 56 | cl_command_queue commandQueue = NULL; 57 | size_t deviceBufferSize = -1; 58 | 59 | // 获取设备缓冲区大小 60 | errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize); 61 | 62 | if (deviceBufferSize <= 0) 63 | { 64 | std::cerr << "No devices available."; 65 | return NULL; 66 | } 67 | 68 | // 为设备分配缓存空间 69 | devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)]; 70 | errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL); 71 | 72 | //选取可用设备中的第一个 73 | commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL); 74 | 75 | *device = devices[0]; 76 | delete[] devices; 77 | return commandQueue; 78 | } 79 | 80 | 81 | // 三、创建和构建程序对象 82 | cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName) 83 | { 84 | cl_int errNum; 85 | cl_program program; 86 | 87 | std::ifstream kernelFile(fileName, std::ios::in); 88 | if (!kernelFile.is_open()) 89 | { 90 | std::cerr << "Failed to open file for reading: " << fileName << std::endl; 91 | return NULL; 92 | } 93 | 94 | std::ostringstream oss; 95 | oss << kernelFile.rdbuf(); 96 | 97 | std::string srcStdStr = oss.str(); 98 | const char* srcStr = srcStdStr.c_str(); 99 | program = clCreateProgramWithSource(context, 1, 100 | (const char**)&srcStr, 101 | NULL, NULL); 102 | 103 | errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 104 | 105 | return program; 106 | } 107 | 108 | //创建和构建程序对象 109 | bool 
CreateMemObjects(cl_context context, cl_mem memObjects[3], 110 | int* a, int* b) 111 | { 112 | memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 113 | sizeof(int) * midle * heightA, a, NULL); 114 | memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 115 | sizeof(int) * widthB * midle, b, NULL); 116 | memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE, 117 | sizeof(int) * widthB * heightA, NULL, NULL); 118 | return true; 119 | } 120 | 121 | 122 | // 释放OpenCL资源 123 | void Cleanup(cl_context context, cl_command_queue commandQueue, 124 | cl_program program, cl_kernel kernel, cl_mem memObjects[3]) 125 | { 126 | for (int i = 0; i < 3; i++) 127 | { 128 | if (memObjects[i] != 0) 129 | clReleaseMemObject(memObjects[i]); 130 | } 131 | if (commandQueue != 0) 132 | clReleaseCommandQueue(commandQueue); 133 | 134 | if (kernel != 0) 135 | clReleaseKernel(kernel); 136 | 137 | if (program != 0) 138 | clReleaseProgram(program); 139 | 140 | if (context != 0) 141 | clReleaseContext(context); 142 | } 143 | 144 | 145 | int main(int argc, char** argv) 146 | { 147 | cl_context context = 0; 148 | cl_command_queue commandQueue = 0; 149 | cl_program program = 0; 150 | cl_device_id device = 0; 151 | cl_kernel kernel = 0; 152 | cl_mem memObjects[3] = { 0, 0, 0 }; 153 | cl_int errNum; 154 | cl_event events[1]; 155 | clock_t t1, t2, t3; 156 | 157 | 158 | const char* filename = "./a.cl"; 159 | // 一、选择OpenCL平台并创建一个上下文 160 | context = CreateContext(); 161 | 162 | // 二、 创建设备并创建命令队列 163 | commandQueue = CreateCommandQueue(context, &device); 164 | 165 | //三、创建和构建程序对象 166 | program = CreateProgram(context, device, filename); 167 | 168 | // 四、 创建OpenCL内核并分配内存空间 169 | kernel = clCreateKernel(program, "hello_kernel", NULL); 170 | 171 | //创建要处理的数据 172 | int* a = NULL; // 输入数组 173 | int* b = NULL; // 输入数组 174 | int* result = NULL; // 输出数组 175 | // 数组的大小 176 | const int elementsA = heightA * midle; 177 | const int elementsB = midle * widthB; 
178 | const int elementsC = heightA * widthB; 179 | 180 | // 计算内存大小 181 | size_t datasizeA = sizeof(float) * elementsA; 182 | size_t datasizeB = sizeof(float) * elementsB; 183 | size_t datasizeC = sizeof(float) * elementsC; 184 | // 分配内存空间 185 | a = (int*)malloc(datasizeA); 186 | b = (int*)malloc(datasizeB); 187 | result = (int*)malloc(datasizeC); 188 | 189 | for (int i = 0; i < heightA; i++) 190 | { 191 | for (int j = 0; j < midle; j++) 192 | { 193 | a[i * midle + j] = 2;//10.0f * ((int) rand() / (int) RAND_MAX); 194 | } 195 | 196 | } 197 | 198 | 199 | for (int k = 0; k < midle; k++) 200 | { 201 | for (int m = 0; m < widthB; m++) 202 | { 203 | b[k * widthB + m] = 3;//10.0f * ((int) rand() / (int) RAND_MAX); 204 | } 205 | 206 | } 207 | 208 | t1 = clock(); //mach_absolute_time(); 209 | //cpu串行处理代码 210 | for (int l = 0; l < heightA; l++) { 211 | for (int n = 0; n < widthB; n++) { 212 | for (int q = 0; q < midle; q++) { 213 | result[l * widthB + n] += a[l * midle + q] * b[q * widthB + n]; 214 | 215 | } 216 | //std::cout<<"r = "< 5 | #include 6 | 7 | // OpenCL includes 8 | #include 9 | 10 | // OpenCL kernel to perform an element-wise addition 11 | const char* programSource = 12 | "__kernel \n" 13 | "void vecadd(__global int *A, \n" 14 | " __global int *B, \n" 15 | " __global int *C) \n" 16 | "{ \n" 17 | " \n" 18 | " // Get the work-item’s unique ID \n" 19 | " int idx = get_global_id(0); \n" 20 | " \n" 21 | " // Add the corresponding locations of \n" 22 | " // 'A' and 'B', and store the result in 'C'. 
\n" 23 | " C[idx] = A[idx] + B[idx]; \n" 24 | "} \n" 25 | ; 26 | 27 | int main() { 28 | // This code executes on the OpenCL host 29 | 30 | // Host data 31 | int *A = NULL; // Input array 32 | int *B = NULL; // Input array 33 | int *C = NULL; // Output array 34 | 35 | // Elements in each array 36 | const int elements = 2048; 37 | 38 | // Compute the size of the data 39 | size_t datasize = sizeof(int)*elements; 40 | 41 | // Allocate space for input/output data 42 | A = (int*)malloc(datasize); 43 | B = (int*)malloc(datasize); 44 | C = (int*)malloc(datasize); 45 | 46 | // Initialize the input data 47 | int i; 48 | for(i = 0; i < elements; i++) { 49 | A[i] = i; 50 | B[i] = i; 51 | } 52 | 53 | // Use this to check the output of each API call 54 | cl_int status; 55 | 56 | // Retrieve the number of platforms 57 | cl_uint numPlatforms = 0; 58 | status = clGetPlatformIDs(0, NULL, &numPlatforms); 59 | 60 | // Allocate enough space for each platform 61 | cl_platform_id *platforms = NULL; 62 | platforms = (cl_platform_id*)malloc( 63 | numPlatforms*sizeof(cl_platform_id)); 64 | 65 | // Fill in the platforms 66 | status = clGetPlatformIDs(numPlatforms, platforms, NULL); 67 | 68 | // Retrieve the number of devices 69 | cl_uint numDevices = 0; 70 | status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, 71 | NULL, &numDevices); 72 | 73 | // Allocate enough space for each device 74 | cl_device_id *devices; 75 | devices = (cl_device_id*)malloc( 76 | numDevices*sizeof(cl_device_id)); 77 | 78 | // Fill in the devices 79 | status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 80 | numDevices, devices, NULL); 81 | 82 | // Create a context and associate it with the devices 83 | cl_context context; 84 | context = clCreateContext(NULL, numDevices, devices, NULL, 85 | NULL, &status); 86 | 87 | // Create a command queue and associate it with the device 88 | cl_command_queue cmdQueue; 89 | cmdQueue = clCreateCommandQueue(context, devices[0], 0, 90 | &status); 91 | 92 | // Create a 
buffer object that will contain the data 93 | // from the host array A 94 | cl_mem bufA; 95 | bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, 96 | NULL, &status); 97 | 98 | // Create a buffer object that will contain the data 99 | // from the host array B 100 | cl_mem bufB; 101 | bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, 102 | NULL, &status); 103 | 104 | // Create a buffer object that will hold the output data 105 | cl_mem bufC; 106 | bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, 107 | NULL, &status); 108 | 109 | // Write input array A to the device buffer bufferA 110 | status = clEnqueueWriteBuffer(cmdQueue, bufA, CL_FALSE, 111 | 0, datasize, A, 0, NULL, NULL); 112 | 113 | // Write input array B to the device buffer bufferB 114 | status = clEnqueueWriteBuffer(cmdQueue, bufB, CL_FALSE, 115 | 0, datasize, B, 0, NULL, NULL); 116 | 117 | // Create a program with source code 118 | cl_program program = clCreateProgramWithSource(context, 1, 119 | (const char**)&programSource, NULL, &status); 120 | 121 | // Build (compile) the program for the device 122 | status = clBuildProgram(program, numDevices, devices, 123 | NULL, NULL, NULL); 124 | 125 | // Create the vector addition kernel 126 | cl_kernel kernel; 127 | kernel = clCreateKernel(program, "vecadd", &status); 128 | 129 | // Associate the input and output buffers with the kernel 130 | status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA); 131 | status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB); 132 | status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC); 133 | 134 | // Define an index space (global work size) of work 135 | // items for execution. A workgroup size (local work size) 136 | // is not required, but can be used. 
137 | size_t globalWorkSize[1]; 138 | 139 | // There are 'elements' work-items 140 | globalWorkSize[0] = elements; 141 | 142 | // Execute the kernel for execution 143 | status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, 144 | globalWorkSize, NULL, 0, NULL, NULL); 145 | 146 | // Read the device output buffer to the host output array 147 | clEnqueueReadBuffer(cmdQueue, bufC, CL_TRUE, 0, 148 | datasize, C, 0, NULL, NULL); 149 | 150 | // Verify the output 151 | int result = 1; 152 | for(i = 0; i < elements; i++) { 153 | if(C[i] != i+i) { 154 | result = 0; 155 | break; 156 | } 157 | } 158 | if(result) { 159 | printf("Output is correct\n"); 160 | } else { 161 | printf("Output is incorrect\n"); 162 | } 163 | 164 | // Free OpenCL resources 165 | clReleaseKernel(kernel); 166 | clReleaseProgram(program); 167 | clReleaseCommandQueue(cmdQueue); 168 | clReleaseMemObject(bufA); 169 | clReleaseMemObject(bufB); 170 | clReleaseMemObject(bufC); 171 | clReleaseContext(context); 172 | 173 | // Free host resources 174 | free(A); 175 | free(B); 176 | free(C); 177 | free(platforms); 178 | free(devices); 179 | 180 | return 0; 181 | } 182 | 183 | -------------------------------------------------------------------------------- /opencl/a.cl: -------------------------------------------------------------------------------- 1 | __kernel void hello_kernel(__global const int *a, 2 | __global const int *b, 3 | __global int *result_matrix,int result_matrix_row, 4 | int result_matrix_col,int compute_size) 5 | { 6 | int row = get_global_id(0); 7 | int col = get_global_id(1); 8 | 9 | 10 | int sum = 0; 11 | for(int i=0;i 2 | #include 3 | #include 4 | #include 5 | 6 | const int NUM_THREADS = 4; //设置线程数量 7 | int N = 10000; 8 | int M = 10000; 9 | int mat[10000][10000]; //矩阵mat 10 | int vec[10000], ans[10000]; //向量vec 11 | 12 | void makeRandomMatrix() //生成矩阵 13 | { 14 | srand(time(NULL)); 15 | int i, j; 16 | for (i = 0; i < M; i++) 17 | { 18 | for (j = 0; j < N; j++) 19 | { 20 | mat[i][j] = rand() 
% 10 + 1; 21 | } 22 | } 23 | } 24 | 25 | void makeRandomVector() //生成向量 26 | { 27 | srand(time(NULL)); 28 | int i; 29 | for (i = 0; i < N; i++) 30 | { 31 | vec[i] = rand() % 10 + 1; 32 | } 33 | } 34 | 35 | void funy(int a[], int cur) //计算矩阵和矢量乘的部分结果 36 | { 37 | int i; 38 | for (i = 0; i < N; i++) 39 | { 40 | ans[cur] += a[i] * vec[i]; 41 | } 42 | } 43 | 44 | void f() //串行计算 45 | { 46 | int i; 47 | for (i = 0; i < M; i++) 48 | { 49 | funy(mat[i], i); 50 | } 51 | } 52 | 53 | void fp() //并行计算 54 | { 55 | int i; 56 | #pragma omp parallel for num_threads(NUM_THREADS) 57 | for (i = 0; i < M; i ++) 58 | { 59 | funy(mat[i], i); 60 | } 61 | } 62 | 63 | int main() 64 | { 65 | printf("Makeing matrix(%d*%d) & vector(%d*1)...\n",N,M,N); 66 | makeRandomMatrix(); 67 | makeRandomVector(); 68 | double start_time = omp_get_wtime(); 69 | f(); 70 | double end_time = omp_get_wtime(); 71 | printf("串行 --- Running time=%f s\n", end_time - start_time); 72 | start_time = omp_get_wtime(); 73 | fp(); 74 | end_time = omp_get_wtime(); 75 | printf("%d threads --- Running time=%f s\n", NUM_THREADS,end_time - start_time); 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /openmp/OpenMP-matrix_multiplication.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | using namespace std; 7 | const int maxn = 4; 8 | 9 | int A[maxn][maxn], B[maxn][maxn], C[maxn][maxn]; 10 | 11 | int main() { 12 | int i, j, k; 13 | 14 | omp_set_num_threads(omp_get_num_procs()); 15 | srand(time(NULL)); 16 | for (i = 0; i < maxn; i++) 17 | for (j = 0; j < maxn; j++) { 18 | A[i][j] = rand() % 10; 19 | B[i][j] = rand() % 10; 20 | } 21 | 22 | #pragma omp parallel for private(i,j,k) shared(A,B,C) 23 | for (i = 0; i < maxn; ++i) 24 | for (j = 0; j < maxn; ++j) 25 | for (k = 0; k < maxn; ++k){ 26 | //printf("OpenMP Test, : %d\n", omp_get_thread_num()); 27 | C[i][j] += A[i][k] 
// Minimal OpenMP demo: the main thread announces itself, a parallel region
// lets every thread of the team print its id and the team size, then
// control returns to the main thread.
int main()
{
    int nthreads, thread_id;
    printf("I am the main thread.\n");

    // Request a team of 32 threads for the following parallel region.
    // This line used to be written as "#omp_set_num_threads(32);", which
    // the preprocessor rejects as an invalid directive; it is an ordinary
    // runtime-library call.
    omp_set_num_threads(32);

    // nthreads and thread_id are private so that each thread works on its
    // own copy instead of racing on the variables of the main thread.
    #pragma omp parallel private(nthreads,thread_id)
    {
        nthreads = omp_get_num_threads();
        thread_id = omp_get_thread_num();
        printf("Helllo I am thread %d out of a team of %d\n", thread_id, nthreads);
    }

    printf("Here I am,back to the main thread.\n");
    return 0;
}
// Thread entry point: prints the id of the calling thread.
//
// id - unused; present because pthread_create requires this signature.
// Returns NULL.
void* thread(void *id){
    pthread_t newthid;

    (void)id;
    newthid = pthread_self();
    // pthread_t is an opaque type; printing it with %u is a format/argument
    // mismatch. The cast assumes pthread_t is an integer or pointer type,
    // which is the case on Linux/glibc.
    printf("this is a new thread, thread ID is %lu\n", (unsigned long)newthid);
    return NULL;
}

// Create num_thread threads that each print their id, then wait for them.
//
// Returns 0 on success and 1 if the handle array cannot be allocated or a
// thread cannot be created. Threads that were already started are joined
// and the handle array is freed on every path.
int main(){
    int num_thread = 5;
    int created = 0;
    int exit_code = 0;
    pthread_t *pt = (pthread_t *)malloc(sizeof(pthread_t) * num_thread);

    if (pt == NULL){
        printf("thread create failed!\n");
        return 1;
    }

    printf("main thread, ID is %lu\n", (unsigned long)pthread_self());
    for (int i = 0; i < num_thread; i++){
        if (pthread_create(&pt[i], NULL, thread, NULL) != 0){
            printf("thread create failed!\n");
            exit_code = 1;
            break;
        }
        created++;
    }

    // Join instead of sleeping for a fixed time: this waits exactly as long
    // as the threads need and guarantees they finished before pt is freed.
    for (int i = 0; i < created; i++){
        pthread_join(pt[i], NULL);
    }

    free(pt);
    return exit_code;
}
pthread_mutex_unlock(&mutex); //给互斥量解锁 42 | break; 43 | } 44 | sleep(1); 45 | } 46 | return (void *)0; 47 | } 48 | 49 | int main(int argc, const char *argv[]) 50 | { 51 | //int i = 0; 52 | int ret = 0; 53 | pthread_t id1, id2; 54 | 55 | ret = pthread_create(&id1, NULL, (void *)mythread1, NULL); //创建线程1 56 | if (ret) 57 | { 58 | printf("Create pthread error!\n"); 59 | return 1; 60 | } 61 | 62 | ret = pthread_create(&id2, NULL, (void *)mythread2, NULL); //创建线程2 63 | if (ret) 64 | { 65 | printf("Create pthread error!\n"); 66 | return 1; 67 | } 68 | 69 | pthread_join(id1, NULL); //等待线程结束 70 | pthread_join(id2, NULL); 71 | 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /pthread/pthread.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gcc PThread-matrix_multiplication.c -o PThread-matrix_multiplication -pthread 4 | 5 | gcc PThread-simple_instances.c -o PThread-simple_instances -pthread 6 | 7 | gcc PThread-synchronization.c -o PThread-synchronization -pthread --------------------------------------------------------------------------------