├── README.md
├── cuda
    ├── Transformer Encoder.cu
    ├── cu_managed_Matrixmultiplication.cu
    ├── cu_vectorAdd.cu
    └── cuda.sh
├── dpcpp
    ├── dpcpp_sobel.cpp
    ├── dpcpp_templatematching.cpp
    └── dpcpp_vectorAdd.cpp
├── hip
    └── hip_vectorAdd.cpp
├── opencl
    ├── OpenCL_Mixer.cpp
    ├── OpenCL_vectorAdd.c
    ├── a.cl
    └── opencl.sh
├── openmp
    ├── OpenMP-Matrix_Vector_Multiplication.c
    ├── OpenMP-matrix_multiplication.cpp
    ├── OpenMP-simple_instances.c
    └── openmp.sh
└── pthread
    ├── PThread-matrix_multiplication.c
    ├── PThread-simple_instances.c
    ├── PThread-synchronization.c
    └── pthread.sh


/README.md:
--------------------------------------------------------------------------------
 1 | # Parallel Computing Lab 
 2 | 
 3 | 
 4 | This is a compilation of experiments on multi-thread computing, parallel computing and a small project on parallel programming language implementations, including Pthread, OpenMP, CUDA, HIP, OpenCL and DPC++.
 5 | 
 6 | 
 7 | 
 8 | ## OpenMP
 9 | 
10 | OpenMP-simple_instances.c is a simple OpenMP example
11 | 
12 | OpenMP-Matrix_Vector_Multiplication.c is an OpenMP example of vector and matrix multiplication.
13 | 
14 | OpenMP-matrix_multiplication.cpp is an OpenMP example of matrix multiplication.
15 | 
16 | For specific instructions, see
17 | https://blog.csdn.net/qq_46009046/article/details/133587081
18 | 
19 | ## PThread
20 | 
21 | PThread-simple_instances.c is a simple PThread example
22 | 
23 | PThread-synchronization.c is a synchronized PThread example
24 | 
25 | PThread-matrix_multiplication.c is a matrix multiplication PThread example
26 | 
27 | For specific instructions, see
28 | https://blog.csdn.net/qq_46009046/article/details/133587081
29 | 
30 | ## CUDA
31 | 
32 | cu_vectorAdd.cu is a simple vector addition CUDA example
33 | 
34 | cu_managed_Matrixmultiplication.cu is a matrix multiplication implemented using CUDA unified (managed) memory
35 | 
36 | Transformer Encoder.cu is a CUDA-based example implementation of a Transformer Encoder
37 | 
38 | For specific instructions, see
39 | https://blog.csdn.net/qq_46009046/article/details/133753993
40 | https://blog.csdn.net/qq_46009046/article/details/133797554
41 | https://blog.csdn.net/qq_46009046/article/details/134020656
42 | 
43 | ## OpenCL
44 | 
45 | OpenCL_vectorAdd.c is a simple vector addition OpenCL example
46 | 
47 | OpenCL_Mixer.cpp is a matrix multiplication implementation using OpenCL.
48 | 
49 | a.cl is the kernel function for the OpenCL matrix multiplication
50 | 
51 | For specific instructions, see
52 | https://blog.csdn.net/qq_46009046/article/details/133777178
53 | 
54 | ## HIP
55 | 
56 | hip_vectorAdd.cpp is a simple vector addition HIP example
57 | 
58 | For specific instructions, see
59 | https://blog.csdn.net/qq_46009046/article/details/133583217
60 | 
61 | ## DPC++
62 | 
63 | dpcpp_vectorAdd.cpp is a simple vector addition DPC++ example
64 | 
65 | dpcpp_templatematching.cpp is a template matching DPC++ example
66 | 
67 | dpcpp_sobel.cpp is a sobel filter DPC++ example.
68 | 
69 | For specific instructions, see
70 | https://blog.csdn.net/qq_46009046/article/details/123306679
71 | 


--------------------------------------------------------------------------------
/cuda/Transformer Encoder.cu:
--------------------------------------------------------------------------------
  1 | 
  2 | #include<bits/stdc++.h>
  3 | #include "cuda_runtime.h"
  4 | #include "device_launch_parameters.h"
  5 | #include <cublas_v2.h>
  6 | // cuda API error checking
  7 | #define CUDA_CHECK(err)                                                                            \
  8 |     do {                                                                                           \
  9 |         cudaError_t err_ = (err);                                                                  \
 10 |         if (err_ != cudaSuccess) {                                                                 \
 11 |             std::printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__);                     \
 12 |         }                                                                                          \
 13 |     } while (0)
 14 | 
 15 | // cublas API error checking
 16 | #define CUBLAS_CHECK(err)                                                                          \
 17 |     do {                                                                                           \
 18 |         cublasStatus_t err_ = (err);                                                               \
 19 |         if (err_ != CUBLAS_STATUS_SUCCESS) {                                                       \
 20 |             std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__);                   \
 21 |         }                                                                                          \
 22 |     } while (0)
 23 | 
 24 | 
#define batch_count 8
#define M 197
#define N 768

using data_type = float;
// Input tensor and a second buffer of the same size (per the original note, meant for the residual connection)
std::vector<std::vector<data_type>> tensor(batch_count, std::vector<data_type>(M* N));
std::vector<std::vector<data_type>> tensor_copy(batch_count, std::vector<data_type>(M* N));
// Parameters of the three linear layers; one weight (w) and one bias/output (b) buffer per batch entry
std::vector<std::vector<data_type>> w1(batch_count, std::vector<data_type>(768 * 2304));
std::vector<std::vector<data_type>> b1(batch_count, std::vector<data_type>(197 * 2304));
std::vector<std::vector<data_type>> w2(batch_count, std::vector<data_type>(768 * 3072));
std::vector<std::vector<data_type>> b2(batch_count, std::vector<data_type>(197 * 3072));
std::vector<std::vector<data_type>> w3(batch_count, std::vector<data_type>(3072 * 768));
std::vector<std::vector<data_type>> b3(batch_count, std::vector<data_type>(197 * 768));
// Q, K, V and the intermediate Q*K^T buffer; batch_count * 8 matrices each
std::vector<std::vector<data_type>> Q(batch_count * 8, std::vector<data_type>(197 * 96));
std::vector<std::vector<data_type>> K(batch_count * 8, std::vector<data_type>(197 * 96));
std::vector<std::vector<data_type>> V(batch_count * 8, std::vector<data_type>(197 * 96));
std::vector<std::vector<data_type>> QK(batch_count * 8, std::vector<data_type>(197 * 197));
 45 | 
 46 | //**********************************initial*******************************************
 47 | //初始化tensor数据
 48 | void tensor_initial(int input_dim, int output_dim)
 49 | {
 50 |     const int m = input_dim;
 51 |     const int n = output_dim;
 52 |     for (int b = 0; b < batch_count; b++)
 53 |         for (int i = 0; i < m; i++) {
 54 |             for (int j = 0; j < n; j++) {
 55 |                 tensor[b][j * m + i] = (float)(rand() % 101) / 101;
 56 |             }
 57 |         }
 58 | }
 59 | //初始化liner层w和b数据
 60 | void liner_initial() {
 61 |     //第一个liner层（B*197*768————B*197*2304）
 62 |     const int m1 = 197;
 63 |     const int k1 = 768;
 64 |     const int n1 = 2304;
 65 |     for (int b = 0; b < batch_count; b++)
 66 |         for (int i = 0; i < k1; i++) {
 67 |             for (int j = 0; j < n1; j++) {
 68 |                 w1[b][j * k1 + i] = (float)(rand() % 101) / 101;
 69 |             }
 70 |         }
 71 |     for (int b = 0; b < batch_count; b++)
 72 |         for (int i = 0; i < m1; i++) {
 73 |             for (int j = 0; j < n1; j++) {
 74 |                 b1[b][j * m1 + i] = (float)(rand() % 101) / 101;
 75 |             }
 76 |         }
 77 |     //第二个liner层（B*197*768————B*197*3072）
 78 |     const int m2 = 197;
 79 |     const int k2 = 768;
 80 |     const int n2 = 3072;
 81 |     for (int b = 0; b < batch_count; b++)
 82 |         for (int i = 0; i < k2; i++) {
 83 |             for (int j = 0; j < n2; j++) {
 84 |                 w2[b][j * k2 + i] = (float)(rand() % 101) / 101;
 85 |             }
 86 |         }
 87 |     for (int b = 0; b < batch_count; b++)
 88 |         for (int i = 0; i < m2; i++) {
 89 |             for (int j = 0; j < n2; j++) {
 90 |                 b2[b][j * m2 + i] = (float)(rand() % 101) / 101;
 91 |             }
 92 |         }
 93 |     //第三个liner层（B*197*3072————B*197*768）
 94 |     const int m3 = 197;
 95 |     const int k3 = 3072;
 96 |     const int n3 = 768;
 97 |     for (int b = 0; b < batch_count; b++)
 98 |         for (int i = 0; i < k3; i++) {
 99 |             for (int j = 0; j < n3; j++) {
100 |                 w3[b][j * k3 + i] = (float)(rand() % 101) / 101;
101 |             }
102 |         }
103 |     for (int b = 0; b < batch_count; b++)
104 |         for (int i = 0; i < m3; i++) {
105 |             for (int j = 0; j < n3; j++) {
106 |                 b3[b][j * m3 + i] = (float)(rand() % 101) / 101;
107 |             }
108 |         }
109 | }
// Fills the global Q, K and V buffers (batch_count * 8 matrices each) with
// pseudo-random values in [0, 100/101].
//   input_dim  - number of rows written per matrix
//   output_dim - number of columns written per matrix
// For every element the values are drawn in the order Q, K, V; element
// (row, col) is stored at index col * input_dim + row.
void qkv_initial(int input_dim, int output_dim) {
    const int head_total = batch_count * 8;
    for (int head = 0; head < head_total; ++head) {
        for (int row = 0; row < input_dim; ++row) {
            for (int col = 0; col < output_dim; ++col) {
                const int idx = col * input_dim + row;
                Q[head][idx] = (float)(rand() % 101) / 101;
                K[head][idx] = (float)(rand() % 101) / 101;
                V[head][idx] = (float)(rand() % 101) / 101;
            }
        }
    }
}
124 | //******************************MultiHeadAttention************************************
// First linear layer: for every batch entry computes
//   b1[b] = tensor[b] * w1[b] + b1[b]
// with a single cublasSgemmBatched call; the result (B x 197 x 2304 in the
// intended use) overwrites the global b1 buffers.
//   input_dim  - inner dimension k (columns of tensor[b], rows of w1[b])
//   output_dim - number of columns n of w1[b] and b1[b]
// Matrices are passed to cuBLAS with leading dimensions m / k / m, i.e. the
// host buffers are interpreted column-major. Errors are reported through
// CUDA_CHECK / CUBLAS_CHECK.
void liner_1(int input_dim, int output_dim) {
    const int m = 197;
    const int n = output_dim;
    const int k = input_dim;
    const int lda = m;
    const int ldb = k;
    const int ldc = m;

    const data_type alpha = 1.0f;
    const data_type beta = 1.0f;  // beta = 1 adds the product onto the bias preloaded into C

    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    data_type** d_A_array = nullptr;
    data_type** d_B_array = nullptr;
    data_type** d_C_array = nullptr;

    std::vector<data_type*> d_A(batch_count, nullptr);
    std::vector<data_type*> d_B(batch_count, nullptr);
    std::vector<data_type*> d_C(batch_count, nullptr);

    cublasOperation_t transa = CUBLAS_OP_N;
    cublasOperation_t transb = CUBLAS_OP_N;

    /* step 1: create cublas handle, bind a stream */
    CUBLAS_CHECK(cublasCreate(&cublasH));
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    /* step 2: copy data to device */
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_A[i]), sizeof(data_type) * tensor[i].size()));
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_B[i]), sizeof(data_type) * w1[i].size()));
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_C[i]), sizeof(data_type) * b1[i].size()));
    }

    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_A_array), sizeof(data_type*) * batch_count));
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_B_array), sizeof(data_type*) * batch_count));
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_C_array), sizeof(data_type*) * batch_count));

    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaMemcpyAsync(d_A[i], tensor[i].data(), sizeof(data_type) * tensor[i].size(),
            cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_B[i], w1[i].data(), sizeof(data_type) * w1[i].size(),
            cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_C[i], b1[i].data(), sizeof(data_type) * b1[i].size(),
            cudaMemcpyHostToDevice, stream));
    }

    // The batched GEMM takes device-resident arrays of device pointers.
    CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type*) * batch_count,
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type*) * batch_count,
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type*) * batch_count,
        cudaMemcpyHostToDevice, stream));

    /* step 3: compute */
    CUBLAS_CHECK(cublasSgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda,
        d_B_array, ldb, &beta, d_C_array, ldc, batch_count));

    /* step 4: copy data to host */
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaMemcpyAsync(b1[i].data(), d_C[i], sizeof(data_type) * b1[i].size(),
            cudaMemcpyDeviceToHost, stream));
    }

    // Make sure all work queued on the stream has finished (and surface any
    // asynchronous error) before device buffers are released and b1 is read.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* free resources */
    CUDA_CHECK(cudaFree(d_A_array));
    CUDA_CHECK(cudaFree(d_B_array));
    CUDA_CHECK(cudaFree(d_C_array));
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaFree(d_A[i]));
        CUDA_CHECK(cudaFree(d_B[i]));
        CUDA_CHECK(cudaFree(d_C[i]));
    }

    CUBLAS_CHECK(cublasDestroy(cublasH));

    CUDA_CHECK(cudaStreamDestroy(stream));
}
// Intended to split b1 (B x 197 x 2304) into q, k and v (B x 8 x 197 x 96).
// NOTE(review): the body is empty, so this is currently a no-op stub and
// Q/K/V are not derived from b1 here — TODO implement.
void Permute_1() {
    
}
// Scaled attention scores: for each of the batch_count * 8 matrices computes
//   QK[h] = (Q[h] * K[h]^T) * 1/sqrt(Dk),   Dk = 96
// with a single cublasSgemmBatched call and stores the 197 x 197 result in
// the global QK buffers. Q[h] and K[h] are read as 197 x 96 matrices with
// leading dimension 197; K is transposed by cuBLAS (CUBLAS_OP_T).
// Errors are reported through CUDA_CHECK / CUBLAS_CHECK.
void multiplication_1() {
    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    const int m = 197;
    const int n = 197;
    const int k = 96;   // Dk: inner dimension of Q * K^T
    const int lda = m;
    const int ldb = n;  // K is stored n x k and transposed by the GEMM
    const int ldc = m;
    const int m_batch_count = batch_count * 8;

    // Scaling factor 1/sqrt(Dk).
    const data_type alpha = 1.0f / std::sqrt(static_cast<data_type>(k));
    const data_type beta = 0.0f;

    data_type** d_A_array = nullptr;
    data_type** d_B_array = nullptr;
    data_type** d_C_array = nullptr;

    std::vector<data_type*> d_A(m_batch_count, nullptr);
    std::vector<data_type*> d_B(m_batch_count, nullptr);
    std::vector<data_type*> d_C(m_batch_count, nullptr);

    cublasOperation_t transa = CUBLAS_OP_N;
    cublasOperation_t transb = CUBLAS_OP_T;  // transpose K

    /* step 1: create cublas handle, bind a stream */
    CUBLAS_CHECK(cublasCreate(&cublasH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    /* step 2: copy data to device */
    for (int i = 0; i < m_batch_count; i++) {
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_A[i]), sizeof(data_type) * Q[i].size()));
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_B[i]), sizeof(data_type) * K[i].size()));
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_C[i]), sizeof(data_type) * QK[i].size()));
    }

    // The pointer arrays must hold one entry per matrix, i.e. m_batch_count
    // entries (not batch_count), otherwise only the first batch_count
    // products would be computed.
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_A_array), sizeof(data_type*) * m_batch_count));
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_B_array), sizeof(data_type*) * m_batch_count));
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_C_array), sizeof(data_type*) * m_batch_count));

    // With beta == 0 the previous contents of C are not read, so QK does not
    // need to be uploaded.
    for (int i = 0; i < m_batch_count; i++) {
        CUDA_CHECK(cudaMemcpyAsync(d_A[i], Q[i].data(), sizeof(data_type) * Q[i].size(),
            cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_B[i], K[i].data(), sizeof(data_type) * K[i].size(),
            cudaMemcpyHostToDevice, stream));
    }

    CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type*) * m_batch_count,
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type*) * m_batch_count,
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type*) * m_batch_count,
        cudaMemcpyHostToDevice, stream));

    /* step 3: compute */
    CUBLAS_CHECK(cublasSgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda,
        d_B_array, ldb, &beta, d_C_array, ldc, m_batch_count));

    /* step 4: copy data to host */
    for (int i = 0; i < m_batch_count; i++) {
        CUDA_CHECK(cudaMemcpyAsync(QK[i].data(), d_C[i], sizeof(data_type) * QK[i].size(),
            cudaMemcpyDeviceToHost, stream));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* free resources */
    CUDA_CHECK(cudaFree(d_A_array));
    CUDA_CHECK(cudaFree(d_B_array));
    CUDA_CHECK(cudaFree(d_C_array));
    for (int i = 0; i < m_batch_count; i++) {
        CUDA_CHECK(cudaFree(d_A[i]));
        CUDA_CHECK(cudaFree(d_B[i]));
        CUDA_CHECK(cudaFree(d_C[i]));
    }

    CUBLAS_CHECK(cublasDestroy(cublasH));

    CUDA_CHECK(cudaStreamDestroy(stream));
}
// Applies a row-wise softmax, in place, to each of the batch_count * 8
// matrices in the global QK buffers (197 x 197, element (i, j) stored at
// index j * 197 + i).
// The row maximum is subtracted before exponentiation; this leaves the
// mathematical result unchanged but keeps exp() from overflowing to inf
// (which would turn the whole row into NaN).
void softmax() {
    const int m = 197;  // rows
    const int n = 197;  // columns
    for (int b = 0; b < batch_count * 8; b++) {
        std::vector<data_type>& mat = QK[b];
        for (int i = 0; i < m; i++) {
            // Largest entry of row i.
            data_type row_max = mat[i];
            for (int j = 1; j < n; j++) {
                row_max = std::max(row_max, mat[j * m + i]);
            }
            // Exponentiate once, storing the numerators and accumulating the
            // normalisation term.
            data_type denom = 0;
            for (int j = 0; j < n; j++) {
                const data_type e = std::exp(mat[j * m + i] - row_max);
                mat[j * m + i] = e;
                denom += e;
            }
            // denom >= 1 because the maximum element contributes exp(0).
            for (int j = 0; j < n; j++) {
                mat[j * m + i] /= denom;
            }
        }
    }
}
// Attention output: for each of the batch_count * 8 matrices computes
//   Q[h] = QK[h] * V[h]
// with a single cublasSgemmBatched call; the 197 x 96 result overwrites the
// global Q buffers. QK[h] is read as 197 x 197 and V[h] as 197 x 96, both
// with leading dimension 197.
// Errors are reported through CUDA_CHECK / CUBLAS_CHECK.
void multiplication_2() {
    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    const int m = 197;
    const int n = 96;
    const int k = 197;
    const int lda = m;
    const int ldb = k;
    const int ldc = m;
    const int m_batch_count = batch_count * 8;

    const data_type alpha = 1.0f;
    const data_type beta = 0.0f;

    data_type** d_A_array = nullptr;
    data_type** d_B_array = nullptr;
    data_type** d_C_array = nullptr;

    std::vector<data_type*> d_A(m_batch_count, nullptr);
    std::vector<data_type*> d_B(m_batch_count, nullptr);
    std::vector<data_type*> d_C(m_batch_count, nullptr);

    cublasOperation_t transa = CUBLAS_OP_N;
    cublasOperation_t transb = CUBLAS_OP_N;

    /* step 1: create cublas handle, bind a stream */
    CUBLAS_CHECK(cublasCreate(&cublasH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    /* step 2: copy data to device */
    for (int i = 0; i < m_batch_count; i++) {
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_A[i]), sizeof(data_type) * QK[i].size()));
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_B[i]), sizeof(data_type) * V[i].size()));
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_C[i]), sizeof(data_type) * Q[i].size()));
    }

    // The pointer arrays must hold one entry per matrix, i.e. m_batch_count
    // entries (not batch_count), otherwise only the first batch_count
    // products would be computed.
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_A_array), sizeof(data_type*) * m_batch_count));
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_B_array), sizeof(data_type*) * m_batch_count));
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_C_array), sizeof(data_type*) * m_batch_count));

    // With beta == 0 the previous contents of C are not read, so Q does not
    // need to be uploaded.
    for (int i = 0; i < m_batch_count; i++) {
        CUDA_CHECK(cudaMemcpyAsync(d_A[i], QK[i].data(), sizeof(data_type) * QK[i].size(),
            cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_B[i], V[i].data(), sizeof(data_type) * V[i].size(),
            cudaMemcpyHostToDevice, stream));
    }

    CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type*) * m_batch_count,
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type*) * m_batch_count,
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type*) * m_batch_count,
        cudaMemcpyHostToDevice, stream));

    /* step 3: compute */
    CUBLAS_CHECK(cublasSgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda,
        d_B_array, ldb, &beta, d_C_array, ldc, m_batch_count));

    /* step 4: copy data to host */
    for (int i = 0; i < m_batch_count; i++) {
        CUDA_CHECK(cudaMemcpyAsync(Q[i].data(), d_C[i], sizeof(data_type) * Q[i].size(),
            cudaMemcpyDeviceToHost, stream));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* free resources */
    CUDA_CHECK(cudaFree(d_A_array));
    CUDA_CHECK(cudaFree(d_B_array));
    CUDA_CHECK(cudaFree(d_C_array));
    for (int i = 0; i < m_batch_count; i++) {
        CUDA_CHECK(cudaFree(d_A[i]));
        CUDA_CHECK(cudaFree(d_B[i]));
        CUDA_CHECK(cudaFree(d_C[i]));
    }

    CUBLAS_CHECK(cublasDestroy(cublasH));

    CUDA_CHECK(cudaStreamDestroy(stream));
}
// Intended to reshape q (B x 8 x 197 x 96) into B x 197 x 768 and assign it to tensor.
// NOTE(review): the body is empty, so this is currently a no-op stub and
// tensor is not updated from Q here — TODO implement.
void Permute_2() {

}
432 | //*************************************MLP******************************************
// Second linear layer (first MLP layer): for every batch entry computes
//   b2[b] = tensor[b] * w2[b] + b2[b]
// with a single cublasSgemmBatched call; the result (B x 197 x 3072 in the
// intended use) overwrites the global b2 buffers.
//   input_dim  - inner dimension k (columns of tensor[b], rows of w2[b])
//   output_dim - number of columns n of w2[b] and b2[b]
// Matrices are passed to cuBLAS with leading dimensions m / k / m, i.e. the
// host buffers are interpreted column-major. Errors are reported through
// CUDA_CHECK / CUBLAS_CHECK.
void liner_2(int input_dim, int output_dim) {
    const int m = 197;
    const int n = output_dim;
    const int k = input_dim;
    const int lda = m;
    const int ldb = k;
    const int ldc = m;

    const data_type alpha = 1.0f;
    const data_type beta = 1.0f;  // beta = 1 adds the product onto the bias preloaded into C

    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    data_type** d_A_array = nullptr;
    data_type** d_B_array = nullptr;
    data_type** d_C_array = nullptr;

    std::vector<data_type*> d_A(batch_count, nullptr);
    std::vector<data_type*> d_B(batch_count, nullptr);
    std::vector<data_type*> d_C(batch_count, nullptr);

    cublasOperation_t transa = CUBLAS_OP_N;
    cublasOperation_t transb = CUBLAS_OP_N;

    /* step 1: create cublas handle, bind a stream */
    CUBLAS_CHECK(cublasCreate(&cublasH));
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    /* step 2: copy data to device */
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_A[i]), sizeof(data_type) * tensor[i].size()));
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_B[i]), sizeof(data_type) * w2[i].size()));
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_C[i]), sizeof(data_type) * b2[i].size()));
    }

    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_A_array), sizeof(data_type*) * batch_count));
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_B_array), sizeof(data_type*) * batch_count));
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_C_array), sizeof(data_type*) * batch_count));

    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaMemcpyAsync(d_A[i], tensor[i].data(), sizeof(data_type) * tensor[i].size(),
            cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_B[i], w2[i].data(), sizeof(data_type) * w2[i].size(),
            cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_C[i], b2[i].data(), sizeof(data_type) * b2[i].size(),
            cudaMemcpyHostToDevice, stream));
    }

    // The batched GEMM takes device-resident arrays of device pointers.
    CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type*) * batch_count,
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type*) * batch_count,
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type*) * batch_count,
        cudaMemcpyHostToDevice, stream));

    /* step 3: compute */
    CUBLAS_CHECK(cublasSgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda,
        d_B_array, ldb, &beta, d_C_array, ldc, batch_count));

    /* step 4: copy data to host */
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaMemcpyAsync(b2[i].data(), d_C[i], sizeof(data_type) * b2[i].size(),
            cudaMemcpyDeviceToHost, stream));
    }

    // Make sure all work queued on the stream has finished (and surface any
    // asynchronous error) before device buffers are released and b2 is read.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* free resources */
    CUDA_CHECK(cudaFree(d_A_array));
    CUDA_CHECK(cudaFree(d_B_array));
    CUDA_CHECK(cudaFree(d_C_array));
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaFree(d_A[i]));
        CUDA_CHECK(cudaFree(d_B[i]));
        CUDA_CHECK(cudaFree(d_C[i]));
    }

    CUBLAS_CHECK(cublasDestroy(cublasH));

    CUDA_CHECK(cudaStreamDestroy(stream));
}
// Element-wise GELU (tanh approximation), applied in place:
//   gelu(x) = 0.5 * x * (1 + tanh( sqrt(2/pi) * (x + 0.044715 * x^3) ))
//   x - device pointer to the values to transform
//   n - number of elements in x
// Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
// global index is >= n do nothing, so the grid only has to cover n.
__global__ void gelu(float* x, int n)
{
    const float kSqrt2OverPi = 0.7978845608f;  // sqrt(2 / pi)
    const float kCubicCoeff = 0.044715f;

    int ix = threadIdx.x + blockDim.x * blockIdx.x;
    if (ix < n) {
        const float v = x[ix];
        const float inner = kSqrt2OverPi * (v + kCubicCoeff * v * v * v);
        x[ix] = 0.5f * v * (1.0f + tanhf(inner));
    }
}
// Applies the gelu kernel in place to every global b2 buffer: each b2[i] is
// uploaded, transformed on the device and copied back over b2[i].
// All transfer sizes and the kernel element count are taken from b2[i]
// itself, so the whole buffer is processed and only initialised device
// memory is copied back. Errors are reported through CUDA_CHECK.
void GELU() {

    std::vector<data_type*> d_A(batch_count, nullptr);

    // NULL selects the default stream; copies and kernel launches below are
    // all issued on it and therefore execute in issue order.
    cudaStream_t stream = NULL;

    const int threads = 1024;

    for (int i = 0; i < batch_count; i++) {
        const size_t count = b2[i].size();
        if (count == 0) {
            continue;  // nothing to transform; also avoids a zero-block launch
        }
        const size_t bytes = sizeof(data_type) * count;
        const int blocks = static_cast<int>((count + threads - 1) / threads);  // ceil-div

        /* copy data to device */
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_A[i]), bytes));
        CUDA_CHECK(
            cudaMemcpyAsync(d_A[i], b2[i].data(), bytes, cudaMemcpyHostToDevice, stream));

        gelu<<<blocks, threads, 0, stream>>>(d_A[i], static_cast<int>(count));
        CUDA_CHECK(cudaGetLastError());  // catches invalid launch configurations

        CUDA_CHECK(
            cudaMemcpyAsync(b2[i].data(), d_A[i], bytes, cudaMemcpyDeviceToHost, stream));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* free resources */
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaFree(d_A[i]));  // cudaFree(nullptr) is a no-op for skipped entries
    }
}
// Third linear layer (second MLP layer): for every batch entry computes
//   b3[b] = b2[b] * w3[b] + b3[b]
// with a single cublasSgemmBatched call; the result (B x 197 x 768 in the
// intended use) overwrites the global b3 buffers.
//   input_dim  - inner dimension k (columns of b2[b], rows of w3[b])
//   output_dim - number of columns n of w3[b] and b3[b]
// Matrices are passed to cuBLAS with leading dimensions m / k / m, i.e. the
// host buffers are interpreted column-major. Errors are reported through
// CUDA_CHECK / CUBLAS_CHECK.
void liner_3(int input_dim, int output_dim) {
    const int m = 197;
    const int n = output_dim;
    const int k = input_dim;
    const int lda = m;
    const int ldb = k;
    const int ldc = m;

    const data_type alpha = 1.0f;
    const data_type beta = 1.0f;  // beta = 1 adds the product onto the bias preloaded into C

    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    data_type** d_A_array = nullptr;
    data_type** d_B_array = nullptr;
    data_type** d_C_array = nullptr;

    std::vector<data_type*> d_A(batch_count, nullptr);
    std::vector<data_type*> d_B(batch_count, nullptr);
    std::vector<data_type*> d_C(batch_count, nullptr);

    cublasOperation_t transa = CUBLAS_OP_N;
    cublasOperation_t transb = CUBLAS_OP_N;

    /* step 1: create cublas handle, bind a stream */
    CUBLAS_CHECK(cublasCreate(&cublasH));
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    /* step 2: copy data to device */
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_A[i]), sizeof(data_type) * b2[i].size()));
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_B[i]), sizeof(data_type) * w3[i].size()));
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_C[i]), sizeof(data_type) * b3[i].size()));
    }

    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_A_array), sizeof(data_type*) * batch_count));
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_B_array), sizeof(data_type*) * batch_count));
    CUDA_CHECK(
        cudaMalloc(reinterpret_cast<void**>(&d_C_array), sizeof(data_type*) * batch_count));

    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaMemcpyAsync(d_A[i], b2[i].data(), sizeof(data_type) * b2[i].size(),
            cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_B[i], w3[i].data(), sizeof(data_type) * w3[i].size(),
            cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_C[i], b3[i].data(), sizeof(data_type) * b3[i].size(),
            cudaMemcpyHostToDevice, stream));
    }

    // The batched GEMM takes device-resident arrays of device pointers.
    CUDA_CHECK(cudaMemcpyAsync(d_A_array, d_A.data(), sizeof(data_type*) * batch_count,
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_B_array, d_B.data(), sizeof(data_type*) * batch_count,
        cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_C_array, d_C.data(), sizeof(data_type*) * batch_count,
        cudaMemcpyHostToDevice, stream));

    /* step 3: compute */
    CUBLAS_CHECK(cublasSgemmBatched(cublasH, transa, transb, m, n, k, &alpha, d_A_array, lda,
        d_B_array, ldb, &beta, d_C_array, ldc, batch_count));

    /* step 4: copy data to host */
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaMemcpyAsync(b3[i].data(), d_C[i], sizeof(data_type) * b3[i].size(),
            cudaMemcpyDeviceToHost, stream));
    }

    // Make sure all work queued on the stream has finished (and surface any
    // asynchronous error) before device buffers are released and b3 is read.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* free resources */
    CUDA_CHECK(cudaFree(d_A_array));
    CUDA_CHECK(cudaFree(d_B_array));
    CUDA_CHECK(cudaFree(d_C_array));
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaFree(d_A[i]));
        CUDA_CHECK(cudaFree(d_B[i]));
        CUDA_CHECK(cudaFree(d_C[i]));
    }

    CUBLAS_CHECK(cublasDestroy(cublasH));

    CUDA_CHECK(cudaStreamDestroy(stream));
}
649 | //*******************************Transformer Encoder**************************************
650 | //对输入的tensor进行LN处理
// In-place squared deviation: x[i] <- (x[i] - avg)^2 for every i < n.
//
// Launch layout: 1-D grid of 1-D blocks. Each thread handles the single
// element at its flat global index, so gridDim.x * blockDim.x must be >= n.
//
//   x   - device buffer holding at least n floats (overwritten)
//   n   - number of elements to process
//   avg - value subtracted from every element before squaring
__global__ void ss(float* x, int n, float avg) {
    int ix = threadIdx.x + blockDim.x * blockIdx.x;
    if (ix < n) {
        // A single float multiply replaces the generic pow() call.
        const float diff = x[ix] - avg;
        x[ix] = diff * diff;
    }
}
// In-place normalisation: x[i] <- (x[i] - avg) / sqrt(S + 1e-5) for i < n.
//
// Launch layout: 1-D grid of 1-D blocks, one element per thread, so
// gridDim.x * blockDim.x must be >= n.
//
//   x   - device buffer holding at least n floats (overwritten)
//   n   - number of elements to process
//   avg - value subtracted from every element
//   S   - spread term placed under the square root together with the 1e-5
//         stabiliser; the kernel uses whatever the caller passes as-is
__global__ void ln(float* x, int n, float avg, float S) {
    int ix = threadIdx.x + blockDim.x * blockIdx.x;
    if (ix < n) {
        // sqrtf and a float literal keep the whole expression in single
        // precision instead of promoting it to double.
        x[ix] = (x[ix] - avg) / sqrtf(S + 1e-5f);
    }
}
// Layer normalisation of the file-level `tensor`, in place.
//
// For every batch entry i, all elements of tensor[i] are normalised together:
//     x <- (x - mean) / sqrt(variance + 1e-5)
// where mean and variance are taken over the whole entry (the statistics are
// NOT computed per row).
//
// Implementation notes:
//   * The mean needs a *signed* sum. cublasSasum returns the sum of absolute
//     values, so the signed sum is obtained as a dot product with a vector of
//     ones instead.
//   * The sum of squared deviations is non-negative, so cublasSasum is a valid
//     way to add it up; it is then divided by the element count to obtain the
//     variance.
//   * Element counts and launch sizes are derived from tensor[i].size().
//   * All work is issued on the legacy default stream (stream == NULL), which
//     is also the stream the cuBLAS handle uses by default.
void LayerNorm() {
    std::vector<data_type*> d_A(batch_count, nullptr);
    std::vector<data_type*> d_A_copy(batch_count, nullptr);
    cudaStream_t stream = NULL;

    cublasHandle_t handle = NULL;
    CUBLAS_CHECK(cublasCreate(&handle));

    // One device vector of ones, sized for the largest batch entry, is shared
    // by every signed-sum computation below.
    size_t max_len = 0;
    for (int i = 0; i < batch_count; i++) {
        if (tensor[i].size() > max_len)
            max_len = tensor[i].size();
    }

    data_type* d_ones = nullptr;
    if (max_len > 0) {
        std::vector<data_type> ones(max_len, 1.0f);
        CUDA_CHECK(
            cudaMalloc(reinterpret_cast<void**>(&d_ones), sizeof(data_type) * max_len));
        CUDA_CHECK(
            cudaMemcpy(d_ones, ones.data(), sizeof(data_type) * max_len, cudaMemcpyHostToDevice));
    }

    const int threads = 256;

    for (int i = 0; i < batch_count; i++) {
        const int count = static_cast<int>(tensor[i].size());
        if (count == 0)
            continue;  // nothing to normalise for an empty entry

        const size_t bytes = sizeof(data_type) * tensor[i].size();
        const int blocks = (count + threads - 1) / threads;  // ceil-div
        float sum = 0.0f;
        float S = 0.0f;

        /* copy data to device: d_A keeps the values, d_A_copy is scratch */
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_A[i]), bytes));
        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_A_copy[i]), bytes));
        CUDA_CHECK(
            cudaMemcpyAsync(d_A[i], tensor[i].data(), bytes, cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(
            cudaMemcpyAsync(d_A_copy[i], tensor[i].data(), bytes, cudaMemcpyHostToDevice, stream));

        // Signed sum -> mean.
        CUBLAS_CHECK(cublasSdot(handle, count, d_A[i], 1, d_ones, 1, &sum));
        const float avg = sum / count;

        // Squared deviations, then their sum -> variance.
        ss<<<blocks, threads, 0, stream>>>(d_A_copy[i], count, avg);
        CUDA_CHECK(cudaGetLastError());
        CUBLAS_CHECK(cublasSasum(handle, count, d_A_copy[i], 1, &S));
        const float variance = S / count;

        // Normalise and copy the result back into tensor[i].
        ln<<<blocks, threads, 0, stream>>>(d_A[i], count, avg, variance);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(
            cudaMemcpyAsync(tensor[i].data(), d_A[i], bytes, cudaMemcpyDeviceToHost, stream));
    }

    // Make sure every device-to-host copy has landed before buffers are freed
    // and before the caller reads `tensor`.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* free resources (cudaFree(nullptr) is a no-op for skipped entries) */
    for (int i = 0; i < batch_count; i++) {
        CUDA_CHECK(cudaFree(d_A[i]));
        CUDA_CHECK(cudaFree(d_A_copy[i]));
    }
    CUDA_CHECK(cudaFree(d_ones));

    CUBLAS_CHECK(cublasDestroy(handle));
}
// Multi-head attention block.
// Runs the attention pipeline stage by stage on the file-level buffers; the
// shapes quoted below are the ones given in the original author's notes.
void MultiHeadAttention() {
    const int in_features = 768;
    const int qkv_features = 2304;

    // Linear projection: b1 = tensor * w1 + b1, result b1 is B x 197 x 2304.
    liner_1(in_features, qkv_features);

    // Split b1 (B x 197 x 2304) into q, k, v (3 x B x 8 x 197 x 96).
    Permute_1();

    // Scaled scores: qk = (q @ transpose(k)) * 1/sqrt(Dk).
    multiplication_1();

    // Softmax over qk.
    softmax();

    // Weighted values: qk @ v, with the result stored in q.
    multiplication_2();

    // Reshape q (B x 8 x 197 x 96) to B x 197 x 768 and assign it to tensor.
    Permute_2();
}
// MLP block.
// Two linear layers with a GELU in between; the shapes quoted below are the
// ones given in the original author's notes.
void MLP() {
    const int model_dim = 768;
    const int hidden_dim = 3072;

    // First linear layer: b2 = tensor * w2 + b2, result b2 is B x 197 x 3072.
    liner_2(model_dim, hidden_dim);

    // GELU activation applied to b2.
    GELU();

    // Second linear layer: b3 = b2 * w3 + b3, result b3 is B x 197 x 768.
    liner_3(hidden_dim, model_dim);

    // The block's output becomes the new tensor.
    tensor = b3;
}
// Residual connection: tensor = tensor + tensor_copy, element-wise.
//
// The first 197 * 768 elements of every tensor[b] / tensor_copy[b] are added
// on the GPU with a single cublasSaxpy over one contiguous device buffer.
// Because an element-wise sum does not depend on element order, each batch
// entry is copied to and from the device as-is; no host-side re-layout or
// staging vectors are needed.
void tensor_add() {
    const int m = 197;
    const int n = 768;
    const size_t per_batch = static_cast<size_t>(m) * n;
    const size_t total = per_batch * batch_count;
    if (total == 0)
        return;  // nothing to add

    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    const data_type alpha = 1.0;
    const int incx = 1;
    const int incy = 1;

    data_type* d_A = nullptr;  // holds tensor
    data_type* d_B = nullptr;  // holds tensor_copy, receives the sum

    /* step 1: create cublas handle, bind a stream */
    CUBLAS_CHECK(cublasCreate(&cublasH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    /* step 2: copy data to device, one batch entry per slice */
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_A), sizeof(data_type) * total));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_B), sizeof(data_type) * total));

    for (int b = 0; b < batch_count; b++) {
        CUDA_CHECK(cudaMemcpyAsync(d_A + b * per_batch, tensor[b].data(),
            sizeof(data_type) * per_batch, cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_B + b * per_batch, tensor_copy[b].data(),
            sizeof(data_type) * per_batch, cudaMemcpyHostToDevice, stream));
    }

    /* step 3: compute d_B = alpha * d_A + d_B */
    CUBLAS_CHECK(cublasSaxpy(cublasH, static_cast<int>(total), &alpha, d_A, incx, d_B, incy));

    /* step 4: copy the sum back into tensor */
    for (int b = 0; b < batch_count; b++) {
        CUDA_CHECK(cudaMemcpyAsync(tensor[b].data(), d_B + b * per_batch,
            sizeof(data_type) * per_batch, cudaMemcpyDeviceToHost, stream));
    }

    // The copies above are asynchronous; wait before the host reads `tensor`
    // and before the device buffers are released.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* free resources */
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));

    CUBLAS_CHECK(cublasDestroy(cublasH));
    CUDA_CHECK(cudaStreamDestroy(stream));
}
804 | 
805 | //**********************************main()************************************************
// Entry point: initialises the global buffers, warms the GPU up, then times
// one encoder pass (LayerNorm -> attention -> residual -> LayerNorm -> MLP ->
// residual) with CUDA events and prints the elapsed time.
int main(int argc, char** argv)
{
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Initialise the input tensor.
    tensor_initial(M,N);
    // Keep a copy of the input for the residual connection.
    tensor_copy = tensor;
    // Debug aid: uncomment to dump the tensor.
    //for(int b = 0; b < batch_count; b++)for (int i = 0; i < M; i++) {for (int j = 0; j < N; j++)std::cout << tensor[b][i * N + j] << " ";std::cout << std::endl;}
    // Initialise the weights and biases of the linear layers.
    liner_initial();
    // Initialise q, k, v.
    qkv_initial(197, 96);

    /* GPU warm up */
    for(int i=0;i<100;i++)
        LayerNorm();
    // LayerNorm() rewrites `tensor` in place, so the warm-up has normalised
    // the input 100 times. Restore the pristine input before the timed run.
    tensor = tensor_copy;

    /* compute */
    CUDA_CHECK(cudaEventRecord(start, 0));

    LayerNorm();
    // Multi-head attention block.
    MultiHeadAttention();
    // Residual: tensor = tensor + tensor_copy.
    tensor_add();
    tensor_copy = tensor;

    LayerNorm();
    // MLP block.
    MLP();
    // Residual: tensor = tensor + tensor_copy.
    tensor_add();

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float elapsedTime;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Transformer Encoder time(Batch_Size=8): %.2f ms\n", elapsedTime);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    return 0;
}
850 | 
851 | 
852 | 
853 | 
854 | 
855 | 


--------------------------------------------------------------------------------
/cuda/cu_managed_Matrixmultiplication.cu:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <stdio.h>
  3 | #include <math.h>
  4 | #include <cuda_runtime.h>
  5 | #include <device_launch_parameters.h>
  6 | 	
  7 | #define BLOCK_SIZE 16
  8 | 
  9 | const int MAXSUN = 1000;
 10 | __managed__ int a[MAXSUN * MAXSUN];
 11 | __managed__ int b[MAXSUN * MAXSUN];
 12 | __managed__ int c_gpu[MAXSUN * MAXSUN];
 13 | __managed__ int c_cpu[MAXSUN * MAXSUN];
 14 | 	
 15 | __global__ void gpu_matrix_mult(int* a, int* b, int* c, int m, int n, int k)
 16 | {
 17 |     int row = blockIdx.y * blockDim.y + threadIdx.y;
 18 |     int col = blockIdx.x * blockDim.x + threadIdx.x;
 19 |     int sum = 0;
 20 |     if (col < k && row < m)
 21 |     {
 22 |       for (int i = 0; i < n; i++)
 23 |        {
 24 |             sum += a[row * n + i] * b[i * k + col];
 25 |         }
 26 |         c[row * k + col] = sum;
 27 |     }
 28 | }
 29 | 	
 30 | void cpu_matrix_mult(int* a, int* b, int* h_result, int m, int n, int k) {
 31 |     for (int i = 0; i < m; ++i)
 32 |     {
 33 |        for (int j = 0; j < k; ++j)
 34 |        {
 35 |            int tmp = 0.0;
 36 |            for (int h = 0; h < n; ++h)
 37 |            {
 38 |                tmp += a[i * n + h] * b[h * k + j];
 39 |            }
 40 |             h_result[i * k + j] = tmp;
 41 |         }
 42 |     }
 43 | }
 44 | 	
 45 | int main(int argc, char const* argv[])
 46 | {
 47 |     int m = 200;
 48 |     int n = 200;
 49 |     int k = 200;
 50 | 	
 51 |     cudaEvent_t start, stop_cpu, stop_gpu;
 52 |     cudaEventCreate(&start);
 53 |     cudaEventCreate(&stop_cpu);
 54 |     cudaEventCreate(&stop_gpu);
 55 | 
 56 |     //初始化矩阵A
 57 |     for (int i = 0; i < m; ++i) {
 58 |         for (int j = 0; j < n; ++j) {
 59 |             a[i * n + j] = 0 * rand() % 1024 + 1;
 60 |         }
 61 |     }
 62 |     //初始化矩阵B
 63 |     for (int i = 0; i < n; ++i) {
 64 |        for (int j = 0; j < k; ++j) {
 65 |             b[i * k + j] = 0 * rand() % 1024 + 1;
 66 |         }
 67 |     }
 68 | 
 69 |     cudaEventRecord(start);
 70 |     cudaEventQuery(start);
 71 | 	
 72 |     unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
 73 |     unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
 74 |     dim3 dimGrid(grid_cols, grid_rows);
 75 |     dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
 76 | 	    
 77 |     gpu_matrix_mult << <dimGrid, dimBlock >> > (a, b, c_gpu, m, n, k);
 78 | 
 79 |     cudaEventRecord(stop_gpu);
 80 |     cudaEventSynchronize(stop_gpu);
 81 | 	
 82 |     cpu_matrix_mult(a, b, c_cpu, m, n, k);
 83 |     cudaEventRecord(stop_cpu);
 84 |     cudaEventSynchronize(stop_cpu);
 85 |     float elapsed_time_cpu, elapsed_time_gpu;
 86 |     cudaEventElapsedTime(&elapsed_time_gpu, start, stop_gpu);
 87 |     cudaEventElapsedTime(&elapsed_time_cpu, stop_gpu, stop_cpu);
 88 |     printf("GPU Time = %g ms.\n", elapsed_time_gpu);
 89 |     printf("CPU Time = %g ms.\n", elapsed_time_cpu);
 90 | 
 91 |     cudaEventDestroy(start);
 92 |     cudaEventDestroy(stop_cpu);
 93 |     cudaEventDestroy(stop_gpu);
 94 | 
 95 | 	
 96 | 	
 97 |     int ok = 1;
 98 |     for (int i = 0; i < m; ++i)
 99 |     {
100 |         for (int j = 0; j < k; ++j)
101 |         {
102 |         //检验GPU运算结果和CPU运算结果是否相等
103 |            if (fabs(c_gpu[i * k + j] - c_cpu[i * k + j]) > (1.0e-10))
104 |          {
105 | 
106 |               ok = 0;
107 |          }
108 |          //printf("\n");
109 |        }
110 |     }
111 | 
112 |     if (ok)
113 |     {
114 |         printf("Pass!!!\n");
115 |     }
116 |     else
117 |     {
118 |         printf("Error!!!\n");
119 |     }
120 | 
121 |     return 0;
122 | }
123 | 


--------------------------------------------------------------------------------
/cuda/cu_vectorAdd.cu:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | /**
 13 |  * Vector addition: C = A + B.
 14 |  *
 15 |  * This sample is a very basic sample that implements element by element
 16 |  * vector addition. It is the same as the sample illustrating Chapter 2
 17 |  * of the programming guide with some additions like error checking.
 18 |  */
 19 | 
 20 | #include <stdio.h>
 21 | 
 22 | // For the CUDA runtime routines (prefixed with "cuda_")
 23 | #include <cuda_runtime.h>
 24 | 
 25 | /**
 26 |  * CUDA Kernel Device code
 27 |  *
 28 |  * Computes the vector addition of A and B into C. The 3 vectors have the same
 29 |  * number of elements numElements.
 30 |  */
 31 | __global__ void
 32 | vectorAdd(const float *A, const float *B, float *C, int numElements)
 33 | {
 34 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
 35 | 
 36 |     if (i < numElements)
 37 |     {
 38 |         C[i] = A[i] + B[i];
 39 |     }
 40 | }
 41 | 
 42 | /**
 43 |  * Host main routine
 44 |  */
 45 | int
 46 | main(void)
 47 | {
 48 |     // Error code to check return values for CUDA calls
 49 |     cudaError_t err = cudaSuccess;
 50 | 
 51 |     // Print the vector length to be used, and compute its size
 52 |     int numElements = 50000;
 53 |     size_t size = numElements * sizeof(float);
 54 |     printf("[Vector addition of %d elements]\n", numElements);
 55 | 
 56 |     // Allocate the host input vector A
 57 |     float *h_A = (float *)malloc(size);
 58 | 
 59 |     // Allocate the host input vector B
 60 |     float *h_B = (float *)malloc(size);
 61 | 
 62 |     // Allocate the host output vector C
 63 |     float *h_C = (float *)malloc(size);
 64 | 
 65 |     // Verify that allocations succeeded
 66 |     if (h_A == NULL || h_B == NULL || h_C == NULL)
 67 |     {
 68 |         fprintf(stderr, "Failed to allocate host vectors!\n");
 69 |         exit(EXIT_FAILURE);
 70 |     }
 71 | 
 72 |     // Initialize the host input vectors
 73 |     for (int i = 0; i < numElements; ++i)
 74 |     {
 75 |         h_A[i] = rand()/(float)RAND_MAX;
 76 |         h_B[i] = rand()/(float)RAND_MAX;
 77 |     }
 78 | 
 79 |     // Allocate the device input vector A
 80 |     float *d_A = NULL;
 81 |     err = cudaMalloc((void **)&d_A, size);
 82 | 
 83 |     if (err != cudaSuccess)
 84 |     {
 85 |         fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
 86 |         exit(EXIT_FAILURE);
 87 |     }
 88 | 
 89 |     // Allocate the device input vector B
 90 |     float *d_B = NULL;
 91 |     err = cudaMalloc((void **)&d_B, size);
 92 | 
 93 |     if (err != cudaSuccess)
 94 |     {
 95 |         fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
 96 |         exit(EXIT_FAILURE);
 97 |     }
 98 | 
 99 |     // Allocate the device output vector C
100 |     float *d_C = NULL;
101 |     err = cudaMalloc((void **)&d_C, size);
102 | 
103 |     if (err != cudaSuccess)
104 |     {
105 |         fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
106 |         exit(EXIT_FAILURE);
107 |     }
108 | 
109 |     // Copy the host input vectors A and B in host memory to the device input vectors in
110 |     // device memory
111 |     printf("Copy input data from the host memory to the CUDA device\n");
112 |     err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
113 | 
114 |     if (err != cudaSuccess)
115 |     {
116 |         fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
117 |         exit(EXIT_FAILURE);
118 |     }
119 | 
120 |     err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
121 | 
122 |     if (err != cudaSuccess)
123 |     {
124 |         fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
125 |         exit(EXIT_FAILURE);
126 |     }
127 | 
128 |     // Launch the Vector Add CUDA Kernel
129 |     int threadsPerBlock = 256;
130 |     int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
131 |     printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
132 |     vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
133 |     err = cudaGetLastError();
134 | 
135 |     if (err != cudaSuccess)
136 |     {
137 |         fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
138 |         exit(EXIT_FAILURE);
139 |     }
140 | 
141 |     // Copy the device result vector in device memory to the host result vector
142 |     // in host memory.
143 |     printf("Copy output data from the CUDA device to the host memory\n");
144 |     err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
145 | 
146 |     if (err != cudaSuccess)
147 |     {
148 |         fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
149 |         exit(EXIT_FAILURE);
150 |     }
151 | 
152 |     // Verify that the result vector is correct
153 |     for (int i = 0; i < numElements; ++i)
154 |     {
155 |         if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
156 |         {
157 |             fprintf(stderr, "Result verification failed at element %d!\n", i);
158 |             exit(EXIT_FAILURE);
159 |         }
160 |     }
161 | 
162 |     printf("Test PASSED\n");
163 | 
164 |     // Free device global memory
165 |     err = cudaFree(d_A);
166 | 
167 |     if (err != cudaSuccess)
168 |     {
169 |         fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
170 |         exit(EXIT_FAILURE);
171 |     }
172 | 
173 |     err = cudaFree(d_B);
174 | 
175 |     if (err != cudaSuccess)
176 |     {
177 |         fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
178 |         exit(EXIT_FAILURE);
179 |     }
180 | 
181 |     err = cudaFree(d_C);
182 | 
183 |     if (err != cudaSuccess)
184 |     {
185 |         fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
186 |         exit(EXIT_FAILURE);
187 |     }
188 | 
189 |     // Free host memory
190 |     free(h_A);
191 |     free(h_B);
192 |     free(h_C);
193 | 
194 |     // Reset the device and exit
195 |     // cudaDeviceReset causes the driver to clean up all state. While
196 |     // not mandatory in normal operation, it is good practice.  It is also
197 |     // needed to ensure correct operation when the application is being
198 |     // profiled. Calling cudaDeviceReset causes all profile data to be
199 |     // flushed before the application exits
200 |     err = cudaDeviceReset();
201 | 
202 |     if (err != cudaSuccess)
203 |     {
204 |         fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
205 |         exit(EXIT_FAILURE);
206 |     }
207 | 
208 |     printf("Done\n");
209 |     return 0;
210 | }
211 | 
212 | 


--------------------------------------------------------------------------------
/cuda/cuda.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Builds the CUDA samples that live next to this script with nvcc.

nvcc cu_vectorAdd.cu -o cu_vectorAdd

nvcc cu_managed_Matrixmultiplication.cu -o cu_managed_Matrixmultiplication

# The source file is named "Transformer Encoder.cu" (with a space), so the
# name must be quoted. The program uses cuBLAS, hence -lcublas.
nvcc "Transformer Encoder.cu" -o Transformer_Encoder -lcublas
8 | 


--------------------------------------------------------------------------------
/dpcpp/dpcpp_sobel.cpp:
--------------------------------------------------------------------------------
  1 | //==============================================================
  2 | // Copyright © 2019 Intel Corporation
  3 | //
  4 | // SPDX-License-Identifier: MIT
  5 | // =============================================================
  6 | #include <chrono>
  7 | #include <cmath>
  8 | #include <iostream>
  9 | #include "CL/sycl.hpp"
 10 | #include "device_selector.hpp"
 11 | 
 12 | // dpc_common.hpp can be found in the dev-utilities include folder.
 13 | // e.g., $ONEAPI_ROOT/dev-utilities/<version>/include/dpc_common.hpp
 14 | #include "dpc_common.hpp"
 15 | 
 16 | // stb/*.h files can be found in the dev-utilities include folder.
 17 | // e.g., $ONEAPI_ROOT/dev-utilities/<version>/include/stb/*.h
 18 | #define STB_IMAGE_IMPLEMENTATION
 19 | #include "stb/stb_image.h"
 20 | #define STB_IMAGE_WRITE_IMPLEMENTATION
 21 | #include "stb/stb_image_write.h"
 22 | 
 23 | using namespace std;
 24 | using namespace sycl;
 25 | using namespace chrono;
 26 | 
 27 | // Few useful acronyms.
 28 | constexpr auto sycl_read = access::mode::read;
 29 | constexpr auto sycl_write = access::mode::write;
 30 | constexpr auto sycl_global_buffer = access::target::global_buffer;
 31 | 
 32 | static void ReportTime(const string &msg, event e) {
 33 |   cl_ulong time_start =
 34 |       e.get_profiling_info<info::event_profiling::command_start>();
 35 | 
 36 |   cl_ulong time_end =
 37 |       e.get_profiling_info<info::event_profiling::command_end>();
 38 | 
 39 |   double elapsed = (time_end - time_start) / 1e6;
 40 |   cout << msg << elapsed << " milliseconds\n";
 41 | }
 42 | 
 43 | // SYCL does not need any special mark-up for functions which are called from
 44 | // SYCL kernel and defined in the same compilation unit. SYCL compiler must be
 45 | // able to find the full call graph automatically.
 46 | // always_inline as calls are expensive on Gen GPU.
 47 | // Notes:
 48 | // - coeffs can be declared outside of the function, but still must be constant
 49 | // - SYCL compiler will automatically deduce the address space for the two
 50 | //   pointers; sycl::multi_ptr specialization for particular address space
 51 | //   can used for more control
 52 | __attribute__((always_inline)) static void ApplyFilter(uint8_t *src_image,
 53 |                                                        uint8_t *dst_image,
 54 |                                                        int i) {
 55 |   i *= 3;
 56 |   float temp;
 57 |   temp = (0.393f * src_image[i]) + (0.769f * src_image[i + 1]) +
 58 |          (0.189f * src_image[i + 2]);
 59 |   dst_image[i] = temp > 255 ? 255 : temp;
 60 |   temp = (0.349f * src_image[i]) + (0.686f * src_image[i + 1]) +
 61 |          (0.168f * src_image[i + 2]);
 62 |   dst_image[i + 1] = temp > 255 ? 255 : temp;
 63 |   temp = (0.272f * src_image[i]) + (0.534f * src_image[i + 1]) +
 64 |          (0.131f * src_image[i + 2]);
 65 |   dst_image[i + 2] = temp > 255 ? 255 : temp;
 66 | }
 67 | 
 68 | //sobel filter kernel
 69 | __attribute__((always_inline)) static void ApplySFilter(uint8_t *src_image,
 70 |                                                         uint8_t *dst_image,
 71 |                                                         int w,
 72 |                                                         int h) {
 73 |   cout<<"start filter;\n";
 74 |   int Gx = 0;
 75 |   int Gy = 0;
 76 |   float temp;
 77 |   for (int i=1;i<(h-1);i++)
 78 |   {
 79 |     for (int j=1;j<(w-1);j++)
 80 |     {  
 81 |       Gy = src_image[(i+1)*w+(j-1)]*1+src_image[(i+1)*w+(j)]*2+src_image[(i+1)*w+(j+1)]*1-(src_image[(i-1)*w+(j-1)]*1+src_image[(i-1)*w+(j)]*2+src_image[(i-1)*w+(j+1)]*1);
 82 |       Gx = src_image[(i-1)*w+(j+1)]*1+src_image[(i)*w+(j+1)]*2+src_image[(i+1)*w+(j+1)]*1-(src_image[(i-1)*w+(j-1)]*1+src_image[(i)*w+(j-1)]*2+src_image[(i+1)*w+(j-1)]*1);
 83 |       temp = (abs(Gx)+abs(Gy))/2.0f;
 84 |       dst_image[i*w+j] = temp>200?255:temp;
 85 |     }
 86 |   }
 87 |     
 88 | }
 89 | //并行 ss
 90 | __attribute__((always_inline)) static void ApplySSFilter(uint8_t *src_image,
 91 |                                                          uint8_t *dst_image,
 92 |                                                          int i,
 93 |                                                          int w,
 94 |                                                          int h
 95 |                                                          ) {
 96 | 
 97 |   int Gx = 0;
 98 |   int Gy = 0;
 99 |   float temp;
100 |   if (i>w)
101 |   {
102 |       Gy = src_image[i+w-1]*1+src_image[i+w]*2+src_image[i+w+1]*1-(src_image[i-w-1]*1+src_image[i-w]*2+src_image[i-w+1]*1);
103 |       Gx = src_image[i-w+1]*1+src_image[i+1]*2+src_image[i+w+1]*1-(src_image[i-w-1]*1+src_image[i-1]*2+src_image[i+w-1]*1);
104 |       temp = (abs(Gx)+abs(Gy))/2.0f;
105 |       dst_image[i] = temp>200?255:temp;
106 |   }
107 |     
108 | }
109 | 
110 | 
111 | int main(int argc, char **argv) {
112 |     
113 |   // loading the input image
114 |   int img_width, img_height, channels;
115 |   uint8_t *image = stbi_load("1.jpg", &img_width, &img_height, &channels, 0);
116 |   if (image == NULL) {
117 |     cout << "Error in loading the image\n";
118 |     exit(1);
119 |   }
120 |   cout << "Loaded image with a width of " << img_width << ", a height of "
121 |        << img_height << " and " << channels << " channels"<<"\n";
122 | 
123 |   //像素个数，图像尺寸
124 |   size_t num_pixels = img_width * img_height;
125 |   // size_t img_size = img_width * img_height * channels;
126 |   size_t img_size = img_width * img_height;
127 | 
128 |   // allocating memory for output images
129 |   uint8_t *image_gray = new uint8_t[img_size];
130 |   uint8_t *image_ref = new uint8_t[img_size];
131 |   uint8_t *image_exp1 = new uint8_t[img_size];
132 | 
133 |   memset(image_gray, 0, img_size * sizeof(uint8_t));
134 |   memset(image_ref, 0, img_size * sizeof(uint8_t));
135 |   memset(image_exp1, 0, img_size * sizeof(uint8_t));
136 |   
137 |   //gray灰度化
138 |   for (int p=0;p<img_width*img_height;p++)
139 |   { 
140 |       image_gray[p] = (image[3*p]+image[3*p+1]+image[3*p+2])/3.0f;
141 |   }
142 | 
143 |   // Create a device selector which rates available devices in the preferred
144 |   // order for the runtime to select the highest rated device
145 |   // Note: This is only to illustrate the usage of a custom device selector.
146 |   // default_selector can be used if no customization is required.
147 |   MyDeviceSelector sel;
148 | 
149 |   // Using these events to time command group execution
150 |   event e1;
151 | 
152 |   // Wrap main SYCL API calls into a try/catch to diagnose potential errors
153 |   try {
154 |     // Create a command queue using the device selector and request profiling
155 |     auto prop_list = property_list{property::queue::enable_profiling()};
156 |     queue q(sel, dpc_common::exception_handler, prop_list);
157 | 
158 |     // See what device was actually selected for this queue.
159 |     cout << "Running on " << q.get_device().get_info<info::device::name>()
160 |         << "\n";
161 | 
162 |     // Create SYCL buffer representing source data .
163 |     // By default, this buffers will be created with global_buffer access
164 |     // target, which means the buffer "projection" to the device (actual
165 |     // device memory chunk allocated or mapped on the device to reflect
166 |     // buffer's data) will belong to the SYCL global address space - this
167 |     // is what host data usually maps to. Other address spaces are:
168 |     // private, local and constant.
169 |     // Notes:
170 |     // - access type (read/write) is not specified when creating a buffer -
171 |     //   this is done when actual accessor is created
172 |     // - there can be multiple accessors to the same buffer in multiple command
173 |     //   groups
174 |     // - 'image' pointer was passed to the constructor, so this host memory
175 |     //   will be used for "host projection", no allocation will happen on host
176 |     buffer image_buf(image_gray, range(img_size));
177 | 
178 |     // This is the output buffer device writes to
179 |     buffer image_buf_exp1(image_exp1, range(img_size));
180 |     cout << "Submitting lambda kernel...\n";
181 | 
182 |     // Submit a command group for execution. Returns immediately, not waiting
183 |     // for command group completion.
184 |     e1 = q.submit([&](auto &h) {
185 |       // This lambda defines a "command group" - a set of commands for the
186 |       // device sharing some state and executed in-order - i.e. creation of
187 |       // accessors may lead to on-device memory allocation, only after that
188 |       // the kernel will be enqueued.
189 |       // A command group can contain at most one parallel_for, single_task or
190 |       // parallel_for_workgroup construct.
191 |       accessor image_acc(image_buf, h, read_only);
192 |       accessor image_exp_acc(image_buf_exp1, h, write_only);
193 | 
194 |       // This is the simplest form cl::sycl::handler::parallel_for -
195 |       // - it specifies "flat" 1D ND range(num_pixels), runtime will select
196 |       //   local size
197 |       // - kernel lambda accepts single cl::sycl::id argument, which has very
198 |       //   limited API; see the spec for more complex forms
199 |       // the lambda parameter of the parallel_for is the kernel, which
200 |       // actually executes on device
201 | 
202 | 
203 |       h.parallel_for(range<1>(num_pixels), [=](auto i) {
204 |         ApplySSFilter(image_acc.get_pointer(), image_exp_acc.get_pointer(), i,img_width,img_height);
205 |       });
206 |     });
207 |     q.wait_and_throw();
208 | 
209 |   }catch (sycl::exception e) {
210 |     // This catches only synchronous exceptions that happened in current thread
211 |     // during execution. The asynchronous exceptions caused by execution of the
212 |     // command group are caught by the asynchronous exception handler
213 |     // registered. Synchronous exceptions are usually those which are thrown
214 |     // from the SYCL runtime code, such as on invalid constructor arguments. An
215 |     // example of asynchronous exceptions is error occurred during execution of
216 |     // a kernel. Make sure sycl::exception is caught, not std::exception.
217 |     cout << "SYCL exception caught: " << e.what() << "\n";
218 |     return 1;
219 |   }
220 | 
221 |   // report execution times:
222 |   ReportTime("Lambda kernel time: ", e1);
223 | 
224 |   // get reference result
225 |   // 计时开始
226 |   auto start = system_clock::now();
227 |   ApplySFilter(image_gray, image_ref, img_width, img_height);
228 |   auto end = system_clock::now();
229 |   auto duration = duration_cast<milliseconds>(end - start);
230 |   cout << "Serial time: " << double(duration.count()) << " milliseconds\n";
231 | 
232 |   stbi_write_png("sobel.png", img_width, img_height, 1, image_ref,
233 |                   img_width*1);
234 |   stbi_write_png("sobel_lambda.png", img_width, img_height, 1,
235 |                  image_exp1, img_width * 1);
236 | 
237 |   stbi_image_free(image);
238 |   delete[] image_ref;
239 | 
240 |   cout << "Successfully applied to image! \n";
241 |   return 0;
242 | }
243 | 


--------------------------------------------------------------------------------
/dpcpp/dpcpp_templatematching.cpp:
--------------------------------------------------------------------------------
// %%writefile lab/gpu_sample.cpp   (Jupyter cell magic from the original notebook; kept as a comment so this file is valid C++)
  2 | #include <chrono>
  3 | #include <cmath>
  4 | #include <iostream>
  5 | #include "CL/sycl.hpp"
  6 | //#include "device_selector.hpp"
  7 | // dpc_common.hpp can be found in the dev-utilities include folder.
  8 | // e.g., $ONEAPI_ROOT/dev-utilities/<version>/include/dpc_common.hpp
  9 | #include "dpc_common.hpp"
 10 | 
 11 | // stb/*.h files can be found in the dev-utilities include folder.
 12 | // e.g., $ONEAPI_ROOT/dev-utilities/<version>/include/stb/*.h
 13 | #define STB_IMAGE_IMPLEMENTATION
 14 | #include "stb/stb_image.h"
 15 | #define STB_IMAGE_WRITE_IMPLEMENTATION
 16 | #include "stb/stb_image_write.h"
 17 | 
 18 | using namespace std;
 19 | using namespace sycl;
 20 | 
 21 | static void ReportTime(const string &msg, event e) {
 22 |   // Start/end time stamps of the command, as reported by event profiling.
 23 |   const cl_ulong begin_ns =
 24 |       e.get_profiling_info<info::event_profiling::command_start>();
 25 |   const cl_ulong finish_ns =
 26 |       e.get_profiling_info<info::event_profiling::command_end>();
 27 |   // The difference is divided by 1e6 so that it is printed in milliseconds.
 28 |   const double elapsed_ms = (finish_ns - begin_ns) / 1e6;
 29 |   cout << msg << elapsed_ms << " milliseconds\n";
 30 | }
 31 | 
 32 | 
 33 | // Device-side helper called from the SYCL kernel. Because it is defined in the
 34 | // same compilation unit it needs no special mark-up.
 35 | // always_inline as calls are expensive on Gen GPU.
 36 | //
 37 | // Computes the sum of squared differences (SSD) between the template T
 38 | // (Tw x Th) and the window of the image I (Iw x Ih) whose top-left pixel is at
 39 | // row i, column j, and stores it in result[i * Iw + j]. Both images are indexed
 40 | // row-major with one byte per pixel. When the template would not fit entirely
 41 | // inside the image at (i, j), nothing is written.
 42 | __attribute__((always_inline)) static void ApplyFilter(uint8_t *I,
 43 |                                                        uint8_t *T,
 44 |                                                        float *result,
 45 |                                                        int i,
 46 |                                                        int j,
 47 |                                                        int Iw,
 48 |                                                        int Ih, 
 49 |                                                        int Tw,
 50 |                                                        int Th)  {
 51 |   // Only top-left corners with a fully contained window are scored.
 52 |   if (!(i < Ih - Th + 1 && j < Iw - Tw + 1)) return;
 53 |   float ssd = 0.0f;
 54 |   for (int row = 0; row < Th; ++row) {
 55 |     const uint8_t *image_row = I + (i + row) * Iw + j;
 56 |     const uint8_t *templ_row = T + row * Tw;
 57 |     for (int col = 0; col < Tw; ++col) {
 58 |       const float delta = image_row[col] - templ_row[col];
 59 |       ssd += delta * delta;
 60 |     }
 61 |   }
 62 |   result[i * Iw + j] = ssd;
 63 | }
 64 | 
 65 | 
 66 | int main(int argc, char **argv) {
 67 |   // Load the source image. Requesting 1 channel makes stb_image return 8-bit
 68 |   // grayscale pixels; src_channels still reports the channel count of the file.
 69 |   int src_img_width, src_img_height, src_channels;
 70 |   uint8_t *src_image = stbi_load("./tmp_src_img.jpg", &src_img_width,
 71 |                                  &src_img_height, &src_channels, 1);
 72 |   if (src_image == NULL) {
 73 |     cout << "Error in loading the image\n";
 74 |     exit(1);
 75 |   }
 76 |   cout << "Loaded src image with a width of " << src_img_width << ", a height of "
 77 |        << src_img_height << " and " << src_channels << " channels\n";
 78 |   
 79 |   // Load the template image, converted to grayscale the same way.
 80 |   int template_img_width, template_img_height, template_channels;
 81 |   uint8_t *template_image = stbi_load("./tmp_template_img.jpg", &template_img_width,
 82 |                                       &template_img_height, &template_channels, 1);
 83 |   if (template_image == NULL) {
 84 |     cout << "Error in loading the image\n";
 85 |     exit(1);
 86 |   }
 87 |   cout << "Loaded template image with a width of " << template_img_width << ", a height of "
 88 |        << template_img_height << " and " << template_channels << " channels\n";
 89 | 
 90 |   if (src_img_width < template_img_width || src_img_height < template_img_height) {
 91 |     cout << "Error: The template is larger than the picture\n";
 92 |     exit(1);
 93 |   }
 94 |   
 95 |   // Element counts, computed in size_t so large images cannot overflow int.
 96 |   // One score slot is reserved per source pixel (index = row * width + column).
 97 |   size_t num_counts = (size_t)src_img_height * src_img_width;
 98 |   size_t src_size = num_counts;
 99 |   size_t template_size = (size_t)template_img_width * template_img_height;
100 |   // Host memory for the scores. It is left uninitialised: only the positions
101 |   // where the template fits are written by the kernel and read back below.
102 |   float *result = new float[num_counts];
103 | 
104 |   // Event used to time the command group execution
105 |   event e1;
106 |   
107 |   // Wrap main SYCL API calls into a try/catch to diagnose potential errors.
108 |   // The buffers are scoped to the try block: when they are destroyed at its
109 |   // end, the device results are copied back into `result`.
110 |   try {
111 |     // Create a command queue on the default device and request profiling
112 |     auto prop_list = property_list{property::queue::enable_profiling()};
113 |     queue q(default_selector{}, dpc_common::exception_handler, prop_list);
114 | 
115 |     // See what device was actually selected for this queue.
116 |     cout << "Running on " << q.get_device().get_info<info::device::name>()
117 |          << "\n";
118 | 
119 |     // Source image buffer (kernel input)
120 |     buffer src_image_buf(src_image, range(src_size));
121 |     // Template image buffer (kernel input)
122 |     buffer template_image_buf(template_image, range(template_size));
123 |     // Result buffer: this is the output buffer the device writes to
124 |     buffer result_buf(result, range(num_counts));
125 |     cout << "Submitting lambda kernel...\n";
126 | 
127 |     // Submit a command group for execution. Returns immediately, not waiting
128 |     // for command group completion.
129 |     e1 = q.submit([&](auto &h) {
130 |       accessor src_image_acc(src_image_buf, h, read_only);
131 |       accessor template_image_acc(template_image_buf, h, read_only);
132 |       accessor result_acc(result_buf, h, write_only);
133 |       // Two-dimensional range: one work-item per source pixel (row, column)
134 |       h.parallel_for(range<2>{(size_t)src_img_height, (size_t)src_img_width}, [=](id<2> index) {
135 |         ApplyFilter(src_image_acc.get_pointer(), template_image_acc.get_pointer(), result_acc.get_pointer(), index[0], index[1], src_img_width, src_img_height, template_img_width, template_img_height);
136 |       });
137 |     });
138 |     q.wait_and_throw();  
139 |   } catch (const sycl::exception &e) {
140 |     cout << "SYCL exception caught: " << e.what() << "\n";
141 |     return 1;
142 |   }
143 | 
144 |   // report execution times:
145 |   ReportTime("Lambda kernel time: ", e1);
146 |     
147 |   // Find the position with the smallest score, i.e. the best match. x and y
148 |   // start at (0, 0) so they agree with the initial minimum result[0] when no
149 |   // other position scores lower (they used to be left uninitialised then).
150 |   int x = 0, y = 0;
151 |   float minresult = result[0];
152 |   for (int i = 0; i < src_img_height - template_img_height + 1; i++) {
153 |     for (int j = 0; j < src_img_width - template_img_width + 1; j++) {
154 |         if (minresult > result[i * src_img_width + j]) {
155 |             y = i;
156 |             x = j;
157 |             minresult = result[i * src_img_width + j];
158 |         }
159 |     }
160 |   }
161 |   
162 |   // Corners of the matched rectangle (inclusive coordinates)
163 |   int x1 = x;
164 |   int x2 = x + template_img_width - 1;
165 |   int y1 = y;
166 |   int y2 = y + template_img_height - 1;
167 |     
168 |   cout << x1 << "  " << x2 << "  " << y1 << "  " << y2 << "  ";
169 |   
170 |   // Draw the rectangle in black on the source image:
171 |   // first the two horizontal edges...
172 |   for (int i = x1; i <= x2; i++) {
173 |       src_image[y1 * src_img_width + i] = 0;
174 |       src_image[y2 * src_img_width + i] = 0;
175 |   }
176 |   // ...then the two vertical edges
177 |   for (int i = y1 + 1; i < y2; i++) {
178 |       src_image[i * src_img_width + x1] = 0;
179 |       src_image[i * src_img_width + x2] = 0;
180 |   }
181 |     
182 |   // src_image holds exactly one channel (forced at load time), so the PNG must
183 |   // be written with 1 channel. src_channels is the channel count of the input
184 |   // file and would make the writer read past the end of the buffer.
185 |   stbi_write_png("sepia_ref.png", src_img_width, src_img_height, 1, src_image, src_img_width);
186 | 
187 |   // Release the images and the score array
188 |   stbi_image_free(src_image);
189 |   stbi_image_free(template_image);
190 |   delete[] result;
191 | 
192 |   return 0;
193 | }
194 | 


--------------------------------------------------------------------------------
/dpcpp/dpcpp_vectorAdd.cpp:
--------------------------------------------------------------------------------
 1 | #include <CL/sycl.hpp>
 2 | using namespace sycl;
 3 | static const size_t numElements = 50000;
 4 | void work(queue &q) {
 5 |   std::cout << "Device : "
 6 |             << q.get_device().get_info<info::device::name>()
 7 |             << std::endl;
 8 |   float vector1[numElements] , vector2[numElements] , vector3[numElements];
 9 |   auto R = range(numElements);
10 |    for (int i = 0; i < (int)numElements; ++i) {
11 |         vector1[i] = rand()/(float)RAND_MAX;
12 |         vector2[i] = rand()/(float)RAND_MAX;
13 |     }
14 | //2. Create the SYCL buffers over vector1, vector2 and vector3
15 |   buffer vector1_buffer(vector1,R);
16 |   buffer vector2_buffer(vector2,R);
17 |   buffer vector3_buffer(vector3,R);
18 | //3. Submit work to the device (the accessors declare how each buffer is used)
19 |   q.submit([&](handler &h) {
20 |     accessor v1_accessor (vector1_buffer,h,read_only);
21 |     accessor v2_accessor (vector2_buffer,h,read_only);
22 |     accessor v3_accessor (vector3_buffer,h);
23 |    //4. Run the kernel on the device, one work-item per element
24 |         h.parallel_for (range<1>(numElements), [=](id<1> index) {
25 |       if (index < numElements)
26 |         v3_accessor [index] = v1_accessor [index] + v2_accessor [index];
27 |     });
28 |   }).wait(); // block until the kernel has finished
29 |    // 5. Verify EVERY element, reading through host accessors (not the raw arrays the buffers own)
30 |     host_accessor h_a(vector1_buffer,read_only);
31 |     host_accessor h_b(vector2_buffer,read_only);
32 |     host_accessor h_c(vector3_buffer,read_only);
33 |     for (int i = 0; i < (int)numElements; ++i) {
34 |      if (fabs(h_a[i] + h_b[i] - h_c[i]) > 1e-8 ) {
35 |             fprintf(stderr, "Result verification failed at element %d!\n", i);
36 |             exit(EXIT_FAILURE);
37 |         }
38 |     }
39 | }
40 | int main() {  
41 |   try {
42 |     queue q;  // default-constructed queue: the runtime selects the device
43 |     work(q);
44 |   } catch (const exception &e) {  // sycl::exception, caught by reference to avoid a sliced copy
45 |     std::cerr << "Exception: " << e.what() << std::endl;
46 |     std::terminate();
47 |   } catch (...) {
48 |     std::cerr << "Unknown exception" << std::endl;
49 |     std::terminate();
50 |   }
51 | }
52 | 


--------------------------------------------------------------------------------
/hip/hip_vectorAdd.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <hip/hip_runtime.h>
 4 | #include <hip/hip_runtime_api.h>
 5 | 
 6 | __global__ void vectorAdd(float *d_A,float  *d_B,float  *d_C,int numElements)
 7 |  {
 8 |      // Flat global thread index; each thread adds at most one pair of elements.
 9 |      const int idx = blockIdx.x * blockDim.x + threadIdx.x;
10 |      if (idx >= numElements)   // threads past the end of the vectors do nothing
11 |          return;
12 |      d_C[idx] = d_A[idx] + d_B[idx];
13 |  }
14 | 
15 |  int main(int argc,char **argv)
16 | {
17 |     // Abort with a readable message as soon as a HIP runtime call fails.
18 |     auto check = [](hipError_t err, const char *what) {
19 |         if (err != hipSuccess) {
20 |             fprintf(stderr, "%s failed: %s\n", what, hipGetErrorString(err));
21 |             exit(EXIT_FAILURE);
22 |         }
23 |     };
24 |     int numElements = 50000;
25 |     size_t size = numElements * sizeof(float);
26 |     printf("[Vector addition of %d elements]\n", numElements);
27 | 
28 | //1. Allocate and initialise the host vectors
29 |     float *h_A = (float *)malloc(size);
30 |     float *h_B = (float *)malloc(size);
31 |     float *h_C = (float *)malloc(size);
32 |     if (h_A == NULL || h_B == NULL || h_C == NULL) {
33 |         fprintf(stderr, "Failed to allocate host vectors!\n");
34 |         exit(EXIT_FAILURE);
35 |     }
36 |     for (int i = 0; i < numElements; ++i) {
37 |         h_A[i] = rand()/(float)RAND_MAX;
38 |         h_B[i] = rand()/(float)RAND_MAX;
39 |     }
40 | 
41 | //2. Allocate the device vectors and copy both inputs from host to device
42 |     float *d_A = NULL, *d_B = NULL, *d_C = NULL;
43 |     check(hipMalloc((void **)&d_A, size), "hipMalloc(d_A)");
44 |     check(hipMalloc((void **)&d_B, size), "hipMalloc(d_B)");
45 |     check(hipMalloc((void **)&d_C, size), "hipMalloc(d_C)");
46 |     check(hipMemcpy(d_A,h_A,size,hipMemcpyHostToDevice), "hipMemcpy(d_A)");
47 |     check(hipMemcpy(d_B,h_B,size,hipMemcpyHostToDevice), "hipMemcpy(d_B)");
48 | 
49 | //3. Launch the HIP kernel: 256 threads per block, grid size rounded up
50 |     int threadsPerBlock = 256;
51 |     int blocksPerGrid =(numElements+ threadsPerBlock - 1) / threadsPerBlock;
52 |     hipLaunchKernelGGL(vectorAdd,blocksPerGrid, threadsPerBlock,0,0,d_A,d_B,d_C,numElements);
53 |     check(hipGetLastError(), "vectorAdd launch");  // a launch returns no status itself
54 |     printf("HIP kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
55 | 
56 | //4. Copy the sum of the two vectors back from the device to the host
57 |     check(hipMemcpy(h_C,d_C,size,hipMemcpyDeviceToHost), "hipMemcpy(h_C)");
58 |     // Compare the GPU result with the same addition done on the CPU
59 |     for (int i = 0; i < numElements; ++i) {
60 |         if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-8) {
61 |             fprintf(stderr, "Result verification failed at element %d!\n", i);
62 |             exit(EXIT_FAILURE);
63 |         }
64 |     }
65 | 
66 | //5. Release device and host memory
67 |     check(hipFree(d_A), "hipFree(d_A)");
68 |     check(hipFree(d_B), "hipFree(d_B)");
69 |     check(hipFree(d_C), "hipFree(d_C)");
70 |     free(h_A); free(h_B); free(h_C);
71 |     return 0;
72 |  }
73 | 


--------------------------------------------------------------------------------
/opencl/OpenCL_Mixer.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <CL/cl.h>
  3 | #include <iostream>
  4 | #include <fstream>
  5 | #include <sstream>
  6 | #include<time.h>
  7 | #include<stdio.h>
  8 | #include<stdlib.h>
  9 | 
 10 | using namespace std;
 11 | 
 12 | const int t = 1000;
 13 | 
 14 | 
 15 | const int heightA = t;
 16 | const int widthB = t;
 17 | const int midle = t;
 18 | 
 19 | //const int heightB = 3;
 20 | 
 21 | // 1. Select an OpenCL platform and create a GPU context on it (NULL on failure)
 22 | cl_context CreateContext()
 23 | {
 24 |     cl_int errNum;
 25 |     cl_uint numPlatforms;
 26 |     cl_platform_id firstPlatformId;
 27 | 
 28 |     // Use the first of the available platforms
 29 |     errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
 30 |     if (errNum != CL_SUCCESS || numPlatforms == 0)
 31 |     {
 32 |         std::cerr << "Failed to find any OpenCL platforms." << std::endl;
 33 |         return NULL;
 34 |     }
 35 |     // Create an OpenCL context for the GPU devices of that platform
 36 |     cl_context_properties contextProperties[] =
 37 |     {
 38 |         CL_CONTEXT_PLATFORM, (cl_context_properties)firstPlatformId, 0
 39 |     };
 40 |     cl_context context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
 41 |         NULL, NULL, &errNum);
 42 |     if (errNum != CL_SUCCESS)
 43 |     {
 44 |         std::cerr << "Failed to create an OpenCL GPU context." << std::endl;
 45 |         return NULL;
 46 |     }
 47 |     return context;
 48 | }
 49 | 
 50 | 
 51 | // 2. Pick the first device of the context and create a command queue on it
 52 | cl_command_queue CreateCommandQueue(cl_context context, cl_device_id* device)
 53 | {
 54 |     cl_int errNum;
 55 |     cl_device_id* devices;
 56 |     cl_command_queue commandQueue = NULL;
 57 |     size_t deviceBufferSize = 0;
 58 | 
 59 |     // Query the size in bytes of the device list. The status is checked
 60 |     // explicitly: size_t is unsigned, so a failure never shows as a size <= 0.
 61 |     errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
 62 |     if (errNum != CL_SUCCESS || deviceBufferSize < sizeof(cl_device_id))
 63 |     {
 64 |         std::cerr << "No devices available.";
 65 |         return NULL;
 66 |     }
 67 | 
 68 |     // Fetch the device list; on failure the queue stays NULL and *device is untouched
 69 |     devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
 70 |     errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
 71 |     if (errNum == CL_SUCCESS)
 72 |     {
 73 |         commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
 74 |         *device = devices[0];
 75 |     }
 76 |     delete[] devices;
 77 |     return commandQueue;
 78 | }
 79 | 
 80 | 
 81 | // 3. Create the program object from the source file fileName and build it (NULL on failure)
 82 | cl_program CreateProgram(cl_context context, cl_device_id device, const char* fileName)
 83 | {
 84 |     std::ifstream kernelFile(fileName, std::ios::in);
 85 |     if (!kernelFile.is_open())
 86 |     {
 87 |         std::cerr << "Failed to open file for reading: " << fileName << std::endl;
 88 |         return NULL;
 89 |     }
 90 |     std::ostringstream oss;
 91 |     oss << kernelFile.rdbuf();
 92 |     std::string srcStdStr = oss.str();
 93 |     const char* srcStr = srcStdStr.c_str();
 94 |     cl_program program = clCreateProgramWithSource(context, 1, &srcStr, NULL, NULL);
 95 |     if (program == NULL)
 96 |         std::cerr << "Failed to create program from " << fileName << std::endl;
 97 |     else if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
 98 |     {   // Build failed: print the compiler log and drop the unusable program
 99 |         char buildLog[16384] = "";
100 |         clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buildLog) - 1, buildLog, NULL);
101 |         std::cerr << "Error in kernel:\n" << buildLog << std::endl;
102 |         clReleaseProgram(program);
103 |         program = NULL;
104 |     }
105 |     return program;
106 | }
107 | 
108 | // Create the device buffers: inputs a and b (copied from the host) and the output; false if any creation fails
109 | bool CreateMemObjects(cl_context context, cl_mem memObjects[3],
110 |     int* a, int* b)
111 | {
112 |     memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
113 |         sizeof(int) * midle * heightA, a, NULL);
114 |     memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
115 |         sizeof(int) * widthB * midle, b, NULL);
116 |     memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
117 |         sizeof(int) * widthB * heightA, NULL, NULL);
118 |     return memObjects[0] != NULL && memObjects[1] != NULL && memObjects[2] != NULL;
119 | }
120 | 
121 | 
122 | // Release the OpenCL resources; handles that are still 0 are skipped
123 | void Cleanup(cl_context context, cl_command_queue commandQueue,
124 |     cl_program program, cl_kernel kernel, cl_mem memObjects[3])
125 | {
126 |     // The three buffers first...
127 |     for (cl_mem* buf = memObjects; buf != memObjects + 3; ++buf)
128 |     {
129 |         if (*buf)
130 |             clReleaseMemObject(*buf);
131 |     }
132 | 
133 |     // ...then the queue, the kernel, the program and the context, in that order
134 |     if (commandQueue)
135 |         clReleaseCommandQueue(commandQueue);
136 |     if (kernel)
137 |         clReleaseKernel(kernel);
138 |     if (program)
139 |         clReleaseProgram(program);
140 |     if (context)
141 |         clReleaseContext(context);
142 | }
143 | 
144 | 
145 | int main(int argc, char** argv)
146 | {
147 |     cl_context context = 0;
148 |     cl_command_queue commandQueue = 0;
149 |     cl_program program = 0;
150 |     cl_device_id device = 0;
151 |     cl_kernel kernel = 0;
152 |     cl_mem memObjects[3] = { 0, 0, 0 };
153 |     cl_int errNum;
154 |     cl_event events[1];
155 |     clock_t t1, t2, t3;
156 | 
157 |     const char* filename = "./a.cl";
158 | 
159 |     // Steps 1-4: context, command queue, program and kernel. Each step only
160 |     // runs when the previous one succeeded; a failure anywhere leaves kernel
161 |     // NULL, which is reported here instead of crashing in a later call.
162 |     context = CreateContext();
163 |     if (context != NULL)
164 |         commandQueue = CreateCommandQueue(context, &device);
165 |     if (commandQueue != NULL)
166 |         program = CreateProgram(context, device, filename);
167 |     if (program != NULL)
168 |         kernel = clCreateKernel(program, "hello_kernel", NULL);
169 |     if (kernel == NULL)
170 |     {
171 |         std::cerr << "Failed to set up the OpenCL context, queue, program or kernel." << std::endl;
172 |         Cleanup(context, commandQueue, program, kernel, memObjects);
173 |         return 1;
174 |     }
175 | 
176 |     // Host data: a is heightA x midle, b is midle x widthB, result is heightA x widthB
177 |     const int  elementsA = heightA * midle;
178 |     const int  elementsB = midle * widthB;
179 |     const int  elementsC = heightA * widthB;
180 | 
181 |     // The matrices hold int elements, so they are sized with sizeof(int).
182 |     // result is zero-initialised because the CPU loop accumulates into it with +=.
183 |     int* a = (int*)malloc(sizeof(int) * elementsA);
184 |     int* b = (int*)malloc(sizeof(int) * elementsB);
185 |     int* result = (int*)calloc(elementsC, sizeof(int));
186 |     if (a == NULL || b == NULL || result == NULL)
187 |     {
188 |         std::cerr << "Failed to allocate the host matrices." << std::endl;
189 |         free(a); free(b); free(result);
190 |         Cleanup(context, commandQueue, program, kernel, memObjects);
191 |         return 1;
192 |     }
193 | 
194 |     for (int i = 0; i < heightA; i++)
195 |     {
196 |         for (int j = 0; j < midle; j++)
197 |         {
198 |             a[i * midle + j] = 2;
199 |         }
200 |     }
201 |     for (int k = 0; k < midle; k++)
202 |     {
203 |         for (int m = 0; m < widthB; m++)
204 |         {
205 |             b[k * widthB + m] = 3;
206 |         }
207 |     }
208 | 
209 |     t1 = clock();
210 |     // Serial reference computation on the CPU
211 |     for (int l = 0; l < heightA; l++) {
212 |         for (int n = 0; n < widthB; n++) {
213 |             for (int q = 0; q < midle; q++) {
214 |                 result[l * widthB + n] += a[l * midle + q] * b[q * widthB + n];
215 |             }
216 |         }
217 |     }
218 |     t2 = clock();
219 | 
220 |     // Create the device buffers
221 |     if (!CreateMemObjects(context, memObjects, a, b))
222 |     {
223 |         free(a); free(b); free(result);
224 |         Cleanup(context, commandQueue, program, kernel, memObjects);
225 |         return 1;
226 |     }
227 | 
228 |     // 5. Set the kernel arguments and run the kernel; any failing call leaves errNum non-zero
229 |     errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
230 |     errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]);
231 |     errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]);
232 |     errNum |= clSetKernelArg(kernel, 3, sizeof(int), &heightA);
233 |     errNum |= clSetKernelArg(kernel, 4, sizeof(int), &widthB);
234 |     errNum |= clSetKernelArg(kernel, 5, sizeof(int), &midle);
235 | 
236 |     // One work-item per element of the result matrix
237 |     size_t globalWorkSize[2];
238 |     globalWorkSize[0] = heightA;
239 |     globalWorkSize[1] = widthB;
240 |     if (errNum == CL_SUCCESS)
241 |         errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL,
242 |             globalWorkSize, NULL,
243 |             0, NULL, &events[0]);
244 | 
245 |     // 6. Read back the result. The read is blocking (CL_TRUE), so result is
246 |     // complete when the call returns and no separate wait on the event is needed.
247 |     if (errNum == CL_SUCCESS)
248 |     {
249 |         errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE,
250 |             0, widthB * heightA * sizeof(int), result,
251 |             0, NULL, NULL);
252 |         clReleaseEvent(events[0]);
253 |     }
254 |     t3 = clock();
255 | 
256 |     if (errNum != CL_SUCCESS)
257 |     {
258 |         std::cerr << "OpenCL execution failed with status " << errNum << std::endl;
259 |         free(a); free(b); free(result);
260 |         Cleanup(context, commandQueue, program, kernel, memObjects);
261 |         return 1;
262 |     }
263 | 
264 |     printf("cpu t = %.8f\n", ((double)t2 - (double)t1) / CLOCKS_PER_SEC);
265 |     printf("gpu t = %.8f \n", ((double)t3 - (double)t2)/ CLOCKS_PER_SEC);
266 |     std::cout << std::endl;
267 |     std::cout << "Executed program succesfully." << std::endl;
268 |     free(a); free(b); free(result);
269 |     Cleanup(context, commandQueue, program, kernel, memObjects);
270 |     return 0;
271 | }
271 | 


--------------------------------------------------------------------------------
/opencl/OpenCL_vectorAdd.c:
--------------------------------------------------------------------------------
  1 | // This program implements a vector addition using OpenCL
  2 | 
  3 | // System includes
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | 
  7 | // OpenCL includes
  8 | #include <CL/cl.h>
  9 | 
 10 | // OpenCL C source of the "vecadd" kernel: C[idx] = A[idx] + B[idx] for the work-item's global id idx
 11 | const char* programSource =
 12 | "__kernel                                            \n"
 13 | "void vecadd(__global int *A,                        \n"
 14 | "            __global int *B,                        \n"
 15 | "            __global int *C)                        \n"
 16 | "{                                                   \n"
 17 | "                                                    \n"
 18 | "   // Get the work-item’s unique ID                 \n"
 19 | "   int idx = get_global_id(0);                      \n"
 20 | "                                                    \n"
 21 | "   // Add the corresponding locations of            \n"
 22 | "   // 'A' and 'B', and store the result in 'C'.     \n"
 23 | "   C[idx] = A[idx] + B[idx];                        \n"
 24 | "}                                                   \n"
 25 | ;
 26 | 
 27 | // Abort with a message when an OpenCL call did not return CL_SUCCESS.
 28 | static void check(cl_int status, const char *what) {
 29 |     if (status != CL_SUCCESS) {
 30 |         fprintf(stderr, "%s failed with error %d\n", what, (int)status);
 31 |         exit(EXIT_FAILURE);
 32 |     }
 33 | }
 34 | 
 35 | int main() {
 36 |     // This code executes on the OpenCL host
 37 | 
 38 |     // Elements in each array, and the size of one array in bytes
 39 |     const int elements = 2048;
 40 |     size_t datasize = sizeof(int)*elements;
 41 | 
 42 |     // Allocate space for the input (A, B) and output (C) data
 43 |     int *A = (int*)malloc(datasize);
 44 |     int *B = (int*)malloc(datasize);
 45 |     int *C = (int*)malloc(datasize);
 46 |     if (A == NULL || B == NULL || C == NULL) {
 47 |         fprintf(stderr, "Failed to allocate the host arrays\n");
 48 |         exit(EXIT_FAILURE);
 49 |     }
 50 | 
 51 |     // Initialize the input data
 52 |     int i;
 53 |     for(i = 0; i < elements; i++) {
 54 |         A[i] = i;
 55 |         B[i] = i;
 56 |     }
 57 | 
 58 |     // Receives the status of the calls that return an object instead of a status
 59 |     cl_int status;
 60 | 
 61 |     // Retrieve the number of platforms; without any platform nothing can run
 62 |     cl_uint numPlatforms = 0;
 63 |     check(clGetPlatformIDs(0, NULL, &numPlatforms), "clGetPlatformIDs");
 64 |     if (numPlatforms == 0) {
 65 |         fprintf(stderr, "No OpenCL platform found\n");
 66 |         exit(EXIT_FAILURE);
 67 |     }
 68 | 
 69 |     // Allocate enough space for each platform and fill in the platforms
 70 |     cl_platform_id *platforms = (cl_platform_id*)malloc(
 71 |         numPlatforms*sizeof(cl_platform_id));
 72 |     check(clGetPlatformIDs(numPlatforms, platforms, NULL), "clGetPlatformIDs");
 73 | 
 74 |     // Retrieve the number of devices of the first platform
 75 |     cl_uint numDevices = 0;
 76 |     check(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0,
 77 |         NULL, &numDevices), "clGetDeviceIDs");
 78 | 
 79 |     // Allocate enough space for each device and fill in the devices
 80 |     cl_device_id *devices = (cl_device_id*)malloc(
 81 |         numDevices*sizeof(cl_device_id));
 82 |     check(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL,
 83 |         numDevices, devices, NULL), "clGetDeviceIDs");
 84 | 
 85 |     // Create a context and associate it with the devices
 86 |     cl_context context = clCreateContext(NULL, numDevices, devices, NULL,
 87 |         NULL, &status);
 88 |     check(status, "clCreateContext");
 89 | 
 90 |     // Create a command queue and associate it with the first device
 91 |     cl_command_queue cmdQueue = clCreateCommandQueue(context, devices[0], 0,
 92 |         &status);
 93 |     check(status, "clCreateCommandQueue");
 94 | 
 95 |     // Create a buffer object that will contain the data from the host array A
 96 |     cl_mem bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize,
 97 |         NULL, &status);
 98 |     check(status, "clCreateBuffer(bufA)");
 99 | 
100 |     // Create a buffer object that will contain the data from the host array B
101 |     cl_mem bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize,
102 |         NULL, &status);
103 |     check(status, "clCreateBuffer(bufB)");
104 | 
105 |     // Create a buffer object that will hold the output data
106 |     cl_mem bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize,
107 |         NULL, &status);
108 |     check(status, "clCreateBuffer(bufC)");
109 | 
110 |     // Write input array A to the device buffer bufA
111 |     check(clEnqueueWriteBuffer(cmdQueue, bufA, CL_FALSE,
112 |         0, datasize, A, 0, NULL, NULL), "clEnqueueWriteBuffer(bufA)");
113 | 
114 |     // Write input array B to the device buffer bufB
115 |     check(clEnqueueWriteBuffer(cmdQueue, bufB, CL_FALSE,
116 |         0, datasize, B, 0, NULL, NULL), "clEnqueueWriteBuffer(bufB)");
117 | 
118 |     // Create a program with source code
119 |     cl_program program = clCreateProgramWithSource(context, 1,
120 |         (const char**)&programSource, NULL, &status);
121 |     check(status, "clCreateProgramWithSource");
122 | 
123 |     // Build (compile) the program for the devices
124 |     check(clBuildProgram(program, numDevices, devices,
125 |         NULL, NULL, NULL), "clBuildProgram");
126 | 
127 |     // Create the vector addition kernel
128 |     cl_kernel kernel = clCreateKernel(program, "vecadd", &status);
129 |     check(status, "clCreateKernel");
130 | 
131 |     // Associate the input and output buffers with the kernel
132 |     check(clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA), "clSetKernelArg(A)");
133 |     check(clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB), "clSetKernelArg(B)");
134 |     check(clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC), "clSetKernelArg(C)");
135 | 
136 |     // Define an index space (global work size) of work items for execution.
137 |     // A workgroup size (local work size) is not required, but can be used.
138 |     size_t globalWorkSize[1];
139 |     // There are 'elements' work-items
140 |     globalWorkSize[0] = elements;
141 | 
142 |     // Enqueue the kernel for execution
143 |     check(clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL,
144 |         globalWorkSize, NULL, 0, NULL, NULL), "clEnqueueNDRangeKernel");
145 | 
146 |     // Read the device output buffer to the host output array (blocking read)
147 |     check(clEnqueueReadBuffer(cmdQueue, bufC, CL_TRUE, 0,
148 |         datasize, C, 0, NULL, NULL), "clEnqueueReadBuffer");
149 | 
150 |     // Verify the output
151 |     int result = 1;
152 |     for(i = 0; i < elements; i++) {
153 |         if(C[i] != i+i) {
154 |             result = 0;
155 |             break;
156 |         }
157 |     }
158 |     if(result) {
159 |         printf("Output is correct\n");
160 |     } else {
161 |         printf("Output is incorrect\n");
162 |     }
163 | 
164 |     // Free OpenCL resources
165 |     clReleaseKernel(kernel);
166 |     clReleaseProgram(program);
167 |     clReleaseCommandQueue(cmdQueue);
168 |     clReleaseMemObject(bufA);
169 |     clReleaseMemObject(bufB);
170 |     clReleaseMemObject(bufC);
171 |     clReleaseContext(context);
172 | 
173 |     // Free host resources
174 |     free(A);
175 |     free(B);
176 |     free(C);
177 |     free(platforms);
178 |     free(devices);
179 | 
180 |     return 0;
181 | }
182 | 
183 | 


--------------------------------------------------------------------------------
/opencl/a.cl:
--------------------------------------------------------------------------------
 1 | __kernel void hello_kernel(__global const int *a,
 2 |                            __global const int *b,
 3 |                            __global int *result_matrix,int  result_matrix_row,
 4 |                            int  result_matrix_col,int  compute_size)
 5 | {
 6 |     int row = get_global_id(0);
 7 |     int col = get_global_id(1);
 8 |     // Work-items outside the result matrix (e.g. a padded global size) do nothing
 9 |     if (row >= result_matrix_row || col >= result_matrix_col)
10 |         return;
11 |     int sum = 0;  // dot product of row `row` of a with column `col` of b
12 |     for(int i=0;i<compute_size;i++)
13 |     {
14 |         sum += a[row*compute_size+i] * b[i*result_matrix_col+col];
15 |     }
16 |     result_matrix[row*result_matrix_col+col] = sum;
17 | }


--------------------------------------------------------------------------------
/opencl/opencl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build the OpenCL samples and link them against libOpenCL (the mixer source is a .cpp file).
3 | nvcc OpenCL_vectorAdd.c -o OpenCL_vectorAdd -lOpenCL
4 | 
5 | nvcc OpenCL_Mixer.cpp -o OpenCL_Mixer -lOpenCL
6 | 


--------------------------------------------------------------------------------
/openmp/OpenMP-Matrix_Vector_Multiplication.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <omp.h>
 3 | #include <stdlib.h>
 4 | #include <time.h>
 5 | 
 6 | const int NUM_THREADS = 4; // number of threads used by the parallel version
 7 | int N = 10000; // number of matrix columns, also the length of vec
 8 | int M = 10000; // number of matrix rows, also the number of results in ans
 9 | int mat[10000][10000]; // the matrix mat
10 | int vec[10000], ans[10000]; // the vector vec and the result vector ans
11 | 
12 | void makeRandomMatrix()  // generate the matrix
13 | {
14 |     int row, col;
15 |     // Seed the generator from the current time, then fill the M x N matrix
16 |     // row by row with values in the range 1..10.
17 |     srand(time(NULL));
18 |     for (row = 0; row < M; row++)
19 |     {
20 |         int *cells = mat[row];
21 |         for (col = 0; col < N; col++)
22 |             cells[col] = rand() % 10 + 1;
23 |     }
24 | }
24 | 
25 | void makeRandomVector() // generate the vector: N values in the range 1..10
26 | {
27 |     // The generator is deliberately not re-seeded here: a second
28 |     // srand(time(NULL)) within the same second restarts the sequence that
29 |     // makeRandomMatrix() already consumed, making vec a copy of mat[0].
30 |     int i;
31 |     for (i = 0; i < N; i++)
32 |         vec[i] = rand() % 10 + 1;
33 | }
34 | 
35 | void funy(int a[], int cur)  // ans[cur] = dot product of the row a with vec
36 | {
37 |     // Sum locally and assign once, so ans[cur] is correct however often the
38 |     // row is recomputed (main runs the serial and the parallel version in turn).
39 |     int i, sum = 0;
40 |     for (i = 0; i < N; i++)
41 |         sum += a[i] * vec[i];
42 |     ans[cur] = sum;
43 | }
43 | 
44 | void f()  // serial computation
45 | {
46 |     // Process the M rows of mat one after another on the calling thread;
47 |     // funy() computes the result entry belonging to each row.
48 |     int row;
49 |     for (row = 0; row < M; ++row)
50 |         funy(mat[row], row);
51 | }
52 | 
53 | void fp() // parallel computation
54 | {
55 |     // Same row loop as f(), but its iterations are divided among a team of
56 |     // NUM_THREADS OpenMP threads; each iteration handles a different row.
57 |     int row;
58 |     #pragma omp parallel for num_threads(NUM_THREADS)
59 |     for (row = 0; row < M; ++row)
60 |         funy(mat[row], row);
61 | }
62 | 
63 | int main()
64 | {
65 |     printf("Makeing matrix(%d*%d) & vector(%d*1)...\n",N,M,N);
66 |     makeRandomMatrix(); 
67 |     makeRandomVector();
68 |     double t_begin = omp_get_wtime();  // wall-clock time of the serial run
69 |     f();
70 |     double elapsed = omp_get_wtime() - t_begin;
71 |     printf("串行 --- Running time=%f s\n", elapsed);
72 |     t_begin = omp_get_wtime();  // wall-clock time of the parallel run
73 |     fp();
74 |     elapsed = omp_get_wtime() - t_begin;
75 |     printf("%d threads --- Running time=%f s\n", NUM_THREADS,elapsed);
76 |     return 0;
77 | }
78 | 


--------------------------------------------------------------------------------
/openmp/OpenMP-matrix_multiplication.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <pthread.h>
 3 | #include <omp.h>
 4 | #include <sys/time.h>
 5 | #include <ctime>
 6 | using namespace std;
 7 | const int maxn = 4; // side length of the square matrices A, B and C
 8 | 
 9 | int A[maxn][maxn], B[maxn][maxn], C[maxn][maxn]; // main() fills A and B and accumulates A*B into C
10 | 
11 | // Multiply two random maxn x maxn matrices inside an OpenMP parallel loop and print C.
12 | int main() {
13 |     int row, col, inner;
14 |     // Request as many threads as omp_get_num_procs() reports.
15 |     omp_set_num_threads(omp_get_num_procs());
16 |     srand(time(NULL));
17 | 
18 |     // Fill A and B with values 0..9; for each cell A's value is drawn before B's.
19 |     for (row = 0; row < maxn; ++row) {
20 |         for (col = 0; col < maxn; ++col) {
21 |             A[row][col] = rand() % 10;
22 |             B[row][col] = rand() % 10;
23 |         }
24 |     }
25 |     // The iterations of the outer (row) loop are divided among the threads.
26 |     #pragma omp parallel for private(row, col, inner) shared(A, B, C)
27 |     for (row = 0; row < maxn; ++row)
28 |         for (col = 0; col < maxn; ++col)
29 |             for (inner = 0; inner < maxn; ++inner)
30 |                 C[row][col] = C[row][col] + A[row][inner] * B[inner][col];
31 | 
32 |     for (row = 0; row < maxn; ++row) {
33 |         for (col = 0; col < maxn; ++col) cout << C[row][col] << "\t";
34 |         cout << endl;
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/openmp/OpenMP-simple_instances.c:
--------------------------------------------------------------------------------
 1 | #include<stdio.h>  //hello
 2 | #include<omp.h>
 3 | 
 4 | /* Each thread of an OpenMP parallel region prints its number and the team size. */
 5 | int main()
 6 | {
 7 |     printf("I am the main thread.\n");
 8 |     /* omp_set_num_threads(32);  -- the original "#omp_set_num_threads(32);" is not a valid directive and broke the build */
 9 |     #pragma omp parallel
10 |     {
11 |         int nthreads = omp_get_num_threads();   /* declared inside the region, so each thread has its own copy */
12 |         int thread_id = omp_get_thread_num();
13 |         printf("Helllo I am thread %d out of a team of %d\n", thread_id, nthreads);
14 |     }
15 |     printf("Here I am,back to the main thread.\n");
16 |     return 0;
17 | }
18 | 


--------------------------------------------------------------------------------
/openmp/openmp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build the three OpenMP examples; -fopenmp is passed to every compile.
3 | gcc -fopenmp OpenMP-simple_instances.c -o OpenMP-simple_instances
4 | 
5 | g++ -fopenmp OpenMP-matrix_multiplication.cpp -o OpenMP-matrix_multiplication
6 | 
7 | gcc -fopenmp OpenMP-Matrix_Vector_Multiplication.c -o OpenMP-Matrix_Vector_Multiplication


--------------------------------------------------------------------------------
/pthread/PThread-matrix_multiplication.c:
--------------------------------------------------------------------------------
  1 | #include<stdio.h>   
  2 | #include<time.h>
  3 | #include<pthread.h>
  4 | #include<stdlib.h>
  5 | #include<unistd.h>
  6 | #include<memory.h>
  7 |  
  8 | 
  9 | #define M 600   /* rows of matrixA, columns of matrixB */
 10 | #define N 600   /* columns of matrixA, rows of matrixB */
 11 | int matrixA[M][N];
 12 | int matrixB[N][M];
 13 | int result[M][N];            /* product, filled in by the worker threads */
 14 | 
 15 | void *func(void *arg);       /* worker thread entry point */
 16 | /* enum, not `const int`: a file-scope array size must be a constant expression in C */
 17 | enum { NUM_THREADS = 8 };    /* number of worker threads */
 18 | pthread_t tids[NUM_THREADS]; /* worker thread handles */
 19 | int L;                       /* number of rows computed by each thread */
 20 | 
 21 | /* Fill matrixA (M rows, N columns) with random values from 1 to 10. */
 22 | void makeRandomMatrix_A()
 23 | {
 24 |     int r, c;
 25 | 
 26 |     srand(time(NULL));   /* seed the generator from the current time */
 27 |     for (r = 0; r < M; ++r) {
 28 |         for (c = 0; c < N; ++c) {
 29 |             matrixA[r][c] = 1 + rand() % 10;
 30 |         }
 31 |     }
 32 | }
 33 | 
 34 | /* Fill matrixB (N rows, M columns) with random values from 1 to 10. */
 35 | void makeRandomMatrix_B()
 36 | {
 37 |     int r, c;
 38 | 
 39 |     srand(time(NULL));   /* seed the generator from the current time */
 40 |     for (r = 0; r < N; ++r) {
 41 |         for (c = 0; c < M; ++c) {
 42 |             matrixB[r][c] = 1 + rand() % 10;
 43 |         }
 44 |     }
 45 | }
 46 | 
 47 | /* Worker thread: *arg holds the first row of this thread's block of L rows. */
 48 | void *func(void *arg)
 49 | {
 50 |     int first = *(int *)arg, last = first + L;   /* computes rows first .. last-1 */
 51 |     for (int i = first; i < last; i++)
 52 |         for (int j = 0; j < M; j++)
 53 |             for (int k = 0; k < N; k++)
 54 |                 result[i][j] += matrixA[i][k] * matrixB[k][j];
 55 |     return NULL;   /* declared void *; the original fell off the end without returning a value */
 56 | }
 57 | 
 58 | /* Multi-threaded product: give each of the NUM_THREADS threads a block of L
 59 |  * consecutive result rows, run func() on every block, then wait for all threads. */
 60 | void fp(){
 61 |     int j, rc;
 62 |     int t[NUM_THREADS];    /* start row handed to each thread; lives until the joins below */
 63 |     L = M / NUM_THREADS;   /* rows per thread; M % NUM_THREADS trailing rows are not assigned */
 64 |     /* Count threads, not rows: stepping a row index by L until it reached M ran past
 65 |      * t[] and tids[] whenever M was not a multiple of NUM_THREADS (and never ended for L == 0). */
 66 |     for (j = 0; j < NUM_THREADS; j++) {
 67 |         t[j] = j * L;
 68 |         rc = pthread_create(&tids[j], NULL, func, &t[j]);
 69 |         if (rc != 0) {     /* the error number is the return value; errno is not set, so no perror */
 70 |             fprintf(stderr, "pthread_create: error %d\n", rc);
 71 |             exit(1);
 72 |         }
 73 |     }
 74 | 
 75 |     for (j = 0; j < NUM_THREADS; j++)
 76 |         pthread_join(tids[j], NULL);   /* wait for every worker to finish */
 77 | }
 78 | 
 79 | void f(){   /* serial version: the full product is accumulated into a local array */
 80 |     int res[M][M]={0};   /* zeroed explicitly: an automatic array is not cleared implicitly */
 81 |     for (int row = 0; row < M; ++row)
 82 |         for (int col = 0; col < M; ++col)
 83 |             for (int k = 0; k < N; ++k)
 84 |                 res[row][col] = res[row][col] + matrixA[row][k] * matrixB[k][col];
 85 | }
 86 | 
 87 | /* Generate both matrices, then report the elapsed time of the serial and threaded products. */
 88 | int main()
 89 | {
 90 |     struct timespec t0, t1;
 91 |     makeRandomMatrix_A();
 92 |     makeRandomMatrix_B();
 93 |     printf("Making matrix(%d*%d) & matrix(%d*%d)...\n", M, N, N, M);   /* matrixA is M*N, matrixB is N*M */
 94 |     /* clock() reports CPU time of the whole process, i.e. summed over all threads, so it
 95 |      * cannot show a speed-up from fp(); CLOCK_MONOTONIC measures elapsed real time. */
 96 |     clock_gettime(CLOCK_MONOTONIC, &t0);
 97 |     f();
 98 |     clock_gettime(CLOCK_MONOTONIC, &t1);
 99 |     printf("串行 --- Running time=%f s\n", (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
100 | 
101 |     clock_gettime(CLOCK_MONOTONIC, &t0);
102 |     fp();
103 |     clock_gettime(CLOCK_MONOTONIC, &t1);
104 |     printf("%d threads --- Running time=%f s\n", NUM_THREADS, (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
105 |     return 0;
106 | }
107 | 


--------------------------------------------------------------------------------
/pthread/PThread-simple_instances.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>   
 2 | #include <pthread.h>
 3 | #include <unistd.h>
 4 | #include <malloc.h>
 5 | 
 6 | /* Thread body: print the calling thread's ID; the `id` argument is not used. */
 7 | void* thread(void *id){
 8 |         pthread_t self = pthread_self();
 9 |         /* "%u" did not match pthread_t; the cast assumes an integer pthread_t (as in glibc) */
10 |         printf("this is a new thread, thread ID is %lu\n", (unsigned long)self);
11 |         return NULL;
12 | }
13 | 
14 | /* Start num_thread threads running thread(), wait for each to finish, then free the handles. */
15 | int main(){
16 |         int num_thread = 5;
17 |         pthread_t *pt = (pthread_t *)malloc(sizeof(pthread_t) * num_thread);
18 |         printf("main thread, ID is %lu\n", (unsigned long)pthread_self()); /* "%u" did not match pthread_t */
19 |         for (int i = 0; i < num_thread; i++){
20 |                 if (pthread_create(&pt[i], NULL, thread, NULL) != 0){
21 |                         printf("thread create failed!\n");
22 |                         return 1;
23 |                 }
24 |         }
25 |         for (int i = 0; i < num_thread; i++) pthread_join(pt[i], NULL); /* replaces sleep(2): waits until the threads have finished */
26 |         free(pt);
27 |         return 0;
28 | }
29 | 


--------------------------------------------------------------------------------
/pthread/PThread-synchronization.c:
--------------------------------------------------------------------------------
 1 | #include <pthread.h>
 2 | #include <unistd.h>
 3 | #include <stdio.h>
 4 | #include<math.h>
 5 | int tickets = 20;  /* tickets left to sell; both seller threads lock `mutex` before using it */
 6 | pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;  /* a mutex must be initialised before it is locked */
 7 | 
 8 | /* Seller 1: while tickets remain, lock the mutex, sell one ticket, unlock and
 9 |  * sleep for a second; return once the ticket counter is no longer positive. */
10 | void *mythread1(void)
11 | {
12 |     for (;;) {
13 |         int sold_out;
14 | 
15 |         pthread_mutex_lock(&mutex);      /* lock before reading or changing tickets */
16 |         sold_out = (tickets <= 0);
17 |         if (!sold_out) {
18 |             usleep(1000);
19 |             printf("ticketse1 sells ticket:%d\n", tickets--);
20 |         }
21 |         pthread_mutex_unlock(&mutex);    /* released on both paths */
22 | 
23 |         if (sold_out)
24 |             return (void *)0;
25 |         sleep(1);                        /* pause between sales, with the mutex released */
26 |     }
27 | }
28 | /* Seller 2: while tickets remain, lock the mutex, sell one ticket, unlock and
29 |  * sleep for a second; return once the ticket counter is no longer positive. */
30 | void *mythread2(void)
31 | {
32 |     for (;;) {
33 |         int sold_out;
34 | 
35 |         pthread_mutex_lock(&mutex);      /* lock before reading or changing tickets */
36 |         sold_out = (tickets <= 0);
37 |         if (!sold_out) {
38 |             usleep(1000);
39 |             printf("ticketse2 sells ticket:%d\n", tickets--);
40 |         }
41 |         pthread_mutex_unlock(&mutex);    /* released on both paths */
42 | 
43 |         if (sold_out)
44 |             return (void *)0;
45 |         sleep(1);                        /* pause between sales, with the mutex released */
46 |     }
47 | }
48 | 
49 | /* Start the two seller threads and wait until both have returned. */
50 | int main(int argc, const char *argv[])
51 | {
52 |     pthread_t id1, id2;
53 |     int ret;
54 | 
55 |     /* mythread1/mythread2 are declared without a parameter, so they are cast to the
56 |      * start-routine type. The previous (void *) cast converted a function pointer to
57 |      * an object pointer, which ISO C does not define. */
58 |     ret = pthread_create(&id1, NULL, (void *(*)(void *))mythread1, NULL); /* seller 1 */
59 |     if (ret) {
60 |         printf("Create pthread error!\n");
61 |         return 1;
62 |     }
63 | 
64 |     ret = pthread_create(&id2, NULL, (void *(*)(void *))mythread2, NULL); /* seller 2 */
65 |     if (ret) {
66 |         printf("Create pthread error!\n");
67 |         return 1;
68 |     }
69 | 
70 |     pthread_join(id1, NULL); /* wait for both sellers to finish */
71 |     pthread_join(id2, NULL);
72 |     return 0;
73 | }
74 | 


--------------------------------------------------------------------------------
/pthread/pthread.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build the three pthread examples; -pthread is passed to every compile.
3 | gcc -pthread PThread-matrix_multiplication.c -o PThread-matrix_multiplication
4 | 
5 | gcc -pthread PThread-simple_instances.c -o PThread-simple_instances
6 | 
7 | gcc -pthread PThread-synchronization.c -o PThread-synchronization


--------------------------------------------------------------------------------