├── .gitignore
├── Lab1
    ├── Lab1.pdf
    ├── problem1
    │   ├── problem1_Linux.cpp
    │   ├── problem1_Linux_optimize.cpp
    │   ├── problem1_Linux_ordinary.cpp
    │   ├── problem1_Linux_unroll.cpp
    │   └── problem1_Windows.cpp
    ├── problem2
    │   ├── problem2_Linux.cpp
    │   ├── problem2_Linux_optimize.cpp
    │   ├── problem2_Linux_ordinary.cpp
    │   └── problem2_Windows.cpp
    └── report
    │   ├── p1.jpg
    │   ├── p2.jpg
    │   └── 思路.md
├── Lab2
    ├── ARM
    │   ├── KMeans_SIMD.cpp
    │   ├── KMeans_neon.cpp
    │   ├── KMeans_neon_cache.cpp
    │   └── KMeans_serial.cpp
    ├── Lab2.pdf
    ├── report
    │   ├── Lab2.md
    │   ├── d_sp.jpg
    │   └── n_sp.jpg
    └── x86
    │   ├── KMeans_SIMD.cpp
    │   └── KMeans_serial.cpp
├── Lab3
    ├── ARM
    │   ├── Gauss_arm.cpp
    │   └── Gauss_pthread.cpp
    ├── Gauss_serial.cpp
    ├── Lab3.pdf
    ├── report
    │   ├── CPU.png
    │   ├── arm.png
    │   ├── arm_thread.png
    │   ├── arm线程.png
    │   ├── report.md
    │   ├── x86.png
    │   ├── x86_pthread.png
    │   └── x86线程.png
    └── x86
    │   ├── Gauss_pthread.cpp
    │   └── Gauss_x86.cpp
├── Lab4
    ├── ARM
    │   └── Gauss_openmp.cpp
    ├── Gauss_serial.cpp
    ├── report
    │   ├── Lab4.pdf
    │   ├── SIMD.png
    │   ├── arm_sp.png
    │   ├── cache.png
    │   ├── idea.md
    │   ├── offload.png
    │   ├── report.md
    │   ├── result_arm.csv
    │   ├── result_x86.csv
    │   ├── rowcol.png
    │   ├── schedule.png
    │   ├── threads.png
    │   └── x86.png
    └── x86
    │   ├── Gauss_openmp.cpp
    │   ├── offLoading.cpp
    │   └── result.csv
├── Lab5
    ├── CMakeLists.txt
    ├── arm
    │   ├── Gauss.cpp
    │   └── am.csv
    ├── report
    │   ├── Lab5.pdf
    │   ├── OMP.png
    │   ├── SIMD.png
    │   ├── arm_OMP.png
    │   ├── arm_SIMD.png
    │   ├── arm_block_cycle.png
    │   ├── arm_send_receive.png
    │   ├── arm_sp.png
    │   ├── block_cycle.png
    │   ├── process.png
    │   ├── report.md
    │   ├── send_receive.png
    │   └── x86_sp.png
    ├── test.cpp
    └── x86
    │   ├── Gauss.cpp
    │   └── x86.csv
├── Lab6
    ├── Certificate.pdf
    ├── Ex1
    │   ├── 01-add-error-handling.cu
    │   ├── 01-basic-parallel.cu
    │   ├── 01-double-elements.cu
    │   ├── 01-grid-stride-double.cu
    │   ├── 01-hello-gpu.cu
    │   ├── 01-mismatched-config-loop.cu
    │   ├── 01-multi-block-loop.cu
    │   ├── 01-single-block-loop.cu
    │   ├── 01-thread-and-block-idx.cu
    │   └── 01-vector-add.cu
    ├── Ex2
    │   ├── 02-get-device-properties.cu
    │   ├── 02-page-faults-solution-cpu-only.cu
    │   ├── 02-page-faults-solution-cpu-then-gpu.cu
    │   ├── 02-page-faults-solution-gpu-only.cu
    │   ├── 02-page-faults-solution-gpu-then-cpu.cu
    │   ├── 02-saxpy-solution.cu
    │   ├── 02-vector-add-1.cu
    │   ├── 02-vector-add-init-in-kernel-solution.cu
    │   ├── 02-vector-add-prefetch-solution-cpu-also.cu
    │   ├── 02-vector-add-prefetch-solution.cu
    │   └── 02-vector-add.cu
    ├── Ex3
    │   ├── 03-init-kernel-solution.cu
    │   ├── 03-nbody.cu
    │   ├── 03-prefetch-check-solution.cu
    │   ├── 03-print-numbers-solution.cu
    │   ├── 03-print-numbers.cu
    │   ├── 03-stream-init-solution.cu
    │   └── 03-vector-add-prefetch-solution.cu
    └── report
    │   ├── Lab6.pdf
    │   ├── image-20220613213324039.png
    │   ├── image-20220613213540824.png
    │   ├── image-20220613214044904.png
    │   ├── image-20220613215757260.png
    │   ├── image-20220613220353591.png
    │   ├── image-20220613222635516.png
    │   ├── image-20220613223730354.png
    │   ├── image-20220614213733118.png
    │   ├── image-20220615125537746.png
    │   ├── image-20220615131242726.png
    │   ├── image-20220615131738098.png
    │   └── report.md
├── README.md
├── test.sh
├── 实验指导书
    ├── CUDA_C_Best_Practices_Guide.pdf
    ├── OpenMP-4.5-1115-CPP-web.pdf
    ├── OpenMPRefCard-5-2-web.pdf
    ├── 实验教学指导书-1 实验环境搭建.pdf
    ├── 实验教学指导书-2 体系结构相关及性能测试.pdf
    ├── 实验教学指导书-3 SIMD编程.pdf
    ├── 实验教学指导书-4 Pthread编程.pdf
    ├── 实验教学指导书-5 OpenMP编程.pdf
    ├── 实验教学指导书-6 MPI编程.pdf
    └── 实验教学指导书-7 GPU编程.pdf
└── 调研
    ├── Intel Core 12th CPU 架构调研.pdf
    ├── image-20220226163655963.png
    ├── image-20220226164012526.png
    ├── image-20220227154957590.png
    ├── image-20220227155106394.png
    ├── image-20220227170838930.png
    ├── image-20220227170901851.png
    ├── image-20220227175847534.png
    ├── image-20220227210558360.png
    ├── intel-architecture-day-2021-presentation.pdf
    ├── s_a991b8031e974f40838bdc54f32a43b9.jpg
    └── 调研.md


/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | exe
3 | .idea
4 | cmake-build-debug*
5 | *.txt


--------------------------------------------------------------------------------
/Lab1/Lab1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab1/Lab1.pdf


--------------------------------------------------------------------------------
/Lab1/problem1/problem1_Linux.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <sys/time.h>
  3 | using namespace std;
  4 | 
  5 | #define ull unsigned long long int 
  6 | 
  7 | const int N = 80;
  8 | ull a[N];
  9 | ull b[N][N];
 10 | ull sum[N];
 11 | int LOOP = 1000;
 12 | 
 13 | void init()
 14 | {
 15 |     for(int i=0;i<N;i++)
 16 |         a[i]=i;
 17 |     for(int i=0;i<N;i++)
 18 |         for(int j=0;j<N;j++)
 19 |             b[i][j]=i+j;
 20 | }
 21 | 
 22 | void ordinary()
 23 | {
 24 |     struct timeval start;
 25 |     struct timeval end;
 26 |     gettimeofday(&start,NULL);
 27 |     for(int l=0;l<LOOP;l++)
 28 |     {
 29 |         for(int i=0;i<N;i++)
 30 |             sum[i]=0;
 31 |         for(int i=0;i<N;i++)
 32 |             for(int j=0;j<N;j++)
 33 |                 sum[i]+=a[j]*b[j][i];
 34 |     }
 35 |     gettimeofday(&end,NULL);
 36 |     cout<<"ordinary:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
 37 | }
 38 | 
 39 | void optimize()
 40 | {
 41 |     struct timeval start;
 42 |     struct timeval end;
 43 |     gettimeofday(&start,NULL);
 44 |     for(int l=0;l<LOOP;l++)
 45 |     {
 46 |         for(int i=0;i<N;i++)
 47 |             sum[i]=0;
 48 |         for(int j=0;j<N;j++)
 49 |             for(int i=0;i<N;i++)
 50 |                 sum[i]+=b[j][i]*a[j];
 51 |     }
 52 |     gettimeofday(&end,NULL);
 53 |     cout<<"optimize:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
 54 | }
 55 | 
 56 | void unroll()
 57 | {
 58 |     struct timeval start;
 59 |     struct timeval end;
 60 |     gettimeofday(&start,NULL);
 61 |     for(int l=0;l<LOOP;l++)
 62 |     {
 63 |         for(int i=0;i<N;i++)
 64 |             sum[i]=0;
 65 |         for(int j=0;j<N;j+=10)
 66 |         {
 67 |             int tmp0=0,tmp1=0,tmp2=0,tmp3=0,tmp4=0,tmp5=0,tmp6=0,tmp7=0,tmp8=0,tmp9=0;
 68 |             for(int i=0;i<N;i++)
 69 |             {
 70 |                 tmp0+=a[j+0]*b[j+0][i];
 71 |                 tmp1+=a[j+1]*b[j+1][i];
 72 |                 tmp2+=a[j+2]*b[j+2][i];
 73 |                 tmp3+=a[j+3]*b[j+3][i];
 74 |                 tmp4+=a[j+4]*b[j+4][i];
 75 |                 tmp5+=a[j+5]*b[j+5][i];
 76 |                 tmp6+=a[j+6]*b[j+6][i];
 77 |                 tmp6+=a[j+6]*b[j+6][i];
 78 |                 tmp7+=a[j+7]*b[j+7][i];
 79 |                 tmp8+=a[j+8]*b[j+8][i];
 80 |                 tmp9+=a[j+9]*b[j+9][i];
 81 |             }
 82 |             sum[j+0]=tmp0;
 83 |             sum[j+1]=tmp1;
 84 |             sum[j+2]=tmp2;
 85 |             sum[j+3]=tmp3;
 86 |             sum[j+4]=tmp4;
 87 |             sum[j+5]=tmp5;
 88 |             sum[j+6]=tmp6;
 89 |             sum[j+7]=tmp7;
 90 |             sum[j+8]=tmp8;
 91 |             sum[j+9]=tmp9;
 92 |         }
 93 |     }
 94 |     gettimeofday(&end,NULL);
 95 |     cout<<"unroll:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
 96 | }
 97 | 
 98 | int main()
 99 | {
100 |     init();
101 |     ordinary();
102 |     optimize();
103 |     unroll();
104 | }


--------------------------------------------------------------------------------
/Lab1/problem1/problem1_Linux_optimize.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <sys/time.h>
 3 | using namespace std;
 4 | 
 5 | #define ull unsigned long long int 
 6 | 
 7 | const int N = 3000;
 8 | ull a[N];
 9 | ull b[N][N];
10 | ull sum[N];
11 | int LOOP = 1;
12 | 
13 | void init()
14 | {
15 |     for(int i=0;i<N;i++)
16 |         a[i]=i;
17 |     for(int i=0;i<N;i++)
18 |         for(int j=0;j<N;j++)
19 |             b[i][j]=i+j;
20 | }
21 | 
22 | void optimize()
23 | {
24 |     struct timeval start;
25 |     struct timeval end;
26 |     gettimeofday(&start,NULL);
27 |     for(int l=0;l<LOOP;l++)
28 |     {
29 |         for(int i=0;i<N;i++)
30 |             sum[i]=0;
31 |         for(int j=0;j<N;j++)
32 |             for(int i=0;i<N;i++)
33 |                 sum[i]+=b[j][i]*a[j];
34 |     }
35 |     gettimeofday(&end,NULL);
36 |     cout<<"optimize:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
37 | }
38 | 
39 | int main()
40 | {
41 |     init();
42 |     optimize();
43 | }


--------------------------------------------------------------------------------
/Lab1/problem1/problem1_Linux_ordinary.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <sys/time.h>
 3 | using namespace std;
 4 | 
 5 | #define ull unsigned long long int 
 6 | 
 7 | const int N = 5000;
 8 | ull a[N];
 9 | ull b[N][N];
10 | ull sum[N];
11 | int LOOP = 1;
12 | 
13 | void init()
14 | {
15 |     for(int i=0;i<N;i++)
16 |         a[i]=i;
17 |     for(int i=0;i<N;i++)
18 |         for(int j=0;j<N;j++)
19 |             b[i][j]=i+j;
20 | }
21 | 
22 | void ordinary()
23 | {
24 |     struct timeval start;
25 |     struct timeval end;
26 |     gettimeofday(&start,NULL);
27 |     for(int l=0;l<LOOP;l++)
28 |     {
29 |         for(int i=0;i<N;i++)
30 |             sum[i]=0;
31 |         for(int i=0;i<N;i++)
32 |             for(int j=0;j<N;j++)
33 |                 sum[i]+=a[j]*b[j][i];
34 |     }
35 |     gettimeofday(&end,NULL);
36 |     cout<<"ordinary:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
37 | }
38 | 
39 | int main()
40 | {
41 |     init();
42 |     ordinary();
43 | }


--------------------------------------------------------------------------------
/Lab1/problem1/problem1_Linux_unroll.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <sys/time.h>
 3 | using namespace std;
 4 | 
 5 | #define ull unsigned long long int 
 6 | 
 7 | const int N = 1000;
 8 | ull a[N];
 9 | ull b[N][N];
10 | ull sum[N];
11 | int LOOP = 1;
12 | 
13 | void init()
14 | {
15 |     for(int i=0;i<N;i++)
16 |         a[i]=i;
17 |     for(int i=0;i<N;i++)
18 |         for(int j=0;j<N;j++)
19 |             b[i][j]=i+j;
20 | }
21 | 
22 | void unroll()
23 | {
24 |     struct timeval start;
25 |     struct timeval end;
26 |     gettimeofday(&start,NULL);
27 |     for(int l=0;l<LOOP;l++)
28 |     {
29 |         for(int i=0;i<N;i++)
30 |             sum[i]=0;
31 |         for(int j=0;j<N;j+=10)
32 |         {
33 |             int tmp0=0,tmp1=0,tmp2=0,tmp3=0,tmp4=0,tmp5=0,tmp6=0,tmp7=0,tmp8=0,tmp9=0;
34 |             for(int i=0;i<N;i++)
35 |             {
36 |                 tmp0+=a[j+0]*b[j+0][i];
37 |                 tmp1+=a[j+1]*b[j+1][i];
38 |                 tmp2+=a[j+2]*b[j+2][i];
39 |                 tmp3+=a[j+3]*b[j+3][i];
40 |                 tmp4+=a[j+4]*b[j+4][i];
41 |                 tmp5+=a[j+5]*b[j+5][i];
42 |                 tmp6+=a[j+6]*b[j+6][i];
43 |                 tmp6+=a[j+6]*b[j+6][i];
44 |                 tmp7+=a[j+7]*b[j+7][i];
45 |                 tmp8+=a[j+8]*b[j+8][i];
46 |                 tmp9+=a[j+9]*b[j+9][i];
47 |             }
48 |             sum[j+0]=tmp0;
49 |             sum[j+1]=tmp1;
50 |             sum[j+2]=tmp2;
51 |             sum[j+3]=tmp3;
52 |             sum[j+4]=tmp4;
53 |             sum[j+5]=tmp5;
54 |             sum[j+6]=tmp6;
55 |             sum[j+7]=tmp7;
56 |             sum[j+8]=tmp8;
57 |             sum[j+9]=tmp9;
58 |         }
59 |     }
60 |     gettimeofday(&end,NULL);
61 |     cout<<"unroll:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
62 | }
63 | 
64 | int main()
65 | {
66 |     init();
67 |     unroll();
68 | }


--------------------------------------------------------------------------------
/Lab1/problem1/problem1_Windows.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <windows.h>
  3 | using namespace std;
  4 | const int N = 10000;
  5 | int a[N];
  6 | int b[N][N];
  7 | int sum[N];
  8 | int LOOP = 1;
  9 | 
 10 | void init()
 11 | {
 12 |     for(int i=0;i<N;i++)
 13 |         a[i]=i;
 14 |     for(int i=0;i<N;i++)
 15 |         for(int j=0;j<N;j++)
 16 |             b[i][j]=i+j;
 17 | }
 18 | 
 19 | void ordinary()
 20 | {
 21 |     long long int begin, end, freq;
 22 |     QueryPerformanceFrequency((LARGE_INTEGER *) &freq);
 23 |     QueryPerformanceCounter((LARGE_INTEGER*) &begin);
 24 |     for(int l=0;l<LOOP;l++)
 25 |     {
 26 |         for(int i=0;i<N;i++)
 27 |         {
 28 |             sum[i]=0;
 29 |             for(int j=0;j<N;j++)
 30 |                 sum[i]+=a[j]*b[j][i];
 31 |         }
 32 |     }
 33 |     QueryPerformanceCounter((LARGE_INTEGER*) &end);
 34 |     cout<<"ordinary:"<<(end-begin)*1000.0/freq/LOOP<<"ms"<<endl;
 35 | }
 36 | 
 37 | void optimize()
 38 | {
 39 |     long long int begin, end, freq;
 40 |     QueryPerformanceFrequency((LARGE_INTEGER *) &freq);
 41 |     QueryPerformanceCounter((LARGE_INTEGER*) &begin);
 42 |     for(int l=0;l<LOOP;l++)
 43 |     {
 44 |         for(int i=0;i<N;i++)
 45 |             sum[i]=0;
 46 |         for(int j=0;j<N;j++)
 47 |             for(int i=0;i<N;i++)
 48 |                 sum[i]+=a[j]*b[j][i];
 49 |     }
 50 |     QueryPerformanceCounter((LARGE_INTEGER*) &end);
 51 |     cout<<"optimize:"<<(end-begin)*1000.0/freq/LOOP<<"ms"<<endl;
 52 | }
 53 | 
 54 | void unroll()
 55 | {
 56 |     long long int begin, end, freq;
 57 |     QueryPerformanceFrequency((LARGE_INTEGER *) &freq);
 58 |     QueryPerformanceCounter((LARGE_INTEGER*) &begin);
 59 |     for(int l=0;l<LOOP;l++)
 60 |     {
 61 |         for(int i=0;i<N;i++)
 62 |             sum[i]=0;
 63 |         for(int j=0;j<N;j+=10)
 64 |         {
 65 |             int tmp0=0,tmp1=0,tmp2=0,tmp3=0,tmp4=0,tmp5=0,tmp6=0,tmp7=0,tmp8=0,tmp9=0;
 66 |             for(int i=0;i<N;i++)
 67 |             {
 68 |                 tmp0+=a[j+0]*b[j+0][i];
 69 |                 tmp1+=a[j+1]*b[j+1][i];
 70 |                 tmp2+=a[j+2]*b[j+2][i];
 71 |                 tmp3+=a[j+3]*b[j+3][i];
 72 |                 tmp4+=a[j+4]*b[j+4][i];
 73 |                 tmp5+=a[j+5]*b[j+5][i];
 74 |                 tmp6+=a[j+6]*b[j+6][i];
 75 |                 tmp6+=a[j+6]*b[j+6][i];
 76 |                 tmp7+=a[j+7]*b[j+7][i];
 77 |                 tmp8+=a[j+8]*b[j+8][i];
 78 |                 tmp9+=a[j+9]*b[j+9][i];
 79 |             }
 80 |             sum[j+0]=tmp0;
 81 |             sum[j+1]=tmp1;
 82 |             sum[j+2]=tmp2;
 83 |             sum[j+3]=tmp3;
 84 |             sum[j+4]=tmp4;
 85 |             sum[j+5]=tmp5;
 86 |             sum[j+6]=tmp6;
 87 |             sum[j+7]=tmp7;
 88 |             sum[j+8]=tmp8;
 89 |             sum[j+9]=tmp9;
 90 |         }
 91 |     }
 92 |     QueryPerformanceCounter((LARGE_INTEGER*) &end);
 93 |     cout<<"unroll:"<<(end-begin)*1000.0/freq/LOOP<<"ms"<<endl;
 94 | }
 95 | 
 96 | int main()
 97 | {
 98 |     init();
 99 |     ordinary();
100 |     optimize();
101 |     unroll();
102 | }


--------------------------------------------------------------------------------
/Lab1/problem2/problem2_Linux.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <sys/time.h>
 3 | using namespace std;
 4 | 
 5 | #define ull unsigned long long int
 6 | 
 7 | const ull N = 67108864;
 8 | ull a[N];
 9 | int LOOP = 1;
10 | 
11 | void init()
12 | {
13 |     for (ull i = 0; i < N; i++)
14 |         a[i] = i;
15 | }
16 | 
17 | void ordinary()
18 | {
19 |     struct timeval start;
20 |     struct timeval end;
21 |     gettimeofday(&start,NULL);
22 |     for(int l=0;l<LOOP;l++)
23 |     {
24 |         // init();
25 |         ull sum = 0;
26 |         for (int i = 0; i < N - 1; i+=2)
27 |             sum += a[i], sum += a[i+1]; 
28 |     }
29 |     gettimeofday(&end,NULL);
30 |     cout<<"ordinary:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
31 | }
32 | 
33 | void optimize()
34 | {
35 |     struct timeval start;
36 |     struct timeval end;
37 |     gettimeofday(&start,NULL);
38 |     for(int l=0;l<LOOP;l++)
39 |     {
40 |         ull sum1 = 0, sum2 = 0;
41 |         for(int i=0;i<N-1; i+=2)
42 |             sum1+=a[i],sum2+= a[i+1];
43 |         ull sum = sum1 + sum2;
44 |     }
45 |     gettimeofday(&end,NULL);
46 |     cout<<"ordinary:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
47 | }
48 | 
49 | 
50 | int main()
51 | {
52 |     init();
53 |     ordinary();
54 |     optimize();
55 | }
56 | 


--------------------------------------------------------------------------------
/Lab1/problem2/problem2_Linux_optimize.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <sys/time.h>
 3 | using namespace std;
 4 | 
 5 | #define ull unsigned long long int
 6 | 
 7 | const ull N = 33554432;
 8 | ull a[N];
 9 | int LOOP = 1;
10 | 
11 | void init()
12 | {
13 |     for (ull i = 0; i < N; i++)
14 |         a[i] = i;
15 | }
16 | 
17 | void optimize()
18 | {
19 |     struct timeval start;
20 |     struct timeval end;
21 |     gettimeofday(&start,NULL);
22 |     for(int l=0;l<LOOP;l++)
23 |     {
24 |         ull sum1 = 0, sum2 = 0;
25 |         for(int i=0;i<N-1; i+=2)
26 |             sum1+=a[i],sum2+= a[i+1];
27 |         ull sum = sum1 + sum2;
28 |     }
29 |     gettimeofday(&end,NULL);
30 |     cout<<"ordinary:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
31 | }
32 | 
33 | int main()
34 | {
35 |     init();
36 |     optimize();
37 | }


--------------------------------------------------------------------------------
/Lab1/problem2/problem2_Linux_ordinary.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <sys/time.h>
 3 | using namespace std;
 4 | 
 5 | #define ull unsigned long long int
 6 | 
 7 | const ull N = 33554432;
 8 | ull a[N];
 9 | int LOOP = 1;
10 | 
11 | void init()
12 | {
13 |     for (ull i = 0; i < N; i++)
14 |         a[i] = i;
15 | }
16 | 
17 | void ordinary()
18 | {
19 |     struct timeval start;
20 |     struct timeval end;
21 |     gettimeofday(&start,NULL);
22 |     for(int l=0;l<LOOP;l++)
23 |     {
24 |         // init();
25 |         ull sum = 0;
26 |         for (int i = 0; i < N; i++)
27 |             sum += a[i];
28 |     }
29 |     gettimeofday(&end,NULL);
30 |     cout<<"ordinary:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
31 | }
32 | 
33 | int main()
34 | {
35 |     init();
36 |     ordinary();
37 | }


--------------------------------------------------------------------------------
/Lab1/problem2/problem2_Windows.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <windows.h>
 3 | using namespace std;
 4 | 
 5 | #define ull unsigned long long int
 6 | 
 7 | const ull N = 1024;
 8 | ull a[N];
 9 | int LOOP = 1;
10 | 
11 | void init()
12 | {
13 |     for (ull i = 0; i < N; i++)
14 |         a[i] = i;
15 | }
16 | 
17 | void ordinary()
18 | {
19 |     long long int begin, end, freq;
20 |     QueryPerformanceFrequency((LARGE_INTEGER *) &freq);
21 |     QueryPerformanceCounter((LARGE_INTEGER*) &begin);
22 |     for(int l=0;l<LOOP;l++)
23 |     {
24 |         // init();
25 |         ull sum = 0;
26 |         for (int i = 0; i < N; i++)
27 |             sum += a[i];
28 |     }
29 |     QueryPerformanceCounter((LARGE_INTEGER*) &end);
30 |     cout<<"ordinary:"<<(end-begin)*1000.0/freq/LOOP<<"ms"<<endl;
31 | }
32 | 
33 | void optimize()
34 | {
35 |     long long int begin, end, freq;
36 |     QueryPerformanceFrequency((LARGE_INTEGER *) &freq);
37 |     QueryPerformanceCounter((LARGE_INTEGER*) &begin);
38 |     for(int l=0;l<LOOP;l++)
39 |     {
40 |         ull sum1 = 0, sum2 = 0;
41 |         for(int i=0;i<N-1; i+=2)
42 |             sum1+=a[i],sum2+= a[i+1];
43 |         ull sum = sum1 + sum2;
44 |     }
45 |     QueryPerformanceCounter((LARGE_INTEGER*) &end);
46 |     cout<<"optimize:"<<(end-begin)*1000.0/freq/LOOP<<"ms"<<endl;
47 | }
48 | 
49 | int main()
50 | {
51 |     init();
52 |     ordinary();
53 |     optimize();
54 | }
55 | 


--------------------------------------------------------------------------------
/Lab1/report/p1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab1/report/p1.jpg


--------------------------------------------------------------------------------
/Lab1/report/p2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab1/report/p2.jpg


--------------------------------------------------------------------------------
/Lab1/report/思路.md:
--------------------------------------------------------------------------------
 1 | - [x] 代码实现
 2 | - [x] 代码优化：循环展开
 3 | - [ ] 性能测试
 4 |   - [x] 问题一
 5 |     - [x] 规模测试：不同大小规模下的时间
 6 |       - [x] 绘制走势，对比放一张图
 7 |       - [x] 找到趋势转折点，让cache大小不足
 8 |         - [x] 利用工具测试该点前后的命中率
 9 |   - [ ] 问题二
10 |     - [ ] CPI对比
11 |     - [x] 对比平台
12 |       - [x] x86和arm Linux下
13 | 
14 | # 问题重述
15 | 
16 | ## 体系结构相关实验分析——cache优化
17 | 
18 | 计算给定n*n矩阵的每一列和给定向量的内积，考虑两种算法设计思路：
19 | 
20 | 1. 逐列访问元素的平凡算法
21 | 2. cache的优化算法
22 | 
23 | ## 体系结构相关实验分析——超标量优化
24 | 
25 | 计算n个数的和，考虑两种算法的设计思路
26 | 
27 | 1. 逐个累加的平凡算法
28 | 2. 超标量优化算法，如最简单的两路链式累加，或两两相加后中间结果再两两相加的递归算法
29 | 
30 | # 实验环境
31 | 
32 | ## ARM架构
33 | 
34 | |   参数   |  数值  |
35 | | :------: | :----: |
36 | | CPU主频  | 2.6GHz |
37 | | L1 cache |  64K   |
38 | | L2 cache |  512K  |
39 | | L3 cache |  48MB  |
40 | 
41 | ## x86架构
42 | 
43 | |   参数   |         数值         |
44 | | :------: | :------------------: |
45 | | CPU型号  | Intel Core i7-11800H |
46 | | CPU主频  |        2.3GHz        |
47 | | L1 cache |         48K          |
48 | | L2 cache |        1.25MB        |
49 | | L3 cache |         24MB         |
50 | 
51 | # 实验设计及分析
52 | 
53 | ## cache优化
54 | 
 55 | 针对给定的问题，由于矩阵在内存中是按照行优先的顺序存储的，也就是说，在内存中的矩阵是按行紧密排列的。因此，对于原始朴素的逐列访问算法来说，CPU会一次读入连续的一段数据到缓存中，其中可能只包含需要计算的一个元素，因此当计算该列的第二个元素的时候，CPU又需要到更低层级的缓存或内存中去读取所需要的元素，而访存的时间相较于运算来说，开销是很大的，这会在很大程度上降低程序运行的效率。
56 | 
 57 | 因此我们考虑改进算法，采用逐行访问的cache优化算法，即充分利用每次读入的数据，将当前读入缓存中的一行数据全部进行计算，然后累加到结果数组的对应位置，虽然在这个过程中并没有能够直接计算出结果，但是极大利用了cache中的缓存数据，减少去内存中寻找数据的访存时间。
58 | 
 59 | 同时，为了能够降低循环过程中条件判断、指令跳转等额外开销，我们对于逐行访问的算法进行了进一步优化，采用循环展开的方法，在一次循环中，同时计算十个位置的值，可以利用多条流水线同时作业，发挥CPU超标量计算的性能。
60 | 
61 | 为此，我们分别设计了三种算法，并在ARM架构的华为云鲲鹏服务器上进行测试
62 | 
 63 | ![p1](p1.jpg)
64 | 
 65 | 根据测试数据我们可以看出，在N<300的规模下，逐行或逐列的访问方式在效率上差别不大，但采用循环展开的优化算法能够取得较大的性能提升。而当300<N<3000时，逐行访问要比逐列访问的增长速度慢，但在此规模下，三种方法的增长趋势还是基本相同的。当N>3000之后，逐列访问的增长速度明显超过了逐行访问的方法，证明此时cache优化起到了显著作用。
66 | 
 67 | 由数据可以进行理论分析，由于CPU的L1-L3的各级cache大小分别为64KB，512KB和48MB，对于数组元素为unsigned long long int而言，每一个元素占据8个字节，因此，填满各级cache的矩阵规模N大概在90，250，2500。而图像的大概走势也恰恰能够印证这一假设，即在N<300下，尽管普通方法的L1 cache命中率可能已经很低，但是由于L2 cache的访问速度相对比较快，所以访存速度对整体的时间影响不太大，即逐行和逐列访问两种方法的效率差别不大。而当300<N<3000这个区间上时，L2 cache的命中率也会不断下降，使得逐列访问的方法被迫到更大的L3 cache中去寻找数据，这就导致了较大的访存开销，也可以看出两种方法的差距逐渐显现出来。而当数据规模超过3000之后，逐列访问的L3 cache命中率也会降到很低，也就是说逐列访问方法被迫到内存中去寻找数据，这导致的访存开销是非常大的，因此也使得两种方法的访问效率产生了显著的差异。
68 | 
 69 | 为了证明上述假设，我们分别采集了数据规模在80、300和4000的三组实验，通过VTune分析其各级缓存的访存次数和命中率。当N=80时，可以看到，逐列访问方法的L1 cache未命中率要显著高于逐行访问，导致其L2 cache的访问次数增多，但L2 cache大部分情况命中，所以两种方法效率差距不大。当N=300时，逐列访问L2 cache未命中率要显著高于逐行访问，导致其L3 cache的访问次数也显著高于逐行访问，但L3 cache绝大情况下都已命中。而L3 cache的访问时间相对开销较大，因此两种方法的时间差异开始显现出来。当N=4000时，我们可以看到，逐列访问方法的L3 cache访问次数要远远高出逐行访问的方法，而且L3 cache的未命中率也高达27.1%，需要进行大量的内存访问，时间开销会非常大，这也是两种方法产生巨大差距的主要原因。
70 | 
71 | |      | or        |          |          |         |         |        | op        |       |       |      |      |      |
72 | | ---- | --------- | -------- | -------- | ------- | ------- | ------ | --------- | ----- | ----- | ---- | ---- | ---- |
73 | | n    | L1        |          | l2       |         | l3      |        | l1        |       | l2    |      | l3   |      |
74 | | 80   | 78113853  | 318810   | 310911   | 7908    | 264     | 12     | 91436268  | 4082  | 3858  | 137  | 120  | 0    |
75 | | 300  | 145250802 | 13056978 | 12844359 | 212616  | 162792  | 150    | 183033393 | 56220 | 51114 | 5121 | 936  | 0    |
76 | | 4000 | 149440947 | 15846024 | 11914602 | 3931422 | 2389212 | 888993 | 145316769 | 80232 | 76473 | 3762 | 357  | 87   |
77 | 
 78 | 对于循环展开方法对逐行访问方式的优化，由于循环展开可以在一个循环周期内利用多条流水线并行执行相同指令，因此能够在一定程度上优化代码运行效率。这一点通过比较两种方法的CPI也能够得到印证。
79 | 
80 | | optimize | optimize+unroll |
81 | | :------: | :-------------: |
82 | |  0.4975  |     0.4761      |
83 | 
84 | ## 超标量优化
85 | 
 86 | 对于给定的问题，要求计算N个数的和，对于常规的顺序算法而言，由于每次都是在同一个累加变量上进行累加，导致只能调用CPU的一条流水线进行处理，无法充分发挥CPU超标量优化的性能，因此考虑使用多链路的方法对传统的链式累加方法进行改进，即设置多个临时变量，在一个循环内同时用这多个临时变量对多个不同的位置进行累加，达到多个位置并行累加的效果，同时还能够减少循环迭代的次数，降低循环开销。由于多链路方法使用了循环展开技术在一定程度上降低了循环的额外开销，为了保证实验的准确性，我们对普通的链式累加方法也要进行同样比例的循环展开，控制实验的可变因素，使得实验结果具有合理的对比性。
87 | 
88 | 通过对比在不同实验规模下的两种方法的运行时间，探究优化加速比同问题规模的变化情况，并分析其中的内在原因。此外，还将会探究在x86架构下，Windows和Linux两种系统对于处理同样规模的问题所需要消耗的绝对时间，以及优化的加速比的情况。
89 | 
90 | 为了方便算法的实现，我们的所有问题规模都取成2的n次幂，由于当问题规模较小的时候，两种算法并没有显著的时间效率差异，因此我们直接扩大了问题规模，分别测试了从n=9到28之间的20组数据，如表所示。
91 | 
92 | 由表中的数据可以看出，无论是链式累加的方法还是多链路展开的方法，由于都属于线性时间效率的方法，因此随着问题规模的翻倍，时间也近似翻倍。采用双链路展开的超标量优化方法的时间效率要显著高于普通的链式累加方法，这是因为，通过多链路的方法，将相互联系的累加解耦成了两路不相关的问题，使得CPU能够同时调用两条流水线处理问题，实现超标量优化的目的。为了进一步探究超标量优化的加速比，我们对这二十组数据计算了优化加速比同问题规模的变化情况，如图所示。
93 | 
 94 | <img src="p2.jpg" alt="p2" style="zoom:50%;" />
95 | 
 96 | 通过实验数据可以发现，当问题规模较小的时候，算法的优化加速比比较低，但随着问题规模的增大，算法的优化加速比呈现先增加再保持最后降低的趋势。其增加的原因主要是由于多链路方法将累加拆分成了两个不相关的部分，能够利用CPU的超标量优化，使得算法获得逐渐增高的加速比。当问题规模在$2^{10}$到$2^{17}$之间的时候，优化加速比基本保持一个稳定的状态，说明此时的超标量优化已经达到一个上限。而当问题规模超过$2^{17}$后，整体呈现一个下降的趋势，猜测是由于问题规模过大，导致缓存不足，由于需要经常进行内外存的访问，导致了较大的访存开销，而这个访存开销占据了程序运行的大部分时间，所以超标量优化的效果被一定程度上减弱。为了验证上述想法，我们利用VTune对下降最显著的点分析各级缓存的访问和命中情况，选取了$2^{22}$这个点分析，如表所示。
97 | 
98 | 通过数据可以得到，当问题规模在$2^{12}$时，所有的问题几乎全部在L1 cache命中，L2 cache和L3 cache在过程中几乎没有访问，而且命中率也几乎在100%。而当问题规模达到$2^{22}$时，虽然L1 cache的命中率还是接近100%，但是已经有了很多的L2 cache和L3 cache访问，而且L3 cache的命中率只有80%，会出现很多的内存访问，这将会极大的影响程序运行的时间，因此印证了我们上面的猜测。
99 | 


--------------------------------------------------------------------------------
/Lab2/ARM/KMeans_SIMD.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <sys/time.h>
  3 | #include <arm_neon.h>
  4 | using namespace std;
  5 | 
  6 | const int N = 4*4*4*4*4*4*4*4;
  7 | const int D = 4;
  8 | const int K = 4;
  9 | const int L = 100;
 10 | const int LOOP = 1;
 11 | 
 12 | float **data;  // 数据集
 13 | float data_align[D][N];
 14 | float centroids[K][D];  // 聚类中心
 15 | int cluster[N];  // 各数据所属类别
 16 | int cntCluster[K];  // 个聚类计数
 17 | 
 18 | void initData();
 19 | void initCentroids();
 20 | void calculate_serial();
 21 | void calculate_parallel();
 22 | void calculate_cache();
 23 | void calculate_align();
 24 | void updateCentroids();
 25 | 
 26 | int main()
 27 | {
 28 |     initData();
 29 |     initCentroids();
 30 |     struct timeval start;
 31 |     struct timeval end;
 32 |     gettimeofday(&start,NULL);
 33 |     for(int i=0; i < LOOP; i++)
 34 |     {
 35 |         calculate_serial();
 36 |         // updateCentroids();
 37 |     }
 38 |     gettimeofday(&end,NULL);
 39 |     cout<<"serial:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
 40 | 
 41 |     initData();
 42 |     initCentroids();
 43 |     gettimeofday(&start,NULL);
 44 |     for(int i=0; i < LOOP; i++)
 45 |     {
 46 |         calculate_parallel();
 47 |         // updateCentroids();
 48 |     }
 49 |     gettimeofday(&end,NULL);
 50 |     cout<<"parallel:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
 51 | 
 52 |     initData();
 53 |     initCentroids();
 54 |     gettimeofday(&start,NULL);
 55 |     for(int i=0; i < LOOP; i++)
 56 |     {
 57 |         calculate_cache();
 58 |         // updateCentroids();
 59 |     }
 60 |     gettimeofday(&end,NULL);
 61 |     cout<<"parallel_cache:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
 62 | 
 63 |     initData();
 64 |     initCentroids();
 65 |     gettimeofday(&start,NULL);
 66 |     for(int i=0; i < LOOP; i++)
 67 |     {
 68 |         calculate_align();
 69 |         // updateCentroids();
 70 |     }
 71 |     gettimeofday(&end,NULL);
 72 |     cout<<"calculate_align:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
 73 | }
 74 | 
 75 | void initData()
 76 | {
 77 |     data = new float*[D];
 78 |     for(int i=0; i < D; i++)
 79 |         data[i] = new float[N];
 80 |     for(int i=0; i<D; i++)
 81 |         for(int j=0; j<N; j++)
 82 |             data[i][j] = rand()*1.0/RAND_MAX * L , data_align[i][j] = rand()*1.0/RAND_MAX * L;
 83 | }
 84 | 
 85 | void initCentroids()
 86 | {
 87 |     for(int i=0; i<K; i++)
 88 |         for(int j=0; j<D; j++)
 89 |             centroids[i][j] = rand()*1.0/RAND_MAX * L;
 90 | }
 91 | 
 92 | void updateCentroids()
 93 | {
 94 |     for(int i=0; i< N; i++)
 95 |         for(int j=0; j<D; j++)
 96 |             centroids[cluster[i]][j] += data[j][i];
 97 |     for(int i=0;i<K;i++)
 98 |         for(int j=0;j<D;j++)
 99 |             centroids[i][j] /= cntCluster[i];
100 | }
101 | 
102 | void calculate_serial()
103 | {
104 |     for(int i = 0; i < N; i++)
105 |     {
106 |         float min_dis = L*L;
107 |         for(int j = 0; j < K; j++)
108 |         {
109 |             float dis = 0;
110 |             for(int d = 0; d < D; d++)
111 |                 dis += (data[d][i] - centroids[j][d]) * (data[d][i] - centroids[j][d]);
112 |             if(dis < min_dis)
113 |                 min_dis = dis,cluster[i] = j,cntCluster[j]++;
114 |         }
115 |     }
116 | }
117 | 
// NEON assignment step on `data`: processes four points at a time, one
// dimension per vector operation, and records the nearest centroid of each
// point in cluster[].
//
// The first centroid is always accepted, so the result does not depend on an
// assumed maximum distance. Points left over when N is not a multiple of 4
// are handled by a scalar loop.
void calculate_parallel()
{
    cout<<data<<endl;  // prints the address of the dataset (kept from the original)

    const int vec_end = N - N % 4;  // first index not covered by a full group of 4

    for (int i = 0; i < vec_end; i += 4)
    {
        float best[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        for (int j = 0; j < K; j++)
        {
            float32x4_t distance = vdupq_n_f32(0.0f);
            for (int d = 0; d < D; d++)
            {
                // Broadcast dimension d of centroid j to all four lanes.
                const float32x4_t centroid_d = vdupq_n_f32(centroids[j][d]);
                // Load dimension d of four consecutive points.
                const float32x4_t data_d = vld1q_f32(&data[d][i]);
                // Accumulate the squared difference for this dimension.
                const float32x4_t delta = vsubq_f32(data_d, centroid_d);
                distance = vmlaq_f32(distance, delta, delta);
            }
            // Compare lane by lane against the best distance seen so far.
            float disK[4];
            vst1q_f32(disK, distance);
            for (int k = 0; k < 4; k++)
            {
                if (j == 0 || disK[k] < best[k])
                {
                    best[k] = disK[k];
                    cluster[i + k] = j;
                }
            }
        }
    }

    // Scalar tail for the last N % 4 points.
    for (int i = vec_end; i < N; i++)
    {
        float best = 0.0f;
        for (int j = 0; j < K; j++)
        {
            float dis = 0.0f;
            for (int d = 0; d < D; d++)
            {
                const float delta = data[d][i] - centroids[j][d];
                dis += delta * delta;
            }
            if (j == 0 || dis < best)
            {
                best = dis;
                cluster[i] = j;
            }
        }
    }
}
154 | 
// Cache-friendly NEON assignment step: for each centroid, walks the dataset
// one dimension (row) at a time and accumulates the squared distance of every
// point in dis_k[], then updates the per-point minimum.
//
// The first centroid always initialises min_distance[]; starting it at 0 made
// the "dis < min" test unsatisfiable, so no point was ever assigned. Points
// beyond the last full group of 4 are accumulated with scalar code.
void calculate_cache()
{
    cout<<data<<endl;  // prints the address of the dataset (kept from the original)

    const int vec_end = N - N % 4;
    float min_distance[N] = {0.0};
    for (int j = 0; j < K; j++)
    {
        // Squared distance of every point to centroid j.
        float dis_k[N] = {0.0};
        for (int d = 0; d < D; d++)
        {
            // Dimension d of centroid j in all four lanes (loop invariant).
            const float32x4_t centroid_d = vdupq_n_f32(centroids[j][d]);
            for (int i = 0; i < vec_end; i += 4)
            {
                const float32x4_t data_d = vld1q_f32(&data[d][i]);
                const float32x4_t delta = vsubq_f32(data_d, centroid_d);
                // Load the running sum, add this dimension, store it back.
                float32x4_t distance = vld1q_f32(&dis_k[i]);
                distance = vmlaq_f32(distance, delta, delta);
                vst1q_f32(&dis_k[i], distance);
            }
            for (int i = vec_end; i < N; i++)
            {
                const float delta = data[d][i] - centroids[j][d];
                dis_k[i] += delta * delta;
            }
        }
        // Keep centroid j wherever it is the closest one seen so far.
        for (int i = 0; i < N; i++)
        {
            if (j == 0 || dis_k[i] < min_distance[i])
            {
                min_distance[i] = dis_k[i];
                cluster[i] = j;
            }
        }
    }
}
188 | 
// NEON assignment step on `data_align`: same algorithm as the plain NEON
// kernel, four points per iteration, nearest centroid written to cluster[].
//
// The first centroid is always accepted, so the result does not depend on an
// assumed maximum distance. Points left over when N is not a multiple of 4
// are handled by a scalar loop.
void calculate_align()
{
    cout<<data_align<<endl;  // prints the address of the dataset (kept from the original)

    const int vec_end = N - N % 4;  // first index not covered by a full group of 4

    for (int i = 0; i < vec_end; i += 4)
    {
        float best[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        for (int j = 0; j < K; j++)
        {
            float32x4_t distance = vdupq_n_f32(0.0f);
            for (int d = 0; d < D; d++)
            {
                // Broadcast dimension d of centroid j to all four lanes.
                const float32x4_t centroid_d = vdupq_n_f32(centroids[j][d]);
                // Load dimension d of four consecutive points.
                const float32x4_t data_d = vld1q_f32(&data_align[d][i]);
                // Accumulate the squared difference for this dimension.
                const float32x4_t delta = vsubq_f32(data_d, centroid_d);
                distance = vmlaq_f32(distance, delta, delta);
            }
            // Compare lane by lane against the best distance seen so far.
            float disK[4];
            vst1q_f32(disK, distance);
            for (int k = 0; k < 4; k++)
            {
                if (j == 0 || disK[k] < best[k])
                {
                    best[k] = disK[k];
                    cluster[i + k] = j;
                }
            }
        }
    }

    // Scalar tail for the last N % 4 points.
    for (int i = vec_end; i < N; i++)
    {
        float best = 0.0f;
        for (int j = 0; j < K; j++)
        {
            float dis = 0.0f;
            for (int d = 0; d < D; d++)
            {
                const float delta = data_align[d][i] - centroids[j][d];
                dis += delta * delta;
            }
            if (j == 0 || dis < best)
            {
                best = dis;
                cluster[i] = j;
            }
        }
    }
}


--------------------------------------------------------------------------------
/Lab2/ARM/KMeans_neon.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <sys/time.h>
 3 | #include <arm_neon.h>
 4 | 
 5 | const int N = 4*4*4*4*4*4*4*4;
 6 | const int D = 2;
 7 | const int K = 4;
 8 | const int L = 100;
 9 | const int LOOP = 100;
10 | 
11 | float data[D][N];  // 数据集  转置
12 | float centroids[K][D];  // 聚类中心
13 | int cluster[N];  // 各数据所属类别
14 | int cntCluster[K];  // 个聚类计数
15 | 
16 | void initData();
17 | void initCentroids();
18 | void calculate();
19 | void updateCentroids();
20 | 
21 | int main()
22 | {
23 |     initData();
24 |     initCentroids();
25 |     struct timeval start;
26 |     struct timeval end;
27 |     gettimeofday(&start,NULL);
28 |     for(int i=0; i < LOOP; i++)
29 |     {
30 |         calculate();
31 |         updateCentroids();
32 |     }
33 |     gettimeofday(&end,NULL);
34 |     cout<<"parallel:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
35 | }
36 | 
37 | void initData()
38 | {
39 |     for(int i=0; i<D; i++)
40 |         for(int j=0; j<N; j++)
41 |             data[i][j] = rand()*1.0/RAND_MAX * L;
42 | }
43 | 
44 | void initCentroids()
45 | {
46 |     for(int i=0; i<K; i++)
47 |         for(int j=0; j<D; j++)
48 |             centroids[i][j] = rand()*1.0/RAND_MAX * L;
49 | }
50 | 
// NEON assignment step: processes four points at a time, one dimension per
// vector operation, and records the nearest centroid of each point in
// cluster[].
//
// The first centroid is always accepted, so the result does not depend on an
// assumed maximum distance (L*L is too small once D > 1). Points left over
// when N is not a multiple of 4 are handled by a scalar loop.
void calculate()
{
    const int vec_end = N - N % 4;  // first index not covered by a full group of 4

    for (int i = 0; i < vec_end; i += 4)
    {
        float best[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        for (int j = 0; j < K; j++)
        {
            float32x4_t distance = vdupq_n_f32(0.0f);
            for (int d = 0; d < D; d++)
            {
                // Broadcast dimension d of centroid j to all four lanes.
                const float32x4_t centroid_d = vdupq_n_f32(centroids[j][d]);
                // Load dimension d of four consecutive points.
                const float32x4_t data_d = vld1q_f32(&data[d][i]);
                // Accumulate the squared difference for this dimension.
                const float32x4_t delta = vsubq_f32(data_d, centroid_d);
                distance = vmlaq_f32(distance, delta, delta);
            }
            // Compare lane by lane against the best distance seen so far.
            float disK[4];
            vst1q_f32(disK, distance);
            for (int k = 0; k < 4; k++)
            {
                if (j == 0 || disK[k] < best[k])
                {
                    best[k] = disK[k];
                    cluster[i + k] = j;
                }
            }
        }
    }

    // Scalar tail for the last N % 4 points.
    for (int i = vec_end; i < N; i++)
    {
        float best = 0.0f;
        for (int j = 0; j < K; j++)
        {
            float dis = 0.0f;
            for (int d = 0; d < D; d++)
            {
                const float delta = data[d][i] - centroids[j][d];
                dis += delta * delta;
            }
            if (j == 0 || dis < best)
            {
                best = dis;
                cluster[i] = j;
            }
        }
    }
}
86 | 
87 | void updateCentroids()
88 | {
89 |     for(int i=0; i< N; i++)
90 |         for(int j=0; j<D; j++)
91 |             centroids[cluster[i]][j] += data[j][i];
92 |     for(int i=0;i<K;i++)
93 |         for(int j=0;j<D;j++)
94 |             centroids[i][j] /= cntCluster[i];
95 | }
96 | 


--------------------------------------------------------------------------------
/Lab2/ARM/KMeans_neon_cache.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <sys/time.h>
 3 | #include <arm_neon.h>
 4 | 
 5 | const int N = 10000;
 6 | const int D = 2;
 7 | const int K = 4;
 8 | const int L = 100;
 9 | const int LOOP = 100;
10 | 
11 | float data[D][N];  // 数据集  转置
12 | float centroids[K][D];  // 聚类中心
13 | int cluster[N];  // 各数据所属类别
14 | int cntCluster[K];  // 个聚类计数
15 | 
16 | void initData();
17 | void initCentroids();
18 | void calculate_cache();
19 | void updateCentroids();
20 | 
21 | int main()
22 | {
23 |     initData();
24 |     initCentroids();
25 |     struct timeval start;
26 |     struct timeval end;
27 |     gettimeofday(&start,NULL);
28 |     for(int i=0; i < LOOP; i++)
29 |     {
30 |         calculate();
31 |         updateCentroids();
32 |     }
33 |     gettimeofday(&end,NULL);
34 |     cout<<"parallel_cache:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
35 | }
36 | 
37 | void initData()
38 | {
39 |     for(int i=0; i<D; i++)
40 |         for(int j=0; j<N; j++)
41 |             data[i][j] = rand()*1.0/RAND_MAX * L;
42 | }
43 | 
44 | void initCentroids()
45 | {
46 |     for(int i=0; i<K; i++)
47 |         for(int j=0; j<D; j++)
48 |             centroids[i][j] = rand()*1.0/RAND_MAX * L;
49 | }
50 | 
// Cache-friendly NEON assignment step: for each centroid, walks the dataset
// one dimension (row) at a time and accumulates the squared distance of every
// point in dis_k[], then updates the per-point minimum.
//
// The first centroid always initialises min_distance[]; starting it at 0 made
// the "dis < min" test unsatisfiable, so no point was ever assigned. Points
// beyond the last full group of 4 are accumulated with scalar code.
void calculate_cache()
{
    const int vec_end = N - N % 4;
    float min_distance[N] = {0.0};
    for (int j = 0; j < K; j++)
    {
        // Squared distance of every point to centroid j.
        float dis_k[N] = {0.0};
        for (int d = 0; d < D; d++)
        {
            // Dimension d of centroid j in all four lanes (loop invariant).
            const float32x4_t centroid_d = vdupq_n_f32(centroids[j][d]);
            for (int i = 0; i < vec_end; i += 4)
            {
                const float32x4_t data_d = vld1q_f32(&data[d][i]);
                const float32x4_t delta = vsubq_f32(data_d, centroid_d);
                // Load the running sum, add this dimension, store it back.
                float32x4_t distance = vld1q_f32(&dis_k[i]);
                distance = vmlaq_f32(distance, delta, delta);
                vst1q_f32(&dis_k[i], distance);
            }
            for (int i = vec_end; i < N; i++)
            {
                const float delta = data[d][i] - centroids[j][d];
                dis_k[i] += delta * delta;
            }
        }
        // Keep centroid j wherever it is the closest one seen so far.
        for (int i = 0; i < N; i++)
        {
            if (j == 0 || dis_k[i] < min_distance[i])
            {
                min_distance[i] = dis_k[i];
                cluster[i] = j;
            }
        }
    }
}
83 | 
84 | void updateCentroids()
85 | {
86 |     for(int i=0; i< N; i++)
87 |         for(int j=0; j<D; j++)
88 |             centroids[cluster[i]][j] += data[j][i];
89 |     for(int i=0;i<K;i++)
90 |         for(int j=0;j<D;j++)
91 |             centroids[i][j] /= cntCluster[i];
92 | }
93 | 


--------------------------------------------------------------------------------
/Lab2/ARM/KMeans_serial.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <sys/time.h>
 3 | using namespace std;
 4 | 
 5 | const int N = 10000;
 6 | const int D = 2;
 7 | const int K = 4;
 8 | const int L = 100;
 9 | const int LOOP = 100;
10 | 
11 | float data[D][N];  // 数据集
12 | float centroids[K][D];  // 聚类中心
13 | int cluster[N];  // 各数据所属类别
14 | int cntCluster[K];  // 个聚类计数
15 | 
16 | void initData();
17 | void initCentroids();
18 | void calculate();
19 | void updateCentroids();
20 | 
21 | int main()
22 | {
23 |     initData();
24 |     initCentroids();
25 |     struct timeval start;
26 |     struct timeval end;
27 |     gettimeofday(&start,NULL);
28 |     for(int i=0; i < LOOP; i++)
29 |     {
30 |         calculate();
31 |         updateCentroids();
32 |     }
33 |     gettimeofday(&end,NULL);
34 |     cout<<"serial:"<<((end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec))*1.0/1000/LOOP<<"ms"<<endl;
35 |     system("pause");
36 | }
37 | 
38 | void initData()
39 | {
40 |     for(int i=0; i<D; i++)
41 |         for(int j=0; j<N; j++)
42 |             data[i][j] = rand()*1.0/RAND_MAX * L;
43 | }
44 | 
45 | void initCentroids()
46 | {
47 |     for(int i=0; i<K; i++)
48 |         for(int j=0; j<D; j++)
49 |             centroids[i][j] = rand()*1.0/RAND_MAX * L;
50 | }
51 | 
52 | void calculate()
53 | {
54 |     for(int i = 0; i < N; i++)
55 |     {
56 |         float min_dis = L*L;
57 |         for(int j = 0; j < K; j++)
58 |         {
59 |             float dis = 0;
60 |             for(int d = 0; d < D; d++)
61 |                 dis += (data[d][i] - centroids[j][d]) * (data[d][i] - centroids[j][d]);
62 |             if(dis < min_dis)
63 |                 min_dis = dis,cluster[i] = j,cntCluster[j]++;
64 |         }
65 |     }
66 | }
67 | 
68 | void updateCentroids()
69 | {
70 |     for(int i=0; i< N; i++)
71 |         for(int j=0; j<D; j++)
72 |             centroids[cluster[i]][j] += data[j][i];
73 |     for(int i=0;i<K;i++)
74 |         for(int j=0;j<D;j++)
75 |             centroids[i][j] /= cntCluster[i];
76 | }


--------------------------------------------------------------------------------
/Lab2/Lab2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab2/Lab2.pdf


--------------------------------------------------------------------------------
/Lab2/report/Lab2.md:
--------------------------------------------------------------------------------
  1 | # SIMD
  2 | 
  3 | ## 问题重述
  4 | 
  5 | ### 研究任务
  6 | 
  7 | 对于KMeans聚类算法，其基本思想是根据数据点在所有属性上的相似性，将数据点划分成给定的K个集合。在划分的过程中，会涉及到大量有关距离的计算，而这类计算规则简单，大量重复，而且数据之间没有依赖性，十分适合进行并行化处理。在本次基于SIMD的并行优化实验中，我们主要针对在KMeans算法中，每次迭代中对数据点的划分进行并行化处理。
  8 | 
  9 | 每次迭代的过程中主要包含两个过程：
 10 | 
 11 | 1. 对于每个数据点，计算其到各个质心之间的距离
 12 | 2. 针对每个数据点，选择其中距离最近的质心，将其划分到其所属的集合中。
 13 | 
 14 | 而本次实验也正是针对这两个过程，采用SIMD的方式进行并行化处理，探究在不同平台，不同指令集架构下，SIMD对于KMeans算法的并行优化效果。
 15 | 
 16 | ### 数学描述
 17 | 
 18 | 对于给定的$n$组数据$\{x_1,x_2,\cdots, x_n\}$，每个数据属性的维度均为$d$维$x(a_1,a_2,\cdots,a_d)$，通过计算数据之间的相似度，将这$n$个数据划分成为$K$组。假设这$K$组分类记为$S=\{s_1,s_2,\cdots ,s_k\}$，$K$组分类中心的集合记为$C=\{c_1,c_2, \cdots , c_k\}$。
 19 | 
 20 | 针对第$i$个节点，计算这个节点到第$k$个质心的距离
 21 | $$
 22 | L_{ik}=\sqrt{\sum_{j=1}^{d}(x_i(a_j)-c_k(a_j))^2} \ \ \ \ ,1 \leq k \leq K
 23 | $$
  24 | 针对第$i$个节点，选择距离它最近的质心，并将其划分入该质心所属集合中
 25 | $$
 26 | \min\{L_{ik}\},1 \leq k \leq K
 27 | $$
 28 | 
 29 | ## 实验设计
 30 | 
  31 | 考虑到KMeans算法每次迭代过程中，都需要重复计算各个数据点到所有质心的距离，而这个过程，对于每个数据点而言，其计算的方法是相同的，并且不同数据点之间的数据是没有依赖性的，因此，这个过程十分适合进行向量化的处理，因此我们对KMeans的并行优化首先从对各个数据点计算其到所有质心的距离这一过程开始。
 32 | 
 33 | ### 并行处理
 34 | 
 35 | 针对计算距离这一过程，我们考虑采用向量化的手段进行处理，即每次取出多个数据点同一维度的数据存储到向量寄存器当中，再构造某一个质心同一维度的一个向量寄存器，首先利用CPU中有关向量减法的指令，计算出这多个数据点和该质心在这一维度上的距离差，然后将这个距离差利用向量对位乘法相乘，得到平方距离。再循环对这多个数据点在其他维度上重复上述操作，并累加这个距离，直到将这多个数据点各个维度的距离全部计算累加完成。再继续去取第二组数据点重复计算上述过程。
 36 | 
 37 | 为了能够让向量寄存器一次从连续的内存地址中取出多个数据点在某一维度的值，我们考虑将存储数据点和各个维度数值的矩阵进行转置，即用每一行记录$N$个数据点在某一维度上的数据，因此转置后的矩阵为$D$行$N$列。这样，我们在取出多个数据点同一维度的数据时，就不需要进行通道取值或者数据拼接，而是直接传入起始元素的地址即可。这样能够提高我们并行算法的优化效率。
 38 | 
 39 | 而对于如何高效筛选数据点所属质心，考虑采用控制流的方式，将每一轮计算出来的数据点到某一质心的距离与存放最近质心距离的数组做差，根据这个差值是否为零来建立控制流，根据这个控制流来筛选是否要更新最近的质心距离和最近的质心标号。
 40 | 
 41 | ### cache优化
 42 | 
  43 | 在初步的并行处理中，我们是每次取出同一行中多个数据点某一维度的数据，接下来是对这多个数据点的各个维度上的数据重复计算距离质心距离的操作。这种操作方式，访问时是按照列去访问的，即当前维度的数据计算完之后，将会到下一行去取下一个维度的数据重复计算操作。当数据规模比较大的时候，这种操作会导致大量cache miss，因此会产生很多访存操作，这会极大影响并行算法的优化效果。
 44 | 
 45 | 因此我们考虑对初步的并行算法进行cache优化，即尽可能使得数据能够在缓存中命中，来减少访存导致的额外开销。对于并行算法的cache优化，我们考虑改变循环的顺序，最外层循环是对维度的迭代，内层循环是对数据的迭代。即每一次内层循环会将所有数据点某一维度与质心之间的距离全部计算出来，然后在外层循环中去迭代各个维度的数据。
 46 | 
 47 | 这样的改进方式，使得内层循环在每一步中都是相邻的移动，不会出现跨行的数据读取，这样就会提高数据的cache命中率，进而能够进一步优化并行算法的效率。
 48 | 
 49 | ### 内存地址对齐
 50 | 
 51 | 考虑到对数据进行向量化存取的时候，如果取数据的原始内存不是按照存取规模的大小内存对齐的，则会导致在每一次数据向量化读取的时候，由于内存不对齐，而需要进行两次内存访问，然后将得到的数据再进行一次拼接，返回给向量寄存器。当数据量很大的时候，这种额外的开销就会十分明显。
 52 | 
 53 | 因此考虑对并行算法进行内存对齐。为了能够简化算法实现，并且还能够对比出内存对齐与否对并行算法的影响，我们将数据规模全部设置为16的倍数，这样只需要对数组的第一行做一次对齐处理即可。因为每一行是紧密排列的，因此当对第一行进行对齐处理之后，后续其他行做对应位置的存取时，内存也是对齐的。
 54 | 
 55 | 对于内存地址的对齐，首先判断存放数据数组的每一行的首地址是否是对齐的，如果是对齐的则可以不做任何处理。如果不对齐，则对该行起始的$m$个未对齐的元素进行串行化处理，由于在这个过程中，后续行的起始$m$个元素也会同样做串行处理，因此在此之后，每一行的剩余部分都是地址对齐的。此外，对于开头做串行处理之后，还可能会导致每一行剩余的元素不足，因此需要对每一行剩余的元素同样做串行处理，保证算法不遗漏数据。
 56 | 
 57 | ### x86平台
 58 | 
 59 | 本次实验除了对ARM架构下采用neon指令集架构对KMeans算法进行并行化处理，还将算法迁移到了x86平台上，采用x86中的SSE、AVX和AVX512指令集架构分别对算法进行重构，然后对比实验效果。
 60 | 
 61 | 考察了本机所支持的指令集架构，情况如表所示
 62 | 
 63 | 虽然本机只支持AVX512F，但是AVX512F是AVX512的基本子集，能够支持基本的向量化计算，因此在本次实验中能够支持AVX512指令集架构的实验。
 64 | 
 65 | ##  实验分析
 66 | 
 67 | ### ARM平台
 68 | 
 69 | 考虑到在我们的实验数据中总共有两个影响因素，一个是数据点的数量$N$，另一个因素是数据的维度$D$，因此当考虑并行算法的优化效果的时候，需要综合考虑这两方面的因素。为了能够对比出在不同数据规模和不同数据维度下，并行算法的优化效果，我们采用控制变量的方式，分别研究并行算法的优化效果随着数据规模和数据维度两个因素的变化情况。
 70 | 
 71 | 为了能够保证实验设计在后续内存对齐方面研究相对简单，我们将数据规模全部设定为$4^n$，这样在研究内存是否对齐对实验效果的影响时，各行的内存对齐情况是完全一致的，不需要对每一行的数据进行单独的内存对齐调整。为了数据的规整，我们将数据的维度数值也设定为$4^d$。
 72 | 
 73 | 分别调整$n$和$d$的取值，研究串行算法、普通并行算法、cache优化算法以及内存对齐算法的时间表现情况，具体分析如下。
 74 | 
 75 | #### 数据规模角度
 76 | 
 77 | 选定数据维度为$4^2$时，各种优化算法表现效果随数据规模$n$的变化趋势如表所示，根据所得的实验数据，计算出各种优化算法的加速比，如表所示。
 78 | 
  79 | <img src="n_sp.jpg" alt="n_sp" style="zoom:50%;" />
 80 | 
  81 | 可以发现，在较小数据规模下，并行算法的优化效果并不明显，这是因为并行算法虽然能够同时并行处理多条数据，但是每一条指令所消耗的时间也更长，因此在较小的数据规模之下，这种优化效果并不明显。尤其是对于cache优化的并行算法，由于cache优化算法所需要的指令数比较多，因此在较小规模的问题上反倒是表现效果更差。当数据规模达到$4^8$以上时，三种并行优化算法都能够取得十分明显的优化效果。这符合我们常规的认知。通过perf对三种方法的相关数据进行分析，所得数据如下表所示。
 82 | 
 83 | 从表格中可以看出，对于未进行cache优化的两个并行算法，其优化主要是减少了程序运行的所需要的cycle，因为其一次可以取出多条数据进行计算，因此其所需要的周期数自然要少于串行算法，这也是普通并行优化算法能够取得性能提升的原因。而对于内存是否对齐，我们可以看到，内存对齐的算法由于不涉及到数据的拼接，因此其所需要的指令数会略低于内存不对齐的算法，这也是其性能要略高于内存未对齐的算法的原因。注意到，利用了cache优化的算法当数据规模增大时，具有明显的优化效果，通过观察perf的结果可以得到，在相同的数据规模下，未采用cache优化的并行算法，其L1 cache未命中的概率要远远高于利用cache优化的算法，因此随着问题规模的增加，不采用cache优化的算法会遇到性能瓶颈，而采用了cache优化的算法能够取得更好的性能提升。
 84 | 
 85 | #### 数据维度角度
 86 | 
 87 | 选定数据规模为$4^8$时，各种优化算法表现效果随数据维度$d$的变化趋势如表所示，根据所得的实验数据，计算出各种优化算法的加速比，如表所示。
 88 | 
  89 | <img src="d_sp.jpg" alt="d_sp" style="zoom:50%;" />
 90 | 
  91 | 由于我们选定的数据规模已经达到了一定程度，所以在较小的问题规模下，三种并行优化算法都能够取得不错的优化效果。但是随着数据维度的指数级增加，未采用cache优化的两种算法在总体的增长趋势和幅度上大致相同，但是采用了cache优化的算法却能够取得极高的加速比。未采用cache优化的两种算法其加速比基本已经达到了理论上的4倍，原因是同时进行了四路展开。而采用了cache优化的算法却能够取得远超过4倍的加速比，这是因为cache优化能够极大的提高数据在缓存中的命中率。而且在本次实验的算法设计中，行存储的是$N$个数据同一维度的数据，而每一列存储的是同一个数据点不同维度的值。因此当数据规模较大，且数据维度较大的时候，每一次访问下一行的维度值时，都可能导致cache miss，而cache优化的算法是按照行进行计算的，因此cache miss率比较低。这种效应当数据维度较高时会更加明显。通过perf对程序执行过程中的具体指标进行观测，如下表所示。
 92 | 
  93 | 从表中的数据我们也能够看出，内存对齐和内存不对齐的算法其差异主要体现在指令数上，由于内存不对齐导致数据拼接会产生额外的指令。而cache优化后，可以发现，其cache命中率是远远高于串行算法和另外两种未进行cache优化的算法。而这三种算法的加速比表现均要高于对数据规模探索的实验结果，其原因主要是由于增加了外层循环数，导致串行算法所需要的时钟周期大大增加，而这对并行算法的影响不大，因此三种并行算法的加速比均有较显著的提升。
 94 | 
 95 | ### x86平台
 96 | 
 97 | 在x86平台上，主要尝试了SSE、AVX和AVX512这三种并行指令集架构，编写了不同的串行算法和采用这三种并行架构的并行算法，并且对比了这三种架构下数据是否内存对齐对于并行算法的优化效果的影响。
 98 | 
 99 | 在x86平台下的实验设计思路基本与ARM平台一致，即采用向量化的存取手段，一次性存取多个数据元素，准备好连续多个数据某一维度的数值作为一个向量寄存器的数值，再构造一个某一质心该维度对应的一个向量寄存器，然后利用向量化的运算，首先进行对位相减，然后再将差对位相乘，在每一维度累加这个操作的结果，就可以得到这些数据点到某一质心的距离。再通过控制流来判断是否要更新这些数据点所属的划分以及到最近质心的距离。
100 | 
101 | 为了能够充分体现出并行算法的优化效果，进而能够去横向对比不同指令集架构的优化效果，在实验测试时，直接选取了较大的数据规模和较高的数据维度进行测试，当数据规模为$4^6$，数据维度为$4^8$时，各种指令集架构下的并行算法优化效果如下表所示
102 | 
103 | 通过数据对比可以发现，由于采取的指令集不同，随着向量寄存器所能容纳的数据增多，并行算法的优化效果也在不断提升。这主要是由于采用向量化的优化，能够一次性处理多条数据，当数据规模和数据维度足够大的时候，各种并行算法的理论加速比应该逼近其向量寄存器一次性能够处理的数据的数量。对于内存是否对齐的算法，在本次实验测试中的效果并不明显，但是也能够看出内存对齐的并行算法在各种指令集架构下，均较内存未对齐的并行算法有更好的效果表现。
104 | 
105 | 实验使用了VTune性能分析工具对本次实验结果的所对应的CPU硬件事件进行进一步的分析，其具体数据如下表所示。
106 | 
107 | 通过对比指令数可以发现，三种并行优化算法在指令数上均远远小于串行算法，并且其指令数与串行算法的比值也基本接近加速比，因此可以得出并行算法的优化效果主要体现在能够减少指令数上。对比了对齐和不对齐算法的差异，也主要体现在了算法所需要的指令数上，由于不对齐算法需要进行数据的拼接，因此其指令数均略高于内存对齐的算法。
108 | 
109 | ## 总结
110 | 
111 | 在本次SIMD的并行实验中，对KMeans算法进行了并行化处理，主要针对算法中的两部分进行优化：第一部分是在每次循环迭代中对所有的数据点计算到各个质心的距离，第二部分是对每个数据点筛选距离其最近的质心进行划分。对这两部分分别进行了向量化的处理，并辅以控制流进行选择控制。实验不仅在ARM平台上完成了Neon指令集下的并行算法，对比了内存对齐与内存不对齐算法的性能表现，并且对该算法进行了cache优化，取得了显著的效果提升。此外，在本次实验中还尝试了在x86平台上，选择SSE、AVX和AVX512指令集架构重构KMeans并行算法，并且对比了不同指令集下内存对齐与内存不对齐算法的性能表现。通过使用perf和VTune等性能分析工具，进一步分析并行算法能够取得性能提升的深层原因，也能够对比发现内存对齐算法和内存不对齐算法之间产生性能差异的原因。本次实验的相关代码和文档已经上传至
112 | 


--------------------------------------------------------------------------------
/Lab2/report/d_sp.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab2/report/d_sp.jpg


--------------------------------------------------------------------------------
/Lab2/report/n_sp.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab2/report/n_sp.jpg


--------------------------------------------------------------------------------
/Lab2/x86/KMeans_SIMD.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <xmmintrin.h> //SSE
  3 | #include <emmintrin.h> //SSE2
  4 | #include <pmmintrin.h> //SSE3
  5 | #include <tmmintrin.h> //SSSE3
  6 | #include <smmintrin.h> //SSE4.1
  7 | #include <nmmintrin.h> //SSSE4.2
  8 | #include <immintrin.h> //AVX、AVX2
  9 | #include <windows.h>
 10 | using namespace std;
 11 | 
 12 | const int N = 16 * 16 * 16;
 13 | const int D = 16 * 16 * 16 * 16;
 14 | const int K = 4;
 15 | const int L = 100;
 16 | const int LOOP = 1;
 17 | 
 18 | float **data; // 数据集
 19 | float **data_align;
 20 | float centroids[K][D]; // 聚类中心
 21 | int cluster[N];        // 各数据所属类别
 22 | int cntCluster[K];     // 个聚类计数
 23 | 
 24 | void initData();
 25 | void initCentroids();
 26 | void calculate_serial();
 27 | void calculate_SSE_aligned();
 28 | void calculate_SSE_unaligned();
 29 | void calculate_AVX_aligned();
 30 | void calculate_AVX_unaligned();
 31 | void calculate_AVX512_aligned();
 32 | void calculate_AVX512_unaligned();
 33 | void updateCentroids();
 34 | 
 35 | int main()
 36 | {
 37 |     long long int head, tail, freq;
 38 |     initData();
 39 |     initCentroids();
 40 |     QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
 41 |     QueryPerformanceCounter((LARGE_INTEGER *)&head);
 42 |     for (int i = 0; i < LOOP; i++)
 43 |     {
 44 |         calculate_serial();
 45 |         // updateCentroids();
 46 |     }
 47 |     QueryPerformanceCounter((LARGE_INTEGER *)&tail);
 48 |     cout << "calculate_serial: " << (tail - head) * 1000.0 / freq / LOOP << endl;
 49 | 
 50 |     initData();
 51 |     initCentroids();
 52 |     QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
 53 |     QueryPerformanceCounter((LARGE_INTEGER *)&head);
 54 |     for (int i = 0; i < LOOP; i++)
 55 |     {
 56 |         calculate_SSE_aligned();
 57 |         // updateCentroids();
 58 |     }
 59 |     QueryPerformanceCounter((LARGE_INTEGER *)&tail);
 60 |     cout << "calculate_SSE_aligned: " << (tail - head) * 1000.0 / freq / LOOP << endl;
 61 | 
 62 |     initData();
 63 |     initCentroids();
 64 |     QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
 65 |     QueryPerformanceCounter((LARGE_INTEGER *)&head);
 66 |     for (int i = 0; i < LOOP; i++)
 67 |     {
 68 |         calculate_SSE_unaligned();
 69 |         // updateCentroids();
 70 |     }
 71 |     QueryPerformanceCounter((LARGE_INTEGER *)&tail);
 72 |     cout << "calculate_SSE_unaligned: " << (tail - head) * 1000.0 / freq / LOOP << endl;
 73 | 
 74 |     initData();
 75 |     initCentroids();
 76 |     QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
 77 |     QueryPerformanceCounter((LARGE_INTEGER *)&head);
 78 |     for (int i = 0; i < LOOP; i++)
 79 |     {
 80 |         calculate_AVX_aligned();
 81 |         // updateCentroids();
 82 |     }
 83 |     QueryPerformanceCounter((LARGE_INTEGER *)&tail);
 84 |     cout << "calculate_AVX_aligned: " << (tail - head) * 1000.0 / freq / LOOP << endl;
 85 | 
 86 |     initData();
 87 |     initCentroids();
 88 |     QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
 89 |     QueryPerformanceCounter((LARGE_INTEGER *)&head);
 90 |     for (int i = 0; i < LOOP; i++)
 91 |     {
 92 |         calculate_AVX_unaligned();
 93 |         // updateCentroids();
 94 |     }
 95 |     QueryPerformanceCounter((LARGE_INTEGER *)&tail);
 96 |     cout << "calculate_AVX_unaligned: " << (tail - head) * 1000.0 / freq / LOOP << endl;
 97 | 
 98 |     initData();
 99 |     initCentroids();
100 |     QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
101 |     QueryPerformanceCounter((LARGE_INTEGER *)&head);
102 |     for (int i = 0; i < LOOP; i++)
103 |     {
104 |         calculate_AVX512_aligned();
105 |         // updateCentroids();
106 |     }
107 |     QueryPerformanceCounter((LARGE_INTEGER *)&tail);
108 |     cout << "calculate_AVX512_aligned: " << (tail - head) * 1000.0 / freq / LOOP << endl;
109 | 
110 |     initData();
111 |     initCentroids();
112 |     QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
113 |     QueryPerformanceCounter((LARGE_INTEGER *)&head);
114 |     for (int i = 0; i < LOOP; i++)
115 |     {
116 |         calculate_AVX512_unaligned();
117 |         // updateCentroids();
118 |     }
119 |     QueryPerformanceCounter((LARGE_INTEGER *)&tail);
120 |     cout << "calculate_AVX512_unaligned: " << (tail - head) * 1000.0 / freq / LOOP << endl;
121 | 
122 |     system("pause");
123 | }
124 | 
125 | void initData()
126 | {
127 |     data = new float *[D];
128 |     data_align = (float **)malloc(sizeof(float *) * D);
129 |     for (int i = 0; i < D; i++)
130 |         data[i] = new float[N + 1], data_align[i] = (float *)_aligned_malloc(sizeof(float) * N, 64);
131 |     for (int i = 0; i < D; i++)
132 |         for (int j = 0; j < N; j++)
133 |             data[i][j + 1] = rand() * 1.0 / RAND_MAX * L, data_align[i][j] = rand() * 1.0 / RAND_MAX * L;
134 | }
135 | 
136 | void initCentroids()
137 | {
138 |     for (int i = 0; i < K; i++)
139 |         for (int j = 0; j < D; j++)
140 |             centroids[i][j] = rand() * 1.0 / RAND_MAX * L;
141 | }
142 | 
143 | void updateCentroids()
144 | {
145 |     for (int i = 0; i < N; i++)
146 |         for (int j = 0; j < D; j++)
147 |             centroids[cluster[i]][j] += data[j][i];
148 |     for (int i = 0; i < K; i++)
149 |         for (int j = 0; j < D; j++)
150 |             centroids[i][j] /= cntCluster[i];
151 | }
152 | 
153 | void calculate_serial()
154 | {
155 |     for (int i = 1; i < N + 1; i++)
156 |     {
157 |         float min_dis = L * L;
158 |         for (int j = 0; j < K; j++)
159 |         {
160 |             float dis = 0;
161 |             for (int d = 0; d < D; d++)
162 |                 dis += (data[d][i] - centroids[j][d]) * (data[d][i] - centroids[j][d]);
163 |             if (dis < min_dis)
164 |                 min_dis = dis, cluster[i] = j, cntCluster[j]++;
165 |         }
166 |     }
167 | }
168 | 
169 | void calculate_SSE_aligned()
170 | {
171 |     float min_distance[N] = {0.0};
172 |     for (int j = 0; j < K; j++)
173 |     {
174 |         // 各个点到各个聚类中心的距离
175 |         float dis_k[N] = {0.0};
176 |         for (int d = 0; d < D; d++)
177 |         {
178 |             for (int i = 0; i < N - N % 4; i += 4)
179 |             {
180 |                 // 取出原始积累的距离
181 |                 __m128 distance = _mm_loadu_ps(&dis_k[i]);
182 |                 // 构造质心的某一维度数据
183 |                 float tmp_centroid_d[4] = {centroids[j][d]};
184 |                 __m128 centroid_d = _mm_loadu_ps(tmp_centroid_d);
185 |                 // 一次取出四个元素的某一维度数据
186 |                 __m128 data_d = _mm_load_ps(&data_align[d][i]);
187 |                 // 对每一数据该维度计算差值
188 |                 __m128 delta = _mm_sub_ps(data_d, centroid_d);
189 |                 // 对每一数据该维度累加距离
190 |                 distance = _mm_add_ps(distance, _mm_mul_ps(delta, delta));
191 |                 // 存回
192 |                 _mm_storeu_ps(&dis_k[i], distance);
193 |             }
194 |         }
195 |         // 判断当前的每一个数据到该质心的距离是否是最小的
196 |         for (int i = 0; i < N; i++)
197 |             if (dis_k[i] < min_distance[i])
198 |                 min_distance[i] = dis_k[i], cluster[i] = j;
199 |     }
200 | }
201 | 
202 | void calculate_SSE_unaligned()
203 | {
204 |     float min_distance[N + 1] = {0.0};
205 |     for (int j = 0; j < K; j++)
206 |     {
207 |         // 各个点到各个聚类中心的距离
208 |         float dis_k[N] = {0.0};
209 |         for (int d = 0; d < D; d++)
210 |         {
211 |             for (int i = 1; i < N - N % 4 + 1; i += 4)
212 |             {
213 |                 // 取出原始积累的距离
214 |                 __m128 distance = _mm_loadu_ps(&dis_k[i]);
215 |                 // 构造质心的某一维度数据
216 |                 float tmp_centroid_d[4] = {centroids[j][d]};
217 |                 __m128 centroid_d = _mm_loadu_ps(tmp_centroid_d);
218 |                 // 一次取出四个元素的某一维度数据
219 |                 __m128 data_d = _mm_loadu_ps(&data[d][i]);
220 |                 // 对每一数据该维度计算差值
221 |                 __m128 delta = _mm_sub_ps(data_d, centroid_d);
222 |                 // 对每一数据该维度累加距离
223 |                 distance = _mm_add_ps(distance, _mm_mul_ps(delta, delta));
224 |                 // 存回
225 |                 _mm_storeu_ps(&dis_k[i], distance);
226 |             }
227 |         }
228 |         // 判断当前的每一个数据到该质心的距离是否是最小的
229 |         for (int i = 1; i < N + 1; i++)
230 |             if (dis_k[i] < min_distance[i])
231 |                 min_distance[i] = dis_k[i], cluster[i] = j;
232 |     }
233 | }
234 | 
235 | void calculate_AVX_aligned()
236 | {
237 |     float min_distance[N] = {0.0};
238 |     for (int j = 0; j < K; j++)
239 |     {
240 |         // 各个点到各个聚类中心的距离
241 |         float dis_k[N] = {0.0};
242 |         for (int d = 0; d < D; d++)
243 |         {
244 |             for (int i = 0; i < N - N % 8; i += 8)
245 |             {
246 |                 // 取出原始积累的距离
247 |                 __m256 distance = _mm256_loadu_ps(&dis_k[i]);
248 |                 // 构造质心的某一维度数据
249 |                 float tmp_centroid_d[8] = {centroids[j][d]};
250 |                 __m256 centroid_d = _mm256_loadu_ps(tmp_centroid_d);
251 |                 // 一次取出四个元素的某一维度数据
252 |                 __m256 data_d = _mm256_load_ps(&data_align[d][i]);
253 |                 // 对每一数据该维度计算差值
254 |                 __m256 delta = _mm256_sub_ps(data_d, centroid_d);
255 |                 // 对每一数据该维度累加距离
256 |                 distance = _mm256_add_ps(distance, _mm256_mul_ps(delta, delta));
257 |                 // 存回
258 |                 _mm256_storeu_ps(&dis_k[i], distance);
259 |             }
260 |         }
261 |         // 判断当前的每一个数据到该质心的距离是否是最小的
262 |         for (int i = 0; i < N; i++)
263 |             if (dis_k[i] < min_distance[i])
264 |                 min_distance[i] = dis_k[i], cluster[i] = j;
265 |     }
266 | }
267 | 
268 | void calculate_AVX_unaligned()
269 | {
270 |     float min_distance[N + 1] = {0.0};
271 |     for (int j = 0; j < K; j++)
272 |     {
273 |         // 各个点到各个聚类中心的距离
274 |         float dis_k[N] = {0.0};
275 |         for (int d = 0; d < D; d++)
276 |         {
277 |             for (int i = 1; i < N - N % 8 + 1; i += 8)
278 |             {
279 |                 // 取出原始积累的距离
280 |                 __m256 distance = _mm256_loadu_ps(&dis_k[i]);
281 |                 // 构造质心的某一维度数据
282 |                 float tmp_centroid_d[8] = {centroids[j][d]};
283 |                 __m256 centroid_d = _mm256_loadu_ps(tmp_centroid_d);
284 |                 // 一次取出四个元素的某一维度数据
285 |                 __m256 data_d = _mm256_loadu_ps(&data[d][i]);
286 |                 // 对每一数据该维度计算差值
287 |                 __m256 delta = _mm256_sub_ps(data_d, centroid_d);
288 |                 // 对每一数据该维度累加距离
289 |                 distance = _mm256_add_ps(distance, _mm256_mul_ps(delta, delta));
290 |                 // 存回
291 |                 _mm256_storeu_ps(&dis_k[i], distance);
292 |             }
293 |         }
294 |         // 判断当前的每一个数据到该质心的距离是否是最小的
295 |         for (int i = 1; i < N + 1; i++)
296 |             if (dis_k[i] < min_distance[i])
297 |                 min_distance[i] = dis_k[i], cluster[i] = j;
298 |     }
299 | }
300 | 
301 | void calculate_AVX512_aligned()
302 | {
303 |     // Assign every point to its nearest centroid, 16 points per AVX-512 step, reading data_align.
304 |     float min_distance[N] = {0.0};
305 |     for (int j = 0; j < K; j++)
306 |     {
307 |         // Squared distance of every point to centroid j, accumulated one dimension at a time
308 |         float dis_k[N] = {0.0};
309 |         for (int d = 0; d < D; d++)
310 |         {
311 |             // Broadcast the coordinate to all 16 lanes; an array initializer would fill lane 0 only
312 |             __m512 centroid_d = _mm512_set1_ps(centroids[j][d]);
313 |             int i = 0;
314 |             for (; i + 16 <= N; i += 16)
315 |             {
316 |                 __m512 distance = _mm512_loadu_ps(&dis_k[i]);
317 |                 // Aligned load: assumes data_align rows are 64-byte aligned -- TODO confirm at the declaration
318 |                 __m512 data_d = _mm512_load_ps(&data_align[d][i]);
319 |                 __m512 delta = _mm512_sub_ps(data_d, centroid_d);
320 |                 distance = _mm512_add_ps(distance, _mm512_mul_ps(delta, delta));
321 |                 _mm512_storeu_ps(&dis_k[i], distance);
322 |             }
323 |             // Scalar tail for the N % 16 points the vector loop cannot cover
324 |             for (; i < N; i++)
325 |                 dis_k[i] += (data_align[d][i] - centroids[j][d]) * (data_align[d][i] - centroids[j][d]);
326 |         }
327 |         // Centroid 0 seeds the minimum; later centroids replace it only when strictly closer
328 |         for (int i = 0; i < N; i++)
329 |             if (j == 0 || dis_k[i] < min_distance[i])
330 |                 min_distance[i] = dis_k[i], cluster[i] = j;
331 |     }
332 | }
333 | 
334 | void calculate_AVX512_unaligned()
335 | {
336 |     // Same assignment step on the 1-based range [1, N] of data, using unaligned loads throughout.
337 |     float min_distance[N + 1] = {0.0};
338 |     for (int j = 0; j < K; j++)
339 |     {
340 |         // Indexed 1..N, hence N + 1 slots (with N slots the last vector store ran past the array)
341 |         float dis_k[N + 1] = {0.0};
342 |         for (int d = 0; d < D; d++)
343 |         {
344 |             // Broadcast the coordinate to all 16 lanes; an array initializer would fill lane 0 only
345 |             __m512 centroid_d = _mm512_set1_ps(centroids[j][d]);
346 |             int i = 1;
347 |             for (; i + 16 <= N + 1; i += 16)
348 |             {
349 |                 __m512 distance = _mm512_loadu_ps(&dis_k[i]);
350 |                 // NOTE(review): reads data[d][N]; assumes each row holds N + 1 floats -- confirm in initData
351 |                 __m512 data_d = _mm512_loadu_ps(&data[d][i]);
352 |                 __m512 delta = _mm512_sub_ps(data_d, centroid_d);
353 |                 distance = _mm512_add_ps(distance, _mm512_mul_ps(delta, delta));
354 |                 _mm512_storeu_ps(&dis_k[i], distance);
355 |             }
356 |             // Scalar tail for the N % 16 points the vector loop cannot cover
357 |             for (; i < N + 1; i++)
358 |                 dis_k[i] += (data[d][i] - centroids[j][d]) * (data[d][i] - centroids[j][d]);
359 |         }
360 |         // Centroid 0 seeds the minimum. NOTE(review): cluster[N] is written; confirm cluster has N + 1 slots
361 |         for (int i = 1; i < N + 1; i++)
362 |             if (j == 0 || dis_k[i] < min_distance[i])
363 |                 min_distance[i] = dis_k[i], cluster[i] = j;
364 |     }
365 | }
366 | 


--------------------------------------------------------------------------------
/Lab2/x86/KMeans_serial.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <xmmintrin.h> //SSE
 3 | #include <emmintrin.h> //SSE2
 4 | #include <pmmintrin.h> //SSE3
 5 | #include <tmmintrin.h> //SSSE3
 6 | #include <smmintrin.h> //SSE4.1
 7 | #include <nmmintrin.h> //SSSE4.2
 8 | #include <immintrin.h> //AVX、AVX2
 9 | #include<windows.h>
10 | 
11 | using namespace std;
12 | 
13 | const int N = 4*4*4*4*4*4*4*4;  // number of data points (4^8)
14 | const int D = 4;  // dimensions per point
15 | const int K = 4;  // number of centroids
16 | const int L = 100;  // upper bound of every randomly generated coordinate
17 | const int LOOP = 1;  // number of timed repetitions
18 | 
19 | float **data;  // data set, laid out as data[dimension][point]
20 | float data_align[D][N];  // statically allocated second data set, filled with its own random values
21 | float centroids[K][D];  // cluster centers
22 | int cluster[N];  // cluster index assigned to each point
23 | int cntCluster[K];  // per-cluster counters
24 | 
25 | void initData();
26 | void initCentroids();
27 | void calculate_serial();
28 | void updateCentroids();
29 | 
30 | int main()
31 | {
32 |     // Times LOOP runs of the serial assignment step and prints the mean duration in milliseconds.
33 |     long long int ticksStart, ticksStop, ticksPerSecond;
34 |     initData();
35 |     initCentroids();
36 |     QueryPerformanceFrequency((LARGE_INTEGER*)&ticksPerSecond);
37 |     QueryPerformanceCounter((LARGE_INTEGER*)&ticksStart);
38 |     for (int run = 0; run < LOOP; ++run)
39 |         calculate_serial();  // the centroid update stays disabled: only the assignment step is timed
40 |     QueryPerformanceCounter((LARGE_INTEGER*)&ticksStop);
41 |     // ticks -> milliseconds, averaged over the repetitions
42 |     const long long int elapsedTicks = ticksStop - ticksStart;
43 |     cout << elapsedTicks * 1000.0 / ticksPerSecond / LOOP << endl;
44 |     system("pause");
45 | }
46 | 
47 | void initData()
48 | {
49 |     data = new float*[D];  // D row pointers, one row per dimension
50 |     for (int dim = 0; dim < D; ++dim) {
51 |         data[dim] = new float[N];
52 |         for (int pt = 0; pt < N; ++pt)  // data first, then data_align: keeps the rand() call order
53 |             data[dim][pt] = rand()*1.0/RAND_MAX * L, data_align[dim][pt] = rand()*1.0/RAND_MAX * L;
54 |     }
55 | }
56 | 
57 | void initCentroids()
58 | {
59 |     for (int c = 0; c < K; ++c)  // each coordinate is rand() scaled into [0, L]
60 |         for (int dim = 0; dim < D; ++dim)
61 |             centroids[c][dim] = rand()*1.0/RAND_MAX * L;
62 | }
63 | 
64 | void calculate_serial()
65 | {
66 |     for(int j = 0; j < K; j++) cntCluster[j] = 0;  // cluster sizes are recounted on every assignment pass
67 |     for(int i = 0; i < N; i++)
68 |     {
69 |         float min_dis = 0;
70 |         for(int j = 0; j < K; j++)
71 |         {
72 |             float dis = 0;  // squared Euclidean distance from point i to centroid j
73 |             for(int d = 0; d < D; d++)
74 |                 dis += (data[d][i] - centroids[j][d]) * (data[d][i] - centroids[j][d]);
75 |             if(j == 0 || dis < min_dis) min_dis = dis, cluster[i] = j;  // centroid 0 seeds the minimum
76 |         }
77 |         cntCluster[cluster[i]]++;  // count the final winner only, not every intermediate minimum
78 |     }
79 | }
79 | 
80 | void updateCentroids()
81 | {
82 |     // Update step: move each centroid to the mean of the points currently assigned to it.
83 |     float sum[K][D] = {{0}};  // fresh accumulator, so the old centroid position is not mixed into the mean
84 |     for(int i=0; i< N; i++)
85 |         for(int j=0; j<D; j++)
86 |             sum[cluster[i]][j] += data[j][i];
87 |     for(int i=0;i<K;i++)
88 |         for(int j=0;j<D;j++)
89 |             if(cntCluster[i] > 0)  // an empty cluster keeps its centroid instead of dividing by zero
90 |                 centroids[i][j] = sum[i][j] / cntCluster[i];
91 | }


--------------------------------------------------------------------------------
/Lab3/ARM/Gauss_arm.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <sys/time.h>
  3 | #include <arm_neon.h>
  4 | #include <pthread.h>
  5 | #include <semaphore.h>
  6 | using namespace std;
  7 | 
  8 | //------------------------------------------ thread control variables ------------------------------------------
  9 | typedef struct
 10 | {
 11 |     int t_id;  // thread index, 0 .. THREAD_NUM - 1
 12 | } threadParam_t;
 13 | 
 14 | sem_t sem_Division;  // posted by thread 0 after the division step of each round
 15 | pthread_barrier_t barrier;  // all threads meet here at the end of each round
 16 | 
 17 | const int THREAD_NUM = 8;
 18 | 
 19 | // ------------------------------------------ global computation variables ------------------------------------------
 20 | const int N = 10;  // matrix dimension
 21 | const int L = 100;  // upper bound of the random values generated in init_data
 22 | const int LOOP = 1;  // timed repetitions per algorithm
 23 | float data[N][N];  // reference input, generated once
 24 | float matrix[N][N];  // working copy that the algorithms eliminate in place
 25 | 
 26 | void init_data();
 27 | void init_matrix();
 28 | void calculate_serial();
 29 | void calculate_neon();
 30 | void calculate_pthread();
 31 | void print_matrix();
 32 | 
 33 | int main()
 34 | {
 35 |     struct timeval t_begin;
 36 |     struct timeval t_end;
 37 |     float total_ms = 0;
 38 |     init_data();
 39 |     // ---------------------------------- serial baseline ----------------------------------
 40 |     total_ms = 0;
 41 |     for (int run = 0; run < LOOP; ++run)
 42 |     {
 43 |         init_matrix();  // untimed: restore the input before every run
 44 |         gettimeofday(&t_begin, NULL);
 45 |         calculate_serial();
 46 |         gettimeofday(&t_end, NULL);
 47 |         total_ms += ((t_end.tv_sec - t_begin.tv_sec) * 1000000 + (t_end.tv_usec - t_begin.tv_usec)) * 1.0 / 1000;
 48 |     }
 49 |     cout << "serial:" << total_ms / LOOP << "ms" << endl;
 50 |     // ---------------------------------- NEON ----------------------------------
 51 |     total_ms = 0;
 52 |     for (int run = 0; run < LOOP; ++run)
 53 |     {
 54 |         init_matrix();
 55 |         gettimeofday(&t_begin, NULL);
 56 |         calculate_neon();
 57 |         gettimeofday(&t_end, NULL);
 58 |         total_ms += ((t_end.tv_sec - t_begin.tv_sec) * 1000000 + (t_end.tv_usec - t_begin.tv_usec)) * 1.0 / 1000;
 59 |     }
 60 |     cout << "neon:" << total_ms / LOOP << "ms" << endl;
 61 |     print_matrix();
 62 |     // ---------------------------------- pthread + NEON ----------------------------------
 63 |     total_ms = 0;
 64 |     for (int run = 0; run < LOOP; ++run)
 65 |     {
 66 |         init_matrix();
 67 |         gettimeofday(&t_begin, NULL);
 68 |         calculate_pthread();
 69 |         gettimeofday(&t_end, NULL);
 70 |         total_ms += ((t_end.tv_sec - t_begin.tv_sec) * 1000000 + (t_end.tv_usec - t_begin.tv_usec)) * 1.0 / 1000;
 71 |     }
 72 |     cout << "pthread:" << total_ms / LOOP << "ms" << endl;
 73 |     print_matrix();
 74 | }
 75 | 
 76 | // Build the reference matrix once: a random upper triangle, then every row added into all rows below it.
 77 | void init_data()
 78 | {
 79 |     for (int row = 0; row < N; ++row)
 80 |         for (int col = row; col < N; ++col)
 81 |             data[row][col] = rand() * 1.0 / RAND_MAX * L;
 82 |     for (int src = 0; src + 1 < N; ++src)
 83 |         for (int dst = src + 1; dst < N; ++dst)
 84 |             for (int col = 0; col < N; ++col)
 85 |                 data[dst][col] += data[src][col];
 86 | }
 87 | 
 88 | // Reset matrix to the contents of data so that every timed run starts from identical input.
 89 | void init_matrix()
 90 | {
 91 |     for (int row = 0; row < N; ++row)
 92 |         for (int col = 0; col < N; ++col)
 93 |             matrix[row][col] = data[row][col];
 94 | }
 95 | 
 96 | // Serial Gaussian elimination without pivoting; matrix becomes unit upper-triangular in place.
 97 | void calculate_serial()
 98 | {
 99 |     for (int pivot = 0; pivot < N; ++pivot)
100 |     {
101 |         float *pivot_row = matrix[pivot];
102 |         // Normalise the pivot row to the right of the diagonal
103 |         for (int col = pivot + 1; col < N; ++col)
104 |             pivot_row[col] = pivot_row[col] / pivot_row[pivot];
105 |         pivot_row[pivot] = 1;
106 |         // Subtract the scaled pivot row from every row below it
107 |         for (int row = pivot + 1; row < N; ++row)
108 |         {
109 |             float *target = matrix[row];
110 |             for (int col = pivot + 1; col < N; ++col)
111 |                 target[col] = target[col] - target[pivot] * pivot_row[col];
112 |             target[pivot] = 0;
113 |         }
114 |     }
115 | }
116 | 
117 | // NEON Gaussian elimination: same algorithm as the serial version, four floats per vector step.
118 | void calculate_neon()
119 | {
120 |     for (int pivot = 0; pivot < N; ++pivot)
121 |     {
122 |         float *pivot_row = matrix[pivot];
123 |         const float32x4_t diag4 = vmovq_n_f32(pivot_row[pivot]);
124 |         int col = pivot + 1;
125 |         for (; col + 3 < N; col += 4)
126 |         {
127 |             // Divide four pivot-row entries by the diagonal element
128 |             vst1q_f32(pivot_row + col, vdivq_f32(vld1q_f32(pivot_row + col), diag4));
129 |         }
130 |         for (; col < N; ++col)
131 |         {
132 |             // Scalar tail for the columns that do not fill a vector
133 |             pivot_row[col] = pivot_row[col] / pivot_row[pivot];
134 |         }
135 |         pivot_row[pivot] = 1;
136 |         for (int row = pivot + 1; row < N; ++row)
137 |         {
138 |             float *target = matrix[row];
139 |             const float32x4_t factor4 = vmovq_n_f32(target[pivot]);
140 |             for (col = pivot + 1; col + 3 < N; col += 4)
141 |             {
142 |                 // target[col..col+3] -= target[pivot] * pivot_row[col..col+3]
143 |                 const float32x4_t scaled = vmulq_f32(factor4, vld1q_f32(pivot_row + col));
144 |                 vst1q_f32(target + col, vsubq_f32(vld1q_f32(target + col), scaled));
145 |             }
146 |             for (; col < N; ++col)
147 |             {
148 |                 target[col] = target[col] - target[pivot] * pivot_row[col];
149 |             }
150 |             target[pivot] = 0;
151 |         }
152 |     }
153 | }
154 | 
155 | 
156 | void *threadFunc(void *param)  // thread body: thread 0 divides the pivot row, threads 1.. eliminate rows
157 | {
158 |     threadParam_t *thread_param_t = (threadParam_t *)param;
159 |     int t_id = thread_param_t->t_id;
160 |     for (int k = 0; k < N; k++)
161 |     {
162 |         // Thread 0 performs the division on pivot row k; every other thread waits for it
163 |         if (t_id == 0)
164 |         {
165 |             float32x4_t Akk = vmovq_n_f32(matrix[k][k]);
166 |             int j;
167 |             for (j = k + 1; j + 3 < N; j += 4)  // bound is the global N (the original `n` is undeclared)
168 |             {
169 |                 float32x4_t Akj = vld1q_f32(matrix[k] + j);
170 |                 Akj = vdivq_f32(Akj, Akk);
171 |                 vst1q_f32(matrix[k] + j, Akj);
172 |             }
173 |             for (; j < N; j++)  // scalar tail, same fix as above
174 |             {
175 |                 matrix[k][j] = matrix[k][j] / matrix[k][k];
176 |             }
177 |             matrix[k][k] = 1.0;
178 |         }
179 |         else
180 |         {
181 |             sem_wait(&sem_Division);
182 |         }
183 | 
184 |         // Once the division is done, thread 0 wakes the THREAD_NUM - 1 waiting threads
185 |         if (t_id == 0)
186 |         {
187 |             for (int i = 1; i < THREAD_NUM; i++)
188 |             {
189 |                 sem_post(&sem_Division);
190 |             }
191 |         }
192 |         else
193 |         {
194 |             // Cyclic row assignment: thread t_id takes rows k + t_id, k + t_id + (THREAD_NUM - 1), ...
195 |             for (int i = k + t_id; i < N; i += (THREAD_NUM - 1))
196 |             {
197 |                 int j = k + 1;
198 |                 float32x4_t Aik = vmovq_n_f32(matrix[i][k]);
199 |                 for (j = k + 1; j + 3 < N; j += 4)
200 |                 {
201 |                     float32x4_t Akj = vld1q_f32(matrix[k] + j);
202 |                     float32x4_t Aij = vld1q_f32(matrix[i] + j);
203 |                     float32x4_t AikMulAkj = vmulq_f32(Aik, Akj);
204 |                     Aij = vsubq_f32(Aij, AikMulAkj);
205 |                     vst1q_f32(matrix[i] + j, Aij);
206 |                 }
207 |                 for (; j < N; j++)
208 |                 {
209 |                     matrix[i][j] = matrix[i][j] - matrix[i][k] * matrix[k][j];
210 |                 }
211 |                 matrix[i][k] = 0;
212 |             }
213 |         }
214 | 
215 |         // All threads synchronise before moving on to the next pivot
216 |         pthread_barrier_wait(&barrier);
217 |     }
218 |     pthread_exit(NULL);
219 |     return NULL;
220 | }
221 | 
222 | // Runs the pthread elimination: THREAD_NUM threads execute threadFunc and are joined before returning.
223 | void calculate_pthread()
224 | {
225 |     // Semaphore starts at 0; the barrier is sized for all THREAD_NUM threads
226 |     sem_init(&sem_Division, 0, 0);
227 |     pthread_barrier_init(&barrier, NULL, THREAD_NUM);
228 | 
229 |     // Launch the threads, each with its own id slot
230 |     pthread_t workers[THREAD_NUM];
231 |     threadParam_t args[THREAD_NUM];
232 |     for (int id = 0; id < THREAD_NUM; ++id)
233 |     {
234 |         args[id].t_id = id;
235 |         pthread_create(workers + id, NULL, threadFunc, (void *)(args + id));
236 |     }
237 | 
238 |     // Wait for every thread to finish
239 |     for (int id = 0; id < THREAD_NUM; ++id)
240 |     {
241 |         pthread_join(workers[id], NULL);
242 |     }
243 | 
244 |     // Tear down the synchronisation objects
245 |     sem_destroy(&sem_Division);
246 |     pthread_barrier_destroy(&barrier);
247 | }
248 | 
249 | void print_matrix()
250 | {
251 |     // One text line per matrix row, entries with two decimals and a trailing space
252 |     for (int row = 0; row < N; ++row)
253 |     {
254 |         const float *entries = matrix[row];
255 |         for (int col = 0; col < N; ++col)
256 |             printf("%.2f ", entries[col]);
257 |         printf("\n");
258 |     }
259 | }
260 | 


--------------------------------------------------------------------------------
/Lab3/ARM/Gauss_pthread.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <sys/time.h>
  3 | #include <arm_neon.h>
  4 | #include <pthread.h>
  5 | #include <semaphore.h>
  6 | #include <math.h>
  7 | using namespace std;
  8 | 
  9 | //------------------------------------------ thread control variables ------------------------------------------
 10 | typedef struct
 11 | {
 12 |     int t_id;  // thread index, 0 .. THREAD_NUM - 1
 13 | } threadParam_t;
 14 | 
 15 | sem_t sem_Division;  // posted by thread 0 after the division step of each round
 16 | pthread_barrier_t barrier;  // all threads meet here at the end of each round
 17 | pthread_mutex_t task;  // guards myIndex in the dynamic variant
 18 | int myIndex;  // next unclaimed row in the dynamic variant
 19 | 
 20 | const int THREAD_NUM = 8;
 21 | 
 22 | // ------------------------------------------ global computation variables ------------------------------------------
 23 | const int N = 100;  // matrix dimension
 24 | const int L = 100;  // upper bound of the random values generated in init_data
 25 | const int LOOP = 100;  // timed repetitions per algorithm
 26 | float data[N][N];  // reference input, generated once
 27 | float matrix[N][N];  // working copy that the algorithms eliminate in place
 28 | 
 29 | void init_data();
 30 | void init_matrix();
 31 | void calculate_serial();
 32 | void calculate_neon();
 33 | void calculate_pthread_discrete();
 34 | void calculate_pthread_continuous();
 35 | void calculate_pthread_dynamic();
 36 | void print_matrix();
 37 | 
 38 | int main()
 39 | {
 40 |     struct timeval t_begin;
 41 |     struct timeval t_end;
 42 |     float total_ms = 0;
 43 |     init_data();
 44 |     // ---------------------------------- serial baseline ----------------------------------
 45 |     total_ms = 0;
 46 |     for (int run = 0; run < LOOP; ++run)
 47 |     {
 48 |         init_matrix();  // untimed: restore the input before every run
 49 |         gettimeofday(&t_begin, NULL);
 50 |         calculate_serial();
 51 |         gettimeofday(&t_end, NULL);
 52 |         total_ms += ((t_end.tv_sec - t_begin.tv_sec) * 1000000 + (t_end.tv_usec - t_begin.tv_usec)) * 1.0 / 1000;
 53 |     }
 54 |     cout << "serial:" << total_ms / LOOP << "ms" << endl;
 55 |     // ---------------------------------- NEON ----------------------------------
 56 |     total_ms = 0;
 57 |     for (int run = 0; run < LOOP; ++run)
 58 |     {
 59 |         init_matrix();
 60 |         gettimeofday(&t_begin, NULL);
 61 |         calculate_neon();
 62 |         gettimeofday(&t_end, NULL);
 63 |         total_ms += ((t_end.tv_sec - t_begin.tv_sec) * 1000000 + (t_end.tv_usec - t_begin.tv_usec)) * 1.0 / 1000;
 64 |     }
 65 |     cout << "neon:" << total_ms / LOOP << "ms" << endl;
 66 |     // ---------------------------------- pthread, cyclic rows ----------------------------------
 67 |     total_ms = 0;
 68 |     for (int run = 0; run < LOOP; ++run)
 69 |     {
 70 |         init_matrix();
 71 |         gettimeofday(&t_begin, NULL);
 72 |         calculate_pthread_discrete();
 73 |         gettimeofday(&t_end, NULL);
 74 |         total_ms += ((t_end.tv_sec - t_begin.tv_sec) * 1000000 + (t_end.tv_usec - t_begin.tv_usec)) * 1.0 / 1000;
 75 |     }
 76 |     cout << "pthread_discrete:" << total_ms / LOOP << "ms" << endl;
 77 |     // ---------------------------------- pthread, contiguous row blocks ----------------------------------
 78 |     total_ms = 0;
 79 |     for (int run = 0; run < LOOP; ++run)
 80 |     {
 81 |         init_matrix();
 82 |         gettimeofday(&t_begin, NULL);
 83 |         calculate_pthread_continuous();
 84 |         gettimeofday(&t_end, NULL);
 85 |         total_ms += ((t_end.tv_sec - t_begin.tv_sec) * 1000000 + (t_end.tv_usec - t_begin.tv_usec)) * 1.0 / 1000;
 86 |     }
 87 |     cout << "pthread_continuous:" << total_ms / LOOP << "ms" << endl;
 88 |     // ---------------------------------- pthread, dynamic row claiming ----------------------------------
 89 |     total_ms = 0;
 90 |     for (int run = 0; run < LOOP; ++run)
 91 |     {
 92 |         init_matrix();
 93 |         gettimeofday(&t_begin, NULL);
 94 |         calculate_pthread_dynamic();
 95 |         gettimeofday(&t_end, NULL);
 96 |         total_ms += ((t_end.tv_sec - t_begin.tv_sec) * 1000000 + (t_end.tv_usec - t_begin.tv_usec)) * 1.0 / 1000;
 97 |     }
 98 |     cout << "pthread_dynamic:" << total_ms / LOOP << "ms" << endl;
 99 | }
100 | 
101 | // Build the reference matrix once: a random upper triangle, then every row added into all rows below it.
102 | void init_data()
103 | {
104 |     for (int row = 0; row < N; ++row)
105 |         for (int col = row; col < N; ++col)
106 |             data[row][col] = rand() * 1.0 / RAND_MAX * L;
107 |     for (int src = 0; src + 1 < N; ++src)
108 |         for (int dst = src + 1; dst < N; ++dst)
109 |             for (int col = 0; col < N; ++col)
110 |                 data[dst][col] += data[src][col];
111 | }
112 | 
113 | // Reset matrix to the contents of data so that every timed run starts from identical input.
114 | void init_matrix()
115 | {
116 |     for (int row = 0; row < N; ++row)
117 |         for (int col = 0; col < N; ++col)
118 |             matrix[row][col] = data[row][col];
119 | }
120 | 
121 | // Serial Gaussian elimination without pivoting; matrix becomes unit upper-triangular in place.
122 | void calculate_serial()
123 | {
124 |     for (int pivot = 0; pivot < N; ++pivot)
125 |     {
126 |         float *pivot_row = matrix[pivot];
127 |         // Normalise the pivot row to the right of the diagonal
128 |         for (int col = pivot + 1; col < N; ++col)
129 |             pivot_row[col] = pivot_row[col] / pivot_row[pivot];
130 |         pivot_row[pivot] = 1;
131 |         // Subtract the scaled pivot row from every row below it
132 |         for (int row = pivot + 1; row < N; ++row)
133 |         {
134 |             float *target = matrix[row];
135 |             for (int col = pivot + 1; col < N; ++col)
136 |                 target[col] = target[col] - target[pivot] * pivot_row[col];
137 |             target[pivot] = 0;
138 |         }
139 |     }
140 | }
141 | 
142 | // NEON Gaussian elimination: same algorithm as the serial version, four floats per vector step.
143 | void calculate_neon()
144 | {
145 |     for (int pivot = 0; pivot < N; ++pivot)
146 |     {
147 |         float *pivot_row = matrix[pivot];
148 |         const float32x4_t diag4 = vmovq_n_f32(pivot_row[pivot]);
149 |         int col = pivot + 1;
150 |         for (; col + 3 < N; col += 4)
151 |         {
152 |             // Divide four pivot-row entries by the diagonal element
153 |             vst1q_f32(pivot_row + col, vdivq_f32(vld1q_f32(pivot_row + col), diag4));
154 |         }
155 |         for (; col < N; ++col)
156 |         {
157 |             // Scalar tail for the columns that do not fill a vector
158 |             pivot_row[col] = pivot_row[col] / pivot_row[pivot];
159 |         }
160 |         pivot_row[pivot] = 1;
161 |         for (int row = pivot + 1; row < N; ++row)
162 |         {
163 |             float *target = matrix[row];
164 |             const float32x4_t factor4 = vmovq_n_f32(target[pivot]);
165 |             for (col = pivot + 1; col + 3 < N; col += 4)
166 |             {
167 |                 // target[col..col+3] -= target[pivot] * pivot_row[col..col+3]
168 |                 const float32x4_t scaled = vmulq_f32(factor4, vld1q_f32(pivot_row + col));
169 |                 vst1q_f32(target + col, vsubq_f32(vld1q_f32(target + col), scaled));
170 |             }
171 |             for (; col < N; ++col)
172 |             {
173 |                 target[col] = target[col] - target[pivot] * pivot_row[col];
174 |             }
175 |             target[pivot] = 0;
176 |         }
177 |     }
178 | }
179 | 
180 | 
181 | void *threadFunc_discrete(void *param)  // thread body: thread 0 divides the pivot row, threads 1.. eliminate rows cyclically
182 | {
183 |     threadParam_t *thread_param_t = (threadParam_t *)param;
184 |     int t_id = thread_param_t->t_id;
185 |     for (int k = 0; k < N; k++)
186 |     {
187 |         // Thread 0 performs the division on pivot row k; every other thread waits for it
188 |         if (t_id == 0)
189 |         {
190 |             float32x4_t Akk = vmovq_n_f32(matrix[k][k]);
191 |             int j;
192 |             for (j = k + 1; j + 3 < N; j += 4)
193 |             {
194 |                 float32x4_t Akj = vld1q_f32(matrix[k] + j);
195 |                 Akj = vdivq_f32(Akj, Akk);
196 |                 vst1q_f32(matrix[k] + j, Akj);
197 |             }
198 |             for (; j < N; j++)
199 |             {
200 |                 matrix[k][j] = matrix[k][j] / matrix[k][k];
201 |             }
202 |             matrix[k][k] = 1.0;
203 |         }
204 |         else
205 |         {
206 |             sem_wait(&sem_Division);
207 |         }
208 | 
209 |         // Once the division is done, thread 0 wakes the THREAD_NUM - 1 waiting threads
210 |         if (t_id == 0)
211 |         {
212 |             for (int i = 1; i < THREAD_NUM; i++)
213 |             {
214 |                 sem_post(&sem_Division);
215 |             }
216 |         }
217 |         else
218 |         {
219 |             // Cyclic row assignment: thread t_id takes rows k + t_id, k + t_id + (THREAD_NUM - 1), ...
220 |             for (int i = k + t_id; i < N; i += (THREAD_NUM - 1))
221 |             {
222 |                 int j = k + 1;
223 |                 float32x4_t Aik = vmovq_n_f32(matrix[i][k]);
224 |                 for (j = k + 1; j + 3 < N; j += 4)
225 |                 {
226 |                     float32x4_t Akj = vld1q_f32(matrix[k] + j);
227 |                     float32x4_t Aij = vld1q_f32(matrix[i] + j);
228 |                     float32x4_t AikMulAkj = vmulq_f32(Aik, Akj);
229 |                     Aij = vsubq_f32(Aij, AikMulAkj);
230 |                     vst1q_f32(matrix[i] + j, Aij);
231 |                 }
232 |                 for (; j < N; j++)
233 |                 {
234 |                     matrix[i][j] = matrix[i][j] - matrix[i][k] * matrix[k][j];
235 |                 }
236 |                 matrix[i][k] = 0;
237 |             }
238 |         }
239 | 
240 |         // All threads synchronise before moving on to the next pivot
241 |         pthread_barrier_wait(&barrier);
242 |     }
243 |     pthread_exit(NULL);
244 |     return NULL;
245 | }
246 | 
247 | // Runs the cyclic-row pthread elimination: THREAD_NUM threads execute threadFunc_discrete and are joined.
248 | void calculate_pthread_discrete()
249 | {
250 |     // Semaphore starts at 0; the barrier is sized for all THREAD_NUM threads
251 |     sem_init(&sem_Division, 0, 0);
252 |     pthread_barrier_init(&barrier, NULL, THREAD_NUM);
253 | 
254 |     // Launch the threads, each with its own id slot
255 |     pthread_t workers[THREAD_NUM];
256 |     threadParam_t args[THREAD_NUM];
257 |     for (int id = 0; id < THREAD_NUM; ++id)
258 |     {
259 |         args[id].t_id = id;
260 |         pthread_create(workers + id, NULL, threadFunc_discrete, (void *)(args + id));
261 |     }
262 | 
263 |     // Wait for every thread to finish
264 |     for (int id = 0; id < THREAD_NUM; ++id)
265 |     {
266 |         pthread_join(workers[id], NULL);
267 |     }
268 | 
269 |     // Tear down the synchronisation objects
270 |     sem_destroy(&sem_Division);
271 |     pthread_barrier_destroy(&barrier);
272 | }
273 | 
274 | void *threadFunc_continuous(void *param)  // thread body: thread 0 divides the pivot row, threads 1.. eliminate contiguous row blocks
275 | {
276 |     threadParam_t *thread_param_t = (threadParam_t *)param;
277 |     int t_id = thread_param_t->t_id;
278 |     for (int k = 0; k < N; k++)
279 |     {
280 |         // Thread 0 performs the division on pivot row k; every other thread waits for it
281 |         if (t_id == 0)
282 |         {
283 |             float32x4_t Akk = vmovq_n_f32(matrix[k][k]);
284 |             int j;
285 |             for (j = k + 1; j + 3 < N; j += 4)
286 |             {
287 |                 float32x4_t Akj = vld1q_f32(matrix[k] + j);
288 |                 Akj = vdivq_f32(Akj, Akk);
289 |                 vst1q_f32(matrix[k] + j, Akj);
290 |             }
291 |             for (; j < N; j++)
292 |             {
293 |                 matrix[k][j] = matrix[k][j] / matrix[k][k];
294 |             }
295 |             matrix[k][k] = 1.0;
296 |         }
297 |         else
298 |         {
299 |             sem_wait(&sem_Division);
300 |         }
301 | 
302 |         // Once the division is done, thread 0 wakes the THREAD_NUM - 1 waiting threads
303 |         if (t_id == 0)
304 |         {
305 |             for (int i = 1; i < THREAD_NUM; i++)
306 |             {
307 |                 sem_post(&sem_Division);
308 |             }
309 |         }
310 |         else
311 |         {
312 |             int L =  ceil((N-k)*1.0 / (THREAD_NUM - 1));  // block length per thread; this local L shadows the global L
313 |             // Block row assignment: thread t_id takes rows k + (t_id - 1) * L + 1 .. k + t_id * L, clipped to N
314 |             for (int i = k + (t_id - 1) * L + 1; i < N && i < k + t_id * L + 1 ; i++)
315 |             {
316 |                 int j = k + 1;
317 |                 float32x4_t Aik = vmovq_n_f32(matrix[i][k]);
318 |                 for (j = k + 1; j + 3 < N; j += 4)
319 |                 {
320 |                     float32x4_t Akj = vld1q_f32(matrix[k] + j);
321 |                     float32x4_t Aij = vld1q_f32(matrix[i] + j);
322 |                     float32x4_t AikMulAkj = vmulq_f32(Aik, Akj);
323 |                     Aij = vsubq_f32(Aij, AikMulAkj);
324 |                     vst1q_f32(matrix[i] + j, Aij);
325 |                 }
326 |                 for (; j < N; j++)
327 |                 {
328 |                     matrix[i][j] = matrix[i][j] - matrix[i][k] * matrix[k][j];
329 |                 }
330 |                 matrix[i][k] = 0;
331 |             }
332 |         }
333 | 
334 |         // All threads synchronise before moving on to the next pivot
335 |         pthread_barrier_wait(&barrier);
336 |     }
337 |     pthread_exit(NULL);
338 |     return NULL;
339 | }
340 | 
341 | void calculate_pthread_continuous()
342 | {
343 |     // Block-row pthread elimination. Semaphore starts at 0; the barrier is sized for all THREAD_NUM threads
344 |     sem_init(&sem_Division, 0, 0);
345 |     pthread_barrier_init(&barrier, NULL, THREAD_NUM);
346 | 
347 |     // Launch the threads, each with its own id slot
348 |     pthread_t workers[THREAD_NUM];
349 |     threadParam_t args[THREAD_NUM];
350 |     for (int id = 0; id < THREAD_NUM; ++id)
351 |     {
352 |         args[id].t_id = id;
353 |         pthread_create(workers + id, NULL, threadFunc_continuous, (void *)(args + id));
354 |     }
355 | 
356 |     // Wait for every thread to finish
357 |     for (int id = 0; id < THREAD_NUM; ++id)
358 |     {
359 |         pthread_join(workers[id], NULL);
360 |     }
361 | 
362 |     // Tear down the synchronisation objects
363 |     sem_destroy(&sem_Division);
364 |     pthread_barrier_destroy(&barrier);
365 | }
366 | 
367 | // pthread_dynamic thread function: thread 0 divides the pivot row, threads 1.. claim rows one at a time
368 | void * threadFunc_dynamic(void *param)
369 | {
370 |     threadParam_t *thread_param_t = (threadParam_t *)param;
371 |     int t_id = thread_param_t->t_id;
372 |     for (int k = 0; k < N; k++)
373 |     {
374 |         // Thread 0 performs the division on pivot row k; every other thread waits for it
375 |         if (t_id == 0)
376 |         {
377 |             float32x4_t Akk = vmovq_n_f32(matrix[k][k]);
378 |             int j;
379 |             for (j = k + 1; j + 3 < N; j += 4)
380 |             {
381 |                 float32x4_t Akj = vld1q_f32(matrix[k] + j);
382 |                 Akj = vdivq_f32(Akj, Akk);
383 |                 vst1q_f32(matrix[k] + j, Akj);
384 |             }
385 |             for (; j < N; j++)
386 |             {
387 |                 matrix[k][j] = matrix[k][j] / matrix[k][k];
388 |             }
389 |             matrix[k][k] = 1.0;
390 |             myIndex = k + 1;  // first row to hand out this round; written before any sem_post
391 |         }
392 |         else
393 |         {
394 |             sem_wait(&sem_Division);
395 |         }
396 | 
397 |         // Once the division is done, thread 0 wakes the THREAD_NUM - 1 waiting threads
398 |         if (t_id == 0)
399 |         {
400 |             for (int i = 1; i < THREAD_NUM; i++)
401 |             {
402 |                 sem_post(&sem_Division);
403 |             }
404 |         }
405 |         else
406 |         {
407 |             while (true)  // myIndex is only read under the mutex; the unlocked loop test was a data race
408 |             {
409 |                 pthread_mutex_lock(&task);
410 |                 int i = myIndex;
411 |                 if( i < N )
412 |                 {
413 |                     myIndex++;
414 |                     pthread_mutex_unlock(&task);
415 |                 }
416 |                 else
417 |                 {
418 |                     pthread_mutex_unlock(&task);
419 |                     break;
420 |                 }
421 |                 int j = k + 1;
422 |                 float32x4_t Aik = vmovq_n_f32(matrix[i][k]);
423 |                 for (j = k + 1; j + 3 < N; j += 4)
424 |                 {
425 |                     float32x4_t Akj = vld1q_f32(matrix[k] + j);
426 |                     float32x4_t Aij = vld1q_f32(matrix[i] + j);
427 |                     float32x4_t AikMulAkj = vmulq_f32(Aik, Akj);
428 |                     Aij = vsubq_f32(Aij, AikMulAkj);
429 |                     vst1q_f32(matrix[i] + j, Aij);
430 |                 }
431 |                 for (; j < N; j++)
432 |                 {
433 |                     matrix[i][j] = matrix[i][j] - matrix[i][k] * matrix[k][j];
434 |                 }
435 |                 matrix[i][k] = 0;
436 |             }
437 |         }
438 | 
439 |         // All threads synchronise before moving on to the next pivot
440 |         pthread_barrier_wait(&barrier);
441 |     }
442 |     pthread_exit(NULL);
443 |     return NULL;
444 | }
445 | 
446 | // pthread_dynamic driver: THREAD_NUM threads execute threadFunc_dynamic and are joined before returning
447 | void calculate_pthread_dynamic()
448 | {
449 |     // Initialise the semaphore (count 0), the barrier (THREAD_NUM threads) and the row-claim mutex
450 |     sem_init(&sem_Division, 0, 0);
451 |     pthread_barrier_init(&barrier, NULL, THREAD_NUM);
452 |     pthread_mutex_init(&task, NULL);  // run-time init; PTHREAD_MUTEX_INITIALIZER is for static initialisation only
453 | 
454 |     // Create the threads
455 |     pthread_t threads[THREAD_NUM];
456 |     threadParam_t thread_param_t[THREAD_NUM];
457 |     for (int i = 0; i < THREAD_NUM; i++)
458 |     {
459 |         thread_param_t[i].t_id = i;
460 |         pthread_create(&threads[i], NULL, threadFunc_dynamic, (void *)(&thread_param_t[i]));
461 |     }
462 | 
463 |     // Join the threads
464 |     for (int i = 0; i < THREAD_NUM; i++)
465 |     {
466 |         pthread_join(threads[i], NULL);
467 |     }
468 | 
469 |     // Destroy the synchronisation objects
470 |     sem_destroy(&sem_Division);
471 |     pthread_barrier_destroy(&barrier);
472 |     pthread_mutex_destroy(&task);
473 | }
474 | 
475 | void print_matrix()
476 | {
477 |     // One text line per matrix row, entries with two decimals and a trailing space
478 |     for (int row = 0; row < N; ++row)
479 |     {
480 |         const float *entries = matrix[row];
481 |         for (int col = 0; col < N; ++col)
482 |             printf("%.2f ", entries[col]);
483 |         printf("\n");
484 |     }
485 | }
486 | 


--------------------------------------------------------------------------------
/Lab3/Gauss_serial.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <windows.h>
 3 | #include <sys/time.h>
 4 | using namespace std;
 5 | 
 6 | const int N = 500;  // matrix dimension
 7 | const int L = 100;  // upper bound of the random values generated in init_data
 8 | const int LOOP = 1;  // timed repetitions
 9 | float data[N][N];  // reference input, generated once
10 | float matrix[N][N];  // working copy that calculate_serial eliminates in place
11 | 
12 | void init_data();
13 | void init_matrix();
14 | void calculate_serial();
15 | 
16 | int main()
17 | {
18 |     struct timeval t_begin;
19 |     struct timeval t_end;
20 |     float total_ms = 0;
21 |     init_data();
22 |     // ---------------------------------- serial baseline ----------------------------------
23 |     total_ms = 0;
24 |     for (int run = 0; run < LOOP; ++run)
25 |     {
26 |         init_matrix();  // untimed: restore the input before every run
27 |         gettimeofday(&t_begin, NULL);
28 |         calculate_serial();
29 |         gettimeofday(&t_end, NULL);
30 |         total_ms += ((t_end.tv_sec - t_begin.tv_sec) * 1000000 + (t_end.tv_usec - t_begin.tv_usec)) * 1.0 / 1000;
31 |     }
32 |     cout << "serial:" << total_ms / LOOP << "ms" << endl;
33 |     system("pause");
34 | }
35 | 
36 | // Build the reference matrix once: a random upper triangle, then every row added into all rows below it.
37 | void init_data()
38 | {
39 |     for (int row = 0; row < N; ++row)
40 |         for (int col = row; col < N; ++col)
41 |             data[row][col] = rand() * 1.0 / RAND_MAX * L;
42 |     for (int src = 0; src + 1 < N; ++src)
43 |         for (int dst = src + 1; dst < N; ++dst)
44 |             for (int col = 0; col < N; ++col)
45 |                 data[dst][col] += data[src][col];
46 | }
47 | 
48 | // Reset matrix to the contents of data so that every timed run starts from identical input.
49 | void init_matrix()
50 | {
51 |     for (int row = 0; row < N; ++row)
52 |         for (int col = 0; col < N; ++col)
53 |             matrix[row][col] = data[row][col];
54 | }
55 | 
56 | // Serial Gaussian elimination without pivoting; matrix becomes unit upper-triangular in place.
57 | void calculate_serial()
58 | {
59 |     for (int pivot = 0; pivot < N; ++pivot)
60 |     {
61 |         float *pivot_row = matrix[pivot];
62 |         // Normalise the pivot row to the right of the diagonal
63 |         for (int col = pivot + 1; col < N; ++col)
64 |             pivot_row[col] = pivot_row[col] / pivot_row[pivot];
65 |         pivot_row[pivot] = 1;
66 |         // Subtract the scaled pivot row from every row below it
67 |         for (int row = pivot + 1; row < N; ++row)
68 |         {
69 |             float *target = matrix[row];
70 |             for (int col = pivot + 1; col < N; ++col)
71 |                 target[col] = target[col] - target[pivot] * pivot_row[col];
72 |             target[pivot] = 0;
73 |         }
74 |     }
75 | }


--------------------------------------------------------------------------------
/Lab3/Lab3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab3/Lab3.pdf


--------------------------------------------------------------------------------
/Lab3/report/CPU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab3/report/CPU.png


--------------------------------------------------------------------------------
/Lab3/report/arm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab3/report/arm.png


--------------------------------------------------------------------------------
/Lab3/report/arm_thread.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab3/report/arm_thread.png


--------------------------------------------------------------------------------
/Lab3/report/arm线程.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab3/report/arm线程.png


--------------------------------------------------------------------------------
/Lab3/report/report.md:
--------------------------------------------------------------------------------
  1 | # 问题描述
  2 | 
  3 | 在进行科学计算的过程中，经常会遇到对于多元线性方程组的求解，而求解线性方程组的一种常用方法就是Gauss消去法。即通过一种自上而下，逐行消去的方法，将线性方程组的系数矩阵消去为主对角线元素均为1的上三角矩阵。
  4 | $$
  5 | \left[
  6 | \begin{matrix}
  7 | a_{11}&a_{12}&\cdots&a_{1\ n-1}&a_{1n}\\
  8 | a_{21}&a_{22}&\cdots&a_{2\ n-1}&a_{2n}\\
  9 | \vdots&\vdots&\ddots&\vdots&\vdots\\
 10 | a_{n-1\ 1}&a_{n-1\ 2}&\cdots&a_{n-1 \ n-1}&a_{n-1\ n}\\
 11 | a_{n\ 1}&a_{n\ 2}&\cdots&a_{n \ n-1}&a_{n\ n}\\
 12 | \end{matrix}
 13 | \right]
 14 | =>
 15 | \left[
 16 | \begin{matrix}
 17 | 1&a_{12}'&\cdots&a_{1\ n-1}'&a_{1n}'\\
 18 | 0&1&\cdots&a_{2\ n-1}'&a_{2n}'\\
 19 | \vdots&\vdots&\ddots&\vdots&\vdots\\
 20 | 0&0&\cdots&1&a_{n-1\ n}'\\
 21 | 0&0&\cdots&0&1\\
 22 | \end{matrix}
 23 | \right]
 24 | $$
 25 | 在整个消去的过程中，主要包含两个过程
 26 | 
 27 | ![image-20220428214131431](C:\Users\Lenovo\AppData\Roaming\Typora\typora-user-images\image-20220428214131431.png)
 28 | 
 29 | # 实验设计
 30 | 
 31 | 考虑Gauss消去的整个过程中主要涉及到两个阶段，一个是在消元行行内除法过程，一个是其余行减去消元行的过程。而就每个阶段而言，其所做的工作基本是一致的，只是在不同的消元轮次时，消元的起始位置不同。尤其是针对第二个阶段，即其余行依次减去消元行的过程，这个阶段每一行所做的工作是完全一致的，十分适合并行化处理，即将待消去的行平均分配给几个线程，由于这些数据之间不存在依赖性，因此每个线程只需要各自完成好自己的工作即可，不存在线程之间进行通信的额外开销。
 32 | 
 33 | 而对于第一阶段，即消元行行内进行除法操作时，由于这个问题规模相对较小，如果将待操作的数据分配给不同的线程进行处理的话，线程挂起到唤醒这部分的时间开销相较于要处理的问题而言占比很高，因此不适合进行多线程并行处理，但是仍可以结合SIMD的向量化处理。同样在第二阶段，被消元行依次减去消元行的过程中，每一行内的减法运算同样也不适合进行多线程的并行处理，也可以采用SIMD进行向量化处理。
 34 | 
 35 | 在本次实验中，将设计以下实验进行探究：
 36 | 
 37 | ## pthread并行处理
 38 | 
 39 | 对于Gauss消去的过程，在每一轮消去中主要包含两个阶段，首先是针对消元行做除法运算，然后是对于剩余的被消元行，依次减去消元行的某个倍数。在每一轮的过程中，除法操作和消元减法操作之间是有着严格的先后顺序的，即必须首先完成消元行的除法操作之后，才能够执行被消元行的减法操作。因此需要引入信号量进行同步控制，即当0号线程完成了对于消元行的除法操作之后，依次向其余挂起等待的线程发送信号，之后所有线程一起并行执行被消元行的减法操作。
 40 | 
 41 | 在执行消去的时候，考虑对于数据采用分散的划分方式，即以线程数量为步长对于剩余的被消元行进行划分，分配给不同的线程。不同的线程之间执行的工作是完全一致的，并且由于不同行之间并不存在数据依赖，因此可以避免线程之间的通信开销。
 42 | 
 43 | 而由于不同的线程在执行消去操作时所需要的时间可能并不相同，因此需要在所有线程完成本轮被分配的消元任务之后，进行一次同步控制。在这一次同步控制中，出于方便考虑，使用了barrier进行同步控制。即只有当所有的线程都完成了消去任务之后，才会进入下一轮的消元。
 44 | 
 45 | ## 数据块划分设计
 46 | 
 47 | 在进行任务划分时，给出的样例中采用了等步长的划分方式，这种划分方式存在一定的弊端。即当数据规模比较大的时候，由于L1 cache大小有限，很有可能会导致在访问下一个间隔为线程数的行的时候出现cache miss，这样就需要到L2、L3甚至内存中去读取数据。这将会造成额外的访存开销。
 48 | 
 49 | 因此在进行数据划分的时候，考虑设计一种充分利用cache优化的数据划分方式，即将数据按块划分。每个线程负责连续的几行的消去任务。这样做的好处是，当线程正在处理当前行的时候，CPU可能会提前预取下一行的数据到cache中，这就会使得下一次进行数据访问的时候，能够尽快在cache中命中，减少了不必要的访存开销。
 50 | 
 51 | ## 数据动态划分设计
 52 | 
 53 | 考虑在进行任务划分的时候，由于不同线程在执行任务的时候，所需要的时间可能不一致，甚至因为数据规模不是线程数量的整数倍，导致某些线程出现在个别轮次中处于空等待的状态。这是由于数据划分的时候，由于细粒度的数据划分导致的线程之间负载不均衡。
 54 | 
 55 | 因此考虑在设计数据划分的时候采用动态的数据划分方式。即在对被消元行执行减法操作的过程中，并不明确指定某个线程对哪部分数据执行任务，而是根据各个线程任务完成的情况动态的进行数据划分。即通过一个全局的变量index来指示现阶段已经处理到哪一行。而当某一个线程完成了其被分配的任务的时候，会查看关于index的互斥量，如果这个互斥量并没有上锁，则说明当前处于可以进行任务划分的阶段。于是让这个线程对关于index的互斥量上锁，并将index所指的行分配给该线程，任务分配完成后，线程释放互斥量，然后去执行所分配的任务。
 56 | 
 57 | 这样就可以保证每条线程都一直在执行被分配的任务，而不会出现个别线程由于负载不均衡出现空等待的现象，而其他线程还在执行任务。由于只有当所有线程的任务都执行完毕的时候才会进入下一轮迭代，因此那些进行空等待的线程就浪费了CPU的计算资源。这就是该实验设计选择进行优化的方向。
 58 | 
 59 | ## 不同数据规模和线程数下的性能探究
 60 | 
 61 | 考虑到线程的创建，调度，挂起和唤醒等操作相对于简单的计算操作而言，所需要的时间开销是非常大的。因此可以推测，当问题规模比较小的时候，由于线程调度导致的额外开销会抵消掉多线程优化效果，甚至还会表现出多线程比串行算法更慢的情况。而随着问题规模的增加，线程之间调度切换所需要的时间开销相对于线程完成任务所需要的时间而言已经占比很低，这样就能够正常反映出多线程并行优化的效果。因此，设计实验探究在不同数据规模下，多线程并行优化算法的优化效果。此外还将探究在所使用的线程数量不同的情况下，并行算法优化效果的变化情况。
 62 | 
 63 | ## x86平台迁移
 64 | 
 65 | 本次实验除了对ARM架构下采用neon指令集架构结合pthread多线程编程，对Gauss消去算法进行并行化处理，还将算法迁移到了x86平台上，采用x86中的SSE、AVX和AVX512指令集架构分别对算法进行重构，然后对比实验效果。
 66 | 
 67 | # 实验结果分析
 68 | 
 69 | ## ARM平台
 70 | 
 71 | ### pthread并行处理
 72 | 
 73 | 为了能够探究pthread并行算法的优化效果，考虑调整问题规模，测量在不同任务规模下，pthread并行优化算法对于普通串行算法和SIMD向量化优化算法的加速比。在本次实验中，pthread并行算法中，同样融合了SIMD的向量化处理。在ARM平台上，SIMD的实现是基于Neon指令集架构的。为了能够比较全面的展现并行优化效果随问题规模的变化情况，在问题规模小于1000时采用步长为100，而当问题规模大于1000时，步长调整为1000。三种算法的在不同问题规模下的表现如下表所示。
 74 | 
 75 | 在实验设计时，SIMD进行向量化处理的时候，采用的是四路向量化处理，而pthread多线程优化时，总共开启了8条线程，其中一条线程负责除法操作，剩余的7条线程负责做消元操作。因此从时间表现情况来看，理论上SIMD优化算法所需要的时间应该是串行算法的$\frac{1}{4}$，pthread多线程所需要的时间应该是SIMD向量化的$\frac{1}{7}$。而从实验数据来看，当问题规模较小的时候，pthread多线程算法的时间性能甚至差于普通的串行算法。这是由于线程的创建，挂起，唤醒和切换等操作，所需要消耗的时钟周期数要远远多于简单的运算操作。因此当问题规模较小时，由于运算操作在整个问题求解的过程中所占比例较低，因此线程额外开销的副作用就会显现出来。而随着问题规模的增大，pthread多线程的优势就能够显现出来。两种并行优化算法的加速比变化如下图所示。
 76 | 
 77 | 从图像中可以看出，SIMD的加速比随着问题规模的增加基本保持稳定，由于算法中还涉及到其他的数据处理，因此其加速比只达到了2左右，并没有能够达到理论上的4。而pthread优化的效果则随着问题规模的增加呈现出持续上升的趋势。这是因为，问题规模的增加，使得程序在运行的过程中，运算所占比例不断上升，这将会逐步抵消由于线程切换导致的额外开销。从数据中可以看出，当问题规模达到2000时，已经接近了其对SIMD的理论加速比。可以推测，当问题规模持续上升时，这个加速比将会接近7。
 78 | 
 79 | ### 数据划分方式对比
 80 | 
 81 | 本次实验中，除了进行基础的pthread多线程优化尝试之外，还从数据划分的角度出发，考虑不同的数据划分方式，对于并行算法优化效果的影响。结合前文实验设计，分别对比了循环划分，块划分和动态划分三种方式，在不同问题规模下的表现效果，并以SIMD算法为baseline，其加速比变化情况如下图所示。
 82 | 
 83 | 可以看到，随着问题规模的增大，这三种任务划分方式的加速比都是逐渐去接近理论加速比的。但是也可以注意到，在三种任务划分之间的性能表现还是存在着明显差异的。
 84 | 
 85 | 从cache优化的角度出发，循环划分和块划分的主要区别就在于能否利用到cache优化。就循环划分这种方式而言，线程在处理完当前行之后，接下来要处理的行与当前行的间隔为NUM_PHTREAD，因此当数据规模很大的时候，会因为L1 cache不能够容纳下足够的数据，或者由于CPU未能够及时的预取下一行数据，而导致cache miss，因此需要额外的访存开销。而块划分的方式就能够很好的弥补这一点，其原因是对于每个线程而言，它所需要处理的数据之间在内存上是连续的，因此有很好的cache优势，因此能够减小由于cache miss导致的额外访存开销。使用perf工具对于这两种算法的L1 cache的命中率进行检测，如下表所示。块划分的命中率能够达到98%，而循环划分的方式只有94%左右，两者差异不大，因此在性能表现上的差异也不显著。
 86 | 
 87 | 从负载均衡的角度出发，循环划分和动态数据划分的主要区别就在于能否充分利用各个线程的计算资源，尽可能减少同步等待所导致的额外开销。如果采用循环数据划分的方式，由于各个线程完成任务所需要的时间不尽相同，并且由于问题规模可能不是线程的整数倍，因此可能存在某些线程较早完成任务进入同步等待状态，而其他线程还未完成任务，因此就浪费了一些计算资源。而动态数据划分就是从这个角度出发，尽可能充分利用每个线程的计算资源，使得任务能够在线程之间得到比较均匀的划分。从图中也可以看出，当问题规模较小的时候，动态划分方式的表现不如循环划分，这是由于动态划分在保证负载均衡的前提下，牺牲了线程调度的开销，由于每个线程不清楚自己具体的工作，因此会存在比较大的线程同步和线程切换的开销，这种额外开销在问题规模比较小的时候会格外显著。而当问题规模提升的时候，可以发现，动态划分方式的表现已经能够超越循环划分，负载均衡带来的收益已经抵消了线程调度的额外开销。
 88 | 
 89 | ### 线程数量对比
 90 | 
 91 | 本次实验中，还探究了pthread多线程优化方法，在开启不同的线程数量时，优化效果的变化情况。为了能够显著体现pthread的优化效果，选取数据规模为1000，调整线程数量，观测加速比的变化情况如下图所示。
 92 | 
 93 | 从图像中可以看出，随着线程数量的线性增加，pthread多线程的优化效果也是呈现出线性提升的趋势。而当线程数量超过8个之后，其优化效果不再有显著的变化。这是由于实验使用的服务器单CPU核心能够提供8个线程，因此当线程数量小于8个的时候，CPU核心能够使用自己的8个线程调度任务，而当所需要的线程数量超过8个之后，就需要和服务器中的其他CPU核心借用线程，这之间会存在着额外的调度开销，因此抵消掉了性能的提升效果。
 94 | 
 95 | ## x86平台迁移
 96 | 
 97 | ### 多种SIMD指令集融合
 98 | 
 99 | 基于前文在ARM平台上对于pthread多线程编程的探究，在本次实验中还将pthread多线程优化方法迁移到x86平台上，做同样的实验探究。由于x86平台上拥有更多的SIMD指令集架构，因此实验中分别探究了SSE、AVX和AVX512三种指令集架构配合pthread多线程的优化效果，测量在不同问题规模下的运行时间，如下表所示。可以看出，pthread多线程可以结合多种SIMD指令集架构，并且在各种指令集架构上的表现基本保持稳定，并没有出现在某种指令集架构下不能够发挥很好的多线程优势的现象。
100 | 
101 | 此外，实验还以SSE指令集架构为例，探究了随着问题规模的变化，不同SSE向量化处理和pthread多线程结合SSE向量化处理这两种方法的表现情况，变化趋势图如下图所示。
102 | 
103 | 可以看出，在问题规模小于1000的时候，加速比随着问题规模的线性增长呈现出一个线性上升的趋势。而当问题规模超过1000的时候，会发现加速比出现了一个下降的趋势。结合VTune性能分析工具分析的结果，分析其原因是因为，当问题规模增加时，超过了线程cache的大小，导致出现了大量的cache miss，额外的访存开销在一定程度上抵消了多线程的优化效果，使得加速比的变化出现拐点。
104 | 
105 | ### 不同任务划分方式对比
106 | 
107 | 从cache优化的角度出发，循环划分和块划分的主要区别就在于能否利用到cache优化。就循环划分这种方式而言，线程在处理完当前行之后，接下来要处理的行与当前行的间隔为NUM\_PHTREAD，因此当数据规模很大的时候，会因为cache不能够容纳下足够的数据，或者由于CPU未能够及时的预取下一行数据，而导致cache miss，因此需要额外的访存开销。而块划分的方式就能够很好的弥补这一点，其原因是对于每个线程而言，它所需要处理的数据之间在内存上是连续的，因此有很好的cache优势，因此能够减小由于cache miss导致的额外访存开销。使用VTune工具对于这两种算法的L1 cache的命中率进行检测，如下表所示。块划分的命中率能够达到98\%，而循环划分的方式只有92\%左右，因此，对于块划分而言，由于其考虑到了cache特性，因此随着问题规模的增大，其性能并未明显受到访存开销的影响。而对于循环数据划分，则因为其划分方式会导致大量的cache miss，因此访存开销会极大影响其性能表现。这也正符合图中的变化趋势。
108 | 
109 | 从负载均衡的角度出发，循环划分和动态数据划分的主要区别就在于能否充分利用各个线程的计算资源，尽可能减少同步等待所导致的额外开销。从图中也可以看出，当问题规模较小的时候，动态划分方式的表现不如循环划分，这是由于动态划分在保证负载均衡的前提下，牺牲了线程调度的开销，由于每个线程不清楚自己具体的工作，因此会存在比较大的线程同步和线程切换的开销，这种额外开销在问题规模比较小的时候会格外显著。而当问题规模提升的时候，可以发现，动态划分方式的表现已经能够超越循环划分，负载均衡带来的收益已经抵消了线程调度的额外开销。根据VTune性能分析工具，观察这三种任务划分方式的CPU占用率，可以得到如下对比图。从图中可以看出，动态数据划分的CPU占用率一直保持一个较高水平，并且相对比较均衡。而对比其余两种划分方式，由于其没有考虑负载均衡，因此在CPU占用率这个指标上，其波动十分明显，甚至会出现低于20%的占用率，这是对于计算资源的严重浪费。


--------------------------------------------------------------------------------
/Lab3/report/x86.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab3/report/x86.png


--------------------------------------------------------------------------------
/Lab3/report/x86_pthread.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab3/report/x86_pthread.png


--------------------------------------------------------------------------------
/Lab3/report/x86线程.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab3/report/x86线程.png


--------------------------------------------------------------------------------
/Lab4/Gauss_serial.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <windows.h>
 3 | #include <sys/time.h>
 4 | 
 5 | //#define debug
 6 | 
 7 | using namespace std;
 8 | 
 9 | const int N = 500;  // matrix dimension (N x N)
10 | const int L = 100;  // scale factor for the random values generated in init_data()
11 | const int LOOP = 1;  // number of timed repetitions in main()
12 | float data[N][N];  // reference input, filled once by init_data()
13 | float matrix[N][N];  // working copy that calculate_serial() modifies in place
14 | 
15 | void init_data();
16 | void init_matrix();
17 | void calculate_serial();
18 | 
19 | int main()  // times calculate_serial() over LOOP runs
20 | {
21 |     struct timeval start;
22 |     struct timeval end;
23 |     float time = 0;  // accumulated elapsed time in milliseconds
24 |     init_data();
25 |     // ====================================== serial ======================================
26 |     time = 0;
27 |     for (int i = 0; i < LOOP; i++)
28 |     {
29 |         init_matrix();  // restore matrix from data before each run; not part of the timed region
30 |         gettimeofday(&start, NULL);
31 |         calculate_serial();
32 |         gettimeofday(&end, NULL);
33 |         time += ((end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec)) * 1.0 / 1000;  // microseconds -> milliseconds
34 |     }
35 |     #ifdef debug
36 |     cout << "serial:" << time / LOOP << "ms" << endl;  // mean time per run; compiled only when `debug` is defined
37 |     #endif
38 | 	system("pause");  // runs the shell command "pause" before the program exits
39 | }
40 | 
41 | // Initialize data so that every run works on the same input values
42 | void init_data()
43 | {
44 |     for (int i = 0; i < N; i++)
45 |         for (int j = i; j < N; j++)  // only entries on or right of the diagonal (j >= i)
46 |             data[i][j] = rand() * 1.0 / RAND_MAX * L;  // rand() scaled into [0, L]
47 |     for (int i = 0; i < N - 1; i++)  // add row i into every later row j (j > i), all N columns
48 |         for (int j = i + 1; j < N; j++)
49 |             for (int k = 0; k < N; k++)
50 |                 data[j][k] += data[i][k];
51 | }
52 | 
53 | // Initialize matrix from data so that every computation starts from the same values
54 | void init_matrix()
55 | {
56 |     for (int i = 0; i < N; i++)
57 |         for (int j = 0; j < N; j++)
58 |             matrix[i][j] = data[i][j];  // element-wise copy of all N x N entries
59 | }
60 | 
61 | // Serial algorithm: in-place Gaussian elimination on matrix (no pivot search, no zero-divisor check)
62 | void calculate_serial()
63 | {
64 |     for (int k = 0; k < N; k++)  // k: current pivot row/column
65 |     {
66 |         for (int j = k + 1; j < N; j++)  // division step: scale row k right of the diagonal by the pivot
67 |         {
68 |             matrix[k][j] = matrix[k][j] / matrix[k][k];
69 |         }
70 |         matrix[k][k] = 1;  // pivot becomes 1 by assignment rather than by division
71 |         for (int i = k + 1; i < N; i++)  // elimination step: every row below the pivot row
72 |         {
73 |             for (int j = k + 1; j < N; j++)
74 |             {
75 |                 matrix[i][j] = matrix[i][j] - matrix[i][k] * matrix[k][j];  // subtract matrix[i][k] times row k
76 |             }
77 |             matrix[i][k] = 0;  // entry below the pivot is cleared by assignment
78 |         }
79 |     }
80 | }


--------------------------------------------------------------------------------
/Lab4/report/Lab4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab4/report/Lab4.pdf


--------------------------------------------------------------------------------
/Lab4/report/SIMD.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab4/report/SIMD.png


--------------------------------------------------------------------------------
/Lab4/report/arm_sp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab4/report/arm_sp.png


--------------------------------------------------------------------------------
/Lab4/report/cache.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab4/report/cache.png


--------------------------------------------------------------------------------
/Lab4/report/idea.md:
--------------------------------------------------------------------------------
1 | * 串行算法
2 | * 手动的SIMD
3 | * 手动的Pthread算法
4 | * 对比openmp的SIMD和手动的SIMD之间的差别
5 | * 对比openmp的pthread和手动的pthread之间的差别
6 | * 对比不同任务划分方式之间的差别
7 | * 对比行列划分之间的差别
8 | * 对比动态线程创建和静态线程创建的开销
9 | 


--------------------------------------------------------------------------------
/Lab4/report/offload.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab4/report/offload.png


--------------------------------------------------------------------------------
/Lab4/report/report.md:
--------------------------------------------------------------------------------
  1 | # 问题描述
  2 | 
  3 | 在进行科学计算的过程中，经常会遇到对于多元线性方程组的求解，而求解线性方程组的一种常用方法就是Gauss消去法。即通过一种自上而下，逐行消去的方法，将线性方程组的系数矩阵消去为主对角线元素均为1的上三角矩阵。
  4 | $$
  5 | \left[
  6 | \begin{matrix}
  7 | a_{11}&a_{12}&\cdots&a_{1\ n-1}&a_{1n}\\
  8 | a_{21}&a_{22}&\cdots&a_{2\ n-1}&a_{2n}\\
  9 | \vdots&\vdots&\ddots&\vdots&\vdots\\
 10 | a_{n-1\ 1}&a_{n-1\ 2}&\cdots&a_{n-1 \ n-1}&a_{n-1\ n}\\
 11 | a_{n\ 1}&a_{n\ 2}&\cdots&a_{n \ n-1}&a_{n\ n}\\
 12 | \end{matrix}
 13 | \right]
 14 | =>
 15 | \left[
 16 | \begin{matrix}
 17 | 1&a_{12}'&\cdots&a_{1\ n-1}'&a_{1n}'\\
 18 | 0&1&\cdots&a_{2\ n-1}'&a_{2n}'\\
 19 | \vdots&\vdots&\ddots&\vdots&\vdots\\
 20 | 0&0&\cdots&1&a_{n-1\ n}'\\
 21 | 0&0&\cdots&0&1\\
 22 | \end{matrix}
 23 | \right]
 24 | $$
 25 | 在整个消去的过程中，主要包含两个过程
 26 | 
 27 | ![image-20220428214131431](C:\Users\Lenovo\AppData\Roaming\Typora\typora-user-images\image-20220428214131431.png)
 28 | 
 29 | # 实验设计
 30 | 
 31 | 考虑Gauss消去的整个过程中主要涉及到两个阶段，一个是在消元行行内除法过程，一个是其余行减去消元行的过程。而就每个阶段而言，其所做的工作基本是一致的，只是在不同的消元轮次时，消元的起始位置不同。尤其是针对第二个阶段，即其余行依次减去消元行的过程，这个阶段每一行所做的工作是完全一致的，十分适合并行化处理，即将待消去的行平均分配给几个线程，由于这些数据之间不存在依赖性，因此每个线程只需要各自完成好自己的工作即可，不存在线程之间进行通信的额外开销。
 32 | 
 33 | 而对于第一阶段，即消元行行内进行除法操作时，由于这个问题规模相对较小，如果将待操作的数据分配给不同的线程进行处理的话，线程挂起到唤醒这部分的时间开销相较于要处理的问题而言占比很高，因此不适合进行多线程并行处理，但是仍可以结合SIMD的向量化处理。同样在第二阶段，被消元行依次减去消元行的过程中，每一行内的减法运算同样也不适合进行多线程的并行处理，也可以采用SIMD进行向量化处理。
 34 | 
 35 | 在本次实验中，将设计以下实验进行探究：
 36 | 
 37 | * 采用openmp对于Gauss消去的算法进行多线程优化，分析在不同任务规模下的性能表现
 38 | * 设计不同的任务划分方式，调整schedule的参数，对比在不同任务规模下的性能表现
 39 | * 使用openmp进行SIMD向量化处理，并与之前手动SIMD的性能进行对比
 40 | * 使用openmp进行多线程处理，并与之前手动pthread的性能进行对比
 41 | * 对比不同的任务划分方式之间的性能差异，主要对比按行划分和按列划分
 42 | * 对比在openmp下采用不同的线程创建方式的性能差异，主要对比动态线程创建和静态线程创建
 43 | * 对比在不同线程数量下的openmp的性能表现情况
 44 | * 尝试将openmp优化方法从arm平台迁移到x86平台上，分析性能表现
 45 | * 在devcloud上尝试任务卸载，将Gauss消元的消去过程卸载到GPU上进行运算，测量其性能表现
 46 | 
 47 | ## openmp并行处理
 48 | 
 49 | 对于Gauss消去的过程，在每一轮消去中主要包含两个阶段，首先是针对消元行做除法运算，然后是对于剩余的被消元行，依次减去消元行的某个倍数。在每一轮的过程中，除法操作和消元减法操作之间是有着严格的先后顺序的，即必须首先完成消元行的除法操作之后，才能够执行被消元行的减法操作。因此在除法运算和消元操作之间，以及每一轮消元结束之后，都需要进行一次同步。
 50 | 
 51 | 在考虑使用openmp进行多线程的实验设计时，考虑使用单个线程首先来处理消元行的除法操作。当这一行的除法操作完成之后，再将后续的消元过程分配给多个线程，并将循环变量全部声明为线程私有，将待消元的矩阵声明为线程间共享。
 52 | 
 53 | 由于openmp还提供了simd的自动向量化预编译选项，因此在实验探究的过程中，还将探究单线程条件下使用了simd预编译选项后的性能表现，并且同之前手动设计的SIMD向量化算法的性能表现进行对比。此外还会将openmp设计的多线程程序和之前设计的pthread多线程程序进行性能对比，对比两种不同的算法设计的性能差异。
 54 | 
 55 | 可以计算出采用了openmp方式的优化算法，其理论加速比应该和线程数量成正比，最优效果下应该能够达到 NUM_THREADS 倍的性能提升。而当同时结合了 SIMD 算法之后，这个加速比又将提升，并与向量化处理的宽度有关。当采用四路向量化时，整体的最优加速比应该能够达到 $\text{NUM\_THREADS} \times 4$ 倍。
 56 | 
 57 | ## 数据划分设计
 58 | 
 59 | openmp为我们提供了多种的任务划分方式，可以通过设置schedule中的参数选择不同的任务划分方式。针对不同的负载特性，可以考虑使用static、dynamic和guided这三种不同的任务划分方式。
 60 | 
 61 | 最朴素的想法就是采用静态数据划分的方式，即在给各个线程分配任务的时候就已经确定好了每个线程负责的任务范围。这种任务划分方式，在有些情况下，即任务分布不均匀的时候会导致比较严重的负载不均。而在Gauss消去的过程中，由于每个阶段都是重新进行任务划分的，因此负载不均的问题并不明显，因而直接采用静态数据划分的方式也应该能够收获不错的效果。
 62 | 
 63 | 而从负载均衡的角度出发，可以使用动态数据划分的方式。由于在每一轮消元的过程中，最终决定本轮消元时间的是运行时间最长的线程。因此可能存在个别线程提前完成任务而进入空闲等待浪费了计算资源。因此可以使用动态任务划分的方式，但可能会导致较大的额外线程调度开销。
 64 | 
 65 | 由于在高斯消元的过程中，任务的规模在不断减小，因此如果划分给每个线程的任务范围是固定的，就会导致随着任务的逐步推进，个别线程在某一轮消元的过程中没有被分配到任何任务，而其他线程则需要执行很多的任务，因此浪费了比较多的计算资源。因此可以使用schedule中提供的guided参数，随着任务推进，任务块大小按指数缩减到指定的步幅，这样就可以尽可能地利用上全部线程的计算资源。
 66 | 
 67 | ## 行列划分设计
 68 | 
 69 | 考虑不同的任务划分方式，对于Gauss消元的问题，可以考虑从行列两种划分角度进行实验设计。通常的，我们都是采用按行划分的方式，原因在于，在高斯消元的过程中，每一行的所需要做的操作是完全相同的，并且行内的数据是连续的，因此具有比较好的空间局部性。但是由于外层循环的迭代，每一轮各个线程所处理的行号可能是完全不同的，这将会导致在不同的消元轮次中，各个线程的L1和L2cache中不存在所需要的数据，而需要到其他线程的cache中去访问，这就造成了伪共享。而线程之间访问cache也是一个很大的开销，因此按行划分的方式会受到时间局部性的制约。
 70 | 
 71 | 考虑按列划分的方式。在进行高斯消元的过程中，每一列所做的操作也是完全相同的，即将消元行该列的值依次减到该列中的其余位置处，而各列之间又不存在着数据依赖。因此也可以考虑按列划分的任务方式。但是由于在计算的过程中，每一个线程中的数据访问是跨行的，因此可能存在着比较大的几率导致cache miss，因此这种方法受到空间局部性的制约。而由于Gauss消元的过程是逐渐向右下角收缩的过程，因此按列划分的方式也会受到时间局部性的制约。但是按列划分也是一种合理的尝试。
 72 | 
 73 | ## 不同数据规模和线程数下的性能探究
 74 | 
 75 | 考虑到线程的创建，调度，挂起和唤醒等操作相对于简单的计算操作而言，所需要的时间开销是非常大的。因此可以推测，当问题规模比较小的时候，由于线程调度导致的额外开销会抵消掉多线程优化效果，甚至还会表现出多线程比串行算法更慢的情况。而随着问题规模的增加，线程之间调度切换所需要的时间开销相对于线程完成任务所需要的时间而言已经占比很低，这样就能够正常反映出多线程并行优化的效果。因此，设计实验探究在不同数据规模下，多线程并行优化算法的优化效果。此外还将探究在所使用的线程数量不同的情况下，并行算法优化效果的变化情况。
 76 | 
 77 | ## x86平台迁移
 78 | 
 79 | 本次实验除了对ARM架构下采用neon指令集架构结合openmp多线程编程，对Gauss消去算法进行并行化处理，还将算法迁移到了x86平台上，采用x86中的SSE指令集架构分别对算法进行重构，然后对比实验效果。
 80 | 
 81 | ## 任务卸载实验设计
 82 | 
 83 | Intel的在线平台devcloud为实验提供了很好的环境，能够在在线平台上尝试使用GPU进行运算卸载。因此可以考虑将数据依赖较小，运算密集的任务卸载到GPU上进行加速运算。
 84 | 
 85 | 针对Gauss消元的问题，可以发现，对于每一轮循环，可以将消元的过程卸载到GPU中进行运算，针对每个GPU计算单元，需要将矩阵作为共享数据分发到每个GPU运算单元的内存中，而将其余的循环控制变量作为私有变量，防止不同的GPU运算单元之间相互影响。
 86 | 
 87 | 因此，在设计实验的时候，在每轮循环中，首先让主线程去处理除法操作，然后将隔行的消元操作卸载到GPU上。对于卸载到GPU中的运算部分，可以使用SIMD进行向量化的处理，充分利用GPU中的矩阵计算优势，以期望能够获得很好的性能表现。
 88 | 
 89 | # 实验结果分析
 90 | 
 91 | ## ARM平台
 92 | 
 93 | ### openmp实现及性能对比
 94 | 
 95 | 为了能够探究openmp并行算法的优化效果，考虑调整问题规模，测量在不同任务规模下，串行算法，手动SIMD算法，手动pthread算法以及openmp版本的SIMD算法、openmp版本的多线程算法的时间性能表现。其中pthread多线程算法和openmp多线程算法都和SIMD算法进行了结合，启用了8条线程，并采用了四路向量化处理。为了能够比较全面的展现并行优化效果随问题规模的变化情况，在问题规模小于1000时采用步长为100，而当问题规模大于1000时，步长调整为500。五种算法在不同问题规模下的表现如下表所示。
 96 | 
 97 | 为了能够更加直观地观察算法的性能表现随问题规模的变化情况，特意利用测量的数据计算了四种并行优化算法的加速比随问题规模的变化情况，如图所示。
 98 | 
 99 | 从图像中可以看出，随着问题规模的增加，四种并行优化算法的加速比都呈现一个递增的趋势。由于在计算的过程中，除了除法操作和消元操作，还有很多分支跳转等不能够进行向量化处理的运算，因此采用SIMD思路进行优化的两种方法并没有能够接近理论加速比4倍，只保持在了一个1.5左右的加速比。而对于两种多线程的并行优化算法，其加速比随着问题规模的增加呈现快速增长的趋势，并且由于两种多线程的算法都与SIMD算法进行了融合，因此在评价其性能的时候可以用SIMD的性能作为baseline。以此为参考时，可以发现，随着问题规模的增加，两种并行优化算法都接近了对SIMD的理论加速比8倍，这说明Gauss消元的问题是十分适合进行多线程优化的，原因是不同的线程之间在分配任务的时候并不存在严重的数据依赖问题，因此线程之间额外的通信开销很少。
100 | 
101 | 此外，实验还对比了手动的SIMD算法和openmp版本的SIMD算法之间的性能差异，从图像中可以看出，手动的SIMD算法要略优于openmp版本，通过perf来分析其性能可以发现，openmp版本的所需要的指令数instructions要明显高于手动SIMD的方法，并且在时钟周期cycles指标上也要高于手动版本，因此其性能表现更差。
102 | 
103 | 同时，实验还对比了手动的pthread多线程和openmp多线程之间的性能差异，从图像中可以看出，手动的pthread算法其性能表现要一直差于openmp，这是因为在手动实现pthread多线程算法的时候，由于Gauss消元的各个轮次之间存在着严格的先后关系，每一轮内部的除法操作和消元操作也都存在着严格的依赖关系，因此在手动实现多线程的时候，线程的同步开销要比openmp管理所需要的时间更多。
104 | 
105 | ### 数据划分对比
106 | 
107 | openmp在进行数据划分schedule的时候，为我们提供了三种不同的选项，即static、dynamic和guided。static方法是在线程创建完成之后，就明确划分了任务，dynamic方法则是在线程执行的过程中去动态的划分任务，而guided方法则是随着任务的推进逐步缩减任务划分的粒度。在本次实验中，我们对比了这三种方法在不同任务规模下的性能表现，并将绘制出了加速比随问题规模的变化情况，如图所示。
108 | 
109 | 从图像中可以看出，随着任务规模的增加，这三种划分方式的加速比都逐渐增加，而static方法首先达到了瓶颈，这是因为10-12左右就是openmp的一个最大理论加速比的范围，因此在Gauss消元这个问题上static的性能表现是最好的。原因是，在Gauss消元的过程中，每一轮都会重新进行任务划分，因此虽然整体上来看，矩阵的每个部分的计算量是不同的，但是在每一个消元轮次内，任务基本上是均匀的，因此不存在严重的负载不均的问题。而dynamic和guided主要是为了解决这种负载不均的现象，为了平衡负载不均，必定要采用更细粒度的任务划分，因此会涉及到更多的额外条件判断以及通信开销，因此在Gauss消元问题上的表现不如static方法。
110 | 
111 | 同样是考虑了负载均衡，由于guided方法会逐步缩减任务划分的粒度，尽可能让所有线程都被分配任务，而动态划分的方式可能出现随着任务推进，个别线程不会被分配到任务的情况，因此浪费了一定的计算资源。所以从图像中也可以看出，采用guided逐步调整划分粒度的方法其性能表现要比dynamic方法更优。
112 | 
113 | ### 行列划分对比
114 | 
115 | 在本次实验中，除了从负载均衡的角度出发，探究不同的粒度的任务划分方式以外，还从cache的角度出发，考虑空间局部性和时间局部性，探究了行列划分两种方式的性能表现，其性能表现如图所示。
116 | 
117 | 从图像中可以很明显的看到，由于按列计算的方式，对于矩阵的访问来说是跨行访问的，因此随着任务规模的增加，由于空间局部性的限制，会产生比较严重的cache miss，因此在访存的时候需要到内存中或者其他线程的cache中去访问，这会导致很严重的访存开销。通过perf分析两种方式的L1 dcache miss情况，如图所示，这也能够很好的印证我们的推断。
118 | 
119 | 而从时间局部性考虑，这两种方法都会受到时间局部性的制约。这是由于随着任务的推进，任务执行的区域逐渐缩减，因此对于每个线程而言，其被分配的任务行号或者列号可能每次都不一样，因此就不能够利用上一轮计算中已经缓存到cache中的数据。因此可以从这个角度出发进一步改良算法。
120 | 
121 | ### 线程数量对比
122 | 
123 | 在本次实验中，探究了不同线程数量下，openmp多线程算法的性能表现，其变化趋势大致如下图所示。从图像中可以看出，这三种划分方式的加速比都随着线程数量的增加呈现一个线性增加的趋势，说明openmp的多线程算法具有很好的扩展性。
124 | 
125 | ### 线程管理对比
126 | 
127 | 在本次实验中，探究了动态线程创建和静态线程创建这两种线程管理方式的性能对比。实验结果如图所示。动态线程创建的方式就是在每一轮消元的过程中，重新创建线程，然后进行任务的划分，而静态线程创建的方式则是在最初一次性创建好全部线程，然后在每一轮中重新进行任务的划分。动态线程创建的方式适合于在不同的阶段需要处理完全不同的任务的情况，而静态线程创建的方式适合于各个阶段所需要处理的任务近似相同的情况。因此在本次实验中，更适合采用静态线程创建的方式，从而减小线程创建、初始化以及任务划分的额外开销，而这种开销相对于简单的计算而言是比较大的。这也可以从实验数据中得到印证。
128 | 
129 | ## x86平台迁移
130 | 
131 | ### 多种SIMD指令集融合
132 | 
133 | 基于前文在ARM平台上对于openmp多线程编程的探究，在本次实验中还将openmp多线程优化方法迁移到x86平台上，做同样的实验探究。由于x86平台上拥有更多的SIMD指令集架构，因此实验中分别探究了SSE、AVX和AVX512三种指令集架构配合openmp多线程的优化效果，测量在不同问题规模下的运行时间，如下表所示。可以看出，openmp多线程可以结合多种SIMD指令集架构，并且在各种指令集架构上的表现基本保持稳定，并没有出现在某种指令集架构下不能够发挥很好的多线程优势的现象。
134 | 
135 | 从数据中可以看出，随着问题规模的增加，SSE、AVX和AVX512三种指令集架构的加速比均保持在一个比较稳定的水平上，这说明向量化的优化已经达到了一个瓶颈，而之所以没有能够达到理论加速比是因为整个程序并不能完全进行向量化的展开，因此其余未能够向量化的部分极大的影响了整体的加速比。
136 | 
137 | 而横向对比同一指令集架构下的SIMD和结合了SIMD的openmp方法，则可以发现，由于在实验中总共拉起了8条线程，因此随着问题规模的增加，当问题规模达到3000时，这三种指令集架构的openmp算法相对于SIMD算法的加速比已经能够逼近8倍了，说明在devcloud平台上，多线程的性能能够得到充分的释放。
138 | 
139 | ### 不同任务划分方式对比
140 | 
141 | openmp为任务划分提供了三种选项：static、dynamic和guided。static方法是在线程创建完成之后，就明确划分了任务，dynamic方法则是在线程执行的过程中去动态的划分任务，而guided方法则是随着任务的推进逐步缩减任务划分的粒度。在x86平台的实验中，我们对比了这三种方法在数据规模为1000时的性能表现，并将使用VTune性能分析工具对于这三种方式的CPU占用时间进行监测，得到对比图如下。
142 | 
143 | 从图中可以看出，static方式的CPU占用时间分配十分的不均衡，这也体现了这种方法会受到负载不均的制约，因此其运行时间较长，为178.4ms。在本次实验中，dynamic方式的CPU占用时间分布较为均衡，而且通过实验测量程序的运行时间可以发现dynamic方式所消耗的时间最少，为169.2ms。对于guided方法，在本次实验中并没有能够表现出很好的性能，其消耗的时间略高于dynamic方式但仍低于static方式，说明其在一定程度上可以减轻负载不均的影响。
144 | 
145 | ## GPU任务卸载
146 | 
147 | 在本次实验中还尝试了在Devcloud平台上进行GPU运算卸载，将Gauss消元的消元过程卸载到GPU，利用GPU运算单元加速运算，具体代码如下。
148 | 
149 | 在调整数据规模进行多次实验之后，发现offloading方式的性能表现甚至差于普通的串行算法。分析其原因，可能是因为在不同消元轮次之间和每一轮中除法操作和消元操作之间都存在着比较严重的数据依赖，因此会造成GPU各个运算单元之间需要不断地通过CPU进行通信和同步，而这种通信的开销是非常大的。因此当数据规模比较小的时候，会严重地影响性能。笔者猜测这种优化的效果可能会在数据规模很大的情况下，并且尽可能减少数据依赖后体现出来。
150 | 
151 | # 总结
152 | 
153 | 在本次openmp多线程并行实验中，基于Gauss消元问题，对于过程中的消元操作采用openmp多线程优化，并在线程内进行除法或者减法运算时，结合SIMD向量化处理，在ARM平台上采用openmp+neon的方式，在x86平台上采用openmp+SSE/AVX/AVX512的方式，探究了其并行优化效果。除此之外，还基于cache的空间局部性和时间局部性对比了行划分和列划分的性能差异，基于负载均衡考虑对比了openmp提供的三种schedule方式的性能差异，通过实验可以验证，考虑了cache特性的行划分方式和考虑了负载均衡的动态划分方式均能够取得一定的性能提升。实验还探究了开启线程的数量同优化性能之间的关系，并结合实验平台的硬件参数进行合理假设和分析。在实验的过程中，利用perf和VTune等性能分析工具，对于深层次的内核和硬件事件进行分析，从底层的角度解释了表面上性能差异的原因。本次实验中第一次尝试了通过offloading的方式将部分运算卸载到GPU上进行加速，但是由于数据依赖导致的通信开销，并未能够取得很好的效果，后续会对相关的问题进行进一步研究和优化。


--------------------------------------------------------------------------------
/Lab4/report/result_arm.csv:
--------------------------------------------------------------------------------
 1 | N,serial,SIMD,openmp_single_SIMD,pthread,static,dynamic,guided,guided_nowait,openmp+SIMD,static_thread,dynamic_thread,row,colum,,N,serial,SIMD,openmp_single_SIMD,pthread,openmp_pthread,,,,,,,
 2 | 100,2.65594,1.71756,2.04062,2.86874,0.80802,1.6149,1.11228,0.85638,1.0763,1.1156,1.25274,1.1157,2.58,,100,2.66,1.72,2.04,2.87,1.08,,,,,,,
 3 | 200,20.9783,13.4111,15.749,6.88256,3.11922,5.43658,4.12146,3.5104,3.7472,4.16408,4.43766,4.17048,13.0417,,200,20.98,13.41,15.75,6.88,3.75,,,,,,,
 4 | 300,70.9989,45.0177,53.0444,14.9859,8.66504,12.5394,10.5173,9.62818,9.34518,10.5892,10.9585,10.6003,22.6931,,300,71,45.02,53.04,14.99,9.35,,,,,,,
 5 | 400,169.704,107.465,126.626,29.2214,18.6185,25.0377,22.2336,23.8745,20.8678,24.0723,23.7118,24.1693,87.3793,,400,169.7,107.47,126.63,29.22,20.87,,,,,,,
 6 | 500,340.583,214.38,252.674,51.549,35.8131,49.8718,44.3103,44.9247,39.0818,44.315,42.2139,44.3444,122.154,,500,340.58,214.38,252.67,51.55,39.08,,,,,,,
 7 | 600,597.55,375.187,441.189,83.7673,59.6132,77.077,70.8986,74.1502,61.6912,70.7942,68.9358,71.0824,251.968,,600,597.55,375.19,441.19,83.77,61.69,,,,,,,
 8 | 700,952.116,595.738,702.35,122.307,92.8821,120.111,112.173,120.041,99.9264,111.31,105.411,111.482,290.221,,700,952.12,595.74,702.35,122.31,99.93,,,,,,,
 9 | 800,1425.64,890.338,1041.39,176.05,136.469,174.442,163.35,176.606,145.936,162.564,153.963,163.255,530.797,,800,1425.64,890.34,1041.39,176.05,145.94,,,,,,,
10 | 900,2044.56,1263.94,1486.4,245.218,193.378,245.197,229.589,247.571,205.548,229.23,221.286,230.074,606.942,,900,2044.56,1263.94,1486.4,245.22,205.55,,,,,,,
11 | 1000,2737.41,1711.53,2011.04,372.966,262.921,328.06,309.898,334.606,275.593,307.483,288.842,308.307,1044.79,,1000,2737.41,1711.53,2011.04,372.97,275.59,,,,,,,
12 | 1500,9493.09,5741.21,6772.34,976.941,921.541,1148.44,1032.27,1079.94,887.126,1009.96,981.762,1012.49,2286.33,,1500,9493.09,5741.21,6772.34,976.94,887.13,,,,,,,
13 | 2000,23178.9,13670.6,16068.5,2215.51,2234.29,2481.04,2359.16,2454.23,2051.97,2344.34,2226.36,2366.24,6471.24,,2000,23178.9,13670.6,16068.5,2215.51,2051.97,,,,,,,
14 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
15 | N,serial,SIMD,openmp_single_SIMD,pthread,static,dynamic,guided,guided_nowait,openmp+SIMD,static_thread,dynamic_thread,row,colum,,,,,,,,,,,,,,
16 | 100,,1.546344815,1.301535808,0.925821092,3.286973095,1.644646727,2.387833999,3.101356874,2.467657716,2.380727859,2.12010473,2.380514475,1.029434109,,,,,,,,,,,,,,
17 | 200,,1.564249018,1.33204013,3.048037358,6.725495476,3.85873104,5.090016645,5.976042616,5.598393467,5.03791954,4.727333775,5.030188372,1.608555633,,,,,,,,,,,,,,
18 | 300,,1.577132994,1.338480594,4.737713451,8.193718667,5.662065171,6.750677455,7.374072774,7.597381752,6.704840781,6.478888534,6.697819873,3.128655847,,,,,,,,,,,,,,
19 | 400,,1.579156004,1.340198695,5.807524622,9.114805167,6.777938868,7.632772021,7.108169805,8.13233786,7.04976259,7.156942957,7.021469385,1.942153348,,,,,,,,N,serial,static,dynamic,guided,static_thread,dynamic_thread
20 | 500,,1.58868831,1.347914704,6.606975887,9.5100117,6.829169992,7.686316725,7.581196981,8.714619081,7.685501523,8.068029725,7.680406094,2.788144473,,,,,,,,2,2737.41,1032,1028,1031.5,1031.9,1020
21 | 600,,1.592672454,1.354408201,7.133451836,10.02378668,7.752636973,8.428234126,8.058643132,9.686146484,8.440663218,8.668210132,8.406440976,2.371531306,,,,,,,,3,2737.41,691,864,777,774,692
22 | 700,,1.598212637,1.355614722,7.78464029,10.25080182,7.926967555,8.487924902,7.93159004,9.528172735,8.553732818,9.032415972,8.540535692,3.280658533,,,,,,,,4,2737.41,511,524,519,518,511.5
23 | 800,,1.601234587,1.368978001,8.097926725,10.44662158,8.172573119,8.7275176,8.07243242,9.768939809,8.769715312,9.259627313,8.732596245,2.685847885,,,,,,,,5,2737.41,414,495,473,469,467
24 | 900,,1.617608431,1.375511302,8.337723984,10.57286765,8.338438072,8.905304697,8.258479386,9.946873723,8.919251407,9.239445785,8.886532159,3.368625009,,,,,,,,6,2737.41,340,373,398,395,391
25 | 1000,,1.599393525,1.361191224,8.539569827,10.41153046,8.344235811,8.833261267,8.180994961,9.932799454,8.90263852,9.477188221,8.878844788,2.620057619,,,,,,,,7,2737.41,293,318.3,318.5,317.4,312
26 | 1500,,1.653499872,1.401744449,9.717157945,10.30132137,8.266073979,9.196324605,8.790386503,10.70094891,9.399471266,9.66944127,9.37598396,4.152108401,,,,,,,,8,2737.41,262.921,314.2,313.1,313.2,309.2
27 | 2000,,1.695529092,1.442505523,10.46210579,10.37416808,9.342412859,9.825064854,9.444469345,11.29592538,9.887175068,10.4111195,9.795667388,3.581832848,,,,,,,,,,,,,,
28 | ,,,,,,,,,,,,,,,,,,,,,N,serial,static,dynamic,guided,static_thread,dynamic_thread
29 | ,,,,,,,,,,,,,,,,,,,,,2,,2.65252907,2.662850195,2.653814833,2.652786123,2.683735294
30 | ,,,,,,,,,,,,,,,,,,,,,3,,3.961519537,3.168298611,3.523050193,3.536705426,3.955794798
31 | ,,,,,,,,,,,,,,,,,,,,,4,,5.356966732,5.224064885,5.274393064,5.28457529,5.351730205
32 | ,,,,,,,,,,,,,,,,,,,,,5,,6.612101449,5.530121212,5.787336152,5.836695096,5.861691649
33 | ,,,,,,,,,,,,,,,,,,,,,6,,8.051205882,7.338900804,6.877914573,6.930151899,7.001048593
34 | ,,,,,,,,,,,,,,,,,,,,,7,,9.342696246,8.600094251,8.594693878,8.624480151,8.77375
35 | ,,,,,,,,,,,,,,,,,,,,,8,,10.41153046,8.712316996,8.742925583,8.7401341,8.853201811
36 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
37 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
38 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
39 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
40 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
41 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
42 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
43 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
44 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
45 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
46 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
47 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
48 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
49 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
50 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
51 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
52 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
53 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
54 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
55 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
56 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
57 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
58 | ,,SIMD,openmp_SIMD,,,,,,,,,row,colum,,,,,,,,,,,,,,
59 | ,instructions,3.39%,7.80%,,,,,,,,L1-dcache-miss,3%,64%,,,,,,,,,,,,,,
60 | ,cycles,4.97%,6.03%,,,,,,,,,,,,,,,,,,,,,,,,
61 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
62 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
63 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
64 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
65 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
66 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
67 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
68 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
69 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
70 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
71 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
72 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
73 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
74 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
75 | ,,,,,,,,,,,,,,,,,,,,,,,,,,,
76 | ,2.38 ,2.12 ,,3.35%,5.35%,,,,,,,,,,,,,,,,,,,,,,
77 | ,5.04 ,4.73 ,,3.21%,5.13%,,,,,,,,,,,,,,,,,,,,,,
78 | ,6.70 ,6.48 ,,2.89%,4.82%,,,,,,,,,,,,,,,,,,,,,,
79 | ,7.05 ,7.16 ,,2.54%,4.51%,,,,,,,,,,,,,,,,,,,,,,
80 | ,7.69 ,8.07 ,,2.21%,4.23%,,,,,,,,,,,,,,,,,,,,,,
81 | ,8.44 ,8.67 ,,1.98%,4.01%,,,,,,,,,,,,,,,,,,,,,,
82 | ,8.55 ,9.03 ,,1.50%,3.89%,,,,,,,,,,,,,,,,,,,,,,
83 | ,8.77 ,9.26 ,,1.32%,3.82%,,,,,,,,,,,,,,,,,,,,,,
84 | ,8.92 ,9.24 ,,0.89%,3.79%,,,,,,,,,,,,,,,,,,,,,,
85 | ,8.90 ,9.48 ,,<0.1%,3.68%,,,,,,,,,,,,,,,,,,,,,,
86 | ,9.40 ,9.67 ,,<0.1%,3.62%,,,,,,,,,,,,,,,,,,,,,,
87 | ,9.89 ,10.41 ,,<0.1%,3.56%,,,,,,,,,,,,,,,,,,,,,,
88 | 


--------------------------------------------------------------------------------
/Lab4/report/result_x86.csv:
--------------------------------------------------------------------------------
 1 | N,serial,SIMD,openmp_single_SIMD,pthread,static,dynamic,guided,guided_nowait,openmp+SIMD,static_thread,dynamic_thread,row,colum
 2 | 100,1.28382,0.6914,1.11928,5.68214,0.98946,1.10512,0.89884,0.8297,0.88666,0.89258,1.16444,0.90122,7.02926
 3 | 200,9.84506,5.27498,8.08128,12.9701,2.50868,2.94168,2.65674,2.53786,2.47448,2.67136,3.16778,2.65918,26.7435
 4 | 300,33.2587,17.7215,26.7237,25.4901,5.90436,6.7098,6.50022,5.93614,5.49856,6.40806,7.2573,6.3096,81.2164
 5 | 400,78.057,41.5094,62.5016,37.4657,9.56804,13.1526,12.1024,11.7268,10.0007,12.1149,13.2091,12.1108,126.437
 6 | 500,153.204,81.82,122.036,46.8573,17.3326,22.9469,21.4389,20.9759,16.9808,21.4049,22.6868,21.4429,240.44
 7 | 600,264.586,142.789,211.327,67.0099,32.5289,36.4944,34.9419,34.2271,26.7428,34.8158,36.39,34.8527,316.888
 8 | 700,419.474,224.398,334.133,93.9803,47.9082,55.2311,52.6686,51.8387,39.3955,52.56,54.4266,52.5974,495.138
 9 | 800,626.831,335.78,498.492,131.594,68.2243,78.8986,75.836,74.7731,55.689,75.8335,77.8991,75.6808,600.007
10 | 900,890.119,475.649,707.011,169.6,91.8144,109.291,105.437,104.203,76.7005,107.641,108.297,105.17,858.473
11 | 1000,1221.11,652.026,970.062,221.495,128.331,146.479,141.865,140.325,104.363,141.702,144.311,143.603,973.993
12 | 1000,1220.79,652.808,969.457,223.658,125.762,148.221,142.599,140.292,102.333,141.678,144.568,141.52,983.234
13 | 1500,4116.48,2195.12,3263.27,573.421,423.402,461.907,451.065,448.478,317.682,453.959,459.395,450.445,2786.84
14 | 2000,9736.62,5186.1,7706.22,1090.24,981.088,1048.14,1033.67,1023.09,724.363,1030.51,1034.1,1026.15,5483.45
15 | 2500,19017.7,10203.4,15072.2,2098.49,1915.91,2008.4,1973.23,1966.18,1362.62,1978.86,1982.57,1966.04,9972.7
16 | 3000,32890,17535.9,26051.6,3447.47,3297.82,3429.42,3382.24,3374.12,2344.9,3389.17,3391.92,3392.95,16712.9
17 | ,,,,,,,,,,,,,
18 | N,serial,SIMD,openmp_single_SIMD,pthread,static,dynamic,guided,guided_nowait,openmp+SIMD,static_thread,dynamic_thread,row,colum
19 | 100,,1.856841192,1.147005218,0.225939523,1.297495604,1.161701897,1.428307596,1.54733036,1.44792818,1.438324856,1.102521384,1.424535629,0.182639424
20 | 200,,1.866369162,1.218255029,0.759058141,3.924398488,3.346747437,3.705691938,3.879276241,3.978637936,3.685411176,3.107873653,3.702291684,0.368129078
21 | 300,,1.876742939,1.244539491,1.304769303,5.632905175,4.956734925,5.116549901,5.602748587,6.048620002,5.190135548,4.582792499,5.271126537,0.409507193
22 | 400,,1.880465629,1.248880029,2.08342564,8.158097165,5.934720131,6.449712454,6.656291571,7.805153639,6.443057722,5.909335231,6.44523896,0.617358843
23 | 500,,1.872451723,1.255400046,3.269586596,8.839066268,6.676457386,7.146075592,7.303810564,9.022189767,7.157426571,6.753001746,7.144742549,0.637181833
24 | 600,,1.852985874,1.252021748,3.948461347,8.133874801,7.250043842,7.572169802,7.730307271,9.893728405,7.599595586,7.270843638,7.591549579,0.83495115
25 | 700,,1.869330386,1.255410271,4.463424782,8.75578711,7.594887663,7.964403838,8.091908169,10.64776434,7.98085997,7.707150548,7.975185085,0.847186037
26 | 800,,1.866790756,1.257454483,4.76337067,9.187796723,7.944767081,8.265612638,8.383108364,11.25592128,8.26588513,8.046704006,8.282563081,1.044706145
27 | 900,,1.871377844,1.258988898,5.24834316,9.694764656,8.144485822,8.442188226,8.542162894,11.60512643,8.26933046,8.219239683,8.463620804,1.036863128
28 | 1000,,1.872793416,1.25879583,5.513036412,9.515315863,8.336416824,8.607549431,8.702013184,11.7006027,8.617450706,8.46165573,8.503373885,1.253715376
29 | 1000,,1.870059803,1.259251313,5.458288995,9.707145243,8.236282308,8.560999727,8.701779146,11.92958283,8.616651844,8.444399867,8.626271905,1.241606779
30 | 1500,,1.875287,1.261458598,7.178809287,9.722391486,8.911923829,9.126134814,9.178778,12.95786352,9.067955476,8.960654774,9.138696178,1.477113864
31 | 2000,,1.877445479,1.263475478,8.930712504,9.924308523,9.289426985,9.419466561,9.516875348,13.44163078,9.448350817,9.415549753,9.488495834,1.7756376
32 | 2500,,1.863859106,1.261773331,9.062564034,9.926196951,9.469079865,9.637852658,9.672410461,13.95671574,9.610432269,9.592448186,9.673099225,1.906976045
33 | 3000,,1.875580951,1.262494434,9.540329575,9.973255059,9.590543007,9.724324708,9.747726815,14.02618449,9.70444091,9.696573032,9.693629438,1.967940932
34 | ,,,,,,,,,,,,,
35 | ,,,,,,,,,,,,,
36 | N,SSE,AVX,AVX512,openmp_SSE,openmp_AVX,openmp_AVX512,,,,,,,
37 | 100,1.86 ,3.31 ,6.25 ,1.45 ,2.58 ,4.87 ,,,,,,,
38 | 200,1.87 ,3.32 ,6.28 ,3.98 ,7.08 ,13.38 ,,,,,,,
39 | 300,1.88 ,3.34 ,6.31 ,6.05 ,10.77 ,20.35 ,,,,,,,
40 | 400,1.88 ,3.35 ,6.33 ,7.81 ,13.89 ,26.26 ,,,,,,,
41 | 500,1.87 ,3.33 ,6.30 ,9.02 ,16.06 ,30.35 ,,,,,,,
42 | 600,1.85 ,3.30 ,6.23 ,9.89 ,17.61 ,33.28 ,,,,,,,
43 | 700,1.87 ,3.33 ,6.29 ,10.65 ,18.95 ,35.82 ,,,,,,,
44 | 800,1.87 ,3.32 ,6.28 ,11.26 ,20.04 ,37.87 ,,,,,,,
45 | 900,1.87 ,3.33 ,6.30 ,11.61 ,20.66 ,39.04 ,,,,,,,
46 | 1000,1.87 ,3.33 ,6.30 ,11.70 ,20.83 ,39.36 ,,,,,,,
47 | 1000,1.87 ,3.33 ,6.29 ,11.93 ,21.23 ,40.13 ,,,,,,,
48 | 1500,1.88 ,3.34 ,6.31 ,12.96 ,23.06 ,43.59 ,,,,,,,
49 | 2000,1.88 ,3.34 ,6.32 ,13.44 ,23.93 ,45.22 ,,,,,,,
50 | 2500,1.86 ,3.32 ,6.27 ,13.96 ,24.84 ,46.95 ,,,,,,,
51 | 3000,1.88 ,3.34 ,6.31 ,14.03 ,24.97 ,47.19 ,,,,,,,
52 | 


--------------------------------------------------------------------------------
/Lab4/report/rowcol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab4/report/rowcol.png


--------------------------------------------------------------------------------
/Lab4/report/schedule.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab4/report/schedule.png


--------------------------------------------------------------------------------
/Lab4/report/threads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab4/report/threads.png


--------------------------------------------------------------------------------
/Lab4/report/x86.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab4/report/x86.png


--------------------------------------------------------------------------------
/Lab4/x86/offLoading.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <fstream>
  3 | #include <math.h>
  4 | #include <sys/time.h>
  5 | #include <xmmintrin.h> // SSE
  6 | #include <pthread.h>   // pthread
  7 | #include <semaphore.h>
  8 | #include <omp.h>
  9 | 
 10 | #define _PRINT
 11 | // #define _TEST
 12 | 
 13 | using namespace std;
 14 | 
// Thread count constant. It is not referenced by any function in this file.
int NUM_THREADS = 8;
// ============================================== pthread thread-control variables ==============================================
// Per-thread argument record (thread id only). Not used by any function in this file.
typedef struct
{
    int t_id;
} threadParam_t;

// Synchronisation objects declared for a pthread variant; not used by any function in this file.
sem_t sem_Division;
pthread_barrier_t barrier;
// ============================================== computation variables ==============================================
// Current matrix dimension (N x N); assigned by test() before every run.
int N;
// Scale factor applied to the random values generated in init_data().
const int L = 100;
// Number of timed repetitions per algorithm in test().
const int LOOP = 1;
// Reference input generated once per problem size by init_data().
float **data;
// Working copy that the elimination routines overwrite; refreshed from `data` by init_matrix().
float **matrix;

// Output stream for the CSV results; only opened and written when _TEST is defined.
ofstream res_stream;

void init_data();
void init_matrix();
void calculate_serial();
// Declared here but not defined anywhere in this file.
void calculate_SIMD();
void calculate_openmp_offloading(float*);
void print_matrix();
void test(int);
void print_result(int);
 42 | int main()
 43 | {
 44 |     #ifdef _TEST
 45 |     res_stream.open("result.csv", ios::out);
 46 |     for (int i = 100; i <= 1000; i += 100)
 47 |         test(i);
 48 |     for (int i = 1000; i <= 3000; i += 500)
 49 |         test(i);
 50 |     res_stream.close();
 51 |     #endif
 52 |     #ifdef _PRINT
 53 |         test(10);
 54 |     #endif
 55 |     system("pause");
 56 |     return 0;
 57 | }
 58 | 
 59 | void init_data()
 60 | {
 61 |     data = new float *[N], matrix = new float *[N];
 62 |     for (int i = 0; i < N; i++)
 63 |         data[i] = new float[N], matrix[i] = new float[N];
 64 |     for (int i = 0; i < N; i++)
 65 |         for (int j = i; j < N; j++)
 66 |             data[i][j] = rand() * 1.0 / RAND_MAX * L;
 67 |     for (int i = 0; i < N - 1; i++)
 68 |         for (int j = i + 1; j < N; j++)
 69 |             for (int k = 0; k < N; k++)
 70 |                 data[j][k] += data[i][k];
 71 | }
 72 | 
 73 | // 用data初始化matrix，保证每次进行计算的数据是一致的
 74 | void init_matrix()
 75 | {
 76 |     for (int i = 0; i < N; i++)
 77 |         for (int j = 0; j < N; j++)
 78 |             matrix[i][j] = data[i][j];
 79 | }
 80 | 
 81 | // 串行算法
 82 | void calculate_serial()
 83 | {
 84 |     for (int k = 0; k < N; k++)
 85 |     {
 86 |         for (int j = k + 1; j < N; j++)
 87 |         {
 88 |             matrix[k][j] = matrix[k][j] / matrix[k][k];
 89 |         }
 90 |         matrix[k][k] = 1;
 91 |         for (int i = k + 1; i < N; i++)
 92 |         {
 93 |             for (int j = k + 1; j < N; j++)
 94 |             {
 95 |                 matrix[i][j] = matrix[i][j] - matrix[i][k] * matrix[k][j];
 96 |             }
 97 |             matrix[i][k] = 0;
 98 |         }
 99 |     }
100 | }
101 | 
// OpenMP target-offloading version of the elimination.
//
// buffer: the N x N matrix stored contiguously in row-major order
//         (element (r, c) is buffer[r*N + c]); it is updated in place.
//
// After the target region finishes, prints "CPU" if the region executed on
// the initial (host) device and "GPU" otherwise.
void calculate_openmp_offloading(float * buffer)
{
    int is_cpu = true;
    float * buf = buffer;
    // Map the whole matrix to the device and back, copy N in, and bring the
    // device-detection flag back out.
#pragma omp target map(tofrom: buf[0:N*N]) map(from: is_cpu) map(to: N)
    {
        int i, j, k;
        // Record where the target region is actually executing.
        is_cpu = omp_is_initial_device();

        // The pivot loop itself is sequential; a new parallel region is
        // opened for every pivot k.
        for (k = 0; k < N; k++) {
#pragma omp parallel default(none), private(i, j), shared(buf, N, k)
            {
                // Division step: executed by a single thread. No nowait
                // clause is given, so the other threads wait at the end of
                // the single construct before starting the elimination.
#pragma omp single
                {
                    for (j = k + 1; j < N; j++) {
                        buf[k*N+j] = buf[k*N+j] / buf[k*N+k];
                    }
                    buf[k*N+k] = 1;
                }
                // Elimination step: the rows below the pivot are distributed
                // over the threads of the team.
#pragma omp for simd
                for (i = k + 1; i < N; i++) {
                    for (j = k + 1; j < N; j++) {
                        buf[i*N+j] = buf[i*N+j] - buf[i*N+k] * buf[k*N+j];
                    }
                    buf[i*N+k] = 0;
                }
            }
        }
    }
    cout<<(is_cpu? "CPU":"GPU")<<endl;
}
134 | 
135 | // 打印矩阵
136 | void print_matrix()
137 | {
138 |     for (int i = 0; i < N; i++)
139 |     {
140 |         for (int j = 0; j < N; j++)
141 |         {
142 |             printf("%.2f ", matrix[i][j]);
143 |         }
144 |         printf("\n");
145 |     }
146 | }
147 | 
148 | void test(int n)
149 | {
150 |     N = n;
151 |     cout << "=================================== " << N << " ===================================" << endl;
152 |     #ifdef _TEST
153 |     res_stream << N;
154 |     #endif
155 |     struct timeval start;
156 |     struct timeval end;
157 |     float time = 0;
158 |     init_data();
159 |     // ====================================== serial ======================================
160 |     time = 0;
161 |     for (int i = 0; i < LOOP; i++)
162 |     {
163 |         init_matrix();
164 |         gettimeofday(&start, NULL);
165 |         calculate_serial();
166 |         gettimeofday(&end, NULL);
167 |         time += ((end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec)) * 1.0 / 1000;
168 |     }
169 |     cout << "serial:" << time / LOOP << "ms" << endl;
170 |     print_result(time);
171 |     // ====================================== openmp offloading ======================================
172 |     time = 0;
173 |     for (int i = 0; i < LOOP; i++)
174 |     {
175 |         init_matrix();
176 |         float * buffer = new float[N*N];
177 |         for(int i = 0; i < N*N; i++)
178 |             buffer[i] = matrix[i/N][i%N];
179 |         gettimeofday(&start, NULL);
180 |         calculate_openmp_offloading(buffer);
181 |         gettimeofday(&end, NULL);
182 |         // 将buffer复制回matrix
183 |         for(int i = 0; i < N*N; i++)
184 |             matrix[i/N][i%N] = buffer[i];
185 |         time += ((end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec)) * 1.0 / 1000;
186 |     }
187 |     cout << "openmp_offloading:" << time / LOOP << "ms" << endl;
188 |     print_result(time);
189 |     #ifdef _TEST
190 |     res_stream << endl;
191 |     #endif
192 | }
193 | 
/*
 * Report the outcome of one benchmarked algorithm.
 *
 * time: accumulated run time in milliseconds (integer; the division by LOOP
 *       below is an integer division).
 *
 * With _TEST defined, appends ",<average time>" to the CSV stream.
 * With _PRINT defined, dumps the current working matrix to stdout.
 */
void print_result(int time)
{
    #ifdef _TEST
    const int average = time / LOOP;
    res_stream << "," << average;
    #endif
    #ifdef _PRINT
    print_matrix();
    #endif
}


--------------------------------------------------------------------------------
/Lab4/x86/result.csv:
--------------------------------------------------------------------------------
1 | 1000


--------------------------------------------------------------------------------
/Lab5/CMakeLists.txt:
--------------------------------------------------------------------------------
# Build script for Lab5: the MPI Gauss elimination programs plus a scratch test program.
cmake_minimum_required(VERSION 3.21)
project(Lab5)

set(CMAKE_CXX_STANDARD 14)
# Link executables statically.
set(CMAKE_EXE_LINKER_FLAGS -static)
# NOTE: this assignment replaces any CMAKE_CXX_FLAGS value supplied from outside
# (command line / toolchain) instead of appending to it.
set(CMAKE_CXX_FLAGS -march=native)

find_package(MPI REQUIRED)

# OpenMP is required; its compile and link flags are appended to the global flags.
find_package(OpenMP REQUIRED)
if(OPENMP_FOUND)
    message("OPENMP FOUND")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()

# Make the MPI headers visible to every target declared below.
include_directories(SYSTEM ${MPI_INCLUDE_PATH})

add_executable(x86 x86/Gauss.cpp)
add_executable(arm arm/Gauss.cpp)
add_executable(test test.cpp)

# Only the two Gauss programs are linked against MPI; `test` is not.
target_link_libraries(x86 ${MPI_LIBRARIES})
target_link_libraries(arm ${MPI_LIBRARIES})


--------------------------------------------------------------------------------
/Lab5/arm/am.csv:
--------------------------------------------------------------------------------
 1 |  ,serial,MPI_block,MPI_cycle,MPI_pipeline,MPI_SIMD,MPI_OMP,MPI_OMP_SIMD,,MPI_block,MPI_cycle,MPI_pipeline,MPI_SIMD,MPI_OMP,MPI_OMP_SIMD
 2 | 100,0.33862,0.90476,3.77945,3.65523,3.5666,3.91236,4.06474,,0.374264998,0.089595047,0.092639861,0.094941962,0.08655134,0.083306681
 3 | 200,2.7185,3.72827,10.1691,10.3002,10.0733,10.5972,10.7221,,0.729158564,0.267329459,0.263926914,0.269871839,0.256530027,0.25354175
 4 | 300,9.10204,10.4317,20.3652,20.3559,19.5031,20.9339,19.8682,,0.872536595,0.44694086,0.447145054,0.466697089,0.434799058,0.458121018
 5 | 400,21.4773,21.3106,42.0001,41.9716,39.2503,41.1116,38.5392,,1.007822398,0.511363068,0.511710299,0.547188174,0.522414598,0.557284531
 6 | 500,42.0603,38.6061,65.8516,65.8022,60.0661,62.6272,57.0654,,1.089472907,0.63871341,0.639192915,0.700233576,0.671597964,0.737054327
 7 | 600,72.9207,63.2224,96.6892,96.792,83.653,86.4704,75.4285,,1.153399744,0.754176268,0.753375279,0.871704541,0.843302448,0.96675262
 8 | 700,116.331,97.8989,134.915,135.279,112.542,111.4,92.7622,,1.188276886,0.862254012,0.859933914,1.033667431,1.044263914,1.254077631
 9 | 800,174.761,143.845,185.646,185.523,147.853,136.999,114.965,,1.214925788,0.941366903,0.94199102,1.181991573,1.275637048,1.520123516
10 | 900,250.14,203.481,248.521,247.211,191.613,164.244,139.9,,1.229303964,1.00651454,1.011848178,1.305443785,1.522978008,1.787991422
11 | 1000,346.754,276.217,316.976,315.751,245.347,212.05,166.701,,1.255368062,1.093944021,1.098188129,1.413320725,1.635246404,2.0800955
12 | 1500,1176.88,912.376,891.533,893.707,679.538,456.254,339.622,,1.289906793,1.320063307,1.316852167,1.731882544,2.579440399,3.465264323
13 | 2000,2833.69,2107.88,1941.73,1944.59,1435.86,896.433,603.925,,1.344331746,1.459363557,1.457217203,1.973514131,3.16107283,4.692122366
14 | 2500,5568.77,4081.31,3571.36,3571.11,2603.29,2083.57,1176.75,,1.36445651,1.559285538,1.559394698,2.139127796,2.672705981,4.732330571
15 | 3000,9726.77,7002.86,5951.89,5948.32,4320.35,2268.49,1387.72,,1.388971078,1.634232151,1.635212968,2.251384726,4.287772924,7.00917332
16 | ,,,,,,,,,,,,,,
17 | ,,,,,,,,,,,,,,
18 | N,MPI,MPI_SIMD,MPI_OMP,MPI_OMP_SIMD,,,,,,,,,,
19 | 100,0.374264998,0.094941962,0.08655134,0.083306681,,,,,,,,,,
20 | 200,0.729158564,0.269871839,0.256530027,0.25354175,,,,,,,,,,
21 | 300,0.872536595,0.466697089,0.434799058,0.458121018,,,,,,,,,,
22 | 400,1.007822398,0.547188174,0.522414598,0.557284531,,,,,,,,,,
23 | 500,1.089472907,0.700233576,0.671597964,0.737054327,,,,,,,,,,
24 | 600,1.153399744,0.871704541,0.843302448,0.96675262,,,,,,,,,,
25 | 700,1.188276886,1.033667431,1.044263914,1.254077631,,,,,,,,,,
26 | 800,1.214925788,1.181991573,1.275637048,1.520123516,,,,,,,,,,
27 | 900,1.229303964,1.305443785,1.522978008,1.787991422,,,,,,,,,,
28 | 1000,1.255368062,1.413320725,1.635246404,2.0800955,,,,,,,,,,
29 | 1500,1.289906793,1.731882544,2.579440399,3.465264323,,,,,,,,,,
30 | 2000,1.344331746,1.973514131,3.16107283,4.692122366,,,,,,,,,,
31 | 2500,1.36445651,2.139127796,2.672705981,4.732330571,,,,,,,,,,
32 | 3000,1.388971078,2.251384726,4.287772924,7.00917332,,,,,,,,,,
33 | ,,,,,,,,,,,,,,
34 | ,,,,,,,,,,,,,,
35 | ,,,,,,,,,,,,,,
36 | N,MPI,MPI_OMP,K,,,,,,,,,,,
37 | 100,0.374264998,0.08655134,0.231256838,,,,,,,,,,,
38 | 200,0.729158564,0.256530027,0.351816518,,,,,,,,,,,
39 | 300,0.872536595,0.434799058,0.498316129,,,,,,,,,,,
40 | 400,1.007822398,0.522414598,0.518359781,,,,,,,,,,,
41 | 500,1.089472907,0.671597964,0.616443015,,,,,,,,,,,
42 | 600,1.153399744,0.843302448,0.731144993,,,,,,,,,,,
43 | 700,1.188276886,1.044263914,0.878805206,,,,,,,,,,,
44 | 800,1.214925788,1.275637048,1.049971167,,,,,,,,,,,
45 | 900,1.229303964,1.522978008,1.238894572,,,,,,,,,,,
46 | 1000,1.255368062,1.635246404,1.30260316,,,,,,,,,,,
47 | 1500,1.289906793,2.579440399,1.999710687,,,,,,,,,,,
48 | 2000,1.344331746,3.16107283,2.351408303,,,,,,,,,,,
49 | 2500,1.36445651,2.672705981,1.95880628,,,,,,,,,,,
50 | 3000,1.388971078,4.287772924,3.087013828,,,,,,,,,,,
51 | ,,,,,,,,,,,,,,
52 | ,,,,,,,,,,,,,,
53 | N,MPI_block,MPI_cycle,,,,,,,,,,,,
54 | 100,0.374264998,0.089595047,,,,,,,,,,,,
55 | 200,0.729158564,0.267329459,,,,,,,,,,,,
56 | 300,0.872536595,0.44694086,,,,,,,,,,,,
57 | 400,1.007822398,0.511363068,,,,,,,,,,,,
58 | 500,1.089472907,0.63871341,,,,,,,,,,,,
59 | 600,1.153399744,0.754176268,,,,,,,,,,,,
60 | 700,1.188276886,0.862254012,,,,,,,,,,,,
61 | 800,1.214925788,0.941366903,,,,,,,,,,,,
62 | 900,1.229303964,1.00651454,,,,,,,,,,,,
63 | 1000,1.255368062,1.093944021,,,,,,,,,,,,
64 | 1500,1.289906793,1.320063307,,,,,,,,,,,,
65 | 2000,1.344331746,1.459363557,,,,,,,,,,,,
66 | 2500,1.36445651,1.559285538,,,,,,,,,,,,
67 | 3000,1.388971078,1.634232151,,,,,,,,,,,,
68 | ,,,,,,,,,,,,,,
69 | ,,,,,,,,,,,,,,
70 | ,,,,,,,,,,,,,,
71 | N,MPI,MPI_pipeline,K,,,,,,,,,,,
72 | 100,0.374264998,0.092639861,0.247524779,,,,,,,,,,,
73 | 200,0.729158564,0.263926914,0.361960933,,,,,,,,,,,
74 | 300,0.872536595,0.447145054,0.512465674,,,,,,,,,,,
75 | 400,1.007822398,0.511710299,0.507738566,,,,,,,,,,,
76 | 500,1.089472907,0.639192915,0.586699229,,,,,,,,,,,
77 | 600,1.153399744,0.753375279,0.653177949,,,,,,,,,,,
78 | 700,1.188276886,0.859933914,0.723681428,,,,,,,,,,,
79 | 800,1.214925788,0.94199102,0.775348609,,,,,,,,,,,
80 | 900,1.229303964,1.011848178,0.823106577,,,,,,,,,,,
81 | 1000,1.255368062,1.098188129,0.874793746,,,,,,,,,,,
82 | 1500,1.289906793,1.316852167,1.020889396,,,,,,,,,,,
83 | 2000,1.344331746,1.457217203,1.083971428,,,,,,,,,,,
84 | 2500,1.36445651,1.559394698,1.142868744,,,,,,,,,,,
85 | 3000,1.388971078,1.635212968,1.17728367,,,,,,,,,,,
86 | 


--------------------------------------------------------------------------------
/Lab5/report/Lab5.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/Lab5.pdf


--------------------------------------------------------------------------------
/Lab5/report/OMP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/OMP.png


--------------------------------------------------------------------------------
/Lab5/report/SIMD.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/SIMD.png


--------------------------------------------------------------------------------
/Lab5/report/arm_OMP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/arm_OMP.png


--------------------------------------------------------------------------------
/Lab5/report/arm_SIMD.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/arm_SIMD.png


--------------------------------------------------------------------------------
/Lab5/report/arm_block_cycle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/arm_block_cycle.png


--------------------------------------------------------------------------------
/Lab5/report/arm_send_receive.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/arm_send_receive.png


--------------------------------------------------------------------------------
/Lab5/report/arm_sp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/arm_sp.png


--------------------------------------------------------------------------------
/Lab5/report/block_cycle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/block_cycle.png


--------------------------------------------------------------------------------
/Lab5/report/process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/process.png


--------------------------------------------------------------------------------
/Lab5/report/report.md:
--------------------------------------------------------------------------------
 1 | # 实验设计
 2 | 
 3 | 考虑Gauss消去的整个过程中主要涉及到两个阶段，一个是在消元行行内除法过程，一个是其余行减去消元行的过程。而就每个阶段而言，其所做的工作基本是一致的，只是在不同的消元轮次时，消元的起始位置不同。尤其是针对第二个阶段，即其余行依次减去消元行的过程，这个阶段每一行所做的工作是完全一致的，十分适合并行化处理，即将待消去的行平均分配给几个不同的进程，由于这些数据之间不存在依赖性，因此每个进程只需要各自完成好自己的工作即可，不存在进程之间进行通信的额外开销，只需要在各个进程完成了分配的工作之后进行一次同步。
 4 | 
而当任务下发给不同的进程之后，在各个进程内又可以开启多条线程来处理任务。由于任务之间是不存在数据依赖的，不需要进行数据的通信，因此考虑在同一个进程内开启多条线程来处理任务是十分合理并且有效的。
 6 | 
 7 | 而对于第一阶段，即消元行行内进行除法操作时，由于这个问题规模相对较小，如果将待操作的数据分配给不同的线程进行处理的话，线程挂起到唤醒这部分的时间开销相较于要处理的问题而言占比很高，因此不适合进行多线程并行处理，但是仍可以结合SIMD的向量化处理。同样在第二阶段，被消元行依次减去消元行的过程中，每一行内的减法运算同样也不适合进行多线程的并行处理，也可以采用SIMD进行向量化处理。
 8 | 
 9 | 在本次实验中，将设计以下实验进行探究:
10 | 
11 | ## MPI并行处理
12 | 
对于Gauss消去的过程，在每一轮消去中主要包含两个阶段，首先是针对消元行做除法运算，然后是对于剩余的被消元行，依次减去消元行的某个倍数。在每一轮的过程中，除法操作和消元减法操作之间是有着严格的先后顺序的，即必须首先完成消元行的除法操作之后，才能够执行被消元行的减法操作。因此在除法运算和消元之间，以及每一轮次消元结束之后，都需要进行一次同步。
14 | 
在考虑使用MPI进行多进程的实验设计时，考虑使用单个进程首先来分发任务。然后在外层循环中，每个进程都要判断当前做除法的行是否是自己分配的任务，当这一行的除法操作完成之后，需要将除法的结果广播给其他所有进程，而其他进程接收除法结果的过程就自然地完成了除法后进程间的同步。
16 | 
在本次实验中，我们没有采用主从模式，即所有的进程都会去执行消元的任务，因此可以发现，消元的结果即是每次除法完成后的结果的累积，而除法做完之后都会进行一次广播，因此当最后一行完成之后，所有的除法结果都已经被广播到了各个进程中。因此可以省去最后一次的结果同步的过程。
18 | 
19 | 可以计算出采用了MPI方式的优化算法，其理论加速比应该和进程数量成正比。但是由于进程之间的通信开销是十分大的，因此这种开销会大大降低优化效果。
20 | 
21 | ## 数据划分对比
22 | 
23 | 考虑到MPI属于进程级的并行手段，因此如果存在负载不均的情况，其浪费的资源要远远高于线程级的负载不均。因此在本次实验中也从负载均衡的角度出发，设计实验对比了不同数据划分方式的性能差异。主要对比了循环划分和块划分之间的差异。
24 | 
25 | 最朴素的想法就是采用块划分的方式，即给各个进程分配连续的几行数据。但是考虑到Gauss消元的过程中，随着消元的进行，前面完成了除法的行将不会再参与后续的运算，也就是从整体的角度来考虑的话，各行之间的计算量是不同的，因此这种划分方式就会导致在后续的消元过程中，负责前面几行的进程处于闲置状态，因此是不能够接近理论加速比的。
26 | 
27 | 而从负载均衡的角度出发，可以使用循环划分的方式。即将数据按照进程数量等步长分配给每个进程，这样从整体来看，每个进程负责的任务的计算量大致是相同的，不会导致严重的负载不均现象。因此在整个过程中，所有的进程基本保持满负荷，整体的加速比就会接近理论的加速比。
28 | 
29 | ## 不同数据收发方式的实验对比
30 | 
31 | 考虑到MPI是进程级的并行，因此消息通信是在不同进程之间完成的，而进程间的通信开销是远比线程级的通信开销要大的，因此合理的设计进程间的通信方式和限制通信次数对于提高性能优化会有比较明显的效果。
32 | 
采用最朴素的方式就是当负责除法的进程完成了除法工作之后，将除法的结果依次发放给所有的进程，这个通信的开销是和进程的数量成正比的。而由于通信是需要时间的，因此当有些进程还没有接收到除法的结果的时候，是不能够开展后续的消元工作的。因此这里会存在着比较长的空闲等待。
34 | 
35 | 而pipeline的方式就是利用流水线的思想去减少了这种空闲等待的性能损失，即当一个进程收到了除法的结果后，他需要将这个结果转发给下一个进程，之后就可以开始自己的工作了。这样就可以减少了负责除法工作进程的阻塞时间。
36 | 
综合分析以上两种方法，采用广播的方式的时间开销为 $O(N^3\log N)$，采用pipeline的方式的时间开销为 $O(N^3)$。
38 | 
39 | # 实验分析
40 | 
41 | ## x86平台
42 | 
43 | ### MPI并行实现及性能对比
44 | 
为了能够探究MPI并行算法的优化效果，考虑调整问题规模，测量在不同任务规模下，串行算法，MPI优化，MPI+SIMD优化，MPI+OMP优化以及MPI+SIMD+OMP优化的时间性能表现。其中MPI采用了8个进程进行多进程并行，OMP采用了8条线程进行多线程并行，SIMD采用了四路向量化处理。为了能够比较全面地展现并行优化效果随问题规模的变化情况，在问题规模小于1000时采用步长为100，而当问题规模大于1000时，步长调整为500。五种算法在不同问题规模下的表现如表所示。
46 | 
为了能够更加直观地观察算法的性能表现随问题规模的变化情况，特意利用测量的数据计算了四种并行优化算法的加速比随问题规模的变化情况，如图所示。
48 | 
从图像中可以看出，随着问题规模的增加，四种并行优化算法的加速比都呈现一个递增的趋势。就单独采用MPI进行多进程并行的优化方式而言，可以看到其随着任务规模的增加，加速比逐渐上升直至平稳在4.5左右，而实际开启了8个进程，其加速比并没有能够达到理论加速比，其原因在于通信开销以及其余没有进行多进程并行的部分。
50 | 
51 | 在实验中还对比了基于MPI和SIMD、OMP的结合，并以MPI的性能为baseline，分析增加了其他优化方式的性能提升。
52 | 
由于SIMD采用了4路向量化的手段，因此其相对于baseline的理论加速比应该能达到四倍。但是实验表明，SIMD实际的加速比只达到了2.2-2.5之间，这也和之前的实验相契合，证明在Gauss消元问题上，由于其他未能够进行SIMD向量化运算部分的影响，其实际加速比只能达到这个数值。
54 | 
55 | 由于OMP拉起了8条线程，因此其相对于baseline的理论加速比应该能够达到8倍。但是实验表明，当问题规模达到3000的时候，这个性能提升也只达到了2.5倍左右，但是可以看出，随着问题规模的增加，这个加速比还有提升的趋势。分析原因，由于MPI多进程分配任务，每个进程上的任务只有总任务的$\frac{1}{8}$，因此，对于每个进程而言，即使问题规模达到了3000，分配给每个进程的任务也只有400左右，而通过之前的实验可以表明，在400左右的任务规模下，其加速比很难达到OpenMP的理论加速比。并且其趋势也能够表明，随着问题规模的增加，这个加速比应该是会继续上升的。
56 | 
57 | 最后在实验中尝试了将MPI和SIMD、OpenMP融合到一起，同时启用多进程多线程向量化优化，在问题规模达到3000的时候，取得了将近20倍的性能提升。这和单独采用某一种优化方式取得的性能提升的乘积基本一致。
58 | 
59 | ### 任务划分方式
60 | 
在本次实验中，从负载均衡的角度出发，探究不同的任务划分方式对于算法性能的影响。由于MPI是进程级的并行方式，因此进程间的通信开销是很大的，所以要尽可能地利用每一个进程，并且减少进程之间的通信次数。实验对比了两种不同的任务划分方式，块划分和循环划分，实验结果如图所示。
62 | 
从图像中可以看出，随着任务规模的增加，这两种划分方式的加速比都逐渐增加，但是随着任务规模的增加，循环划分的方式逐渐反超了块划分的方式。并且块划分方式率先达到了性能瓶颈，并且只达到了理论加速比的一半。这很符合预期，从整个过程来看，采用块划分的方式，随着消元的推进，负责前面行的进程会逐渐空闲下来，平均下来每个进程只有一半的时间在有效工作，因此其加速比只达到了4左右。而对于循环数据划分的方式而言，由于其从计算量的角度去划分任务，因此整体来看每个进程的有效工作时间几乎相同，达到了比较好的负载均衡，充分利用了每个进程的计算资源，因此其加速比逐步逼近理论加速比8。
64 | 
65 | ### 数据收发方式对比
66 | 
67 | 考虑到MPI在接收数据的时候处于阻塞状态，因此对于进行除法操作的进程而言，如果按照顺序向各个进程发送除法结果的话，那么该进程就需要等待所有的数据全部发送完成后才能开始处理自己的消元任务，而这个是完全浪费的时间。因此可以考虑借用流水线的思想，即从执行除法操作的进程开始，每个进程都只是接收上一个进程发来的除法结果，并将这个结果转发给下一个进程，然后就可以开始自己的消元任务。这样可以有效减少进程等待而浪费的时间。实验结果如图所示。
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
84 | 
85 | 


--------------------------------------------------------------------------------
/Lab5/report/send_receive.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/send_receive.png


--------------------------------------------------------------------------------
/Lab5/report/x86_sp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/report/x86_sp.png


--------------------------------------------------------------------------------
/Lab5/test.cpp:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by Lenovo on 2022/6/1.
 3 | //
 4 | #include <iostream>
 5 | using namespace std;
 6 | 
// Matrix dimension used by init_data().
int N = 5;
// Scale factor applied to the random values generated in init_data().
const int L = 100;
// Not referenced anywhere in this file.
const int LOOP = 1;
// Row-pointer matrices allocated by init_data(); `data` receives the generated input.
float **data;
float **matrix = nullptr;
12 | 
13 | void init_data()
14 | {
15 |     data = new float *[N], matrix = new float *[N];
16 |     float * tmp = new float[N*N];
17 |     for (int i = 0; i < N; i++)
18 |         data[i] = new float[N],
19 |         matrix[i] = tmp+i*N;
20 |     for (int i = 0; i < N; i++)
21 |         for (int j = i; j < N; j++)
22 |             data[i][j] = rand() * 1.0 / RAND_MAX * L;
23 |     for (int i = 0; i < N - 1; i++)
24 |         for (int j = i + 1; j < N; j++)
25 |             for (int k = 0; k < N; k++)
26 |                 data[j][k] += data[i][k];
27 | }
28 | 
29 | int main()
30 | {
31 |     int **a, **b;
32 |     a = new int *[2];
33 |     b = new int *[2];
34 |     a[0] = new int[2];
35 |     a[1] = new int[2];
36 |     b[0] = new int[2];
37 |     b[1] = new int[2];
38 |     a[0][0] = 1;
39 |     b[0][0] = a[0][0];
40 |     a[0][0] = 2;
41 |     cout<<b[0][0]<<endl;
42 | }


--------------------------------------------------------------------------------
/Lab5/x86/x86.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab5/x86/x86.csv


--------------------------------------------------------------------------------
/Lab6/Certificate.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/Certificate.pdf


--------------------------------------------------------------------------------
/Lab6/Ex1/01-add-error-handling.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | void init(int *a, int N)
 4 | {
 5 |     int i;
 6 |     for (i = 0; i < N; ++i)
 7 |     {
 8 |         a[i] = i;
 9 |     }
10 | }
11 | 
/*
 * GPU kernel: double the first N elements of `a` in place.
 *
 * Each thread starts at its global index and advances by the total number of
 * launched threads, so any launch configuration covers all N elements.
 *
 * The loop must stop at N: the previous bound (N + stride) let threads write
 * up to `stride` elements past the end of the allocation.
 */
__global__ void doubleElements(int *a, int N)
{

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;

    for (int i = idx; i < N; i += stride)
    {
        a[i] *= 2;
    }
}
23 | 
/*
 * Host-side verification: returns true when a[i] == 2 * i holds for every
 * i in [0, N), and false at the first element that does not match.
 */
bool checkElementsAreDoubled(int *a, int N)
{
    int idx = 0;
    // Advance while elements match; short-circuiting keeps a[idx] in range.
    while (idx < N && a[idx] == idx * 2)
    {
        ++idx;
    }
    return idx >= N;
}
34 | 
/*
 * Allocate a managed array, double every element on the GPU and verify the
 * result on the host, reporting CUDA errors at each step.
 *
 * Returns 1 if the managed allocation fails; otherwise falls off the end of
 * main (exit status 0).
 */
int main()
{
    cudaError_t err;

    int N = 10000;
    int *a;

    size_t size = N * sizeof(int);
    err = cudaMallocManaged(&a, size);
    if (err != cudaSuccess)
    {
        printf("cuda memory error occur : %s\n", cudaGetErrorString(err));
        // `a` is not usable after a failed allocation; do not continue.
        return 1;
    }

    init(a, N);

    // A block may hold at most 1024 threads; the previous value of 2048 made
    // every launch fail with an invalid-configuration error.
    size_t threads_per_block = 1024;
    size_t number_of_blocks = 32;

    doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);

    // Launch-time errors (e.g. a bad configuration) are reported here.
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("cuda kernel function error occur : %s\n", cudaGetErrorString(err));
    }

    // Errors raised while the kernel executes are returned by the
    // synchronising call itself, so capture its return value directly.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        printf("cuda synchronize error occur : %s\n", cudaGetErrorString(err));
    }

    bool areDoubled = checkElementsAreDoubled(a, N);
    printf("All elements were doubled? %s\n", areDoubled ? "TRUE" : "FALSE");

    cudaFree(a);
}
80 | 


--------------------------------------------------------------------------------
/Lab6/Ex1/01-basic-parallel.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
/*
 * GPU kernel: every thread that executes it prints one fixed line.
 */

__global__ void firstParallel()
{
    printf("This should be running in parallel.\n");
}
11 | 
/*
 * Launches firstParallel on a grid of 5 blocks with 5 threads each and
 * waits for the device before returning.
 */
int main()
{
    const int blocks = 5;
    const int threadsPerBlock = 5;

    firstParallel<<<blocks, threadsPerBlock>>>();

    /* Block the host until the kernel has completed. */
    cudaDeviceSynchronize();
}


--------------------------------------------------------------------------------
/Lab6/Ex1/01-double-elements.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | /*
 4 |  * Initialize array values on the host.
 5 |  */
 6 | 
 7 | void init(int *a, int N)
 8 | {
 9 |     int i;
10 |     for (i = 0; i < N; ++i)
11 |     {
12 |         a[i] = i;
13 |     }
14 | }
15 | 
/*
 * GPU kernel: each thread computes its global index and doubles a[index]
 * when that index is below N; threads with index >= N do nothing.
 */
__global__ void doubleElements(int *a, int N)
{
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= N)
    {
        return;
    }
    a[index] *= 2;
}
29 | 
/*
 * Host function: scans forward for the first a[i] that differs from 2 * i.
 * Returns true when no such element exists in [0, N), false otherwise.
 */
bool checkElementsAreDoubled(int *a, int N)
{
    int k = 0;
    while (k < N && a[k] == k * 2)
    {
        ++k;
    }
    return k >= N;
}
44 | 
/*
 * Allocates N ints with cudaMallocManaged, initializes them on the host,
 * doubles them with a 10 x 10 launch of doubleElements, verifies the
 * result on the host and prints it.
 */
int main()
{
    const int N = 100;
    const size_t size = N * sizeof(int);

    /* The same pointer is handed to host code and to the kernel below. */
    int *a;
    cudaMallocManaged(&a, size);

    init(a, N);

    const size_t number_of_blocks = 10;
    const size_t threads_per_block = 10;

    doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
    cudaDeviceSynchronize();

    const char *verdict = checkElementsAreDoubled(a, N) ? "TRUE" : "FALSE";
    printf("All elements were doubled? %s\n", verdict);

    cudaFree(a);
}
82 | 


--------------------------------------------------------------------------------
/Lab6/Ex1/01-grid-stride-double.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | void init(int *a, int N)
 4 | {
 5 |     int i;
 6 |     for (i = 0; i < N; ++i)
 7 |     {
 8 |         a[i] = i;
 9 |     }
10 | }
11 | 
/*
 * GPU kernel with a grid-stride loop: a thread starts at its global index
 * and advances by the total number of threads in the grid
 * (gridDim.x * blockDim.x), doubling each element it visits, so the
 * array may be larger than the grid.
 */
__global__ void doubleElements(int *a, int N)
{
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int gridSize = gridDim.x * blockDim.x;

    int i = first;
    while (i < N)
    {
        a[i] *= 2;
        i += gridSize;
    }
}
28 | 
/*
 * Host function: true when every a[i] in [0, N) equals 2 * i; stops and
 * returns false at the first element that does not.
 */
bool checkElementsAreDoubled(int *a, int N)
{
    for (int i = 0; i < N; i++)
    {
        if (a[i] == i * 2)
        {
            continue;
        }
        return false;
    }
    return true;
}
39 | 
/*
 * Doubles N = 10000 managed ints with a grid of 32 * 256 = 8192 threads
 * (fewer threads than elements) and prints whether all were doubled.
 */
int main()
{
    const int N = 10000;

    int *a;
    cudaMallocManaged(&a, N * sizeof(int));

    init(a, N);

    const size_t number_of_blocks = 32;
    const size_t threads_per_block = 256;

    doubleElements<<<number_of_blocks, threads_per_block>>>(a, N);
    cudaDeviceSynchronize();

    if (checkElementsAreDoubled(a, N))
    {
        printf("All elements were doubled? %s\n", "TRUE");
    }
    else
    {
        printf("All elements were doubled? %s\n", "FALSE");
    }

    cudaFree(a);
}
69 | 


--------------------------------------------------------------------------------
/Lab6/Ex1/01-hello-gpu.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | void helloCPU()
 4 | {
 5 |     printf("Hello from the CPU.\n");
 6 | }
 7 | 
 8 | /*
 9 |  * Refactor the `helloGPU` definition to be a kernel
10 |  * that can be launched on the GPU. Update its message
11 |  * to read "Hello from the GPU!"
12 |  */
13 | 
14 | __global__ void helloGPU()
15 | {
16 |     printf("Hello from the GPU.\n");
17 | }
18 | 
/*
 * Runs the GPU greeting, then the CPU greeting, then the GPU greeting
 * again. The host waits for the device after each single-thread launch.
 */
int main()
{
    /* First GPU greeting; wait for it before the host prints. */
    helloGPU<<<1, 1>>>();
    cudaDeviceSynchronize();

    helloCPU();

    /* Second GPU greeting; wait for it before the program ends. */
    helloGPU<<<1, 1>>>();
    cudaDeviceSynchronize();
}
40 | 


--------------------------------------------------------------------------------
/Lab6/Ex1/01-mismatched-config-loop.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | /*
 4 |  * Currently, `initializeElementsTo`, if executed in a thread whose
 5 |  * `i` is calculated to be greater than `N`, will try to access a value
 6 |  * outside the range of `a`.
 7 |  *
 8 |  * Refactor the kernel defintition to prevent our of range accesses.
 9 |  */
10 | 
11 | __global__ void initializeElementsTo(int initialValue, int *a, int N)
12 | {
13 |     int i = threadIdx.x + blockIdx.x * blockDim.x;
14 |     if (i < N)
15 |     {
16 |         a[i] = initialValue;
17 |         printf("block %d thread %d set %d to the value %d \n", blockIdx.x, threadIdx.x, i, initialValue);
18 |     }
19 | }
20 | 
/*
 * Initializes N = 1000 managed ints to 6 on the GPU using 256 threads
 * per block and enough blocks to cover N, then verifies every element on
 * the host. Prints the first mismatch and exits with status 1, or prints
 * "SUCCESS!".
 */
int main()
{
    const int N = 1000;
    const int initialValue = 6;

    const size_t threads_per_block = 256;
    /* Integer ceiling of N / threads_per_block. */
    const size_t number_of_blocks = (N + threads_per_block - 1) / threads_per_block;

    int *a;
    cudaMallocManaged(&a, N * sizeof(int));

    initializeElementsTo<<<number_of_blocks, threads_per_block>>>(initialValue, a, N);
    cudaDeviceSynchronize();

    int i = 0;
    while (i < N)
    {
        if (a[i] != initialValue)
        {
            printf("FAILURE: target value: %d\t a[%d]: %d\n", initialValue, i, a[i]);
            exit(1);
        }
        ++i;
    }
    printf("SUCCESS!\n");

    cudaFree(a);
}
70 | 


--------------------------------------------------------------------------------
/Lab6/Ex1/01-multi-block-loop.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | /*
 4 |  * Refactor `loop` to be a CUDA Kernel. The new kernel should
 5 |  * only do the work of 1 iteration of the original loop.
 6 |  */
 7 | 
 8 | __global__ void loop(int N)
 9 | {
10 |     for (int i = 0; i < N; ++i)
11 |     {
12 |         printf("This is iteration number %d in block %d thrad %d\n", threadIdx.x + blockIdx.x * blockDim.x, blockIdx.x, threadIdx.x);
13 |     }
14 | }
15 | 
/*
 * Launches `loop` on 2 blocks of 5 threads, passing N = 10, and waits
 * for the device to finish.
 */
int main()
{
    const int N = 10;
    const int blocks = 2;
    const int threadsPerBlock = 5;

    loop<<<blocks, threadsPerBlock>>>(N);
    cudaDeviceSynchronize();
}
31 | 


--------------------------------------------------------------------------------
/Lab6/Ex1/01-single-block-loop.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | /*
 4 |  * Refactor `loop` to be a CUDA Kernel. The new kernel should
 5 |  * only do the work of 1 iteration of the original loop.
 6 |  */
 7 | 
 8 | __global__ void loop(int N)
 9 | {
10 |     int threadIndex = threadIdx.x;
11 |     for (int i = 0; i < N; ++i)
12 |     {
13 |         printf("This is iteration number %d in thread %d \n", i, threadIndex);
14 |     }
15 | }
16 | 
/*
 * Launches `loop` on a single block of N = 10 threads and waits for the
 * device to finish.
 */
int main()
{
    const int N = 10;
    const int blocks = 1;

    loop<<<blocks, N>>>(N);
    cudaDeviceSynchronize();
}
31 | 


--------------------------------------------------------------------------------
/Lab6/Ex1/01-thread-and-block-idx.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | __global__ void printSuccessForCorrectExecutionConfiguration()
 4 | {
 5 |     if (threadIdx.x == blockIdx.x)
 6 |     {
 7 |         printf("Success!\n");
 8 |     }
 9 |     else
10 |     {
11 |         printf("Failure. Update the execution configuration as necessary.\n");
12 |     }
13 | }
14 | 
/*
 * Launches the check kernel with a configuration under which it prints
 * only "Success!".
 *
 * The kernel reports success for threads whose threadIdx.x equals
 * blockIdx.x. With the previous <<<2, 2>>> launch, threads (block 0,
 * thread 1) and (block 1, thread 0) printed the failure message. A single
 * block with a single thread has threadIdx.x == blockIdx.x == 0 for its
 * only thread.
 */
int main()
{
    printSuccessForCorrectExecutionConfiguration<<<1, 1>>>();
    cudaDeviceSynchronize();
}
25 | 


--------------------------------------------------------------------------------
/Lab6/Ex1/01-vector-add.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | void initWith(float num, float *a, int N)
 4 | {
 5 |     for (int i = 0; i < N; ++i)
 6 |     {
 7 |         a[i] = num;
 8 |     }
 9 | }
10 | 
/*
 * GPU kernel: result[i] = a[i] + b[i] for i in [0, N). Each thread starts
 * at its global index and advances by gridDim.x * blockDim.x.
 */
__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
    const int gridStride = gridDim.x * blockDim.x;

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    while (i < N)
    {
        result[i] = a[i] + b[i];
        i += gridStride;
    }
}
20 | 
/*
 * Host function: verifies that the first N elements of `array` all equal
 * `target`. On the first mismatch it prints the offending index and
 * values and exits with status 1; otherwise it prints a success line.
 */
void checkElementsAre(float target, float *array, int N)
{
    int bad = 0;
    while (bad < N && array[bad] == target)
    {
        ++bad;
    }

    if (bad < N)
    {
        printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", bad, array[bad], target);
        exit(1);
    }

    printf("SUCCESS! All values added correctly.\n");
}
33 | 
/*
 * Adds two managed vectors of N floats on the GPU and verifies the sum
 * on the host.
 *
 * Changes from the previous version:
 *  - a failed allocation now releases whatever was allocated and returns
 *    1 instead of continuing into initWith() with an invalid pointer;
 *  - the launch uses 256 threads per block and enough blocks to cover N
 *    instead of a 2 x 5 grid (10 threads for ~2M elements);
 *  - the status returned by cudaDeviceSynchronize() is checked directly.
 */
int main()
{
    const int N = 2 << 20;
    size_t size = N * sizeof(float);

    float *a = NULL;
    float *b = NULL;
    float *c = NULL;

    /* Stop at the first failing allocation and keep its error code. */
    cudaError_t err = cudaMallocManaged(&a, size);
    if (err == cudaSuccess)
    {
        err = cudaMallocManaged(&b, size);
    }
    if (err == cudaSuccess)
    {
        err = cudaMallocManaged(&c, size);
    }

    if (err != cudaSuccess)
    {
        printf("cuda memory error occur : %s\n", cudaGetErrorString(err));
        /* cudaFree on a null pointer performs no operation. */
        cudaFree(a);
        cudaFree(b);
        cudaFree(c);
        return 1;
    }

    initWith(3, a, N);
    initWith(4, b, N);
    initWith(0, c, N);

    /* Integer ceiling of N / threadsPerBlock: one thread per element. */
    const size_t threadsPerBlock = 256;
    const size_t numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

    /* Errors detected at launch time. */
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("cuda kernel function error occur : %s\n", cudaGetErrorString(err));
    }

    /* Errors raised while the kernel executes are returned by the sync. */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        printf("cuda synchronize error occur : %s\n", cudaGetErrorString(err));
    }

    checkElementsAre(7, c, N);

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}
82 | 


--------------------------------------------------------------------------------
/Lab6/Ex2/02-get-device-properties.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | int main()
 4 | {
 5 |     /*
 6 |      * Assign values to these variables so that the output string below prints the
 7 |      * requested properties of the currently active GPU.
 8 |      */
 9 | 
10 |     int deviceId;
11 |     cudaGetDevice(&deviceId);
12 |     cudaDeviceProp props;
13 |     cudaGetDeviceProperties(&props, deviceId);
14 | 
15 |     int computeCapabilityMajor = props.major;
16 |     int computeCapabilityMinor = props.minor;
17 |     int multiProcessorCount = props.multiProcessorCount;
18 |     int warpSize = props.warpSize;
19 | 
20 |     /*
21 |      * There should be no need to modify the output string below.
22 |      */
23 | 
24 |     printf("Device ID: %d\nNumber of SMs: %d\nCompute Capability Major: %d\nCompute Capability Minor: %d\nWarp Size: %d\n", deviceId, multiProcessorCount, computeCapabilityMajor, computeCapabilityMinor, warpSize);
25 | }
26 | 


--------------------------------------------------------------------------------
/Lab6/Ex2/02-page-faults-solution-cpu-only.cu:
--------------------------------------------------------------------------------
 1 | __global__ void deviceKernel(int *a, int N)
 2 | {
 3 |     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 4 |     int stride = blockDim.x * gridDim.x;
 5 | 
 6 |     for (int i = idx; i < N; i += stride)
 7 |     {
 8 |         a[i] = 1;
 9 |     }
10 | }
11 | 
/*
 * Host function: sets the first N elements of `a` to 1, in ascending
 * index order.
 */
void hostFunction(int *a, int N)
{
    int i = 0;
    while (i < N)
    {
        a[i] = 1;
        ++i;
    }
}
19 | 
/*
 * Allocates a managed array of 2 << 24 ints and writes it from the host
 * only; no kernel is launched.
 */
int main()
{
    const int N = 2 << 24;

    int *a;
    cudaMallocManaged(&a, N * sizeof(int));

    hostFunction(a, N);

    cudaFree(a);
}
29 | 


--------------------------------------------------------------------------------
/Lab6/Ex2/02-page-faults-solution-cpu-then-gpu.cu:
--------------------------------------------------------------------------------
 1 | __global__ void deviceKernel(int *a, int N)
 2 | {
 3 |     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 4 |     int stride = blockDim.x * gridDim.x;
 5 | 
 6 |     for (int i = idx; i < N; i += stride)
 7 |     {
 8 |         a[i] = 1;
 9 |     }
10 | }
11 | 
/*
 * Host function: stores 1 into a[0] .. a[N-1], walking the array from
 * the front.
 */
void hostFunction(int *a, int N)
{
    int *cursor = a;
    for (int remaining = N; remaining > 0; --remaining)
    {
        *cursor = 1;
        ++cursor;
    }
}
19 | 
/*
 * Allocates a managed array of 2 << 24 ints, writes it from the host
 * first and then from a 256 x 256 kernel launch, waiting for the kernel
 * before freeing.
 */
int main()
{
    const int N = 2 << 24;
    const size_t size = N * sizeof(int);

    int *a;
    cudaMallocManaged(&a, size);

    /* Host access first ... */
    hostFunction(a, N);

    /* ... then device access to the same managed buffer. */
    const int blocks = 256;
    const int threadsPerBlock = 256;
    deviceKernel<<<blocks, threadsPerBlock>>>(a, N);
    cudaDeviceSynchronize();

    cudaFree(a);
}
33 | 


--------------------------------------------------------------------------------
/Lab6/Ex2/02-page-faults-solution-gpu-only.cu:
--------------------------------------------------------------------------------
 1 | __global__ void deviceKernel(int *a, int N)
 2 | {
 3 |     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 4 |     int stride = blockDim.x * gridDim.x;
 5 | 
 6 |     for (int i = idx; i < N; i += stride)
 7 |     {
 8 |         a[i] = 1;
 9 |     }
10 | }
11 | 
/*
 * Host function: fills a[0..N) with the value 1.
 */
void hostFunction(int *a, int N)
{
    for (int pos = 0; pos != N && pos < N; pos++)
    {
        *(a + pos) = 1;
    }
}
19 | 
/*
 * Allocates a managed array of 2 << 24 ints and writes it only from the
 * GPU (256 blocks of 256 threads); the host never touches the data.
 */
int main()
{
    const int N = 2 << 24;

    int *a;
    cudaMallocManaged(&a, N * sizeof(int));

    const int blocks = 256;
    const int threadsPerBlock = 256;
    deviceKernel<<<blocks, threadsPerBlock>>>(a, N);
    cudaDeviceSynchronize();

    cudaFree(a);
}
32 | 


--------------------------------------------------------------------------------
/Lab6/Ex2/02-page-faults-solution-gpu-then-cpu.cu:
--------------------------------------------------------------------------------
 1 | __global__ void deviceKernel(int *a, int N)
 2 | {
 3 |     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 4 |     int stride = blockDim.x * gridDim.x;
 5 | 
 6 |     for (int i = idx; i < N; i += stride)
 7 |     {
 8 |         a[i] = 1;
 9 |     }
10 | }
11 | 
/*
 * Host function: writes 1 to each of the first N ints of `a`.
 */
void hostFunction(int *a, int N)
{
    int done = 0;
    for (;;)
    {
        if (done >= N)
        {
            break;
        }
        a[done] = 1;
        done += 1;
    }
}
19 | 
/*
 * Allocates a managed array of 2 << 24 ints, writes it from a 256 x 256
 * kernel launch, waits for the kernel, then writes it again from the
 * host.
 */
int main()
{
    const int N = 2 << 24;
    const size_t size = N * sizeof(int);

    int *a;
    cudaMallocManaged(&a, size);

    /* Device access first ... */
    deviceKernel<<<256, 256>>>(a, N);
    cudaDeviceSynchronize();

    /* ... then host access to the same managed buffer. */
    hostFunction(a, N);

    cudaFree(a);
}
31 | 


--------------------------------------------------------------------------------
/Lab6/Ex2/02-saxpy-solution.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define N 2048 * 2048 // Number of elements in each vector
 4 | 
 5 | /*
 6 |  * Optimize this already-accelerated codebase. Work iteratively,
 7 |  * and use nsys to support your work.
 8 |  *
 9 |  * Aim to profile `saxpy` (without modifying `N`) running under
10 |  * 20us.
11 |  *
12 |  * Some bugs have been placed in this codebase for your edification.
13 |  */
14 | 
/*
 * GPU kernel: c[i] = 2 * a[i] + b[i] for every i in [0, N), where N is
 * the file-level element count.
 *
 * Each thread starts at its global index and advances by the total
 * number of threads in the grid. The start index must ADD threadIdx.x to
 * the block offset; the previous version multiplied by it, so every
 * start index was a multiple of blockDim.x and all other elements of `c`
 * were never written.
 */
__global__ void saxpy(int *a, int *b, int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = tid; i < N; i += stride)
    {
        c[i] = 2 * a[i] + b[i];
    }
}
23 | 
/*
 * Runs saxpy on three managed vectors of N ints and prints the first and
 * last five results as a quality check.
 *
 * Changes from the previous version:
 *  - the host now waits for the kernel (cudaDeviceSynchronize) before
 *    reading `c`; previously the values were printed while the kernel
 *    could still be running;
 *  - launch and execution errors are reported;
 *  - the byte count is a size_t, matching the CUDA API parameters.
 */
int main()
{
    int *a, *b, *c;

    int deviceId;
    int numberOfSMs;

    cudaGetDevice(&deviceId);
    cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);

    size_t size = N * sizeof(int); // The total number of bytes per vector

    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    // Initialize memory on the host
    for (int i = 0; i < N; ++i)
    {
        a[i] = 2;
        b[i] = 1;
        c[i] = 0;
    }

    // Ask for the data to be moved to the GPU before the kernel uses it
    cudaMemPrefetchAsync(a, size, deviceId);
    cudaMemPrefetchAsync(b, size, deviceId);
    cudaMemPrefetchAsync(c, size, deviceId);

    int threads_per_block = 512;
    int number_of_blocks = numberOfSMs * 32;

    saxpy<<<number_of_blocks, threads_per_block>>>(a, b, c);

    // Errors detected at launch time
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
    {
        printf("Error: %s\n", cudaGetErrorString(launchErr));
    }

    // Wait for the kernel; errors raised during execution are returned here
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (syncErr != cudaSuccess)
    {
        printf("Error: %s\n", cudaGetErrorString(syncErr));
    }

    // Print out the first and last 5 values of c for a quality check
    for (int i = 0; i < 5; ++i)
    {
        printf("c[%d] = %d, ", i, c[i]);
    }
    printf("\n");
    for (int i = N - 5; i < N; ++i)
    {
        printf("c[%d] = %d, ", i, c[i]);
    }
    printf("\n");

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}
69 | 


--------------------------------------------------------------------------------
/Lab6/Ex2/02-vector-add-1.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | /*
 4 |  * Host function to initialize vector elements. This function
 5 |  * simply initializes each element to equal its index in the
 6 |  * vector.
 7 |  */
 8 | 
 9 | void initWith(float num, float *a, int N)
10 | {
11 |     for (int i = 0; i < N; ++i)
12 |     {
13 |         a[i] = num;
14 |     }
15 | }
16 | 
/*
 * Device kernel: for each index i below N, writes a[i] + b[i] into
 * result[i]. A thread begins at its global index and moves forward by
 * blockDim.x * gridDim.x on every step.
 */

__global__ void addVectorsInto(float *result, float *a, float *b, int N)
{
    const int stride = blockDim.x * gridDim.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i >= N)
    {
        return;
    }

    do
    {
        result[i] = a[i] + b[i];
        i += stride;
    } while (i < N);
}
32 | 
/*
 * Host function to confirm values in `vector`: every one of the first N
 * elements must equal `target`. The first mismatch is printed and the
 * process exits with status 1; otherwise a success line is printed.
 */

void checkElementsAre(float target, float *vector, int N)
{
    int i = 0;
    while (i < N)
    {
        if (vector[i] != target)
        {
            printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
            exit(1);
        }
        ++i;
    }
    printf("Success! All values calculated correctly.\n");
}
50 | 
/*
 * Adds two managed vectors of 2 << 24 floats on the GPU with a launch of
 * 40 blocks x 32 threads, reports launch and execution errors, and
 * verifies on the host that every sum equals 7.
 */
int main()
{
    const int N = 2 << 24;
    const size_t size = N * sizeof(float);

    float *a, *b, *c;
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    initWith(3, a, N);
    initWith(4, b, N);
    initWith(0, c, N);

    /* Execution configuration to experiment with while profiling. */
    const size_t threadsPerBlock = 32;
    const size_t numberOfBlocks = 40;

    addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

    /* Errors detected at launch time. */
    const cudaError_t addVectorsErr = cudaGetLastError();
    if (addVectorsErr != cudaSuccess)
    {
        printf("Error: %s\n", cudaGetErrorString(addVectorsErr));
    }

    /* Errors raised while the kernel executes. */
    const cudaError_t asyncErr = cudaDeviceSynchronize();
    if (asyncErr != cudaSuccess)
    {
        printf("Error: %s\n", cudaGetErrorString(asyncErr));
    }

    checkElementsAre(7, c, N);

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
}
98 | 


--------------------------------------------------------------------------------
/Lab6/Ex2/02-vector-add-init-in-kernel-solution.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | 
  3 | /*
  4 |  * Refactor host function to run as CUDA kernel
  5 |  */
  6 | 
  7 | __global__ void initWith(float num, float *a, int N)
  8 | {
  9 |     int index = threadIdx.x + blockIdx.x * blockDim.x;
 10 |     int stride = blockDim.x * gridDim.x;
 11 | 
 12 |     for (int i = index; i < N; i += stride)
 13 |     {
 14 |         a[i] = num;
 15 |     }
 16 | }
 17 | 
 18 | __global__ void addArraysInto(float *result, float *a, float *b, int N)
 19 | {
 20 |     int index = threadIdx.x + blockIdx.x * blockDim.x;
 21 |     int stride = blockDim.x * gridDim.x;
 22 | 
 23 |     for (int i = index; i < N; i += stride)
 24 |     {
 25 |         result[i] = a[i] + b[i];
 26 |     }
 27 | }
 28 | 
 29 | void checkElementsAre(float target, float *array, int N)
 30 | {
 31 |     for (int i = 0; i < N; i++)
 32 |     {
 33 |         if (array[i] != target)
 34 |         {
 35 |             printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
 36 |             exit(1);
 37 |         }
 38 |     }
 39 |     printf("Success! All values calculated correctly.\n");
 40 | }
 41 | 
 42 | int main()
 43 | {
 44 |     int deviceId;
 45 |     int numberOfSMs;
 46 | 
 47 |     cudaGetDevice(&deviceId);
 48 |     cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
 49 |     printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);
 50 | 
 51 |     const int N = 2 << 24;
 52 |     size_t size = N * sizeof(float);
 53 | 
 54 |     float *a;
 55 |     float *b;
 56 |     float *c;
 57 | 
 58 |     cudaMallocManaged(&a, size);
 59 |     cudaMallocManaged(&b, size);
 60 |     cudaMallocManaged(&c, size);
 61 | 
 62 |     size_t threadsPerBlock;
 63 |     size_t numberOfBlocks;
 64 | 
 65 |     threadsPerBlock = 256;
 66 |     numberOfBlocks = 32 * numberOfSMs;
 67 | 
 68 |     cudaError_t addArraysErr;
 69 |     cudaError_t asyncErr;
 70 | 
 71 |     /*
 72 |      * Launch kernels.
 73 |      */
 74 | 
 75 |     initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
 76 |     initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
 77 |     initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);
 78 | 
 79 |     /*
 80 |      * Now that initialization is happening on a GPU, host code
 81 |      * must be synchronized to wait for its completion.
 82 |      */
 83 | 
 84 |     cudaDeviceSynchronize();
 85 | 
 86 |     addArraysInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
 87 | 
 88 |     addArraysErr = cudaGetLastError();
 89 |     if (addArraysErr != cudaSuccess)
 90 |         printf("Error: %s\n", cudaGetErrorString(addArraysErr));
 91 | 
 92 |     asyncErr = cudaDeviceSynchronize();
 93 |     if (asyncErr != cudaSuccess)
 94 |         printf("Error: %s\n", cudaGetErrorString(asyncErr));
 95 | 
 96 |     checkElementsAre(7, c, N);
 97 | 
 98 |     cudaFree(a);
 99 |     cudaFree(b);
100 |     cudaFree(c);
101 | }
102 | 


--------------------------------------------------------------------------------
/Lab6/Ex2/02-vector-add-prefetch-solution-cpu-also.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | 
  3 | void initWith(float num, float *a, int N)
  4 | {
  5 |     for (int i = 0; i < N; ++i)
  6 |     {
  7 |         a[i] = num;
  8 |     }
  9 | }
 10 | 
 11 | __global__ void addVectorsInto(float *result, float *a, float *b, int N)
 12 | {
 13 |     int index = threadIdx.x + blockIdx.x * blockDim.x;
 14 |     int stride = blockDim.x * gridDim.x;
 15 | 
 16 |     for (int i = index; i < N; i += stride)
 17 |     {
 18 |         result[i] = a[i] + b[i];
 19 |     }
 20 | }
 21 | 
 22 | void checkElementsAre(float target, float *vector, int N)
 23 | {
 24 |     for (int i = 0; i < N; i++)
 25 |     {
 26 |         if (vector[i] != target)
 27 |         {
 28 |             printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
 29 |             exit(1);
 30 |         }
 31 |     }
 32 |     printf("Success! All values calculated correctly.\n");
 33 | }
 34 | 
 35 | int main()
 36 | {
 37 |     int deviceId;
 38 |     int numberOfSMs;
 39 | 
 40 |     cudaGetDevice(&deviceId);
 41 |     cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
 42 |     printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);
 43 | 
 44 |     const int N = 2 << 24;
 45 |     size_t size = N * sizeof(float);
 46 | 
 47 |     float *a;
 48 |     float *b;
 49 |     float *c;
 50 | 
 51 |     cudaMallocManaged(&a, size);
 52 |     cudaMallocManaged(&b, size);
 53 |     cudaMallocManaged(&c, size);
 54 | 
 55 |     /*
 56 |      * Prefetching can also be used to prevent CPU page faults.
 57 |      */
 58 | 
 59 |     cudaMemPrefetchAsync(a, size, cudaCpuDeviceId);
 60 |     cudaMemPrefetchAsync(b, size, cudaCpuDeviceId);
 61 |     cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);
 62 |     initWith(3, a, N);
 63 |     initWith(4, b, N);
 64 |     initWith(0, c, N);
 65 | 
 66 |     cudaMemPrefetchAsync(a, size, deviceId);
 67 |     cudaMemPrefetchAsync(b, size, deviceId);
 68 |     cudaMemPrefetchAsync(c, size, deviceId);
 69 | 
 70 |     size_t threadsPerBlock;
 71 |     size_t numberOfBlocks;
 72 | 
 73 |     threadsPerBlock = 256;
 74 |     numberOfBlocks = 32 * numberOfSMs;
 75 | 
 76 |     cudaError_t addVectorsErr;
 77 |     cudaError_t asyncErr;
 78 | 
 79 |     addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
 80 | 
 81 |     addVectorsErr = cudaGetLastError();
 82 |     if (addVectorsErr != cudaSuccess)
 83 |         printf("Error: %s\n", cudaGetErrorString(addVectorsErr));
 84 | 
 85 |     asyncErr = cudaDeviceSynchronize();
 86 |     if (asyncErr != cudaSuccess)
 87 |         printf("Error: %s\n", cudaGetErrorString(asyncErr));
 88 | 
 89 |     /*
 90 |      * Prefetching can also be used to prevent CPU page faults.
 91 |      */
 92 | 
 93 |     cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);
 94 |     checkElementsAre(7, c, N);
 95 | 
 96 |     cudaFree(a);
 97 |     cudaFree(b);
 98 |     cudaFree(c);
 99 | }
100 | 


--------------------------------------------------------------------------------
/Lab6/Ex2/02-vector-add-prefetch-solution.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | 
  3 | /*
  4 |  * Refactor host function to run as CUDA kernel
  5 |  */
  6 | 
  7 | __global__
  8 | void initWith(float num, float *a, int N)
  9 | {
 10 |   int index = threadIdx.x + blockIdx.x * blockDim.x;
 11 |   int stride = blockDim.x * gridDim.x;
 12 | 
 13 |   for(int i = index; i < N; i += stride)
 14 |   {
 15 |     a[i] = num;
 16 |   }
 17 | }
 18 | 
 19 | __global__
 20 | void addArraysInto(float *result, float *a, float *b, int N)
 21 | {
 22 |   int index = threadIdx.x + blockIdx.x * blockDim.x;
 23 |   int stride = blockDim.x * gridDim.x;
 24 | 
 25 |   for(int i = index; i < N; i += stride)
 26 |   {
 27 |     result[i] = a[i] + b[i];
 28 |   }
 29 | }
 30 | 
 31 | void checkElementsAre(float target, float *array, int N)
 32 | {
 33 |   for(int i = 0; i < N; i++)
 34 |   {
 35 |     if(array[i] != target)
 36 |     {
 37 |       printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
 38 |       exit(1);
 39 |     }
 40 |   }
 41 |   printf("Success! All values calculated correctly.\n");
 42 | }
 43 | // Host driver: initialize a, b and c with GPU kernels, compute c = a + b on the GPU, verify c on the CPU.
 44 | int main()
 45 | {
 46 |   int deviceId;
 47 |   int numberOfSMs;
 48 | 
 49 |   cudaGetDevice(&deviceId);
 50 |   cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
 51 |   printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);
 52 | 
 53 |   const int N = 2<<24;
 54 |   size_t size = N * sizeof(float);
 55 | 
 56 |   float *a;
 57 |   float *b;
 58 |   float *c;
 59 | 
 60 |   cudaMallocManaged(&a, size);
 61 |   cudaMallocManaged(&b, size);
 62 |   cudaMallocManaged(&c, size);
 63 | 
 64 |   size_t threadsPerBlock;
 65 |   size_t numberOfBlocks;
 66 | 
 67 |   threadsPerBlock = 256;
 68 |   numberOfBlocks = 32 * numberOfSMs;
 69 | 
 70 |   cudaError_t addArraysErr;
 71 |   cudaError_t asyncErr;
 72 | 
 73 |   /*
 74 |    * Launch the initialization kernels (values 3, 4 and 0 for a, b and c).
 75 |    */
 76 | 
 77 |   initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
 78 |   initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
 79 |   initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);
 80 | 
 81 |   /*
 82 |    * Now that initialization is happening on a GPU, host code
 83 |    * must be synchronized to wait for its completion.
 84 |    */
 85 | 
 86 |   cudaDeviceSynchronize();
 87 | 
 88 |   /*
 89 |    * No prefetch to the host here: the add kernel runs on the GPU next, so
 90 |    * moving a, b and c to the CPU first would only force them to migrate back.
 91 |    */
 92 |   addArraysInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
 93 | 
 94 |   addArraysErr = cudaGetLastError();
 95 |   if(addArraysErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addArraysErr));
 96 | 
 97 |   asyncErr = cudaDeviceSynchronize();
 98 |   if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));
 99 |   cudaMemPrefetchAsync(c, size, cudaCpuDeviceId); // only c is read by the host check below
100 |   checkElementsAre(7, c, N);
101 | 
102 |   cudaFree(a);
103 |   cudaFree(b);
104 |   cudaFree(c);
105 | }
106 | 


--------------------------------------------------------------------------------
/Lab6/Ex2/02-vector-add.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | /*
 4 |  * Host function to initialize vector elements. This function
 5 |  * stores the value `num` into each of the first `N` elements
 6 |  * of the vector `a`.
 7 |  */
 8 | 
 9 | void initWith(float num, float *a, int N)
10 | {
11 |     // Fill front to back; when N <= 0 the loop body never runs.
12 |     int next = 0;
13 |     while (next < N)
14 |         a[next++] = num;
15 | }
16 | 
17 | /*
18 |  * Device kernel: writes a[i] + b[i] into result[i] for every
19 |  * index i below N that this thread is responsible for.
20 |  */
21 | 
22 | __global__ void addVectorsInto(float *result, float *a, float *b, int N)
23 | {
24 |     // Number of threads in the whole grid (x dimension).
25 |     const int gridThreads = gridDim.x * blockDim.x;
26 | 
27 |     // Start at this thread's global index and advance by the grid size.
28 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
29 |     for (; idx < N; idx += gridThreads)
30 |         result[idx] = a[idx] + b[idx];
31 | }
32 | 
33 | /*
34 |  * Host function to confirm that the first `N` values of `vector` all equal
35 |  * `target`; prints the first mismatch and exits with status 1 otherwise.
36 |  */
37 | 
38 | void checkElementsAre(float target, float *vector, int N)
39 | {
40 |     int pos = 0;
41 |     while (pos < N && vector[pos] == target)
42 |         ++pos;
43 |     if (pos < N)
44 |     {
45 |         printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", pos, vector[pos], target);
46 |         exit(1);
47 |     }
48 |     printf("Success! All values calculated correctly.\n");
49 | }
50 | // Host driver: fill a and b on the CPU, compute c = a + b on the GPU, verify c on the CPU.
51 | int main()
52 | {
53 |     const int N = 2 << 24; // 33,554,432 elements per vector
54 |     const size_t bytes = N * sizeof(float);
55 | 
56 |     // Managed allocations: the same pointers are used by host code and by the kernel.
57 |     float *a, *b, *c;
58 |     cudaMallocManaged(&a, bytes);
59 |     cudaMallocManaged(&b, bytes);
60 |     cudaMallocManaged(&c, bytes);
61 | 
62 |     // Host-side initialization: a = 3, b = 4, c = 0.
63 |     initWith(3, a, N);
64 |     initWith(4, b, N);
65 |     initWith(0, c, N);
66 | 
67 |     /*
68 |      * Starting execution configuration; nsys should register
69 |      * performance changes when it is updated.
70 |      */
71 | 
72 |     const size_t threadsPerBlock = 64;
73 |     const size_t numberOfBlocks = 32;
74 | 
75 |     addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
76 | 
77 |     // Errors raised by the launch itself are read back with cudaGetLastError().
78 |     const cudaError_t launchErr = cudaGetLastError();
79 |     if (launchErr != cudaSuccess)
80 |     {
81 |         printf("Error: %s\n", cudaGetErrorString(launchErr));
82 |     }
83 | 
84 |     // Block until the kernel has finished; the return value reports errors raised while it ran.
85 |     const cudaError_t syncErr = cudaDeviceSynchronize();
86 |     if (syncErr != cudaSuccess)
87 |     {
88 |         printf("Error: %s\n", cudaGetErrorString(syncErr));
89 |     }
90 | 
91 |     // 3 + 4: every element of c is expected to be 7.
92 |     checkElementsAre(7, c, N);
93 | 
94 |     cudaFree(a);
95 |     cudaFree(b);
96 |     cudaFree(c);
97 | }
98 | 


--------------------------------------------------------------------------------
/Lab6/Ex3/03-init-kernel-solution.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | // Kernel: write `num` into a[i] for each index i < N assigned to this thread.
 3 | __global__ void initWith(float num, float *a, int N)
 4 | {
 5 |     // Each thread starts at its global index and hops by the total thread count.
 6 |     const int hop = gridDim.x * blockDim.x;
 7 |     int slot = blockIdx.x * blockDim.x + threadIdx.x;
 8 |     while (slot < N)
 9 |     {
10 |         a[slot] = num;
11 |         slot += hop;
12 |     }
13 | }
14 | // Kernel: result[i] = a[i] + b[i] for each index i < N assigned to this thread.
15 | __global__ void addVectorsInto(float *result, float *a, float *b, int N)
16 | {
17 |     const int step = gridDim.x * blockDim.x;
18 |     int k = threadIdx.x + blockDim.x * blockIdx.x;
19 |     while (k < N)
20 |     {
21 |         result[k] = a[k] + b[k];
22 |         k += step;
23 |     }
24 | }
25 | // Host check: print the first element of vector[0..N) that differs from target and exit(1); otherwise print success.
26 | void checkElementsAre(float target, float *vector, int N)
27 | {
28 |     const float *cell = vector;
29 |     for (int at = 0; at < N; ++at, ++cell)
30 |     {
31 |         if (*cell == target)
32 |             continue;
33 |         printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", at, *cell, target);
34 |         exit(1);
35 |     }
36 |     printf("Success! All values calculated correctly.\n");
37 | }
38 | // Host driver: initialize a, b, c with GPU kernels, compute c = a + b on the GPU, verify c on the CPU.
39 | int main()
40 | {
41 |     // Query the active device and its SM count; the grid size is derived from it.
42 |     int deviceId, numberOfSMs;
43 |     cudaGetDevice(&deviceId);
44 |     cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
45 | 
46 |     const int N = 2 << 24; // 33,554,432 elements per vector
47 |     const size_t bytes = N * sizeof(float);
48 | 
49 |     float *a, *b, *c;
50 |     cudaMallocManaged(&a, bytes);
51 |     cudaMallocManaged(&b, bytes);
52 |     cudaMallocManaged(&c, bytes);
53 | 
54 |     // The vectors are first touched by GPU kernels, so request them on the device up front.
55 |     cudaMemPrefetchAsync(a, bytes, deviceId);
56 |     cudaMemPrefetchAsync(b, bytes, deviceId);
57 |     cudaMemPrefetchAsync(c, bytes, deviceId);
58 | 
59 |     // 256 threads per block, 32 blocks for every SM reported above.
60 |     const size_t threadsPerBlock = 256;
61 |     const size_t numberOfBlocks = 32 * numberOfSMs;
62 | 
63 |     // Initialization kernels: a = 3, b = 4, c = 0.
64 |     initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
65 |     initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
66 |     initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);
67 | 
68 |     addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
69 | 
70 |     // Errors raised by the launch itself are read back with cudaGetLastError().
71 |     const cudaError_t launchErr = cudaGetLastError();
72 |     if (launchErr != cudaSuccess)
73 |     {
74 |         printf("Error: %s\n", cudaGetErrorString(launchErr));
75 |     }
76 | 
77 |     // Block until all kernels have finished; the return value reports errors raised while they ran.
78 |     const cudaError_t syncErr = cudaDeviceSynchronize();
79 |     if (syncErr != cudaSuccess)
80 |     {
81 |         printf("Error: %s\n", cudaGetErrorString(syncErr));
82 |     }
83 | 
84 |     // 3 + 4: every element of c is expected to be 7.
85 |     checkElementsAre(7, c, N);
86 | 
87 |     cudaFree(a);
88 |     cudaFree(b);
89 |     cudaFree(c);
90 | }
91 | 


--------------------------------------------------------------------------------
/Lab6/Ex3/03-nbody.cu:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include "timer.h"
  5 | #include "files.h"
  6 | 
  7 | #define SOFTENING 1e-9f
  8 | 
  9 | /*
 10 |  * One simulated body: position (x, y, z) followed by velocity
 11 |  * (vx, vy, vz), all floats. main() casts a raw float buffer to
 12 |  * Body*, so the field order and types must not be changed.
 13 |  */
 14 | typedef struct
 15 | {
 16 |     float x, y, z, vx, vy, vz;
 17 | } Body;
 18 | 
 19 | /*
 20 |  * Kernel: for every body index handled by this thread, accumulate the
 21 |  * softened inverse-square pull of all n bodies on that body and add dt
 22 |  * times the sum to its velocity. Positions are read, never written, here.
 23 |  */
 24 | __global__ void bodyForce(Body *p, float dt, int n)
 25 | {
 26 |     const int gridThreads = blockDim.x * gridDim.x;
 27 |     const int firstBody = blockIdx.x * blockDim.x + threadIdx.x;
 28 |     for (int self = firstBody; self < n; self += gridThreads)
 29 |     {
 30 |         // Running sums of the x, y and z pull on body `self`.
 31 |         float pullX = 0.0f, pullY = 0.0f, pullZ = 0.0f;
 32 | 
 33 |         // other == self is included: its separation is zero, so it adds nothing.
 34 |         for (int other = 0; other < n; ++other)
 35 |         {
 36 |             const float sepX = p[other].x - p[self].x;
 37 |             const float sepY = p[other].y - p[self].y;
 38 |             const float sepZ = p[other].z - p[self].z;
 39 |             const float softDistSqr = sepX * sepX + sepY * sepY + sepZ * sepZ + SOFTENING;
 40 |             const float invDist = rsqrtf(softDistSqr);
 41 |             const float invDistCubed = invDist * invDist * invDist;
 42 | 
 43 |             pullX += sepX * invDistCubed;
 44 |             pullY += sepY * invDistCubed;
 45 |             pullZ += sepZ * invDistCubed;
 46 |         }
 47 | 
 48 |         p[self].vx += dt * pullX;
 49 |         p[self].vy += dt * pullY;
 50 |         p[self].vz += dt * pullZ;
 51 |     }
 52 | }
 53 | // Host driver: load the bodies, run nIters steps (GPU velocity update, CPU position update), save and report throughput.
 54 | int main(const int argc, const char **argv)
 55 | {
 56 | 
 57 |     // The assessment will test against both 2<<11 and 2<<15.
 58 |     // Feel free to pass the command line argument 15 when you generate ./nbody report files
 59 |     int nBodies = 2 << 11;
 60 |     if (argc > 1)
 61 |         nBodies = 2 << atoi(argv[1]);
 62 | 
 63 |     // The assessment will pass hidden initialized values to check for correctness.
 64 |     // You should not make changes to these files, or else the assessment will not work.
 65 |     const char *initialized_values;
 66 |     const char *solution_values;
 67 | 
 68 |     if (nBodies == 2 << 11)
 69 |     {
 70 |         initialized_values = "files/initialized_4096";
 71 |         solution_values = "files/solution_4096";
 72 |     }
 73 |     else
 74 |     { // every other size is treated as nBodies == 2<<15
 75 |         initialized_values = "files/initialized_65536";
 76 |         solution_values = "files/solution_65536";
 77 |     }
 78 | 
 79 |     if (argc > 2)
 80 |         initialized_values = argv[2];
 81 |     if (argc > 3)
 82 |         solution_values = argv[3];
 83 | 
 84 |     const float dt = 0.01f; // Time step
 85 |     const int nIters = 10;  // Simulation iterations
 86 | 
 87 |     int deviceId;
 88 |     int numberofSMs;
 89 |     cudaGetDevice(&deviceId);
 90 |     cudaDeviceGetAttribute(&numberofSMs, cudaDevAttrMultiProcessorCount, deviceId);
 91 | 
 92 |     int bytes = nBodies * sizeof(Body);
 93 |     float *buf;
 94 | 
 95 |     cudaMallocManaged(&buf, bytes);
 96 | 
 97 |     Body *p = (Body *)buf;
 98 | 
 99 |     read_values_from_file(initialized_values, buf, bytes);
100 | 
101 |     double totalTime = 0.0;
102 | 
103 |     /*
104 |      * This simulation will run for 10 cycles of time, calculating gravitational
105 |      * interaction amongst bodies, and adjusting their positions to reflect.
106 |      */
107 |     // Request the body data on the GPU before the first kernel launch.
108 |     cudaMemPrefetchAsync(p, bytes, deviceId);
109 | 
110 |     for (int iter = 0; iter < nIters; iter++)
111 |     {
112 |         StartTimer();
113 | 
114 |         /*
115 |          * You will likely wish to refactor the work being done in `bodyForce`,
116 |          * and potentially the work to integrate the positions.
117 |          */
118 | 
119 |         size_t threadsPerBlock;
120 |         size_t numberofBlocks;
121 |         threadsPerBlock = 1024;
122 |         numberofBlocks = numberofSMs * 32;
123 |         bodyForce<<<numberofBlocks, threadsPerBlock>>>(p, dt, nBodies); // compute interbody forces
124 |         cudaDeviceSynchronize();
125 | 
126 |         /*
127 |          * This position integration cannot occur until this round of `bodyForce` has completed.
128 |          * Also, the next round of `bodyForce` cannot begin until the integration is complete.
129 |          * NOTE(review): this loop runs on the host, so the managed pages presumably move between GPU and CPU every iteration.
130 |          */
131 |         for (int i = 0; i < nBodies; i++)
132 |         { // integrate position
133 |             p[i].x += p[i].vx * dt;
134 |             p[i].y += p[i].vy * dt;
135 |             p[i].z += p[i].vz * dt;
136 |         }
137 | 
138 |         const double tElapsed = GetTimer() / 1000.0;
139 |         totalTime += tElapsed;
140 |     }
141 | 
142 |     double avgTime = totalTime / (double)(nIters);
143 |     float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
144 |     write_values_to_file(solution_values, buf, bytes);
145 | 
146 |     // You will likely enjoy watching this value grow as you accelerate the application,
147 |     // but beware that a failure to correctly synchronize the device might result in
148 |     // unrealistically high values.
149 |     printf("%0.3f Billion Interactions / second", billionsOfOpsPerSecond);
150 | 
151 |     cudaFree(buf); // buf came from cudaMallocManaged, so it must be released with cudaFree, not free()
152 | }
153 | 


--------------------------------------------------------------------------------
/Lab6/Ex3/03-prefetch-check-solution.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | // Kernel: store `num` in a[i] for each index i < N that falls to this thread.
 3 | __global__ void initWith(float num, float *a, int N)
 4 | {
 5 |     // Distance between consecutive indices handled by one thread: all threads in the grid.
 6 |     const int gridThreads = blockDim.x * gridDim.x;
 7 | 
 8 |     int cursor = threadIdx.x + blockIdx.x * blockDim.x;
 9 |     for (; cursor < N; cursor += gridThreads)
10 |     {
11 |         a[cursor] = num;
12 |     }
13 | }
14 | // Kernel: element-wise sum, result[i] = a[i] + b[i], over the indices i < N that fall to this thread.
15 | __global__ void addVectorsInto(float *result, float *a, float *b, int N)
16 | {
17 |     const int gridThreads = blockDim.x * gridDim.x;
18 |     int cursor = threadIdx.x + blockIdx.x * blockDim.x;
19 |     while (cursor < N)
20 |     {
21 |         result[cursor] = a[cursor] + b[cursor];
22 |         cursor += gridThreads;
23 |     }
24 | }
25 | // Host check: report the first element of vector[0..N) that differs from target and exit(1); otherwise print success.
26 | void checkElementsAre(float target, float *vector, int N)
27 | {
28 |     int pos = 0;
29 |     while (pos < N && vector[pos] == target)
30 |         ++pos;
31 |     if (pos < N)
32 |     {
33 |         printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", pos, vector[pos], target);
34 |         exit(1);
35 |     }
36 |     printf("Success! All values calculated correctly.\n");
37 | }
38 | // Host driver: GPU init and add with device prefetch, then prefetch c back to the host before verifying it.
39 | int main()
40 | {
41 |     // Query the active device and its SM count; the grid size is derived from it.
42 |     int deviceId, numberOfSMs;
43 |     cudaGetDevice(&deviceId);
44 |     cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
45 | 
46 |     const int N = 2 << 24; // 33,554,432 elements per vector
47 |     const size_t bytes = N * sizeof(float);
48 | 
49 |     float *a, *b, *c;
50 |     cudaMallocManaged(&a, bytes);
51 |     cudaMallocManaged(&b, bytes);
52 |     cudaMallocManaged(&c, bytes);
53 | 
54 |     // The vectors are first touched by GPU kernels, so request them on the device up front.
55 |     cudaMemPrefetchAsync(a, bytes, deviceId);
56 |     cudaMemPrefetchAsync(b, bytes, deviceId);
57 |     cudaMemPrefetchAsync(c, bytes, deviceId);
58 | 
59 |     // 256 threads per block, 32 blocks for every SM reported above.
60 |     const size_t threadsPerBlock = 256;
61 |     const size_t numberOfBlocks = 32 * numberOfSMs;
62 | 
63 |     // Initialization kernels: a = 3, b = 4, c = 0.
64 |     initWith<<<numberOfBlocks, threadsPerBlock>>>(3, a, N);
65 |     initWith<<<numberOfBlocks, threadsPerBlock>>>(4, b, N);
66 |     initWith<<<numberOfBlocks, threadsPerBlock>>>(0, c, N);
67 | 
68 |     addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
69 | 
70 |     // Errors raised by the launch itself are read back with cudaGetLastError().
71 |     const cudaError_t launchErr = cudaGetLastError();
72 |     if (launchErr != cudaSuccess)
73 |     {
74 |         printf("Error: %s\n", cudaGetErrorString(launchErr));
75 |     }
76 | 
77 |     // Block until all kernels have finished; the return value reports errors raised while they ran.
78 |     const cudaError_t syncErr = cudaDeviceSynchronize();
79 |     if (syncErr != cudaSuccess)
80 |     {
81 |         printf("Error: %s\n", cudaGetErrorString(syncErr));
82 |     }
83 | 
84 |     // The host reads c next, so request its pages back on the CPU.
85 |     cudaMemPrefetchAsync(c, bytes, cudaCpuDeviceId);
86 | 
87 |     checkElementsAre(7, c, N);
88 | 
89 |     cudaFree(a);
90 |     cudaFree(b);
91 |     cudaFree(c);
92 | }
93 | 


--------------------------------------------------------------------------------
/Lab6/Ex3/03-print-numbers-solution.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <unistd.h>
 3 | // Kernel: every launched thread prints `number` followed by a newline using device-side printf.
 4 | __global__ void printNumber(int number)
 5 | {
 6 |     printf("%d\n", number);
 7 | }
 8 | // Host driver: launch printNumber for 0..4, each launch in its own newly created stream.
 9 | int main()
10 | {
11 |     for (int value = 0; value != 5; value++)
12 |     {
13 |         cudaStream_t ownStream;
14 |         cudaStreamCreate(&ownStream);
15 |         printNumber<<<1, 1, 0, ownStream>>>(value);
16 |         cudaStreamDestroy(ownStream); // issued right after the launch, before the final sync
17 |     }
18 |     cudaDeviceSynchronize(); // wait for every launch before main returns
19 | }
20 | 


--------------------------------------------------------------------------------
/Lab6/Ex3/03-print-numbers.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | // Kernel: every launched thread prints `number` followed by a newline using device-side printf.
 3 | __global__ void printNumber(int number)
 4 | {
 5 |     printf("%d\n", number);
 6 | }
 7 | // Host driver: launch printNumber once for each value 0..4 without naming a stream, then wait.
 8 | int main()
 9 | {
10 |     int value = 0;
11 |     while (value < 5)
12 |         printNumber<<<1, 1>>>(value++);
13 |     // Kernel launches are asynchronous; block until all of them have finished.
14 |     cudaDeviceSynchronize();
15 | }
16 | 


--------------------------------------------------------------------------------
/Lab6/Ex3/03-stream-init-solution.cu:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <stdio.h>
  3 | // Kernel: set a[i] = num for each index i < N that this thread covers.
  4 | __global__ void initWith(float num, float *a, int N)
  5 | {
  6 |     // Each thread begins at its global index and advances by the number of threads in the grid.
  7 |     const int advance = gridDim.x * blockDim.x;
  8 |     int at = blockDim.x * blockIdx.x + threadIdx.x;
  9 |     while (at < N)
 10 |     {
 11 |         a[at] = num;
 12 |         at += advance;
 13 |     }
 14 | }
 15 | // Kernel: result[i] = a[i] + b[i] for each index i < N that this thread covers.
 16 | __global__ void addVectorsInto(float *result, float *a, float *b, int N)
 17 | {
 18 |     const int advance = gridDim.x * blockDim.x;
 19 |     int at = blockDim.x * blockIdx.x + threadIdx.x;
 20 |     while (at < N)
 21 |     {
 22 |         result[at] = a[at] + b[at];
 23 |         at += advance;
 24 |     }
 25 | }
 26 | // Host check: report the first element of vector[0..N) that differs from target and exit(1); otherwise print success.
 27 | void checkElementsAre(float target, float *vector, int N)
 28 | {
 29 |     const float *cell = vector;
 30 |     for (int at = 0; at < N; ++at, ++cell)
 31 |     {
 32 |         if (*cell == target)
 33 |             continue;
 34 |         printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", at, *cell, target);
 35 |         exit(1);
 36 |     }
 37 |     printf("Success! All values calculated correctly.\n");
 38 | }
 39 | // Host driver: initialize a, b, c in three separate streams, add them on the GPU, verify c on the CPU.
 40 | int main()
 41 | {
 42 |     // Query the active device and its SM count; the grid size is derived from it.
 43 |     int deviceId, numberOfSMs;
 44 |     cudaGetDevice(&deviceId);
 45 |     cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
 46 | 
 47 |     const int N = 2 << 24; // 33,554,432 elements per vector
 48 |     const size_t bytes = N * sizeof(float);
 49 | 
 50 |     float *a, *b, *c;
 51 |     cudaMallocManaged(&a, bytes);
 52 |     cudaMallocManaged(&b, bytes);
 53 |     cudaMallocManaged(&c, bytes);
 54 | 
 55 |     // The vectors are first touched by GPU kernels, so request them on the device up front.
 56 |     cudaMemPrefetchAsync(a, bytes, deviceId);
 57 |     cudaMemPrefetchAsync(b, bytes, deviceId);
 58 |     cudaMemPrefetchAsync(c, bytes, deviceId);
 59 | 
 60 |     // 256 threads per block, 32 blocks for every SM reported above.
 61 |     const size_t threadsPerBlock = 256;
 62 |     const size_t numberOfBlocks = 32 * numberOfSMs;
 63 | 
 64 |     /*
 65 |      * One non-default stream per vector, so the three initWith
 66 |      * launches are allowed to run concurrently.
 67 |      */
 68 |     cudaStream_t initStreams[3];
 69 |     for (int s = 0; s < 3; ++s)
 70 |     {
 71 |         cudaStreamCreate(&initStreams[s]);
 72 |     }
 73 | 
 74 |     initWith<<<numberOfBlocks, threadsPerBlock, 0, initStreams[0]>>>(3, a, N);
 75 |     initWith<<<numberOfBlocks, threadsPerBlock, 0, initStreams[1]>>>(4, b, N);
 76 |     initWith<<<numberOfBlocks, threadsPerBlock, 0, initStreams[2]>>>(0, c, N);
 77 | 
 78 |     /*
 79 |      * The add is launched without a stream argument (default stream).
 80 |      * NOTE(review): correctness relies on the default stream waiting for
 81 |      * the three init streams (legacy default-stream behaviour) - confirm
 82 |      * the build does not enable per-thread default streams.
 83 |      */
 84 |     addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
 85 | 
 86 |     const cudaError_t launchErr = cudaGetLastError();
 87 |     if (launchErr != cudaSuccess)
 88 |     {
 89 |         printf("Error: %s\n", cudaGetErrorString(launchErr));
 90 |     }
 91 | 
 92 |     const cudaError_t syncErr = cudaDeviceSynchronize();
 93 |     if (syncErr != cudaSuccess)
 94 |     {
 95 |         printf("Error: %s\n", cudaGetErrorString(syncErr));
 96 |     }
 97 | 
 98 |     // The host reads c next, so request its pages back on the CPU.
 99 |     cudaMemPrefetchAsync(c, bytes, cudaCpuDeviceId);
100 | 
101 |     checkElementsAre(7, c, N);
102 | 
103 |     /*
104 |      * Destroy streams when they are no longer needed.
105 |      */
106 |     for (int s = 0; s < 3; ++s)
107 |     {
108 |         cudaStreamDestroy(initStreams[s]);
109 |     }
110 | 
111 |     cudaFree(a);
112 |     cudaFree(b);
113 |     cudaFree(c);
114 | }
115 | 


--------------------------------------------------------------------------------
/Lab6/Ex3/03-vector-add-prefetch-solution.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | // Host function: store `num` into each of the first N elements of `a`.
 3 | void initWith(float num, float *a, int N)
 4 | {
 5 |     // Fill front to back; when N <= 0 the loop body never runs.
 6 |     int next = 0;
 7 |     while (next < N)
 8 |         a[next++] = num;
 9 | }
10 | // Kernel: result[i] = a[i] + b[i] for each index i < N that this thread covers.
11 | __global__ void addVectorsInto(float *result, float *a, float *b, int N)
12 | {
13 |     const int gridThreads = gridDim.x * blockDim.x;
14 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
15 |     while (idx < N)
16 |     {
17 |         result[idx] = a[idx] + b[idx];
18 |         idx += gridThreads;
19 |     }
20 | }
21 | // Host check: report the first element of vector[0..N) that differs from target and exit(1); otherwise print success.
22 | void checkElementsAre(float target, float *vector, int N)
23 | {
24 |     int pos = 0;
25 |     while (pos < N && vector[pos] == target)
26 |         ++pos;
27 |     if (pos < N)
28 |     {
29 |         printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", pos, vector[pos], target);
30 |         exit(1);
31 |     }
32 |     printf("Success! All values calculated correctly.\n");
33 | }
34 | // Host driver: fill the vectors on the CPU, prefetch them to the GPU, compute c = a + b there, verify c on the CPU.
35 | int main()
36 | {
37 |     // Query the active device and its SM count; the grid size is derived from it.
38 |     int deviceId, numberOfSMs;
39 |     cudaGetDevice(&deviceId);
40 |     cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
41 | 
42 |     const int N = 2 << 24; // 33,554,432 elements per vector
43 |     const size_t bytes = N * sizeof(float);
44 | 
45 |     float *a, *b, *c;
46 |     cudaMallocManaged(&a, bytes);
47 |     cudaMallocManaged(&b, bytes);
48 |     cudaMallocManaged(&c, bytes);
49 | 
50 |     // Host-side initialization: a = 3, b = 4, c = 0.
51 |     initWith(3, a, N);
52 |     initWith(4, b, N);
53 |     initWith(0, c, N);
54 | 
55 |     // The kernel reads and writes these vectors next, so request them on the device before launching.
56 |     cudaMemPrefetchAsync(a, bytes, deviceId);
57 |     cudaMemPrefetchAsync(b, bytes, deviceId);
58 |     cudaMemPrefetchAsync(c, bytes, deviceId);
59 | 
60 |     // 256 threads per block, 32 blocks for every SM reported above.
61 |     const size_t threadsPerBlock = 256;
62 |     const size_t numberOfBlocks = 32 * numberOfSMs;
63 | 
64 |     addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);
65 | 
66 |     // Errors raised by the launch itself are read back with cudaGetLastError().
67 |     const cudaError_t launchErr = cudaGetLastError();
68 |     if (launchErr != cudaSuccess)
69 |     {
70 |         printf("Error: %s\n", cudaGetErrorString(launchErr));
71 |     }
72 | 
73 |     // Block until the kernel has finished; the return value reports errors raised while it ran.
74 |     const cudaError_t syncErr = cudaDeviceSynchronize();
75 |     if (syncErr != cudaSuccess)
76 |     {
77 |         printf("Error: %s\n", cudaGetErrorString(syncErr));
78 |     }
79 | 
80 |     // 3 + 4: every element of c is expected to be 7.
81 |     checkElementsAre(7, c, N);
82 | 
83 |     cudaFree(a);
84 |     cudaFree(b);
85 |     cudaFree(c);
86 | }
87 | 


--------------------------------------------------------------------------------
/Lab6/report/Lab6.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/Lab6.pdf


--------------------------------------------------------------------------------
/Lab6/report/image-20220613213324039.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/image-20220613213324039.png


--------------------------------------------------------------------------------
/Lab6/report/image-20220613213540824.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/image-20220613213540824.png


--------------------------------------------------------------------------------
/Lab6/report/image-20220613214044904.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/image-20220613214044904.png


--------------------------------------------------------------------------------
/Lab6/report/image-20220613215757260.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/image-20220613215757260.png


--------------------------------------------------------------------------------
/Lab6/report/image-20220613220353591.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/image-20220613220353591.png


--------------------------------------------------------------------------------
/Lab6/report/image-20220613222635516.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/image-20220613222635516.png


--------------------------------------------------------------------------------
/Lab6/report/image-20220613223730354.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/image-20220613223730354.png


--------------------------------------------------------------------------------
/Lab6/report/image-20220614213733118.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/image-20220614213733118.png


--------------------------------------------------------------------------------
/Lab6/report/image-20220615125537746.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/image-20220615125537746.png


--------------------------------------------------------------------------------
/Lab6/report/image-20220615131242726.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/image-20220615131242726.png


--------------------------------------------------------------------------------
/Lab6/report/image-20220615131738098.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/Lab6/report/image-20220615131738098.png


--------------------------------------------------------------------------------
/Lab6/report/report.md:
--------------------------------------------------------------------------------
  1 | 加速计算正在取代CPU计算，成为目前最佳的计算方法，其中通过GPU进行加速又是目前一种主流的加速计算方式。在本次实验中，就基于NVIDIA的GPU学习平台，学习了有关CUDA的一些基本的原理和使用技巧。
  2 | 
  3 | CUDA计算平台，提供了一种可以扩展C、C++、Python和Fortran等语言的编码范式，该范式能够在世界上性能超强的并行处理器NVIDIA GPU上运行经过加速的大规模并行代码。CUDA可以毫不费力地大幅度加速应用程序，具有适用于DNN、BLAS、图形分析和FFT等更多运算的高度优化库生态系统，并且还附带了强大的命令行和可视化性能分析工具。
  4 | 
  5 | ## 使用CUDA C/C++加速应用程序
  6 | 
  7 | ### 学习目标
  8 | 
  9 | * 编写编译及运行既可以调用CPU函数又可以启动GPU核函数的C/C++程序
 10 | * 使用执行配置控制并行线程层次结构
 11 | * 重构串行循环以在GPU上并行执行迭代
 12 | * 分配和释放可用于CPU和GPU的内存
 13 | * 处理CUDA代码生成的错误
 14 | * 加速CPU应用程序
 15 | 
 16 | ### 学习及实验
 17 | 
 18 | 加速系统又称异构系统，由 CPU 和 GPU 组成。加速系统会运行 CPU 程序，这些程序会调用能够利用GPU进行并行优化加速的核函数。一般的，并不会把程序的所有过程全部部署到GPU上进行运算，原因是GPU上更适合一些大规模的算术运算，而对于逻辑判断等运算并没有显著优势，并且还可能性能差于CPU。因此需要在合适的地方将计算过程通过核函数的方式分发给GPU，进行加速优化。
 19 | 
 20 | #### 核函数
 21 | 
 22 | 如果想要让某个函数能够分配到GPU上执行，需要首先将这个函数声明为核函数，即使用`__global__`关键字，这个关键字表明以下函数将在GPU上运行，并且可以由CPU或GPU调用。通常的，将CPU上执行的代码称为主机代码，而将GPU上执行的代码称为设备代码。
 23 | 
 24 | 而在调用核函数的时候，需要使用一种特殊的语法标记`GPUFunction<<<number_of_blocks,threads_per_block>>>()`，这表明，启用了number_of_blocks个线程块，而每个线程块内包含threads_per_block个线程，这个函数将会被分配到所有的线程上去并行执行。CUDA的线程层次结构如下图所示
 25 | 
 26 | ![image-20220613213324039](image-20220613213324039.png)
 27 | 
 28 | 同时，由于核函数的启动方式是异步的，CPU代码将不会等待GPU上的程序完全执行完成后再开始。因此在调用了核函数之后，一般需要进行一次同步，即需要等待所有的在GPU上执行的函数完成后，才能够继续串行去执行后续的代码。这就需要使用到`cudaDeviceSynchronize()`函数。
 29 | 
 30 | #### CUDA线程层次结构变量
 31 | 
 32 | 在上一小节中曾经介绍了CUDA的线程层次，即是通过线程块来进行组织的。每一个核函数可以使用多个线程块，并且可以指定线程块内线程的数量。因此我们就需要有一些变量，能够在函数内部确定当前的函数是在哪一个线程块中的哪一个线程上执行的，于是就提出了线程层次结构变量。
 33 | 
 34 | 每个线程在其线程块内部均会被分配一个索引，从0开始。此外，每个线程块也会被分配一个索引，并从0开始。正如线程组成线程块，线程块又会组成网格，而网格是 CUDA 线程层次结构中级别最高的实体。简言之，CUDA 核函数在由一个或多个线程块组成的网格中执行，且每个线程块中均包含相同数量的一个或多个线程。
 35 | 
 36 | CUDA 核函数可以访问能够识别如下两种索引的特殊变量：正在执行核函数的线程（位于线程块内）索引和线程所在的线程块（位于网格内）索引。这两种变量分别为 `threadIdx.x` 和 `blockIdx.x`。
 37 | 
 38 | #### 加速for循环
 39 | 
 40 | 对于一个循环而言，并非要顺次运行循环的每次迭代，而是让每次迭代都在自身线程中并行运行。因此必须首先编写完成循环单次迭代工作的核函数，由于核函数与其他正在运行的核函数无关，因此执行配置必须使核函数执行正确的次数。
 41 | 
 42 | 当然由于每一个线程块内的线程数量是有一个上限的，因此很有必要在调用核函数的时候，同时启用多个线程块，并且在每个线程块内启用多个线程。这时，就可以通过 `threadIdx.x` 和 `blockIdx.x`来定位当前线程在整个任务中的位置，并根据此计算出当前线程所负责的任务。
 43 | 
 44 | 接下来的练习就是使用多个线程块来加速for循环，由于各个线程之间是并行的，因此数字0-9的输出很有可能是乱序的。
 45 | 
 46 | #### CPU和GPU内存分配
 47 | 
 48 | CUDA的最新版本已经能够轻松的分配可用于CPU主机和任意数量GPU设备的内存。尽管现在有许多适用于内存管理并且可支持加速应用程序中最优性能的中高级技术，但是现在要介绍的基础CUDA内存管理技术已经能够支持远超CPU应用程序的卓越性能。
 49 | 
 50 | 其主要区别就是在申请内存的时候，使用`cudaMallocManaged`函数，并在释放内存的时候，使用`cudaFree`函数。
 51 | 
 52 | 在实验中，要求尝试能够分配一种既能在CPU上访问又能在GPU上访问的内存，并且对于这个数组进行一个翻倍的操作。因此只需要使用`cudaMallocManaged`和`cudaFree`函数进行内存的申请和释放，并且将loop函数重构为核函数即可。
 53 | 
 54 | #### 数据与网格不匹配
 55 | 
 56 | 当然，由于数据的规模不同，很有可能会出现数据规模与网格大小不匹配的现象，包括各个网格分配的任务数量不同，以及数据比网格大的时候需要采用循环划分的方式。
 57 | 
 58 | 那么首先就是要注意，需要在每个线程执行核函数的时候判断当前的任务标号是否是合法的，并且如果数据集比网格大的时候，还会涉及到跨网格的数据访问，这就类似于之前在进行任务划分时的循环任务划分的方式。这时只需要注意修改一下循环的步幅即可，将之前的步幅修改为`gridDim.x * blockDim.x`，而在每个线程内仍采用之前的寻址方式。
 59 | 
 60 | ![image-20220613223730354](image-20220613223730354.png)
 61 | 
 62 | #### 错误处理
 63 | 
 64 | 对于任何应用程序而言，错误的检查和处理都是至关重要的，加速CUDA代码中同样有错误处理的相关操作，并且CUDA的很多内置函数都以`cudaError_t`为返回值类型，该返回值可以用于检查调用函数的时候是否发生了错误。但是由于核函数必须声明为`void`类型，因此就不能够通过返回值来判断是否发生了错误，可以使用CUDA提供的方法`cudaGetLastError`方法获取最近的一次错误信息。对于内存申请，以及核函数执行之后的同步操作都需要使用`cudaError_t`的返回值来检查是否发生了错误。
 65 | 
 66 | ## CUDA C/C++统一内存
 67 | 
 68 | ### 学习目标
 69 | 
 70 | * 使用 Nsight Systems 命令行分析被加速的应用程序的性能
 71 | * 利用对流多处理器的理解优化执行配置
 72 | * 理解统一内存在页错误和数据迁移方面的行为
 73 | * 使用异步内存预取减少页错误和数据迁移以提高性能
 74 | * 采用循环式的迭代开发加速应用程序的优化加速和部署
 75 | 
 76 | ### 学习和实验
 77 | 
 78 | #### 使用nsys性能分析工具
 79 | 
 80 | nsys是NVIDIA的命令行分析器，提供分析被加速的应用程序性能的强大功能。nsys会执行使用nvcc编译的可执行程序，并打印应用程序的GPU活动的摘要输出、CUDA API调用情况以及统一内存活动的相关信息。
 81 | 
 82 | 使用`nsys profile` 将会生成一个`qdrep`报告文件，可以增加指令选项`--stats=true`打印输出摘要信息。
 83 | 
 84 | #### 流多处理器
 85 | 
 86 | 运行CUDA应用程序的GPU具有称为流多处理器（SM）的处理单元，在执行核函数的期间，将线程块提供给SM以供其执行。为支持GPU执行尽可能多的并行操作，通常可以选择线程块的数量数倍于指定的GPU上SM数量的网格大小来提高性能。并且SM会在一个名为warp的线程块内创建、管理、调度和执行包含32个线程的线程组，因此可以选择线程数量数倍于32的线程块大小来提升性能。
 87 | 
 88 | ![image-20220614213733118](image-20220614213733118.png)
 89 | 
 90 | 在CUDA中提供了API能够获取SM的数量，即可以调用函数`cudaDeviceGetAttribute`，指定获取`cudaDevAttrMultiProcessorCount`属性的值，便可以得到流多处理器的数量。之后在声明线程的数量的时候，就可以将其数量指定为SM的数倍。
 91 | 
 92 | #### 统一内存
 93 | 
 94 | 通常的使用函数`cudaMallocManaged`分配旨在供主机或设备代码使用的内存，这种方法的便利是能够自动实现内存的迁移并简化编程，但同样会带来内存迁移而产生的性能损失。因此有必要对CUDA的统一内存进行了解。
 95 | 
 96 | 如果是在CPU上调用了`cudaMallocManaged`函数，则所申请的内存会出现在CPU上，而当GPU想要访问相关的数据的时候，就需要将这部分内存迁移到GPU上。而如果在GPU上的核函数调用了`cudaMallocManaged`函数，则申请的内存会分配在GPU上，因此当CPU想要访问这部分数据的时候，又要将数据从GPU迁移到CPU上。
 97 | 
 98 | #### 异步内存预取
 99 | 
100 | 在分配UM的时候，最初可能并没有驻留在CPU或者GPU上，因此当某些线程执行工作的时候就可能发生页错误，因此就会触发内存的迁移，这会造成性能的损失。
101 | 
102 | 在主机到设备和设备到主机的内存传输过程中，我们使用异步内存预取的技术来减少页错误和按需内存迁移成本。通过此技术，可以在应用程序代码使用统一内存 (UM) 之前，在后台将其异步迁移至系统中的任何 CPU 或 GPU 设备。此举可以减少页错误和按需数据迁移所带来的成本，并进而提高 GPU 核函数和 CPU 函数的性能。
103 | 
104 | 但是由于预取往往会以更大的数据块来迁移数据，因此这种迁移的成本也是很高的，也要尽可能减少这种预取操作。一般的，当在运行之前已经知道数据访问需要且数据访问并未采用稀疏模式的时候，就可以考虑使用异步内存预取。
105 | 
106 | CUDA也提供了相关的函数来实现异步内存预取，可以使用`cudaMemPrefetchAsync`来实现CPU和GPU之间的数据预取。
107 | 
108 | ## CUDA加速应用程序的异步流和可视化分析
109 | 
110 | ### 学习目标
111 | 
112 | * 使用Nsight Systems直观描述由GPU加速的CUDA应用程序的时间表
113 | * 使用Nsight Systems识别和利用CUDA应用程序中的优化机会
114 | * 利用CUDA流在被加速的应用程序中并发执行核函数
115 | 
116 | ### 学习和实验
117 | 
118 | #### 比较异步预取
119 | 
120 | 在上一章的学习过程中我们了解了异步预取对于程序性能的重要影响，即如果已经提前明确了在程序的哪一部分会发生内存的迁移，则可以提前将这部分数据预取到GPU或者CPU上，来减少分页错误。在本次实验中，就可以通过使用Nsight Systems这个可视化的性能分析工具，来观察异步预取对于程序的性能影响。
121 | 
122 | 这里对比了不进行异步预取和提前进行异步预取这两种方式，在进行向量加法时的访存差异。可以看到，由于在进行向量加法之前，提前将数据预取到了GPU，因此在执行核函数的时候，并没有发生内存的迁移。
123 | 
124 | ![image-20220615125537746](image-20220615125537746.png)
125 | 
126 | #### 并发CUDA流
127 | 
128 | 在CUDA编程中，流是由按照顺序执行的一系列命令构成的。在CUDA应用程序中，核函数的执行以及一些内存传输均在CUDA流中进行。除了默认的流以外，CUDA还可以创建并使用非默认的CUDA流，此举可以支持执行多个操作。对于一个给定的流，其中的所有操作都会顺序执行，不同的非默认流之间是并行执行的，默认流拥有阻断能力，即它会等待所有其他的流执行完成后才开始执行自己的操作，并在此时阻塞其他的非默认流。
129 | 
130 | 可以使用`cudaStreamCreate`来创建一个流，并且将得到的流作为一个参数传递给核函数，`someKernel<<<number_of_blocks, threads_per_block, 0, stream>>>()`。
131 | 
132 | ![image-20220615131242726](image-20220615131242726.png)
133 | 
134 | ## 证书
135 | 
136 | ![image-20220615131738098](image-20220615131738098.png)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Parallel-Programming
2 | 


--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | # test.sh
2 | # !/bin/sh
3 | pssh -h $PBS_NODEFILE -i "if [ ! -d \"/home/sTest/test\" ];then mkdir -p \"/home/sTest/test\"; fi" 1>&2  # 这里有一个if语句，是保证mkdir命令执行不会报错，也就是如果文件夹不存在则创建路径。文件夹路径就是/home/sTest/test。
4 | pscp -h $PBS_NODEFILE /home/sTest/test/hello /home/sTest/test 1>&2  # 第一个文件路径是你的可执行文件在master节点的路径，第二个文件夹路径是你希望把文件发送到其他计算节点的具体文件夹，应与当前节点可执行文件所在文件夹路径保持一致。
5 | /home/sTest/test/hello  # 执行文件
6 | # 使用本脚本前，先把第三四五行#号后的注释以及本行注释删掉。


--------------------------------------------------------------------------------
/实验指导书/CUDA_C_Best_Practices_Guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/实验指导书/CUDA_C_Best_Practices_Guide.pdf


--------------------------------------------------------------------------------
/实验指导书/OpenMP-4.5-1115-CPP-web.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/实验指导书/OpenMP-4.5-1115-CPP-web.pdf


--------------------------------------------------------------------------------
/实验指导书/OpenMPRefCard-5-2-web.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/实验指导书/OpenMPRefCard-5-2-web.pdf


--------------------------------------------------------------------------------
/实验指导书/实验教学指导书-1 实验环境搭建.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/实验指导书/实验教学指导书-1 实验环境搭建.pdf


--------------------------------------------------------------------------------
/实验指导书/实验教学指导书-2 体系结构相关及性能测试.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/实验指导书/实验教学指导书-2 体系结构相关及性能测试.pdf


--------------------------------------------------------------------------------
/实验指导书/实验教学指导书-3 SIMD编程.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/实验指导书/实验教学指导书-3 SIMD编程.pdf


--------------------------------------------------------------------------------
/实验指导书/实验教学指导书-4 Pthread编程.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/实验指导书/实验教学指导书-4 Pthread编程.pdf


--------------------------------------------------------------------------------
/实验指导书/实验教学指导书-5 OpenMP编程.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/实验指导书/实验教学指导书-5 OpenMP编程.pdf


--------------------------------------------------------------------------------
/实验指导书/实验教学指导书-6 MPI编程.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/实验指导书/实验教学指导书-6 MPI编程.pdf


--------------------------------------------------------------------------------
/实验指导书/实验教学指导书-7 GPU编程.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/实验指导书/实验教学指导书-7 GPU编程.pdf


--------------------------------------------------------------------------------
/调研/Intel Core 12th CPU 架构调研.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/调研/Intel Core 12th CPU 架构调研.pdf


--------------------------------------------------------------------------------
/调研/image-20220226163655963.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/调研/image-20220226163655963.png


--------------------------------------------------------------------------------
/调研/image-20220226164012526.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/调研/image-20220226164012526.png


--------------------------------------------------------------------------------
/调研/image-20220227154957590.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/调研/image-20220227154957590.png


--------------------------------------------------------------------------------
/调研/image-20220227155106394.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/调研/image-20220227155106394.png


--------------------------------------------------------------------------------
/调研/image-20220227170838930.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/调研/image-20220227170838930.png


--------------------------------------------------------------------------------
/调研/image-20220227170901851.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/调研/image-20220227170901851.png


--------------------------------------------------------------------------------
/调研/image-20220227175847534.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/调研/image-20220227175847534.png


--------------------------------------------------------------------------------
/调研/image-20220227210558360.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/调研/image-20220227210558360.png


--------------------------------------------------------------------------------
/调研/intel-architecture-day-2021-presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/调研/intel-architecture-day-2021-presentation.pdf


--------------------------------------------------------------------------------
/调研/s_a991b8031e974f40838bdc54f32a43b9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NKULYX/NKU-COSC0025-Parallel-Programming/b5921c871d2697f405a9f004093d66f5c7d93897/调研/s_a991b8031e974f40838bdc54f32a43b9.jpg


--------------------------------------------------------------------------------
/调研/调研.md:
--------------------------------------------------------------------------------
 1 | # Intel Core CPU的发展历史
 2 | 
 3 | | 代数 |   型号    | 核心数 | 线程数 | 最大主频 | 三级缓存 | 二级缓存 |
 4 | | :--: | :-------: | :----: | :----: | :------: | :------: | :------: |
 5 | |  7   | i7-7700K  |   4    |   8    |  4.5GHz  |   8MB    |   1MB    |
 6 | |  8   | i7-8086K  |   6    |   12   |  5.0GHz  |   12MB   |  1.5MB   |
 7 | |  9   | i9-9900K  |   8    |   16   |  5.0GHz  |   16MB   |   2MB    |
 8 | |  10  | i9-10900K |   10   |   20   |  5.2GHz  |   20MB   |  2.5MB   |
 9 | |  11  | i9-11900K |   8    |   16   |  5.3GHz  |   16MB   |   2MB    |
10 | |  12  | i9-12900K |   16   |   24   |  5.2GHz  |   30MB   |   14MB   |
11 | 
12 | # Intel 12th Core CPU参数
13 | 
14 | 以Intel 12th Core i9 为例
15 | 
16 | |      指标       |     数据      |
17 | | :-------------: | :-----------: |
18 | |    最大主频     |    5.2GHz     |
19 | | P-Core 最大频率 |    5.1GHz     |
20 | | E-Core 最大频率 |    3.9GHz     |
21 | |  处理器内核数   |      16       |
22 | |     P-Core      |       8       |
23 | |     E-Core      |       8       |
24 | |  处理器线程数   |      24       |
25 | |    L3 Cache     |     30MB      |
26 | |    L2 Cache     |     14MB      |
27 | |  最大内存速度   | DDR5 4800MT/s |
28 | 
29 | # Intel 7 制程工艺
30 | 
31 | Intel移动端的处理器早在2019年就开始使用了10nm的制程工艺，而桌面端直到上一代11代处理器仍在打磨14nm制程工艺，如今在Intel 12th CPU上终于采用了最新的Intel 7制程工艺。Intel 7 实际上就是原来的 10nm Enhanced SuperFin，属于Intel第三代10nm技术。相较于14nm技术，10nm制程工艺能够在同等面积的芯片上集成更多的晶体管，因此性能能够有很大的提升，除此之外，10nm的能耗也要优于14nm的制程工艺。虽然目前最细的制程工艺是台积电的7nm，但是Intel 7的10nm制程工艺的晶体管密度还要比台积电7nm的晶体管密度稍高一些。
32 | 
33 | |                  | Intel 10nm | 台积电 7nm |
34 | | :--------------: | :--------: | :--------: |
35 | | 万晶体管/ $mm^2$ |   10076    |    9650    |
36 | 
37 | # Intel 12th Core 架构设计
38 | 
39 | Intel 12th Core 采用了大小核混合架构的设计，即性能核心（Performance Core）和能效核心（Efficient Core）。
40 | 
41 | ![Intel 12代酷睿细节公开：功耗低至9W、大小核必须配Win11](s_a991b8031e974f40838bdc54f32a43b9.jpg)
42 | 
43 | P-Core支持超线程，频率在3.2-5.1GHz，E-Core不支持超线程，频率在2.4-3.9GHz。在缓存方面，每个P-Core拥有256KB的指令缓存和48KB的数据缓存，每一个P-Core独享1.25MB的L2 Cache；每一个E-Core拥有64KB的指令缓存和32KB的数据缓存，八个E-Core共享4MB的L2 Cache。P-Core和E-Core共享30MB的三级缓存。
44 | 
45 | ![image-20220227170838930](image-20220227170838930.png)
46 | 
47 | ![image-20220227170901851](image-20220227170901851.png)
48 | 
49 | 大小核设计的目的：大小核设计在ARM架构下很早就已经在手机上进行推广了，但是x86的PC端如今是第一次采用大小核的架构设计。除了考虑到大小核设计能够降低功耗之外，其最大的目的还是兼顾了单核性能的提升和多核性能的均衡。P-Core 最大限度地提高单线程性能和响应速度，而 E-Core 则为现代多任务处理提供可扩展的多线程性能和高效的后台任务卸载。
50 | 
51 | 现代的CPU微架构主要可以分为前端和后端两部分，前端主要负责取指和解码两部分操作，后端主要实现执行和写回这两个操作。前端需要尽可能多地从内存中获取指令，并将其解码成后端能够直接执行的微操作，此外，前端还需要能够进行准确快速的分支判断。因此前端在设计架构时的目标，一个是要有足够的深度，能够缓存足够多的指令，减少数据从内存传输到CPU的等待时间，也能够降低访存带来的功耗。另一个是提高分支判断的准确性，能够正确预测后续的指令，避免因为预测失误而冲刷流水线导致巨大的性能损失。后端优化的主要目的则是要尽可能地提高宽度，让更多解码后的微指令可以并行乱序地去执行。需要提供更多的发射端口、乱序执行窗口以及更多的算术逻辑单元ALU等等。
52 | 
53 | 就E-Core而言，在前端方面，包含64KB的指令缓存，可以在不消耗子系统功率的情况下保存可用的指令；使用了Intel的第一个按需指令长度解码器，可以预测解码信息，加速现代工作负载；进一步优化的分支预测单元，借助更深的分支历史和更大的指令尺寸，拥有5000个条目的分支目标缓存区，极大提高了分支预测的准确性。E-Core还包含了两个3宽度的乱序解码器，继承了Tremont的低功耗内核，将解码宽度由原来的4提升到6，能够单周期完成高达6个指令的解码，还能在这两个解码器之间做负载均衡和乱序执行，从而进一步提升解码操作的并行性。
54 | 
55 | ![image-20220227175847534](image-20220227175847534.png)
56 | 
57 | 在后端方面，具备5组宽度分配、8组宽度引退、256个乱序窗口入口、17个执行端口，以及4个整数ALU、2个载入AGU、2个存储AGU、2个跳转端口、2个整数存储数据、2个浮点/矢量存储、2个浮点/矢量堆栈、以及第3个矢量ALU。
58 | 
59 | ![image-20220227210558360](image-20220227210558360.png)
60 | 
61 | # 性能优化和调度
62 | 
63 | 为了能够保证分支预测的准确性，减少因为预测错误导致流水线的冲刷，在E-Core的分支预测单元中，采用了机器学习的方法，不仅能够提升分支预测的准确性，还能够根据代码量动态调整缓存的大小。后端在分配微指令的时候，也可以预先执行一些简单的指令，从而快速释放后端指令流水线的资源，提升系统的吞吐量。存储系统也可以学习数据的读写模型，从而判断数据之间的依赖程度，并以此作为进出缓存的重要依据。
64 | 
65 | Intel提出的线程调度器，存在于操作系统与硬件之间，可以监测线程和内核的运行情况，并将这些信息传送给操作系统，为OS中的线程调度提供支持。对于性能要求比较高的进程，就会被优先放在P-Core上运行，而对于一些后台任务则会被放在E-Core上运行，如果有优先级更高的进程出现的时候，线程调度器就可以帮助操作系统来决定将P-Core上的哪些线程转移到E-Core上来运行。这些调度都是动态完成的，而不是依赖具体的软件或者静态的调度方法实现的。Intel Thread Director 与操作系统（OS）无缝协作智能优化性能，在正确时间将正确的任务放在正确的内核上。
66 | 
67 | 
68 | 
69 | 英特尔超线程技术（英特尔 HT 技术）能够更高效地利用处理器资源，支持每个内核运行多条线程。作为一项特性，它还提高了处理器吞吐量以及线程化软件的整体性能。
70 | 
71 | * 在运行多个要求严苛的应用程序的同时保证系统的响应能力
72 | * 在保证系统安全，高效，可靠的同时，把对工作效率的影响降到最低
73 | * 给未来业务增长和全新解决方案能力提供充足的扩展空间
74 | 
75 | # 性能对比
76 | 
77 | 我们选取了Intel Core i9-12900HK，Apple M1 MAX，Intel Core i9-11980HK和AMD Ryzen 9 5980HX进行对比，通过数据可以得出，无论是在单核性能还是多核性能方面，Intel Core i9-12900HK均有显著的优势，Intel相比自身上一代处理器，在单核和多核方面的性能提升分别为6%和26%，而对比AMD同期的Ryzen9，在单核和多核方面性能更是高出了22%和57%。
78 | 
79 | |          CPU          | 单核性能 | 多核性能 |
80 | | :-------------------: | :------: | :------: |
81 | | Intel Core i9-12900HK |   1851   |  13256   |
82 | |     Apple M1 MAX      |   1785   |  12753   |
83 | | Intel Core i9-11980HK |   1745   |  10486   |
84 | |  AMD Ryzen 9 5980HX   |   1523   |   8469   |
85 | 
86 | # 总结
87 | 
88 | 本次Intel最新推出的12th Core CPU，首次采用了Intel 7 的制程工艺，并且率先在桌面端推出了大小核结合的CPU架构。通过性能核心和能效核心的配合，在控制功耗的前提下，既提升了单核的处理性能，又发挥了多核的并行性能。不仅在硬件层面做出了极大的优化，而且采用了线程调度器，能够合理地在硬件和操作系统之间调度线程。
89 | 
90 | Intel将现有的计算模式分为：标量计算，向量计算，矩阵计算和空间计算四大类，分别对应基于CPU，GPU，ASIC和FPGA的计算模式，通过这四种类型计算的组合，就可以提供超过摩尔定律发展的非线性的性能提升。当前的芯片设计架构思路主要分为两个方面，一方面是针对特定的专业领域设计专属的芯片架构，另一方面是对不同计算模式之间取长补短达到异构计算的目的，通过这两方面来取得系统性能和功耗的平衡和优化。


--------------------------------------------------------------------------------