├── lab1
    ├── ex1_ppt35_mpi.c
    ├── ex1_ppt35_omp.c
    ├── ex1_ppt36_mpi.c
    ├── ex1_ppt36_omp.c
    ├── ex3_MybcastMPI.c
    ├── ex4Allgather.c
    ├── ex4Alltoall.c
    └── hw
    │   ├── ex2_1-3.c
    │   ├── ex2_3-2.c
    │   └── ex2_3-5.c
├── lab2
    ├── ex1_LU.c
    ├── ex2_QR.c
    └── ex3_summa.c
├── readme.txt
└── 报告
    ├── 图片1.png
    ├── 图片10.PNG
    ├── 图片11.PNG
    ├── 图片2.png
    ├── 图片3.png
    ├── 图片4.png
    ├── 图片5.png
    ├── 图片6.PNG
    ├── 图片7.PNG
    ├── 图片8.png
    ├── 图片9.png
    ├── 实验报告.md
    └── 实验报告.pdf


/lab1/ex1_ppt35_mpi.c:
--------------------------------------------------------------------------------
 1 | #include "mpi.h"
 2 | #include <stdio.h>
 3 | #include <math.h>
 4 | 
 5 | 
 6 | int main(int argc, char *argv[])
 7 | {
 8 |     int pid, pnums;
 9 |     MPI_Init(&argc, &argv);
10 |     MPI_Comm_rank(MPI_COMM_WORLD, &pid);
11 |     MPI_Comm_size(MPI_COMM_WORLD, &pnums);
12 |     int data, recvdata, logN;
13 |     MPI_Status status;
14 |     data = pid+1;
15 |     recvdata = 0;
16 |     logN = (int)log2(pnums);
17 |     printf("process id: %d,data:%d\n", pid,data);
18 |     //sum
19 |     for (int i = 1; i <= logN; i++)
20 |     {
21 |         int tag = i;
22 |         int step = (int)pow(2, i);
23 |         if (pid % step == 0)
24 |         {
25 |             MPI_Recv(&recvdata, 1, MPI_INT, pid+step/2, tag, MPI_COMM_WORLD, &status);
26 |             data += recvdata;
27 |         }
28 |         else
29 |          if(pid % step == step/2)
30 |         {
31 |             MPI_Send(&data, 1, MPI_INT, pid-step/2, tag, MPI_COMM_WORLD);
32 |         }
33 |     }
34 |     //spread
35 |     for (int i = logN; i >0; i--)
36 |     {
37 |         int tag = i;
38 |         int step = (int)pow(2, i);
39 |         if (pid % step == 0)
40 |         {
41 |             MPI_Send(&data, 1, MPI_INT, pid+step/2, tag, MPI_COMM_WORLD);
42 |         }
43 |         else
44 |         if(pid % step == step/2)
45 |         {
46 |             MPI_Recv(&recvdata, 1, MPI_INT, pid-step/2, tag, MPI_COMM_WORLD, &status);
47 |             data = recvdata;
48 |         }
49 |         
50 |     }
51 |     printf("%d sum is %d\n", pid, data);
52 |     MPI_Finalize();
53 |     return 0;
54 | }


--------------------------------------------------------------------------------
/lab1/ex1_ppt35_omp.c:
--------------------------------------------------------------------------------
 1 | #include "omp.h"
 2 | #include <stdio.h>
 3 | #include <math.h>
 4 | 
 5 | int N = 8;
 6 | 
 7 | int main(){
 8 |     int step,logN;
 9 |     int num[N];
10 |     for (int i = 0; i < N; i++)
11 |     {
12 |         num[i] = i+1;
13 |     }
14 |     logN = (int)log2(N);
15 |     int pid;
16 |     printf("numbers:");
17 |     for (int i = 0; i < N; i++)
18 |     {
19 |         printf("%d", num[i]);
20 |         if(i == N-1)
21 |             printf("\n");
22 |     }
23 |     for (int i = logN; i > 0; i--)
24 |     {
25 |         step = (int)pow(2, logN-i+1);
26 | #pragma omp parallel private(pid)
27 |         {
28 |             pid = omp_get_thread_num();
29 |             if (!(pid % step))
30 |             {
31 |                 num[pid] = num[pid]+num[pid+step/2];
32 |             }
33 | #pragma omp barrier
34 |         }
35 |     }
36 |     for (int i = 1; i <= logN; i++)
37 |     {
38 |         step = (int)pow(2, logN-i+1);
39 | #pragma omp parallel private(pid)
40 |         {
41 |             pid = omp_get_thread_num();
42 |             if (!(pid % step))
43 |             {
44 |                 num[pid+step/2] = num[pid];
45 |             }
46 | #pragma omp barrier
47 |         }
48 |     }
49 |     printf("after sum:");
50 |     for (int i = 0; i < N; i++)
51 |     {
52 |         printf("%d", num[i]);
53 |         if(i == N-1)
54 |             printf("\n");
55 |     }
56 |     return 0;
57 | }
58 | 


--------------------------------------------------------------------------------
/lab1/ex1_ppt36_mpi.c:
--------------------------------------------------------------------------------
 1 | #include "mpi.h"
 2 | #include <stdio.h>
 3 | #include <math.h>
 4 | 
 5 | int main(int argc, char *argv[])
 6 | {
 7 |     int id_procs, num_procs;
 8 | 
 9 |     MPI_Init(&argc, &argv);
10 |     MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
11 |     MPI_Comm_rank(MPI_COMM_WORLD, &id_procs);
12 |     int data = id_procs+1;
13 |     int recvdata;
14 |     MPI_Status status;
15 |     printf("process id %d data = %d\n",id_procs, data);
16 |     int logN = (int)log2(num_procs);
17 |     for(int i = 0; i < logN; i++) {
18 |         int tag = i+1;
19 |         int step = (int)pow(2,i);
20 |         int dest = id_procs ^ step;
21 |         MPI_Send(&data, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
22 |         MPI_Recv(&recvdata, 1, MPI_INT, dest, tag, MPI_COMM_WORLD, &status);
23 |         data += recvdata;
24 |     }
25 | 
26 | 
27 |     printf("process id %d sum is = %d\n",id_procs, data);
28 | 
29 |     MPI_Finalize();
30 |     return 0;
31 | }


--------------------------------------------------------------------------------
/lab1/ex1_ppt36_omp.c:
--------------------------------------------------------------------------------
 1 | #include "omp.h"
 2 | #include <stdio.h>
 3 | #include <math.h>
 4 | 
 5 | int N = 8;
 6 | 
 7 | int  main(){
 8 |     int step,logN;
 9 |     int num[N];
10 |     for (int i = 0; i < N; i++)
11 |     {
12 |         num[i] = i+1;
13 |     }
14 |     logN = (int)log2(N);
15 |     int pid, tmp, dest;
16 |     printf("numbers:");
17 |     for (int i = 0; i < N; i++)
18 |     {
19 |         printf("%d ", num[i]);
20 |         if(i == N-1)
21 |             printf("\n");
22 |     }
23 |     for (int i = 0; i < logN; i++)
24 |     {
25 |         step = (int)pow(2, i);
26 | #pragma omp parallel private(pid, tmp, dest)
27 |         {
28 |             pid = omp_get_thread_num();
29 |             tmp = num[pid];
30 |             dest = pid^step;
31 |             tmp = num[pid] + num[dest];
32 | #pragma omp barrier
33 |             num[pid] = tmp;
34 | #pragma omp barrier
35 |         }
36 |     }
37 |     printf("after sum:");
38 |     for (int i = 0; i < N; i++)
39 |     {
40 |         printf("%d ", num[i]);
41 |         if(i == N-1)
42 |             printf("\n");
43 |     }
44 | }


--------------------------------------------------------------------------------
/lab1/ex3_MybcastMPI.c:
--------------------------------------------------------------------------------
 1 | #include <mpi.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <string.h>
 5 | /*
 6 | int MyBcastMPI(void* data, int count, MPI_Datatype datatype, int root, MPI_Comm communicator)
 7 | {
 8 |     return 0;
 9 | }
10 | */
11 | int main(int argc, char *argv[])
12 | {
13 |     int id_procs, num_procs;
14 |     char seq[16];
15 |     int root = 3;
16 |     MPI_Group world_group, new_group;
17 |     MPI_Init(&argc, &argv);
18 |     MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
19 |     MPI_Comm_rank(MPI_COMM_WORLD, &id_procs);
20 |     if (id_procs == root)
21 |     {
22 |         strcpy(seq, "hello,MPI!");
23 |     }
24 |     MPI_Barrier(MPI_COMM_WORLD);
25 | 
26 |     MPI_Comm split_comm_world;
27 |     MPI_Status status;
28 | 
29 |     int rank;
30 |     int size;
31 | 
32 |     // MPI split COMM_WORLD into 4 groups 
33 |     MPI_Comm_split(MPI_COMM_WORLD, id_procs % 4, id_procs, &split_comm_world);
34 |     MPI_Comm_rank(split_comm_world, &rank);
35 |     MPI_Comm_size(split_comm_world, &size);
36 |     //create new group H
37 |     MPI_Comm h_comm_world;
38 |     MPI_Comm_group(MPI_COMM_WORLD, &world_group);
39 |     int grpsize = num_procs / 2;
40 |     int zerolist[] = {0, 1, 2, 3};
41 |     int zerocnt = 0;
42 | 
43 |     MPI_Group_incl(world_group, grpsize, zerolist, &new_group);
44 |     MPI_Comm_create(MPI_COMM_WORLD, new_group, &h_comm_world);
45 |     // message from root to 0 proc of MPI_COMM_WORLD
46 |     if (id_procs == root)
47 |     {
48 |         MPI_Send(&seq, 16, MPI_CHAR, 0, 1, MPI_COMM_WORLD);
49 |     }
50 |     else if (id_procs == 0)
51 |     {
52 |         MPI_Recv(&seq, 16, MPI_CHAR, root, 1, MPI_COMM_WORLD, &status);
53 |     }
54 |     MPI_Barrier(MPI_COMM_WORLD);
55 |     // Broadcast within the group H
56 |     if(h_comm_world != MPI_COMM_NULL)
57 |         MPI_Bcast(&seq, 16, MPI_CHAR, 0, h_comm_world);
58 |     MPI_Barrier(MPI_COMM_WORLD);
59 |     //Broadcasr within the group N
60 | 
61 |     MPI_Bcast(&seq, 16, MPI_CHAR, 0, split_comm_world);
62 |     MPI_Barrier(MPI_COMM_WORLD);
63 | 
64 |     printf("MPI Comm rank %d, original id %d, size %d. the new msg is %s\n", rank, id_procs, size, seq);
65 |     MPI_Finalize();
66 |     return 0;
67 | }


--------------------------------------------------------------------------------
/lab1/ex4Allgather.c:
--------------------------------------------------------------------------------
 1 | #include<stdio.h>
 2 | #include<stdlib.h>
 3 | #include"mpi.h"
 4 | #include<memory.h>
 5 | 
 6 | //采用MPI_Send和MPI_Recv编写代码实现的MPI_Allgather
 7 | void MPI_Allgather_my(int* senddata, int sendcount, MPI_Datatype senddatatype, int* recvdata, int recvcount,
 8 |         MPI_Datatype recvdatatype, MPI_Comm comm)
 9 | {
10 |     int rank, size, i;
11 |     MPI_Status status;
12 |     MPI_Comm_rank(comm, &rank);
13 |     MPI_Comm_size(comm, &size);
14 | 	
15 |     for (i = 0; i < size; i++)
16 |     {
17 |         if (i != rank)
18 |         {
19 |             MPI_Send(senddata, sendcount, senddatatype, i, rank , comm);
20 |             MPI_Recv(recvdata + i * recvcount, recvcount, recvdatatype, i, i, comm, &status);
21 |         }
22 |         else
23 |         {
24 |         	//memcpy(recvdata + i * recvcount, senddata, sizeof(senddatatype)*sendcount);
25 |         	recvdata[i] = *senddata;
26 |         	}
27 |     }
28 | }
29 | int main(int argc, char* argv[])
30 | {
31 |     int i, rank, size, tag = 1;
32 |     int senddata, recvdata[8];
33 |     double start_time, end_time, s_t, e_t;
34 |     int count = 1;
35 |     MPI_Init(&argc, &argv);
36 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
37 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
38 |     senddata = rank + 1;
39 |     start_time = MPI_Wtime();
40 |     //自己编写的MPI_Allgather的性能测试 
41 |     MPI_Allgather_my(&senddata, count, MPI_INT, recvdata, count, MPI_INT, MPI_COMM_WORLD);
42 |     end_time = MPI_Wtime();
43 |     MPI_Reduce(&start_time, &s_t, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
44 |     MPI_Reduce(&end_time, &e_t, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
45 | 
46 | 	for (i = 0; i < size; i++)
47 | 		printf("My rank =  %d  After mygather recv = %d\n", rank, recvdata[i]);
48 | 
49 |     if (rank == 0)
50 |     {
51 |         printf("myallgather : count = %d total time = %f\n", count, e_t - s_t);
52 |     }
53 |     MPI_Barrier(MPI_COMM_WORLD);
54 | 	//MPI原有的MPI_Allgather的功能测试
55 |     start_time = MPI_Wtime();
56 |     MPI_Allgather(&senddata, count, MPI_INT, recvdata, count, MPI_INT, MPI_COMM_WORLD);
57 |     end_time = MPI_Wtime();
58 |     MPI_Reduce(&start_time, &s_t, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
59 |     MPI_Reduce(&end_time, &e_t, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
60 | 	for (i = 0; i < size; i++)
61 | 		printf("My rank =  %d  recv = %d\n", rank, recvdata[i]);
62 | 
63 |     if (rank == 0)
64 |     {
65 |         printf("allgather : count = %d total time = %f\n", count, e_t - s_t);
66 |     }
67 |     MPI_Finalize();
68 |     return 0;
69 | }


--------------------------------------------------------------------------------
/lab1/ex4Alltoall.c:
--------------------------------------------------------------------------------
 1 | #include<stdio.h>
 2 | #include<stdlib.h>
 3 | #include"mpi.h"
 4 | #include<memory.h>
 5 | 
 6 | //采用MPI_Send和MPI_Recv编写代码实现的MPI_Allgather
 7 | void MPI_Alltoall_my(int* senddata, int sendcount, MPI_Datatype senddatatype, int* recvdata, int recvcount,
 8 |         MPI_Datatype recvdatatype, MPI_Comm comm)
 9 | {
10 |     int rank, size;
11 |     MPI_Status status;
12 |     MPI_Comm_rank(comm, &rank);
13 |     MPI_Comm_size(comm, &size);
14 |     for (int i = 0; i < size; i++)
15 |     {
16 |         if (i != rank)
17 |         {
18 |             MPI_Send(senddata + i * sendcount, sendcount, senddatatype, i, rank , comm);
19 |             MPI_Recv(recvdata + i * recvcount, recvcount, recvdatatype, i, i, comm, &status);
20 |         }
21 |         else
22 |         {
23 |         	//memcpy(recvdata + i * recvcount, senddata, sizeof(senddatatype)*sendcount);
24 |         	recvdata[i] = senddata[i];
25 |         }
26 |     }
27 | }
28 | int main(int argc, char* argv[])
29 | {
30 |     int i, rank, size, tag = 1;
31 |     int senddata[8], recvdata[8];
32 | 
33 |     
34 |     double start_time, end_time, s_t, e_t;
35 |     int count = 1;
36 |     MPI_Init(&argc, &argv);
37 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
38 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
39 |     for (int j = 0; j < size; j++)
40 |     {
41 |         senddata[j] = j+1;
42 |     }
43 |     start_time = MPI_Wtime();
44 |     //自己编写的MPI_Allgather的性能测试 
45 |     MPI_Alltoall_my(senddata, count, MPI_INT, recvdata, count, MPI_INT, MPI_COMM_WORLD);
46 |     end_time = MPI_Wtime();
47 |     MPI_Reduce(&start_time, &s_t, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
48 |     MPI_Reduce(&end_time, &e_t, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
49 | 
50 | 	for (i = 0; i < size; i++)
51 | 		printf("My rank =  %d  After myalltoall recv = %d\n", rank, recvdata[i]);
52 | 
53 |     if (rank == 0)
54 |     {
55 |         printf("myalltoall : count = %d total time = %f\n", count, e_t - s_t);
56 |     }
57 |     MPI_Barrier(MPI_COMM_WORLD);
58 | 	//MPI原有的MPI_Allgather的功能测试
59 |     start_time = MPI_Wtime();
60 |     MPI_Alltoall(senddata, count, MPI_INT, recvdata, count, MPI_INT, MPI_COMM_WORLD);
61 |     end_time = MPI_Wtime();
62 |     MPI_Reduce(&start_time, &s_t, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
63 |     MPI_Reduce(&end_time, &e_t, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
64 | 	for (i = 0; i < size; i++)
65 | 		printf("My rank =  %d  recv = %d\n", rank, recvdata[i]);
66 | 
67 |     if (rank == 0)
68 |     {
69 |         printf("alltoall : count = %d total time = %f\n", count, e_t - s_t);
70 |     }
71 |     MPI_Finalize();
72 |     return 0;
73 | }


--------------------------------------------------------------------------------
/lab1/hw/ex2_1-3.c:
--------------------------------------------------------------------------------
 1 | #include "omp.h"
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <math.h>
 5 | int n = 50;
 6 | int main(){
 7 |     int a[n], b[n], c[n];
 8 |     for (int i = 0; i < n; i++)
 9 |     {
10 |         a[i] = (int)(rand()%n)+1;
11 |         b[i] = (int)(rand()%n)+1;
12 |         c[i] = (int)(rand()%n)+1;
13 |     }
14 |     
15 | #pragma omp simd
16 |     for (int i = 0; i < n; i++)
17 |     {
18 |         a[i] = b[i] + c[i+1];
19 |         c[i] = a[i] + b[i];
20 |     }
21 |     printf("a:\n");    
22 |     for (int i = 0; i < n; i++)
23 |     {
24 |         printf("%d\t", a[i]);
25 |     }
26 |     printf("\n");
27 |     printf("b:\n");    
28 |     for (int i = 0; i < n; i++)
29 |     {
30 |         printf("%d\t", b[i]);
31 |     }
32 |     printf("\n");
33 |     printf("c:\n");    
34 |     for (int i = 0; i < n; i++)
35 |     {
36 |         printf("%d\t", c[i]);
37 |     }
38 |     printf("\n");
39 |     return 0;
40 | }


--------------------------------------------------------------------------------
/lab1/hw/ex2_3-2.c:
--------------------------------------------------------------------------------
 1 | #include "omp.h"
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <math.h>
 5 | 
 6 | int N = 50;
 7 | int main(){
 8 |     int count = 100;
 9 |     int x[100], y[200], b[100];
10 |     int a[100][100], c[100][100];
11 |     for (int i = 0; i < 100; i++)
12 |     {
13 |         x[i] = i+1;
14 |         b[i] = i+2;
15 |     }
16 |     for (int i = 0; i < 200; i++)
17 |     {
18 |         y[i] = i+1;
19 |     }
20 |     for (int i = 0; i < 100; i++)
21 |     {
22 |         for (int j = 0; j < 100; j++)
23 |         {
24 |             a[i][j] = i+j+2;
25 |             c[i][j] = i+j+3;
26 |         }
27 |     }
28 |     for (int i = 0; i < count; i++)
29 |     {
30 |         for (int j = 0; j < count-1; j++)
31 |         {
32 |             b[j] = a[j][N];
33 |             #pragma omp parallel for
34 |             for (int k = 0; k < count; i++)
35 |             {
36 |                 a[j+1][k] = b[j] + c[j][k];
37 |             }
38 |         }
39 |     }
40 |     for (int i = 0; i < count; i++)
41 |     {
42 |         /* code */
43 |         #pragma omp parallel for
44 |         for (int j = 0; j < count-1; i++)
45 |         {
46 |             y[i+j] = a[j+1][N];
47 |         }
48 |     }
49 |     #pragma omp parallel for
50 |     for (int i = 0; i < count; i++)
51 |     {
52 |         /* code */
53 |         x[i] = y[i] + 10;
54 |     }
55 |     return 0;
56 | }


--------------------------------------------------------------------------------
/lab1/hw/ex2_3-5.c:
--------------------------------------------------------------------------------
 1 | #include "omp.h"
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | #include <math.h>
 5 | 
 6 | void loop1(){
 7 |     int A[101], B[101], C[101], D[101];
 8 |     for (int i = 0; i < 101; i++)
 9 |     {
10 |         A[i] = i;
11 |         B[i] = i+1;
12 |         C[i] = i+2;
13 |         D[i] = i+3;
14 |     }
15 |     for (int i = 1; i <= 100; i++)
16 |     {
17 |         B[i] = C[i-1]*2;
18 |         C[i] = 1.0/B[i];
19 |     }
20 | #pragma omp parallel for
21 |     for (int i = 1; i <= 100; i++)
22 |     {
23 |         /* code */
24 |         A[i] = A[i] + B[i-1];
25 |         D[i] = C[i] * C[i];
26 |     }
27 | }
28 | 
29 | void loop2(){
30 |     int count = 1000;
31 |     int A[1000], B[1000], C[1000], D[1000];
32 |     for (int i = 0; i < count; i++)
33 |     {
34 |         A[i] = i+1;
35 |         B[i] = i+2;
36 |         C[i] = i+3;
37 |         D[i] = i+4;
38 |     }
39 | #pragma omp parallel for
40 |     for (int i = 1; i <= 500; i++)
41 |     {
42 |         A[i] = B[i] + C[i];
43 |         D[i] = (A[i] + A[1000-i])/2.0;
44 |     }
45 | #pragma omp parallel for
46 |     for (int i = 501; i < count; i++)
47 |     {
48 |         /* code */
49 |         A[i] = B[i] + C[i];
50 |         D[i] = (A[i]+A[1000-i])/2.0;
51 |     }
52 | }
53 | 
54 | void loop3(){
55 |     int count = 100;
56 |     int A[500][200];
57 |     int C[100][100];
58 |     int D[100][100];
59 |     for (int i = 0; i < count; i++)
60 |     {
61 |         for (int j = 0; j < count; j++)
62 |         {
63 |             C[i][j] = i+j+1;
64 |             D[i][j] = i+j+3;
65 |         }
66 |     }
67 |     for (int i = 0; i < 500; i++)
68 |     {
69 |         for (int j = 0; j < 200; j++)
70 |         {
71 |             A[i][j] = i+j+2;
72 |         }
73 |     }
74 |     
75 | #pragma omp parallel for
76 |     for (int i = 0; i < count; i++)
77 |     {
78 |         for (int j = 0; i < count; j++)
79 |         {
80 |             A[3*i+2*j][2*j] = C[i][j]*2;
81 |             D[i][j] = A[(i-j+6)>0 ? (i-j+6):-(i-j+6)][i+j];
82 |         }   
83 |     }
84 | }


--------------------------------------------------------------------------------
/lab2/ex1_LU.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include "mpi.h"
  4 | #include "omp.h"
  5 | #define a(x,y) a[x*M +y]
  6 | #define A(x,y) A[x*M+y]
  7 | #define l(x,y) l[x*M+y]
  8 | #define u(x,y) u[x*M+y]
  9 | #define floatsize sizeof(float)
 10 | #define intsize sizeof(int)
 11 | 
 12 | int M;
 13 | int m;
 14 | float * A;
 15 | int my_rank;
 16 | int p;
 17 | MPI_Status status;
 18 | 
 19 | void fatal(char * message)
 20 | {
 21 |     printf("%s\n", message);
 22 |     exit(1);
 23 | }
 24 | 
 25 | void Env_Fin(float *a, float *f)
 26 | {
 27 |     free(a);
 28 |     free(f);
 29 | }
 30 | 
 31 | int main(int argc, char * argv[])
 32 | {
 33 |     int i,j,k, my_rank, group_size;
 34 |     int i1, i2;
 35 |     int v,w;
 36 |     float *a, *f, *l, *u;
 37 |     printf("Input matrix row length:\n");
 38 |     scanf("%d", &M);
 39 |     doubel time;
 40 |     MPI_Init(&argc, &argv);
 41 |     MPI_Comm_size(MPI_COMM_WORLD, &group_size);
 42 |     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
 43 | 
 44 |     p = group_size;
 45 |     if (my_rank == 0)
 46 |     {
 47 |         A = (float *)malloc(floatsize * M * M);
 48 |         for (int i = 0; i < M; i++)
 49 |         {
 50 |             for (int j = 0; j < M; j++)
 51 |             {
 52 |                 A(i,j) = (float)(rand() % 25 + 1);
 53 |             }
 54 |         }
 55 |         time = MPI_Wtime();
 56 |         
 57 |     }
 58 |     MPI_Bcast(&M,1,MPI_INT,0,MPI_COMM_WORLD);
 59 |     m=M/p;
 60 |     if (M%p!=0) m++;
 61 |     a=(float*)malloc(floatsize*m*M);
 62 | 
 63 |     f=(float*)malloc(floatsize*M);
 64 | 
 65 |     if (my_rank==0)
 66 |     {
 67 |         l=(float*)malloc(floatsize*M*M);
 68 |         u=(float*)malloc(floatsize*M*M);
 69 |     }
 70 |  
 71 |     if (a==NULL) fatal("allocate error\n");
 72 |     if (my_rank==0)
 73 |     {
 74 |         for(i=0;i<m;i++)
 75 |             for(j=0;j<M;j++)
 76 |                 a(i,j)=A((i*p),j);
 77 |         for(i=0;i<M;i++)
 78 |             if ((i%p)!=0)
 79 |         {
 80 |             i1=i%p;
 81 |             i2=i/p+1;
 82 |             MPI_Send(&A(i,0),M,MPI_FLOAT,i1,i2,MPI_COMM_WORLD);
 83 |         }
 84 |     }
 85 |     else
 86 |     {
 87 |         for ( i = 0; i < m; i++)
 88 |         {
 89 |             MPI_Recv(&a(i,0),M,MPI_FLOAT, 0,i+1, MPI_COMM_WORLD, &status);
 90 |         }
 91 |         
 92 |     }
 93 |     for(i=0;i<m;i++)
 94 |         for(j=0;j<p;j++)
 95 |     {
 96 |         /*j号进程负责广播主行元素*/
 97 |         if (my_rank==j)
 98 |         {
 99 |             v=i*p+j;
100 |             for (k=v;k<M;k++)
101 |                 f[k]=a(i,k);
102 |  
103 |             MPI_Bcast(f,M,MPI_FLOAT,my_rank,MPI_COMM_WORLD);
104 |         }
105 |         else
106 |         {
107 |             v=i*p+j;
108 |             MPI_Bcast(f,M,MPI_FLOAT,j,MPI_COMM_WORLD);
109 |         }
110 |  
111 |         /*编号小于my_rank的进程（包括my_rank本身）利用主行对其第i+1,…,m-1行数据做行变换*/
112 |         /*********MPI 并行优化 ********/
113 |         if (my_rank<=j)
114 |         {
115 |            #pragma  omp parallel shared(a,f,v,m) private(k,w)
116 |            {
117 |                 #pragma omp for
118 |  
119 |                 for(k=i+1;k<m;k++)
120 |                 {
121 |  
122 |                     a(k,v)=a(k,v)/f[v];
123 |  
124 |                 }
125 |  
126 |  
127 |                 for(k=i+1;k<m;k++)
128 |                 {
129 |                     #pragma omp for
130 |                     for(w=v+1;w<M;w++)
131 |                         a(k,w)=a(k,w)-f[w]*a(k,v);
132 |                 }
133 |             }
134 |         }
135 |  
136 |         /*编号大于my_rank的进程利用主行对其第i,…,m-1行数据做行变换*/
137 |         if (my_rank>j)
138 |         {
139 |  
140 |             #pragma  omp parallel shared(a,f,v,m) private(k,w)
141 |             {
142 |                 #pragma omp for
143 |                 for(k=i;k<m;k++)
144 |                 {
145 |                     a(k,v)=a(k,v)/f[v];
146 |  
147 |                 }
148 |  
149 |                 for(k=i;k<m;k++)
150 |                 {
151 |                     #pragma omp for
152 |                     for(w=v+1;w<M;w++)
153 |                         a(k,w)=a(k,w)-f[w]*a(k,v);
154 |                 }
155 |             }
156 |         }
157 |     }
158 |  
159 |     /*0号进程从其余各进程中接收子矩阵a，得到经过变换的矩阵A*/
160 |     if (my_rank==0)
161 |     {
162 |         for(i=0;i<m;i++)
163 |             for(j=0;j<M;j++)
164 |                 A(i*p,j)=a(i,j);
165 |     }
166 |     if (my_rank!=0)
167 |     {
168 |         for(i=0;i<m;i++)
169 |             MPI_Send(&a(i,0),M,MPI_FLOAT,0,i,MPI_COMM_WORLD);
170 |     }
171 |     else
172 |     {
173 |         for(i=1;i<p;i++)
174 |             for(j=0;j<m;j++)
175 |         {
176 |             MPI_Recv(&a(j,0),M,MPI_FLOAT,i,j,MPI_COMM_WORLD,&status);
177 |             for(k=0;k<M;k++)
178 |                 A((j*p+i),k)=a(j,k);
179 |         }
180 |     }
181 |     if (my_rank==0)
182 |     {
183 |             for(i=0;i<M;i++)
184 |             {
185 |                 for(j=0;j<M;j++)
186 |                 {
187 |                     if(i>j)
188 |                     {
189 |                         l(i,j)=A(i,j);
190 |                         u(i,j)=0.0;
191 |                     }
192 |                     else if(i<j)
193 |                     {
194 |                         u(i,j)=A(i,j);
195 |                         l(i,j)=0.0;
196 |                     }
197 |                     else
198 |                     {
199 |                         u(i,j)=0.0;
200 |                         l(i,j)=1.0;
201 |                     }
202 |                 }
203 |             }
204 |         for(i=0;i<M;i++)
205 |         {
206 |             for(j=0;j<M;j++)
207 |             {
208 |                 if (i>j)
209 |                     l(i,j)=A(i,j);
210 |                 else
211 |                    u(i,j)=A(i,j);
212 |             }
213 |         }
214 |         time = MPI_Wtime() - time;
215 |         printf("Input matrix:\n");
216 |         printf("%d\t %d\n",M, N);
217 |         for(i=0;i<M;i++)
218 |         {
219 |             for(j=0;j<N;j++)
220 |                 printf("%f\t",A(i,j));
221 |             printf("\n");
222 |         }
223 |         printf("\nOutput of LU operation\n");
224 |         printf("Matrix L:\n");
225 |         for(i=0;i<M;i++)
226 |         {
227 |             for(j=0;j<M;j++)
228 |                 printf("%f\t",l(i,j));
229 |             printf("\n");
230 |         }
231 |         printf("Matrix U:\n");
232 |         for(i=0;i<M;i++)
233 |         {
234 |             for(j=0;j<M;j++)
235 |                 printf("%f\t",u(i,j));
236 |             printf("\n");
237 |         }
238 |         printf("total time: %f\n", time);
239 |     }
240 |     MPI_Finalize();
241 |     Env_Fin(a,f);
242 |     return(0);
243 | }


--------------------------------------------------------------------------------
/lab2/ex2_QR.c:
--------------------------------------------------------------------------------
  1 | #include "omp.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <math.h>
  5 | #include <string.h>
  6 | 
  7 | int N;
  8 | #define A(x, y) A[x * N + y]
  9 | #define Q(x, y) Q[x * N + y]
 10 | #define R(x, y) R[x * N + y]
 11 | #define tmp(x, y) tmp[x * N + y]
 12 | 
 13 | void Env_Fin(float *a, float *q, float *r, float *tmp, \
 14 |                 float *aj, float *ai, float *qi, float *qj)
 15 | {
 16 |     free(a);
 17 |     free(q);
 18 |     free(r);
 19 |     free(tmp);
 20 |     free(qi);
 21 |     free(qj);
 22 |     free(ai);
 23 |     free(aj);
 24 | }
 25 | 
 26 | int main(){
 27 |     printf("input matrix rows :");
 28 |     scanf("%d", &N);
 29 |     float *A, *Q, *R, *tmp;
 30 |     A =(float *)malloc(N * N * sizeof(float));
 31 |     Q =(float *)malloc(N * N * sizeof(float));
 32 |     R =(float *)malloc(N * N * sizeof(float));
 33 |     tmp = (float *)malloc(N * N * sizeof(float));
 34 |     for (int i = 0; i < N; i++)
 35 |     {
 36 |         for (int j = 0; j < N; j++)
 37 |         {
 38 |             A(i,j) = (float)(rand() % 10) + 1;
 39 |             if(i == j)
 40 |                 Q(i,j) = 1;
 41 |             R(i,j) = 0;
 42 |         }
 43 |     }
 44 |     printf("matrix A:\n");
 45 |     for(int i=0;i<N;i++)
 46 |     {
 47 |         for(int j=0;j<N;j++)
 48 |             printf("%f\t",A(i,j));
 49 |         printf("\n");
 50 |     }
 51 |     float *Ai, *Aj, *Qi, *Qj;
 52 |     Aj =(float *)malloc(N * sizeof(float));
 53 |     Ai =(float *)malloc(N * sizeof(float));
 54 |     Qi =(float *)malloc(N * sizeof(float));
 55 |     Qj =(float *)malloc(N * sizeof(float));
 56 |     for (int i = 0; i < N; i++)
 57 |     {
 58 |         Ai[i] = 0;
 59 |         Aj[i] = 0;
 60 |         Qi[i] = 0;
 61 |         Qj[i] = 0;
 62 |     }
 63 |     float c, s, sq;
 64 |     double start = omp_get_wtime();
 65 |     for(int j = 0; j < N; j++){
 66 |         for (int i = j+1; i < N; i++)
 67 |         {
 68 |             sq = sqrt(A(j, j) * A(j, j) + A(i, j) * A(i, j));
 69 |             c = A(j, j)/sq;
 70 |             s = A(i, j)/sq;
 71 |             int k ;
 72 | #pragma omp parallel for private(k)
 73 |             for (k = 0; k < N; k++)
 74 |             {
 75 |                 Aj[k] = c * A(j, k) + s * A(i, k);
 76 |                 Qj[k] = c * Q(j, k) + s * Q(i, k); 
 77 |                 Ai[k] = - s * A(j, k) + c * A(i, k);
 78 |                 Qi[k] = - s * Q(j, k) + c * Q(i, k);
 79 |             }
 80 | #pragma omp barrier
 81 | #pragma omp parallel for private(k)
 82 |             for (k = 0; k < N; k++)
 83 |             {
 84 |                 A(j, k) = Aj[k];
 85 |                 A(i, k) = Ai[k];
 86 |                 Q(i, k) = Qi[k];
 87 |                 Q(j, k) = Qj[k];
 88 |             }
 89 | #pragma omp barrier            
 90 |         }
 91 |     }
 92 |     for (int i = 0; i < N; i++)
 93 |     {
 94 |         for (int j = 0; j < N; j++)
 95 |         {
 96 |             R(i, j) = A(i, j);
 97 |             tmp(j, i) = Q(i, j);
 98 |         }
 99 |     }
100 |     double end = omp_get_wtime();
101 |     double time = end - start;
102 |     printf("matrix Q:\n");
103 |     for(int i=0;i<N;i++)
104 |     {
105 |         for(int j=0;j<N;j++)
106 |             printf("%f\t",tmp(i,j));
107 |         printf("\n");
108 |     }
109 |     printf("matrix R:\n");
110 |     for(int i=0;i<N;i++)
111 |     {
112 |         for(int j=0;j<N;j++)
113 |             printf("%f\t",R(i,j));
114 |         printf("\n");
115 |     }   
116 |     printf("total time: %f\n", time);
117 |     Env_Fin(A, Q, R, tmp, Aj, Ai, Qi, Qj);     
118 |     return 0;
119 | }
120 | 


--------------------------------------------------------------------------------
/lab2/ex3_summa.c:
--------------------------------------------------------------------------------
  1 | #include <mpi.h>
  2 | #include <stdio.h>
  3 | #include <math.h>
  4 | #include <stdlib.h>
  5 | #include <time.h>
  6 | #include <sys/time.h>
  7 |  
  8 | void PrintMatrixForVector(int * matrix,int high,int len)
  9 | {
 10 |     int i;
 11 |     for(i=0;i<high*len;i++)
 12 |     {
 13 |         printf("%6d  ",matrix[i]);	
 14 |         if(i%len==len-1&&i!=0)
 15 | 			printf("\n");
 16 |     }
 17 | }
 18 | 
 19 | void MatrixMultiply(int * A,int *B,int *C,unsigned m,unsigned n,unsigned p)
 20 | {   int i,j,k;
 21 |     for(i=0;i<m;i++)
 22 |        for(j=0;j<p;j++)
 23 |            
 24 | 	   {
 25 | 		   int result=0; 
 26 | 		   for(k=0;k<n;k++)
 27 |             {
 28 |                result=A[i*n+k]*B[k*p+j]+result;
 29 |             }
 30 |            C[i*p+j]=result;
 31 | 	   }
 32 | }
 33 |  
 34 | 
 35 | void MatrixAdd(int * A,int *B,unsigned m,unsigned n) //the result remain in A
 36 | {  int i,j;
 37 |    for(i=0;i<m;i++)
 38 |       for(j=0;j<n;j++)
 39 |         {
 40 |             A[i*n+j]=A[i*n+j]+B[i*n+j];
 41 |         }
 42 | }
 43 |  
 44 |  
 45 | void PrintMatrix(int ** matrix,int high,int len)
 46 | {
 47 |     int i,j;
 48 |     for(i=0;i<high;i++)
 49 |     {
 50 |         for(j=0;j<len;j++)
 51 |         {
 52 |             printf("%6d  ",matrix[i][j]);
 53 |         }
 54 |         printf("\n");
 55 |     }
 56 | }
 57 |  
 58 |  
 59 |  
 60 | /****随机数据以待计算****/
 61 | void RandomMatrix(int *matrix,int len)
 62 | {
 63 |    struct timeval tpstart;  //使数据更均匀
 64 |    gettimeofday(&tpstart,NULL);
 65 |    srand(tpstart.tv_usec);
 66 |    int i=0;
 67 |    for(i=0;i<len;i++)
 68 |    matrix[i]=rand()%8;
 69 | }
 70 | int main(int argc,char **argv)
 71 | {
 72 | 	int rank;
 73 |     MPI_Status status;
 74 |     MPI_Init(&argc,&argv);
 75 | 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 76 |  
 77 | 	int nodeNum;  //节点数，必须为可开平方
 78 | 	
 79 | 	int matrixHighA;// 矩阵A行数
 80 | 	int matrixLenA;//矩阵A列数
 81 | 	
 82 | 	
 83 | 	int matrixHighB;
 84 |     int matrixLenB;
 85 | 	/****   参数检查  参数错误用默认参数*****/
 86 | 	if(argc!=5&&rank==0)
 87 | 	{
 88 |  
 89 | 		 printf("The para is wrong!using default para\n");
 90 | 		 nodeNum=4;
 91 | 		 
 92 | 		 matrixHighA=6;
 93 | 		 matrixLenA=8;
 94 | 		 
 95 | 		 
 96 | 		 matrixHighB=8;
 97 | 		 matrixLenB=10;
 98 | 	}
 99 | 	else
100 | 	{
101 |  
102 | 		nodeNum=atoi(argv[1]);
103 | 		
104 | 		matrixHighA=atoi(argv[2]);
105 | 		matrixLenA=atoi(argv[3]);
106 | 		
107 | 		
108 | 		matrixHighB=atoi(argv[3]);
109 |         matrixLenB=atoi(argv[4]);
110 |  
111 | 	}
112 |     int p=sqrt(nodeNum);
113 |  
114 | /*************计算local A   B行列数*************************/
115 |     int localHighA=matrixHighA/p;
116 |     int localLenA=matrixLenA/p;
117 |     
118 |  
119 |     int localHighB=matrixHighB/p;
120 |     int localLenB=matrixLenB/p;
121 |     
122 |     
123 |     int i;
124 |     int j;
125 |     int k;
126 |     int l;
127 |  
128 |  
129 |     int * A=(int *)malloc(localLenA*localHighA*sizeof(int));
130 |     RandomMatrix(A,localHighA*localLenA);
131 |  
132 |  
133 |  
134 |     int * B=(int *)malloc(localLenB*localHighB*sizeof(int));
135 |     RandomMatrix(B,localHighB*localLenB);
136 |  
137 |     int * C=(int *)malloc(localHighA*localLenB*sizeof(int));
138 |  
139 |     for(i=0;i<localHighA*localLenB;i++)C[i]=0;
140 | 	int myRow=rank/p;//计算当前节点在mesh中的行列值
141 | 	int myCol=rank%p;
142 |  
143 |  
144 |     //将数据发送到0号进程收集显示
145 |     MPI_Send(A,localHighA*localLenA,MPI_INT,0,rank+100,MPI_COMM_WORLD);
146 |     MPI_Send(B,localHighB*localLenB,MPI_INT,0,rank+200,MPI_COMM_WORLD);
147 |  
148 |  
149 | 	if(rank==0)
150 | 	{
151 | 		int **matrixA=(int **)malloc(matrixHighA*sizeof(int *));
152 | 		for (i=0;i<matrixHighA;i++)
153 | 			matrixA[i]=(int *)malloc(matrixLenA*sizeof(int));
154 |  
155 | 		int **matrixB=(int **)malloc(matrixHighB*sizeof(int *));
156 | 		for (i=0;i<matrixHighB;i++)
157 | 			matrixB[i]=(int *)malloc(matrixLenB*sizeof(int));
158 |         for(i=0;i<nodeNum;i++)
159 |         {
160 |             int *receiveATemp=(int *)malloc(localLenA*localHighA*sizeof(int));
161 |             int *receiveBTemp=(int *)malloc(localLenB*localHighB*sizeof(int));
162 |  
163 |             MPI_Recv(receiveATemp,localHighA*localLenA,MPI_INT,i,i+100,MPI_COMM_WORLD,&status);
164 |             MPI_Recv(receiveBTemp,localHighB*localLenB,MPI_INT,i,i+200,MPI_COMM_WORLD,&status);
165 |             l=0;
166 |             for(j=0;j<localHighA;j++)
167 |                 for(k=0;k<localLenA;k++)
168 |                 {
169 |                     matrixA[j+(int)(i/p)*localHighA][k+(int)(i%p)*localLenA]=receiveATemp[l++];
170 |                 }
171 |             l=0;
172 |             for(j=0;j<localHighB;j++)
173 |                 for(k=0;k<localLenB;k++)
174 |                 {
175 |                     matrixB[j+(int)(i/p)*localHighB][k+(int)(i%p)*localLenB]=receiveBTemp[l++];
176 |                 }
177 |             free(receiveATemp);
178 |             free(receiveBTemp);  
179 |         }  
180 |         printf("A:\n");
181 |         PrintMatrix(matrixA,matrixHighA,matrixLenA);
182 |         printf("B:\n");
183 |         PrintMatrix(matrixB,matrixHighB,matrixLenB);
184 |         for (i=0;i<matrixHighA;i++)
185 |             free(matrixA[i]);
186 |         for (i=0;i<matrixHighB;i++)
187 |             free(matrixB[i]);
188 |         free(matrixA);
189 |         free(matrixB);
190 | 	}
191 |  
192 |  
193 |     for(i=0;i<p;i++)//每个节点向同行同列发送局部数据 A B
194 | 	{
195 | 	//	if(myCol!=i)
196 | 		{
197 | 			MPI_Send(A,localHighA*localLenA,MPI_INT,myRow*p+i,1,MPI_COMM_WORLD);
198 | 			MPI_Send(B,localHighB*localLenB,MPI_INT,myRow*p+i,2,MPI_COMM_WORLD);
199 | 		}
200 | 	//	if(myRow!=i)
201 | 		{
202 | 		   MPI_Send(A,localHighA*localLenA,MPI_INT,i*p+myCol,1,MPI_COMM_WORLD);
203 | 		   MPI_Send(B,localHighB*localLenB,MPI_INT,i*p+myCol,2,MPI_COMM_WORLD);		
204 | 		}	
205 | 	}
206 |  
207 |  
208 |  
209 | 	int *receiveA=(int *)malloc(localLenA*localHighA*sizeof(int));
210 | 	int *receiveB=(int *)malloc(localLenB*localHighB*sizeof(int));
211 |     int *resultC= (int *)malloc(localHighA*localLenB*sizeof(int));
212 | 	for(i=0;i<localHighA*localLenB;i++)resultC[i]=0;
213 |  
214 | /*********************计算矩阵的值*****************************/
215 | 	for(i=0;i<p;i++)
216 | 	{
217 | 		MPI_Recv(receiveA,localHighA*localLenA,MPI_INT,myRow*p+i,1,MPI_COMM_WORLD,&status);
218 | 		MPI_Recv(receiveB,localHighB*localLenB,MPI_INT,i*p+myCol,2,MPI_COMM_WORLD,&status);
219 |  
220 | 	
221 |         MatrixMultiply(receiveA,receiveB,resultC,localHighA,localLenA,localLenB);
222 | 		MatrixAdd(C,resultC,localHighA,localLenB);	
223 | 	}
224 |  
225 |     MPI_Send(C,localHighA*localLenB,MPI_INT,0,rank+400,MPI_COMM_WORLD);//将局部结果C发送至0号收集
226 |     if(rank==0)//收集数据并且在后面显示
227 |     { 
228 |         int **matrixC=(int **)malloc(matrixHighA*sizeof(int *));
229 |         for (i=0;i<matrixHighA;i++)
230 |             matrixC[i]=(int *)malloc(matrixLenB*sizeof(int));
231 | 
232 |         for(i=0;i<nodeNum;i++)
233 |         {
234 |            int *receiveCTemp=(int *)malloc(localLenB*localHighA*sizeof(int));
235 |            MPI_Recv(receiveCTemp,localHighA*localLenB,MPI_INT,i,i+400,MPI_COMM_WORLD,&status);
236 |            l=0;
237 |            for(j=0;j<localHighA;j++)
238 |                for(k=0;k<localLenB;k++)
239 |                {
240 |                    matrixC[j+(int)(i/p)*localHighA][k+(int)(i%p)*localLenB]=receiveCTemp[l++];
241 |                }
242 |            free(receiveCTemp);
243 |         }
244 |         printf("C:\n");
245 |         PrintMatrix(matrixC,matrixHighA,matrixLenB); 
246 |     } 
247 |     MPI_Finalize();
248 |     return 0;
249 | }
250 |  


--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
 1 | 1 课程主页lec2_PP.ppt的P35-36两种重复计算方式的OpenMP和MPI实现。（任务数可约定为2的幂次方） 
 2 | 2 前期练习作业题目中的相关程序实现（ex-21-1/2/3 中有要求向量化/并行化的程序实习）。 
 3 | 3 新的广播MyBcastMPI实现。基本思路：（1）将MPI进程按所在节点划分子通讯域N；（2）可以将各子通讯域的首进程（编号为0）再组成一个子通讯域H；（3）由广播的root进程将消息发给原来最大通讯域中的0号进程h，再由h在H通讯域中广播（MPI_Bcast），各首进程然后在各自子通讯域N中再行广播（MPI_Bcast）。 
 4 | 4 用MPI_Send和MPI_Recv来模拟实现诸如MPI_Alltoall, MPI_Allgather功能并与标准MPI实现做简要性能对比。
 5 | 
 6 | 
 7 | （a） 在教材中 18.5 节 LU 分解的并行 MPI 实现基础上，给出 MPI+OpenMP 混 合实现；
 8 | （b） 针对教材中 18.6 节 QR 分解，给出纯 OpenMP 实现；
 9 | （c） SUMMA 并行矩阵乘法的 MPI 实现。参看文件 summa_2010.pdf 和原始文 章 lawn96.pdf。 
10 | （d） 结合自己研究方向中一个小问题的并行实现（MPI 或 OpenMP 或混合，三选一）


--------------------------------------------------------------------------------
/报告/图片1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/图片1.png


--------------------------------------------------------------------------------
/报告/图片10.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/图片10.PNG


--------------------------------------------------------------------------------
/报告/图片11.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/图片11.PNG


--------------------------------------------------------------------------------
/报告/图片2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/图片2.png


--------------------------------------------------------------------------------
/报告/图片3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/图片3.png


--------------------------------------------------------------------------------
/报告/图片4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/图片4.png


--------------------------------------------------------------------------------
/报告/图片5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/图片5.png


--------------------------------------------------------------------------------
/报告/图片6.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/图片6.PNG


--------------------------------------------------------------------------------
/报告/图片7.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/图片7.PNG


--------------------------------------------------------------------------------
/报告/图片8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/图片8.png


--------------------------------------------------------------------------------
/报告/图片9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/图片9.png


--------------------------------------------------------------------------------
/报告/实验报告.md:
--------------------------------------------------------------------------------
  1 | # <center> 并行程序设计实验报告  
  2 | + lab1  
  3 |     1.  分别用omp和mpi实现树型求和和蝶式求和  
  4 |         树型omp实现思路：对每一层的计算并行化，对叶节点编号，序号值为   线程号。
  5 |         考虑兄弟结点相加时把结果存在序号较小的结点中。
  6 |         将二叉树每一次计算步骤编号为1-n步，因为任务数为2的幂次方，因    此n= log_2⁡N
  7 |         要并行化的计算是每一步的节点与兄弟节点的加法。每一步的节点序    号与兄弟节点序号的差值为2^(n−i)，i为步数。将加法交予不同的线    程并行实现。
  8 |         分发结果就是求和的逆向实现，区别在于是自上而下的。  
  9 |         ```c
 10 |         //主要代码：
 11 |             for (int i = logN; i > 0; i--)
 12 |             {
 13 |                 step = (int)pow(2, logN-i+1);
 14 |             #pragma omp parallel private(pid)
 15 |                 {
 16 |                     pid = omp_get_thread_num();
 17 |                     if (!(pid % step))
 18 |                     {
 19 |                         num[pid] = num[pid]+num[pid+step/2];
 20 |                     }
 21 |             #pragma omp barrier
 22 |                 }
 23 |             }
 24 |             for (int i = 1; i <= logN; i++)
 25 |             {
 26 |                 step = (int)pow(2, logN-i+1);
 27 |             #pragma omp parallel private(pid)
 28 |                 {
 29 |                     pid = omp_get_thread_num();
 30 |                     if (!(pid % step))
 31 |                     {
 32 |                         num[pid+step/2] = num[pid];
 33 |                     }
 34 |             #pragma omp barrier
 35 |                 }
 36 |             }
 37 |         ```  
 38 |         实验结果：  
 39 |         ![imag](./图片1.png)  
 40 |         树型mpi实现思路：在计算全和时，在第i步，进程id整除2^(n−i    +1) 的负责接收数据并求和，其余进程负责发送数据。
 41 |         在分发全和时，在第i步，进程id整除2^(n−i+1) 的负责发送数据， 其余进程负责接收数据。
 42 |         发送与接收数据的进程编号在第i步相差2^(n−i) 。  
 43 |         ```c
 44 |         //主要代码：  
 45 |         for (int i = 1; i <= logN; i++)
 46 |         {
 47 |             int tag = i;
 48 |             int step = (int)pow(2, i);
 49 |             if (pid % step == 0)
 50 |             {
 51 |                 MPI_Recv(&recvdata, 1, MPI_INT, pid+step/2,     tag, MPI_COMM_WORLD, &status);
 52 |                 data += recvdata;
 53 |             }
 54 |             else
 55 |              if(pid % step == step/2)
 56 |             {
 57 |                 MPI_Send(&data, 1, MPI_INT, pid-step/2, tag,    MPI_COMM_WORLD);
 58 |             }
 59 |         }
 60 |         //spread
 61 |         for (int i = logN; i >0; i--)
 62 |         {
 63 |             int tag = i;
 64 |             int step = (int)pow(2, i);
 65 |             if (pid % step == 0)
 66 |             {
 67 |                 MPI_Send(&data, 1, MPI_INT, pid+step/2, tag,    MPI_COMM_WORLD);
 68 |             }
 69 |             else
 70 |             if(pid % step == step/2)
 71 |             {
 72 |                 MPI_Recv(&recvdata, 1, MPI_INT, pid-step/2,     tag, MPI_COMM_WORLD, &status);
 73 |                 data = recvdata;
 74 |             }
 75 | 
 76 |         }    
 77 |         ```  
 78 |         实验结果：  
 79 |         ![imag](./图片2.png)   
 80 |         蝶式求和omp实现思路：仍然是对每一步的计算并行化，对叶节点编 号，序号值为线程号。考虑兄弟结点相加时把结果存在序号较小的结 点中。
 81 |         依然将计算步骤编号从开始到结束编为1-n步，因为任务数为2的幂次    方，因此n= log_2⁡N
 82 |         要并行化的计算是每一步的节点与兄弟节点的加法。关键在于如何确    定相加节点的序号差值。
 83 |         将编号写为n位的二进制格式，可以发现，在第i步，相加的节点序号    恰好在第i位不同，而其余位都相同。
 84 |         ```c
 85 |         //主要代码：
 86 |         for (int i = 0; i < logN; i++)
 87 |         {
 88 |             step = (int)pow(2, i);
 89 |         #pragma omp parallel private(pid, tmp, dest)
 90 |             {
 91 |                 pid = omp_get_thread_num();
 92 |                 tmp = num[pid];
 93 |                 dest = pid^step;
 94 |                 tmp = num[pid] + num[dest];
 95 |         #pragma omp barrier
 96 |                 num[pid] = tmp;
 97 |         #pragma omp barrier
 98 |             }
 99 |         }
100 |         ```  
101 |         实验结果：  
102 |         ![imag](./图片3.png)
103 |         蝶式求和mpi实现思路：思路与omp实现相同，区别在于不用关心哪两    个节点相加，
104 |         此时发送目的进程和接受来源进程的id的二进制格式在第i步时的第i    位不同。  
105 |         ```c
106 |         //主要代码：  
107 |         int logN = (int)log2(num_procs);
108 |         for(int i = 0; i < logN; i++) {
109 |             int tag = i+1;
110 |             int step = (int)pow(2,i);
111 |             int dest = id_procs ^ step;
112 |             MPI_Send(&data, 1, MPI_INT, dest, tag,  MPI_COMM_WORLD);
113 |             MPI_Recv(&recvdata, 1, MPI_INT, dest, tag,  MPI_COMM_WORLD, &status);
114 |             data += recvdata;
115 |         }
116 |         ```  
117 |         实验结果：  
118 |         ![imag](./图片4.png)  
119 |     2. 对作业中有并行化要求的题目代码实现  
120 |         ```c
121 |         //hw21-1-3
122 |         #pragma omp simd
123 |         for (int i = 0; i < n; i++)
124 |         {
125 |             a[i] = b[i] + c[i+1];
126 |             c[i] = a[i] + b[i];
127 |         }
128 |         ```  
129 |         ```c
130 |         //hw21-3-2 
131 |         for (int i = 0; i < count; i++)
132 |         {
133 |             for (int j = 0; j < count-1; j++)
134 |             {
135 |                 b[j] = a[j][N];
136 |                 #pragma omp parallel for
137 |                 for (int k = 0; k < count; i++)
138 |                 {
139 |                     a[j+1][k] = b[j] + c[j][k];
140 |                 }
141 |             }
142 |         }
143 |         for (int i = 0; i < count; i++)
144 |         {
145 |             /* code */
146 |             #pragma omp parallel for
147 |             for (int j = 0; j < count-1; i++)
148 |             {
149 |                 y[i+j] = a[j+1][N];
150 |             }
151 |         }
152 |         #pragma omp parallel for
153 |         for (int i = 0; i < count; i++)
154 |         {
155 |             /* code */
156 |             x[i] = y[i] + 10;
157 |         }
158 |         ``` 
159 |         ```c
160 |         //hw21-3-5
161 |         //loop1
162 |         #pragma omp parallel for
163 |         for (int i = 1; i <= 100; i++)
164 |         {
165 |             /* code */
166 |             A[i] = A[i] + B[i-1];
167 |             D[i] = C[i] * C[i];  
168 | 
169 |         }
170 |         //loop2
171 |         #pragma omp parallel for
172 |         for (int i = 1; i <= 500; i++)
173 |         {
174 |             A[i] = B[i] + C[i];
175 |             D[i] = (A[i] + A[1000-i])/2.0;
176 |         }
177 |         #pragma omp parallel for
178 |         for (int i = 501; i < count; i++)
179 |         {
180 |             /* code */
181 |             A[i] = B[i] + C[i];
182 |             D[i] = (A[i]+A[1000-i])/2.0;
183 |         }
184 |         //loop3
185 |         #pragma omp parallel for
186 |         for (int i = 0; i < count; i++)
187 |         {
188 |             for (int j = 0; i < count; j++)
189 |             {
190 |                 A[3*i+2*j][2*j] = C[i][j]*2;
191 |                 D[i][j] = A[(i-j+6)>0 ? (i-j+6):-(i-j+6)][i+j];
192 |             }   
193 |         }
194 |         ```  
195 |     3. 实现MyBcast()  
196 |         思路：  
197 |             （1）将MPI进程按所在节点划分子通讯域N；  
198 |             （2）可以将各子通讯域的首进程（编号为0）再组成一个子通讯域H；  
199 |             （3）由广播的root进程将消息发给原来最大通讯域中的0号进程h，再由h在H通讯域中广播（MPI_Bcast），各首进程然后在各自子通讯域N中再行广播（MPI_Bcast）；  
200 |             （4）子通讯域H：将N的首进程通过MPI_Group_incl()函数建立一个组，再用MPI_Comm_create()建立子通讯域H。  
201 |         ```
202 |         //伪代码
203 |         MPI_Comm_split(MPI_COMM_WORLD, color, key, &split_comm_world);
204 |         建立MPI_COMM_WORLD的进程组World_Group；
205 |         通过World_Group建立h_Group；
206 |         PI_Comm_create(MPI_COMM_WORLD, h_Group, &h_comm_world)；
207 |         oot进程发送消息：
208 |         	MPI_Send(data, count, MPI_TYPE, 0, 1, MPI_COMM_WORLD);
209 |         原通讯域的0号进程接收：
210 |             MPI_Recv(data, count, MPI_TYPE, root, 1, MPI_COMM_WORLD,&status);
211 |         号进程在H中广播：
212 |         	 MPI_Bcast(data, count, MPI_TYPE, 0, h_comm_world);
213 |         在N中广播
214 |         	 MPI_Bcast(data, count, MPI_TYPE, 0, split_comm_world);
215 |         ```
216 |         ```c
217 |         //主要代码：
218 |         MPI_Comm h_comm_world;
219 |         MPI_Comm_group(MPI_COMM_WORLD, &world_group);
220 |         int grpsize = num_procs / 2;
221 |         int zerolist[] = {0, 1, 2, 3};
222 |         int zerocnt = 0;
223 | 
224 |         MPI_Group_incl(world_group, grpsize, zerolist, &    new_group);
225 |         MPI_Comm_create(MPI_COMM_WORLD, new_group, &    h_comm_world);
226 |         // message from root to 0 proc of MPI_COMM_WORLD
227 |         if (id_procs == root)
228 |         {
229 |             MPI_Send(&seq, 16, MPI_CHAR, 0, 1, MPI_COMM_WORLD);
230 |         }
231 |         else if (id_procs == 0)
232 |         {
233 |             MPI_Recv(&seq, 16, MPI_CHAR, root, 1,   MPI_COMM_WORLD, &status);
234 |         }
235 |         MPI_Barrier(MPI_COMM_WORLD);
236 |         // Broadcast within the group H
237 |         if(h_comm_world != MPI_COMM_NULL)
238 |             MPI_Bcast(&seq, 16, MPI_CHAR, 0, h_comm_world);
239 |         MPI_Barrier(MPI_COMM_WORLD);
240 |         //Broadcasr within the group N
241 | 
242 |         MPI_Bcast(&seq, 16, MPI_CHAR, 0, split_comm_world);
243 |         MPI_Barrier(MPI_COMM_WORLD);
244 |         ```
245 |         实验结果：  
246 |         ![imag](./图片5.png)  
247 |     4.   
248 |         用MPI_Send和MPI_Recv来模拟实现诸如MPI_Alltoall,      MPI_Allgather功能并与标准MPI实现做简要性能对比.  
249 |         ```c
250 |         //MPI_Alltoall实现
251 |         void MPI_Alltoall_my(int* senddata, int sendcount, MPI_Datatype senddatatype, int* recvdata, int recvcount,
252 |         MPI_Datatype recvdatatype, MPI_Comm comm)
253 |         {
254 |             int rank, size;
255 |             MPI_Status status;
256 |             MPI_Comm_rank(comm, &rank);
257 |             MPI_Comm_size(comm, &size);
258 |             for (int i = 0; i < size; i++)
259 |             {
260 |                 if (i != rank)
261 |                 {
262 |                     MPI_Send(senddata + i * sendcount, sendcount,       senddatatype, i, rank , comm);
263 |                     MPI_Recv(recvdata + i * recvcount, recvcount,       recvdatatype, i, i, comm, &status);
264 |                 }
265 |                 else
266 |                 {
267 |                 	//memcpy(recvdata + i * recvcount, senddata,        sizeof(senddatatype)*sendcount);
268 |                 	recvdata[i] = senddata[i];
269 |                 }
270 |             }
271 |         }
272 |         ```
273 |         ```c
274 |         //MPI_Allgather实现  
275 |         void MPI_Allgather_my(int* senddata, int sendcount, MPI_Datatype senddatatype, int* recvdata, int recvcount,
276 |         MPI_Datatype recvdatatype, MPI_Comm comm)
277 |         {
278 |             int rank, size, i;
279 |             MPI_Status status;
280 |             MPI_Comm_rank(comm, &rank);
281 |             MPI_Comm_size(comm, &size);
282 | 
283 |             for (i = 0; i < size; i++)
284 |             {
285 |                 if (i != rank)
286 |                 {
287 |                     MPI_Send(senddata, sendcount, senddatatype, i,      rank , comm);
288 |                     MPI_Recv(recvdata + i * recvcount, recvcount,       recvdatatype, i, i, comm, &status);
289 |                 }
290 |                 else
291 |                 {
292 |                 	//memcpy(recvdata + i * recvcount, senddata,        sizeof(senddatatype)*sendcount);
293 |                 	recvdata[i] = *senddata;
294 |                 	}
295 |             }
296 |         }
297 |         ```  
298 |         性能对比：  
299 |         ![imag](./图片7.png)  
300 |         ![imag](./图片6.png)  
301 | + lab2  
302 |     1. 用mpi和omp实现LU分解  
303 |         + 原理  
304 | 	    利用主行i对j >i的其他行做初等行变换，由于各行计算时没有数据相关，因此可以对矩阵按行划分做并行计算。课本的并行算法考虑到各处理器之间的负载均衡，使用的是交叉行划分。划分后各处理器轮流选出主行并广播给其他处理器，其他处理器利用接收的主行对部分行向量进行计算。
305 |         + 思路  
306 | 	    在课本第18章附录2的LU分解的MPI实现上，对处理器用主行做行变换的循环部分用openmp实现for编译制导，将计算平均分给各线程处理，其他不动。  
307 |         ```c
308 |         //主要代码部分  
309 |         if (my_rank<=j)
310 |         {
311 |            #pragma  omp parallel shared(a,f,v,m) private(k,w)
312 |            {
313 |                 #pragma omp for
314 |                 for(k=i+1;k<m;k++)
315 |                 {
316 |                     a(k,v)=a(k,v)/f[v];
317 |                 }
318 |                 for(k=i+1;k<m;k++)
319 |                 {
320 |                     #pragma omp for
321 |                     for(w=v+1;w<M;w++)
322 |                         a(k,w)=a(k,w)-f[w]*a(k,v);
323 |                 }
324 |             }
325 |         }
326 |         /*编号大于my_rank的进程利用主行对其第i,…,m-1行数据做行变换*/
327 |         if (my_rank>j)
328 |         {
329 |             #pragma  omp parallel shared(a,f,v,m) private(k,w)
330 |             {
331 |                 #pragma omp for
332 |                 for(k=i;k<m;k++)
333 |                 {
334 |                     a(k,v)=a(k,v)/f[v]; 
335 |                 }
336 |                 for(k=i;k<m;k++)
337 |                 {
338 |                     #pragma omp for
339 |                     for(w=v+1;w<M;w++)
340 |                         a(k,w)=a(k,w)-f[w]*a(k,v);
341 |                 }
342 |             }
343 |         }
344 |         ```    
345 |         实验结果(只跑了6维矩阵和5维矩阵作为演示):      
346 |         ![imag](./图片8.png)  
347 |         ![imag](./图片9.png)  
348 |         性能对比：  
349 |         |线程数|100 * 100|1000 * 1000|10000 * 10000|  
350 |         |---|----|----|---|  
351 |         |2|0.1143s|0.55024s|237.638s|
352 |         |4|0.0968s|0.508s|229.686s|  
353 |         |8|0.1019s|0.349117s|201.801s|  
354 |     2. 用纯omp实现QR分解  
355 |         QR分解的串行算法有三层循环，直接对最内层循环做并行化处理。  
356 |         ```c
357 |         //主要代码  
358 |         for(int j = 0; j < N; j++){
359 |             for (int i = j+1; i < N; i++)
360 |             {
361 |                 sq = sqrt(A(j, j) * A(j, j) + A(i, j) * A(i, j));
362 |                 c = A(j, j)/sq;
363 |                 s = A(i, j)/sq;
364 |                 int k ;
365 |             #pragma omp parallel for private(k)
366 |                 for (k = 0; k < N; k++)
367 |                 {
368 |                     Aj[k] = c * A(j, k) + s * A(i, k);
369 |                     Qj[k] = c * Q(j, k) + s * Q(i, k); 
370 |                     Ai[k] = - s * A(j, k) + c * A(i, k);
371 |                     Qi[k] = - s * Q(j, k) + c * Q(i, k);
372 |                 }
373 |             #pragma omp parallel for private(k)
374 |                 for (k = 0; k < N; k++)
375 |                 {
376 |                     A(j, k) = Aj[k];
377 |                     A(i, k) = Ai[k];
378 |                     Q(i, k) = Qi[k];
379 |                     Q(j, k) = Qj[k];
380 |                 }         
381 |             }
382 |         }
383 |         ```
384 |         性能对比：  
385 |         |线程数|矩阵维数n=8|矩阵维数n=16|矩阵维数n=32|  
386 |         |----|-----|-----|-----|  
387 |         |1|0.000073|0.000293|0.001481|
388 |         |2|0.000182|2.136728|0.002181|
389 |         |4|0.434824|4.581992|0.004917|
390 |         |8|1.970921|10.218330|45.5784|  
391 |         分析：  
392 |         我们惊讶的发现，即使不考虑节点自身的问题————如运算资源在被其他程序所消耗等，线程数增加后，运行时间反而增大了。特别是在线程数增加到8后，对32维矩阵的QR分解时间增大到一种吓人的地步。我从知乎上得到了一个很有说服力的解释：  
393 |         嵌套循环，内层循环并行。并行线程的创建与销毁会有开销，在嵌套循环的时候如果对内层for并行的话，这个开销会很大。解决方法是把并行放在外层的for。  
394 |     3. 用mpi实现summa算法  
395 |         原理：  
396 |         ![imag](./图片10.png)  
397 |         ![imag](./图片11.png)  
398 |         ```c
399 |         //主要代码
400 |             //将数据发送到0号进程收集显示
401 |         MPI_Send(A,localHighA*localLenA,MPI_INT,0,rank+100, MPI_COMM_WORLD);
402 |         MPI_Send(B,localHighB*localLenB,MPI_INT,0,rank+200, MPI_COMM_WORLD);
403 | 	    if(rank==0)
404 | 	    {
405 | 	    	int **matrixA=(int **)malloc(matrixHighA*sizeof(int*));
406 | 	    	for (i=0;i<matrixHighA;i++)
407 | 	    		matrixA[i]=(int *)malloc(matrixLenA*sizeof(int));
408 | 	    	int **matrixB=(int **)malloc(matrixHighB*sizeof(int*));
409 | 	    	for (i=0;i<matrixHighB;i++)
410 | 	    		matrixB[i]=(int *)malloc(matrixLenB*sizeof(int));
411 |             for(i=0;i<nodeNum;i++)
412 |             {
413 |                 int *receiveATemp=(int *)malloc    (localLenA*localHighA*sizeof(int));
414 |                 int *receiveBTemp=(int *)malloc    (localLenB*localHighB*sizeof(int));
415 |                 MPI_Recv(receiveATemp,localHighA*localLenA,    MPI_INT,i,i+100,MPI_COMM_WORLD,&status);
416 |                 MPI_Recv(receiveBTemp,localHighB*localLenB,    MPI_INT,i,i+200,MPI_COMM_WORLD,&status);
417 |                 l=0;
418 |                 for(j=0;j<localHighA;j++)
419 |                     for(k=0;k<localLenA;k++)
420 |                     {
421 |                         matrixA[j+(int)(i/p)*localHighA][k+(int)(i%p)*localLenA]=receiveATemp[l++];
422 |                     }
423 |                 l=0;
424 |                 for(j=0;j<localHighB;j++)
425 |                     for(k=0;k<localLenB;k++)
426 |                     {
427 |                         matrixB[j+(int)(i/p)*localHighB][k+(int)(i%p)*localLenB]=receiveBTemp[l++];
428 |                     }
429 |                 free(receiveATemp);
430 |                 free(receiveBTemp);  
431 |             }  
432 |             printf("A:\n");
433 |             PrintMatrix(matrixA,matrixHighA,matrixLenA);
434 |             printf("B:\n");
435 |             PrintMatrix(matrixB,matrixHighB,matrixLenB);
436 |             for (i=0;i<matrixHighA;i++)
437 |                 free(matrixA[i]);
438 |             for (i=0;i<matrixHighB;i++)
439 |                 free(matrixB[i]);
440 |             free(matrixA);
441 |             free(matrixB);
442 | 	    }
443 |         for(i=0;i<p;i++) //每个节点向同行同列发送局部数据 A B
444 | 	    {
445 | 	        {
446 | 	    	    MPI_Send(A,localHighA*localLenA,MPI_INT，myRow*p+i,1,MPI_COMM_WORLD);
447 | 	    		MPI_Send(B,localHighB*localLenB,MPI_INT,myRow*p+i,2,MPI_COMM_WORLD);
448 | 	    	}
449 | 	    	{
450 | 	    	   MPI_Send(A,localHighA*localLenA,MPI_INT,i*p  +myCol,1,MPI_COMM_WORLD);
451 | 	    	   MPI_Send(B,localHighB*localLenB,MPI_INT,i*p +myCol,2,MPI_COMM_WORLD);		
452 | 	    	}	
453 | 	    }
454 | 	    int *receiveA=(int *)malloc(localLenA*localHighA*sizeof(int));
455 | 	    int *receiveB=(int *)malloc(localLenB*localHighB*sizeof(int));
456 |         int *resultC= (int *)malloc(localHighA*localLenB*sizeof(int));
457 | 	    for(i=0;i<localHighA*localLenB;i++)resultC[i]=0;
458 | 	    for(i=0;i<p;i++)
459 | 	    {
460 | 	    	MPI_Recv(receiveA,localHighA*localLenA,MPI_INT, myRow*p+i,1,MPI_COMM_WORLD,&status);
461 | 	    	MPI_Recv(receiveB,localHighB*localLenB,MPI_INT,i*p+myCol,2,MPI_COMM_WORLD,&status);
462 |             MatrixMultiply(receiveA,receiveB,resultC,localHighA,localLenA,localLenB);
463 | 	    	MatrixAdd(C,resultC,localHighA,localLenB);	
464 | 	    }
465 |         MPI_Send(C,localHighA*localLenB,MPI_INT,0,rank+400,    MPI_COMM_WORLD);//将局部结果C发送至0号收集
466 |         if(rank==0)//收集数据并且在后面显示
467 |         { 
468 |             int **matrixC=(int **)malloc(matrixHighA*sizeof(int *));
469 |             for (i=0;i<matrixHighA;i++)
470 |                 matrixC[i]=(int *)malloc(matrixLenB*sizeof(int));
471 |             for(i=0;i<nodeNum;i++)
472 |             {
473 |                int *receiveCTemp=(int *)malloc     (localLenB*localHighA*sizeof(int));
474 |                MPI_Recv(receiveCTemp,localHighA*localLenB, MPI_INT,i,i+400,MPI_COMM_WORLD,&status);
475 |                l=0;
476 |                for(j=0;j< localHighA;j++)
477 |                    for(k=0;k< localLenB;k++)
478 |                    {
479 |                        matrixC[j+(int)(i/p) *localHighA][k+(int)(i%p)*localLenB]=receiveCTemp[l++];
480 |                    }
481 |                free(receiveCTemp);
482 |             }
483 |             printf(" C:\n");
484 |             PrintMatrix(matrixC,matrixHighA,matrixLenB); 
485 |         } 
486 |         ```
487 |         性能对比：  
488 |         |length/process number|1|2|4|8|  
489 |         |-----|-----|----|----|----|  
490 |         |16|0.000023|0.000051|0.000148|1.002412|  
491 |         |256|0.129714|0.043931|0.023101|0.011794|  
492 |         |1024|9.266333|1.976020|1.265144|0.594526|  
493 | 


--------------------------------------------------------------------------------
/报告/实验报告.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Serkonosand/ParallelProgramming2021/eea0f34c72f477e3c4ef96f1dc67d6ee5a927c14/报告/实验报告.pdf


--------------------------------------------------------------------------------