├── lab1
│   ├── ex1_ppt35_mpi.c
│   ├── ex1_ppt35_omp.c
│   ├── ex1_ppt36_mpi.c
│   ├── ex1_ppt36_omp.c
│   ├── ex3_MybcastMPI.c
│   ├── ex4Allgather.c
│   ├── ex4Alltoall.c
│   └── hw
│       ├── ex2_1-3.c
│       ├── ex2_3-2.c
│       └── ex2_3-5.c
├── lab2
│   ├── ex1_LU.c
│   ├── ex2_QR.c
│   └── ex3_summa.c
├── readme.txt
└── 报告
    ├── 图片1.png
    ├── 图片10.PNG
    ├── 图片11.PNG
    ├── 图片2.png
    ├── 图片3.png
    ├── 图片4.png
    ├── 图片5.png
    ├── 图片6.PNG
    ├── 图片7.PNG
    ├── 图片8.png
    ├── 图片9.png
    ├── 实验报告.md
    └── 实验报告.pdf

/lab1/ex1_ppt35_mpi.c:
--------------------------------------------------------------------------------
#include "mpi.h"
#include <stdio.h>
#include <math.h>

int main(int argc, char *argv[])
{
    int pid, pnums;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &pid);
    MPI_Comm_size(MPI_COMM_WORLD, &pnums);
    int data, recvdata, logN;
    MPI_Status status;
    data = pid + 1;
    recvdata = 0;
    logN = (int)log2(pnums);
    printf("process id: %d,data:%d\n", pid, data);
    // sum: at step i, ranks divisible by 2^i receive from rank + 2^(i-1) and accumulate
    for (int i = 1; i <= logN; i++)
    {
        int tag = i;
        int step = (int)pow(2, i);
        if (pid % step == 0)
        {
            MPI_Recv(&recvdata, 1, MPI_INT, pid + step / 2, tag, MPI_COMM_WORLD, &status);
            data += recvdata;
        }
        else if (pid % step == step / 2)
        {
            MPI_Send(&data, 1, MPI_INT, pid - step / 2, tag, MPI_COMM_WORLD);
        }
    }
    // spread: walk the tree back down so every rank ends up with the total
    for (int i = logN; i > 0; i--)
    {
        int tag = i;
        int step = (int)pow(2, i);
        if (pid % step == 0)
        {
            MPI_Send(&data, 1, MPI_INT, pid + step / 2, tag, MPI_COMM_WORLD);
        }
        else if (pid % step == step / 2)
        {
            MPI_Recv(&recvdata, 1, MPI_INT, pid - step / 2, tag, MPI_COMM_WORLD, &status);
            data = recvdata;
        }
    }
    printf("%d sum is %d\n", pid, data);
    MPI_Finalize();
    return 0;
}
--------------------------------------------------------------------------------
/lab1/ex1_ppt35_omp.c:
--------------------------------------------------------------------------------
#include "omp.h"
#include <stdio.h>
#include <math.h>

int N = 8;

int main(){
    int step, logN;
    int num[N];
    for (int i = 0; i < N; i++)
    {
        num[i] = i + 1;
    }
    logN = (int)log2(N);
    int pid;
    omp_set_num_threads(N);   /* the tree indexing below assumes exactly N threads */
    printf("numbers:");
    for (int i = 0; i < N; i++)
    {
        printf("%d ", num[i]);
        if (i == N - 1)
            printf("\n");
    }
    // upward pass: a thread adds its sibling's value into its own slot
    for (int i = logN; i > 0; i--)
    {
        step = (int)pow(2, logN - i + 1);
        #pragma omp parallel private(pid)
        {
            pid = omp_get_thread_num();
            if (!(pid % step))
            {
                num[pid] = num[pid] + num[pid + step / 2];
            }
            #pragma omp barrier
        }
    }
    // downward pass: copy the total back out to the other nodes
    for (int i = 1; i <= logN; i++)
    {
        step = (int)pow(2, logN - i + 1);
        #pragma omp parallel private(pid)
        {
            pid = omp_get_thread_num();
            if (!(pid % step))
            {
                num[pid + step / 2] = num[pid];
            }
            #pragma omp barrier
        }
    }
    printf("after sum:");
    for (int i = 0; i < N; i++)
    {
        printf("%d ", num[i]);
        if (i == N - 1)
            printf("\n");
    }
    return 0;
}
printf("process id %d data = %d\n",id_procs, data); 16 | int logN = (int)log2(num_procs); 17 | for(int i = 0; i < logN; i++) { 18 | int tag = i+1; 19 | int step = (int)pow(2,i); 20 | int dest = id_procs ^ step; 21 | MPI_Send(&data, 1, MPI_INT, dest, tag, MPI_COMM_WORLD); 22 | MPI_Recv(&recvdata, 1, MPI_INT, dest, tag, MPI_COMM_WORLD, &status); 23 | data += recvdata; 24 | } 25 | 26 | 27 | printf("process id %d sum is = %d\n",id_procs, data); 28 | 29 | MPI_Finalize(); 30 | return 0; 31 | } -------------------------------------------------------------------------------- /lab1/ex1_ppt36_omp.c: -------------------------------------------------------------------------------- 1 | #include "omp.h" 2 | #include 3 | #include 4 | 5 | int N = 8; 6 | 7 | int main(){ 8 | int step,logN; 9 | int num[N]; 10 | for (int i = 0; i < N; i++) 11 | { 12 | num[i] = i+1; 13 | } 14 | logN = (int)log2(N); 15 | int pid, tmp, dest; 16 | printf("numbers:"); 17 | for (int i = 0; i < N; i++) 18 | { 19 | printf("%d ", num[i]); 20 | if(i == N-1) 21 | printf("\n"); 22 | } 23 | for (int i = 0; i < logN; i++) 24 | { 25 | step = (int)pow(2, i); 26 | #pragma omp parallel private(pid, tmp, dest) 27 | { 28 | pid = omp_get_thread_num(); 29 | tmp = num[pid]; 30 | dest = pid^step; 31 | tmp = num[pid] + num[dest]; 32 | #pragma omp barrier 33 | num[pid] = tmp; 34 | #pragma omp barrier 35 | } 36 | } 37 | printf("after sum:"); 38 | for (int i = 0; i < N; i++) 39 | { 40 | printf("%d ", num[i]); 41 | if(i == N-1) 42 | printf("\n"); 43 | } 44 | } -------------------------------------------------------------------------------- /lab1/ex3_MybcastMPI.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | /* 6 | int MyBcastMPI(void* data, int count, MPI_Datatype datatype, int root, MPI_Comm communicator) 7 | { 8 | return 0; 9 | } 10 | */ 11 | int main(int argc, char *argv[]) 12 | { 13 | int id_procs, num_procs; 14 | char seq[16]; 15 | int root = 3; 16 | MPI_Group world_group, new_group; 17 | MPI_Init(&argc, &argv); 18 | MPI_Comm_size(MPI_COMM_WORLD, &num_procs); 19 | MPI_Comm_rank(MPI_COMM_WORLD, &id_procs); 20 | if (id_procs == root) 21 | { 22 | strcpy(seq, "hello,MPI!"); 23 | } 24 | MPI_Barrier(MPI_COMM_WORLD); 25 | 26 | MPI_Comm split_comm_world; 27 | MPI_Status status; 28 | 29 | int rank; 30 | int size; 31 | 32 | // MPI split COMM_WORLD into 4 groups 33 | MPI_Comm_split(MPI_COMM_WORLD, id_procs % 4, id_procs, &split_comm_world); 34 | MPI_Comm_rank(split_comm_world, &rank); 35 | MPI_Comm_size(split_comm_world, &size); 36 | //create new group H 37 | MPI_Comm h_comm_world; 38 | MPI_Comm_group(MPI_COMM_WORLD, &world_group); 39 | int grpsize = num_procs / 2; 40 | int zerolist[] = {0, 1, 2, 3}; 41 | int zerocnt = 0; 42 | 43 | MPI_Group_incl(world_group, grpsize, zerolist, &new_group); 44 | MPI_Comm_create(MPI_COMM_WORLD, new_group, &h_comm_world); 45 | // message from root to 0 proc of MPI_COMM_WORLD 46 | if (id_procs == root) 47 | { 48 | MPI_Send(&seq, 16, MPI_CHAR, 0, 1, MPI_COMM_WORLD); 49 | } 50 | else if (id_procs == 0) 51 | { 52 | MPI_Recv(&seq, 16, MPI_CHAR, root, 1, MPI_COMM_WORLD, &status); 53 | } 54 | MPI_Barrier(MPI_COMM_WORLD); 55 | // Broadcast within the group H 56 | if(h_comm_world != MPI_COMM_NULL) 57 | MPI_Bcast(&seq, 16, MPI_CHAR, 0, h_comm_world); 58 | MPI_Barrier(MPI_COMM_WORLD); 59 | //Broadcasr within the group N 60 | 61 | MPI_Bcast(&seq, 16, MPI_CHAR, 0, split_comm_world); 62 | MPI_Barrier(MPI_COMM_WORLD); 63 | 64 | printf("MPI Comm rank %d, 
--------------------------------------------------------------------------------
/lab1/ex4Allgather.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
#include <string.h>

// MPI_Allgather implemented with MPI_Send and MPI_Recv
void MPI_Allgather_my(int* senddata, int sendcount, MPI_Datatype senddatatype, int* recvdata, int recvcount,
    MPI_Datatype recvdatatype, MPI_Comm comm)
{
    int rank, size, i;
    MPI_Status status;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    for (i = 0; i < size; i++)
    {
        if (i != rank)
        {
            /* blocking send followed by receive; fine for the small counts used
               here, but large messages would call for MPI_Sendrecv */
            MPI_Send(senddata, sendcount, senddatatype, i, rank, comm);
            MPI_Recv(recvdata + i * recvcount, recvcount, recvdatatype, i, i, comm, &status);
        }
        else
        {
            //memcpy(recvdata + i * recvcount, senddata, sizeof(senddatatype)*sendcount);
            recvdata[i] = *senddata;
        }
    }
}
int main(int argc, char* argv[])
{
    int i, rank, size, tag = 1;
    int senddata, recvdata[8];
    double start_time, end_time, s_t, e_t;
    int count = 1;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    senddata = rank + 1;
    start_time = MPI_Wtime();
    // timing of the hand-written MPI_Allgather
    MPI_Allgather_my(&senddata, count, MPI_INT, recvdata, count, MPI_INT, MPI_COMM_WORLD);
    end_time = MPI_Wtime();
    MPI_Reduce(&start_time, &s_t, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    MPI_Reduce(&end_time, &e_t, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    for (i = 0; i < size; i++)
        printf("My rank = %d After mygather recv = %d\n", rank, recvdata[i]);

    if (rank == 0)
    {
        printf("myallgather : count = %d total time = %f\n", count, e_t - s_t);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    // functional and timing test of the built-in MPI_Allgather
    start_time = MPI_Wtime();
    MPI_Allgather(&senddata, count, MPI_INT, recvdata, count, MPI_INT, MPI_COMM_WORLD);
    end_time = MPI_Wtime();
    MPI_Reduce(&start_time, &s_t, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    MPI_Reduce(&end_time, &e_t, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    for (i = 0; i < size; i++)
        printf("My rank = %d recv = %d\n", rank, recvdata[i]);

    if (rank == 0)
    {
        printf("allgather : count = %d total time = %f\n", count, e_t - s_t);
    }
    MPI_Finalize();
    return 0;
}
--------------------------------------------------------------------------------
/lab1/ex4Alltoall.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
#include <string.h>

// MPI_Alltoall implemented with MPI_Send and MPI_Recv
void MPI_Alltoall_my(int* senddata, int sendcount, MPI_Datatype senddatatype, int* recvdata, int recvcount,
    MPI_Datatype recvdatatype, MPI_Comm comm)
{
    int rank, size;
    MPI_Status status;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    for (int i = 0; i < size; i++)
    {
        if (i != rank)
        {
            MPI_Send(senddata + i * sendcount, sendcount, senddatatype, i, rank, comm);
            MPI_Recv(recvdata + i * recvcount, recvcount, recvdatatype, i, i, comm, &status);
        }
        else
        {
            //memcpy(recvdata + i * recvcount, senddata, sizeof(senddatatype)*sendcount);
            recvdata[i] = senddata[i];
        }
    }
}
int main(int argc, char* argv[])
{
    int i, rank, size, tag = 1;
    int senddata[8], recvdata[8];

    double start_time, end_time, s_t, e_t;
    int count = 1;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    for (int j = 0; j < size; j++)
    {
        senddata[j] = j + 1;
    }
    start_time = MPI_Wtime();
    // timing of the hand-written MPI_Alltoall
    MPI_Alltoall_my(senddata, count, MPI_INT, recvdata, count, MPI_INT, MPI_COMM_WORLD);
    end_time = MPI_Wtime();
    MPI_Reduce(&start_time, &s_t, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    MPI_Reduce(&end_time, &e_t, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    for (i = 0; i < size; i++)
        printf("My rank = %d After myalltoall recv = %d\n", rank, recvdata[i]);

    if (rank == 0)
    {
        printf("myalltoall : count = %d total time = %f\n", count, e_t - s_t);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    // functional and timing test of the built-in MPI_Alltoall
    start_time = MPI_Wtime();
    MPI_Alltoall(senddata, count, MPI_INT, recvdata, count, MPI_INT, MPI_COMM_WORLD);
    end_time = MPI_Wtime();
    MPI_Reduce(&start_time, &s_t, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    MPI_Reduce(&end_time, &e_t, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    for (i = 0; i < size; i++)
        printf("My rank = %d recv = %d\n", rank, recvdata[i]);

    if (rank == 0)
    {
        printf("alltoall : count = %d total time = %f\n", count, e_t - s_t);
    }
    MPI_Finalize();
    return 0;
}
--------------------------------------------------------------------------------
/lab1/hw/ex2_1-3.c:
--------------------------------------------------------------------------------
#include "omp.h"
#include <stdio.h>
#include <stdlib.h>

int n = 50;
int main(){
    int a[n], b[n], c[n + 1];
    for (int i = 0; i < n; i++)
    {
        a[i] = (int)(rand() % n) + 1;
        b[i] = (int)(rand() % n) + 1;
        c[i] = (int)(rand() % n) + 1;
    }
    c[n] = (int)(rand() % n) + 1;   /* extra element, read as c[i+1] below */

    #pragma omp simd
    for (int i = 0; i < n; i++)
    {
        a[i] = b[i] + c[i + 1];
        c[i] = a[i] + b[i];
    }
    printf("a:\n");
    for (int i = 0; i < n; i++)
    {
        printf("%d\t", a[i]);
    }
    printf("\n");
    printf("b:\n");
    for (int i = 0; i < n; i++)
    {
        printf("%d\t", b[i]);
    }
    printf("\n");
    printf("c:\n");
    for (int i = 0; i < n; i++)
    {
        printf("%d\t", c[i]);
    }
    printf("\n");
    return 0;
}
--------------------------------------------------------------------------------
/lab1/hw/ex2_3-2.c:
--------------------------------------------------------------------------------
#include "omp.h"
#include <stdio.h>
#include <stdlib.h>

int N = 50;
int main(){
    int count = 100;
    int x[100], y[200], b[100];
    int a[100][100], c[100][100];
    for (int i = 0; i < 100; i++)
    {
        x[i] = i + 1;
        b[i] = i + 2;
    }
    for (int i = 0; i < 200; i++)
    {
        y[i] = i + 1;
    }
    for (int i = 0; i < 100; i++)
    {
        for (int j = 0; j < 100; j++)
        {
            a[i][j] = i + j + 2;
            c[i][j] = i + j + 3;
        }
    }
    for (int i = 0; i < count; i++)
    {
        for (int j = 0; j < count - 1; j++)
        {
            b[j] = a[j][N];
            /* the k loop carries no dependence, so it takes the work-sharing directive */
            #pragma omp parallel for
            for (int k = 0; k < count; k++)
            {
                a[j + 1][k] = b[j] + c[j][k];
            }
        }
    }
    for (int i = 0; i < count; i++)
    {
        #pragma omp parallel for
        for (int j = 0; j < count - 1; j++)
        {
            y[i + j] = a[j + 1][N];
        }
    }
    #pragma omp parallel for
    for (int i = 0; i < count; i++)
    {
        x[i] = y[i] + 10;
    }
    return 0;
}
--------------------------------------------------------------------------------
/lab1/hw/ex2_3-5.c:
--------------------------------------------------------------------------------
#include "omp.h"
#include <stdio.h>
#include <stdlib.h>

void loop1(){
    int A[101], B[101], C[101], D[101];
    for (int i = 0; i < 101; i++)
    {
        A[i] = i;
        B[i] = i + 1;
        C[i] = i + 2;
        D[i] = i + 3;
    }
    /* B and C form a serial dependence chain and stay sequential;
       A and D can then be computed in parallel */
    for (int i = 1; i <= 100; i++)
    {
        B[i] = C[i - 1] * 2;
        C[i] = 1.0 / B[i];
    }
    #pragma omp parallel for
    for (int i = 1; i <= 100; i++)
    {
        A[i] = A[i] + B[i - 1];
        D[i] = C[i] * C[i];
    }
}

void loop2(){
    int count = 1000;
    int A[1000], B[1000], C[1000], D[1000];
    for (int i = 0; i < count; i++)
    {
        A[i] = i + 1;
        B[i] = i + 2;
        C[i] = i + 3;
        D[i] = i + 4;
    }
    /* splitting the index range at 500 removes the A[i] / A[1000-i] conflict */
    #pragma omp parallel for
    for (int i = 1; i <= 500; i++)
    {
        A[i] = B[i] + C[i];
        D[i] = (A[i] + A[1000 - i]) / 2.0;
    }
    #pragma omp parallel for
    for (int i = 501; i < count; i++)
    {
        A[i] = B[i] + C[i];
        D[i] = (A[i] + A[1000 - i]) / 2.0;
    }
}

void loop3(){
    int count = 100;
    int A[500][200];
    int C[100][100];
    int D[100][100];
    for (int i = 0; i < count; i++)
    {
        for (int j = 0; j < count; j++)
        {
            C[i][j] = i + j + 1;
            D[i][j] = i + j + 3;
        }
    }
    for (int i = 0; i < 500; i++)
    {
        for (int j = 0; j < 200; j++)
        {
            A[i][j] = i + j + 2;
        }
    }

    #pragma omp parallel for
    for (int i = 0; i < count; i++)
    {
        for (int j = 0; j < count; j++)
        {
            A[3 * i + 2 * j][2 * j] = C[i][j] * 2;
            D[i][j] = A[(i - j + 6) > 0 ? (i - j + 6) : -(i - j + 6)][i + j];
        }
    }
}

/* minimal driver so the three answers can be compiled and run as one program */
int main(){
    loop1();
    loop2();
    loop3();
    return 0;
}
--------------------------------------------------------------------------------
/lab2/ex1_LU.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
#include "omp.h"
#define a(x,y) a[x*M +y]
#define A(x,y) A[x*M+y]
#define l(x,y) l[x*M+y]
#define u(x,y) u[x*M+y]
#define floatsize sizeof(float)
#define intsize sizeof(int)

int M;
int m;
float * A;
int my_rank;
int p;
MPI_Status status;

void fatal(char * message)
{
    printf("%s\n", message);
    exit(1);
}

void Env_Fin(float *a, float *f)
{
    free(a);
    free(f);
}

int main(int argc, char * argv[])
{
    int i,j,k, my_rank, group_size;
    int i1, i2;
    int v,w;
    float *a, *f, *l, *u;
    printf("Input matrix row length:\n");
    scanf("%d", &M);
    double time;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &group_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    p = group_size;
    if (my_rank == 0)
    {
        A = (float *)malloc(floatsize * M * M);
        for (int i = 0; i < M; i++)
        {
            for (int j = 0; j < M; j++)
            {
                A(i,j) = (float)(rand() % 25 + 1);
            }
        }
        time = MPI_Wtime();

    }
    MPI_Bcast(&M,1,MPI_INT,0,MPI_COMM_WORLD);
    m=M/p;
    if (M%p!=0) m++;
    a=(float*)malloc(floatsize*m*M);

    f=(float*)malloc(floatsize*M);

    if (my_rank==0)
    {
        l=(float*)malloc(floatsize*M*M);
        u=(float*)malloc(floatsize*M*M);
    }

    if (a==NULL) fatal("allocate error\n");
    if (my_rank==0)
    {
        for(i=0;ij)
        {

            #pragma omp parallel shared(a,f,v,m) private(k,w)
            {
                #pragma omp for
                for(k=i;kj)
                {
                    l(i,j)=A(i,j);
                    u(i,j)=0.0;
                }
                else if(ij)
                    l(i,j)=A(i,j);
                else
                    u(i,j)=A(i,j);
        }
    }
    time = MPI_Wtime() - time;
    printf("Input matrix:\n");
    printf("%d\t %d\n",M, N);
    for(i=0;i
--------------------------------------------------------------------------------
/lab2/ex2_QR.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>

int N;
#define A(x, y) A[x * N + y]
#define Q(x, y) Q[x * N + y]
#define R(x, y) R[x * N + y]
#define tmp(x, y) tmp[x * N + y]

void Env_Fin(float *a, float *q, float *r, float *tmp, \
    float *aj, float *ai, float *qi, float *qj)
{
    free(a);
    free(q);
    free(r);
    free(tmp);
    free(qi);
    free(qj);
    free(ai);
    free(aj);
}

int main(){
    printf("input matrix rows :");
    scanf("%d", &N);
    float *A, *Q, *R, *tmp;
    A = (float *)malloc(N * N * sizeof(float));
    Q = (float *)malloc(N * N * sizeof(float));
    R = (float *)malloc(N * N * sizeof(float));
    tmp = (float *)malloc(N * N * sizeof(float));
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            A(i,j) = (float)(rand() % 10) + 1;
            if (i == j)
                Q(i,j) = 1;
            R(i,j) = 0;
        }
    }
    printf("matrix A:\n");
    for(int i=0;i
--------------------------------------------------------------------------------
/lab2/ex3_summa.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"

void PrintMatrixForVector(int * matrix,int high,int len)
{
    int i;
    for(i=0;i
--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
The original code for ex3 of lab2 had a small problem, so a fresh copy is submitted here.
Compile directly with mpicc; once the executable has been generated, just run it with mpirun.
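(The readme does not spell out a concrete invocation; for these sources it would look something like `mpicc ex1_ppt35_mpi.c -o ex1 -lm` followed by `mpirun -np 8 ./ex1`, adding `-fopenmp` for the files that use OpenMP pragmas. The exact flags depend on the local MPI installation.)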
--------------------------------------------------------------------------------
/报告/实验报告.md:
--------------------------------------------------------------------------------
> Parallel Programming Lab Report
+ lab1
1. Tree-structured summation and butterfly summation, implemented with OpenMP and with MPI
Tree summation, OpenMP idea: parallelize the computation within each level of the tree. The leaf nodes are numbered, and the index value is the thread number.
When two sibling nodes are added, the result is stored in the node with the smaller index.
Number the computation steps of the binary tree from 1 to n; since the number of tasks is a power of two, n = log_2 N.
The computation to parallelize is the addition of each node with its sibling at every step. At step i the difference between the index of a node and that of its sibling is 2^(n-i). The additions are handed to different threads and carried out in parallel.
Distributing the result is the summation run in reverse, except that it proceeds top-down.
```c
// main code:
for (int i = logN; i > 0; i--)
{
    step = (int)pow(2, logN - i + 1);
    #pragma omp parallel private(pid)
    {
        pid = omp_get_thread_num();
        if (!(pid % step))
        {
            num[pid] = num[pid] + num[pid + step / 2];
        }
        #pragma omp barrier
    }
}
for (int i = 1; i <= logN; i++)
{
    step = (int)pow(2, logN - i + 1);
    #pragma omp parallel private(pid)
    {
        pid = omp_get_thread_num();
        if (!(pid % step))
        {
            num[pid + step / 2] = num[pid];
        }
        #pragma omp barrier
    }
}
```
Experimental result:
![image](./图片1.png)
Tree summation, MPI idea: while computing the global sum, at step i the processes whose id is divisible by 2^(n-i+1) receive data and add it to their own value, while the remaining processes send their data.
While distributing the global sum, at step i the processes whose id is divisible by 2^(n-i+1) send data, and the remaining processes receive it.
At step i the ids of the sending and receiving processes differ by 2^(n-i).
```c
// main code:
for (int i = 1; i <= logN; i++)
{
    int tag = i;
    int step = (int)pow(2, i);
    if (pid % step == 0)
    {
        MPI_Recv(&recvdata, 1, MPI_INT, pid + step / 2, tag, MPI_COMM_WORLD, &status);
        data += recvdata;
    }
    else if (pid % step == step / 2)
    {
        MPI_Send(&data, 1, MPI_INT, pid - step / 2, tag, MPI_COMM_WORLD);
    }
}
// spread
for (int i = logN; i > 0; i--)
{
    int tag = i;
    int step = (int)pow(2, i);
    if (pid % step == 0)
    {
        MPI_Send(&data, 1, MPI_INT, pid + step / 2, tag, MPI_COMM_WORLD);
    }
    else if (pid % step == step / 2)
    {
        MPI_Recv(&recvdata, 1, MPI_INT, pid - step / 2, tag, MPI_COMM_WORLD, &status);
        data = recvdata;
    }
}
```
Experimental result:
![image](./图片2.png)
Butterfly summation, OpenMP idea: again the computation of each step is parallelized; the leaf nodes are numbered and the index value is the thread number. When sibling nodes are added, the result is stored in the node with the smaller index.
The computation steps are again numbered 1 to n from start to finish; since the number of tasks is a power of two, n = log_2 N.
The computation to parallelize is the addition of each node with its partner at every step; the key is how to determine the index offset between the nodes being added.
Writing the indices as n-bit binary numbers, one finds that at step i the indices of the two nodes being added differ in exactly the i-th bit, while all other bits are the same.
```c
// main code:
for (int i = 0; i < logN; i++)
{
    step = (int)pow(2, i);
    #pragma omp parallel private(pid, tmp, dest)
    {
        pid = omp_get_thread_num();
        tmp = num[pid];
        dest = pid ^ step;
        tmp = num[pid] + num[dest];
        #pragma omp barrier
        num[pid] = tmp;
        #pragma omp barrier
    }
}
```
Experimental result:
![image](./图片3.png)
Butterfly summation, MPI idea: the same idea as the OpenMP version, except that we no longer need to track which of the two nodes performs the addition; the binary ids of the destination process and the source process differ in the i-th bit at step i.
```c
// main code:
int logN = (int)log2(num_procs);
for (int i = 0; i < logN; i++) {
    int tag = i + 1;
    int step = (int)pow(2, i);
    int dest = id_procs ^ step;
    MPI_Send(&data, 1, MPI_INT, dest, tag, MPI_COMM_WORLD);
    MPI_Recv(&recvdata, 1, MPI_INT, dest, tag, MPI_COMM_WORLD, &status);
    data += recvdata;
}
```
Experimental result:
![image](./图片4.png)
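As a sanity check (not required by the exercise, which asks for explicit point-to-point messages), the same global sum can be obtained with a single library collective. A minimal sketch using MPI_Allreduce:
```c
// for comparison only: the built-in collective computes the same all-to-all sum
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int data = rank + 1, sum = 0;
    MPI_Allreduce(&data, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    printf("process id %d sum is = %d\n", rank, sum);
    MPI_Finalize();
    return 0;
}
```
Its output can be compared directly against the printout of ex1_ppt36_mpi.c.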
2. Code for the homework problems that ask for parallelization
```c
//hw21-1-3
#pragma omp simd
for (int i = 0; i < n; i++)
{
    a[i] = b[i] + c[i + 1];
    c[i] = a[i] + b[i];
}
```
```c
//hw21-3-2
for (int i = 0; i < count; i++)
{
    for (int j = 0; j < count - 1; j++)
    {
        b[j] = a[j][N];
        #pragma omp parallel for
        for (int k = 0; k < count; k++)
        {
            a[j + 1][k] = b[j] + c[j][k];
        }
    }
}
for (int i = 0; i < count; i++)
{
    #pragma omp parallel for
    for (int j = 0; j < count - 1; j++)
    {
        y[i + j] = a[j + 1][N];
    }
}
#pragma omp parallel for
for (int i = 0; i < count; i++)
{
    x[i] = y[i] + 10;
}
```
```c
//hw21-3-5
//loop1
#pragma omp parallel for
for (int i = 1; i <= 100; i++)
{
    A[i] = A[i] + B[i - 1];
    D[i] = C[i] * C[i];
}
//loop2
#pragma omp parallel for
for (int i = 1; i <= 500; i++)
{
    A[i] = B[i] + C[i];
    D[i] = (A[i] + A[1000 - i]) / 2.0;
}
#pragma omp parallel for
for (int i = 501; i < count; i++)
{
    A[i] = B[i] + C[i];
    D[i] = (A[i] + A[1000 - i]) / 2.0;
}
//loop3
#pragma omp parallel for
for (int i = 0; i < count; i++)
{
    for (int j = 0; j < count; j++)
    {
        A[3 * i + 2 * j][2 * j] = C[i][j] * 2;
        D[i][j] = A[(i - j + 6) > 0 ? (i - j + 6) : -(i - j + 6)][i + j];
    }
}
```
3. Implementing MyBcast()
Idea:
(1) Split the MPI processes into sub-communicators N according to the node they belong to;
(2) The head processes (rank 0) of these sub-communicators can then be combined into another sub-communicator H;
(3) The root process of the broadcast sends the message to process 0 of the original global communicator (call it h); h then broadcasts it within H (MPI_Bcast), and each head process afterwards broadcasts it within its own sub-communicator N (MPI_Bcast);
(4) Sub-communicator H: the head processes of the N communicators are put into a group with MPI_Group_incl(), and the sub-communicator H is then created with MPI_Comm_create().
```
// pseudocode
MPI_Comm_split(MPI_COMM_WORLD, color, key, &split_comm_world);
build the process group World_Group of MPI_COMM_WORLD;
build h_Group from World_Group;
MPI_Comm_create(MPI_COMM_WORLD, h_Group, &h_comm_world);
the root process sends the message:
MPI_Send(data, count, MPI_TYPE, 0, 1, MPI_COMM_WORLD);
process 0 of the original communicator receives it:
MPI_Recv(data, count, MPI_TYPE, root, 1, MPI_COMM_WORLD, &status);
process 0 broadcasts within H:
MPI_Bcast(data, count, MPI_TYPE, 0, h_comm_world);
broadcast within N:
MPI_Bcast(data, count, MPI_TYPE, 0, split_comm_world);
```
```c
// main code:
MPI_Comm h_comm_world;
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
int grpsize = num_procs / 2;
int zerolist[] = {0, 1, 2, 3};
int zerocnt = 0;

MPI_Group_incl(world_group, grpsize, zerolist, &new_group);
MPI_Comm_create(MPI_COMM_WORLD, new_group, &h_comm_world);
// message from root to proc 0 of MPI_COMM_WORLD
if (id_procs == root)
{
    MPI_Send(&seq, 16, MPI_CHAR, 0, 1, MPI_COMM_WORLD);
}
else if (id_procs == 0)
{
    MPI_Recv(&seq, 16, MPI_CHAR, root, 1, MPI_COMM_WORLD, &status);
}
MPI_Barrier(MPI_COMM_WORLD);
// Broadcast within the group H
if (h_comm_world != MPI_COMM_NULL)
    MPI_Bcast(&seq, 16, MPI_CHAR, 0, h_comm_world);
MPI_Barrier(MPI_COMM_WORLD);
// Broadcast within the group N
MPI_Bcast(&seq, 16, MPI_CHAR, 0, split_comm_world);
MPI_Barrier(MPI_COMM_WORLD);
```
Experimental result:
![image](./图片5.png)
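The source file ex3_MybcastMPI.c leaves a `MyBcastMPI()` stub commented out; the steps above could be packaged into it roughly as follows. This is only a sketch, not the submitted code: it assumes an MPI-3 implementation and uses MPI_Comm_split_type to form the per-node communicators, whereas the lab program splits by `id_procs % 4`.
```c
/* Sketch: the three-stage broadcast wrapped into the MyBcastMPI() stub */
#include <mpi.h>

int MyBcastMPI(void *data, int count, MPI_Datatype datatype, int root,
               MPI_Comm comm)
{
    int rank;
    MPI_Comm_rank(comm, &rank);

    /* N: one sub-communicator per node */
    MPI_Comm node_comm;
    MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
                        &node_comm);
    int node_rank;
    MPI_Comm_rank(node_comm, &node_rank);

    /* H: the head (rank 0) process of every node */
    MPI_Comm head_comm;
    MPI_Comm_split(comm, node_rank == 0 ? 0 : MPI_UNDEFINED, rank, &head_comm);

    /* step 1: root hands the message to global rank 0 */
    if (root != 0) {
        if (rank == root)
            MPI_Send(data, count, datatype, 0, 0, comm);
        else if (rank == 0)
            MPI_Recv(data, count, datatype, root, 0, comm, MPI_STATUS_IGNORE);
    }

    /* step 2: broadcast among the node heads (H) */
    if (head_comm != MPI_COMM_NULL)
        MPI_Bcast(data, count, datatype, 0, head_comm);

    /* step 3: broadcast inside each node (N) */
    MPI_Bcast(data, count, datatype, 0, node_comm);

    if (head_comm != MPI_COMM_NULL)
        MPI_Comm_free(&head_comm);
    MPI_Comm_free(&node_comm);
    return MPI_SUCCESS;
}
```
It would be called exactly like the library routine, e.g. `MyBcastMPI(seq, 16, MPI_CHAR, root, MPI_COMM_WORLD);`.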
4. Use MPI_Send and MPI_Recv to emulate collectives such as MPI_Alltoall and MPI_Allgather, and briefly compare their performance against the standard MPI implementations.
```c
// MPI_Alltoall emulation
void MPI_Alltoall_my(int* senddata, int sendcount, MPI_Datatype senddatatype, int* recvdata, int recvcount,
    MPI_Datatype recvdatatype, MPI_Comm comm)
{
    int rank, size;
    MPI_Status status;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    for (int i = 0; i < size; i++)
    {
        if (i != rank)
        {
            MPI_Send(senddata + i * sendcount, sendcount, senddatatype, i, rank, comm);
            MPI_Recv(recvdata + i * recvcount, recvcount, recvdatatype, i, i, comm, &status);
        }
        else
        {
            //memcpy(recvdata + i * recvcount, senddata, sizeof(senddatatype)*sendcount);
            recvdata[i] = senddata[i];
        }
    }
}
```
```c
// MPI_Allgather emulation
void MPI_Allgather_my(int* senddata, int sendcount, MPI_Datatype senddatatype, int* recvdata, int recvcount,
    MPI_Datatype recvdatatype, MPI_Comm comm)
{
    int rank, size, i;
    MPI_Status status;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    for (i = 0; i < size; i++)
    {
        if (i != rank)
        {
            MPI_Send(senddata, sendcount, senddatatype, i, rank, comm);
            MPI_Recv(recvdata + i * recvcount, recvcount, recvdatatype, i, i, comm, &status);
        }
        else
        {
            //memcpy(recvdata + i * recvcount, senddata, sizeof(senddatatype)*sendcount);
            recvdata[i] = *senddata;
        }
    }
}
```
Performance comparison:
![image](./图片7.png)
![image](./图片6.png)
+ lab2
1. LU decomposition with MPI and OpenMP
+ Principle
The pivot row i is used to apply elementary row transformations to the remaining rows j > i. Since the rows can be updated independently of one another, the matrix can be partitioned by rows and processed in parallel. To keep the load balanced across processors, the textbook's parallel algorithm uses an interleaved (cyclic) row partitioning. After partitioning, the processors take turns selecting the pivot row and broadcasting it to the others, and every processor then uses the received pivot row to update its own portion of the rows.
+ Approach
Starting from the MPI implementation of LU decomposition in Appendix 2 of Chapter 18 of the textbook, the loops in which a processor applies the pivot row to its rows are parallelized with the OpenMP for work-sharing directive, so that the work is divided evenly among the threads; everything else is left unchanged.
```c
// main code section
if (my_rank<=j)
{
    #pragma omp parallel shared(a,f,v,m) private(k,w)
    {
        #pragma omp for
        for(k=i+1;kj)
{
    #pragma omp parallel shared(a,f,v,m) private(k,w)
    {
        #pragma omp for
        for(k=i;k