├── README.md
├── 复习资料
    ├── 并行复习笔记.pdf
    ├── 并行程序设计导论.pdf
    ├── 课程要点-CUDA编程.pdf
    ├── 课程要点-计算机体系结构-量化研究方法-第六版-第四章.pdf
    └── 课程要点-高性能并行程序设计.pdf
├── 并行程序设计_lab0
    ├── gemm.c
    ├── gemm.java
    ├── gemm.py
    └── 并行程序设计_20337025_崔璨明.pdf
├── 并行程序设计_lab1
    ├── lib_code
    │   ├── libmatrix_multiply.so
    │   ├── matrix_multiply.c
    │   ├── matrix_multiply.h
    │   ├── readme.txt
    │   ├── test
    │   └── test.c
    ├── mpi_gemm_1.cpp
    ├── mpi_gemm_2.cpp
    ├── readme.txt
    └── 并行程序设计_20337025_崔璨明.pdf
├── 并行程序设计_lab2
    ├── Monte_carlo.cpp
    ├── code
    │   ├── libparallel_for.so
    │   ├── parallel_for.cpp
    │   ├── parallel_for.h
    │   ├── parallel_for.o
    │   ├── test
    │   └── test.cpp
    ├── gemm_openmp.cpp
    ├── gemm_p.cpp
    ├── readme.txt
    └── 并行程序设计_20337025_崔璨明.pdf
├── 并行程序设计_lab3
    ├── code
    │   ├── baseline.cpp
    │   ├── baseline_lu.cpp
    │   ├── cuda_2d.cu
    │   ├── cuda_bl.cu
    │   ├── cuda_bl_lu.cu
    │   ├── deal_binary.h
    │   ├── openmp.cpp
    │   ├── openmp_lu.cpp
    │   ├── readme.txt
    │   ├── share_mem.cu
    │   ├── test.cpp
    │   └── test
    │   │   ├── test.in
    │   │   └── test.out
    ├── output
    │   ├── baseline.in
    │   ├── baseline.out
    │   ├── baseline_lu.in
    │   ├── baseline_lu.out
    │   ├── cuda_2d.in
    │   ├── cuda_2d.out
    │   ├── cuda_bl.in
    │   ├── cuda_bl.out
    │   ├── cuda_bl_lu.in
    │   ├── cuda_bl_lu.out
    │   ├── openmp.in
    │   ├── openmp.out
    │   ├── openmp_lu.in
    │   ├── openmp_lu.out
    │   ├── share_mem.in
    │   ├── share_mem.out
    │   ├── test0.in
    │   └── test0.out
    ├── readme.txt
    └── 并行程序设计_20337025_崔璨明.pdf
└── 并行程序设计_lab4
    ├── code
        ├── matrix_vector_mul.cu
        ├── matrix_vector_mul_v2.cu
        ├── matrix_vector_mul_v3.cu
        ├── matrix_vector_mul_v4.cu
        ├── read_data.h
        ├── readme.txt
        └── valid.cpp
    └── 并行程序设计_20337025_崔璨明.pdf


/README.md:
--------------------------------------------------------------------------------
1 | # SYSU_parallel_programming
2 | 中山大学计算机学院 并行程序设计与算法（课程记录）
3 | 
4 | 授课老师：陶钧,黄聃
5 | 


--------------------------------------------------------------------------------
/复习资料/并行复习笔记.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/复习资料/并行复习笔记.pdf


--------------------------------------------------------------------------------
/复习资料/并行程序设计导论.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/复习资料/并行程序设计导论.pdf


--------------------------------------------------------------------------------
/复习资料/课程要点-CUDA编程.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/复习资料/课程要点-CUDA编程.pdf


--------------------------------------------------------------------------------
/复习资料/课程要点-计算机体系结构-量化研究方法-第六版-第四章.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/复习资料/课程要点-计算机体系结构-量化研究方法-第六版-第四章.pdf


--------------------------------------------------------------------------------
/复习资料/课程要点-高性能并行程序设计.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/复习资料/课程要点-高性能并行程序设计.pdf


--------------------------------------------------------------------------------
/并行程序设计_lab0/gemm.c:
--------------------------------------------------------------------------------
 1 | #include<stdio.h>
 2 | #include<stdlib.h>
 3 | #include<time.h>
 4 | 
 5 | #define NUM 100
 6 | 
 7 | int M,N,K;
 8 | 
 9 | double A[2048][2048];
10 | double B[2048][2048];
11 | double C[2048][2048];
12 | 
13 | 
14 | 
15 | void init_Mat(int M,int N,int K){
16 | 
17 | 	srand(233);
18 | 
19 |     for (int m=0;m<M;m++){
20 | 		for(int n=0;n<N;n++){
21 | 			A[m][n]=(double)(rand()%1000/10.0);
22 | 			printf("%f",A[m][n]);
23 | 		}
24 | 	}
25 | 
26 | 	for (int n=0;n<N;n++){
27 | 		for(int k=0;k<K;k++){
28 | 			B[n][k]=(double)(rand()%1000/10.0);
29 | 		}
30 | 	}
31 | 
32 | 	for (int m=0;m<M;m++){
33 | 		for(int k=0;k<K;k++){
34 | 			C[m][k]=0;
35 | 		}
36 | 	}
37 | } 
38 | 
39 | 
40 | int main(){
41 |     clock_t start_time,end_time;
42 |     printf("input three integer(512 ~2048):\n");
43 |     scanf("%d %d %d",&M,&N,&K);
44 | 
45 | 	init_Mat(M,N,K);
46 | 
47 |     start_time=clock();
48 |     for(int m=0;m<M;m++){
49 | 		for(int k=0;k<K;k++){
50 | 			for(int n=0;n<N;n++){
51 | 				C[m][k]+=A[m][n]*B[n][k];
52 | 			}
53 | 		}
54 | 	}
55 |     end_time=clock();
56 | 
57 | 	printf("\n");
58 | 	printf("matrix_1:\n");
59 | 	for (int i = 0; i<M; i++){
60 | 		for(int j=0;j<N;j++){
61 | 			printf("%0.2f ",A[i][j]);
62 | 		}
63 | 		printf("\n");
64 | 	}
65 | 
66 | 	printf("\n");
67 | 	printf("matrix_2:\n");
68 | 	for (int i = 0; i<N; i++){
69 | 		for(int j=0;j<K;j++){
70 | 			printf("%0.2f ",B[i][j]);
71 | 		}
72 | 		printf("\n");
73 | 	}
74 | 
75 | 	printf("\n");
76 | 	printf("result:\n");
77 | 	for (int i = 0; i<M; i++){
78 | 		for(int j=0;j<K;j++){
79 | 			printf("%0.2f ",C[i][j]);
80 | 		}
81 | 		printf("\n");
82 | 	}
83 | 	
84 | 	printf("\n");
85 | 	printf("using time: %f  ms\n",(double)(end_time-start_time)*1000.0/CLOCKS_PER_SEC);
86 | 
87 |     return 0;
88 | }


--------------------------------------------------------------------------------
/并行程序设计_lab0/gemm.java:
--------------------------------------------------------------------------------
 1 | import java.util.Scanner;
 2 | import java.util.Random;
 3 | 
 4 | public class gemm {
 5 | public static void main(String[] args) {
 6 | 		Scanner input = new Scanner(System.in);
 7 | 		System.out.println("input M:");
 8 | 		int x = input.nextInt();
 9 | 		System.out.println("input N:");
10 | 		int y = input.nextInt();
11 | 		System.out.println("input K:");
12 | 		int z = input.nextInt();
13 | 
14 | 		double[][] a =new double[x][y];
15 | 		double[][] b =new double[y][z];
16 | 		double[][] c =new double[x][z];
17 | 		
18 |         Random random = new Random();
19 | 
20 | 		System.out.println();
21 |         System.out.println("matrix_1:");
22 |         for (int i = 0; i < a.length; i++) {
23 | 	        for (int j = 0; j < a[i].length; j++) {
24 | 		        a[i][j] =100* random.nextDouble();
25 | 		        System.out.print(a[i][j] + " ");
26 | 	        }  
27 | 	        System.out.println();
28 |         }
29 |    
30 | 		System.out.println();
31 |         System.out.println("matrix_2:");
32 |         for (int i = 0; i < b.length; i++) {
33 | 	        for (int j = 0; j < b[i].length; j++) {
34 | 		        b[i][j] = 100* random.nextDouble();
35 | 		        System.out.print(b[i][j] + " ");
36 | 	        }
37 | 	    System.out.println();
38 |         }
39 | 
40 | 		long start=System.currentTimeMillis();
41 | 
42 | 		for (int i = 0; i < a.length; i++) {
43 | 			for (int j = 0; j < a[i].length; j++) {
44 | 				for (int k = 0; k < b[j].length; k++) {
45 | 					c[i][k] += a[i][j] * b[j][k];
46 | 				}	
47 | 			}
48 | 		}
49 | 		long end=System.currentTimeMillis();
50 | 
51 | 		
52 | 		System.out.println();
53 |         System.out.println("result:");
54 | 		for (double[] row : c){
55 | 			for (double cloumn : row) {
56 | 				System.out.print(cloumn + " ");
57 | 			}
58 | 			System.out.println();
59 | 		}
60 | 		System.out.println("using time:"  + (end-start)+" ms");
61 | 		System.out.println();
62 | 	}
63 | }


--------------------------------------------------------------------------------
/并行程序设计_lab0/gemm.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import time
 3 | 
 4 | M,N,K=input("input three integer(512 ~2048):\n").split()
 5 | 
 6 | M=int(M)
 7 | N=int(N)
 8 | K=int(K)
 9 | 
10 | 
11 | x=[[100.0*random.random()
12 |                 for row in range(N)]
13 |                 for col in range(M)]
14 | 
15 | y=[[100.0*random.random()
16 |                 for row in range(K)]
17 |                 for col in range(N)]
18 | 
19 | z=[[0.0 for row in range(K)]
20 |                 for col in range(M)]
21 | 
22 | 
23 | 
24 | print("matrix_1:\n",x)
25 | print("matrix_2\n",y)
26 | 
27 | start = time.clock()
28 | 
29 | for m in range(M):
30 |     for k in range(K):
31 |         for n in range(N):
32 |             z[m][k]+=x[m][n]*y[n][k]
33 | 
34 | end = time.clock()
35 | 
36 | print("result:\n",z)
37 | 
38 | print("using time:",1000*(end-start),'ms\n' )
39 | 


--------------------------------------------------------------------------------
/并行程序设计_lab0/并行程序设计_20337025_崔璨明.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab0/并行程序设计_20337025_崔璨明.pdf


--------------------------------------------------------------------------------
/并行程序设计_lab1/lib_code/libmatrix_multiply.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab1/lib_code/libmatrix_multiply.so


--------------------------------------------------------------------------------
/并行程序设计_lab1/lib_code/matrix_multiply.c:
--------------------------------------------------------------------------------
 1 | #include "matrix_multiply.h"
 2 | #include<stdio.h>
 3 | #include<stdlib.h>
 4 | void matrix_multiply(double**A,double**B,double**C,int M,int N,int K){
 5 |     for(int m=0;m<M;m++){
 6 | 		for(int k=0;k<K;k++){
 7 | 			for(int n=0;n<N;n++){
 8 | 				C[m][k]+=A[m][n]*B[n][k];
 9 | 			}
10 | 		}
11 | 	}
12 | }


--------------------------------------------------------------------------------
/并行程序设计_lab1/lib_code/matrix_multiply.h:
--------------------------------------------------------------------------------
1 | #ifndef matrix_multiply_h
2 | #define matrix_multiply_h
3 | #include<stdio.h>
4 | #include<stdlib.h>
5 | void matrix_multiply(double**A,double**B,double**C,int M,int N,int K);
6 | #endif


--------------------------------------------------------------------------------
/并行程序设计_lab1/lib_code/readme.txt:
--------------------------------------------------------------------------------
1 | test.c文件为测试程序


--------------------------------------------------------------------------------
/并行程序设计_lab1/lib_code/test:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab1/lib_code/test


--------------------------------------------------------------------------------
/并行程序设计_lab1/lib_code/test.c:
--------------------------------------------------------------------------------
 1 | #include<stdio.h>
 2 | #include<stdlib.h>
 3 | #include<time.h>
 4 | #include "matrix_multiply.h"
 5 | #define NUM 100
 6 | 
 7 | int M,N,K;
 8 | 
 9 | 
10 | int main(){
11 |     clock_t start_time,end_time;
12 |     printf("input three integer(512 ~2048):\n");
13 |     scanf("%d %d %d",&M,&N,&K);
14 | 
15 | 
16 | 	double **A=(double**)malloc(sizeof(double*)*M);
17 | 	double **B=(double**)malloc(sizeof(double*)*N);
18 | 	double **C=(double**)malloc(sizeof(double*)*M);
19 | 
20 | 	for(int i=0;i<M;i++){
21 | 		A[i]=(double *)malloc(sizeof(double)*N);
22 | 	}
23 | 	for(int i=0;i<N;i++){
24 | 		B[i]=(double *)malloc(sizeof(double)*K);
25 | 	}
26 | 	for(int i=0;i<M;i++){
27 | 		C[i]=(double *)malloc(sizeof(double)*K);
28 | 	}
29 | 
30 | 	srand(233);
31 |     for (int m=0;m<M;m++){
32 | 		for(int n=0;n<N;n++){
33 | 			A[m][n]=(double)(rand()%1000/10.0);
34 | 			//printf("%f",A[m][n]);
35 | 		}
36 | 	}
37 | 	for (int n=0;n<N;n++){
38 | 		for(int k=0;k<K;k++){
39 | 			B[n][k]=(double)(rand()%1000/10.0);
40 | 		}
41 | 	}
42 | 	for (int m=0;m<M;m++){
43 | 		for(int k=0;k<K;k++){
44 | 			C[m][k]=0;
45 | 		}
46 | 	}
47 | 
48 |     start_time=clock();
49 | 
50 |     matrix_multiply(A,B,C,M,N,K);
51 | 
52 |     end_time=clock();
53 | 
54 | 	printf("\n");
55 | 	printf("matrix_1:\n");
56 | 	for (int i = 0; i<M; i++){
57 | 		for(int j=0;j<N;j++){
58 | 			printf("%0.2f ",A[i][j]);
59 | 		}
60 | 		printf("\n");
61 | 	}
62 | 
63 | 	printf("\n");
64 | 	printf("matrix_2:\n");
65 | 	for (int i = 0; i<N; i++){
66 | 		for(int j=0;j<K;j++){
67 | 			printf("%0.2f ",B[i][j]);
68 | 		}
69 | 		printf("\n");
70 | 	}
71 | 
72 | 	printf("\n");
73 | 	printf("result:\n");
74 | 	for (int i = 0; i<M; i++){
75 | 		for(int j=0;j<K;j++){
76 | 			printf("%0.2f ",C[i][j]);
77 | 		}
78 | 		printf("\n");
79 | 	}
80 | 	
81 | 	printf("\n");
82 | 	printf("using time: %f  ms\n",(double)(end_time-start_time)*1000.0/CLOCKS_PER_SEC);
83 | 
84 |     return 0;
85 | }
86 | 
87 | 


--------------------------------------------------------------------------------
/并行程序设计_lab1/mpi_gemm_1.cpp:
--------------------------------------------------------------------------------
  1 | //p2p
  2 | 
  3 | #include<mpi.h>
  4 | #include<stdio.h>
  5 | #include<stdlib.h>
  6 | #include<iostream>
  7 | using namespace std;
  8 | 
  9 | // print matrix
 10 | void print_mat(int row,int col,double * matrix){
 11 | 	for(int i=0;i<row;i++){
 12 | 				for(int j=0;j<col;j++){
 13 | 					printf("%.2f \t",matrix[i*row+j]);
 14 | 				}
 15 | 				cout<<endl;
 16 | 			}
 17 | }
 18 | 
 19 | //init matrix
 20 | void init_Mat(int row, int col,double* mat){
 21 | 	for (int m=0;m<row;m++){
 22 | 		for (int n=0;n<col;n++){
 23 | 			mat[m*col+n]=(double)(rand()%1000/10.0);
 24 | 		}
 25 | 	}
 26 | }
 27 | 
 28 | int main(int argc, char * argv[] ){
 29 |     int M=atoi(argv[1]);
 30 | 	int N=atoi(argv[2]);
 31 | 	int K=atoi(argv[3]);
 32 |     double *b= new double [ N* K ];
 33 | 	double *result = new double [ M * K ];
 34 | 	double *a=NULL,*c=NULL;
 35 | 	int pid, process_num, line;
 36 | 
 37 |     MPI_Init(NULL,NULL);//Initialize
 38 |     MPI_Comm_rank(MPI_COMM_WORLD,&pid);//process id
 39 |     MPI_Comm_size(MPI_COMM_WORLD,&process_num);//num of process
 40 | 
 41 | 	line = M/process_num;//divide data
 42 | 	srand(233);
 43 | 
 44 | 	//main process
 45 | 	if(pid==0){
 46 | 			a=new double[M*N];
 47 | 			c=new double[M*K];
 48 | 			init_Mat(M,N,a);
 49 | 			init_Mat(N,K,b);
 50 | 			cout<<"Matrix A:"<<endl;
 51 | 			print_mat(M,N,a);
 52 | 			cout<<"Matrix B:"<<endl;
 53 | 			print_mat(N,K,b);
 54 | 
 55 | 			double start_time;
 56 | 			double end_time;
 57 | 			start_time=MPI_Wtime();
 58 | 			//send matrix N to sub processes
 59 | 			for (int i=1;i<process_num;i++){
 60 | 				MPI_Send(b,N*K,MPI_DOUBLE,i,0,MPI_COMM_WORLD);
 61 | 			}
 62 | 			// send each row of A to sub processes
 63 | 			for (int i=1;i<process_num;i++){
 64 | 				MPI_Send(a+(i-1)*line*N,N*line,MPI_DOUBLE,i,1,MPI_COMM_WORLD);
 65 | 			}
 66 | 			// receive result
 67 | 			for (int i=1;i<process_num;i++){
 68 | 				MPI_Recv(result,line*K,MPI_DOUBLE,i,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
 69 | 				for(int l=0;l<line;l++){
 70 | 					for(int k=0;k<K;k++){
 71 | 						c[((i-1)*line+l)*K+k]=result[l*K+k];
 72 | 					}
 73 | 				}
 74 | 			}
 75 | 
 76 | 			for (int i=(process_num-1)*line;i<M;i++){
 77 |             	for (int j=0;j<K;j++){
 78 |                 	double tmp=0;
 79 |                 	for (int k=0;k<N;k++)
 80 |                     	tmp += a[i*N+k]*b[k*K+j];
 81 |                 		c[i*K+j] = tmp;
 82 |             	}
 83 |         	}
 84 | 			end_time=MPI_Wtime();
 85 | 			double using_time=end_time-start_time;
 86 | 			cout<<"Matrix C:"<<endl;
 87 | 			print_mat(M,K,c);
 88 | 			cout<<"using time:"<<using_time<<endl;
 89 | 	}
 90 | 
 91 | 	//sub process
 92 | 	else{
 93 | 		double* temp = new double [ N * line ];
 94 | 		MPI_Recv(b,N*K,MPI_DOUBLE,0,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
 95 | 
 96 |         MPI_Recv(temp,N*line,MPI_DOUBLE,0,1,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
 97 | 		for(int i=0;i<line;i++){
 98 | 			for(int j=0;j<N;j++){
 99 | 				double tmp=0;
100 | 				for(int k=0;k<N;k++)
101 | 					tmp += temp[i*N+k]*b[k*K+j];
102 | 				result[i*K+j] = tmp;
103 | 			}
104 | 		}
105 | 		MPI_Send(result, line*K, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
106 | 	}
107 | 	MPI_Finalize();
108 | 	return 0;
109 | }


--------------------------------------------------------------------------------
/并行程序设计_lab1/mpi_gemm_2.cpp:
--------------------------------------------------------------------------------
  1 | //collective communication
  2 | 
  3 | #include<mpi.h>
  4 | #include<stdio.h>
  5 | #include<stdlib.h>
  6 | #include<iostream>
  7 | using namespace std;
  8 | 
  9 | 
 10 | 
 11 | // print matrix
 12 | void print_mat(int row,int col,double * matrix){
 13 | 	for(int i=0;i<row;i++){
 14 | 				for(int j=0;j<col;j++){
 15 | 					printf("%.2f \t",matrix[i*row+j]);
 16 | 				}
 17 | 				cout<<endl;
 18 | 			}
 19 | }
 20 | 
 21 | //init matrix
 22 | void init_Mat(int row, int col,double* mat){
 23 | 	for (int m=0;m<row;m++){
 24 | 		for (int n=0;n<col;n++){
 25 | 			mat[m*col+n]=(double)(rand()%1000/10.0);
 26 | 		}
 27 | 	}
 28 | }
 29 | 
 30 | int main(int argc, char * argv[] ){
 31 |     int M=atoi(argv[1]);
 32 | 	int N=atoi(argv[2]);
 33 | 	int K=atoi(argv[3]);
 34 |     double* a = new double [M*N];
 35 | 	double* b = new double [N*K];
 36 | 	double* c = new double [M*K];
 37 |     int pid, process_num, line;
 38 | 
 39 | 	MPI_Init(NULL,NULL);//Initialize
 40 |     MPI_Comm_rank(MPI_COMM_WORLD,&pid);//process id
 41 |     MPI_Comm_size(MPI_COMM_WORLD,&process_num);//num of process
 42 | 	
 43 | 	line = M/process_num;//divide data
 44 | 	srand(233);
 45 | 	double * local_matrix = new double [line*N];
 46 | 	double * result = new double [M*K];
 47 | 
 48 |     //main process
 49 | 	if(pid==0){
 50 |         init_Mat(M,N,a);
 51 |         init_Mat(N,K,b);
 52 |         cout<<"Matrix A:"<<endl;
 53 | 		print_mat(M,N,a);
 54 | 		cout<<"Matrix B:"<<endl;
 55 | 		print_mat(N,K,b);
 56 | 
 57 | 
 58 |         double start_time;
 59 | 		double end_time;
 60 | 		start_time=MPI_Wtime();
 61 |         //send divition of a to sub process
 62 |         MPI_Scatter(a, line*N, MPI_DOUBLE, local_matrix, line*N, MPI_DOUBLE, 0, MPI_COMM_WORLD );
 63 |         //broadcast b to every process
 64 | 		MPI_Bcast(b, N*K, MPI_DOUBLE, 0, MPI_COMM_WORLD);
 65 |         //calculate local results
 66 |         for(int i= 0; i< M;i++){
 67 | 			for(int j=0;j<N;j++){
 68 | 				double tmp = 0;
 69 | 				for(int k=0;k<N;k++)
 70 | 					tmp += a[i*N+k] * b[k*K+ j];
 71 | 				result[i*K+ j ] = tmp;
 72 | 			}
 73 | 		}
 74 | 		//wait all
 75 | 		MPI_Barrier() ;
 76 |         //Collect data into a process
 77 |         MPI_Gather( result, line*K, MPI_DOUBLE, c, line*K, MPI_DOUBLE, 0, MPI_COMM_WORLD );
 78 |         //calculate the remain
 79 | 		for(int i = (process_num-1)*line;i<M;i++){
 80 | 			for(int j=0;j<N;j++){
 81 | 				double tmp = 0;
 82 | 				for(int k=0;k<N;k++)
 83 | 					tmp += a[i*N+k]*b[k*K+j];
 84 | 				c[i*K+j] = tmp;
 85 | 			}
 86 | 		}
 87 | 
 88 |         end_time=MPI_Wtime();
 89 | 		double using_time=end_time-start_time;
 90 | 		cout<<"Matrix C:"<<endl;
 91 | 		print_mat(M,K,c);
 92 | 		cout<<"using time:"<<using_time<<endl;
 93 |     }
 94 | 
 95 |     else{
 96 |         double * temp= new double [ N * line ];
 97 | 		MPI_Scatter(a, line*N, MPI_DOUBLE, temp, line*N, MPI_DOUBLE, 0, MPI_COMM_WORLD );
 98 | 		MPI_Bcast( b, N* K, MPI_DOUBLE, 0, MPI_COMM_WORLD );
 99 | 		for(int i=0;i<line;i++){
100 | 			for(int j=0;j<N;j++){
101 | 				double tmp=0;
102 | 				for(int k=0;k<N;k++)
103 | 					tmp += temp[i*N+k]*b[k*K+j];
104 | 				result[i*K+j] = tmp;
105 | 			}
106 | 		}
107 | 		MPI_Gather(result, line*K, MPI_DOUBLE, c, line*K, MPI_DOUBLE, 0, MPI_COMM_WORLD );
108 |     }
109 |     MPI_Finalize();
110 |     return 0;    
111 | }


--------------------------------------------------------------------------------
/并行程序设计_lab1/readme.txt:
--------------------------------------------------------------------------------
1 | mpi_gemm_1.cpp是点对点通信的MPI矩阵乘法程序
2 | mpi_gemm_2.cpp是集合通信的MPI矩阵乘法程序
3 | lib_code中为生成动态链接库的程序


--------------------------------------------------------------------------------
/并行程序设计_lab1/并行程序设计_20337025_崔璨明.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab1/并行程序设计_20337025_崔璨明.pdf


--------------------------------------------------------------------------------
/并行程序设计_lab2/Monte_carlo.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <pthread.h>
 3 | #include <random>
 4 | 
 5 | #define THREAD_COUNT 8  // 线程数量
 6 | #define ITERATIONS 1000000  // 迭代次数
 7 | 
 8 | double sum = 0;  
 9 | pthread_mutex_t lock;  //互斥锁
10 | 
11 | void *calculate_area(void *thread_id_ptr) {
12 |     int thread_id = *(int *) thread_id_ptr;  // 获取线程ID
13 |     std::random_device rd; 
14 |     std::mt19937 gen(rd());  
15 |     std::uniform_real_distribution<> dis(0, 1);  
16 |     double local_sum = 0;  
17 | 
18 |     for (int i = 0; i < ITERATIONS; i++) { 
19 |         double x = dis(gen);  
20 |         double y = dis(gen);  
21 |          //  (x, y) 在 y=x^2 曲线下方，需要计入面积
22 |         if (y <= x * x) { 
23 |             local_sum++;
24 |         }
25 |     }
26 | 
27 |     //将当前线程的计算结果加到 sum 变量中
28 |     pthread_mutex_lock(&lock); 
29 |     sum += local_sum / ITERATIONS;  
30 |     pthread_mutex_unlock(&lock); 
31 | 
32 |     pthread_exit(NULL);
33 | }
34 | 
35 | int main() {
36 |     pthread_t threads[THREAD_COUNT];  // 创建线程数组
37 |     int thread_ids[THREAD_COUNT];  // 创建线程 ID 数组
38 |     pthread_mutex_init(&lock, NULL);  // 初始化互斥锁
39 | 
40 |       // 创建线程并执行计算
41 |     for (int i = 0; i < THREAD_COUNT; i++) {
42 |         thread_ids[i] = i;  
43 |         pthread_create(&threads[i], NULL, calculate_area, &thread_ids[i]);  
44 |     }
45 | 
46 |     // 等待所有线程完成计算
47 |     for (int i = 0; i < THREAD_COUNT; i++) {  
48 |         pthread_join(threads[i], NULL);  
49 |     }
50 | 
51 |     pthread_mutex_destroy(&lock);  
52 |     std::cout << "function: y=x^2\nEstimated area: " << sum / THREAD_COUNT << std::endl;  // 输出计算结果
53 |     return 0;
54 | }
55 | 


--------------------------------------------------------------------------------
/并行程序设计_lab2/code/libparallel_for.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab2/code/libparallel_for.so


--------------------------------------------------------------------------------
/并行程序设计_lab2/code/parallel_for.cpp:
--------------------------------------------------------------------------------
 1 | #include"parallel_for.h"
 2 | #include<stdlib.h>
 3 | #include <stdio.h>
 4 | #include <pthread.h>
 5 | #include<iostream>
 6 | 
 7 | //并行循环函数parallel_for
 8 | void parallel_for(int start, int end, int increment, void *(*functor)(void *), void *arg, int num_threads){
 9 |     pthread_t *threads = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
10 |     for_index *index_arr = (for_index *)malloc(num_threads * sizeof(for_index));
11 | 
12 |     //每个线程要处理的数据块大小
13 |     int block = (end - start) / num_threads;
14 |     //为每个线程分配参数
15 |     for (int i = 0; i < num_threads; i++){
16 |         index_arr[i].args = arg;
17 |         index_arr[i].start = start + i * block;
18 |         index_arr[i].end = index_arr[i].start + block;
19 |         //处理最后一个线程的数据块
20 |         if (i == (num_threads - 1))
21 |             index_arr[i].end = end;
22 |         index_arr[i].increment = increment;
23 |         pthread_create(&threads[i], NULL, functor, (void *)(index_arr + i));
24 |     }
25 |     //等待所有线程执行完成
26 |     for (int thread = 0; thread < num_threads; thread++)
27 |         pthread_join(threads[thread], NULL);
28 |     free(threads);
29 |     free(index_arr);
30 | }


--------------------------------------------------------------------------------
/并行程序设计_lab2/code/parallel_for.h:
--------------------------------------------------------------------------------
 1 | #ifndef PARALLEL_FOR_H
 2 | #define PARALLEL_FOR_H
 3 | #include <pthread.h>
 4 | 
// Parameters describing one slice of a for loop.
struct for_index{
    void *args; // opaque user argument
    int start; // start index
    int end; // end index
    int increment; // step
};
12 | void parallel_for(int start, int end, int increment, void *(*functor)(void *), void *arg, int num_threads);
13 | 
14 | #endif
15 | 


--------------------------------------------------------------------------------
/并行程序设计_lab2/code/parallel_for.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab2/code/parallel_for.o


--------------------------------------------------------------------------------
/并行程序设计_lab2/code/test:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab2/code/test


--------------------------------------------------------------------------------
/并行程序设计_lab2/code/test.cpp:
--------------------------------------------------------------------------------
  1 | #include <pthread.h>
  2 | 
  3 | #include<iostream>
  4 | #include<stdio.h>
  5 | #include<stdlib.h>
  6 | #include"parallel_for.h"
  7 | 
  8 | using namespace std;
  9 | 
 10 | int M,N,K;
 11 | int Thread_NUM =4;
 12 | double *A;
 13 | double *B;
 14 | double *C;
 15 | 
 16 | //print matrix
 17 | void print_mat(int row,int col,double * matrix){
 18 | 	for(int i=0;i<row;i++){
 19 | 				for(int j=0;j<col;j++){
 20 | 					printf("%.2f \t",matrix[i*row+j]);
 21 | 				}
 22 | 				cout<<endl;
 23 | 			}
 24 | }
 25 | 
 26 | //init matrix
 27 | void init_Mat(int M,int N,int K){
 28 | 	srand(243);
 29 |     A = new double [M*N];
 30 |     B = new double [N*K];
 31 |     C = new double [M*K];
 32 |     for (int m=0;m<M;m++){
 33 | 		for(int n=0;n<N;n++){
 34 | 			A[m*M+n]=(double)(rand()%1000/10.0);
 35 | 		}
 36 | 	}
 37 | 	for (int n=0;n<N;n++){
 38 | 		for(int k=0;k<K;k++){
 39 | 			B[n*N+k]=(double)(rand()%1000/10.0);
 40 | 		}
 41 | 	}
 42 | 	for (int m=0;m<M;m++){
 43 | 		for(int k=0;k<K;k++){
 44 | 			C[m*M+k]=0;
 45 | 		}
 46 | 	}
 47 | } 
 48 | 
// Argument bundle for the loop body: the three matrices and their dimensions.
struct args{
    double*A; // left operand
    double *B; // right operand
    double *C; // result matrix
    int m;
    int n;
    int k;
};
 58 | 
 59 | 
 60 | void *gemm_fun(void *args){
 61 |     struct for_index *idx = (struct for_index *)args;
 62 |     struct args *matrix = (struct args *)(idx->args);
 63 | 
 64 |     int K=matrix->k;
 65 |     int N=matrix->n;
 66 | 
 67 |     for (int m = idx->start; m < idx->end; m = m + idx->increment){
 68 |         for (int k = 0; k < K; k++){
 69 |             matrix->C[m * K + k] =0;
 70 |             for (int n = 0; n < N; n++){
 71 |                 matrix->C[m * K + k]  += matrix->A[m * N + n] * matrix->B[n* K + k];
 72 |             }
 73 |         }
 74 |     }
 75 |     return NULL;
 76 | }
 77 | 
 78 | int main(int argc, char *argv[])
 79 | {
 80 |     int M=atoi(argv[1]);
 81 | 	int N=atoi(argv[2]);
 82 | 	int K=atoi(argv[3]);
 83 |     Thread_NUM=atoi(argv[4]);
 84 | 
 85 |     init_Mat(M,N,K);
 86 | 
 87 |     struct args *arg = new args();
 88 |     arg->A=A;
 89 |     arg->B=B;
 90 |     arg->C=C;
 91 |     arg->n=N;
 92 |     arg->m=M;
 93 |     arg->k=K;
 94 | 
 95 |     clock_t start_time=clock();
 96 |     parallel_for(0, M, 1, gemm_fun, arg, Thread_NUM);
 97 |     clock_t end_time=clock();
 98 |     double using_time=(double)(end_time-start_time)/CLOCKS_PER_SEC;
 99 | 
100 |     cout<<"result:"<<endl;
101 |     print_mat(M,K,C);
102 |     cout<<"uisng time:"<<using_time<<" s"<<endl;
103 |     return 0;
104 | }


--------------------------------------------------------------------------------
/并行程序设计_lab2/gemm_openmp.cpp:
--------------------------------------------------------------------------------
  1 | #include<iostream>
  2 | #include<stdio.h>
  3 | #include<stdlib.h>
  4 | #include<ctime>
  5 | #include <omp.h>
  6 | #include <random>
  7 | 
  8 | 
  9 | 
 10 | int Thread_NUM =4;
 11 | 
 12 | using namespace std;
 13 | 
 14 | int M,N,K;
 15 | 
 16 | double **A;
 17 | double **B;
 18 | double **C;
 19 | 
 20 | // init matrix
 21 | void init_Mat(int M,int N,int K){
 22 | 	srand(243);
 23 |     A = new double* [M];
 24 |     B = new double* [N];
 25 |     C = new double* [M];
 26 |     for (int m=0;m<M;m++){
 27 |         A[m]=new double[N];
 28 | 		for(int n=0;n<N;n++){
 29 | 			A[m][n]=(double)(rand()%1000/10.0);
 30 | 		}
 31 | 	}
 32 | 	for (int n=0;n<N;n++){
 33 |         B[n]=new double [K];
 34 | 		for(int k=0;k<K;k++){
 35 | 			B[n][k]=(double)(rand()%1000/10.0);
 36 | 		}
 37 | 	}
 38 | 	for (int m=0;m<M;m++){
 39 |         C[m]=new double[K];
 40 | 		for(int k=0;k<K;k++){
 41 | 			C[m][k]=0;
 42 | 		}
 43 | 	}
 44 | } 
 45 | 
 46 | //use openmp
 47 | void parallel_gemm(){
 48 |     //默认调度
 49 |     //#pragma omp parallel for num_threads(Thread_NUM)
 50 |     //静态调度
 51 |     #pragma omp parallel for num_threads(Thread_NUM)\
 52 |     schedule(static, 1)
 53 |     //动态调度
 54 |     //#pragma omp parallel for num_threads(Thread_NUM)\
 55 |     //schedule(dynamic, 1)
 56 | 
 57 |     for(int m=0;m<M;m++){
 58 |         for(int n=0;n<N;n++){
 59 |             for(int k=0;k<K;k++){
 60 |                 C[m][k]+=A[m][n]*B[n][k];
 61 |             }
 62 |         }
 63 |     }
 64 | }
 65 | 
 66 | //serial gemm
 67 | void serial_gemm(){
 68 |     for(int m=0;m<M;m++){
 69 |         for(int n=0;n<N;n++){
 70 |             for(int k=0;k<K;k++){
 71 |                 C[m][k]+=A[m][n]*B[n][k];
 72 |             }
 73 |         }
 74 |     }
 75 | }
 76 | 
 77 | // print matrix
 78 | void print_mat(int row,int col,double ** matrix){
 79 | 	for(int i=0;i<row;i++){
 80 | 				for(int j=0;j<col;j++){
 81 | 					printf("%.2f \t",matrix[i][j]);
 82 | 				}
 83 | 				cout<<endl;
 84 | 			}
 85 | }
 86 | 
 87 | 
 88 | 
 89 | int main(int argc, char * argv[] ){
 90 |     // 矩阵 A、B 和 C 的维度以及线程数量
 91 |     M=atoi(argv[1]);
 92 |     N=atoi(argv[2]);
 93 |     K=atoi(argv[3]);
 94 |     Thread_NUM=atoi(argv[4]);
 95 | 
 96 |     clock_t start_time1,end_time1,start_time2,end_time2;
 97 | 
 98 |     init_Mat(M,N,K);
 99 |     start_time1=clock();
100 |     serial_gemm();
101 |     end_time1=clock();
102 |     double using_time1=(double)(end_time1-start_time1)/CLOCKS_PER_SEC;
103 | 
104 |     init_Mat(M,N,K);
105 |     start_time2=clock();
106 |     parallel_gemm();
107 |     end_time2=clock();
108 |     double using_time2=(double)(end_time2-start_time2)/CLOCKS_PER_SEC;
109 |     
110 |     cout<<"result:"<<endl;
111 |     print_mat(M,K,C);
112 |     cout<<"normal gemm uisng time:"<<using_time1<<" s"<<endl;
113 |     cout<<"openmp gemm uisng time:"<<using_time2<<" s"<<endl;
114 |     return 0;
115 |  }


--------------------------------------------------------------------------------
/并行程序设计_lab2/gemm_p.cpp:
--------------------------------------------------------------------------------
  1 | #include<stdio.h>
  2 | #include<stdlib.h>
  3 | #include<pthread.h>
  4 | #include<iostream>
  5 | 
  6 | using namespace std;
  7 | 
// Matrix dimensions: A is M x N, B is N x K and C is M x K (see init_Mat).
int M,N,K;

// Row-pointer matrices allocated by init_Mat; gemm() stores A * B into C.
double **A;
double **B;
double **C;

// Number of worker threads; set from argv[4] in main and read by gemm().
int thread_count;
 15 | 
 16 | void init_Mat(int M,int N,int K){
 17 | 	srand(243);
 18 |     A = new double* [M];
 19 |     B = new double* [N];
 20 |     C = new double* [M];
 21 |     for (int m=0;m<M;m++){
 22 |         A[m]=new double[N];
 23 | 		for(int n=0;n<N;n++){
 24 | 			A[m][n]=(double)(rand()%1000/10.0);
 25 | 		}
 26 | 	}
 27 | 	for (int n=0;n<N;n++){
 28 |         B[n]=new double [K];
 29 | 		for(int k=0;k<K;k++){
 30 | 			B[n][k]=(double)(rand()%1000/10.0);
 31 | 		}
 32 | 	}
 33 | 	for (int m=0;m<M;m++){
 34 |         C[m]=new double[K];
 35 | 		for(int k=0;k<K;k++){
 36 | 			C[m][k]=0;
 37 | 		}
 38 | 	}
 39 | } 
 40 | 
 41 | // print matrix
 42 | void print_mat(int row,int col,double ** matrix){
 43 | 	for(int i=0;i<row;i++){
 44 | 				for(int j=0;j<col;j++){
 45 | 					printf("%.2f \t",matrix[i][j]);
 46 | 				}
 47 | 				cout<<endl;
 48 | 			}
 49 | }
 50 | 
 51 | void *gemm(void *rank) 
 52 | {
 53 |     // 获取线程编号
 54 |     int p_rank = (long)rank;
 55 |     // 指定每个线程要计算的行数
 56 |     int p_first_row, p_end_row; 
 57 |     int quotient = M / thread_count;
 58 |     int remainder = M % thread_count;
 59 |     int p_cols=0;
 60 |     if (p_rank < remainder)
 61 |     {
 62 |         p_cols = quotient + 1;
 63 |         p_first_row = p_rank * p_cols;
 64 |     }
 65 |     else
 66 |     {
 67 |         p_cols = quotient;
 68 |         p_first_row = p_rank * p_cols + remainder;
 69 |     }
 70 |     p_end_row = p_first_row + p_cols;
 71 | 
 72 |     // 执行矩阵乘法
 73 |     for (int m = p_first_row; m < p_end_row; m++)
 74 |     {
 75 |         for (int k= 0; k < K; k++)
 76 |         {
 77 |             C[m][k] = 0;
 78 |             for (int n = 0; n < N; n++)
 79 |             {
 80 |                 C[m][k] += A[m][n] * B[n][k];
 81 |             }
 82 |         }
 83 |     }
 84 |     return NULL;
 85 | }
 86 | 
 87 | int main(int argc, char * argv[] ){
 88 |     // 矩阵 A、B 和 C 的维度以及线程数量
 89 |     M=atoi(argv[1]);
 90 |     N=atoi(argv[2]);
 91 |     K=atoi(argv[3]);
 92 |     thread_count = atoi(argv[4]);
 93 | 
 94 |     // 初始化
 95 |     init_Mat(M,N,K);
 96 | 
 97 |     cout<<"matrix A:"<<endl;
 98 |     print_mat(M,N,A);
 99 |     cout<<"matrix B:"<<endl;
100 |     print_mat(N,K,B);
101 | 
102 |     pthread_t *thread_handles;
103 |     // 创建一个线程数组，大小为线程数量
104 |     thread_handles = (pthread_t *)malloc(thread_count * sizeof(pthread_t)); 
105 | 
106 |     clock_t start_time=clock();
107 |     for (int t = 0; t < thread_count; t++)          
108 |     {
109 |         // 创建一个线程，并指定该线程要执行的函数为 gemm
110 |         // 将线程编号 t 传递给 gemm 函数
111 |         pthread_create(&thread_handles[t], NULL, gemm, (void *)t);
112 |     }
113 |     // 等待所有线程执行完成
114 |     for (int t = 0; t < thread_count; t++) 
115 |     {
116 |         pthread_join(thread_handles[t], NULL);
117 |     }
118 |     
119 |     clock_t end_time=clock();
120 |     double using_time=(double)(end_time-start_time)/CLOCKS_PER_SEC;
121 | 
122 |     cout<<"result:"<<endl;
123 |     print_mat(M,K,C);
124 |     cout<<"uisng time:"<<using_time<<" s"<<endl;
125 | 
126 |     free(thread_handles);
127 | 
128 |     return 0;
129 | }
130 | 
131 | 
132 | 
133 | 


--------------------------------------------------------------------------------
/并行程序设计_lab2/readme.txt:
--------------------------------------------------------------------------------
1 | gemm_p.cpp：1、通过 Pthreads实现通用矩阵乘法
2 | Monte_carlo.cpp：2、编写一个多线程程序来实现面积计算
3 | gemm_openmp.cpp：3、4、通过OpenMP实现通用矩阵乘法，并采用不同调度
4 | code文件夹：构造基于Pthreads的并行for循环分解、分配和执行机制。
5 | 


--------------------------------------------------------------------------------
/并行程序设计_lab2/并行程序设计_20337025_崔璨明.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab2/并行程序设计_20337025_崔璨明.pdf


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/baseline.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <cmath>
  3 | #include <vector>
  4 | #include <cstdlib>
  5 | #include <ctime>
  6 | #include <fstream>
  7 | #include <iomanip>
  8 | #include "deal_binary.h"
  9 | using std::string;
 10 | 
// Candidate problem sizes; not referenced by any live code in this file
// (presumably kept for timing runs - TODO confirm).
const int ARRAY_SIZE[] = {5, 16, 128, 1024, 2048, 3000,4000};
// Random elements are drawn as rand() % ELEMENT_RANGE, and the histogram in
// calculateEntropy has ELEMENT_RANGE bins.
const int ELEMENT_RANGE = 16;
// Side length of the square window examined around each element.
const int WINDOW_SIZE = 5;
// Number of entries in ARRAY_SIZE.
const int NUM_ARRAYS = sizeof(ARRAY_SIZE) / sizeof(ARRAY_SIZE[0]);
// Half the window side; calculateEntropy spells out WINDOW_SIZE / 2 itself
// rather than reading this constant.
const int KERNEL_RADIUS = WINDOW_SIZE / 2;
 16 | 
 17 | // 生成随机二维数组
 18 | void generateRandomArray(std::vector<std::vector<int> >& array,int row,int col) {
 19 |     array.resize(row);
 20 |     for(int i=0;i<row;i++){
 21 |         array[i].resize(col);
 22 |     }
 23 | 	srand(static_cast<unsigned int>(8234));
 24 |     for (int i = 0; i < row; i++) {
 25 |         for (int j = 0; j < col; j++) {
 26 |             array[i][j] = rand() % ELEMENT_RANGE;
 27 |         }
 28 |         //printf("\n");
 29 |     }
 30 | }
 31 | 
 32 | 
 33 | // 计算熵
 34 | float calculateEntropy(const std::vector<std::vector<int> >& array, int x, int y) {
 35 |     std::vector<int> counts(ELEMENT_RANGE, 0);
 36 | 
 37 |     int startX = std::max(0, x - WINDOW_SIZE / 2);
 38 |     int startY = std::max(0, y - WINDOW_SIZE / 2);
 39 |     int endX = std::min(static_cast<int>(array.size()) - 1, x + WINDOW_SIZE / 2);
 40 |     int endY = std::min(static_cast<int>(array[0].size()) - 1, y + WINDOW_SIZE / 2);
 41 | 
 42 | 	//printf("(%d %d),(%d %d)\n",startX,startY,endX,endY);
 43 | 
 44 |     for (int i = startX; i <= endX; i++) {
 45 |         for (int j = startY; j <= endY; j++) {
 46 |             counts[array[i][j]]++;
 47 |         }
 48 |     }
 49 | 
 50 |     float entropy = 0.0;
 51 |     int windowSize = (endX - startX + 1) * (endY - startY + 1);
 52 |     //printf("%d\n",windowSize);
 53 |     for (int i = 0; i < ELEMENT_RANGE; i++) {
 54 |         float probability = float(counts[i]) / windowSize;
 55 |         if (counts[i]!=0) {
 56 |         	//printf("%d ",i);
 57 |             entropy -= probability * log2(probability);
 58 |         }
 59 |     }
 60 | 	//printf("\n");
 61 |     return entropy;
 62 | }
 63 | 
 64 | int main() {
 65 |     // 设置随机种子
 66 |     srand(static_cast<unsigned int>(time(NULL)));
 67 |     std::vector<std::vector<int> > array;
 68 | 
 69 |     std::vector<float> res;
 70 |     read(array,res,"test/test.in","test/test.out");
 71 | 
 72 |     int row=array.size();
 73 |     int col=array[0].size();
 74 | 
 75 |     //int row=200;
 76 |     //int col=500;
 77 |     //generateRandomArray(array,row,col);
 78 |     std::vector<std::vector<float> > entropyArray;
 79 |     entropyArray.resize(row);
 80 |     for(int i=0;i<row;i++){
 81 |         entropyArray[i].resize(col);
 82 |     }
 83 | 
 84 |     for (int x = 0; x < row; x++) {
 85 |         for (int y = 0; y < col; y++) {
 86 |             entropyArray[x][y] = calculateEntropy(array, x, y);
 87 |         }
 88 |     }
 89 |     /*
 90 |     for (int x = 0; x < row; x++) {
 91 |         for (int y = 0; y < col; y++) {
 92 |             std::cout <<  entropyArray[x][y] << " ";
 93 |         }
 94 |         std::cout << std::endl;
 95 |     }
 96 |     std::cout << std::endl;
 97 |     */
 98 |     //std::cout<<entropyArray.size()<<entropyArray[0].size()<<std::endl;
 99 |     write(array,entropyArray,"output/share_mem.in","output/share_mem.out");
100 | /*
101 |     for (int i = 0; i <NUM_ARRAYS; i++) {
102 |         std::vector<std::vector<int> > array;
103 |         generateRandomArray(array, ARRAY_SIZE[i]);
104 | 
105 |         std::vector<std::vector<float> > entropyArray(ARRAY_SIZE[i], std::vector<float>(ARRAY_SIZE[i]));
106 | 		
107 | 		clock_t start, finish;
108 |     	//clock_t为CPU时钟计时单元数
109 |     	start = clock();
110 |         
111 | 		for (int x = 0; x < ARRAY_SIZE[i]; x++) {
112 |             for (int y = 0; y < ARRAY_SIZE[i]; y++) {
113 |                 entropyArray[x][y] = calculateEntropy(array, x, y);
114 |             }
115 |         }
116 | 		
117 | 		finish=clock();
118 |         // 输出结果
119 |         std::cout << "Array Size: " << ARRAY_SIZE[i] << " using time: "<<1000*float(finish - start) / CLOCKS_PER_SEC<<" ms"<<std::endl;
120 |         
121 | 		
122 | 		for (int x = 0; x < ARRAY_SIZE[i]; x++) {
123 |             for (int y = 0; y < ARRAY_SIZE[i]; y++) {
124 |                 //std::cout << std::fixed << std::setprecision(5) << entropyArray[x][y] << " ";
125 |             }
126 |             //std::cout << std::endl;
127 |         }
128 |         std::cout << std::endl;
129 |     }
130 | */
131 |     return 0;
132 | }
133 | 


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/baseline_lu.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <cmath>
  3 | #include <vector>
  4 | #include <cstdlib>
  5 | #include <ctime>
  6 | #include <iomanip>
  7 | #include "deal_binary.h"
  8 | using std::string;
  9 | 
 10 | 
// Candidate problem sizes; not referenced by any live code in this file
// (presumably kept for timing runs - TODO confirm).
const int ARRAY_SIZE[] = {5, 16, 128, 1024, 2048, 3000,4000};
// Random elements are drawn as rand() % ELEMENT_RANGE, and the histogram in
// calculateEntropy has ELEMENT_RANGE bins.
const int ELEMENT_RANGE = 16;
// Side length of the square window examined around each element.
const int WINDOW_SIZE = 5;
// Number of entries in ARRAY_SIZE.
const int NUM_ARRAYS = sizeof(ARRAY_SIZE) / sizeof(ARRAY_SIZE[0]);
// Half the window side; calculateEntropy spells out WINDOW_SIZE / 2 itself
// rather than reading this constant.
const int KERNEL_RADIUS = WINDOW_SIZE / 2;

// Number of precomputed logarithms; 25 equals WINDOW_SIZE * WINDOW_SIZE, the
// largest count a full window can produce.
const int LOG_TABLE_SIZE = 25;
// logTable[i] holds log2(i + 1) once initializeLogTable() has run.
std::vector<double> logTable(LOG_TABLE_SIZE);
 19 | 
 20 | // 初始化对数表
 21 | void initializeLogTable() {
 22 |     for (int i = 1; i <= LOG_TABLE_SIZE; i++) {
 23 |         logTable[i - 1] = log2(i);
 24 |     }
 25 | }
 26 | 
 27 | // 查找对数值
 28 | double lookupLog(int n) {
 29 |     if (n >= 1 && n <= LOG_TABLE_SIZE) {
 30 |         return logTable[n - 1];
 31 |     } else {
 32 |         // 处理超出查表范围的情况
 33 |         return log2(n);
 34 |     }
 35 | }
 36 | 
 37 | // 生成随机二维数组
 38 | void generateRandomArray(std::vector<std::vector<int> >& array,int row,int col) {
 39 |     array.resize(row);
 40 |     for(int i=0;i<row;i++){
 41 |         array[i].resize(col);
 42 |     }
 43 | 	srand(static_cast<unsigned int>(2234));
 44 |     for (int i = 0; i < row; i++) {
 45 |         for (int j = 0; j < col; j++) {
 46 |             array[i][j] = rand() % ELEMENT_RANGE;
 47 |         }
 48 |         //printf("\n");
 49 |     }
 50 | }
 51 | 
 52 | // 计算熵
 53 | double calculateEntropy(const std::vector<std::vector<int> >& array, int x, int y) {
 54 |     std::vector<int> counts(ELEMENT_RANGE, 0);
 55 | 
 56 |     int startX = std::max(0, x - WINDOW_SIZE / 2);
 57 |     int startY = std::max(0, y - WINDOW_SIZE / 2);
 58 |     int endX = std::min(static_cast<int>(array.size()) - 1, x + WINDOW_SIZE / 2);
 59 |     int endY = std::min(static_cast<int>(array[0].size()) - 1, y + WINDOW_SIZE / 2);
 60 | 
 61 | 	//printf("(%d %d),(%d %d)\n",startX,startY,endX,endY);
 62 | 
 63 |     for (int i = startX; i <= endX; i++) {
 64 |         for (int j = startY; j <= endY; j++) {
 65 |             counts[array[i][j]]++;
 66 |         }
 67 |     }
 68 | 
 69 |     float entropy = 0.0;
 70 |     int windowSize = (endX - startX + 1) * (endY - startY + 1);
 71 |     //printf("%d\n",windowSize);
 72 |     for (int i = 0; i < ELEMENT_RANGE; i++) {
 73 |         float probability = float(counts[i]) / windowSize;
 74 |         if (counts[i]!=0) {
 75 |         	//printf("%d ",i);
 76 |             entropy -= probability * (lookupLog(counts[i])-lookupLog(windowSize));
 77 |         }
 78 |     }
 79 | 	//printf("\n");
 80 |     return entropy;
 81 | }
 82 | 
 83 | int main() {
 84 |     // 设置随机种子
 85 |     srand(static_cast<unsigned int>(time(NULL)));
 86 | 
 87 |     // 初始化对数表
 88 |     initializeLogTable();
 89 | 
 90 |     /*
 91 |     for (int i = 0; i < NUM_ARRAYS; i++) {
 92 |         std::vector<std::vector<int> > array;
 93 |         generateRandomArray(array, ARRAY_SIZE[i]);
 94 | 
 95 |         std::vector<std::vector<double> > entropyArray(ARRAY_SIZE[i], std::vector<double>(ARRAY_SIZE[i]));
 96 | 
 97 |         clock_t start, finish;
 98 |         start = clock();
 99 | 
100 |         for (int x = 0; x < ARRAY_SIZE[i]; x++) {
101 |             for (int y = 0; y < ARRAY_SIZE[i]; y++) {
102 |                 entropyArray[x][y] = calculateEntropy(array, x, y);
103 |             }
104 |         }
105 | 
106 |         finish = clock();
107 |         std::cout << "Array Size: " << ARRAY_SIZE[i] << " using time: " << 1000 * double(finish - start) / CLOCKS_PER_SEC << " ms" << std::endl;
108 |         
109 |     }*/
110 |     std::vector<std::vector<int> > array;
111 |     std::vector<float> res;
112 |     read(array,res,"test/test.in","test/test.out");
113 |     int row=array.size();
114 |     int col=array[0].size();
115 |     generateRandomArray(array,row,col);
116 |     std::vector<std::vector<float> > entropyArray;
117 |     entropyArray.resize(row);
118 |     for(int i=0;i<row;i++){
119 |         entropyArray[i].resize(col);
120 |     }
121 |     for (int x = 0; x < row; x++) {
122 |         for (int y = 0; y < col; y++) {
123 |             entropyArray[x][y] = calculateEntropy(array, x, y);
124 |         }
125 |     }
126 |     write(array,entropyArray,"output/baseline_lu.in","output/baseline_lu.out");
127 |     return 0;
128 | }
129 | 


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/cuda_2d.cu:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <cmath>
  3 | #include <cuda_runtime.h>
  4 | #include <iomanip>
  5 | #define BLOCK_SIZE 16
  6 | #include "deal_binary.h"
  7 | using std::string;
  8 | 
  9 | // CUDA核函数，计算以每个元素为中心的窗口中的熵
 10 | __global__ void calculateEntropy(int* input, float* output, int row,int col)
 11 | {
 12 |     int c = blockIdx.y * blockDim.y + threadIdx.y;
 13 |     int r = blockIdx.x * blockDim.x + threadIdx.x;
 14 | 
 15 |     if (r < row && c < col)
 16 |     {
 17 |         int windowSize = 5;
 18 |         int windowStartRow = r - 2;
 19 |         int windowStartCol = c - 2;
 20 |         int windowEndRow = windowStartRow + 4;
 21 |         int windowEndCol = windowStartCol + 4;
 22 | 
 23 |        // 边界处理
 24 |         if (windowStartRow < 0)
 25 |             windowStartRow = 0;
 26 |         if (windowStartCol < 0)
 27 |             windowStartCol = 0;
 28 |         if (windowEndRow >= row)
 29 |             windowEndRow = row - 1;
 30 |         if (windowEndCol >= col)
 31 |             windowEndCol = col - 1;
 32 | 
 33 |         float entropy = 0.0f;
 34 |         int windowElements = (windowEndRow - windowStartRow + 1) * (windowEndCol - windowStartCol + 1);
 35 | 
 36 |         // 计算窗口内元素的频率
 37 |         int frequency[16] = { 0 };
 38 |         for (int i = windowStartRow; i <= windowEndRow; i++)
 39 |         {
 40 |             for (int j = windowStartCol; j <= windowEndCol; j++)
 41 |             {
 42 |                 int value = input[i * col + j];
 43 |                 frequency[value]++;
 44 |             }
 45 |         }
 46 | 
 47 |         // 计算熵
 48 |         for (int k = 0; k < 16; k++)
 49 |         {
 50 |             float prob = static_cast<float>(frequency[k]) / windowElements;
 51 |             if (prob > 0.0f)
 52 |                 entropy -= prob * log2f(prob);
 53 |         }
 54 | 
 55 |         output[r * col + c] = entropy;
 56 |     }
 57 | }
 58 | 
 59 | int main()
 60 | {
 61 |     int sizes[] = { 5, 16, 128, 1024, 2048, 3000 ,4000};
 62 |     srand(static_cast<unsigned int>(1234));
 63 |     /*
 64 |     for (int i = 0; i < 7; i++)
 65 |     {
 66 |         int size = sizes[i];
 67 | 
 68 |         // 随机生成二维数组
 69 |         int* hostInput = new int[size * size];
 70 |         for (int j = 0; j < size * size; j++)
 71 |             hostInput[j] = rand() % 16;
 72 | 
 73 |         int* deviceInput;
 74 |         cudaMalloc((void**)&deviceInput, size * size * sizeof(int));
 75 |         cudaMemcpy(deviceInput, hostInput, size * size * sizeof(int), cudaMemcpyHostToDevice);
 76 | 
 77 |         float* hostOutput = new float[size * size];
 78 |         float* deviceOutput;
 79 |         cudaMalloc((void**)&deviceOutput, size * size * sizeof(float));
 80 | 
 81 |         // 定义CUDA的网格和块大小
 82 |         dim3 gridSize((size + BLOCK_SIZE - 1) / BLOCK_SIZE, (size + BLOCK_SIZE - 1) / BLOCK_SIZE);
 83 |         dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
 84 | 
 85 | 
 86 |         clock_t start, finish;
 87 |     	//clock_t为CPU时钟计时单元数
 88 |     	start = clock();
 89 |         // 调用CUDA核函数
 90 |         calculateEntropy<<<gridSize, blockSize>>>(deviceInput, deviceOutput, size);
 91 |         cudaMemcpy(hostOutput, deviceOutput, size * size * sizeof(float), cudaMemcpyDeviceToHost);
 92 | 
 93 |         finish=clock();
 94 |         // 输出结果
 95 |         std::cout << "Array Size: " << size << " using time: "<<1000*double(finish - start) / CLOCKS_PER_SEC<<" ms"<<std::endl;
 96 |         // 输出结果
 97 |         //std::cout << "Array size: " << size << " x " << size << std::endl;
 98 |         for (int row = 0; row < size; row++)
 99 |         {
100 |             for (int col = 0; col < size; col++)
101 |             {
102 |                 //std::cout << std::fixed << std::setprecision(5) << hostOutput[row * size + col] << " ";
103 |             }
104 |             //std::cout << std::endl;
105 |         }
106 | 
107 |         // 释放内存
108 |         delete[] hostInput;
109 |         delete[] hostOutput;
110 |         cudaFree(deviceInput);
111 |         cudaFree(deviceOutput);
112 |     }
113 |     */
114 |     std::vector<std::vector<int> > array;
115 |     std::vector<float> res;
116 |     read(array,res,"test/test.in","test/test.out");
117 |     int row=array.size();
118 |     int col=array[0].size();
119 | 
120 | 
121 |     int* hostInput = new int[row*col];
122 |     for(int i=0;i<row;i++){
123 |         for(int j=0;j<col;j++){
124 |             hostInput[i*col+j]=array[i][j];
125 |         }
126 |     }
127 | 
128 |     int* deviceInput;
129 |     cudaMalloc((void**)&deviceInput, row * col * sizeof(int));
130 |     cudaMemcpy(deviceInput, hostInput, row * col * sizeof(int), cudaMemcpyHostToDevice);
131 | 
132 |     float* hostOutput = new float[row * col];
133 |     float* deviceOutput;
134 |     cudaMalloc((void**)&deviceOutput, row * col * sizeof(float));
135 |     // 定义CUDA的网格和块大小
136 |     dim3 gridSize((row + BLOCK_SIZE - 1) / BLOCK_SIZE, (col + BLOCK_SIZE - 1) / BLOCK_SIZE);
137 |     dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
138 |     
139 |     calculateEntropy<<<gridSize, blockSize>>>(deviceInput, deviceOutput, row,col);
140 |     cudaMemcpy(hostOutput, deviceOutput, row* col * sizeof(float), cudaMemcpyDeviceToHost);
141 | 
142 |     std::vector<std::vector<float> > entropyArray;
143 |     entropyArray.resize(row);
144 |     for(int i=0;i<row;i++){
145 |         entropyArray[i].resize(col);
146 |     }
147 |     for(int i=0;i<row;i++){
148 |         for(int j=0;j<col;j++){
149 |             array[i][j]=hostInput[i*col+j];
150 |             entropyArray[i][j]=hostOutput[i*col+j];
151 |             //if(hostOutput[i*col+j]==0.0) printf("%d %d\n",i,j);
152 |         }
153 |     }
154 | 
155 |     write(array,entropyArray,"output/cuda_2d.in","output/cuda_2d.out");
156 |     
157 | 
158 |     // 释放内存
159 |     delete[] hostInput;
160 |     delete[] hostOutput;
161 |     cudaFree(deviceInput);
162 |     cudaFree(deviceOutput);
163 | 
164 | 
165 |     return 0;
166 | }
167 | 


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/cuda_bl.cu:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <cmath>
  3 | #include <cuda_runtime.h>
  4 | #include <iomanip>
  5 | #define BLOCK_SIZE 16
  6 | #include "deal_binary.h"
  7 | using std::string;
  8 | 
  9 | // CUDA核函数，计算以每个元素为中心的窗口中的熵
 10 | __global__ void calculateEntropy(int* input, float* output, int row,int col)
 11 | {
 12 |     int index = blockIdx.x * blockDim.x + threadIdx.x;
 13 | 
 14 |     if (index < row * col)
 15 |     {
 16 |         int r = index / col;
 17 |         int c = index % col;
 18 | 
 19 |         int windowSize = 5;
 20 |         int windowStartRow = r - 2;
 21 |         int windowStartCol = c - 2;
 22 |         int windowEndRow = windowStartRow + 4;
 23 |         int windowEndCol = windowStartCol + 4;
 24 | 
 25 |         // 边界处理
 26 |         if (windowStartRow < 0)
 27 |             windowStartRow = 0;
 28 |         if (windowStartCol < 0)
 29 |             windowStartCol = 0;
 30 |         if (windowEndRow >= row)
 31 |             windowEndRow = row - 1;
 32 |         if (windowEndCol >= col)
 33 |             windowEndCol = col - 1;
 34 | 
 35 |         float entropy = 0.0f;
 36 |         int windowElements = (windowEndRow - windowStartRow + 1) * (windowEndCol - windowStartCol + 1);
 37 | 
 38 |         // 计算窗口内元素的频率
 39 |         int frequency[16] = { 0 };
 40 |         for (int i = windowStartRow; i <= windowEndRow; i++)
 41 |         {
 42 |             for (int j = windowStartCol; j <= windowEndCol; j++)
 43 |             {
 44 |                 int value = input[i * col + j];
 45 |                 frequency[value]++;
 46 |             }
 47 |         }
 48 | 
 49 |         // 计算熵
 50 |         for (int k = 0; k < 16; k++)
 51 |         {
 52 |             float prob = static_cast<float>(frequency[k]) / windowElements;
 53 |             if (prob > 0.0f)
 54 |                 entropy -= prob * log2f(prob);
 55 |         }
 56 | 
 57 |         output[index] = entropy;
 58 |     }
 59 | }
 60 | 
 61 | int main()
 62 | {
 63 |     int sizes[] = { 5, 16, 128, 1024, 2048, 3000 ,4000};
 64 |     srand(static_cast<unsigned int>(1234));
 65 |     /*
 66 |     for (int i = 0; i < 7; i++)
 67 |     {
 68 |         int size = sizes[i];
 69 | 
 70 |         // 随机生成二维数组
 71 |         int* hostInput = new int[size * size];
 72 |         for (int j = 0; j < size * size; j++)
 73 |             hostInput[j] = rand() % 16;
 74 | 
 75 |         int* deviceInput;
 76 |         cudaMalloc((void**)&deviceInput, size * size * sizeof(int));
 77 |         cudaMemcpy(deviceInput, hostInput, size * size * sizeof(int), cudaMemcpyHostToDevice);
 78 | 
 79 |         float* hostOutput = new float[size * size];
 80 |         float* deviceOutput;
 81 |         cudaMalloc((void**)&deviceOutput, size * size * sizeof(float));
 82 | 
 83 |         // 定义CUDA的网格和块大小
 84 |         int gridSize = (size * size + BLOCK_SIZE - 1) / BLOCK_SIZE;
 85 |         int blockSize = BLOCK_SIZE;
 86 |         
 87 | 
 88 |         clock_t start, finish;
 89 |     	//clock_t为CPU时钟计时单元数
 90 |     	start = clock();
 91 |         // 调用CUDA核函数
 92 |         calculateEntropy<<<gridSize, blockSize>>>(deviceInput, deviceOutput, size);
 93 | 
 94 |         cudaMemcpy(hostOutput, deviceOutput, size * size * sizeof(float), cudaMemcpyDeviceToHost);
 95 |         finish=clock();
 96 |         // 输出结果
 97 |         std::cout << "Array Size: " << size << " using time: "<<1000*double(finish - start) / CLOCKS_PER_SEC<<" ms"<<std::endl;
 98 |         //std::cout << "Array Size: " << size << std::endl;
 99 |         for (int row = 0; row < size; row++)
100 |         {
101 |             for (int col = 0; col < size; col++)
102 |             {
103 |                 //std::cout << std::fixed << std::setprecision(5) << hostOutput[row * size + col] << " ";
104 |             }
105 |             //std::cout << std::endl;
106 |         }
107 | 
108 |         // 释放内存
109 |         delete[] hostInput;
110 |         delete[] hostOutput;
111 |         cudaFree(deviceInput);
112 |         cudaFree(deviceOutput);
113 |     }
114 |     */
115 |     std::vector<std::vector<int> > array;
116 |     std::vector<float> res;
117 |     read(array,res,"test/test.in","test/test.out");
118 |     int row=array.size();
119 |     int col=array[0].size();
120 | 
121 | 
122 |     int* hostInput = new int[row*col];
123 |     for(int i=0;i<row;i++){
124 |         for(int j=0;j<col;j++){
125 |             hostInput[i*col+j]=array[i][j];
126 |         }
127 |     }
128 | 
129 |     int* deviceInput;
130 |     cudaMalloc((void**)&deviceInput, row * col * sizeof(int));
131 |     cudaMemcpy(deviceInput, hostInput, row * col * sizeof(int), cudaMemcpyHostToDevice);
132 | 
133 |     float* hostOutput = new float[row * col];
134 |     float* deviceOutput;
135 |     cudaMalloc((void**)&deviceOutput, row * col * sizeof(float));
136 | 
137 |     // 定义CUDA的网格和块大小
138 |     int gridSize = (row * col + BLOCK_SIZE - 1) / BLOCK_SIZE;
139 |     int blockSize = BLOCK_SIZE;
140 |     calculateEntropy<<<gridSize, blockSize>>>(deviceInput, deviceOutput, row,col);
141 |     cudaMemcpy(hostOutput, deviceOutput, row* col * sizeof(float), cudaMemcpyDeviceToHost);
142 | 
143 | 
144 |     std::vector<std::vector<float> > entropyArray;
145 |     entropyArray.resize(row);
146 |     for(int i=0;i<row;i++){
147 |         entropyArray[i].resize(col);
148 |     }
149 |     for(int i=0;i<row;i++){
150 |         for(int j=0;j<col;j++){
151 |             //array[i][j]=hostInput[i*col+j];
152 |             entropyArray[i][j]=hostOutput[i*col+j];
153 |         }
154 |     }
155 | 
156 |     write(array,entropyArray,"output/cuda_bl.in","output/cuda_bl.out");
157 |     
158 | 
159 |     // 释放内存
160 |     delete[] hostInput;
161 |     delete[] hostOutput;
162 |     cudaFree(deviceInput);
163 |     cudaFree(deviceOutput);
164 |     return 0;
165 | }
166 | 


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/cuda_bl_lu.cu:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <cmath>
  3 | #include <cuda_runtime.h>
  4 | #include <iomanip>
  5 | #define BLOCK_SIZE 16
  6 | #include "deal_binary.h"
  7 | using std::string;
  8 | 
  9 | // CUDA核函数，计算以每个元素为中心的窗口中的熵
 10 | __global__ void calculateEntropy(int* input, float* output, int row,int col)
 11 | {
 12 |     int index = blockIdx.x * blockDim.x + threadIdx.x;
 13 |     //printf("hhh\n");
 14 |     double logTable[26];
 15 |     for (int k = 1; k <= 25; k++)
 16 |     {
 17 |         logTable[k] = log2f(k);
 18 |         //printf("%f\n",logTable[k-1]);
 19 |     }
 20 |     //printf("kkk\n");
 21 |     if (index < row * col)
 22 |     {
 23 |         int r = index / col;
 24 |         int c = index % col;
 25 | 
 26 |         int windowSize = 5;
 27 |         int windowStartRow = r - 2;
 28 |         int windowStartCol = c - 2;
 29 |         int windowEndRow = windowStartRow + 4;
 30 |         int windowEndCol = windowStartCol + 4;
 31 | 
 32 |         // 边界处理
 33 |         if (windowStartRow < 0)
 34 |             windowStartRow = 0;
 35 |         if (windowStartCol < 0)
 36 |             windowStartCol = 0;
 37 |         if (windowEndRow >= row)
 38 |             windowEndRow = row - 1;
 39 |         if (windowEndCol >= col)
 40 |             windowEndCol = col - 1;
 41 | 
 42 |         float entropy = 0.0f;
 43 |         int windowElements = (windowEndRow - windowStartRow + 1) * (windowEndCol - windowStartCol + 1);
 44 | 
 45 |         // 计算窗口内元素的频率
 46 |         int frequency[16] = { 0 };
 47 |         for (int i = windowStartRow; i <= windowEndRow; i++)
 48 |         {
 49 |             for (int j = windowStartCol; j <= windowEndCol; j++)
 50 |             {
 51 |                 int value = input[i * col + j];
 52 |                 frequency[value]++;
 53 |             }
 54 |         }
 55 | 
 56 |         // 计算熵
 57 |         for (int k = 0; k < 16; k++)
 58 |         {
 59 |             float prob = static_cast<float>(frequency[k]) / windowElements;
 60 |             //printf("%d \n",frequency[k]);
 61 |             if (prob > 0.0f){
 62 |                 if(frequency[k] >= 1 && frequency[k] <= 25)
 63 |                 entropy -= prob * (logTable[frequency[k]]-logTable[windowElements]);
 64 |                 else 
 65 |                 entropy -= prob * log2f(prob);
 66 |             }
 67 |         }
 68 | 
 69 |         output[index] = entropy;
 70 |     }
 71 | }
 72 | 
 73 | int main()
 74 | {
 75 |     int sizes[] = { 5, 16, 128, 1024, 2048, 3000, 4000 };
 76 |     srand(static_cast<unsigned int>(1234));
 77 |     /*
 78 |     for (int i = 0; i < 7; i++)
 79 |     {
 80 |         int size = sizes[i];
 81 |         
 82 |         // 随机生成二维数组
 83 |         int* hostInput = new int[size * size];
 84 |         for (int j = 0; j < size * size; j++)
 85 |             hostInput[j] = rand() % 16;
 86 | 
 87 |         int* deviceInput;
 88 |         cudaMalloc((void**)&deviceInput, size * size * sizeof(int));
 89 |         cudaMemcpy(deviceInput, hostInput, size * size * sizeof(int), cudaMemcpyHostToDevice);
 90 | 
 91 |         float* hostOutput = new float[size * size];
 92 |         float* deviceOutput;
 93 |         cudaMalloc((void**)&deviceOutput, size * size * sizeof(float));
 94 | 
 95 |         // 定义CUDA的网格和块大小
 96 |         int gridSize = (size * size + BLOCK_SIZE - 1) / BLOCK_SIZE;
 97 |         int blockSize = BLOCK_SIZE;
 98 | 
 99 |         // 预计算对数表
100 |         float logTable[25];
101 |         for (int k = 0; k < 25; k++)
102 |         {
103 |             logTable[k] = log2f(static_cast<float>(k + 1));
104 |             //printf("%f ",logTable[k]);
105 |         }
106 |         //printf("\n");
107 | 
108 |         clock_t start, finish;
109 |         start = clock();
110 |         // 调用CUDA核函数
111 |         calculateEntropy<<<gridSize, blockSize>>>(deviceInput, deviceOutput, size, logTable);
112 | 
113 |         cudaMemcpy(hostOutput, deviceOutput, size * size * sizeof(float), cudaMemcpyDeviceToHost);
114 |         finish = clock();
115 |         // 输出结果
116 |         std::cout << "Array Size: " << size << " using time: " << 1000 * double(finish - start) / CLOCKS_PER_SEC << " ms" << std::endl;
117 | 
118 |         // 释放内存
119 |         delete[] hostInput;
120 |         delete[] hostOutput;
121 |         cudaFree(deviceInput);
122 |         cudaFree(deviceOutput);
123 |     }
124 |     */
125 |     std::vector<std::vector<int> > array;
126 |     std::vector<float> res;
127 |     read(array,res,"test/test.in","test/test.out");
128 |     int row=array.size();
129 |     int col=array[0].size();
130 | 
131 | 
132 |     int* hostInput = new int[row*col];
133 |     for(int i=0;i<row;i++){
134 |         for(int j=0;j<col;j++){
135 |             hostInput[i*col+j]=array[i][j];
136 |         }
137 |     }
138 | 
139 |     int* deviceInput;
140 |     cudaMalloc((void**)&deviceInput, row * col * sizeof(int));
141 |     cudaMemcpy(deviceInput, hostInput, row * col * sizeof(int), cudaMemcpyHostToDevice);
142 | 
143 |     float* hostOutput = new float[row * col];
144 |     float* deviceOutput;
145 |     cudaMalloc((void**)&deviceOutput, row * col * sizeof(float));
146 | 
147 |     // 定义CUDA的网格和块大小
148 |     int gridSize = (row * col + BLOCK_SIZE - 1) / BLOCK_SIZE;
149 |     int blockSize = BLOCK_SIZE;
150 | 
151 |     // 预计算对数表
152 |     double* logTable=new double[26];
153 |     for (int k = 1; k <= 25; k++)
154 |     {
155 |         logTable[k] = log2(double(k));
156 |         //printf("%f\n",logTable[k-1]);
157 |     }
158 | 
159 | 
160 |     calculateEntropy<<<gridSize, blockSize>>>(deviceInput, deviceOutput, row,col);
161 |     cudaMemcpy(hostOutput, deviceOutput, row* col * sizeof(float), cudaMemcpyDeviceToHost);
162 | 
163 | 
164 |     std::vector<std::vector<float> > entropyArray;
165 |     entropyArray.resize(row);
166 |     for(int i=0;i<row;i++){
167 |         entropyArray[i].resize(col);
168 |     }
169 |     for(int i=0;i<row;i++){
170 |         for(int j=0;j<col;j++){
171 |             array[i][j]=hostInput[i*col+j];
172 |             entropyArray[i][j]=hostOutput[i*col+j];
173 |         }
174 |     }
175 | 
176 |     write(array,entropyArray,"output/cuda_bl_lu.in","output/cuda_bl_lu.out");
177 |     
178 | 
179 |     // 释放内存
180 |     delete[] hostInput;
181 |     delete[] hostOutput;
182 |     cudaFree(deviceInput);
183 |     cudaFree(deviceOutput);
184 | 
185 | 
186 |     return 0;
187 | }
188 | 


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/deal_binary.h:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <cmath>
 3 | #include <vector>
 4 | #include <cstdlib>
 5 | #include <ctime>
 6 | #include <fstream>
 7 | #include <iomanip>
 8 | using std::string;
 9 | 
/**
 * Load an integer matrix and a float result buffer from two binary files.
 *
 * Layout read from `instr`: row count (int), column count (int), then
 * row*col ints in row-major order. `out_str` is read as row*col raw floats
 * with no header.
 *
 * @param array   receives the matrix as array[row][col]; left empty when the
 *                input file cannot be opened or its header is unreadable or
 *                negative.
 * @param res     set to row*col floats read from `out_str`; entries stay 0
 *                when that file is missing or shorter than expected.
 * @param instr   path of the binary matrix file.
 * @param out_str path of the binary float file.
 */
inline void read(std::vector<std::vector<int> >& array, std::vector<float>& res,
                 std::string instr, std::string out_str) {
    // Start from 0 so a missing or truncated file yields an empty matrix
    // instead of sizing the buffers from indeterminate values.
    int row_size = 0;
    int col_size = 0;
    std::vector<int> matrix_test;

    std::ifstream inFile(instr, std::ios::binary);
    if (inFile.is_open()) {
        // Header first: number of rows, then number of columns.
        inFile.read(reinterpret_cast<char*>(&row_size), sizeof(row_size));
        inFile.read(reinterpret_cast<char*>(&col_size), sizeof(col_size));
        if (!inFile || row_size < 0 || col_size < 0) {
            row_size = 0;
            col_size = 0;
        }
        // Size the flat buffer in size_t so row*col cannot overflow int.
        matrix_test.resize(static_cast<std::size_t>(row_size) * static_cast<std::size_t>(col_size));
        if (!matrix_test.empty()) {
            inFile.read(reinterpret_cast<char*>(matrix_test.data()),
                        matrix_test.size() * sizeof(int));
        }
    } else {
        std::cerr << "Unable to open file: " << instr << std::endl;
    }

    const std::size_t rows = static_cast<std::size_t>(row_size);
    const std::size_t cols = static_cast<std::size_t>(col_size);

    // Re-shape the flat row-major buffer into array[row][col].
    array.assign(rows, std::vector<int>(cols));
    for (std::size_t i = 0; i < rows; i++) {
        for (std::size_t j = 0; j < cols; j++) {
            array[i][j] = matrix_test[i * cols + j];
        }
    }
    printf("size:%d %d\n", row_size, col_size);

    res.assign(rows * cols, 0.0f);
    std::ifstream inFile_2(out_str, std::ios::binary);
    if (inFile_2.is_open() && !res.empty()) {
        inFile_2.read(reinterpret_cast<char*>(res.data()), res.size() * sizeof(float));
    }
}
46 | 
47 | 
48 | 
/**
 * Save a matrix and its per-element float results as two binary files.
 *
 * `in_str` receives: row count (int), column count (int), then row*col ints
 * in row-major order. `out_str` receives row*col raw floats, row-major, with
 * no header. The column count is taken from the first row of `array`; an
 * empty `array` is written as a 0 x 0 matrix.
 *
 * @param array   matrix to save; cells missing from a short row are written as 0.
 * @param res     float values to save; cells missing from `res` are written as 0.
 * @param in_str  destination path for the matrix file.
 * @param out_str destination path for the float file.
 */
inline void write(std::vector<std::vector<int> >& array, std::vector<std::vector<float> >& res,
                  std::string in_str, std::string out_str) {
    std::ofstream inFile(in_str, std::ios::binary);
    std::ofstream outFile(out_str, std::ios::binary);

    const std::size_t row = array.size();
    // Guard the first-row lookup: indexing array[0] on an empty matrix is undefined.
    const std::size_t col = array.empty() ? 0 : array[0].size();

    // Two leading slots hold the header (rows, cols).
    std::vector<int> in_matrix(row * col + 2, 0);
    std::vector<float> entropy_matrix(row * col, 0.0f);

    in_matrix[0] = static_cast<int>(row);
    in_matrix[1] = static_cast<int>(col);

    for (std::size_t i = 0; i < row; i++) {
        for (std::size_t j = 0; j < col; j++) {
            if (j < array[i].size()) {
                in_matrix[i * col + j + 2] = array[i][j];
            }
            if (i < res.size() && j < res[i].size()) {
                entropy_matrix[i * col + j] = res[i][j];
            }
        }
    }

    // Write the input matrix (header + data).
    if (inFile.is_open()) {
        inFile.write(reinterpret_cast<const char*>(in_matrix.data()),
                     in_matrix.size() * sizeof(int));
        inFile.close();
    }
    else {
        std::cout << "Unable to open file";
    }

    // Write the float (entropy) matrix.
    if (outFile.is_open()) {
        outFile.write(reinterpret_cast<const char*>(entropy_matrix.data()),
                      entropy_matrix.size() * sizeof(float));
        outFile.close();
    }
    else {
        std::cout << "Unable to open file";
    }
}


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/openmp.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <cmath>
  3 | #include <vector>
  4 | #include <cstdlib>
  5 | #include <ctime>
  6 | #include <omp.h>
  7 | #include <iomanip>
  8 | #include "deal_binary.h"
  9 | using std::string;
 10 | #define ELEMENT_RANGE 16
 11 | 
 12 | const int WINDOW_SIZE = 5;
 13 | // 生成随机二维数组
 14 | void generateRandomArray(std::vector<std::vector<int> >& array, int size) {
 15 | 	srand(static_cast<unsigned int>(1234));
 16 |     array.resize(size, std::vector<int>(size));
 17 |     for (int i = 0; i < size; i++) {
 18 |         for (int j = 0; j < size; j++) {
 19 |             array[i][j] = rand() % ELEMENT_RANGE;
 20 |         }
 21 |         //printf("\n");
 22 |     }
 23 | }
 24 | 
 25 | 
 26 | // 计算窗口中的熵
 27 | double calculateEntropy(const std::vector<std::vector<int> >& array, int x, int y) {
 28 |     std::vector<int> counts(ELEMENT_RANGE, 0);
 29 | 
 30 |     int startX = std::max(0, x - WINDOW_SIZE / 2);
 31 |     int startY = std::max(0, y - WINDOW_SIZE / 2);
 32 |     int endX = std::min(static_cast<int>(array.size()) - 1, x + WINDOW_SIZE / 2);
 33 |     int endY = std::min(static_cast<int>(array[0].size()) - 1, y + WINDOW_SIZE / 2);
 34 | 
 35 | 	//printf("(%d %d),(%d %d)\n",startX,startY,endX,endY);
 36 | 
 37 |     for (int i = startX; i <= endX; i++) {
 38 |         for (int j = startY; j <= endY; j++) {
 39 |             counts[array[i][j]]++;
 40 |         }
 41 |     }
 42 | 
 43 |     float entropy = 0.0;
 44 |     int windowSize = (endX - startX + 1) * (endY - startY + 1);
 45 |     //printf("%d\n",windowSize);
 46 |     for (int i = 0; i < ELEMENT_RANGE; i++) {
 47 |         float probability = float(counts[i]) / windowSize;
 48 |         if (counts[i]!=0) {
 49 |         	//printf("%d ",i);
 50 |             entropy -= probability * log2(probability);
 51 |         }
 52 |     }
 53 | 	//printf("\n");
 54 |     return entropy;
 55 | }
 56 | 
 57 | int main() {
 58 |     // 设置随机种子
 59 |     std::srand(static_cast<unsigned int>(1234));
 60 |     /*
 61 |     // 定义数组大小
 62 |     int sizesArr[] = {5, 16, 128, 1024, 2048, 3000,4000};
 63 |     std::vector<int> sizes;
 64 |     for(int i=0;i<7;i++)
 65 |         sizes.push_back(sizesArr[i]);
 66 |     // 并行计算熵
 67 |     #pragma omp parallel for num_threads(40)
 68 |     //#pragma omp parallel for collapse(24)
 69 |     for (int i=0;i<sizes.size();i++) {
 70 |         // 生成随机二维数组
 71 |         int size=sizes[i];
 72 |         std::vector<std::vector<int> > array(size, std::vector<int>(size));
 73 |         generateRandomArray(array,size);
 74 |         
 75 | 
 76 |         clock_t start, finish;
 77 |     	start = clock();
 78 |         // 计算熵
 79 |         std::vector<std::vector<double> > entropyArray(size, std::vector<double>(size));
 80 |         for (int i = 0; i < size; ++i) {
 81 |             for (int j = 0; j < size; ++j) {
 82 |                 entropyArray[i][j] = calculateEntropy(array, i, j);
 83 |             }
 84 |         }
 85 |         finish=clock();
 86 |         // 输出结果
 87 |         std::cout << "Array Size: " << size << " using time: "<<1000*double(finish - start) / CLOCKS_PER_SEC<<" ms"<<std::endl;
 88 | 
 89 |         
 90 |     }
 91 |     */
 92 |     std::vector<std::vector<int> > array;
 93 |     std::vector<float> res;
 94 |     read(array,res,"test/test.in","test/test.out");
 95 |     int row=array.size();
 96 |     int col=array[0].size();
 97 |     std::vector<std::vector<float> > entropyArray;
 98 |     entropyArray.resize(row);
 99 |     for(int i=0;i<row;i++){
100 |         entropyArray[i].resize(col);
101 |     }
102 |     #pragma omp parallel for num_threads(20)
103 |     for (int x = 0; x < row; x++) {
104 |         for (int y = 0; y < col; y++) {
105 |             entropyArray[x][y] = calculateEntropy(array, x, y);
106 |         }
107 |     }
108 |     write(array,entropyArray,"output/openmp.in","output/openmp.out");
109 | 
110 |     return 0;
111 | }
112 | 


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/openmp_lu.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <cmath>
  3 | #include <vector>
  4 | #include <cstdlib>
  5 | #include <ctime>
  6 | #include <omp.h>
  7 | #include <iomanip>
  8 | #include "deal_binary.h"
  9 | using std::string;
 10 | #define ELEMENT_RANGE 16
 11 | const int LOG_TABLE_SIZE = 25;
 12 | std::vector<double> logTable(LOG_TABLE_SIZE);
 13 | const int WINDOW_SIZE = 5;
 14 | 
 15 | // 初始化对数表
 16 | void initializeLogTable() {
 17 |     for (int i = 1; i <= LOG_TABLE_SIZE; i++) {
 18 |         logTable[i - 1] = log2(i);
 19 |     }
 20 | }
 21 | 
 22 | // 查找对数值
 23 | double lookupLog(int n) {
 24 |     if (n >= 1 && n <= LOG_TABLE_SIZE) {
 25 |         return logTable[n - 1];
 26 |     } else {
 27 |         // 处理超出查表范围的情况
 28 |         return log2(n);
 29 |     }
 30 | }
 31 | 
 32 | // 生成随机二维数组
 33 | void generateRandomArray(std::vector<std::vector<int> >& array, int size) {
 34 | 	srand(static_cast<unsigned int>(1234));
 35 |     array.resize(size, std::vector<int>(size));
 36 |     for (int i = 0; i < size; i++) {
 37 |         for (int j = 0; j < size; j++) {
 38 |             array[i][j] = rand() % ELEMENT_RANGE;
 39 |         }
 40 |         //printf("\n");
 41 |     }
 42 | }
 43 | 
 44 | 
 45 | // 计算窗口中的熵
 46 | double calculateEntropy(const std::vector<std::vector<int> >& array, int x, int y) {
 47 |     std::vector<int> counts(ELEMENT_RANGE, 0);
 48 | 
 49 |     int startX = std::max(0, x - WINDOW_SIZE / 2);
 50 |     int startY = std::max(0, y - WINDOW_SIZE / 2);
 51 |     int endX = std::min(static_cast<int>(array.size()) - 1, x + WINDOW_SIZE / 2);
 52 |     int endY = std::min(static_cast<int>(array[0].size()) - 1, y + WINDOW_SIZE / 2);
 53 | 
 54 | 	//printf("(%d %d),(%d %d)\n",startX,startY,endX,endY);
 55 | 
 56 |     for (int i = startX; i <= endX; i++) {
 57 |         for (int j = startY; j <= endY; j++) {
 58 |             counts[array[i][j]]++;
 59 |         }
 60 |     }
 61 | 
 62 |     float entropy = 0.0;
 63 |     int windowSize = (endX - startX + 1) * (endY - startY + 1);
 64 |     //printf("%d\n",windowSize);
 65 |     for (int i = 0; i < ELEMENT_RANGE; i++) {
 66 |         float probability = float(counts[i]) / windowSize;
 67 |         if (counts[i]!=0) {
 68 |         	//printf("%d ",i);
 69 |             entropy -= probability * (lookupLog(counts[i])-lookupLog(windowSize));
 70 |         }
 71 |     }
 72 | 	//printf("\n");
 73 |     return entropy;
 74 | }
 75 | 
 76 | int main() {
 77 |     // 设置随机种子
 78 |     std::srand(static_cast<unsigned int>(1234));
 79 |     // 初始化对数表
 80 |     initializeLogTable();
 81 |     /*
 82 |     // 定义数组大小
 83 |     int sizesArr[] = {5, 16, 128, 1024, 2048, 3000,4000};
 84 |     std::vector<int> sizes;
 85 |     for(int i=0;i<7;i++)
 86 |         sizes.push_back(sizesArr[i]);
 87 |     // 并行计算熵
 88 |     #pragma omp parallel for num_threads(40)
 89 |     //#pragma omp parallel for collapse(24)
 90 |     for (int i=0;i<sizes.size();i++) {
 91 |         // 生成随机二维数组
 92 |         int size=sizes[i];
 93 |         std::vector<std::vector<int> > array(size, std::vector<int>(size));
 94 |         generateRandomArray(array,size);
 95 |         
 96 | 
 97 |         clock_t start, finish;
 98 |     	start = clock();
 99 |         // 计算熵
100 |         std::vector<std::vector<double> > entropyArray(size, std::vector<double>(size));
101 |         for (int i = 0; i < size; ++i) {
102 |             for (int j = 0; j < size; ++j) {
103 |                 entropyArray[i][j] = calculateEntropy(array, i, j);
104 |             }
105 |         }
106 |         finish=clock();
107 |         // 输出结果
108 |         std::cout << "Array Size: " << size << " using time: "<<1000*double(finish - start) / CLOCKS_PER_SEC<<" ms"<<std::endl;
109 | 
110 |         
111 |     }
112 |     */
113 |     std::vector<std::vector<int> > array;
114 |     std::vector<float> res;
115 |     read(array,res,"test/test.in","test/test.out");
116 |     int row=array.size();
117 |     int col=array[0].size();
118 |     std::vector<std::vector<float> > entropyArray;
119 |     entropyArray.resize(row);
120 |     for(int i=0;i<row;i++){
121 |         entropyArray[i].resize(col);
122 |     }
123 |     #pragma omp parallel for num_threads(40)
124 |     for (int x = 0; x < row; x++) {
125 |         for (int y = 0; y < col; y++) {
126 |             entropyArray[x][y] = calculateEntropy(array, x, y);
127 |         }
128 |     }
129 |     write(array,entropyArray,"output/openmp_lu.in","output/openmp_lu.out");
130 |     return 0;
131 | }
132 | 


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/readme.txt:
--------------------------------------------------------------------------------
 1 | CPU串行版本：
 2 | baseline.cpp
 3 | baseline_lu.cpp
 4 | 
 5 | CPU并行版本：
 6 | openmp.cpp
 7 | openmp_lu.cpp
 8 | 
 9 | GPU版本：
10 | cuda_bl.cu
11 | cuda_bl_lu.cu
12 | cuda_2d.cu
13 | share_mem.cu
14 | 
15 | deal_binary.h是读写二进制文件的头文件
16 | 
17 | 若要运行程序来测试其他测例，则要修改源文件中的路径：
18 | 在main函数中可以看到有read()和write()两个函数，修改参数中的路径即可，如：
19 | read(array,res,"test/test.in","test/test.out")中，"test/test.in"是要读取的输入文件的路径，"test/test.out"是要读取的输出文件的路径（可以忽略）
20 | write(array,entropyArray,"output/share_mem.in","output/share_mem.out")中，"output/share_mem.out"是该程序计算得到的熵矩阵的保存路径
21 | 修改这两个路径即可。
22 | 
23 | 编译命令：
24 | nvcc cuda_bl.cu -o cuda_bl -std=c++11
25 | g++ baseline.cpp -o baseline


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/share_mem.cu:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <cmath>
  3 | #include <cuda_runtime.h>
  4 | #include <iomanip>
  5 | #define BLOCK_SIZE 16
  6 | #include "deal_binary.h"
  7 | using std::string;
  8 | 
  9 | // CUDA核函数，计算以每个元素为中心的窗口中的熵
 10 | // 计算每个元素的熵的核函数（使用共享内存优化）
 11 | __global__ void calculateEntropy_share(int* input, float* output, int width, int height) {
 12 |     int row = blockIdx.x * blockDim.x + threadIdx.x;
 13 |     int col = blockIdx.y * blockDim.y + threadIdx.y;
 14 |     // 定义共享内存
 15 |     __shared__ int shared_input[8 + 4][8 + 4];
 16 |     // 计算线程在共享内存中的索引
 17 |     int shared_row = threadIdx.x + 2;
 18 |     int shared_col = threadIdx.y + 2;
 19 |     // 将数据从全局内存复制到共享内存
 20 |     if (row < height && col < width) {
 21 |         int global_index = col * height + row;
 22 |         shared_input[shared_row][shared_col] = input[global_index];
 23 |     }
 24 |     // 线程同步，确保数据复制完成
 25 |     __syncthreads();
 26 |     if (row < height && col < width) {
 27 |         float entropy = 0;
 28 |         // 记录每个数字出现过的次数
 29 |         int record[16];
 30 |         // 窗口内元素总数
 31 |         int count = 0, x, y;
 32 |         for (int i = 0; i < 16; i++) {
 33 |             record[i] = 0;
 34 |         }
 35 |         for (int i = -2; i <= 2; i++) {
 36 |             for (int j = -2; j <= 2; j++) {
 37 |                 x = shared_col + i;
 38 |                 y = shared_row + j;
 39 |                 printf("%d %d\n",x,y);
 40 |                 int value = shared_input[y][x];
 41 |                 record[value]++;
 42 |                 count++;
 43 |             }
 44 |         }
 45 |         // 计算熵值
 46 |         for (int i = 0; i < 16; i++) {
 47 |             //entropy -= (float)record[i] * (log_table[record[i]]-log_table[count]) / count;
 48 |             float prob = (float)(record[i]) / count;
 49 |             //printf("%f\n",prob);
 50 |             if (prob > 0.0f)
 51 |                 entropy -= prob * log2f(prob);
 52 |         }
 53 |         output[col * height + row] = entropy;
 54 |     }
 55 | }
 56 | 
 57 | // CUDA核函数，计算以每个元素为中心的窗口中的熵
 58 | __global__ void calculateEntropy(int* input, float* output, int row,int col)
 59 | {
 60 |     int c = blockIdx.y * blockDim.y + threadIdx.y;
 61 |     int r = blockIdx.x * blockDim.x + threadIdx.x;
 62 | 
 63 |     if (r < row && c < col)
 64 |     {
 65 |         int windowSize = 5;
 66 |         int windowStartRow = r - 2;
 67 |         int windowStartCol = c - 2;
 68 |         int windowEndRow = windowStartRow + 4;
 69 |         int windowEndCol = windowStartCol + 4;
 70 | 
 71 |        // 边界处理
 72 |         if (windowStartRow < 0)
 73 |             windowStartRow = 0;
 74 |         if (windowStartCol < 0)
 75 |             windowStartCol = 0;
 76 |         if (windowEndRow >= row)
 77 |             windowEndRow = row - 1;
 78 |         if (windowEndCol >= col)
 79 |             windowEndCol = col - 1;
 80 | 
 81 |         float entropy = 0.0f;
 82 |         int windowElements = (windowEndRow - windowStartRow + 1) * (windowEndCol - windowStartCol + 1);
 83 | 
 84 |         // 计算窗口内元素的频率
 85 |         int frequency[16] = { 0 };
 86 |         for (int i = windowStartRow; i <= windowEndRow; i++)
 87 |         {
 88 |             for (int j = windowStartCol; j <= windowEndCol; j++)
 89 |             {
 90 |                 int value = input[i * col + j];
 91 |                 frequency[value]++;
 92 |             }
 93 |         }
 94 | 
 95 |         // 计算熵
 96 |         for (int k = 0; k < 16; k++)
 97 |         {
 98 |             float prob = static_cast<float>(frequency[k]) / windowElements;
 99 |             if (prob > 0.0f)
100 |                 entropy -= prob * log2f(prob);
101 |         }
102 | 
103 |         output[r * col + c] = entropy;
104 |     }
105 | }
106 | 
107 | int main()
108 | {
109 |     int sizes[] = { 5, 16, 128, 1024, 2048, 3000 ,4000};
110 |     srand(static_cast<unsigned int>(1234));
111 |     std::vector<std::vector<int> > array;
112 |     std::vector<float> res;
113 |     read(array,res,"test/test.in","test/test.out");
114 |     int row=array.size();
115 |     int col=array[0].size();
116 | 
117 | 
118 |     int* hostInput = new int[row*col];
119 |     for(int i=0;i<row;i++){
120 |         for(int j=0;j<col;j++){
121 |             hostInput[i*col+j]=array[i][j];
122 |         }
123 |     }
124 | 
125 |     int* deviceInput;
126 |     cudaMalloc((void**)&deviceInput, row * col * sizeof(int));
127 |     cudaMemcpy(deviceInput, hostInput, row * col * sizeof(int), cudaMemcpyHostToDevice);
128 | 
129 |     float* hostOutput = new float[row * col];
130 |     float* deviceOutput;
131 |     cudaMalloc((void**)&deviceOutput, row * col * sizeof(float));
132 |     // 定义CUDA的网格和块大小
133 |     // 设置grid、block
134 |     dim3 block_size(8, 8);
135 |     dim3 grid_size((col + block_size.x - 1) / block_size.x, (row + block_size.y - 1) / block_size.y);
136 |     dim3 gridSize((row + BLOCK_SIZE - 1) / BLOCK_SIZE, (col + BLOCK_SIZE - 1) / BLOCK_SIZE);
137 |     dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
138 |     calculateEntropy<<<gridSize, blockSize>>>(deviceInput, deviceOutput, row,col);
139 |     //calculateEntropy<< <grid_size, block_size >> >(deviceInput, deviceOutput, row,col);
140 |     cudaMemcpy(hostOutput, deviceOutput, row* col * sizeof(float), cudaMemcpyDeviceToHost);
141 | 
142 |     std::vector<std::vector<float> > entropyArray;
143 |     entropyArray.resize(row);
144 |     for(int i=0;i<row;i++){
145 |         entropyArray[i].resize(col);
146 |     }
147 |     for(int i=0;i<row;i++){
148 |         for(int j=0;j<col;j++){
149 |             array[i][j]=hostInput[i*col+j];
150 |             entropyArray[i][j]=hostOutput[i*col+j];
151 |         }
152 |     }
153 | 
154 |     write(array,entropyArray,"output/share_mem.in","output/share_mem.out");
155 |     
156 |     // 释放内存
157 |     delete[] hostInput;
158 |     delete[] hostOutput;
159 |     cudaFree(deviceInput);
160 |     cudaFree(deviceOutput);
161 | 
162 | 
163 |     return 0;
164 | }
165 | 


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/test.cpp:
--------------------------------------------------------------------------------
 1 | #include "deal_binary.h"
 2 | 
 3 | 
 4 | void read_my_res(std::vector<std::vector<int> >& array,std::vector<float>& res){
 5 |     int row_size, col_size;
 6 |     std::vector<int> matrix_test;
 7 |     std::ifstream inFile("output/baseline.in", std::ios::binary);
 8 |     if (inFile.is_open()) {
 9 |     // First, read the size of the matrix (ROWS and COLS).
10 |     inFile.read(reinterpret_cast<char *>(&row_size),sizeof(row_size));
11 |     inFile.read(reinterpret_cast<char *>(&col_size),sizeof(col_size));
12 |     // Resize the matrix based on the size read from thefile.
13 |     matrix_test.resize(row_size * col_size);
14 |     // Then, read the actual data of the matrix.
15 |     inFile.read(reinterpret_cast<char *>
16 |     (matrix_test.data()), matrix_test.size() * sizeof(int));
17 |     }
18 | 
19 |     array.resize(row_size);
20 |     for(int i=0;i<row_size;i++){
21 |         array[i].resize(col_size);
22 |     }
23 |     
24 |     for(int i=0;i<row_size;i++){
25 |         for(int j=0;j<col_size;j++){
26 |              array[i][j]=matrix_test[i*col_size+j];
27 |         }
28 |     }
29 |     printf("%d %d\n",row_size,col_size);
30 |     printf("##########################################################################################\n");
31 |     
32 |     res.resize(row_size * col_size);
33 |     std::ifstream inFile_2("output/baseline.out", std::ios::binary);
34 |     if (inFile_2.is_open()) {
35 |     // Resize the matrix based on the size read from thefile.
36 |     // Then, read the actual data of the matrix.
37 |     inFile_2.read(reinterpret_cast<char *>(res.data()), res.size() * sizeof(float));
38 |     }
39 | }
40 | 
41 | int main(){
42 |     std::vector<std::vector<int> > array;
43 |     std::vector<float> res;
44 |     read(array,res,"output/baseline.in","output/baseline.out");
45 | 
46 |     std::vector<std::vector<int> > array2;
47 |     std::vector<float> res2;
48 |     read_my_res(array2,res2);
49 | 
50 |     int row=array.size();
51 |     int col=array[0].size();
52 | 
53 |     for(int i=0;i<array.size();i++){
54 |         for(int j=0;j<array[0].size();j++){
55 |             //printf("%d %d\n",array[i][j],array2[i][j]);
56 |             //if(fabs(res[i*col+j]-res2[i*col+j])>1e-5)
57 |             printf("%f %f\n",res[i*col+j],res2[i*col+j]);
58 |         }
59 |     }
60 | 
61 |    
62 | 
63 |     return 0;
64 | }


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/test/test.in:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/code/test/test.in


--------------------------------------------------------------------------------
/并行程序设计_lab3/code/test/test.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/code/test/test.out


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/baseline.in:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/baseline.in


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/baseline.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/baseline.out


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/baseline_lu.in:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/baseline_lu.in


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/baseline_lu.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/baseline_lu.out


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/cuda_2d.in:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_2d.in


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/cuda_2d.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_2d.out


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/cuda_bl.in:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_bl.in


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/cuda_bl.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_bl.out


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/cuda_bl_lu.in:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_bl_lu.in


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/cuda_bl_lu.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_bl_lu.out


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/openmp.in:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/openmp.in


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/openmp.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/openmp.out


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/openmp_lu.in:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/openmp_lu.in


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/openmp_lu.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/openmp_lu.out


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/share_mem.in:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/share_mem.in


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/share_mem.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/share_mem.out


--------------------------------------------------------------------------------
/并行程序设计_lab3/output/test0.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/test0.out


--------------------------------------------------------------------------------
/并行程序设计_lab3/readme.txt:
--------------------------------------------------------------------------------
1 | output文件夹中是我随机生成的输入文件（大小和数组）in和对应程序的输出文件（中心熵矩阵）out，格式和要求的一致
2 | code文件夹为源代码


--------------------------------------------------------------------------------
/并行程序设计_lab3/并行程序设计_20337025_崔璨明.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/并行程序设计_20337025_崔璨明.pdf


--------------------------------------------------------------------------------
/并行程序设计_lab4/code/matrix_vector_mul.cu:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <cuda.h>
 3 | #include <vector>
 4 | #include "read_data.h"
 5 | 
 6 | __global__ void matrixVectorMul(float* A, float* b, float* c, int rows, int cols) {
 7 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 8 |     if (tid < rows) {
 9 |         float sum = 0.0f;
10 |         for (int j = 0; j < cols; j++) {
11 |             sum += A[tid * cols + j] * b[j];
12 |         }
13 |         c[tid] = sum;
14 |     }
15 | }
16 | 
17 | void matrixVectorMultiplication(std::vector<float>& A, std::vector<float>& b, std::vector<float>& c, int rows, int cols) {
18 |     // Device memory allocation
19 |     float *d_A, *d_b, *d_c;
20 |     cudaMalloc((void**)&d_A, rows * cols * sizeof(float));
21 |     cudaMalloc((void**)&d_b, cols * sizeof(float));
22 |     cudaMalloc((void**)&d_c, rows * sizeof(float));
23 | 
24 |     // Copy data from host to device
25 |     cudaMemcpy(d_A, A.data(), rows * cols * sizeof(float), cudaMemcpyHostToDevice);
26 |     cudaMemcpy(d_b, b.data(), cols * sizeof(float), cudaMemcpyHostToDevice);
27 | 
28 |     // Launch kernel
29 |     int blockSize = 256;
30 |     int gridSize = (rows + blockSize - 1) / blockSize;
31 |     matrixVectorMul<<<gridSize, blockSize>>>(d_A, d_b, d_c, rows, cols);
32 | 
33 |     // Copy result from device to host
34 |     cudaMemcpy(c.data(), d_c, rows * sizeof(float), cudaMemcpyDeviceToHost);
35 | 
36 |     // Free device memory
37 |     cudaFree(d_A);
38 |     cudaFree(d_b);
39 |     cudaFree(d_c);
40 | }
41 | 
42 | int main() {
43 |     //修改这里的路径，读入二进制文件和输出二进制文件
44 |     string read_dir="data/test5.in";
45 |     string save_dir="output/res5.out";
46 |     std::vector<float> A ;
47 |     std::vector<std::vector<float> >array_2d; 
48 |     std::vector<float> b ;
49 | 
50 |     read(A,array_2d,b,read_dir);
51 |     //generate_data(A,array_2d,b,4096);
52 | 
53 |     int rows = array_2d.size();
54 |     int cols = array_2d[0].size();
55 | 
56 |     std::vector<float> c(rows);
57 | 
58 |     clock_t start, finish;
59 |     start = clock();
60 |     matrixVectorMultiplication(A, b, c, rows, cols);
61 |     finish=clock();
62 | 
63 |     // Print result
64 |     std::cout << "Result: ";
65 |     for (int i = 0; i < rows; i++) {
66 |         printf("%.5f ",c[i]);
67 |     }
68 |     std::cout << std::endl;
69 |     // Print using time
70 |     std::cout <<"using time: "<<1000*double(finish - start) / CLOCKS_PER_SEC<<" ms"<<std::endl;
71 | 
72 |     write(c,save_dir);
73 |     return 0;
74 | }
75 | 


--------------------------------------------------------------------------------
/并行程序设计_lab4/code/matrix_vector_mul_v2.cu:
--------------------------------------------------------------------------------
 1 | //v2版本，用了常量内存储存向量b
 2 | 
 3 | #include <iostream>
 4 | #include <cuda.h>
 5 | #include <vector>
 6 | #include "read_data.h"
 7 | 
 8 | __constant__ float d_b[2048];
 9 | 
10 | __global__ void matrixVectorMul(float* A,float* c, int rows, int cols) {
11 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
12 |     if (tid < rows) {
13 |         float sum = 0.0f;
14 |         for (int j = 0; j < cols; j++) {
15 |             sum += A[tid * cols + j] * d_b[j];
16 |         }
17 |         c[tid] = sum;
18 |     }
19 | }
20 | 
21 | 
22 | void matrixVectorMultiplication(const std::vector<float>& A, const std::vector<float>& b, std::vector<float>& c, int rows, int cols) {
23 |     // Device memory allocation
24 |     float *d_A, *d_c;
25 |     cudaMalloc((void**)&d_A, rows * cols * sizeof(float));
26 |     cudaMalloc((void**)&d_c, rows * sizeof(float));
27 | 
28 |     // Copy data from host to device
29 |     cudaMemcpy(d_A, A.data(), rows * cols * sizeof(float), cudaMemcpyHostToDevice);
30 |     cudaMemcpyToSymbol(d_b, b.data(), cols * sizeof(float));
31 | 
32 |     // Launch kernel
33 |     int blockSize = 256;
34 |     int gridSize = (rows + blockSize - 1) / blockSize;
35 |     matrixVectorMul<<<gridSize, blockSize>>>(d_A, d_c, rows, cols);
36 | 
37 |     // Copy result from device to host
38 |     cudaMemcpy(c.data(), d_c, rows * sizeof(float), cudaMemcpyDeviceToHost);
39 | 
40 |     // Free device memory
41 |     cudaFree(d_A);
42 |     cudaFree(d_c);
43 | }
44 | 
45 | 
46 | int main() {
47 |     string read_dir="data/test1.in";
48 |     string save_dir="output/res1.out";
49 |     std::vector<float> A ;
50 |     std::vector<std::vector<float> >array_2d; 
51 |     std::vector<float> b ;
52 |     read(A,array_2d,b,read_dir);
53 |     //generate_data(A,array_2d,b,2048);
54 | 
55 |     int rows = array_2d.size();
56 |     int cols = array_2d[0].size();
57 | 
58 |     std::vector<float> c(rows);
59 | 
60 |     clock_t start, finish;
61 |     start = clock();
62 |     matrixVectorMultiplication(A, b, c, rows, cols);
63 |     finish=clock();
64 | 
65 |     // Print result
66 |     std::cout << "Result: ";
67 |     for (int i = 0; i < rows; i++) {
68 |         printf("%.5f ",c[i]);
69 |     }
70 |     std::cout << std::endl;
71 |     // Print using time
72 |     std::cout <<"using time: "<<1000*double(finish - start) / CLOCKS_PER_SEC<<" ms"<<std::endl;
73 |     write(c,save_dir);
74 |     return 0;
75 | }
76 | 


--------------------------------------------------------------------------------
/并行程序设计_lab4/code/matrix_vector_mul_v3.cu:
--------------------------------------------------------------------------------
 1 | //v3版本，用了纹理内存进行优化，将内核函数中的内存访问从全局内存改为从纹理内存中读取数据
 2 | 
 3 | #include <iostream>
 4 | #include <cuda.h>
 5 | #include <vector>
 6 | #include "read_data.h"
 7 | 
 8 | texture<float, 1, cudaReadModeElementType> texA;
 9 | texture<float, 1, cudaReadModeElementType> texB;
10 | 
11 | __global__ void matrixVectorMul(float* c, int rows, int cols) {
12 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
13 |     if (tid < rows) {
14 |         float sum = 0.0f;
15 |         for (int j = 0; j < cols; j++) {
16 |             sum += tex1Dfetch(texA, tid * cols + j) * tex1Dfetch(texB, j);
17 |         }
18 |         c[tid] = sum;
19 |     }
20 | }
21 | 
22 | void matrixVectorMultiplication(std::vector<float>& A, std::vector<float>& b, std::vector<float>& c, int rows, int cols) {
23 |     // Device memory allocation
24 |     float *d_A, *d_b, *d_c;
25 |     cudaMalloc((void**)&d_A, rows * cols * sizeof(float));
26 |     cudaMalloc((void**)&d_b, cols * sizeof(float));
27 |     cudaMalloc((void**)&d_c, rows * sizeof(float));
28 | 
29 |     // Copy data from host to device
30 |     cudaMemcpy(d_A, A.data(), rows * cols * sizeof(float), cudaMemcpyHostToDevice);
31 |     cudaMemcpy(d_b, b.data(), cols * sizeof(float), cudaMemcpyHostToDevice);
32 | 
33 |     // Bind texture memory
34 |     cudaBindTexture(NULL, texA, d_A, rows * cols * sizeof(float));
35 |     cudaBindTexture(NULL, texB, d_b, cols * sizeof(float));
36 | 
37 |     // Launch kernel
38 |     int blockSize = 256;
39 |     int gridSize = (rows + blockSize - 1) / blockSize;
40 |     matrixVectorMul<<<gridSize, blockSize>>>(d_c, rows, cols);
41 | 
42 |     // Copy result from device to host
43 |     cudaMemcpy(c.data(), d_c, rows * sizeof(float), cudaMemcpyDeviceToHost);
44 | 
45 |     // Unbind texture memory
46 |     cudaUnbindTexture(texA);
47 |     cudaUnbindTexture(texB);
48 | 
49 |     // Free device memory
50 |     cudaFree(d_A);
51 |     cudaFree(d_b);
52 |     cudaFree(d_c);
53 | }
54 | 
55 | int main() {
56 |     string read_dir="data/test1.in";
57 |     string save_dir="output/res1.out";
58 |     std::vector<float> A ;
59 |     std::vector<std::vector<float> >array_2d; 
60 |     std::vector<float> b ;
61 | 
62 |     read(A,array_2d,b,read_dir);
63 |     //generate_data(A,array_2d,b,4096);
64 |     int rows = array_2d.size();
65 |     int cols = array_2d[0].size();
66 | 
67 |     std::vector<float> c(rows);
68 | 
69 |     clock_t start, finish;
70 |     start = clock();
71 |     matrixVectorMultiplication(A, b, c, rows, cols);
72 |     finish=clock();
73 | 
74 |     // Print result
75 |     std::cout << "Result: ";
76 |     for (int i = 0; i < rows; i++) {
77 |         //std::cout << c[i] << " ";
78 |         printf("%.5f ",c[i]);
79 |     }
80 |     std::cout << std::endl;
81 |     // Print using time
82 |     std::cout <<"using time: "<<1000*double(finish - start) / CLOCKS_PER_SEC<<" ms"<<std::endl;
83 |     write(c,save_dir);
84 |     return 0;
85 | }
86 | 


--------------------------------------------------------------------------------
/并行程序设计_lab4/code/matrix_vector_mul_v4.cu:
--------------------------------------------------------------------------------
 1 | //v4版本，用了cublas进行优化
 2 | #include <iostream>
 3 | #include <cuda.h>
 4 | #include <vector>
 5 | #include <cublas_v2.h>
 6 | #include "read_data.h"
 7 | 
 8 | void matrixVectorMultiplication(std::vector<float>& A, std::vector<float>& b, std::vector<float>& c, int rows, int cols) {
 9 |     // Device memory allocation
10 |     float *d_A, *d_b, *d_c;
11 |     cudaMalloc((void**)&d_A, rows * cols * sizeof(float));
12 |     cudaMalloc((void**)&d_b, cols * sizeof(float));
13 |     cudaMalloc((void**)&d_c, rows * sizeof(float));
14 | 
15 |     // Copy data from host to device
16 |     cudaMemcpy(d_A, A.data(), rows * cols * sizeof(float), cudaMemcpyHostToDevice);
17 |     cudaMemcpy(d_b, b.data(), cols * sizeof(float), cudaMemcpyHostToDevice);
18 | 
19 |     // cuBLAS initialization
20 |     cublasHandle_t handle;
21 |     cublasCreate(&handle);
22 | 
23 |     // Matrix-vector multiplication using cuBLAS
24 |     float alpha = 1.0f;
25 |     float beta = 0.0f;
26 |     cublasSgemv(handle, CUBLAS_OP_T, cols, rows, &alpha, d_A, cols, d_b, 1, &beta, d_c, 1);
27 | 
28 |     // Copy result from device to host
29 |     cudaMemcpy(c.data(), d_c, rows * sizeof(float), cudaMemcpyDeviceToHost);
30 | 
31 |     // Free device memory
32 |     cudaFree(d_A);
33 |     cudaFree(d_b);
34 |     cudaFree(d_c);
35 | 
36 |     // Destroy cuBLAS handle
37 |     cublasDestroy(handle);
38 | }
39 | 
40 | int main() {
41 |     string read_dir="data/test1.in";
42 |     string save_dir="output/res1.out";
43 |     std::vector<float> A;
44 |     std::vector<std::vector<float>> array_2d;
45 |     std::vector<float> b;
46 | 
47 |     read(A, array_2d, b, read_dir);
48 | 
49 |     int rows = array_2d.size();
50 |     int cols = array_2d[0].size();
51 | 
52 |     std::vector<float> c(rows);
53 | 
54 |     clock_t start, finish;
55 |     start = clock();
56 |     matrixVectorMultiplication(A, b, c, rows, cols);
57 |     finish = clock();
58 | 
59 |     // Print result
60 |     std::cout << "Result: ";
61 |     for (int i = 0; i < rows; i++) {
62 |         printf("%.5f ", c[i]);
63 |     }
64 |     std::cout << std::endl;
65 | 
66 |     // Print execution time
67 |     std::cout << "Using time: " << 1000 * double(finish - start) / CLOCKS_PER_SEC << " ms" << std::endl;
68 |     write(c,save_dir);
69 |     return 0;
70 | }
71 | 


--------------------------------------------------------------------------------
/并行程序设计_lab4/code/read_data.h:
--------------------------------------------------------------------------------
  1 | //头文件，定义了读取、写入二进制文件的函数
  2 | 
  3 | #include <iostream>
  4 | #include <cmath>
  5 | #include <vector>
  6 | #include <cstdlib>
  7 | #include <ctime>
  8 | #include <fstream>
  9 | #include <iomanip>
 10 | using std::string;
 11 | 
 12 | void read(std::vector<float>& array_1d,std::vector<std::vector<float> >& array_2d,std::vector<float>& res,string path){
 13 |     int row_size, col_size;
 14 |     std::vector<float> matrix_test;
 15 |     std::vector<float> vec;
 16 |     std::ifstream inFile_in(path, std::ios::binary);
 17 |     if (inFile_in.is_open()) {
 18 |         // First, read the size of the matrix and the vector.
 19 |         inFile_in.read(reinterpret_cast<char *>(&row_size),
 20 |             sizeof(row_size));
 21 |         inFile_in.read(reinterpret_cast<char *>(&col_size),
 22 |             sizeof(col_size));
 23 |         // Resize the matrix and vector based on the size read from the file.
 24 |         matrix_test.resize(row_size * col_size);
 25 |         vec.resize(col_size);
 26 |         // Then, read the actual data of the matrix and the vector.
 27 |         inFile_in.read(reinterpret_cast<char *>(matrix_test.data()),
 28 |             matrix_test.size() * sizeof(float));
 29 |         inFile_in.read(reinterpret_cast<char *>(vec.data()), vec.size()
 30 |             * sizeof(float));
 31 |         inFile_in.close();
 32 |     } 
 33 |     else {
 34 |         std::cout << "Unable to open file";
 35 |     }
 36 | 
 37 |     array_1d.resize(row_size * col_size);
 38 |     array_2d.resize(row_size);
 39 |     for(int i=0;i<row_size;i++){
 40 |         array_2d[i].resize(col_size);
 41 |     }
 42 |     
 43 |     for(int i=0;i<row_size;i++){
 44 |         for(int j=0;j<col_size;j++){
 45 |              array_2d[i][j]=matrix_test[i*col_size+j];
 46 |              array_1d[i*col_size+j]=matrix_test[i*col_size+j];
 47 |         }
 48 |     }
 49 | 
 50 |     res.resize(col_size);
 51 |     for(int i=0;i<col_size;i++){
 52 |         res[i]=vec[i];
 53 |     }
 54 | 
 55 |     printf("Read complete. Size:%d %d\n",row_size,col_size);
 56 |     //printf("##########################################################################################\n");
 57 | }
 58 | 
 59 | void read_res(std::vector<float> &result,string path,int size){
 60 |     std::ifstream inFile_in(path, std::ios::binary);
 61 |     if (inFile_in.is_open()) {
 62 |         // Resize the matrix and vector based on the size read from the file.
 63 |         result.resize(size);
 64 |         // Then, read the actual data of the matrix and the vector.
 65 |         inFile_in.read(reinterpret_cast<char *>(result.data()),
 66 |             result.size() * sizeof(float));
 67 |         inFile_in.close();
 68 |     } 
 69 | }
 70 | 
 71 | void write(std::vector<float> &result,string save_dir){
 72 |     // Write result to file
 73 |     std::ofstream outFile_out(save_dir, std::ios::binary);
 74 |     if (outFile_out.is_open()) {
 75 |         outFile_out.write(reinterpret_cast<const char *>
 76 |             (result.data()), result.size() * sizeof(float));
 77 |         outFile_out.close();
 78 |     } 
 79 |     else {
 80 |         std::cout << "Unable to open file";
 81 |     }
 82 | }
 83 | 
 84 | void generate_data(std::vector<float>& array_1d,std::vector<std::vector<float> >& array_2d,std::vector<float>& res,int size){
 85 |     srand(static_cast<unsigned int>(1234));
 86 |     array_1d.resize(size*size);
 87 |     array_2d.resize(size);
 88 |     res.resize(size);
 89 |     for(int i=0;i<size;i++){
 90 |         array_2d[i].resize(size);
 91 |     }
 92 |     for (int i = 0; i < size; i++) {
 93 |         for (int j = 0; j < size; j++) {
 94 |             array_2d[i][j] = float(rand()%10000)/10000;
 95 |             array_1d[i*size+j]=array_2d[i][j];
 96 |             //printf("%f ",array_2d[i][j]);
 97 |         }
 98 |         //printf("\n");
 99 |         res[i]=float(rand()%10000)/10000;
100 |     }
101 |     printf("Generate complete. Size:%d %d\n",size,size);
102 | }


--------------------------------------------------------------------------------
/并行程序设计_lab4/code/readme.txt:
--------------------------------------------------------------------------------
 1 | matrix_vector_mul原程序
 2 | matrix_vector_mul_v2（常量内存优化）
 3 | matrix_vector_mul_v3（纹理内存优化）
 4 | matrix_vector_mul_v4（cuBLAS优化）
 5 | 
 6 | read_data.h为处理数据的头文件，valid.cpp为验证正确性的程序
 7 | 
编译命令：nvcc matrix_vector_mul_v4.cu -o matrix_vector_mul -std=c++11 -lcublas（v4 使用 cuBLAS，必须链接 -lcublas；编译其余版本时替换源文件名，且无需该选项）
 9 | 
10 | 读入的二进制文件放入data文件夹，运行程序会在output文件夹输出结果的二进制文件。


--------------------------------------------------------------------------------
/并行程序设计_lab4/code/valid.cpp:
--------------------------------------------------------------------------------
 1 | #include "read_data.h"
 2 | 
 3 | int SIZE=2048;
 4 | 
 5 | void test(string file1,string file2){
 6 |     std::vector<float> res1;
 7 |     std::vector<float> res2;
 8 |     read_res(res1,file1,SIZE);
 9 |     read_res(res2,file2,SIZE);
10 |     bool flag=false;
11 |     for(int i=0;i<SIZE;i++){
12 |         if(fabs(res1[i]-res2[i])>1e-5){
13 |             printf("%.5f %.5f\n",res1[i],res2[i]);
14 |             flag=true;
15 |         }
16 |     }
17 |     if(flag==false)
18 |     printf("The result is right.\n");
19 | }
20 | 
21 | int main(){
22 |     string file1="data/test5.out";
23 |     string file2="output/res5.out";
24 |     test(file1,file2);
25 |     return 0;
26 | }


--------------------------------------------------------------------------------
/并行程序设计_lab4/并行程序设计_20337025_崔璨明.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab4/并行程序设计_20337025_崔璨明.pdf


--------------------------------------------------------------------------------