├── README.md ├── 复习资料 ├── 并行复习笔记.pdf ├── 并行程序设计导论.pdf ├── 课程要点-CUDA编程.pdf ├── 课程要点-计算机体系结构-量化研究方法-第六版-第四章.pdf └── 课程要点-高性能并行程序设计.pdf ├── 并行程序设计_lab0 ├── gemm.c ├── gemm.java ├── gemm.py └── 并行程序设计_20337025_崔璨明.pdf ├── 并行程序设计_lab1 ├── lib_code │ ├── libmatrix_multiply.so │ ├── matrix_multiply.c │ ├── matrix_multiply.h │ ├── readme.txt │ ├── test │ └── test.c ├── mpi_gemm_1.cpp ├── mpi_gemm_2.cpp ├── readme.txt └── 并行程序设计_20337025_崔璨明.pdf ├── 并行程序设计_lab2 ├── Monte_carlo.cpp ├── code │ ├── libparallel_for.so │ ├── parallel_for.cpp │ ├── parallel_for.h │ ├── parallel_for.o │ ├── test │ └── test.cpp ├── gemm_openmp.cpp ├── gemm_p.cpp ├── readme.txt └── 并行程序设计_20337025_崔璨明.pdf ├── 并行程序设计_lab3 ├── code │ ├── baseline.cpp │ ├── baseline_lu.cpp │ ├── cuda_2d.cu │ ├── cuda_bl.cu │ ├── cuda_bl_lu.cu │ ├── deal_binary.h │ ├── openmp.cpp │ ├── openmp_lu.cpp │ ├── readme.txt │ ├── share_mem.cu │ ├── test.cpp │ └── test │ │ ├── test.in │ │ └── test.out ├── output │ ├── baseline.in │ ├── baseline.out │ ├── baseline_lu.in │ ├── baseline_lu.out │ ├── cuda_2d.in │ ├── cuda_2d.out │ ├── cuda_bl.in │ ├── cuda_bl.out │ ├── cuda_bl_lu.in │ ├── cuda_bl_lu.out │ ├── openmp.in │ ├── openmp.out │ ├── openmp_lu.in │ ├── openmp_lu.out │ ├── share_mem.in │ ├── share_mem.out │ ├── test0.in │ └── test0.out ├── readme.txt └── 并行程序设计_20337025_崔璨明.pdf └── 并行程序设计_lab4 ├── code ├── matrix_vector_mul.cu ├── matrix_vector_mul_v2.cu ├── matrix_vector_mul_v3.cu ├── matrix_vector_mul_v4.cu ├── read_data.h ├── readme.txt └── valid.cpp └── 并行程序设计_20337025_崔璨明.pdf /README.md: -------------------------------------------------------------------------------- 1 | # SYSU_parallel_programming 2 | 中山大学计算机学院 并行程序设计与算法(课程记录) 3 | 4 | 授课老师:陶钧,黄聃 5 | -------------------------------------------------------------------------------- /复习资料/并行复习笔记.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/复习资料/并行复习笔记.pdf -------------------------------------------------------------------------------- /复习资料/并行程序设计导论.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/复习资料/并行程序设计导论.pdf -------------------------------------------------------------------------------- /复习资料/课程要点-CUDA编程.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/复习资料/课程要点-CUDA编程.pdf -------------------------------------------------------------------------------- /复习资料/课程要点-计算机体系结构-量化研究方法-第六版-第四章.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/复习资料/课程要点-计算机体系结构-量化研究方法-第六版-第四章.pdf -------------------------------------------------------------------------------- /复习资料/课程要点-高性能并行程序设计.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/复习资料/课程要点-高性能并行程序设计.pdf -------------------------------------------------------------------------------- /并行程序设计_lab0/gemm.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define NUM 100 6 | 7 | int M,N,K; 8 | 9 | double A[2048][2048]; 10 | double B[2048][2048]; 11 | double C[2048][2048]; 12 | 13 | 14 | 15 | void init_Mat(int M,int N,int K){ 16 | 17 | srand(233); 18 | 19 | for (int m=0;m 3 | #include 4 | void matrix_multiply(double**A,double**B,double**C,int M,int N,int K){ 5 | for(int m=0;m 4 | #include 5 | void 
matrix_multiply(double**A,double**B,double**C,int M,int N,int K); 6 | #endif -------------------------------------------------------------------------------- /并行程序设计_lab1/lib_code/readme.txt: -------------------------------------------------------------------------------- 1 | test.c文件为测试程序 -------------------------------------------------------------------------------- /并行程序设计_lab1/lib_code/test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab1/lib_code/test -------------------------------------------------------------------------------- /并行程序设计_lab1/lib_code/test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "matrix_multiply.h" 5 | #define NUM 100 6 | 7 | int M,N,K; 8 | 9 | 10 | int main(){ 11 | clock_t start_time,end_time; 12 | printf("input three integer(512 ~2048):\n"); 13 | scanf("%d %d %d",&M,&N,&K); 14 | 15 | 16 | double **A=(double**)malloc(sizeof(double*)*M); 17 | double **B=(double**)malloc(sizeof(double*)*N); 18 | double **C=(double**)malloc(sizeof(double*)*M); 19 | 20 | for(int i=0;i 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // print matrix 10 | void print_mat(int row,int col,double * matrix){ 11 | for(int i=0;i 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | 10 | 11 | // print matrix 12 | void print_mat(int row,int col,double * matrix){ 13 | for(int i=0;i 2 | #include 3 | #include 4 | 5 | #define THREAD_COUNT 8 // 线程数量 6 | #define ITERATIONS 1000000 // 迭代次数 7 | 8 | double sum = 0; 9 | pthread_mutex_t lock; //互斥锁 10 | 11 | void *calculate_area(void *thread_id_ptr) { 12 | int thread_id = *(int *) thread_id_ptr; // 获取线程ID 13 | std::random_device rd; 14 | std::mt19937 gen(rd()); 15 | std::uniform_real_distribution<> dis(0, 1); 16 | double local_sum = 0; 17 | 18 | 
for (int i = 0; i < ITERATIONS; i++) { 19 | double x = dis(gen); 20 | double y = dis(gen); 21 | // (x, y) 在 y=x^2 曲线下方,需要计入面积 22 | if (y <= x * x) { 23 | local_sum++; 24 | } 25 | } 26 | 27 | //将当前线程的计算结果加到 sum 变量中 28 | pthread_mutex_lock(&lock); 29 | sum += local_sum / ITERATIONS; 30 | pthread_mutex_unlock(&lock); 31 | 32 | pthread_exit(NULL); 33 | } 34 | 35 | int main() { 36 | pthread_t threads[THREAD_COUNT]; // 创建线程数组 37 | int thread_ids[THREAD_COUNT]; // 创建线程 ID 数组 38 | pthread_mutex_init(&lock, NULL); // 初始化互斥锁 39 | 40 | // 创建线程并执行计算 41 | for (int i = 0; i < THREAD_COUNT; i++) { 42 | thread_ids[i] = i; 43 | pthread_create(&threads[i], NULL, calculate_area, &thread_ids[i]); 44 | } 45 | 46 | // 等待所有线程完成计算 47 | for (int i = 0; i < THREAD_COUNT; i++) { 48 | pthread_join(threads[i], NULL); 49 | } 50 | 51 | pthread_mutex_destroy(&lock); 52 | std::cout << "function: y=x^2\nEstimated area: " << sum / THREAD_COUNT << std::endl; // 输出计算结果 53 | return 0; 54 | } 55 | -------------------------------------------------------------------------------- /并行程序设计_lab2/code/libparallel_for.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab2/code/libparallel_for.so -------------------------------------------------------------------------------- /并行程序设计_lab2/code/parallel_for.cpp: -------------------------------------------------------------------------------- 1 | #include"parallel_for.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | //并行循环函数parallel_for 8 | void parallel_for(int start, int end, int increment, void *(*functor)(void *), void *arg, int num_threads){ 9 | pthread_t *threads = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); 10 | for_index *index_arr = (for_index *)malloc(num_threads * sizeof(for_index)); 11 | 12 | //每个线程要处理的数据块大小 13 | int block = (end - start) / num_threads; 14 | //为每个线程分配参数 15 | for 
(int i = 0; i < num_threads; i++){ 16 | index_arr[i].args = arg; 17 | index_arr[i].start = start + i * block; 18 | index_arr[i].end = index_arr[i].start + block; 19 | //处理最后一个线程的数据块 20 | if (i == (num_threads - 1)) 21 | index_arr[i].end = end; 22 | index_arr[i].increment = increment; 23 | pthread_create(&threads[i], NULL, functor, (void *)(index_arr + i)); 24 | } 25 | //等待所有线程执行完成 26 | for (int thread = 0; thread < num_threads; thread++) 27 | pthread_join(threads[thread], NULL); 28 | free(threads); 29 | free(index_arr); 30 | } -------------------------------------------------------------------------------- /并行程序设计_lab2/code/parallel_for.h: -------------------------------------------------------------------------------- 1 | #ifndef PARALLEL_FOR_H 2 | #define PARALLEL_FOR_H 3 | #include 4 | 5 | // for 循环参数的结构体 6 | struct for_index{ 7 | void *args; 8 | int start; // 起始下标 9 | int end; //终止下标 10 | int increment; // 步长 11 | }; 12 | void parallel_for(int start, int end, int increment, void *(*functor)(void *), void *arg, int num_threads); 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /并行程序设计_lab2/code/parallel_for.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab2/code/parallel_for.o -------------------------------------------------------------------------------- /并行程序设计_lab2/code/test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab2/code/test -------------------------------------------------------------------------------- /并行程序设计_lab2/code/test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include"parallel_for.h" 
7 | 8 | using namespace std; 9 | 10 | int M,N,K; 11 | int Thread_NUM =4; 12 | double *A; 13 | double *B; 14 | double *C; 15 | 16 | //print matrix 17 | void print_mat(int row,int col,double * matrix){ 18 | for(int i=0;iargs); 63 | 64 | int K=matrix->k; 65 | int N=matrix->n; 66 | 67 | for (int m = idx->start; m < idx->end; m = m + idx->increment){ 68 | for (int k = 0; k < K; k++){ 69 | matrix->C[m * K + k] =0; 70 | for (int n = 0; n < N; n++){ 71 | matrix->C[m * K + k] += matrix->A[m * N + n] * matrix->B[n* K + k]; 72 | } 73 | } 74 | } 75 | return NULL; 76 | } 77 | 78 | int main(int argc, char *argv[]) 79 | { 80 | int M=atoi(argv[1]); 81 | int N=atoi(argv[2]); 82 | int K=atoi(argv[3]); 83 | Thread_NUM=atoi(argv[4]); 84 | 85 | init_Mat(M,N,K); 86 | 87 | struct args *arg = new args(); 88 | arg->A=A; 89 | arg->B=B; 90 | arg->C=C; 91 | arg->n=N; 92 | arg->m=M; 93 | arg->k=K; 94 | 95 | clock_t start_time=clock(); 96 | parallel_for(0, M, 1, gemm_fun, arg, Thread_NUM); 97 | clock_t end_time=clock(); 98 | double using_time=(double)(end_time-start_time)/CLOCKS_PER_SEC; 99 | 100 | cout<<"result:"< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | 10 | int Thread_NUM =4; 11 | 12 | using namespace std; 13 | 14 | int M,N,K; 15 | 16 | double **A; 17 | double **B; 18 | double **C; 19 | 20 | // init matrix 21 | void init_Mat(int M,int N,int K){ 22 | srand(243); 23 | A = new double* [M]; 24 | B = new double* [N]; 25 | C = new double* [M]; 26 | for (int m=0;m 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | int M,N,K; 9 | 10 | double **A; 11 | double **B; 12 | double **C; 13 | 14 | int thread_count; 15 | 16 | void init_Mat(int M,int N,int K){ 17 | srand(243); 18 | A = new double* [M]; 19 | B = new double* [N]; 20 | C = new double* [M]; 21 | for (int m=0;m 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "deal_binary.h" 9 | using std::string; 10 | 11 | const int ARRAY_SIZE[] = {5, 16, 128, 
1024, 2048, 3000,4000}; 12 | const int ELEMENT_RANGE = 16; 13 | const int WINDOW_SIZE = 5; 14 | const int NUM_ARRAYS = sizeof(ARRAY_SIZE) / sizeof(ARRAY_SIZE[0]); 15 | const int KERNEL_RADIUS = WINDOW_SIZE / 2; 16 | 17 | // 生成随机二维数组 18 | void generateRandomArray(std::vector >& array,int row,int col) { 19 | array.resize(row); 20 | for(int i=0;i(8234)); 24 | for (int i = 0; i < row; i++) { 25 | for (int j = 0; j < col; j++) { 26 | array[i][j] = rand() % ELEMENT_RANGE; 27 | } 28 | //printf("\n"); 29 | } 30 | } 31 | 32 | 33 | // 计算熵 34 | float calculateEntropy(const std::vector >& array, int x, int y) { 35 | std::vector counts(ELEMENT_RANGE, 0); 36 | 37 | int startX = std::max(0, x - WINDOW_SIZE / 2); 38 | int startY = std::max(0, y - WINDOW_SIZE / 2); 39 | int endX = std::min(static_cast(array.size()) - 1, x + WINDOW_SIZE / 2); 40 | int endY = std::min(static_cast(array[0].size()) - 1, y + WINDOW_SIZE / 2); 41 | 42 | //printf("(%d %d),(%d %d)\n",startX,startY,endX,endY); 43 | 44 | for (int i = startX; i <= endX; i++) { 45 | for (int j = startY; j <= endY; j++) { 46 | counts[array[i][j]]++; 47 | } 48 | } 49 | 50 | float entropy = 0.0; 51 | int windowSize = (endX - startX + 1) * (endY - startY + 1); 52 | //printf("%d\n",windowSize); 53 | for (int i = 0; i < ELEMENT_RANGE; i++) { 54 | float probability = float(counts[i]) / windowSize; 55 | if (counts[i]!=0) { 56 | //printf("%d ",i); 57 | entropy -= probability * log2(probability); 58 | } 59 | } 60 | //printf("\n"); 61 | return entropy; 62 | } 63 | 64 | int main() { 65 | // 设置随机种子 66 | srand(static_cast(time(NULL))); 67 | std::vector > array; 68 | 69 | std::vector res; 70 | read(array,res,"test/test.in","test/test.out"); 71 | 72 | int row=array.size(); 73 | int col=array[0].size(); 74 | 75 | //int row=200; 76 | //int col=500; 77 | //generateRandomArray(array,row,col); 78 | std::vector > entropyArray; 79 | entropyArray.resize(row); 80 | for(int i=0;i > array; 103 | generateRandomArray(array, ARRAY_SIZE[i]); 104 | 105 | 
// Number of precomputed entries: log2(1) .. log2(LOG_TABLE_SIZE).
const int LOG_TABLE_SIZE = 25;

// Builds the table {log2(1), log2(2), ..., log2(LOG_TABLE_SIZE)}; entry n - 1 holds log2(n).
static std::vector<double> buildLogTable() {
    std::vector<double> table(LOG_TABLE_SIZE);
    for (int n = 1; n <= LOG_TABLE_SIZE; n++) {
        table[n - 1] = log2(static_cast<double>(n));
    }
    return table;
}

// Lookup table of base-2 logarithms. It is filled during static initialisation,
// so lookupLog() is correct even when initializeLogTable() has not been called yet.
std::vector<double> logTable = buildLogTable();

// Initialises (or re-initialises) the logarithm table. Calling it repeatedly is harmless.
void initializeLogTable() {
    logTable = buildLogTable();
}

// Returns log2(n).
// Arguments in [1, LOG_TABLE_SIZE] are served from the table; anything else is
// computed directly, so n <= 0 yields whatever log2() reports for it.
double lookupLog(int n) {
    const bool inTable = (n >= 1 && n <= LOG_TABLE_SIZE);
    if (!inTable) {
        // Outside the precomputed range.
        return log2(static_cast<double>(n));
    }
    return logTable[n - 1];
}
// CUDA kernel: for every element (r, c) of the row-major `row` x `col` matrix
// `input`, writes to output[r * col + c] the base-2 Shannon entropy of the values
// found in the 5x5 window centred on that element.
//
// Parameters:
//   input   device pointer to row * col ints
//   output  device pointer to row * col floats, one entropy per element
//   row     number of matrix rows
//   col     number of matrix columns
//
// Thread mapping: the x dimension of the launch selects the row and the y
// dimension selects the column. Threads that fall outside the matrix do nothing.
// The window is clipped at the matrix borders, so border elements use fewer
// samples. Values outside [0, 16) are ignored instead of being used as a
// histogram index.
// NOTE(review): with x mapped to the row, consecutive threadIdx.x values touch
// addresses `col` elements apart. Swapping the mapping would also require
// swapping the grid dimensions at the launch site, so it is left unchanged here.
__global__ void calculateEntropy(int* input, float* output, int row,int col)
{
    const int c = blockIdx.y * blockDim.y + threadIdx.y;
    const int r = blockIdx.x * blockDim.x + threadIdx.x;

    if (r >= row || c >= col)
        return;

    const int kRadius = 2;   // half of the 5x5 window
    const int kLevels = 16;  // number of histogram bins

    // Window bounds, clipped to the matrix.
    const int windowStartRow = max(r - kRadius, 0);
    const int windowStartCol = max(c - kRadius, 0);
    const int windowEndRow = min(r + kRadius, row - 1);
    const int windowEndCol = min(c + kRadius, col - 1);

    // Histogram of the values inside the window.
    int frequency[kLevels] = { 0 };
    int samples = 0;
    for (int i = windowStartRow; i <= windowEndRow; i++)
    {
        for (int j = windowStartCol; j <= windowEndCol; j++)
        {
            const int value = input[i * col + j];
            if (value >= 0 && value < kLevels)
            {
                frequency[value]++;
                samples++;
            }
        }
    }

    // Entropy: -sum p * log2(p) over the bins that occur.
    float entropy = 0.0f;
    for (int k = 0; k < kLevels; k++)
    {
        if (frequency[k] > 0)
        {
            const float prob = static_cast<float>(frequency[k]) / samples;
            entropy -= prob * log2f(prob);
        }
    }

    output[r * col + c] = entropy;
}
// CUDA kernel: for every element of the row-major `row` x `col` matrix `input`,
// writes to `output` (same linear index) the base-2 Shannon entropy of the values
// found in the 5x5 window centred on that element.
//
// Parameters:
//   input   device pointer to row * col ints
//   output  device pointer to row * col floats, one entropy per element
//   row     number of matrix rows
//   col     number of matrix columns
//
// Thread mapping: one thread per element, using only the x dimension of the
// launch; the linear thread index is split into (row, column). Threads whose
// index is past the last element do nothing.
// The window is clipped at the matrix borders, so border elements use fewer
// samples. Values outside [0, 16) are ignored instead of being used as a
// histogram index.
__global__ void calculateEntropy(int* input, float* output, int row,int col)
{
    const int index = blockIdx.x * blockDim.x + threadIdx.x;

    if (index >= row * col)
        return;

    const int kRadius = 2;   // half of the 5x5 window
    const int kLevels = 16;  // number of histogram bins

    const int r = index / col;
    const int c = index % col;

    // Window bounds, clipped to the matrix.
    const int windowStartRow = max(r - kRadius, 0);
    const int windowStartCol = max(c - kRadius, 0);
    const int windowEndRow = min(r + kRadius, row - 1);
    const int windowEndCol = min(c + kRadius, col - 1);

    // Histogram of the values inside the window.
    int frequency[kLevels] = { 0 };
    int samples = 0;
    for (int i = windowStartRow; i <= windowEndRow; i++)
    {
        for (int j = windowStartCol; j <= windowEndCol; j++)
        {
            const int value = input[i * col + j];
            if (value >= 0 && value < kLevels)
            {
                frequency[value]++;
                samples++;
            }
        }
    }

    // Entropy: -sum p * log2(p) over the bins that occur.
    float entropy = 0.0f;
    for (int k = 0; k < kLevels; k++)
    {
        if (frequency[k] > 0)
        {
            const float prob = static_cast<float>(frequency[k]) / samples;
            entropy -= prob * log2f(prob);
        }
    }

    output[index] = entropy;
}
// CUDA kernel: for every element of the row-major `row` x `col` matrix `input`,
// writes to `output` (same linear index) the base-2 Shannon entropy of the values
// found in the 5x5 window centred on that element. Logarithms come from a
// lookup table: p * log2(p) is evaluated as p * (log2(count) - log2(samples)).
//
// Parameters:
//   input   device pointer to row * col ints
//   output  device pointer to row * col floats, one entropy per element
//   row     number of matrix rows
//   col     number of matrix columns
//
// Thread mapping: one thread per element, using only the x dimension of the
// launch. Threads whose index is past the last element only help to build the
// table.
// The log2 table is built once per block in shared memory (each thread fills a
// strided subset of the 25 entries) instead of once per thread.
// The window is clipped at the matrix borders, so border elements use fewer
// samples. Values outside [0, 16) are ignored instead of being used as a
// histogram index.
__global__ void calculateEntropy(int* input, float* output, int row,int col)
{
    const int kRadius = 2;      // half of the 5x5 window
    const int kLevels = 16;     // number of histogram bins
    const int kMaxWindow = 25;  // largest possible number of samples in a window

    // logTable[k] == log2f(k) for k in [1, kMaxWindow]; entry 0 is never read.
    __shared__ float logTable[kMaxWindow + 1];
    for (int k = threadIdx.x + 1; k <= kMaxWindow; k += blockDim.x)
    {
        logTable[k] = log2f(static_cast<float>(k));
    }
    // Every thread of the block must reach this barrier, so the range check comes after it.
    __syncthreads();

    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= row * col)
        return;

    const int r = index / col;
    const int c = index % col;

    // Window bounds, clipped to the matrix.
    const int windowStartRow = max(r - kRadius, 0);
    const int windowStartCol = max(c - kRadius, 0);
    const int windowEndRow = min(r + kRadius, row - 1);
    const int windowEndCol = min(c + kRadius, col - 1);

    // Histogram of the values inside the window.
    int frequency[kLevels] = { 0 };
    int samples = 0;
    for (int i = windowStartRow; i <= windowEndRow; i++)
    {
        for (int j = windowStartCol; j <= windowEndCol; j++)
        {
            const int value = input[i * col + j];
            if (value >= 0 && value < kLevels)
            {
                frequency[value]++;
                samples++;
            }
        }
    }

    // Entropy. Counts and the sample total never exceed kMaxWindow, so the table always applies.
    float entropy = 0.0f;
    for (int k = 0; k < kLevels; k++)
    {
        const int count = frequency[k];
        if (count == 0)
            continue;
        const float prob = static_cast<float>(count) / samples;
        // The difference is taken in double precision so that the results match
        // the earlier per-thread table, which stored these values as doubles.
        entropy -= prob * (static_cast<double>(logTable[count]) - static_cast<double>(logTable[samples]));
    }

    output[index] = entropy;
}
84 | for (int j = 0; j < size * size; j++) 85 | hostInput[j] = rand() % 16; 86 | 87 | int* deviceInput; 88 | cudaMalloc((void**)&deviceInput, size * size * sizeof(int)); 89 | cudaMemcpy(deviceInput, hostInput, size * size * sizeof(int), cudaMemcpyHostToDevice); 90 | 91 | float* hostOutput = new float[size * size]; 92 | float* deviceOutput; 93 | cudaMalloc((void**)&deviceOutput, size * size * sizeof(float)); 94 | 95 | // 定义CUDA的网格和块大小 96 | int gridSize = (size * size + BLOCK_SIZE - 1) / BLOCK_SIZE; 97 | int blockSize = BLOCK_SIZE; 98 | 99 | // 预计算对数表 100 | float logTable[25]; 101 | for (int k = 0; k < 25; k++) 102 | { 103 | logTable[k] = log2f(static_cast(k + 1)); 104 | //printf("%f ",logTable[k]); 105 | } 106 | //printf("\n"); 107 | 108 | clock_t start, finish; 109 | start = clock(); 110 | // 调用CUDA核函数 111 | calculateEntropy<<>>(deviceInput, deviceOutput, size, logTable); 112 | 113 | cudaMemcpy(hostOutput, deviceOutput, size * size * sizeof(float), cudaMemcpyDeviceToHost); 114 | finish = clock(); 115 | // 输出结果 116 | std::cout << "Array Size: " << size << " using time: " << 1000 * double(finish - start) / CLOCKS_PER_SEC << " ms" << std::endl; 117 | 118 | // 释放内存 119 | delete[] hostInput; 120 | delete[] hostOutput; 121 | cudaFree(deviceInput); 122 | cudaFree(deviceOutput); 123 | } 124 | */ 125 | std::vector > array; 126 | std::vector res; 127 | read(array,res,"test/test.in","test/test.out"); 128 | int row=array.size(); 129 | int col=array[0].size(); 130 | 131 | 132 | int* hostInput = new int[row*col]; 133 | for(int i=0;i>>(deviceInput, deviceOutput, row,col); 161 | cudaMemcpy(hostOutput, deviceOutput, row* col * sizeof(float), cudaMemcpyDeviceToHost); 162 | 163 | 164 | std::vector > entropyArray; 165 | entropyArray.resize(row); 166 | for(int i=0;i 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | using std::string; 9 | 10 | void read(std::vector >& array,std::vector& res,string instr,string out_str){ 11 | int row_size, col_size; 12 | 
std::vector matrix_test; 13 | std::ifstream inFile(instr, std::ios::binary); 14 | if (inFile.is_open()) { 15 | // First, read the size of the matrix (ROWS and COLS). 16 | inFile.read(reinterpret_cast(&row_size),sizeof(row_size)); 17 | inFile.read(reinterpret_cast(&col_size),sizeof(col_size)); 18 | // Resize the matrix based on the size read from thefile. 19 | matrix_test.resize(row_size * col_size); 20 | // Then, read the actual data of the matrix. 21 | inFile.read(reinterpret_cast 22 | (matrix_test.data()), matrix_test.size() * sizeof(int)); 23 | } 24 | 25 | array.resize(row_size); 26 | for(int i=0;i(res.data()), res.size() * sizeof(float)); 44 | } 45 | } 46 | 47 | 48 | 49 | void write(std::vector >& array,std::vector > &res,string in_str,string out_str){ 50 | std::ofstream inFile(in_str, std::ios::binary); 51 | std::ofstream outFile(out_str, std::ios::binary); 52 | 53 | int row=array.size(); 54 | int col=array[0].size(); 55 | 56 | std::vector in_matrix; 57 | std::vector entropy_matrix; 58 | in_matrix.resize(row*col+2); 59 | entropy_matrix.resize(row*col); 60 | 61 | in_matrix[0]=row; 62 | in_matrix[1]=col; 63 | 64 | for(int i=0;i(in_matrix.data()), in_matrix.size() *sizeof(int)); 74 | inFile.close(); 75 | } 76 | else { 77 | std::cout << "Unable to open file"; 78 | } 79 | 80 | //写熵矩阵 81 | if (outFile.is_open()) { 82 | outFile.write(reinterpret_cast(entropy_matrix.data()), entropy_matrix.size() *sizeof(float)); 83 | outFile.close(); 84 | } 85 | else { 86 | std::cout << "Unable to open file"; 87 | } 88 | } -------------------------------------------------------------------------------- /并行程序设计_lab3/code/openmp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "deal_binary.h" 9 | using std::string; 10 | #define ELEMENT_RANGE 16 11 | 12 | const int WINDOW_SIZE = 5; 13 | // 生成随机二维数组 14 | void generateRandomArray(std::vector >& 
array, int size) { 15 | srand(static_cast(1234)); 16 | array.resize(size, std::vector(size)); 17 | for (int i = 0; i < size; i++) { 18 | for (int j = 0; j < size; j++) { 19 | array[i][j] = rand() % ELEMENT_RANGE; 20 | } 21 | //printf("\n"); 22 | } 23 | } 24 | 25 | 26 | // 计算窗口中的熵 27 | double calculateEntropy(const std::vector >& array, int x, int y) { 28 | std::vector counts(ELEMENT_RANGE, 0); 29 | 30 | int startX = std::max(0, x - WINDOW_SIZE / 2); 31 | int startY = std::max(0, y - WINDOW_SIZE / 2); 32 | int endX = std::min(static_cast(array.size()) - 1, x + WINDOW_SIZE / 2); 33 | int endY = std::min(static_cast(array[0].size()) - 1, y + WINDOW_SIZE / 2); 34 | 35 | //printf("(%d %d),(%d %d)\n",startX,startY,endX,endY); 36 | 37 | for (int i = startX; i <= endX; i++) { 38 | for (int j = startY; j <= endY; j++) { 39 | counts[array[i][j]]++; 40 | } 41 | } 42 | 43 | float entropy = 0.0; 44 | int windowSize = (endX - startX + 1) * (endY - startY + 1); 45 | //printf("%d\n",windowSize); 46 | for (int i = 0; i < ELEMENT_RANGE; i++) { 47 | float probability = float(counts[i]) / windowSize; 48 | if (counts[i]!=0) { 49 | //printf("%d ",i); 50 | entropy -= probability * log2(probability); 51 | } 52 | } 53 | //printf("\n"); 54 | return entropy; 55 | } 56 | 57 | int main() { 58 | // 设置随机种子 59 | std::srand(static_cast(1234)); 60 | /* 61 | // 定义数组大小 62 | int sizesArr[] = {5, 16, 128, 1024, 2048, 3000,4000}; 63 | std::vector sizes; 64 | for(int i=0;i<7;i++) 65 | sizes.push_back(sizesArr[i]); 66 | // 并行计算熵 67 | #pragma omp parallel for num_threads(40) 68 | //#pragma omp parallel for collapse(24) 69 | for (int i=0;i > array(size, std::vector(size)); 73 | generateRandomArray(array,size); 74 | 75 | 76 | clock_t start, finish; 77 | start = clock(); 78 | // 计算熵 79 | std::vector > entropyArray(size, std::vector(size)); 80 | for (int i = 0; i < size; ++i) { 81 | for (int j = 0; j < size; ++j) { 82 | entropyArray[i][j] = calculateEntropy(array, i, j); 83 | } 84 | } 85 | finish=clock(); 86 | 
// Number of precomputed entries: log2(1) .. log2(LOG_TABLE_SIZE).
const int LOG_TABLE_SIZE = 25;

// Builds the table {log2(1), log2(2), ..., log2(LOG_TABLE_SIZE)}; entry n - 1 holds log2(n).
static std::vector<double> buildLogTable() {
    std::vector<double> table(LOG_TABLE_SIZE);
    for (int n = 1; n <= LOG_TABLE_SIZE; n++) {
        table[n - 1] = log2(static_cast<double>(n));
    }
    return table;
}

// Lookup table of base-2 logarithms. It is filled during static initialisation,
// so lookupLog() is correct even when initializeLogTable() has not been called yet.
std::vector<double> logTable = buildLogTable();

// Side length of the square window used for the entropy computation.
const int WINDOW_SIZE = 5;

// Initialises (or re-initialises) the logarithm table. Calling it repeatedly is harmless.
void initializeLogTable() {
    logTable = buildLogTable();
}

// Returns log2(n).
// Arguments in [1, LOG_TABLE_SIZE] are served from the table; anything else is
// computed directly, so n <= 0 yields whatever log2() reports for it.
double lookupLog(int n) {
    const bool inTable = (n >= 1 && n <= LOG_TABLE_SIZE);
    if (!inTable) {
        // Outside the precomputed range.
        return log2(static_cast<double>(n));
    }
    return logTable[n - 1];
}
//printf("%d\n",windowSize); 65 | for (int i = 0; i < ELEMENT_RANGE; i++) { 66 | float probability = float(counts[i]) / windowSize; 67 | if (counts[i]!=0) { 68 | //printf("%d ",i); 69 | entropy -= probability * (lookupLog(counts[i])-lookupLog(windowSize)); 70 | } 71 | } 72 | //printf("\n"); 73 | return entropy; 74 | } 75 | 76 | int main() { 77 | // 设置随机种子 78 | std::srand(static_cast(1234)); 79 | // 初始化对数表 80 | initializeLogTable(); 81 | /* 82 | // 定义数组大小 83 | int sizesArr[] = {5, 16, 128, 1024, 2048, 3000,4000}; 84 | std::vector sizes; 85 | for(int i=0;i<7;i++) 86 | sizes.push_back(sizesArr[i]); 87 | // 并行计算熵 88 | #pragma omp parallel for num_threads(40) 89 | //#pragma omp parallel for collapse(24) 90 | for (int i=0;i > array(size, std::vector(size)); 94 | generateRandomArray(array,size); 95 | 96 | 97 | clock_t start, finish; 98 | start = clock(); 99 | // 计算熵 100 | std::vector > entropyArray(size, std::vector(size)); 101 | for (int i = 0; i < size; ++i) { 102 | for (int j = 0; j < size; ++j) { 103 | entropyArray[i][j] = calculateEntropy(array, i, j); 104 | } 105 | } 106 | finish=clock(); 107 | // 输出结果 108 | std::cout << "Array Size: " << size << " using time: "<<1000*double(finish - start) / CLOCKS_PER_SEC<<" ms"< > array; 114 | std::vector res; 115 | read(array,res,"test/test.in","test/test.out"); 116 | int row=array.size(); 117 | int col=array[0].size(); 118 | std::vector > entropyArray; 119 | entropyArray.resize(row); 120 | for(int i=0;i 2 | #include 3 | #include 4 | #include 5 | #define BLOCK_SIZE 16 6 | #include "deal_binary.h" 7 | using std::string; 8 | 9 | // CUDA核函数,计算以每个元素为中心的窗口中的熵 10 | // 计算每个元素的熵的核函数(使用共享内存优化) 11 | __global__ void calculateEntropy_share(int* input, float* output, int width, int height) { 12 | int row = blockIdx.x * blockDim.x + threadIdx.x; 13 | int col = blockIdx.y * blockDim.y + threadIdx.y; 14 | // 定义共享内存 15 | __shared__ int shared_input[8 + 4][8 + 4]; 16 | // 计算线程在共享内存中的索引 17 | int shared_row = threadIdx.x + 2; 18 | int shared_col = 
threadIdx.y + 2; 19 | // 将数据从全局内存复制到共享内存 20 | if (row < height && col < width) { 21 | int global_index = col * height + row; 22 | shared_input[shared_row][shared_col] = input[global_index]; 23 | } 24 | // 线程同步,确保数据复制完成 25 | __syncthreads(); 26 | if (row < height && col < width) { 27 | float entropy = 0; 28 | // 记录每个数字出现过的次数 29 | int record[16]; 30 | // 窗口内元素总数 31 | int count = 0, x, y; 32 | for (int i = 0; i < 16; i++) { 33 | record[i] = 0; 34 | } 35 | for (int i = -2; i <= 2; i++) { 36 | for (int j = -2; j <= 2; j++) { 37 | x = shared_col + i; 38 | y = shared_row + j; 39 | printf("%d %d\n",x,y); 40 | int value = shared_input[y][x]; 41 | record[value]++; 42 | count++; 43 | } 44 | } 45 | // 计算熵值 46 | for (int i = 0; i < 16; i++) { 47 | //entropy -= (float)record[i] * (log_table[record[i]]-log_table[count]) / count; 48 | float prob = (float)(record[i]) / count; 49 | //printf("%f\n",prob); 50 | if (prob > 0.0f) 51 | entropy -= prob * log2f(prob); 52 | } 53 | output[col * height + row] = entropy; 54 | } 55 | } 56 | 57 | // CUDA核函数,计算以每个元素为中心的窗口中的熵 58 | __global__ void calculateEntropy(int* input, float* output, int row,int col) 59 | { 60 | int c = blockIdx.y * blockDim.y + threadIdx.y; 61 | int r = blockIdx.x * blockDim.x + threadIdx.x; 62 | 63 | if (r < row && c < col) 64 | { 65 | int windowSize = 5; 66 | int windowStartRow = r - 2; 67 | int windowStartCol = c - 2; 68 | int windowEndRow = windowStartRow + 4; 69 | int windowEndCol = windowStartCol + 4; 70 | 71 | // 边界处理 72 | if (windowStartRow < 0) 73 | windowStartRow = 0; 74 | if (windowStartCol < 0) 75 | windowStartCol = 0; 76 | if (windowEndRow >= row) 77 | windowEndRow = row - 1; 78 | if (windowEndCol >= col) 79 | windowEndCol = col - 1; 80 | 81 | float entropy = 0.0f; 82 | int windowElements = (windowEndRow - windowStartRow + 1) * (windowEndCol - windowStartCol + 1); 83 | 84 | // 计算窗口内元素的频率 85 | int frequency[16] = { 0 }; 86 | for (int i = windowStartRow; i <= windowEndRow; i++) 87 | { 88 | for (int j = 
windowStartCol; j <= windowEndCol; j++) 89 | { 90 | int value = input[i * col + j]; 91 | frequency[value]++; 92 | } 93 | } 94 | 95 | // 计算熵 96 | for (int k = 0; k < 16; k++) 97 | { 98 | float prob = static_cast(frequency[k]) / windowElements; 99 | if (prob > 0.0f) 100 | entropy -= prob * log2f(prob); 101 | } 102 | 103 | output[r * col + c] = entropy; 104 | } 105 | } 106 | 107 | int main() 108 | { 109 | int sizes[] = { 5, 16, 128, 1024, 2048, 3000 ,4000}; 110 | srand(static_cast(1234)); 111 | std::vector > array; 112 | std::vector res; 113 | read(array,res,"test/test.in","test/test.out"); 114 | int row=array.size(); 115 | int col=array[0].size(); 116 | 117 | 118 | int* hostInput = new int[row*col]; 119 | for(int i=0;i>>(deviceInput, deviceOutput, row,col); 139 | //calculateEntropy<< > >(deviceInput, deviceOutput, row,col); 140 | cudaMemcpy(hostOutput, deviceOutput, row* col * sizeof(float), cudaMemcpyDeviceToHost); 141 | 142 | std::vector > entropyArray; 143 | entropyArray.resize(row); 144 | for(int i=0;i >& array,std::vector& res){ 5 | int row_size, col_size; 6 | std::vector matrix_test; 7 | std::ifstream inFile("output/baseline.in", std::ios::binary); 8 | if (inFile.is_open()) { 9 | // First, read the size of the matrix (ROWS and COLS). 10 | inFile.read(reinterpret_cast(&row_size),sizeof(row_size)); 11 | inFile.read(reinterpret_cast(&col_size),sizeof(col_size)); 12 | // Resize the matrix based on the size read from thefile. 13 | matrix_test.resize(row_size * col_size); 14 | // Then, read the actual data of the matrix. 
15 | inFile.read(reinterpret_cast 16 | (matrix_test.data()), matrix_test.size() * sizeof(int)); 17 | } 18 | 19 | array.resize(row_size); 20 | for(int i=0;i(res.data()), res.size() * sizeof(float)); 38 | } 39 | } 40 | 41 | int main(){ 42 | std::vector > array; 43 | std::vector res; 44 | read(array,res,"output/baseline.in","output/baseline.out"); 45 | 46 | std::vector > array2; 47 | std::vector res2; 48 | read_my_res(array2,res2); 49 | 50 | int row=array.size(); 51 | int col=array[0].size(); 52 | 53 | for(int i=0;i1e-5) 57 | printf("%f %f\n",res[i*col+j],res2[i*col+j]); 58 | } 59 | } 60 | 61 | 62 | 63 | return 0; 64 | } -------------------------------------------------------------------------------- /并行程序设计_lab3/code/test/test.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/code/test/test.in -------------------------------------------------------------------------------- /并行程序设计_lab3/code/test/test.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/code/test/test.out -------------------------------------------------------------------------------- /并行程序设计_lab3/output/baseline.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/baseline.in -------------------------------------------------------------------------------- /并行程序设计_lab3/output/baseline.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/baseline.out 
-------------------------------------------------------------------------------- /并行程序设计_lab3/output/baseline_lu.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/baseline_lu.in -------------------------------------------------------------------------------- /并行程序设计_lab3/output/baseline_lu.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/baseline_lu.out -------------------------------------------------------------------------------- /并行程序设计_lab3/output/cuda_2d.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_2d.in -------------------------------------------------------------------------------- /并行程序设计_lab3/output/cuda_2d.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_2d.out -------------------------------------------------------------------------------- /并行程序设计_lab3/output/cuda_bl.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_bl.in -------------------------------------------------------------------------------- /并行程序设计_lab3/output/cuda_bl.out: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_bl.out -------------------------------------------------------------------------------- /并行程序设计_lab3/output/cuda_bl_lu.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_bl_lu.in -------------------------------------------------------------------------------- /并行程序设计_lab3/output/cuda_bl_lu.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/cuda_bl_lu.out -------------------------------------------------------------------------------- /并行程序设计_lab3/output/openmp.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/openmp.in -------------------------------------------------------------------------------- /并行程序设计_lab3/output/openmp.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/openmp.out -------------------------------------------------------------------------------- /并行程序设计_lab3/output/openmp_lu.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/openmp_lu.in -------------------------------------------------------------------------------- /并行程序设计_lab3/output/openmp_lu.out: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/openmp_lu.out -------------------------------------------------------------------------------- /并行程序设计_lab3/output/share_mem.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/share_mem.in -------------------------------------------------------------------------------- /并行程序设计_lab3/output/share_mem.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/share_mem.out -------------------------------------------------------------------------------- /并行程序设计_lab3/output/test0.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/output/test0.out -------------------------------------------------------------------------------- /并行程序设计_lab3/readme.txt: -------------------------------------------------------------------------------- 1 | output文件夹中是我随机生成的输入文件(大小和数组)in和对应程序的输出文件(中心熵矩阵)out,格式和要求的一致 2 | code文件夹为源代码 -------------------------------------------------------------------------------- /并行程序设计_lab3/并行程序设计_20337025_崔璨明.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab3/并行程序设计_20337025_崔璨明.pdf -------------------------------------------------------------------------------- /并行程序设计_lab4/code/matrix_vector_mul.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "read_data.h" 5 | 6 | __global__ void matrixVectorMul(float* A, float* b, float* c, int rows, int cols) { 7 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 8 | if (tid < rows) { 9 | float sum = 0.0f; 10 | for (int j = 0; j < cols; j++) { 11 | sum += A[tid * cols + j] * b[j]; 12 | } 13 | c[tid] = sum; 14 | } 15 | } 16 | 17 | void matrixVectorMultiplication(std::vector& A, std::vector& b, std::vector& c, int rows, int cols) { 18 | // Device memory allocation 19 | float *d_A, *d_b, *d_c; 20 | cudaMalloc((void**)&d_A, rows * cols * sizeof(float)); 21 | cudaMalloc((void**)&d_b, cols * sizeof(float)); 22 | cudaMalloc((void**)&d_c, rows * sizeof(float)); 23 | 24 | // Copy data from host to device 25 | cudaMemcpy(d_A, A.data(), rows * cols * sizeof(float), cudaMemcpyHostToDevice); 26 | cudaMemcpy(d_b, b.data(), cols * sizeof(float), cudaMemcpyHostToDevice); 27 | 28 | // Launch kernel 29 | int blockSize = 256; 30 | int gridSize = (rows + blockSize - 1) / blockSize; 31 | matrixVectorMul<<>>(d_A, d_b, d_c, rows, cols); 32 | 33 | // Copy result from device to host 34 | cudaMemcpy(c.data(), d_c, rows * sizeof(float), cudaMemcpyDeviceToHost); 35 | 36 | // Free device memory 37 | cudaFree(d_A); 38 | cudaFree(d_b); 39 | cudaFree(d_c); 40 | } 41 | 42 | int main() { 43 | //修改这里的路径,读入二进制文件和输出二进制文件 44 | string read_dir="data/test5.in"; 45 | string save_dir="output/res5.out"; 46 | std::vector A ; 47 | std::vector >array_2d; 48 | std::vector b ; 49 | 50 | read(A,array_2d,b,read_dir); 51 | //generate_data(A,array_2d,b,4096); 52 | 53 | int rows = array_2d.size(); 54 | int cols = array_2d[0].size(); 55 | 56 | std::vector c(rows); 57 | 58 | clock_t start, finish; 59 | start = clock(); 60 | matrixVectorMultiplication(A, b, c, rows, cols); 61 | finish=clock(); 62 | 63 | // Print result 64 | std::cout << "Result: "; 65 | for (int i = 0; i < rows; i++) { 66 | 
printf("%.5f ",c[i]); 67 | } 68 | std::cout << std::endl; 69 | // Print using time 70 | std::cout <<"using time: "<<1000*double(finish - start) / CLOCKS_PER_SEC<<" ms"< 4 | #include 5 | #include 6 | #include "read_data.h" 7 | 8 | __constant__ float d_b[2048]; 9 | 10 | __global__ void matrixVectorMul(float* A,float* c, int rows, int cols) { 11 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 12 | if (tid < rows) { 13 | float sum = 0.0f; 14 | for (int j = 0; j < cols; j++) { 15 | sum += A[tid * cols + j] * d_b[j]; 16 | } 17 | c[tid] = sum; 18 | } 19 | } 20 | 21 | 22 | void matrixVectorMultiplication(const std::vector& A, const std::vector& b, std::vector& c, int rows, int cols) { 23 | // Device memory allocation 24 | float *d_A, *d_c; 25 | cudaMalloc((void**)&d_A, rows * cols * sizeof(float)); 26 | cudaMalloc((void**)&d_c, rows * sizeof(float)); 27 | 28 | // Copy data from host to device 29 | cudaMemcpy(d_A, A.data(), rows * cols * sizeof(float), cudaMemcpyHostToDevice); 30 | cudaMemcpyToSymbol(d_b, b.data(), cols * sizeof(float)); 31 | 32 | // Launch kernel 33 | int blockSize = 256; 34 | int gridSize = (rows + blockSize - 1) / blockSize; 35 | matrixVectorMul<<>>(d_A, d_c, rows, cols); 36 | 37 | // Copy result from device to host 38 | cudaMemcpy(c.data(), d_c, rows * sizeof(float), cudaMemcpyDeviceToHost); 39 | 40 | // Free device memory 41 | cudaFree(d_A); 42 | cudaFree(d_c); 43 | } 44 | 45 | 46 | int main() { 47 | string read_dir="data/test1.in"; 48 | string save_dir="output/res1.out"; 49 | std::vector A ; 50 | std::vector >array_2d; 51 | std::vector b ; 52 | read(A,array_2d,b,read_dir); 53 | //generate_data(A,array_2d,b,2048); 54 | 55 | int rows = array_2d.size(); 56 | int cols = array_2d[0].size(); 57 | 58 | std::vector c(rows); 59 | 60 | clock_t start, finish; 61 | start = clock(); 62 | matrixVectorMultiplication(A, b, c, rows, cols); 63 | finish=clock(); 64 | 65 | // Print result 66 | std::cout << "Result: "; 67 | for (int i = 0; i < rows; i++) { 68 | 
printf("%.5f ",c[i]); 69 | } 70 | std::cout << std::endl; 71 | // Print using time 72 | std::cout <<"using time: "<<1000*double(finish - start) / CLOCKS_PER_SEC<<" ms"< 4 | #include 5 | #include 6 | #include "read_data.h" 7 | 8 | texture texA; 9 | texture texB; 10 | 11 | __global__ void matrixVectorMul(float* c, int rows, int cols) { 12 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 13 | if (tid < rows) { 14 | float sum = 0.0f; 15 | for (int j = 0; j < cols; j++) { 16 | sum += tex1Dfetch(texA, tid * cols + j) * tex1Dfetch(texB, j); 17 | } 18 | c[tid] = sum; 19 | } 20 | } 21 | 22 | void matrixVectorMultiplication(std::vector& A, std::vector& b, std::vector& c, int rows, int cols) { 23 | // Device memory allocation 24 | float *d_A, *d_b, *d_c; 25 | cudaMalloc((void**)&d_A, rows * cols * sizeof(float)); 26 | cudaMalloc((void**)&d_b, cols * sizeof(float)); 27 | cudaMalloc((void**)&d_c, rows * sizeof(float)); 28 | 29 | // Copy data from host to device 30 | cudaMemcpy(d_A, A.data(), rows * cols * sizeof(float), cudaMemcpyHostToDevice); 31 | cudaMemcpy(d_b, b.data(), cols * sizeof(float), cudaMemcpyHostToDevice); 32 | 33 | // Bind texture memory 34 | cudaBindTexture(NULL, texA, d_A, rows * cols * sizeof(float)); 35 | cudaBindTexture(NULL, texB, d_b, cols * sizeof(float)); 36 | 37 | // Launch kernel 38 | int blockSize = 256; 39 | int gridSize = (rows + blockSize - 1) / blockSize; 40 | matrixVectorMul<<>>(d_c, rows, cols); 41 | 42 | // Copy result from device to host 43 | cudaMemcpy(c.data(), d_c, rows * sizeof(float), cudaMemcpyDeviceToHost); 44 | 45 | // Unbind texture memory 46 | cudaUnbindTexture(texA); 47 | cudaUnbindTexture(texB); 48 | 49 | // Free device memory 50 | cudaFree(d_A); 51 | cudaFree(d_b); 52 | cudaFree(d_c); 53 | } 54 | 55 | int main() { 56 | string read_dir="data/test1.in"; 57 | string save_dir="output/res1.out"; 58 | std::vector A ; 59 | std::vector >array_2d; 60 | std::vector b ; 61 | 62 | read(A,array_2d,b,read_dir); 63 | 
//generate_data(A,array_2d,b,4096); 64 | int rows = array_2d.size(); 65 | int cols = array_2d[0].size(); 66 | 67 | std::vector c(rows); 68 | 69 | clock_t start, finish; 70 | start = clock(); 71 | matrixVectorMultiplication(A, b, c, rows, cols); 72 | finish=clock(); 73 | 74 | // Print result 75 | std::cout << "Result: "; 76 | for (int i = 0; i < rows; i++) { 77 | //std::cout << c[i] << " "; 78 | printf("%.5f ",c[i]); 79 | } 80 | std::cout << std::endl; 81 | // Print using time 82 | std::cout <<"using time: "<<1000*double(finish - start) / CLOCKS_PER_SEC<<" ms"< 3 | #include 4 | #include 5 | #include 6 | #include "read_data.h" 7 | 8 | void matrixVectorMultiplication(std::vector& A, std::vector& b, std::vector& c, int rows, int cols) { 9 | // Device memory allocation 10 | float *d_A, *d_b, *d_c; 11 | cudaMalloc((void**)&d_A, rows * cols * sizeof(float)); 12 | cudaMalloc((void**)&d_b, cols * sizeof(float)); 13 | cudaMalloc((void**)&d_c, rows * sizeof(float)); 14 | 15 | // Copy data from host to device 16 | cudaMemcpy(d_A, A.data(), rows * cols * sizeof(float), cudaMemcpyHostToDevice); 17 | cudaMemcpy(d_b, b.data(), cols * sizeof(float), cudaMemcpyHostToDevice); 18 | 19 | // cuBLAS initialization 20 | cublasHandle_t handle; 21 | cublasCreate(&handle); 22 | 23 | // Matrix-vector multiplication using cuBLAS 24 | float alpha = 1.0f; 25 | float beta = 0.0f; 26 | cublasSgemv(handle, CUBLAS_OP_T, cols, rows, &alpha, d_A, cols, d_b, 1, &beta, d_c, 1); 27 | 28 | // Copy result from device to host 29 | cudaMemcpy(c.data(), d_c, rows * sizeof(float), cudaMemcpyDeviceToHost); 30 | 31 | // Free device memory 32 | cudaFree(d_A); 33 | cudaFree(d_b); 34 | cudaFree(d_c); 35 | 36 | // Destroy cuBLAS handle 37 | cublasDestroy(handle); 38 | } 39 | 40 | int main() { 41 | string read_dir="data/test1.in"; 42 | string save_dir="output/res1.out"; 43 | std::vector A; 44 | std::vector> array_2d; 45 | std::vector b; 46 | 47 | read(A, array_2d, b, read_dir); 48 | 49 | int rows = array_2d.size(); 
50 | int cols = array_2d[0].size(); 51 | 52 | std::vector c(rows); 53 | 54 | clock_t start, finish; 55 | start = clock(); 56 | matrixVectorMultiplication(A, b, c, rows, cols); 57 | finish = clock(); 58 | 59 | // Print result 60 | std::cout << "Result: "; 61 | for (int i = 0; i < rows; i++) { 62 | printf("%.5f ", c[i]); 63 | } 64 | std::cout << std::endl; 65 | 66 | // Print execution time 67 | std::cout << "Using time: " << 1000 * double(finish - start) / CLOCKS_PER_SEC << " ms" << std::endl; 68 | write(c,save_dir); 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /并行程序设计_lab4/code/read_data.h: -------------------------------------------------------------------------------- 1 | //头文件,定义了读取、写入二进制文件的函数 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | using std::string; 11 | 12 | void read(std::vector& array_1d,std::vector >& array_2d,std::vector& res,string path){ 13 | int row_size, col_size; 14 | std::vector matrix_test; 15 | std::vector vec; 16 | std::ifstream inFile_in(path, std::ios::binary); 17 | if (inFile_in.is_open()) { 18 | // First, read the size of the matrix and the vector. 19 | inFile_in.read(reinterpret_cast(&row_size), 20 | sizeof(row_size)); 21 | inFile_in.read(reinterpret_cast(&col_size), 22 | sizeof(col_size)); 23 | // Resize the matrix and vector based on the size read from the file. 24 | matrix_test.resize(row_size * col_size); 25 | vec.resize(col_size); 26 | // Then, read the actual data of the matrix and the vector. 
27 | inFile_in.read(reinterpret_cast(matrix_test.data()), 28 | matrix_test.size() * sizeof(float)); 29 | inFile_in.read(reinterpret_cast(vec.data()), vec.size() 30 | * sizeof(float)); 31 | inFile_in.close(); 32 | } 33 | else { 34 | std::cout << "Unable to open file"; 35 | } 36 | 37 | array_1d.resize(row_size * col_size); 38 | array_2d.resize(row_size); 39 | for(int i=0;i &result,string path,int size){ 60 | std::ifstream inFile_in(path, std::ios::binary); 61 | if (inFile_in.is_open()) { 62 | // Resize the matrix and vector based on the size read from the file. 63 | result.resize(size); 64 | // Then, read the actual data of the matrix and the vector. 65 | inFile_in.read(reinterpret_cast(result.data()), 66 | result.size() * sizeof(float)); 67 | inFile_in.close(); 68 | } 69 | } 70 | 71 | void write(std::vector &result,string save_dir){ 72 | // Write result to file 73 | std::ofstream outFile_out(save_dir, std::ios::binary); 74 | if (outFile_out.is_open()) { 75 | outFile_out.write(reinterpret_cast 76 | (result.data()), result.size() * sizeof(float)); 77 | outFile_out.close(); 78 | } 79 | else { 80 | std::cout << "Unable to open file"; 81 | } 82 | } 83 | 84 | void generate_data(std::vector& array_1d,std::vector >& array_2d,std::vector& res,int size){ 85 | srand(static_cast(1234)); 86 | array_1d.resize(size*size); 87 | array_2d.resize(size); 88 | res.resize(size); 89 | for(int i=0;i res1; 7 | std::vector res2; 8 | read_res(res1,file1,SIZE); 9 | read_res(res2,file2,SIZE); 10 | bool flag=false; 11 | for(int i=0;i1e-5){ 13 | printf("%.5f %.5f\n",res1[i],res2[i]); 14 | flag=true; 15 | } 16 | } 17 | if(flag==false) 18 | printf("The result is right.\n"); 19 | } 20 | 21 | int main(){ 22 | string file1="data/test5.out"; 23 | string file2="output/res5.out"; 24 | test(file1,file2); 25 | return 0; 26 | } -------------------------------------------------------------------------------- /并行程序设计_lab4/并行程序设计_20337025_崔璨明.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/91Mrcui/SYSU_parallel_programming/846bdc8757c9c31f6466a7510777991f059f4997/并行程序设计_lab4/并行程序设计_20337025_崔璨明.pdf --------------------------------------------------------------------------------