├── README.md ├── openacc ├── .vscode │ ├── settings.json │ └── tasks.json ├── assign-floyd │ ├── ans │ │ ├── assign-floyd.2023-05-31.tgz │ │ ├── device.sh │ │ ├── floyd.cpp │ │ ├── floyd.exe3 │ │ ├── floyd.log3 │ │ ├── floyd_multidevice copy 2.cpp │ │ ├── floyd_multidevice copy.cpp │ │ ├── floyd_multidevice.cpp │ │ ├── makefile │ │ └── run.sh │ ├── assign-floyd.2023-05-31.tgz │ ├── device.sh │ ├── floyd.cpp │ ├── floyd.exe3 │ ├── floyd.log3 │ ├── floyd_multidevice.cpp │ ├── floyd_optimize.cpp │ ├── makefile │ └── run.sh ├── exam-floyd │ ├── ans │ │ ├── device.sh │ │ ├── exam-floyd.2023-05-31.tgz │ │ ├── floyd copy 10.cpp │ │ ├── floyd copy 11.cpp │ │ ├── floyd copy 12.cpp │ │ ├── floyd copy 13.cpp │ │ ├── floyd copy 14.cpp │ │ ├── floyd copy 15.cpp │ │ ├── floyd copy 16.cpp │ │ ├── floyd copy 17.cpp │ │ ├── floyd copy 2.cpp │ │ ├── floyd copy 3.cpp │ │ ├── floyd copy 4.cpp │ │ ├── floyd copy 5.cpp │ │ ├── floyd copy 6.cpp │ │ ├── floyd copy 7.cpp │ │ ├── floyd copy 8.cpp │ │ ├── floyd copy 9.cpp │ │ ├── floyd copy.cpp │ │ ├── floyd.cpp │ │ ├── floyd.exe4 │ │ ├── floyd.log4 │ │ ├── makefile │ │ └── run.sh │ ├── device.sh │ ├── exam-floyd.2023-05-31.tgz │ ├── floyd copy.log4 │ ├── floyd.cpp │ ├── floyd.exe4 │ ├── floyd.log4 │ ├── makefile │ ├── output.i │ ├── output.o │ ├── output.s │ └── run.sh ├── image │ └── clauses.png ├── lab-floyd │ ├── ans │ │ ├── device.sh │ │ ├── floyd.cpp │ │ ├── floyd.exe0 │ │ ├── floyd.exe1 │ │ ├── floyd.exe2 │ │ ├── floyd.log0 │ │ ├── floyd.log1 │ │ ├── floyd.log2 │ │ ├── lab-floyd.2023-05-24.tgz │ │ ├── makefile │ │ └── run.sh │ ├── device.sh │ ├── floyd.exe0 │ ├── floyd.exe1 │ ├── floyd.exe2 │ ├── floyd.log0 │ ├── floyd.log1 │ ├── floyd.log2 │ ├── floyd_managed.cpp │ ├── floyd_multicore.cpp │ ├── floyd_multidevice.cpp │ ├── floyd_optimize.cpp │ ├── floyd_serial.cpp │ ├── lab-floyd.2023-05-24.tgz │ ├── makefile │ └── run.sh └── note.md └── openmp ├── .vscode ├── settings.json └── tasks.json ├── assign_prime ├── build │ ├── prime │ └── 
prime_old ├── prime ├── prime.cpp ├── prime_old.cpp ├── prime_solution ├── res.log ├── run.sh ├── temp │ ├── assign_01_prime │ │ ├── build │ │ │ ├── prime │ │ │ ├── primeSerial │ │ │ └── primexyy │ │ ├── prime.cpp │ │ └── primeSerial.cpp │ ├── build │ │ └── prime │ ├── prime │ ├── prime.cpp │ ├── prime_solution │ ├── run.sh │ ├── test.log │ └── test.sh ├── test.sh └── testmy.sh ├── assign_sort ├── assign_sort.2023-03-29.tgz ├── build │ ├── build │ │ ├── tp │ │ ├── tp2 │ │ └── tp3 │ ├── sort_radix │ ├── sort_radix.cpp │ ├── sort_sample │ ├── sort_sample copy 2.cpp │ ├── sort_sample copy 3.cpp │ ├── sort_sample copy.cpp │ ├── sort_sample.cpp │ ├── tp.cpp │ ├── tp2.cpp │ └── tp3.cpp ├── readme.txt ├── run.sh ├── sort_radix ├── sort_radix.cpp ├── sort_radix_solution ├── sort_sample ├── sort_sample.cpp └── sort_sample_solution ├── exam_knn ├── ans.txt ├── build │ ├── build │ │ ├── knn copy 2 │ │ └── knn copy 4 │ ├── knn │ ├── knn copy 2.cpp │ ├── knn copy 3.cpp │ ├── knn copy 4.cpp │ ├── knn copy 5.cpp │ ├── knn copy 6.cpp │ ├── knn copy.cpp │ ├── knn.cpp │ └── test ├── exam_knn.2023-05-10.tgz ├── knn ├── knn.cpp ├── knn_0 ├── out.txt ├── readme.txt ├── run.sh └── test.cpp ├── final_exam ├── build │ ├── circle │ ├── circle copy 2.cpp │ ├── circle copy 3.cpp │ └── circle copy.cpp ├── circle ├── circle.cpp └── run.sh ├── lab_knn ├── build │ ├── build │ │ └── knn │ ├── knn │ └── knn.cpp ├── knn ├── knn.cpp ├── knn_apx1 ├── knn_apx2 ├── knn_apx3 ├── knn_apx4 ├── lab_knn.2023-05-17.tgz ├── readme.txt └── run.sh ├── lab_par_for ├── DataSharing │ ├── Firstprivate │ │ ├── fp │ │ ├── fp.cpp │ │ └── makefile │ └── Lastprivate │ │ ├── fp │ │ ├── fp.cpp │ │ └── makefile ├── Intro05 │ ├── hello │ ├── hello.cpp │ └── makefile ├── PI │ ├── makefile │ ├── pi │ └── pi.cpp ├── PIv1 │ ├── makefile │ ├── pi │ └── pi.cpp ├── PIv2 │ ├── makefile │ ├── pi │ └── pi.cpp ├── PIv3 │ ├── makefile │ ├── pi │ └── pi.cpp ├── PIv4 │ ├── build │ │ └── pi │ ├── makefile │ ├── pi │ └── pi.cpp ├── PIv5 │ 
├── makefile │ ├── pi │ └── pi.cpp ├── hello │ ├── hello │ ├── hello.cpp │ └── makefile ├── lab_par_for.tgz ├── par_for │ ├── makefile │ ├── par_for │ └── par_for.cpp └── readme.txt ├── lab_pi_integral ├── build │ ├── build │ │ └── tp │ ├── pi_integral_0 │ ├── pi_integral_1 │ ├── pi_integral_1_1.cpp │ ├── pi_integral_2 │ └── tp.cpp ├── lab_pi_integral.tgz ├── pi_integral_0.cpp ├── pi_integral_1.cpp ├── pi_integral_2.cpp ├── readme.txt ├── run.sh └── test.cpp ├── lab_pi_rnd ├── build │ ├── pi_rnd_1 │ ├── pi_rnd_2 copy 2.cpp │ ├── pi_rnd_2 copy 3.cpp │ ├── pi_rnd_2 copy.cpp │ └── pi_rnd_2.cpp ├── lab_pi_rnd.2023-04-12.tgz ├── pi_rnd_0 ├── pi_rnd_0.cpp ├── pi_rnd_1 ├── pi_rnd_1.cpp ├── pi_rnd_2 ├── pi_rnd_2.cpp ├── readme.txt └── run.sh ├── lab_scan_frag ├── lab_scan_frag.2023-04-19.tgz ├── readme.txt ├── run.sh ├── scan_frag ├── scan_frag_0 └── scan_frag_0.cpp ├── lab_scan_link ├── build │ └── scan_link_0 ├── lab_scan_link.2023-04-26.tgz ├── readme.txt ├── run.sh ├── scan_link ├── scan_link_0 └── scan_link_0.cpp ├── lab_scan_tree ├── lab_scan_tree.2023-05-06.tgz ├── readme.txt ├── run.sh ├── scan_tree ├── scan_tree_0 └── scan_tree_0.cpp ├── lab_scan_vect ├── build │ ├── scan_vect_1 │ ├── scan_vect_2 │ └── scan_vect_2.cpp ├── lab_scan_vect.tgz ├── readme.txt ├── run.sh ├── scan_vect ├── scan_vect_0 ├── scan_vect_0.cpp ├── scan_vect_1 ├── scan_vect_1.cpp ├── scan_vect_2 └── scan_vect_2.cpp ├── midterm_exam ├── build │ ├── segment_softmax │ ├── segment_softmax copy.cpp │ ├── segment_softmax.cpp │ ├── series_of_numbers │ ├── series_of_numbers copy │ ├── series_of_numbers copy 2.cpp │ ├── series_of_numbers copy.cpp │ ├── series_of_numbers.cpp │ ├── series_of_numbers_2 │ ├── series_of_numbers_2 copy 2.cpp │ ├── series_of_numbers_2 copy.cpp │ └── series_of_numbers_2.cpp ├── segment_softmax ├── segment_softmax.cpp ├── series.sh ├── series_of_numbers ├── series_of_numbers.cpp ├── softmax.sh └── submit │ ├── segment_softmax.cpp │ └── series_of_numbers.cpp └── notes ├── API.md 
├── wordsharing.md └── 同步结构.md /README.md: -------------------------------------------------------------------------------- 1 | # ParallelComputingCourse 2 | 山东大学计算机学院 并行计算课程实验 3 | 4 | 所有代码仅供参考,并不保证正确性。 5 | 6 | 如果对您有用,希望能得到您的star。 7 | -------------------------------------------------------------------------------- /openacc/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "array": "cpp", 4 | "string": "cpp", 5 | "string_view": "cpp", 6 | "vector": "cpp" 7 | } 8 | } -------------------------------------------------------------------------------- /openacc/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": [ 3 | { 4 | "type": "cppbuild", 5 | "label": "C/C++: g++ 生成活动文件", 6 | "command": "/usr/bin/g++", 7 | "args": [ 8 | "-fdiagnostics-color=always", 9 | "-g", 10 | "${file}", 11 | "-o", 12 | "${fileDirname}/${fileBasenameNoExtension}" 13 | ], 14 | "options": { 15 | "cwd": "${fileDirname}" 16 | }, 17 | "problemMatcher": [ 18 | "$gcc" 19 | ], 20 | "group": { 21 | "kind": "build", 22 | "isDefault": true 23 | }, 24 | "detail": "调试器生成的任务。" 25 | } 26 | ], 27 | "version": "2.0.0" 28 | } -------------------------------------------------------------------------------- /openacc/assign-floyd/ans/assign-floyd.2023-05-31.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/assign-floyd/ans/assign-floyd.2023-05-31.tgz -------------------------------------------------------------------------------- /openacc/assign-floyd/ans/device.sh: -------------------------------------------------------------------------------- 1 | devreq=$1 2 | if [ "$devreq" == "" ] ; then 3 | devreq=2 4 | fi 5 | 6 | unset CUDA_VISIBLE_DEVICES 7 | numdev=`nvaccelinfo | grep -e "^Device Number:" | wc -l` 8 | if 
[ $devreq -ge $numdev ] ; then 9 | echo -e "unset CUDA_VISIBLE_DEVICES" 10 | else 11 | let devnum=$$%$numdev 12 | devlst=$devnum 13 | let devreq=$devreq-1 14 | while [ $devreq -gt 0 ] ; do 15 | let devnum=($devnum+1)%$numdev 16 | devlst=$devlst,$devnum 17 | let devreq=$devreq-1 18 | done 19 | export CUDA_VISIBLE_DEVICES=$devlst 20 | echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 21 | fi 22 | 23 | -------------------------------------------------------------------------------- /openacc/assign-floyd/ans/floyd.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | using namespace std; 11 | 12 | 13 | inline int index(const int i, const int j) { 14 | return i * SIZE + j; 15 | } 16 | 17 | // add your codes begin 18 | // add your codes end 19 | 20 | 21 | int main() { 22 | const int size2 = SIZE * SIZE; 23 | float* data = new float[size2]; 24 | for (int i = 0; i < size2; i++) data[i] = -INF; 25 | 26 | srand(SIZE); 27 | for (int i = 0; i < SIZE*20; i++) { 28 | int prev = rand() % SIZE; 29 | int next = rand() % SIZE; 30 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 31 | i--; 32 | continue; 33 | } 34 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 35 | } 36 | 37 | double t = omp_get_wtime(); 38 | // add your codes begin 39 | // add your codes end 40 | t = omp_get_wtime() - t; 41 | printf("time %f %d\n", t, SIZE); 42 | 43 | for (int i = 0; i < 20; i++) { 44 | int prev = rand() % SIZE; 45 | int next = rand() % SIZE; 46 | if (prev == next) { 47 | i--; 48 | continue; 49 | } 50 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 51 | } 52 | } 53 | 54 | -------------------------------------------------------------------------------- /openacc/assign-floyd/ans/floyd.exe3: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/assign-floyd/ans/floyd.exe3 -------------------------------------------------------------------------------- /openacc/assign-floyd/ans/floyd.log3: -------------------------------------------------------------------------------- 1 | index(int, int): 2 | 13, Generating implicit acc routine seq 3 | Generating NVIDIA GPU code 4 | main: 5 | 5, include "math.h" 6 | 15, include "math.h" 7 | 36, include "cmath" 8 | 15, include "cmath" 9 | 1935, include "specfun.h" 10 | 45, include "stl_algobase.h" 11 | 45, #omp parallel 12 | 46, Begin single region 13 | End single region 14 | Barrier 15 | 73, Barrier 16 | 84, Barrier 17 | 24, Memory set idiom, loop replaced by call to __c_mset4 18 | 56, Generating enter data create(copy[:23040000],dkj[:4800],dik[:4800]) 19 | Generating enter data copyin(data[:23040000]) 20 | 59, Generating present(d0[:],d1[:]) 21 | Generating implicit firstprivate(begin,end) 22 | Generating NVIDIA GPU code 23 | 61, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 24 | 59, Generating implicit copyout(dik[begin:end-begin]) [if not already present] 25 | 61, Generating implicit firstprivate(k) 26 | 65, Generating update self(dik[begin:step]) 27 | Generating present(d1[:],d0[:]) 28 | Generating NVIDIA GPU code 29 | 67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 30 | 65, Generating implicit copyout(dkj[:4800]) [if not already present] 31 | 67, Generating implicit firstprivate(k) 32 | 71, Generating update self(dkj[:4800]) 33 | Generating update device(dkj[:4800],dik[:4800]) 34 | Generating present(d1[:],d0[:]) 35 | Generating implicit firstprivate(begin,end) 36 | Generating NVIDIA GPU code 37 | 78, #pragma acc loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */ 38 | 79, /* blockIdx.x threadIdx.x tiled */ 39 | 71, Generating implicit copyin(dkj[:4800],dik[begin:end-begin]) [if not already present] 40 | 79, 
Generating implicit private(_T25_5541,_T22_5541) 41 | 99, Generating update self(data[begin*4800:step*4800]) 42 | Generating exit data delete(dkj[:4800],dik[:4800],data[:23040000],copy[:23040000]) 43 | float const& std::max(float const&, float const&): 44 | 5, include "math.h" 45 | 15, include "math.h" 46 | 36, include "cmath" 47 | 15, include "cmath" 48 | 1935, include "specfun.h" 49 | 45, include "stl_algobase.h" 50 | 255, Generating implicit acc routine seq 51 | Generating NVIDIA GPU code 52 | #num_dev: 1 53 | #num_thread: 1 54 | #device_id: 0 0 4800 4800 55 | time 2.797260 4800 56 | test 1683 3274 -0.706498 57 | test 632 2448 -0.447264 58 | test 842 583 -0.503465 59 | test 807 4278 -0.626579 60 | test 803 353 -0.561426 61 | test 4022 1321 -0.534807 62 | test 2934 2255 -0.506989 63 | test 3334 4036 -0.535419 64 | test 3344 3528 -0.635352 65 | test 4058 965 -0.721051 66 | test 3765 3241 -0.468802 67 | test 2756 3304 -0.441437 68 | test 4583 1289 -0.648130 69 | test 1697 2723 -0.456125 70 | test 4475 3795 -0.479226 71 | test 1303 1358 -0.472469 72 | test 2269 4688 -0.527612 73 | test 1759 1063 -0.515894 74 | test 471 518 -0.486703 75 | test 542 1274 -0.587901 76 | 2.65user 0.24system 0:02.89elapsed 99%CPU (0avgtext+0avgdata 263092maxresident)k 77 | 0inputs+0outputs (3major+73084minor)pagefaults 0swaps 78 | -------------------------------------------------------------------------------- /openacc/assign-floyd/ans/floyd_multidevice copy 2.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace std; 12 | // #define SIZE 1000 13 | inline int index(const int i, const int j) { return i * SIZE + j; } 14 | 15 | // add your codes begin 16 | 17 | // add your codes end 18 | 19 | int main() { 20 | const int size2 = SIZE * SIZE; 21 | float* data = new float[size2]; 22 | for (int i = 0; i < size2; i++) data[i] 
= -INF; 23 | 24 | srand(SIZE); 25 | for (int i = 0; i < SIZE * 20; i++) { 26 | // 第一维坐标 27 | int prev = rand() % SIZE; 28 | // 第二维坐标 29 | int next = rand() % SIZE; 30 | // 如果为对称轴,或者已经被设置边权了 31 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 32 | i--; 33 | continue; 34 | } 35 | // 赋值 36 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 37 | } 38 | double t = omp_get_wtime(); 39 | 40 | // add your codes begin 41 | #pragma omp parallel num_threads(8) 42 | { 43 | int id, nthrds; 44 | id = omp_get_thread_num(); 45 | nthrds = omp_get_num_threads(); 46 | acc_set_device_num(id, acc_device_nvidia); 47 | #pragma acc data copy(data[0 : size2]) 48 | for (int k = id; k < SIZE; k += nthrds) { 49 | #pragma acc parallel loop gang worker num_workers(4) vector_length(256) //async(id) 50 | for (int i = 0; i < SIZE; i++) { 51 | #pragma acc loop vector 52 | for (int j = 0; j < SIZE; j++) { 53 | float temp = data[index(i, k)] + data[index(k, j)]; 54 | if (data[index(i, j)] < temp) { 55 | data[index(i, j)] = temp; 56 | } 57 | } 58 | } 59 | } 60 | #pragma acc update device(imgData[starty*step:blocksize*step]) 61 | } 62 | 63 | // add your codes end 64 | t = omp_get_wtime() - t; 65 | printf("time %f %d\n", t, SIZE); 66 | 67 | for (int i = 0; i < 20; i++) { 68 | int prev = rand() % SIZE; 69 | int next = rand() % SIZE; 70 | if (prev == next) { 71 | i--; 72 | continue; 73 | } 74 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /openacc/assign-floyd/ans/floyd_multidevice copy.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace std; 12 | // #define SIZE 1000 13 | inline int index(const int i, const int j) { return i * SIZE + j; } 14 | 15 | // add your codes begin 16 | 17 | // add your 
codes end 18 | 19 | int main() { 20 | const int size2 = SIZE * SIZE; 21 | float* data = new float[size2]; 22 | for (int i = 0; i < size2; i++) data[i] = -INF; 23 | 24 | srand(SIZE); 25 | for (int i = 0; i < SIZE * 20; i++) { 26 | // 第一维坐标 27 | int prev = rand() % SIZE; 28 | // 第二维坐标 29 | int next = rand() % SIZE; 30 | // 如果为对称轴,或者已经被设置边权了 31 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 32 | i--; 33 | continue; 34 | } 35 | // 赋值 36 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 37 | } 38 | double t = omp_get_wtime(); 39 | 40 | // add your codes begin 41 | #pragma omp parallel num_threads(8) 42 | { 43 | int id, nthrds; 44 | id = omp_get_thread_num(); 45 | nthrds = omp_get_num_threads(); 46 | acc_set_device_num(id, acc_device_nvidia); 47 | #pragma acc data copy(data[0 : size2]) 48 | for (int k = id; k < SIZE; k += nthrds) { 49 | #pragma acc parallel loop gang worker num_workers(4) vector_length(256) //async(id) 50 | for (int i = 0; i < SIZE; i++) { 51 | #pragma acc loop vector 52 | for (int j = 0; j < SIZE; j++) { 53 | float temp = data[index(i, k)] + data[index(k, j)]; 54 | if (data[index(i, j)] < temp) { 55 | data[index(i, j)] = temp; 56 | } 57 | } 58 | } 59 | } 60 | #pragma acc update device(imgData[starty*step:blocksize*step]) 61 | } 62 | 63 | // add your codes end 64 | t = omp_get_wtime() - t; 65 | printf("time %f %d\n", t, SIZE); 66 | 67 | for (int i = 0; i < 20; i++) { 68 | int prev = rand() % SIZE; 69 | int next = rand() % SIZE; 70 | if (prev == next) { 71 | i--; 72 | continue; 73 | } 74 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /openacc/assign-floyd/ans/floyd_multidevice.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace std; 12 | // 
#define SIZE 1000 13 | inline int index(const int i, const int j) { return i * SIZE + j; } 14 | 15 | // add your codes begin 16 | 17 | // add your codes end 18 | 19 | int main() { 20 | const int size2 = SIZE * SIZE; 21 | float* data = new float[size2]; 22 | for (int i = 0; i < size2; i++) data[i] = -INF; 23 | 24 | srand(SIZE); 25 | for (int i = 0; i < SIZE * 20; i++) { 26 | // 第一维坐标 27 | int prev = rand() % SIZE; 28 | // 第二维坐标 29 | int next = rand() % SIZE; 30 | // 如果为对称轴,或者已经被设置边权了 31 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 32 | i--; 33 | continue; 34 | } 35 | // 赋值 36 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 37 | } 38 | double t = omp_get_wtime(); 39 | 40 | // add your codes begin 41 | #pragma omp parallel num_threads(8) 42 | { 43 | int id, nthrds; 44 | id = omp_get_thread_num(); 45 | nthrds = omp_get_num_threads(); 46 | acc_set_device_num(id, acc_device_nvidia); 47 | #pragma acc data copy(data[0 : size2]) 48 | for (int k = id; k < SIZE; k += nthrds) { 49 | #pragma acc parallel loop gang worker num_workers(4) vector_length(128) 50 | for (int i = 0; i < SIZE; i++) { 51 | float temp; 52 | #pragma acc loop vector 53 | for (int j = 0; j < SIZE; j++) { 54 | temp = data[index(i, k)] + data[index(k, j)]; 55 | if (data[index(i, j)] < temp) { 56 | data[index(i, j)] = temp; 57 | } 58 | } 59 | } 60 | } 61 | } 62 | 63 | // add your codes end 64 | t = omp_get_wtime() - t; 65 | printf("time %f %d\n", t, SIZE); 66 | 67 | for (int i = 0; i < 20; i++) { 68 | int prev = rand() % SIZE; 69 | int next = rand() % SIZE; 70 | if (prev == next) { 71 | i--; 72 | continue; 73 | } 74 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /openacc/assign-floyd/ans/makefile: -------------------------------------------------------------------------------- 1 | device: 2 | nvaccelinfo 3 | 4 | serial: 5 | nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all 
-Mneginfo=all floyd.cpp >floyd.log0 2>&1 6 | #nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1 7 | timeout 1m time ./floyd.exe0 >>floyd.log0 2>&1 8 | 9 | multicore: 10 | nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd.cpp >floyd.log1 2>&1 11 | #nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1 12 | timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1 13 | 14 | managed: 15 | nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd.cpp >floyd.log2 2>&1 16 | #nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 2>&1 17 | timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1 18 | 19 | optimize: 20 | nvc++ -o floyd.exe3 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log3 2>&1 21 | #nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1 22 | timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1 23 | 24 | multidevice: 25 | nvc++ -o floyd.exe4 -DSIZE=6000 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log4 2>&1 26 | #nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1 27 | timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1 28 | 29 | all: clean serial multicore managed optimize multidevice 30 | 31 | clean: 32 | rm -f floyd.exe* floyd.prof* floyd.log* 33 | 34 | -------------------------------------------------------------------------------- /openacc/assign-floyd/ans/run.sh: -------------------------------------------------------------------------------- 1 | export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin:$PATH 2 | 3 | source device.sh 1 4 | #source device.sh 8 5 | make clean 6 | 7 | #timeout 1m time make serial 8 | #timeout 1m time make multicore 9 | #timeout 1m time make managed 10 | timeout 1m time make optimize 11 | #timeout 1m time make multidevice 12 | 13 | 
-------------------------------------------------------------------------------- /openacc/assign-floyd/assign-floyd.2023-05-31.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/assign-floyd/assign-floyd.2023-05-31.tgz -------------------------------------------------------------------------------- /openacc/assign-floyd/device.sh: -------------------------------------------------------------------------------- 1 | devreq=$1 2 | if [ "$devreq" == "" ] ; then 3 | devreq=2 4 | fi 5 | 6 | unset CUDA_VISIBLE_DEVICES 7 | numdev=`nvaccelinfo | grep -e "^Device Number:" | wc -l` 8 | if [ $devreq -ge $numdev ] ; then 9 | echo -e "unset CUDA_VISIBLE_DEVICES" 10 | else 11 | let devnum=$$%$numdev 12 | devlst=$devnum 13 | let devreq=$devreq-1 14 | while [ $devreq -gt 0 ] ; do 15 | let devnum=($devnum+1)%$numdev 16 | devlst=$devlst,$devnum 17 | let devreq=$devreq-1 18 | done 19 | export CUDA_VISIBLE_DEVICES=$devlst 20 | echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 21 | fi 22 | 23 | -------------------------------------------------------------------------------- /openacc/assign-floyd/floyd.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace std; 12 | // #define SIZE 1000 13 | inline int index(const int i, const int j) { return i * SIZE + j; } 14 | 15 | // add your codes begin 16 | 17 | // add your codes end 18 | 19 | int main() { 20 | const int size2 = SIZE * SIZE; 21 | float* data = new float[size2]; 22 | for (int i = 0; i < size2; i++) data[i] = -INF; 23 | 24 | srand(SIZE); 25 | for (int i = 0; i < SIZE * 20; i++) { 26 | // 第一维坐标 27 | int prev = rand() % SIZE; 28 | // 第二维坐标 29 | int next = rand() % SIZE; 30 | // 如果为对称轴,或者已经被设置边权了 31 | if ((prev == 
next) || (data[index(prev, next)] > -INF)) { 32 | i--; 33 | continue; 34 | } 35 | // 赋值 36 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 37 | } 38 | double t = omp_get_wtime(); 39 | 40 | // add your codes begin 41 | #pragma acc data copy(data[0 : size2]) 42 | for (int k = 0; k < SIZE; k++) { 43 | #pragma acc parallel loop gang worker num_workers(4) vector_length(128) 44 | for (int i = 0; i < SIZE; i++) { 45 | #pragma acc loop vector 46 | for (int j = 0; j < SIZE; j++) { 47 | // if (data[index(i,j)] < data[index(i,k)] + data[index(k,j)]) { 48 | // data[index(i,j)] = data[index(i,k)] + data[index(k,j)]; 49 | // } 50 | float tp = data[i * SIZE + k] + data[k * SIZE + j]; 51 | if (data[i * SIZE + j] >= tp) continue; 52 | data[i * SIZE + j] = tp; 53 | } 54 | } 55 | } 56 | 57 | // add your codes end 58 | t = omp_get_wtime() - t; 59 | printf("time %f %d\n", t, SIZE); 60 | 61 | for (int i = 0; i < 20; i++) { 62 | int prev = rand() % SIZE; 63 | int next = rand() % SIZE; 64 | if (prev == next) { 65 | i--; 66 | continue; 67 | } 68 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /openacc/assign-floyd/floyd.exe3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/assign-floyd/floyd.exe3 -------------------------------------------------------------------------------- /openacc/assign-floyd/floyd.log3: -------------------------------------------------------------------------------- 1 | main: 2 | 38, Generating copy(data[:23040000]) [if not already present] 3 | 42, Generating NVIDIA GPU code 4 | 44, #pragma acc loop gang, worker(4) /* blockIdx.x threadIdx.y */ 5 | 46, #pragma acc loop vector(128) /* threadIdx.x */ 6 | 46, Loop is parallelizable 7 | Generating implicit firstprivate(k) 8 | time 1.254653 4800 9 | 
test 1683 3274 -0.706498 10 | test 632 2448 -0.447264 11 | test 842 583 -0.503465 12 | test 807 4278 -0.626579 13 | test 803 353 -0.561426 14 | test 4022 1321 -0.534807 15 | test 2934 2255 -0.506989 16 | test 3334 4036 -0.535419 17 | test 3344 3528 -0.635352 18 | test 4058 965 -0.721051 19 | test 3765 3241 -0.468802 20 | test 2756 3304 -0.441437 21 | test 4583 1289 -0.648130 22 | test 1697 2723 -0.456125 23 | test 4475 3795 -0.479226 24 | test 1303 1358 -0.472469 25 | test 2269 4688 -0.527612 26 | test 1759 1063 -0.515894 27 | test 471 518 -0.486703 28 | test 542 1274 -0.587901 29 | 1.13user 0.26system 0:01.40elapsed 99%CPU (0avgtext+0avgdata 263100maxresident)k 30 | 0inputs+8outputs (0major+50299minor)pagefaults 0swaps 31 | -------------------------------------------------------------------------------- /openacc/assign-floyd/floyd_multidevice.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | using namespace std; 13 | // #define SIZE 1000 14 | #define DEVICE_NUM 2 15 | inline int index(const int i, const int j) { return i * SIZE + j; } 16 | 17 | // add your codes begin 18 | 19 | // add your codes end 20 | 21 | int main() { 22 | const int size2 = SIZE * SIZE; 23 | float* data = new float[size2]; 24 | for (int i = 0; i < size2; i++) data[i] = -INF; 25 | 26 | srand(SIZE); 27 | for (int i = 0; i < SIZE * 20; i++) { 28 | // 第一维坐标 29 | int prev = rand() % SIZE; 30 | // 第二维坐标 31 | int next = rand() % SIZE; 32 | // 如果为对称轴,或者已经被设置边权了 33 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 34 | i--; 35 | continue; 36 | } 37 | // 赋值 38 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 39 | } 40 | double t = omp_get_wtime(); 41 | 42 | // add your codes begin 43 | int blocksize = SIZE / DEVICE_NUM; 44 | for (int k = 0; k < SIZE; k++) { 45 | #pragma omp parallel num_threads(DEVICE_NUM) 46 | 
{ 47 | int id, nthrds; 48 | id = omp_get_thread_num(); 49 | nthrds = omp_get_num_threads(); 50 | acc_set_device_num(id, acc_device_nvidia); 51 | #pragma acc data copyin(data[0 : size2]) 52 | { 53 | #pragma omp for schedule(static, 1) 54 | for (int device_id = 0; device_id < DEVICE_NUM; device_id++) { 55 | int st = id * blocksize; 56 | int ed = st + blocksize; 57 | #pragma acc parallel loop gang worker num_workers(4) vector_length(128) 58 | for (int i = st; i < ed; i++) { 59 | #pragma acc loop vector 60 | for (int j = 0; j < SIZE; j++) { 61 | if (data[index(i, j)] < 62 | data[index(i, k)] + data[index(k, j)]) { 63 | data[index(i, j)] = 64 | data[index(i, k)] + data[index(k, j)]; 65 | } 66 | } 67 | } 68 | #pragma acc update self(data[st * SIZE : SIZE * blocksize]) 69 | } 70 | } 71 | } 72 | } 73 | 74 | // add your codes end 75 | t = omp_get_wtime() - t; 76 | printf("time %f %d\n", t, SIZE); 77 | 78 | for (int i = 0; i < 20; i++) { 79 | int prev = rand() % SIZE; 80 | int next = rand() % SIZE; 81 | if (prev == next) { 82 | i--; 83 | continue; 84 | } 85 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 86 | } 87 | } -------------------------------------------------------------------------------- /openacc/assign-floyd/floyd_optimize.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace std; 12 | // #define SIZE 1000 13 | inline int index(const int i, const int j) { return i * SIZE + j; } 14 | 15 | // add your codes begin 16 | 17 | // add your codes end 18 | 19 | int main() { 20 | const int size2 = SIZE * SIZE; 21 | float* data = new float[size2]; 22 | for (int i = 0; i < size2; i++) data[i] = -INF; 23 | 24 | srand(SIZE); 25 | for (int i = 0; i < SIZE * 20; i++) { 26 | // 第一维坐标 27 | int prev = rand() % SIZE; 28 | // 第二维坐标 29 | int next = rand() % SIZE; 30 | // 如果为对称轴,或者已经被设置边权了 31 | if 
((prev == next) || (data[index(prev, next)] > -INF)) { 32 | i--; 33 | continue; 34 | } 35 | // 赋值 36 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 37 | } 38 | double t = omp_get_wtime(); 39 | 40 | // add your codes begin 41 | #pragma acc data copy(data[0:size2]) 42 | for (int k = 0; k < SIZE; k++) { 43 | #pragma acc parallel loop gang worker num_workers(4) vector_length(128) 44 | for (int i = 0; i < SIZE; i++) { 45 | #pragma acc loop vector 46 | for (int j = 0; j < SIZE; j++) { 47 | if (data[index(i,j)] < data[index(i,k)] + data[index(k,j)]) { 48 | data[index(i,j)] = data[index(i,k)] + data[index(k,j)]; 49 | } 50 | } 51 | } 52 | } 53 | 54 | // add your codes end 55 | t = omp_get_wtime() - t; 56 | printf("time %f %d\n", t, SIZE); 57 | 58 | for (int i = 0; i < 20; i++) { 59 | int prev = rand() % SIZE; 60 | int next = rand() % SIZE; 61 | if (prev == next) { 62 | i--; 63 | continue; 64 | } 65 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /openacc/assign-floyd/makefile: -------------------------------------------------------------------------------- 1 | device: 2 | nvaccelinfo 3 | 4 | serial: 5 | nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all -Mneginfo=all floyd.cpp >floyd.log0 2>&1 6 | #nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1 7 | timeout 1m time ./floyd.exe0 >>floyd.log0 2>&1 8 | 9 | multicore: 10 | nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd.cpp >floyd.log1 2>&1 11 | #nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1 12 | timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1 13 | 14 | managed: 15 | nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd.cpp >floyd.log2 2>&1 16 | #nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 
2>&1 17 | timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1 18 | 19 | optimize: 20 | nvc++ -o floyd.exe3 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log3 2>&1 21 | #nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1 22 | timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1 23 | 24 | multidevice: 25 | nvc++ -o floyd.exe4 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd_multidevice.cpp >floyd.log4 2>&1 26 | #nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1 27 | timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1 28 | 29 | all: clean serial multicore managed optimize multidevice 30 | 31 | clean: 32 | rm -f floyd.exe* floyd.prof* floyd.log* 33 | 34 | -------------------------------------------------------------------------------- /openacc/assign-floyd/run.sh: -------------------------------------------------------------------------------- 1 | export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin:$PATH 2 | 3 | source device.sh 1 4 | #source device.sh 8 5 | make clean 6 | 7 | #timeout 1m time make serial 8 | #timeout 1m time make multicore 9 | #timeout 1m time make managed 10 | timeout 1m time make optimize 11 | # timeout 10s time make multidevice 12 | 13 | -------------------------------------------------------------------------------- /openacc/exam-floyd/ans/device.sh: -------------------------------------------------------------------------------- 1 | devreq=$1 2 | if [ "$devreq" == "" ] ; then 3 | devreq=2 4 | fi 5 | 6 | unset CUDA_VISIBLE_DEVICES 7 | numdev=`nvaccelinfo | grep -e "^Device Number:" | wc -l` 8 | if [ $devreq -ge $numdev ] ; then 9 | echo -e "unset CUDA_VISIBLE_DEVICES" 10 | else 11 | let devnum=$$%$numdev 12 | devlst=$devnum 13 | let devreq=$devreq-1 14 | while [ $devreq -gt 0 ] ; do 15 | let devnum=($devnum+1)%$numdev 16 | devlst=$devlst,$devnum 17 | let devreq=$devreq-1 18 | done 19 | export CUDA_VISIBLE_DEVICES=$devlst 20 
| echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 21 | fi 22 | 23 | -------------------------------------------------------------------------------- /openacc/exam-floyd/ans/exam-floyd.2023-05-31.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/exam-floyd/ans/exam-floyd.2023-05-31.tgz -------------------------------------------------------------------------------- /openacc/exam-floyd/ans/floyd.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | using namespace std; 11 | 12 | 13 | inline int index(const int i, const int j) { 14 | return i * SIZE + j; 15 | } 16 | 17 | // add your codes begin 18 | // add your codes end 19 | 20 | 21 | int main() { 22 | const int size2 = SIZE * SIZE; 23 | float* data = new float[size2]; 24 | for (int i = 0; i < size2; i++) data[i] = -INF; 25 | 26 | srand(SIZE); 27 | for (int i = 0; i < SIZE*20; i++) { 28 | int prev = rand() % SIZE; 29 | int next = rand() % SIZE; 30 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 31 | i--; 32 | continue; 33 | } 34 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 35 | } 36 | 37 | double t = omp_get_wtime(); 38 | // add your codes begin 39 | // add your codes end 40 | t = omp_get_wtime() - t; 41 | printf("time %f %d\n", t, SIZE); 42 | 43 | for (int i = 0; i < 20; i++) { 44 | int prev = rand() % SIZE; 45 | int next = rand() % SIZE; 46 | if (prev == next) { 47 | i--; 48 | continue; 49 | } 50 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 51 | } 52 | } 53 | 54 | -------------------------------------------------------------------------------- /openacc/exam-floyd/ans/floyd.exe4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/exam-floyd/ans/floyd.exe4 -------------------------------------------------------------------------------- /openacc/exam-floyd/ans/floyd.log4: -------------------------------------------------------------------------------- 1 | index(int, int): 2 | 13, Generating implicit acc routine seq 3 | Generating NVIDIA GPU code 4 | main: 5 | 5, include "math.h" 6 | 15, include "math.h" 7 | 36, include "cmath" 8 | 15, include "cmath" 9 | 1935, include "specfun.h" 10 | 45, include "stl_algobase.h" 11 | 45, #omp parallel 12 | 46, Begin single region 13 | End single region 14 | Barrier 15 | 73, Barrier 16 | 84, Barrier 17 | 24, Memory set idiom, loop replaced by call to __c_mset4 18 | 56, Generating enter data create(copy[:36000000],dkj[:6000],dik[:6000]) 19 | Generating enter data copyin(data[:36000000]) 20 | 59, Generating present(d0[:],d1[:]) 21 | Generating implicit firstprivate(begin,end) 22 | Generating NVIDIA GPU code 23 | 61, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 24 | 59, Generating implicit copyout(dik[begin:end-begin]) [if not already present] 25 | 61, Generating implicit (k) 26 | 65, Generating update self(dik[begin:step]) 27 | Generating present(d1[:],d0[:]) 28 | Generating NVIDIA GPU code 29 | 67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 30 | 65, Generating implicit copyout(dkj[:6000]) [if not already present] 31 | 67, Generating implicit firstprivate(k) 32 | 71, Generating update self(dkj[:6000]) 33 | Generating update device(dkj[:6000],dik[:6000]) 34 | Generating present(d1[:],d0[:]) 35 | Generating implicit firstprivate(begin,end) 36 | Generating NVIDIA GPU code 37 | 78, #pragma acc loop gang, vector /* blockIdx.x threadIdx.x */ 38 | 79, /* blockIdx.x threadIdx.x tiled */ 39 | 71, Generating implicit copyin(dkj[:6000],dik[begin:end-begin]) [if not already present] 40 | 79, Generating implicit 
private(_T25_5541,_T22_5541) 41 | 99, Generating update self(data[begin*6000:step*6000]) 42 | Generating exit data delete(dkj[:6000],dik[:6000],data[:36000000],copy[:36000000]) 43 | float const& std::max(float const&, float const&): 44 | 5, include "math.h" 45 | 15, include "math.h" 46 | 36, include "cmath" 47 | 15, include "cmath" 48 | 1935, include "specfun.h" 49 | 45, include "stl_algobase.h" 50 | 255, Generating implicit acc routine seq 51 | Generating NVIDIA GPU code 52 | #num_dev: 8 53 | #num_thread: 8 54 | #device_id: 7 5250 6000 750 55 | #device_id: 4 3000 3750 750 56 | #device_id: 1 750 1500 750 57 | #device_id: 0 0 750 750 58 | #device_id: 5 3750 4500 750 59 | #device_id: 6 4500 5250 750 60 | #device_id: 3 2250 3000 750 61 | #device_id: 2 1500 2250 750 62 | time 3.402403 6000 63 | test 2161 4093 -0.683869 64 | test 4560 849 -0.384304 65 | test 4615 5729 -0.371038 66 | test 5055 5059 -0.487377 67 | test 1882 5483 -0.559907 68 | test 1312 3253 -0.648813 69 | test 4565 5567 -0.638792 70 | test 4094 4590 -0.523406 71 | test 4765 4454 -0.760118 72 | test 4230 4312 -0.455890 73 | test 4446 5673 -0.539349 74 | test 5203 3022 -0.435516 75 | test 3117 548 -0.571684 76 | test 1839 1493 -0.294103 77 | test 71 2247 -0.472646 78 | test 2630 2585 -0.511463 79 | test 692 1190 -0.315346 80 | test 3434 5308 -0.457022 81 | test 1271 2841 -0.614242 82 | test 4367 3153 -0.563043 83 | 13.72user 3.36system 0:04.19elapsed 407%CPU (0avgtext+0avgdata 1543936maxresident)k 84 | 0inputs+8outputs (1major+256099minor)pagefaults 0swaps 85 | -------------------------------------------------------------------------------- /openacc/exam-floyd/ans/makefile: -------------------------------------------------------------------------------- 1 | device: 2 | nvaccelinfo 3 | 4 | serial: 5 | nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all -Mneginfo=all floyd.cpp >floyd.log0 2>&1 6 | #nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1 7 | timeout 1m time 
./floyd.exe0 >>floyd.log0 2>&1 8 | 9 | multicore: 10 | nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd.cpp >floyd.log1 2>&1 11 | #nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1 12 | timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1 13 | 14 | managed: 15 | nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd.cpp >floyd.log2 2>&1 16 | #nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 2>&1 17 | timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1 18 | 19 | optimize: 20 | nvc++ -o floyd.exe3 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log3 2>&1 21 | #nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1 22 | timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1 23 | 24 | multidevice: 25 | nvc++ -o floyd.exe4 -DSIZE=6000 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log4 2>&1 26 | #nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1 27 | timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1 28 | 29 | all: clean serial multicore managed optimize multidevice 30 | 31 | clean: 32 | rm -f floyd.exe* floyd.prof* floyd.log* 33 | 34 | -------------------------------------------------------------------------------- /openacc/exam-floyd/ans/run.sh: -------------------------------------------------------------------------------- 1 | export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin:$PATH 2 | 3 | make clean 4 | 5 | #source device.sh 1 6 | #timeout 1m time make serial 7 | #timeout 1m time make multicore 8 | #timeout 1m time make managed 9 | #timeout 1m time make optimize 10 | 11 | source device.sh 8 12 | timeout 1m time make multidevice 13 | 14 | -------------------------------------------------------------------------------- /openacc/exam-floyd/device.sh: 
-------------------------------------------------------------------------------- 1 | devreq=$1 # $1 = number of GPUs to expose via CUDA_VISIBLE_DEVICES 2 | if [ "$devreq" == "" ] ; then 3 | devreq=2 # default request when no argument is given 4 | fi 5 | # clear any previous mask before counting devices (presumably so nvaccelinfo reports all of them) 6 | unset CUDA_VISIBLE_DEVICES 7 | numdev=`nvaccelinfo | grep -e "^Device Number:" | wc -l` # count "Device Number:" lines 8 | if [ $devreq -ge $numdev ] ; then 9 | echo -e "unset CUDA_VISIBLE_DEVICES" # request covers every device: leave the mask unset 10 | else 11 | let devnum=$$%$numdev # first device index = shell PID modulo device count 12 | devlst=$devnum 13 | let devreq=$devreq-1 14 | while [ $devreq -gt 0 ] ; do 15 | let devnum=($devnum+1)%$numdev # next index, wrapping around 16 | devlst=$devlst,$devnum 17 | let devreq=$devreq-1 18 | done 19 | export CUDA_VISIBLE_DEVICES=$devlst # export the list built above instead of a hard-coded 5,6,7 20 | echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 21 | fi 22 | 23 | -------------------------------------------------------------------------------- /openacc/exam-floyd/exam-floyd.2023-05-31.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/exam-floyd/exam-floyd.2023-05-31.tgz -------------------------------------------------------------------------------- /openacc/exam-floyd/floyd copy.log4: -------------------------------------------------------------------------------- 1 | main: 2 | 23, Memory set idiom, loop replaced by call to __c_mset4 3 | 27, Loop not vectorized/parallelized: contains call 4 | 30, index(int, int) inlined, size=2 (inline) file floyd.cpp (13) 5 | 34, index(int, int) inlined, size=2 (inline) file floyd.cpp (13) 6 | 45, #omp parallel 7 | Loop not vectorized: data dependency 8 | Loop unrolled 2 times 9 | 50, #omp parallel 10 | 85, Barrier 11 | 57, Generating copyin(data[begin*6000:blocksize*6000]) [if not already present] 12 | Generating create(dkj[:6000],dik[:6000]) [if not already present] 13 | 58, Generating update device(dkj[:6000],dik[:6000]) 14 | Generating implicit firstprivate(begin,end) 15 | Generating NVIDIA GPU code 16 | 63, #pragma acc loop gang /* blockIdx.x */ 17 | 67, #pragma acc loop vector(1024) /* 
threadIdx.x */ 18 | 58, Loop not vectorized/parallelized: contains call 19 | 67, Loop is parallelizable 20 | 73, Generating implicit firstprivate(begin,end) 21 | Generating NVIDIA GPU code 22 | 75, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 23 | 75, Generating implicit firstprivate(k) 24 | 79, Generating update self(dik[begin:blocksize]) 25 | Generating NVIDIA GPU code 26 | 81, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 27 | 81, Generating implicit firstprivate(k) 28 | 84, Generating update self(dkj[:6000]) 29 | 88, Generating update self(data[begin*6000:blocksize*6000]) 30 | 95, Loop not vectorized/parallelized: contains call 31 | 102, index(int, int) inlined, size=2 (inline) file floyd.cpp (13) 32 | time 1.593988 6000 33 | test 2161 4093 -0.683869 34 | test 4560 849 -0.384304 35 | test 4615 5729 -0.371038 36 | test 5055 5059 -0.487377 37 | test 1882 5483 -0.559907 38 | test 1312 3253 -0.648813 39 | test 4565 5567 -0.638792 40 | test 4094 4590 -0.523406 41 | test 4765 4454 -0.760118 42 | test 4230 4312 -0.455890 43 | test 4446 5673 -0.539349 44 | test 5203 3022 -0.435516 45 | test 3117 548 -0.571684 46 | test 1839 1493 -0.294103 47 | test 71 2247 -0.472646 48 | test 2630 2585 -0.511463 49 | test 692 1190 -0.315346 50 | test 3434 5308 -0.457022 51 | test 1271 2841 -0.614242 52 | test 4367 3153 -0.563043 53 | 3.79user 0.90system 0:01.85elapsed 253%CPU (0avgtext+0avgdata 627496maxresident)k 54 | 0inputs+0outputs (0major+150079minor)pagefaults 0swaps 55 | -------------------------------------------------------------------------------- /openacc/exam-floyd/floyd.exe4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/exam-floyd/floyd.exe4 -------------------------------------------------------------------------------- /openacc/exam-floyd/floyd.log4: 
-------------------------------------------------------------------------------- 1 | main: 2 | 23, Memory set idiom, loop replaced by call to __c_mset4 3 | 45, #omp parallel 4 | 50, #omp parallel 5 | 85, Barrier 6 | 57, Generating copyin(data[begin*6000:blocksize*6000]) [if not already present] 7 | Generating create(dkj[:6000],dik[:6000]) [if not already present] 8 | 58, Generating update device(dik[:6000],dkj[:6000]) 9 | Generating implicit firstprivate(end,begin) 10 | Generating NVIDIA GPU code 11 | 63, #pragma acc loop gang /* blockIdx.x */ 12 | 67, #pragma acc loop vector(1024) /* threadIdx.x */ 13 | 67, Loop is parallelizable 14 | 73, Generating implicit firstprivate(begin,end) 15 | Generating NVIDIA GPU code 16 | 75, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 17 | 75, Generating implicit firstprivate(k) 18 | 79, Generating update self(dik[begin:blocksize]) 19 | Generating NVIDIA GPU code 20 | 81, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 21 | 81, Generating implicit firstprivate(k) 22 | 84, Generating update self(dkj[:6000]) 23 | 88, Generating update self(data[begin*6000:blocksize*6000]) 24 | time 1.702572 6000 25 | test 2161 4093 -0.683869 26 | test 4560 849 -0.384304 27 | test 4615 5729 -0.371038 28 | test 5055 5059 -0.487377 29 | test 1882 5483 -0.559907 30 | test 1312 3253 -0.648813 31 | test 4565 5567 -0.638792 32 | test 4094 4590 -0.523406 33 | test 4765 4454 -0.760118 34 | test 4230 4312 -0.455890 35 | test 4446 5673 -0.539349 36 | test 5203 3022 -0.435516 37 | test 3117 548 -0.571684 38 | test 1839 1493 -0.294103 39 | test 71 2247 -0.472646 40 | test 2630 2585 -0.511463 41 | test 692 1190 -0.315346 42 | test 3434 5308 -0.457022 43 | test 1271 2841 -0.614242 44 | test 4367 3153 -0.563043 45 | 3.86user 1.03system 0:01.98elapsed 247%CPU (0avgtext+0avgdata 627488maxresident)k 46 | 0inputs+8outputs (1major+150118minor)pagefaults 0swaps 47 | 
-------------------------------------------------------------------------------- /openacc/exam-floyd/makefile: -------------------------------------------------------------------------------- 1 | device: 2 | nvaccelinfo 3 | 4 | serial: 5 | nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all -Mneginfo=all floyd.cpp >floyd.log0 2>&1 6 | #nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1 7 | timeout 1m time ./floyd.exe0 >>floyd.log0 2>&1 8 | 9 | multicore: 10 | nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd.cpp >floyd.log1 2>&1 11 | #nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1 12 | timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1 13 | 14 | managed: 15 | nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd.cpp >floyd.log2 2>&1 16 | #nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 2>&1 17 | timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1 18 | 19 | optimize: 20 | nvc++ -o floyd.exe3 -O1 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log3 2>&1 21 | #nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1 22 | timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1 23 | 24 | multidevice: 25 | nvc++ -o floyd.exe4 -DSIZE=6000 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log4 2>&1 26 | #nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1 27 | timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1 28 | 29 | test: 30 | nvc++ -c floyd.cpp -o output.o -DSIZE=6000 31 | 32 | all: clean serial multicore managed optimize multidevice 33 | 34 | clean: 35 | rm -f floyd.exe* floyd.prof* floyd.log* 36 | 37 | -------------------------------------------------------------------------------- /openacc/exam-floyd/output.o: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/exam-floyd/output.o -------------------------------------------------------------------------------- /openacc/exam-floyd/run.sh: -------------------------------------------------------------------------------- 1 | export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin:$PATH 2 | 3 | make clean 4 | 5 | #source device.sh 1 6 | #timeout 1m time make serial 7 | #timeout 1m time make multicore 8 | #timeout 1m time make managed 9 | #timeout 1m time make optimize 10 | 11 | source device.sh 3 12 | timeout 20s time make multidevice 13 | 14 | -------------------------------------------------------------------------------- /openacc/image/clauses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/image/clauses.png -------------------------------------------------------------------------------- /openacc/lab-floyd/ans/device.sh: -------------------------------------------------------------------------------- 1 | devreq=$1 2 | if [ "$devreq" == "" ] ; then 3 | devreq=2 4 | fi 5 | 6 | unset CUDA_VISIBLE_DEVICES 7 | numdev=`nvaccelinfo | grep -e "^Device Number:" | wc -l` 8 | if [ $devreq -ge $numdev ] ; then 9 | echo -e "unset CUDA_VISIBLE_DEVICES" 10 | else 11 | let devnum=$$%$numdev 12 | devlst=$devnum 13 | let devreq=$devreq-1 14 | while [ $devreq -gt 0 ] ; do 15 | let devnum=($devnum+1)%$numdev 16 | devlst=$devlst,$devnum 17 | let devreq=$devreq-1 18 | done 19 | export CUDA_VISIBLE_DEVICES=$devlst 20 | echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 21 | fi 22 | 23 | -------------------------------------------------------------------------------- /openacc/lab-floyd/ans/floyd.cpp: 
-------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | using namespace std; 11 | 12 | 13 | inline int index(const int i, const int j) { 14 | return i * SIZE + j; 15 | } 16 | 17 | // add your codes begin 18 | // add your codes end 19 | 20 | 21 | int main() { 22 | const int size2 = SIZE * SIZE; 23 | float* data = new float[size2]; 24 | for (int i = 0; i < size2; i++) data[i] = -INF; 25 | 26 | srand(SIZE); 27 | for (int i = 0; i < SIZE*20; i++) { 28 | int prev = rand() % SIZE; 29 | int next = rand() % SIZE; 30 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 31 | i--; 32 | continue; 33 | } 34 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 35 | } 36 | 37 | double t = omp_get_wtime(); 38 | // add your codes begin 39 | // add your codes end 40 | t = omp_get_wtime() - t; 41 | printf("time %f %d\n", t, SIZE); 42 | 43 | for (int i = 0; i < 20; i++) { 44 | int prev = rand() % SIZE; 45 | int next = rand() % SIZE; 46 | if (prev == next) { 47 | i--; 48 | continue; 49 | } 50 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 51 | } 52 | } 53 | 54 | -------------------------------------------------------------------------------- /openacc/lab-floyd/ans/floyd.exe0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/ans/floyd.exe0 -------------------------------------------------------------------------------- /openacc/lab-floyd/ans/floyd.exe1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/ans/floyd.exe1 -------------------------------------------------------------------------------- 
/openacc/lab-floyd/ans/floyd.exe2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/ans/floyd.exe2 -------------------------------------------------------------------------------- /openacc/lab-floyd/ans/floyd.log0: -------------------------------------------------------------------------------- 1 | #num_dev: 0 2 | #num_thread: 1 3 | #device_id: 0 0 1200 1200 4 | time 9.751188 1200 5 | test 65 506 -0.544105 6 | test 272 605 -0.519495 7 | test 391 617 -0.342672 8 | test 384 274 -0.396835 9 | test 1136 489 -0.337305 10 | test 657 761 -0.485585 11 | test 893 278 -0.457243 12 | test 163 278 -0.431883 13 | test 366 221 -0.400291 14 | test 794 598 -0.520162 15 | test 712 749 -0.404731 16 | test 390 1090 -0.518321 17 | test 547 715 -0.355524 18 | test 366 102 -0.467554 19 | test 627 1029 -0.434617 20 | test 808 1044 -0.606123 21 | test 687 232 -0.404254 22 | test 449 230 -0.481887 23 | test 2 1186 -0.455573 24 | test 504 1138 -0.456215 25 | 9.76user 0.00system 0:09.76elapsed 99%CPU (0avgtext+0avgdata 16776maxresident)k 26 | 0inputs+8outputs (0major+3099minor)pagefaults 0swaps 27 | -------------------------------------------------------------------------------- /openacc/lab-floyd/ans/floyd.log1: -------------------------------------------------------------------------------- 1 | NVC++-W-0155-OpenACC multicore code disabled inside OpenMP parallel construct (floyd.cpp: 59) 2 | NVC++-W-0155-OpenACC multicore code disabled inside OpenMP parallel construct (floyd.cpp: 65) 3 | NVC++-W-0155-OpenACC multicore code disabled inside OpenMP parallel construct (floyd.cpp: 71) 4 | main: 5 | 5, include "math.h" 6 | 15, include "math.h" 7 | 36, include "cmath" 8 | 15, include "cmath" 9 | 1935, include "specfun.h" 10 | 45, include "stl_algobase.h" 11 | 45, #omp parallel 12 | 46, Begin single region 13 | End single region 14 | Barrier 15 
| 73, Barrier 16 | 84, Barrier 17 | 24, Memory set idiom, loop replaced by call to __c_mset4 18 | NVC++/x86-64 Linux 23.3-0: compilation completed with warnings 19 | #num_dev: 0 20 | #num_thread: 64 21 | #device_id: 0 0 38 38 22 | #device_id: 62 2356 2394 38 23 | #device_id: 22 836 874 38 24 | #device_id: 13 494 532 38 25 | #device_id: 18 684 722 38 26 | #device_id: 32 1216 1254 38 27 | #device_id: 50 1900 1938 38 28 | #device_id: 43 1634 1672 38 29 | #device_id: 37 1406 1444 38 30 | #device_id: 58 2204 2242 38 31 | #device_id: 57 2166 2204 38 32 | #device_id: 12 456 494 38 33 | #device_id: 59 2242 2280 38 34 | #device_id: 19 722 760 38 35 | #device_id: 42 1596 1634 38 36 | #device_id: 45 1710 1748 38 37 | #device_id: 4 152 190 38 38 | #device_id: 40 1520 1558 38 39 | #device_id: 51 1938 1976 38 40 | #device_id: 31 1178 1216 38 41 | #device_id: 29 1102 1140 38 42 | #device_id: 1 38 76 38 43 | #device_id: 53 2014 2052 38 44 | #device_id: 3 114 152 38 45 | #device_id: 9 342 380 38 46 | #device_id: 38 1444 1482 38 47 | #device_id: 24 912 950 38 48 | #device_id: 39 1482 1520 38 49 | #device_id: 14 532 570 38 50 | #device_id: 8 304 342 38 51 | #device_id: 25 950 988 38 52 | #device_id: 30 1140 1178 38 53 | #device_id: 5 190 228 38 54 | #device_id: 20 760 798 38 55 | #device_id: 16 608 646 38 56 | #device_id: 33 1254 1292 38 57 | #device_id: 10 380 418 38 58 | #device_id: 41 1558 1596 38 59 | #device_id: 17 646 684 38 60 | #device_id: 21 798 836 38 61 | #device_id: 49 1862 1900 38 62 | #device_id: 61 2318 2356 38 63 | #device_id: 46 1748 1786 38 64 | #device_id: 60 2280 2318 38 65 | #device_id: 56 2128 2166 38 66 | #device_id: 6 228 266 38 67 | #device_id: 54 2052 2090 38 68 | #device_id: 48 1824 1862 38 69 | #device_id: 55 2090 2128 38 70 | #device_id: 52 1976 2014 38 71 | #device_id: 47 1786 1824 38 72 | #device_id: 7 266 304 38 73 | #device_id: 23 874 912 38 74 | #device_id: 35 1330 1368 38 75 | #device_id: 2 76 114 38 76 | #device_id: 34 1292 1330 38 77 | #device_id: 
36 1368 1406 38 78 | #device_id: 44 1672 1710 38 79 | #device_id: 15 570 608 38 80 | #device_id: 11 418 456 38 81 | #device_id: 63 2394 2400 6 82 | #device_id: 27 1026 1064 38 83 | #device_id: 26 988 1026 38 84 | #device_id: 28 1064 1102 38 85 | time 1.008772 2400 86 | test 1114 2099 -0.351597 87 | test 1562 1856 -0.818695 88 | test 1929 924 -0.409053 89 | test 2200 2122 -0.236780 90 | test 450 1605 -0.494496 91 | test 515 1139 -0.526653 92 | test 2125 2166 -0.461825 93 | test 515 1372 -0.472302 94 | test 973 2141 -0.479141 95 | test 773 1809 -0.472171 96 | test 1274 457 -0.303956 97 | test 24 2241 -0.392215 98 | test 2129 375 -0.467766 99 | test 1958 1848 -0.472492 100 | test 1023 1572 -0.650282 101 | test 544 2137 -0.443251 102 | test 1624 58 -0.516323 103 | test 1593 1153 -0.588491 104 | test 982 1746 -0.528510 105 | test 875 1432 -0.458209 106 | 60.46user 2.21system 0:01.03elapsed 6066%CPU (0avgtext+0avgdata 51984maxresident)k 107 | 0inputs+0outputs (113major+11828minor)pagefaults 0swaps 108 | -------------------------------------------------------------------------------- /openacc/lab-floyd/ans/floyd.log2: -------------------------------------------------------------------------------- 1 | index(int, int): 2 | 13, Generating implicit acc routine seq 3 | Generating NVIDIA GPU code 4 | main: 5 | 5, include "math.h" 6 | 15, include "math.h" 7 | 36, include "cmath" 8 | 15, include "cmath" 9 | 1935, include "specfun.h" 10 | 45, include "stl_algobase.h" 11 | 45, #omp parallel 12 | 46, Begin single region 13 | End single region 14 | Barrier 15 | 73, Barrier 16 | 84, Barrier 17 | 24, Memory set idiom, loop replaced by call to __c_mset4 18 | 56, Generating enter data create(copy[:12960000],dkj[:3600],dik[:3600]) 19 | Generating enter data copyin(data[:12960000]) 20 | 59, Generating present(d0[:],d1[:]) 21 | Generating implicit firstprivate(begin,end) 22 | Generating NVIDIA GPU code 23 | 61, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 24 | 59, 
Generating implicit copyout(dik[begin:end-begin]) [if not already present] 25 | 61, Generating implicit firstprivate(k) 26 | 65, Generating update self(dik[begin:step]) 27 | Generating present(d1[:],d0[:]) 28 | Generating NVIDIA GPU code 29 | 67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */ 30 | 65, Generating implicit copyout(dkj[:3600]) [if not already present] 31 | 67, Generating implicit firstprivate(k) 32 | 71, Generating update self(dkj[:3600]) 33 | Generating update device(dkj[:3600],dik[:3600]) 34 | Generating present(d1[:],d0[:]) 35 | Generating implicit firstprivate(begin,end) 36 | Generating NVIDIA GPU code 37 | 78, #pragma acc loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */ 38 | 79, /* blockIdx.x threadIdx.x tiled */ 39 | 71, Generating implicit copyin(dkj[:3600],dik[begin:end-begin]) [if not already present] 40 | 79, Generating implicit private(_T25_5543,_T22_5543) 41 | 99, Generating update self(data[begin*3600:step*3600]) 42 | Generating exit data delete(dkj[:3600],dik[:3600],data[:12960000],copy[:12960000]) 43 | float const& std::max(float const&, float const&): 44 | 5, include "math.h" 45 | 15, include "math.h" 46 | 36, include "cmath" 47 | 15, include "cmath" 48 | 1935, include "specfun.h" 49 | 45, include "stl_algobase.h" 50 | 255, Generating implicit acc routine seq 51 | Generating NVIDIA GPU code 52 | #num_dev: 1 53 | #num_thread: 1 54 | #device_id: 0 0 3600 3600 55 | time 1.106637 3600 56 | test 1591 172 -0.763397 57 | test 802 369 -0.453210 58 | test 2535 1922 -0.288441 59 | test 2509 360 -0.270466 60 | test 3035 2689 -0.397025 61 | test 3529 2138 -0.559413 62 | test 3380 3190 -0.562856 63 | test 1650 2709 -0.484510 64 | test 2813 3211 -0.411624 65 | test 3283 3454 -0.454825 66 | test 2055 1613 -0.629674 67 | test 3196 677 -0.532822 68 | test 697 3166 -0.443936 69 | test 2548 3111 -0.529714 70 | test 334 1171 -0.424709 71 | test 1307 1078 -0.537585 72 | test 1343 1261 -0.535192 73 | test 599 278 -0.486341 74 | 
test 3183 3108 -0.501162 75 | test 3390 1771 -0.395187 76 | 1.13user 0.19system 0:01.33elapsed 99%CPU (0avgtext+0avgdata 155400maxresident)k 77 | 0inputs+0outputs (20major+11576minor)pagefaults 0swaps 78 | -------------------------------------------------------------------------------- /openacc/lab-floyd/ans/lab-floyd.2023-05-24.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/ans/lab-floyd.2023-05-24.tgz -------------------------------------------------------------------------------- /openacc/lab-floyd/ans/makefile: -------------------------------------------------------------------------------- 1 | device: 2 | nvaccelinfo 3 | 4 | serial: 5 | nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all -Mneginfo=all floyd.cpp >floyd.log0 2>&1 6 | #nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1 7 | timeout 1m time ./floyd.exe0 >>floyd.log0 2>&1 8 | 9 | multicore: 10 | nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd.cpp >floyd.log1 2>&1 11 | #nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1 12 | timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1 13 | 14 | managed: 15 | nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd.cpp >floyd.log2 2>&1 16 | #nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 2>&1 17 | timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1 18 | 19 | optimize: 20 | nvc++ -o floyd.exe3 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log3 2>&1 21 | #nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1 22 | timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1 23 | 24 | multidevice: 25 | nvc++ -o floyd.exe4 -DSIZE=6000 -mp=multicore -acc=gpu -Minfo=all 
-Mneginfo=all floyd.cpp >floyd.log4 2>&1 26 | #nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1 27 | timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1 28 | 29 | all: clean serial multicore managed optimize multidevice 30 | 31 | clean: 32 | rm -f floyd.exe* floyd.prof* floyd.log* 33 | 34 | -------------------------------------------------------------------------------- /openacc/lab-floyd/ans/run.sh: -------------------------------------------------------------------------------- 1 | export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin:$PATH 2 | 3 | source device.sh 1 4 | #source device.sh 8 5 | make clean 6 | 7 | timeout 1m time make serial 8 | timeout 1m time make multicore 9 | timeout 1m time make managed 10 | #timeout 1m time make optimize 11 | #timeout 1m time make multidevice 12 | 13 | -------------------------------------------------------------------------------- /openacc/lab-floyd/device.sh: -------------------------------------------------------------------------------- 1 | devreq=$1 2 | if [ "$devreq" == "" ] ; then 3 | devreq=2 4 | fi 5 | 6 | unset CUDA_VISIBLE_DEVICES 7 | numdev=`nvaccelinfo | grep -e "^Device Number:" | wc -l` 8 | if [ $devreq -ge $numdev ] ; then 9 | echo -e "unset CUDA_VISIBLE_DEVICES" 10 | else 11 | let devnum=$$%$numdev 12 | devlst=$devnum 13 | let devreq=$devreq-1 14 | while [ $devreq -gt 0 ] ; do 15 | let devnum=($devnum+1)%$numdev 16 | devlst=$devlst,$devnum 17 | let devreq=$devreq-1 18 | done 19 | export CUDA_VISIBLE_DEVICES=$devlst 20 | echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" 21 | fi 22 | 23 | -------------------------------------------------------------------------------- /openacc/lab-floyd/floyd.exe0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/floyd.exe0 
-------------------------------------------------------------------------------- /openacc/lab-floyd/floyd.exe1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/floyd.exe1 -------------------------------------------------------------------------------- /openacc/lab-floyd/floyd.exe2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/floyd.exe2 -------------------------------------------------------------------------------- /openacc/lab-floyd/floyd.log0: -------------------------------------------------------------------------------- 1 | time 8.989724 1200 2 | test 65 506 -0.544105 3 | test 272 605 -0.519495 4 | test 391 617 -0.342672 5 | test 384 274 -0.396835 6 | test 1136 489 -0.337305 7 | test 657 761 -0.485585 8 | test 893 278 -0.457243 9 | test 163 278 -0.431883 10 | test 366 221 -0.400291 11 | test 794 598 -0.520162 12 | test 712 749 -0.404731 13 | test 390 1090 -0.518321 14 | test 547 715 -0.355524 15 | test 366 102 -0.467554 16 | test 627 1029 -0.434617 17 | test 808 1044 -0.606123 18 | test 687 232 -0.404254 19 | test 449 230 -0.481887 20 | test 2 1186 -0.455573 21 | test 504 1138 -0.456215 22 | 8.99user 0.00system 0:09.00elapsed 99%CPU (0avgtext+0avgdata 11108maxresident)k 23 | 0inputs+8outputs (0major+1684minor)pagefaults 0swaps 24 | -------------------------------------------------------------------------------- /openacc/lab-floyd/floyd.log1: -------------------------------------------------------------------------------- 1 | main: 2 | 22, Memory set idiom, loop replaced by call to __c_mset4 3 | 43, #omp parallel 4 | time 1.063020 2400 5 | test 1114 2099 -0.351597 6 | test 1562 1856 -0.818695 7 | test 1929 924 -0.409053 8 | test 2200 2122 
-0.236780 9 | test 450 1605 -0.494496 10 | test 515 1139 -0.526653 11 | test 2125 2166 -0.461825 12 | test 515 1372 -0.472302 13 | test 973 2141 -0.479141 14 | test 773 1809 -0.472171 15 | test 1274 457 -0.303956 16 | test 24 2241 -0.392215 17 | test 2129 375 -0.467766 18 | test 1958 1848 -0.472492 19 | test 1023 1572 -0.650282 20 | test 544 2137 -0.443251 21 | test 1624 58 -0.516323 22 | test 1593 1153 -0.588491 23 | test 982 1746 -0.528510 24 | test 875 1432 -0.458209 25 | 62.96user 0.49system 0:01.08elapsed 5854%CPU (0avgtext+0avgdata 29232maxresident)k 26 | 0inputs+0outputs (162major+6249minor)pagefaults 0swaps 27 | -------------------------------------------------------------------------------- /openacc/lab-floyd/floyd.log2: -------------------------------------------------------------------------------- 1 | index(int, int): 2 | 13, Generating implicit acc routine seq 3 | Generating NVIDIA GPU code 4 | main: 5 | 22, Memory set idiom, loop replaced by call to __c_mset4 6 | 43, Generating NVIDIA GPU code 7 | 46, #pragma acc loop gang, worker(4) /* blockIdx.x threadIdx.y */ 8 | 49, #pragma acc loop vector(32) /* threadIdx.x */ 9 | 43, Generating implicit copy(data[:]) [if not already present] 10 | 49, Loop is parallelizable 11 | Generating implicit firstprivate(k) 12 | time 0.811786 3600 13 | test 1591 172 -0.763397 14 | test 802 369 -0.453210 15 | test 2535 1922 -0.288441 16 | test 2509 360 -0.270466 17 | test 3035 2689 -0.397025 18 | test 3529 2138 -0.559413 19 | test 3380 3190 -0.562856 20 | test 1650 2709 -0.484510 21 | test 2813 3211 -0.411624 22 | test 3283 3454 -0.454825 23 | test 2055 1613 -0.629674 24 | test 3196 677 -0.532822 25 | test 697 3166 -0.443936 26 | test 2548 3111 -0.529714 27 | test 334 1171 -0.424709 28 | test 1307 1078 -0.537585 29 | test 1343 1261 -0.535192 30 | test 599 278 -0.486341 31 | test 3183 3108 -0.501162 32 | test 3390 1771 -0.395187 33 | 0.85user 0.17system 0:01.04elapsed 98%CPU (0avgtext+0avgdata 156144maxresident)k 34 | 
0inputs+0outputs (20major+11584minor)pagefaults 0swaps 35 | -------------------------------------------------------------------------------- /openacc/lab-floyd/floyd_managed.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace std; 12 | // #define SIZE 1000 13 | inline int index(const int i, const int j) { return i * SIZE + j; } 14 | 15 | // add your codes begin 16 | 17 | // add your codes end 18 | // Fills a SIZE x SIZE matrix with random edge weights, relaxes it over every pivot k, then prints the elapsed time and 20 sample entries. 19 | int main() { 20 | const int size2 = SIZE * SIZE; 21 | float* data = new float[size2]; 22 | for (int i = 0; i < size2; i++) data[i] = -INF; 23 | 24 | srand(SIZE); 25 | for (int i = 0; i < SIZE * 20; i++) { 26 | // first-dimension (row) index 27 | int prev = rand() % SIZE; 28 | // second-dimension (column) index 29 | int next = rand() % SIZE; 30 | // retry when the cell is on the diagonal or already holds an edge weight 31 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 32 | i--; 33 | continue; 34 | } 35 | // assign the edge weight 36 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 37 | } 38 | double t = omp_get_wtime(); 39 | // Relaxation: for each pivot k, keep the larger of data[i][j] and data[i][k] + data[k][j]. 40 | // add your codes begin 41 | // #pragma acc parallel loop 42 | // #pragma acc data copy(data[0:size2]) 43 | for (int k = 0; k < SIZE; k++) { 44 | #pragma acc parallel loop gang worker 45 | //#pragma acc data copyin(data) copyout(data) 46 | for (int i = 0; i < SIZE; i++) { 47 | //#pragma acc parallel loop collapse(2) 48 | #pragma acc loop vector 49 | for (int j = 0; j < SIZE; j++) { 50 | if (data[index(i, j)] < data[index(i, k)] + data[index(k, j)]) { 51 | data[index(i, j)] = data[index(i, k)] + data[index(k, j)]; 52 | } 53 | } 54 | } 55 | } 56 | 57 | // add your codes end 58 | t = omp_get_wtime() - t; 59 | printf("time %f %d\n", t, SIZE); 60 | 61 | for (int i = 0; i < 20; i++) { 62 | int prev = rand() % SIZE; 63 | int next = rand() % SIZE; 64 | if (prev == next) { 65 | i--; 66 | continue; 67 | } 68 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 69
| } 70 | } 71 | -------------------------------------------------------------------------------- /openacc/lab-floyd/floyd_multicore.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace std; 12 | // #define SIZE 1000 13 | inline int index(const int i, const int j) { return i * SIZE + j; } 14 | 15 | // add your codes begin 16 | 17 | // add your codes end 18 | // Fills a SIZE x SIZE matrix with random edge weights, relaxes it over every pivot k with an OpenMP parallel loop over rows, then prints the elapsed time and 20 sample entries. 19 | int main() { 20 | const int size2 = SIZE * SIZE; 21 | float* data = new float[size2]; 22 | for (int i = 0; i < size2; i++) data[i] = -INF; 23 | 24 | srand(SIZE); 25 | for (int i = 0; i < SIZE * 20; i++) { 26 | // first-dimension (row) index 27 | int prev = rand() % SIZE; 28 | // second-dimension (column) index 29 | int next = rand() % SIZE; 30 | // retry when the cell is on the diagonal or already holds an edge weight 31 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 32 | i--; 33 | continue; 34 | } 35 | // assign the edge weight 36 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 37 | } 38 | double t = omp_get_wtime(); 39 | // Relaxation: for each pivot k, keep the larger of data[i][j] and data[i][k] + data[k][j]; pairs whose i->k or k->j entry is still -INF are skipped. 40 | // add your codes begin 41 | for (int k = 0; k < SIZE; k++) { 42 | #pragma omp parallel for num_threads(60) schedule(dynamic) 43 | for (int i = 0; i < SIZE; i++) { 44 | for (int j = 0; j < SIZE; j++) { 45 | if (data[index(i, k)] == -INF || data[index(k, j)] == -INF) 46 | continue; 47 | if (data[index(i, j)] < data[index(i, k)] + data[index(k, j)]) { 48 | data[index(i, j)] = data[index(i, k)] + data[index(k, j)]; 49 | } 50 | } 51 | } 52 | } 53 | 54 | // add your codes end 55 | t = omp_get_wtime() - t; 56 | printf("time %f %d\n", t, SIZE); 57 | 58 | for (int i = 0; i < 20; i++) { 59 | int prev = rand() % SIZE; 60 | int next = rand() % SIZE; 61 | if (prev == next) { 62 | i--; 63 | continue; 64 | } 65 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /openacc/lab-floyd/floyd_multidevice.cpp:
-------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace std; 12 | // #define SIZE 1000 13 | inline int index(const int i, const int j) { return i * SIZE + j; } 14 | 15 | // add your codes begin 16 | 17 | // add your codes end 18 | // Fills a SIZE x SIZE matrix with random edge weights, relaxes it over every pivot k (plain serial loops in this file), then prints the elapsed time and 20 sample entries. 19 | int main() { 20 | const int size2 = SIZE * SIZE; 21 | float* data = new float[size2]; 22 | for (int i = 0; i < size2; i++) data[i] = -INF; 23 | 24 | srand(SIZE); 25 | for (int i = 0; i < SIZE * 20; i++) { 26 | // first-dimension (row) index 27 | int prev = rand() % SIZE; 28 | // second-dimension (column) index 29 | int next = rand() % SIZE; 30 | // retry when the cell is on the diagonal or already holds an edge weight 31 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 32 | i--; 33 | continue; 34 | } 35 | // assign the edge weight 36 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 37 | } 38 | double t = omp_get_wtime(); 39 | // Relaxation: for each pivot k, keep the larger of data[i][j] and data[i][k] + data[k][j]. 40 | // add your codes begin 41 | for (int k = 0; k < SIZE; k++) { 42 | for (int i = 0; i < SIZE; i++) { 43 | for (int j = 0; j < SIZE; j++) { 44 | if (data[index(i,j)] < data[index(i,k)] + data[index(k,j)]) { 45 | data[index(i,j)] = data[index(i,k)] + data[index(k,j)]; 46 | } 47 | } 48 | } 49 | } 50 | 51 | // add your codes end 52 | t = omp_get_wtime() - t; 53 | printf("time %f %d\n", t, SIZE); 54 | 55 | for (int i = 0; i < 20; i++) { 56 | int prev = rand() % SIZE; 57 | int next = rand() % SIZE; 58 | if (prev == next) { 59 | i--; 60 | continue; 61 | } 62 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /openacc/lab-floyd/floyd_optimize.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace std; 12 | // #define SIZE 1000 13 | inline int index(const int i, const
int j) { return i * SIZE + j; } 14 | 15 | // add your codes begin 16 | 17 | // add your codes end 18 | // Fills a SIZE x SIZE matrix with random edge weights, relaxes it over every pivot k inside one OpenACC data region, then prints the elapsed time and 20 sample entries. 19 | int main() { 20 | const int size2 = SIZE * SIZE; 21 | float* data = new float[size2]; 22 | for (int i = 0; i < size2; i++) data[i] = -INF; 23 | 24 | srand(SIZE); 25 | for (int i = 0; i < SIZE * 20; i++) { 26 | // first-dimension (row) index 27 | int prev = rand() % SIZE; 28 | // second-dimension (column) index 29 | int next = rand() % SIZE; 30 | // retry when the cell is on the diagonal or already holds an edge weight 31 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 32 | i--; 33 | continue; 34 | } 35 | // assign the edge weight 36 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 37 | } 38 | double t = omp_get_wtime(); 39 | // Relaxation: the data region wraps the whole k loop (copy(data[0:size2])); each k launches one parallel loop over rows i with a vector loop over columns j. 40 | // add your codes begin 41 | #pragma acc data copy(data[0:size2]) 42 | for (int k = 0; k < SIZE; k++) { 43 | #pragma acc parallel loop gang worker num_workers(4) vector_length(128) 44 | for (int i = 0; i < SIZE; i++) { 45 | #pragma acc loop vector 46 | for (int j = 0; j < SIZE; j++) { 47 | if (data[index(i,j)] < data[index(i,k)] + data[index(k,j)]) { 48 | data[index(i,j)] = data[index(i,k)] + data[index(k,j)]; 49 | } 50 | } 51 | } 52 | } 53 | 54 | // add your codes end 55 | t = omp_get_wtime() - t; 56 | printf("time %f %d\n", t, SIZE); 57 | 58 | for (int i = 0; i < 20; i++) { 59 | int prev = rand() % SIZE; 60 | int next = rand() % SIZE; 61 | if (prev == next) { 62 | i--; 63 | continue; 64 | } 65 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /openacc/lab-floyd/floyd_serial.cpp: -------------------------------------------------------------------------------- 1 | #define INF 1e7 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace std; 12 | // #define SIZE 1000 13 | inline int index(const int i, const int j) { return i * SIZE + j; } 14 | 15 | // add your codes begin 16 | 17 | // add your codes end 18 | // Serial baseline: fills a SIZE x SIZE matrix with random edge weights, relaxes it over every pivot k, then prints the elapsed time and 20 sample entries. 19 | int main() { 20 | const int size2 = SIZE
* SIZE; 21 | float* data = new float[size2]; 22 | for (int i = 0; i < size2; i++) data[i] = -INF; 23 | 24 | srand(SIZE); 25 | for (int i = 0; i < SIZE * 20; i++) { 26 | // first-dimension (row) index 27 | int prev = rand() % SIZE; 28 | // second-dimension (column) index 29 | int next = rand() % SIZE; 30 | // retry when the cell is on the diagonal or already holds an edge weight 31 | if ((prev == next) || (data[index(prev, next)] > -INF)) { 32 | i--; 33 | continue; 34 | } 35 | // assign the edge weight 36 | data[index(prev, next)] = log((rand() % 99 + 1.0) / 100); 37 | } 38 | double t = omp_get_wtime(); 39 | // Relaxation: for each pivot k, keep the larger of data[i][j] and data[i][k] + data[k][j]. 40 | // add your codes begin 41 | for (int k = 0; k < SIZE; k++) { 42 | for (int i = 0; i < SIZE; i++) { 43 | for (int j = 0; j < SIZE; j++) { 44 | if (data[index(i,j)] < data[index(i,k)] + data[index(k,j)]) { 45 | data[index(i,j)] = data[index(i,k)] + data[index(k,j)]; 46 | } 47 | } 48 | } 49 | } 50 | 51 | // add your codes end 52 | t = omp_get_wtime() - t; 53 | printf("time %f %d\n", t, SIZE); 54 | 55 | for (int i = 0; i < 20; i++) { 56 | int prev = rand() % SIZE; 57 | int next = rand() % SIZE; 58 | if (prev == next) { 59 | i--; 60 | continue; 61 | } 62 | printf("test %d %d %f\n", prev, next, data[index(prev, next)]); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /openacc/lab-floyd/lab-floyd.2023-05-24.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/lab-floyd.2023-05-24.tgz -------------------------------------------------------------------------------- /openacc/lab-floyd/makefile: -------------------------------------------------------------------------------- 1 | device: 2 | nvaccelinfo 3 | 4 | serial: 5 | nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all -Mneginfo=all floyd_serial.cpp >floyd.log0 2>&1 6 | #nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1 7 | timeout 1m time ./floyd.exe0 >>floyd.log0 2>&1 8 | 9 | multicore: 10 |
nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd_multicore.cpp >floyd.log1 2>&1 11 | #nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1 12 | timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1 13 | 14 | managed: 15 | nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd_managed.cpp >floyd.log2 2>&1 16 | #nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 2>&1 17 | timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1 18 | 19 | optimize: 20 | nvc++ -o floyd.exe3 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd_optimize.cpp >floyd.log3 2>&1 21 | #nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1 22 | timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1 23 | 24 | multidevice: 25 | nvc++ -o floyd.exe4 -DSIZE=6000 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd_multidevice.cpp >floyd.log4 2>&1 26 | #nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1 27 | timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1 28 | 29 | all: clean serial multicore managed optimize multidevice 30 | 31 | clean: 32 | rm -f floyd.exe* floyd.prof* floyd.log* 33 | 34 | -------------------------------------------------------------------------------- /openacc/lab-floyd/run.sh: -------------------------------------------------------------------------------- 1 | export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin:$PATH 2 | 3 | source device.sh 1 4 | #source device.sh 8 5 | make clean 6 | 7 | timeout 1m time make serial 8 | timeout 1m time make multicore 9 | timeout 1m time make managed 10 | # timeout 1m time make optimize 11 | # timeout 1m time make multidevice 12 | 13 | -------------------------------------------------------------------------------- /openacc/note.md: -------------------------------------------------------------------------------- 1 
| # OpenACC 2 | 语法 3 | 4 | ```c++ 5 | #pragma acc parallel 6 | 会产生一个或者多个gang并行执行代码 7 | #pragma acc loop 8 | 提醒编译器下方代码需要并行的去执行 9 | 10 | -ta=tesla:managed 11 | 12 | kernels与parallel的区别,kernels需要编译器来保证并行的正确性 13 | #pragma acc kernels 14 | { 15 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | using namespace std; 12 | 13 | // add your codes begin 14 | 15 | // #define SIZE 10000000 16 | 17 | int cnt; 18 | bool st[SIZE + 1]; // +1: both loops below index st[SIZE] (bounds are j <= SIZE and i <= SIZE) 19 | bool is_prime[SIZE]; 20 | 21 | // add your codes end 22 | // Sieves st[] up to SIZE, gathers the unmarked indices per thread, then prints the elapsed time, the sorted primes and their count. 23 | int main() { 24 | vector prime; 25 | 26 | double t = omp_get_wtime(); 27 | omp_set_num_threads(30); 28 | // result capacity; has to be tuned to the magnitude of SIZE 29 | int sz = SIZE; 30 | if (SIZE >= 100000000) { 31 | sz /= 15; 32 | } 33 | prime.resize(sz); 34 | // Sieve of Eratosthenes; only bases up to sqrt(SIZE) need to be processed 35 | for (int i = 2; i <= (int)sqrt(SIZE); i++) { 36 | if (!st[i]) { 37 | // prime.push_back(i); 38 | #pragma omp parallel for 39 | for (int j = i + i; j <= SIZE; j += i) { 40 | if (!st[j]) st[j] = true; 41 | } 42 | } 43 | } 44 | st[0] = st[1] = 1; 45 | int cnt = 0; 46 | #pragma omp parallel 47 | { 48 | int id, i, nthrds; 49 | id = omp_get_thread_num(); 50 | nthrds = omp_get_num_threads(); 51 | vector res; 52 | int num = SIZE; 53 | if (id != (2 % nthrds) && id % 2 == 0 && nthrds % 2 == 0) { // an even stride from an even start only visits even numbers; the thread owning 2 is id == 2 % nthrds (thread 0 when the team has 2 threads), so it must not be skipped 54 | } else { 55 | for (i = id; i <= SIZE; i += nthrds) { 56 | if (st[i] == 0) { 57 | res.push_back(i); 58 | } 59 | } 60 | } 61 | 62 | #pragma omp critical 63 | for (int j = 0; j < res.size(); j++) { 64 | prime[cnt++] = res[j]; 65 | } 66 | } 67 | prime.resize(cnt); 68 | 69 | // add your codes end 70 | t = omp_get_wtime() - t; 71 | printf("time %f %ld\n", t, long(SIZE)); 72 | 73 | printf("prime"); 74 | sort(prime.begin(), prime.end()); 75 | for (long i = 0; i < prime.size(); i++) printf(" %ld", prime[i]); 76 | printf("\nsize %ld\n", prime.size()); 77 | } 78 | -------------------------------------------------------------------------------- /openmp/assign_prime/prime_solution:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/prime_solution -------------------------------------------------------------------------------- /openmp/assign_prime/res.log: -------------------------------------------------------------------------------- 1 | 1c1 2 | < time 9.164585 100000000 3 | --- 4 | > time 0.485323 100000000 5 | 4c4 6 | < time 7.479427 100000000 7 | --- 8 | > time 0.256331 100000000 9 | 7c7 10 | < time 7.607437 100000000 11 | --- 12 | > time 0.533640 100000000 13 | 10c10 14 | < time 8.163960 100000000 15 | --- 16 | > time 0.668524 100000000 17 | 13c13 18 | < time 13.144602 100000000 19 | --- 20 | > time 0.261871 100000000 21 | -------------------------------------------------------------------------------- /openmp/assign_prime/run.sh: -------------------------------------------------------------------------------- 1 | size=100000000 2 | 3 | g++ -o prime -fopenmp -DSIZE=$size prime.cpp && timeout 60s time ./prime #> testmy.log 4 | 5 | -------------------------------------------------------------------------------- /openmp/assign_prime/temp/assign_01_prime/build/prime: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/assign_01_prime/build/prime -------------------------------------------------------------------------------- /openmp/assign_prime/temp/assign_01_prime/build/primeSerial: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/assign_01_prime/build/primeSerial -------------------------------------------------------------------------------- 
/openmp/assign_prime/temp/assign_01_prime/build/primexyy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/assign_01_prime/build/primexyy -------------------------------------------------------------------------------- /openmp/assign_prime/temp/assign_01_prime/prime.cpp: -------------------------------------------------------------------------------- 1 | // student 宋家庆: 2 | // id 202000130061: 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | using namespace std; 10 | 11 | // add your codes begin 12 | #define SIZE 10 13 | #define N 100000000 14 | 15 | int primes[N], cnt; 16 | bool st[N + 1]; // +1: main() marks and reads st[j] for j up to N inclusive 17 | 18 | bool is_prime[N]; 19 | 20 | // add your codes end 21 | bool check1(int x) { 22 | if (x < 2) return false; 23 | if (x > 10 && x % 10 == 5) return false; 24 | for (int i = 2; i <= x / i; ++i) { 25 | if (x % i == 0) { 26 | return false; 27 | } 28 | } 29 | return true; 30 | } 31 | // Parallel trial division: returns true when no divisor in [2, sqrt(x)] divides x. 32 | bool check2(int x) { 33 | if (x < 2) return false; 34 | // Thread id tests the divisors 2 + id, 2 + id + T, 2 + id + 2T, ... where T is the team size. 35 | int flag = 0; 36 | omp_set_num_threads(SIZE); 37 | #pragma omp parallel 38 | { 39 | int id = omp_get_thread_num(); 40 | const int stride = omp_get_num_threads(); // thread-local copy, so no thread reads a variable another one writes 41 | int fg = 0; 42 | for (int i = 2 + id; i <= x / i; i += stride) { 43 | if (x % i == 0) { 44 | fg++; 45 | break; 46 | } 47 | } 48 | #pragma omp atomic 49 | flag += fg; 50 | } 51 | if (flag) { 52 | return false; 53 | } 54 | 55 | return true; 56 | } 57 | 58 | bool check3(int x) { 59 | if (x < 2) return false; 60 | if (x > 10 && x % 10 == 5) return false; 61 | for (size_t i = 0; i < x; i++) { 62 | /* code */ 63 | } 64 | return false; 65 | } 66 | 67 | int main() { 68 | double pi; 69 | 70 | double t = omp_get_wtime(); 71 | // add your codes begin 72 | 73 | omp_set_num_threads(SIZE); 74 | 75 | int count = 0; 76 | int nthread = SIZE; 77 | // naive sieve 78 | // for (int i = 2; i <= N; i++) {
79 | // if (!st[i]) primes[cnt++] = i; // store the prime 80 | // #pragma omp for 81 | // for (int j = i; j <= N; 82 | // j += i) { // prime or composite, every i crosses out its later multiples 83 | // if (!st[j]) st[j] = true; 84 | // } 85 | // } 86 | // NOTE(review): j below and i in the counting loop both run up to N inclusive, so st must hold N + 1 elements. 87 | for (long i = 2; i * i <= N; i++) { 88 | #pragma omp parallel for 89 | for (int j = i * i; j <= N; j = j + i) { 90 | if (st[j] == 0) st[j] = 1; 91 | } 92 | } 93 | 94 | // Sieve of Eratosthenes 95 | // for (int i = 2; i <= N; i++) { 96 | // if (!st[i]) { 97 | // primes[cnt++] = i; 98 | // #pragma omp for 99 | // for (int j = i; j <= N; j += i) 100 | // if (!st[j]) st[j] = true; // 101 | // primes alone are enough to cross out every composite; 102 | // } 103 | // } 104 | // linear (Euler) sieve 105 | // for (int i = 2; i <= N; i++) { 106 | // if (!st[i]) primes[cnt++] = i; 107 | // for (int j = 0; primes[j] <= N / i; j++) { 108 | // st[primes[j] * i] = true; 109 | // if (i % primes[j] == 0) break; 110 | // } 111 | // } 112 | // #pragma omp parallel 113 | // { 114 | // int id = omp_get_thread_num(); 115 | // if (id == 0) nthread = omp_get_num_threads(); 116 | // int cnt = 0; 117 | 118 | // for (int i = id; i < N; i += nthread) { 119 | // if (i % 2 == 0) continue; 120 | // if (check1(i)){ 121 | // cnt++; 122 | // } 123 | // } 124 | // #pragma omp atomic 125 | // count+=cnt; 126 | // } 127 | for (int i = 2; i <= N; i++) { 128 | if (!st[i]) { 129 | cnt++; 130 | } 131 | } 132 | t = omp_get_wtime() - t; 133 | printf("time %f %d\n", t, SIZE); 134 | 135 | printf("%d\n", cnt); 136 | } 137 | -------------------------------------------------------------------------------- /openmp/assign_prime/temp/assign_01_prime/primeSerial.cpp: -------------------------------------------------------------------------------- 1 | // student 宋家庆: 2 | // id 202000130061: 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // add your codes begin 10 | #define SIZE 25 11 | #define N 100000000 12 | // add your codes end 13 | bool check(int x) { 14 | if (x < 2) return false; 15 | for (int i = 2; i <= x / i; ++i) { 16 | if (x % i == 0) {
17 | return false; 18 | } 19 | } 20 | return true; 21 | } 22 | 23 | int main() { 24 | double pi; 25 | 26 | double t = omp_get_wtime(); 27 | // add your codes begin 28 | 29 | omp_set_num_threads(SIZE); 30 | int count = 0; 31 | for (int i = 0; i < N; i++) { 32 | if (check(i)) count++; 33 | } 34 | t = omp_get_wtime() - t; 35 | printf("time %f %d\n", t, SIZE); 36 | 37 | printf("%d\n", count); 38 | } 39 | -------------------------------------------------------------------------------- /openmp/assign_prime/temp/build/prime: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/build/prime -------------------------------------------------------------------------------- /openmp/assign_prime/temp/prime: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/prime -------------------------------------------------------------------------------- /openmp/assign_prime/temp/prime.cpp: -------------------------------------------------------------------------------- 1 | // // student name: 2 | // // id number: 3 | 4 | // #include 5 | // #include 6 | // #include 7 | 8 | // #include 9 | // #include 10 | // #include 11 | // using namespace std; 12 | 13 | // // add your codes begin 14 | 15 | // // #define SIZE 100000000 16 | 17 | // int cnt; 18 | // bool st[SIZE]; 19 | // bool is_prime[SIZE]; 20 | 21 | // // add your codes end 22 | 23 | // int main() { 24 | // vector prime; 25 | 26 | // double t = omp_get_wtime(); 27 | // omp_set_num_threads(25); 28 | // // add your codes begin 29 | // // naive sieve 30 | // // for (int i = 2; i < SIZE; i++) { 31 | // // if (!st[i]) prime.push_back(i); 32 | 33 | // // for (int j = i; j < SIZE; j += i) { 34 | // // if (!st[j]) st[j] = true; 35 | //
// } 36 | // // } 37 | 38 | // // number-theoretic shortcut: only bases with i * i <= SIZE are needed 39 | // // for (int i = 2; i * i <= SIZE; i++) { 40 | // // // if (!st[i]) prime.push_back(i); 41 | // // #pragma omp parallel for 42 | // // for (int j = i + i; j <= SIZE; j += i) { 43 | // // if (st[j] == 0) st[j] = 1; 44 | // // } 45 | // // } 46 | 47 | // for (long i = 2; i * i <= SIZE; i++) { 48 | // #pragma omp parallel for 49 | // for (int j = i * i; j <= SIZE; j = j + i) { 50 | // if (st[j] == 0) st[j] = 1; 51 | // } 52 | // } 53 | 54 | // // Sieve of Eratosthenes 55 | // // for (int i = 2; i <= SIZE; i++) { 56 | // // if (!st[i]) { 57 | // // // prime.push_back(i); 58 | // // #pragma omp parallel for 59 | // // for (int j = i; j < SIZE; j += i) { 60 | // // if (!st[j]) st[j] = true; 61 | // // } 62 | // // } 63 | // // } 64 | // // linear (Euler) sieve 65 | // // for (int i = 2; i < SIZE; i++) { 66 | // // if (!st[i]) { 67 | // // prime.push_back(i); 68 | // // } 69 | // // for (int j = 0; prime[j] * i < SIZE && j < prime.size(); j++) { 70 | // // st[prime[j] * i] = true; 71 | // // if (i % prime[j] == 0) break; 72 | // // } 73 | // // } 74 | 75 | // // add your codes end 76 | // t = omp_get_wtime() - t; 77 | // printf("time %f %ld\n", t, long(SIZE)); 78 | 79 | // printf("prime"); 80 | // sort(prime.begin(), prime.end()); 81 | // // for (long i = 0; i < prime.size(); i++) printf(" %ld", prime[i]); 82 | // printf("\nsize %ld\n", prime.size()); 83 | // } 84 | // student name:颜恺楠 85 | // id number:202000130203 86 | 87 | #include 88 | #include 89 | #include 90 | 91 | #include 92 | #include 93 | #include 94 | using namespace std; 95 | 96 | // add your codes begin 97 | // add your codes end 98 | bool check[SIZE + 1]; 99 | 100 | int main() { 101 | vector prime; 102 | 103 | double t = omp_get_wtime(); 104 | 105 | // add your codes begin 106 | // vector check(SIZE + 1, 0); 107 | for (long i = 2; i * i <= SIZE; i++) { 108 | // if (check[i] != 0 || i&1==0 || (i%6!=1 && i%6!=5)) 109 | /* if (check[i] != 0) 110 | continue; */ 111 | if (!check[i]) //!
check[i]==0表示是素数 112 | { 113 | // prime.push_back(i); 114 | #pragma omp parallel for 115 | for (long j = i + i; j <= SIZE; j += i) 116 | if (!check[j]) check[j] = 1; 117 | } 118 | } 119 | // #pragma parallel omp for 120 | for (long i = 2; i <= SIZE; i++) 121 | if (!check[i]) prime.push_back(i); 122 | // add your codes end 123 | 124 | t = omp_get_wtime() - t; 125 | printf("time %f %ld\n", t, long(SIZE)); 126 | 127 | printf("prime"); 128 | sort(prime.begin(), prime.end()); 129 | // for (long i = 0; i < prime.size(); i++) 130 | // printf(" %ld", prime[i]); 131 | printf("\nsize %ld\n", prime.size()); 132 | } 133 | -------------------------------------------------------------------------------- /openmp/assign_prime/temp/prime_solution: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/prime_solution -------------------------------------------------------------------------------- /openmp/assign_prime/temp/run.sh: -------------------------------------------------------------------------------- 1 | size=100000000 2 | 3 | g++ -o prime -fopenmp -DSIZE=$size prime.cpp && timeout 60s time ./prime 4 | 5 | -------------------------------------------------------------------------------- /openmp/assign_prime/temp/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # if [ -f test.log ] ; then 4 | # cat test.log | grep time 5 | # exit 6 | # fi 7 | 8 | for i in 0 1 2 3 4 ; do 9 | ./prime_solution 10 | done | tee test.log | grep time 11 | 12 | -------------------------------------------------------------------------------- /openmp/assign_prime/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -f test.log ] ; then 4 | cat test.log | grep time 5 | exit 6 | fi 7 | 8 | for i in 0 1 2 3 4 ; do 9 |
./prime_solution 10 | done | tee test.log | grep time 11 | 12 | -------------------------------------------------------------------------------- /openmp/assign_prime/testmy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -f testmy.log ] ; then 4 | cat testmy.log | grep time 5 | exit 6 | fi 7 | 8 | for i in 0 1 2 3 4 ; do 9 | ./prime 10 | done | tee testmy.log | grep time 11 | 12 | -------------------------------------------------------------------------------- /openmp/assign_sort/assign_sort.2023-03-29.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/assign_sort.2023-03-29.tgz -------------------------------------------------------------------------------- /openmp/assign_sort/build/build/tp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/build/build/tp -------------------------------------------------------------------------------- /openmp/assign_sort/build/build/tp2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/build/build/tp2 -------------------------------------------------------------------------------- /openmp/assign_sort/build/build/tp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/build/build/tp3 -------------------------------------------------------------------------------- /openmp/assign_sort/build/sort_radix: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/build/sort_radix -------------------------------------------------------------------------------- /openmp/assign_sort/build/sort_radix.cpp: -------------------------------------------------------------------------------- 1 | // student name: 宋家庆 2 | // id number: 202000130061 3 | 4 | #define CUTOFF 1024 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | using namespace std; 16 | 17 | // add your codes begin 18 | #define maxbit 256 19 | int thrd_num = 10; 20 | int bitnum = 256; 21 | // #define SIZE 1000 22 | int bits[6] = {0, 8, 16, 24, 32, 40}; 23 | // int bits[12] = {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40}; 24 | // Returns digit number `bit` of x: shift right by bits[bit], then mask with bitnum - 1 ('-' binds tighter than '&'). 25 | inline int get_bit(int x, int bit) { return (x >> bits[bit]) & bitnum - 1; } 26 | // In-place inclusive prefix sum of data[0..size): sums adjacent pairs, scans the half-size array recursively, then expands. 27 | void rscan(int* data, int size) { 28 | if (size <= 1) return; // also stops on size 0, which previously recursed forever 29 | int twoSum[size / 2]; 30 | #pragma omp parallel for num_threads(thrd_num) 31 | for (int i = 0; i < size / 2; i++) { 32 | twoSum[i] = data[i * 2] + data[2 * i + 1]; 33 | } 34 | rscan(twoSum, size / 2); 35 | #pragma omp parallel for num_threads(thrd_num) 36 | for (int i = 1; i < size; i += 2) { 37 | data[i] = twoSum[i / 2]; 38 | if (i + 1 < size) { 39 | data[i + 1] = twoSum[(i + 1) / 2 - 1] + data[i + 1]; 40 | } 41 | } 42 | } 43 | // Radix sort of data, least-significant digit first, one counting pass per digit. 44 | void bit_sort(vector& data) { 45 | int cut = int(log2(10 * SIZE)) / 8 + 1; 46 | int cnt[bitnum]; 47 | // scratch array that receives the reordered copy of the input 48 | int* bucket = (int*)malloc(sizeof(int) * SIZE); 49 | for (int i = 0; i < cut; i++) { 50 | memset(cnt, 0, sizeof cnt); 51 | // distribute the data over the buckets 52 | // the elements do not have to be physically stored in buckets 53 | // it is enough to record how many elements fall into each bucket 54 | for (int j = 0; j < data.size(); j++) { 55 | // bucket[get_bit(data[j], i)].push_back(data[j]); 56 | cnt[get_bit(data[j], i)]++; 57 | } 58 | // the prefix sum tells where each value has to be placed 59 | rscan(cnt, bitnum); 60 | //
可能存在false sharing,这部分不再并行 61 | // TODO: this scatter is a compaction step; parallelising it is worth a try 62 | for (int j = SIZE - 1; j >= 0; j--) { 63 | int bit = get_bit(data[j], i); 64 | bucket[cnt[bit] - 1] = data[j]; 65 | cnt[bit]--; 66 | } 67 | memcpy(&data[0], bucket, sizeof(int) * SIZE); 68 | } 69 | free(bucket); // release the malloc'd scratch buffer; it was leaked before 70 | } 71 | // add your codes end 72 | 73 | int main() { 74 | vector data(SIZE); 75 | 76 | srand(SIZE); 77 | for (int i = 0; i < SIZE; i++) data[i] = rand() % (SIZE * 10); 78 | 79 | double t = omp_get_wtime(); 80 | // add your codes begin 81 | bit_sort(data); 82 | // add your codes end 83 | t = omp_get_wtime() - t; 84 | printf("time %f %d\n", t, SIZE); 85 | 86 | for (int i = 0; i < SIZE - 1; i++) assert(data[i] <= data[i + 1]); 87 | } 88 | -------------------------------------------------------------------------------- /openmp/assign_sort/build/sort_sample: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/build/sort_sample -------------------------------------------------------------------------------- /openmp/assign_sort/build/tp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | using namespace std; 12 | 13 | // add your codes begin 14 | #define CUTOFF 256 15 | #define SIZE 100000000 16 | static int p[6] = {0, 8, 16, 24, 32, 40}; 17 | 18 | inline int get_digit(int n, int i) { return (n >> p[i]) & (CUTOFF - 1); } 19 | 20 | void SCAN(int* arr, int steps, int start_index) { 21 | if (256 < start_index) { 22 | return; 23 | } else { 24 | #pragma acc data copy(arr [0:CUTOFF]) 25 | { 26 | #pragma acc parallel loop 27 | for (int i = start_index + (steps / 2); i < CUTOFF; i += steps) { 28 | arr[i] += arr[i - steps / 2]; 29 | } 30 | } 31 | SCAN(arr, steps * 2, start_index + steps / 2); 32 | #pragma acc
data copy(arr [0:CUTOFF]) 33 | { 34 | #pragma acc parallel loop 35 | for (int i = start_index + steps; i < CUTOFF; i += steps) { 36 | arr[i] += arr[i - steps / 2]; 37 | } 38 | } 39 | } 40 | } 41 | 42 | void quciksort(int* arr) { 43 | // 开了一个大小为4*SIZE的桶 44 | int* bucket = (int*)malloc(sizeof(int) * SIZE); 45 | // 开一个256大小的数组 46 | int count[CUTOFF]; 47 | // 计算可能用到的最大bit数 48 | int cut = int(log2(10 * SIZE)) / 8 + 1; 49 | // cut 大小应该是5 50 | for (int i = 0; i < cut; i++) { 51 | memset(count, 0, sizeof(int) * CUTOFF); 52 | int sub_count[CUTOFF] = {0}; 53 | // 获得每一位的位数数量 54 | // 准确来说应该是8位 55 | // 统计每一批 的8位位数 56 | for (int j = 0; j < SIZE; j += 1) { 57 | sub_count[get_digit(arr[j], i)]++; 58 | } 59 | for (int k = 0; k < CUTOFF; k++) { 60 | count[k] += sub_count[k]; 61 | } 62 | SCAN(count, 2, 0); 63 | for (int j = SIZE - 1; j >= 0; --j) { 64 | int k = get_digit(arr[j], i); 65 | bucket[count[k] - 1] = arr[j]; 66 | count[k]--; 67 | } 68 | memcpy(arr, bucket, sizeof(int) * SIZE); 69 | } 70 | free(bucket); 71 | } 72 | // add your codes end 73 | 74 | int main() { 75 | vector data(SIZE); 76 | 77 | srand(SIZE); 78 | for (int i = 0; i < SIZE; i++) data[i] = rand() % (SIZE * 10); 79 | 80 | double t = omp_get_wtime(); 81 | // add your codes begin 82 | quciksort(&data[0]); 83 | // add your codes end 84 | t = omp_get_wtime() - t; 85 | printf("time %f %d\n", t, SIZE); 86 | 87 | for (int i = 0; i < SIZE - 1; i++) assert(data[i] <= data[i + 1]); 88 | } -------------------------------------------------------------------------------- /openmp/assign_sort/build/tp2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #define SIZE 1000000 13 | using namespace std; 14 | 15 | 16 | void sampleSort(int *arr, int n, int p) { 17 | // Step 1: Sample data 18 | int s = 10 * p; // Sample size 19 | int *sample = new int[s * p]; 20 | 
srand(time(NULL)); 21 | #pragma omp parallel for 22 | for (int i = 0; i < s * p; i++) { 23 | sample[i] = arr[rand() % n]; 24 | } 25 | std::sort(sample, sample + s * p); 26 | // Step 2: Choose pivots 27 | std::vector pivots(p - 1); 28 | for (int i = 0; i < p - 1; i++) { 29 | pivots[i] = sample[(i + 1) * s]; 30 | } 31 | 32 | // Step 3: Partition data 33 | std::vector counts(p); 34 | std::vector offsets(p); 35 | #pragma omp parallel for 36 | for (int i = 0; i < n; i++) { 37 | int rank = p - 1; 38 | for (int j = 0; j < p - 1; j++) { 39 | if (arr[i] < pivots[j]) { 40 | rank = j; 41 | break; 42 | } 43 | } 44 | #pragma omp atomic 45 | counts[rank]++; 46 | } 47 | offsets[0] = 0; 48 | for (int i = 1; i < p; i++) { 49 | offsets[i] = offsets[i - 1] + counts[i - 1]; 50 | } 51 | double *tmp = new double[n]; 52 | #pragma omp parallel for 53 | for (int i = 0; i < n; i++) { 54 | int rank = p - 1; 55 | for (int j = 0; j < p - 1; j++) { 56 | if (arr[i] < pivots[j]) { 57 | rank = j; 58 | break; 59 | } 60 | } 61 | int idx = offsets[rank]++; 62 | tmp[idx] = arr[i]; 63 | } 64 | 65 | // Step 4: Sort each block 66 | #pragma omp parallel for 67 | for (int i = 0; i < p; i++) { 68 | std::sort(tmp + offsets[i], tmp + offsets[i] + counts[i]); 69 | } 70 | 71 | // Step 5: Merge blocks 72 | #pragma omp parallel for 73 | for (int i = 0; i < p; i++) { 74 | std::copy(tmp + offsets[i], tmp + offsets[i] + counts[i], 75 | arr + i * (n / p)); 76 | } 77 | 78 | // Step 6: Sort merged data 79 | std::sort(arr, arr + n); 80 | 81 | delete[] sample; 82 | delete[] tmp; 83 | } 84 | 85 | int main() { 86 | vector data(SIZE); 87 | 88 | srand(SIZE); 89 | for (int i = 0; i < SIZE; i++) data[i] = rand() % (SIZE * 10); 90 | 91 | double t = omp_get_wtime(); 92 | // add your codes begin 93 | sampleSort(&data[0], SIZE, 10); 94 | // add your codes end 95 | t = omp_get_wtime() - t; 96 | printf("time %f %d\n", t, SIZE); 97 | 98 | for (int i = 0; i < SIZE - 1; i++) assert(data[i] <= data[i + 1]); 99 | } 
-------------------------------------------------------------------------------- /openmp/assign_sort/build/tp3.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | void sampleSort(double *arr, int n, int p, int s) { 10 | // Step 1: Choose samples 11 | double *sample = new double[p * s]; 12 | srand(time(NULL)); 13 | #pragma omp parallel for 14 | for (int i = 0; i < p * s; i++) { 15 | sample[i] = arr[rand() % n]; 16 | } 17 | std::sort(sample, sample + p * s); 18 | 19 | // Step 2: Choose pivots 20 | std::vector pivots(p - 1); 21 | for (int i = 0; i < p - 1; i++) { 22 | pivots[i] = sample[(i + 1) * s]; 23 | } 24 | // Step 3: Partition data 25 | std::vector counts(p); 26 | std::vector offsets(p); 27 | #pragma omp parallel for 28 | for (int i = 0; i < n; i++) { 29 | int rank = p - 1; 30 | for (int j = 0; j < p - 1; j++) { 31 | if (arr[i] < pivots[j]) { 32 | rank = j; 33 | break; 34 | } 35 | } 36 | #pragma omp atomic 37 | counts[rank]++; 38 | } 39 | offsets[0] = 0; 40 | for (int i = 1; i < p; i++) { 41 | offsets[i] = offsets[i - 1] + counts[i - 1]; 42 | } 43 | double *tmp = new double[n]; 44 | #pragma omp parallel for 45 | for (int i = 0; i < n; i++) { 46 | int rank = p - 1; 47 | for (int j = 0; j < p - 1; j++) { 48 | if (arr[i] < pivots[j]) { 49 | rank = j; 50 | break; 51 | } 52 | } 53 | int idx = offsets[rank]++; 54 | tmp[idx] = arr[i]; 55 | } 56 | 57 | // Step 4: Sort each block 58 | #pragma omp parallel for 59 | for (int i = 0; i < p; i++) { 60 | std::sort(tmp + offsets[i], tmp + offsets[i] + counts[i]); 61 | } 62 | 63 | // Step 5: Merge blocks 64 | #pragma omp parallel for 65 | for (int i = 0; i < p; i++) { 66 | std::copy(tmp + offsets[i], tmp + offsets[i] + counts[i], 67 | arr + i * (n / p)); 68 | } 69 | 70 | // Step 6: Sort merged data 71 | std::sort(arr, arr + n); 72 | 73 | delete[] sample; 74 | delete[] tmp; 75 | } 76 | 77 | int main() { 78 | 
int n = 100000; // Array size 79 | double *arr = new double[n]; 80 | srand(time(NULL)); 81 | for (int i = 0; i < n; i++) { 82 | arr[i] = rand() % 1000; 83 | } 84 | int p = 4; // Parallelism 85 | double *sorted_arr = new double[n]; 86 | double start = omp_get_wtime(); 87 | sampleSort(arr, n, p, n / (4 * p)); 88 | double end = omp_get_wtime(); 89 | std::cout << "Time: " << end - start << std::endl; 90 | delete[] arr; 91 | delete[] sorted_arr; 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /openmp/assign_sort/readme.txt: -------------------------------------------------------------------------------- 1 | - Student name: 宋家庆 2 | - ID number: 202000130061 3 | 4 | - Implement your sort algorithm 5 | - Version 1: radix sort program 6 | - Version 2: sample sort program 7 | - Compile and run your programs multiple times 8 | - Describe your observations 9 | sort_radix: 10 | 多次运行该算法,设置了不同的参数尝试,以每一位bit为一组从低到高排序,耗时约为50s 11 | 以每四位bit为一组从低到高排序,耗时约为15s 12 | 以每八位bit为一组从低到高排序,耗时约为8-10s 13 | sort_sample: 14 | 可以调节的参数有采样的数量和线程数量,在尝试过程中找到了几个效率较高的参数组合 15 | 线程数为30,采样数为30*35或30*500时耗时在6.4-7s之间 16 | 线程数为50,采样数为50*500时耗时在4.6-5s之间 17 | 18 | - Explain why this happens 19 | 针对sort_radix: 20 | 每一组中bit位数越多,最外面循环的次数就越少,且桶的数量会越多,可以使用更多的线程并行,效率越高。 21 | 针对sort_sample: 22 | 一般来说线程数量越多,并行度越高,效率会越高,到达某一临界值时,提升线程数量对效率的提升会变小甚至无提升, 23 | 要协调好线程开销与并发效率的关系。 24 | 该方法里有一个可调节的参数采样数量,该值越大,越能逼近均匀分割,但同时会提高该部分排序耗时。 25 | 可以多次尝试找到一个合适取值,运气好的话,较小的采样量也能达到不错的效果。 26 | -------------------------------------------------------------------------------- /openmp/assign_sort/run.sh: -------------------------------------------------------------------------------- 1 | version=radix 2 | # size=100000000 3 | size=100000000 4 | 5 | g++ -o sort_$version -fopenmp -DSIZE=$size sort_$version.cpp && timeout 60s time ./sort_$version 6 | 7 | -------------------------------------------------------------------------------- /openmp/assign_sort/sort_radix: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/sort_radix -------------------------------------------------------------------------------- /openmp/assign_sort/sort_radix.cpp: -------------------------------------------------------------------------------- 1 | // student name: 宋家庆 2 | // id number: 202000130061 3 | 4 | #define CUTOFF 1024 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | using namespace std; 16 | 17 | // add your codes begin 18 | #define maxbit 256 19 | int thrd_num = 10; 20 | int bitnum = 256; 21 | // #define SIZE 1000 22 | int bits[6] = {0, 8, 16, 24, 32, 40}; 23 | // int bits[12] = {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40}; 24 | 25 | inline int get_bit(int x, int bit) { return (x >> bits[bit]) & bitnum - 1; } 26 | 27 | void rscan(int* data, int size) { 28 | if (size == 1) return; 29 | int twoSum[size / 2]; 30 | #pragma omp parallel for num_threads(thrd_num) 31 | for (int i = 0; i < size / 2; i++) { 32 | twoSum[i] = data[i * 2] + data[2 * i + 1]; 33 | } 34 | rscan(twoSum, size / 2); 35 | #pragma omp parallel for num_threads(thrd_num) 36 | for (int i = 1; i < size; i += 2) { 37 | data[i] = twoSum[i / 2]; 38 | if (i + 1 < size) { 39 | data[i + 1] = twoSum[(i + 1) / 2 - 1] + data[i + 1]; 40 | } 41 | } 42 | } 43 | 44 | void bit_sort(vector& data) { 45 | int cut = int(log2(10 * SIZE)) / 8 + 1; 46 | int cnt[bitnum]; 47 | // vector cnt(bitnum); 48 | // 开一个数组用于备份原数组 49 | int* bucket = (int*)malloc(sizeof(int) * SIZE); 50 | for (int i = 0; i < cut; i++) { 51 | memset(cnt, 0, sizeof cnt); 52 | // std::fill(cnt.begin(), cnt.end(), 0); 53 | // 向每个桶里放数据 54 | // 不需要真的把数据放进桶里 55 | // 记录一下每个桶里数据的数量 56 | for (int j = 0; j < data.size(); j++) { 57 | // bucket[get_bit(data[j], i)].push_back(data[j]); 58 | cnt[get_bit(data[j], i)]++; 59 | 
} 60 | // Prefix-sum the histogram: afterwards cnt[b] is one past the last output slot of bucket b. 61 | rscan(&cnt[0], bitnum); 62 | // cnt.insert(cnt.begin(), 0); 63 | // The scatter below stays serial: a parallel version may suffer from false sharing. 64 | // TODO: a parallel compact could be tried here. 65 | // It did not look practical because of the number of threads involved. 66 | // A parallel compact also has to copy arrays, which made it slower, so it was dropped. 67 | // #pragma omp parallel num_threads(256) 68 | // { 69 | // int id, nthrds; 70 | // id = omp_get_thread_num(); 71 | // // cout << id << endl; 72 | // nthrds = omp_get_num_threads(); 73 | // // each thread handles only its own share of the loop 74 | // vector tp; 75 | // for (int j = 0; j < SIZE; j++) { 76 | // if (get_bit(data[j], i) == id) { 77 | // tp.push_back(data[j]); 78 | // } 79 | // } 80 | // #pragma omp critical 81 | // { memcpy(bucket, &tp[0], 4 * tp.size()); } 82 | // } 83 | 84 | for (int j = SIZE - 1; j >= 0; j--) { // walk back to front and fill each bucket from its end, keeping equal digits in order 85 | int bit = get_bit(data[j], i); 86 | bucket[cnt[bit] - 1] = data[j]; 87 | cnt[bit]--; 88 | } 89 | memcpy(&data[0], bucket, sizeof(int) * SIZE); 90 | } 91 | free(bucket); // release the scratch buffer malloc'ed at the top of bit_sort; it was never freed before 92 | } 93 | // add your codes end 94 | 95 | int main() { 96 | vector data(SIZE); 97 | 98 | srand(SIZE); 99 | for (int i = 0; i < SIZE; i++) data[i] = rand() % (SIZE * 10); 100 | 101 | double t = omp_get_wtime(); 102 | // add your codes begin 103 | bit_sort(data); 104 | // add your codes end 105 | t = omp_get_wtime() - t; 106 | printf("time %f %d\n", t, SIZE); 107 | 108 | for (int i = 0; i < SIZE - 1; i++) assert(data[i] <= data[i + 1]); 109 | } 110 | -------------------------------------------------------------------------------- /openmp/assign_sort/sort_radix_solution: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/sort_radix_solution -------------------------------------------------------------------------------- /openmp/assign_sort/sort_sample: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/sort_sample -------------------------------------------------------------------------------- /openmp/assign_sort/sort_sample_solution: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/sort_sample_solution -------------------------------------------------------------------------------- /openmp/exam_knn/ans.txt: -------------------------------------------------------------------------------- 1 | time 5.126451 20000 2 | checksum 8.77751328e+04 8.78574766e+04 8.77312266e+04 8.78405156e+04 8.77315234e+04 8.76717109e+04 8.78353750e+04 8.77646250e+04 8.78045000e+04 8.78303125e+04 8.76919609e+04 3 | -------------------------------------------------------------------------------- /openmp/exam_knn/build/build/knn copy 2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/build/build/knn copy 2 -------------------------------------------------------------------------------- /openmp/exam_knn/build/build/knn copy 4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/build/build/knn copy 4 -------------------------------------------------------------------------------- /openmp/exam_knn/build/knn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/build/knn -------------------------------------------------------------------------------- 
/openmp/exam_knn/build/test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/build/test -------------------------------------------------------------------------------- /openmp/exam_knn/exam_knn.2023-05-10.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/exam_knn.2023-05-10.tgz -------------------------------------------------------------------------------- /openmp/exam_knn/knn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/knn -------------------------------------------------------------------------------- /openmp/exam_knn/knn_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/knn_0 -------------------------------------------------------------------------------- /openmp/exam_knn/out.txt: -------------------------------------------------------------------------------- 1 | time 1.518604 20000 2 | checksum 8.77751328e+04 8.78574766e+04 8.77312266e+04 8.78405156e+04 8.77315234e+04 8.76717109e+04 8.78353750e+04 8.77646250e+04 8.78045000e+04 8.78303125e+04 8.76919609e+04 3 | -------------------------------------------------------------------------------- /openmp/exam_knn/readme.txt: -------------------------------------------------------------------------------- 1 | - Student name: 宋家庆 2 | - ID number: 202000130061 3 | 4 | - Design a parallel algorithm to find the K-nearest neighbors (e.g., K=10) of each element given the coordinates of N points (e.g., 
N=20000) in D-dimensional space (e.g., D=64). 5 | - Implement the algorithm with OpenMP and submit (only) your program. Please see the provided source codes as a starting point. 6 | - Your mark depends on both the correctness and the running time of your program. 7 | 8 | -------------------------------------------------------------------------------- /openmp/exam_knn/run.sh: -------------------------------------------------------------------------------- 1 | size=20000 2 | 3 | g++ -o knn_0 -fopenmp -DSIZE=$size knn.cpp && timeout 60s time ./knn_0 4 | 5 | -------------------------------------------------------------------------------- /openmp/exam_knn/test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define _mBeginASM __asm__ __volatile__ ( 4 | #define _mEndASM ); 5 | 6 | int main(int argc, char *argv[]) { 7 | int a = 44, b = 33, c; 8 | 9 | _mBeginASM "addl %%ebx,%%eax" 10 | : "=a"(c) /* 说明了调用'函数体'之后,应该把eax中的值赋值该变量c */ 11 | : "b"(b), 12 | "a"(a) /* 表明了在调用'函数体'之前,应该把变量a复制到eax中,b复制到ebx中 13 | */ 14 | _mEndASM 15 | 16 | printf("%d\n", c); 17 | return 0; 18 | } -------------------------------------------------------------------------------- /openmp/final_exam/build/circle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/final_exam/build/circle -------------------------------------------------------------------------------- /openmp/final_exam/circle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/final_exam/circle -------------------------------------------------------------------------------- /openmp/final_exam/run.sh: -------------------------------------------------------------------------------- 1 | size=1000 2 | 3 | 
g++ -o circle -fopenmp -DSIZE=$size circle.cpp && timeout 60s time ./circle 4 | 5 | -------------------------------------------------------------------------------- /openmp/lab_knn/build/build/knn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/build/build/knn -------------------------------------------------------------------------------- /openmp/lab_knn/build/knn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/build/knn -------------------------------------------------------------------------------- /openmp/lab_knn/build/knn.cpp: -------------------------------------------------------------------------------- 1 | #define DIM 64 2 | #define KNN 10 3 | #define NBITS 6 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | using namespace std; 16 | 17 | // add your codes begin 18 | #define SIZE 20000 19 | #define SEED 1 20 | float dis(float *vec1, float *vec2) { 21 | float res = 0; 22 | for (int i = 0; i < DIM; i++) { 23 | float re = vec1[i] - vec2[i]; 24 | res += re * re; 25 | } 26 | // return sqrt(res); // 27 | return res; // 28 | } 29 | 30 | bool cmp(pair p1, pair p2) { 31 | return p1.second > p2.second; 32 | } 33 | // add your codes end 34 | 35 | int main() { 36 | srand(SEED); 37 | vector> coord(SIZE); 38 | vector> knn(SIZE); 39 | for (int i = 0; i < SIZE; i++) { 40 | vector c(DIM); 41 | for (int j = 0; j < DIM; j++) 42 | c[j] = float(rand()) / float(RAND_MAX) * 2 - 1; 43 | coord[i] = c; 44 | } 45 | 46 | srand(SEED); 47 | // 初始化若干个超平面 48 | vector> rnd(NBITS); 49 | for (int i = 0; i < NBITS; i++) { 50 | vector r(DIM); 51 | for (int j = 0; j < DIM; j++) 52 | r[j] = float(rand()) 
/ float(RAND_MAX) * 2 - 1; 53 | rnd[i] = r; 54 | } 55 | 56 | double t = omp_get_wtime(); 57 | // add your codes begin 58 | // Take the dot product of every point with each of the NBITS random vectors in rnd; 59 | // the signs turn each DIM-dimensional point into an NBITS-character '0'/'1' key. 60 | vector sbit(SIZE); 61 | map> buckets; 62 | #pragma omp parallel for 63 | for (int i = 0; i < coord.size(); i++) { 64 | string key = ""; 65 | for (int k = 0; k < NBITS; k++) { 66 | float res = 0; 67 | for (int j = 0; j < DIM; j++) { 68 | res += (coord[i][j] * rnd[k][j]); 69 | } 70 | if (res > 0) 71 | key += "1"; 72 | else 73 | key += "0"; 74 | } 75 | sbit[i] = key; 76 | } 77 | 78 | // #pragma omp parallel for 79 | for (int i = 0; i < coord.size(); i++) { 80 | buckets[sbit[i]].push_back(i); 81 | } 82 | 83 | #pragma omp parallel for 84 | for (int i = 0; i < SIZE; i++) { 85 | string key = sbit[i]; 86 | // key is the hash of point i; its candidates are the points stored under the same key. 87 | vector bucket = buckets.at(key); // at() never inserts, so concurrent lookups leave the shared map untouched 88 | 89 | vector> stp; 90 | 91 | for (int tenv = 0; tenv < (1 << NBITS); tenv++) { 92 | int val = 0; // number of bit positions where tenv differs from key 93 | string tk = ""; 94 | for (int b = 0; b < NBITS; b++) { 95 | val += (((tenv >> b) & 1) != (key[b] - '0')); 96 | tk += (((tenv >> b) & 1) + '0'); 97 | } 98 | stp.push_back({val, tk}); 99 | } 100 | sort(stp.begin(), stp.end()); 101 | for (int i = 0; i < 0; i++) { // bound is 0, so no neighbouring bucket is merged in 102 | bucket.insert(bucket.end(), 103 | buckets[stp[i + 1].second].begin(), 104 | buckets[stp[i + 1].second].end()); 105 | } 106 | int num = bucket.size(); 107 | int top = min(KNN + 1, num); // a bucket may hold fewer than KNN + 1 points; never read past it 108 | float *dist = new float[num]; 109 | for (int k = 0; k < num; k++) { 110 | dist[k] = dis(&coord[i][0], &coord[bucket[k]][0]); 111 | } 112 | partial_sort(dist, dist + top, dist + num); 113 | for (int j = 0; j < top; j++) { 114 | knn[i].push_back(sqrt(dist[j])); 115 | } 116 | delete[] dist; // freed every iteration; the buffer used to leak once per point 117 | } 118 | 119 | // add your codes end 120 | t = omp_get_wtime() - t; 121 | printf("time %f %d\n", t, SIZE); 122 | 123 | const int size = 11; 124 | float chksum[size]; 125 | for (int i = 0; i < size; i++) chksum[i] = 0.0; 126 | for (int i = 0; i < SIZE; i++) { 127 | for (int j = 0; j < knn[i].size(); j++) { 128 | chksum[i % size] 
+= knn[i][j]; 129 | } 130 | } 131 | printf("checksum"); 132 | for (int i = 0; i < size; i++) printf(" %.8e", chksum[i]); 133 | printf("\n"); 134 | } 135 | -------------------------------------------------------------------------------- /openmp/lab_knn/knn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/knn -------------------------------------------------------------------------------- /openmp/lab_knn/knn_apx1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/knn_apx1 -------------------------------------------------------------------------------- /openmp/lab_knn/knn_apx2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/knn_apx2 -------------------------------------------------------------------------------- /openmp/lab_knn/knn_apx3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/knn_apx3 -------------------------------------------------------------------------------- /openmp/lab_knn/knn_apx4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/knn_apx4 -------------------------------------------------------------------------------- /openmp/lab_knn/lab_knn.2023-05-17.tgz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/lab_knn.2023-05-17.tgz -------------------------------------------------------------------------------- /openmp/lab_knn/readme.txt: -------------------------------------------------------------------------------- 1 | - Student name: 宋家庆 2 | - ID number: 202000130061 3 | 4 | - Design a parallel algorithm to find the **approximate** K-nearest neighbors (e.g., K=10) of each element given the coordinates of N points (e.g., N=20000) in D-dimensional space (e.g., D=64). 5 | - Implement the algorithm with OpenMP and submit (only) your program. Please see the provided source codes as a starting point. 6 | - Reference: https://www.pinecone.io/learn/locality-sensitive-hashing-random-projection/ 7 | 8 | -------------------------------------------------------------------------------- /openmp/lab_knn/run.sh: -------------------------------------------------------------------------------- 1 | size=20000 2 | seed=1 3 | 4 | g++ -o knn -fopenmp -DSIZE=$size -DSEED=$seed knn.cpp && timeout 60s time ./knn 5 | 6 | -------------------------------------------------------------------------------- /openmp/lab_par_for/DataSharing/Firstprivate/fp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/DataSharing/Firstprivate/fp -------------------------------------------------------------------------------- /openmp/lab_par_for/DataSharing/Firstprivate/fp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #define NUM_THREADS 2 4 | static long num_steps = 100000; 5 | double step; 6 | const int MAX = 10; 7 | int A[MAX]; 8 | 9 | int main() { 10 | int incr = 0; // firstprivate below gives every thread its own copy, initialised to this value 11 | #pragma omp parallel for firstprivate(incr) 12 | for (int i = 0; i < MAX; i++) { // A has MAX elements; the old bound i <= MAX wrote A[MAX], one past the end 13 | if ((i % 
2) == 0) incr++; 14 | A[i] = incr; 15 | } 16 | for (int i = 0; i < MAX; i++) { 17 | printf("%d", A[i]); 18 | } 19 | 20 | return 0; 21 | } -------------------------------------------------------------------------------- /openmp/lab_par_for/DataSharing/Firstprivate/makefile: -------------------------------------------------------------------------------- 1 | hello: fp.cpp 2 | g++ -fopenmp fp.cpp -o fp 3 | clean: 4 | rm -f fp 5 | -------------------------------------------------------------------------------- /openmp/lab_par_for/DataSharing/Lastprivate/fp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/DataSharing/Lastprivate/fp -------------------------------------------------------------------------------- /openmp/lab_par_for/DataSharing/Lastprivate/fp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #define NUM_THREADS 2 4 | static long num_steps = 100000; 5 | double step; 6 | const int MAX = 10; 7 | int A[MAX]; 8 | 9 | int main() { 10 | int incr = 0; // NOTE(review): this file sits under Lastprivate/ but the loop uses firstprivate - confirm intent 11 | #pragma omp parallel for firstprivate(incr) 12 | for (int i = 0; i < MAX; i++) { // A has MAX elements; the old bound i <= MAX wrote A[MAX], one past the end 13 | if ((i % 2) == 0) incr++; // counts the even indices this thread has seen so far 14 | A[i] = incr; 15 | } 16 | for (int i = 0; i < MAX; i++) { 17 | printf("%d", A[i]); 18 | } 19 | 20 | return 0; 21 | } -------------------------------------------------------------------------------- /openmp/lab_par_for/DataSharing/Lastprivate/makefile: -------------------------------------------------------------------------------- 1 | hello: fp.cpp 2 | g++ -fopenmp fp.cpp -o fp 3 | clean: 4 | rm -f fp 5 | -------------------------------------------------------------------------------- /openmp/lab_par_for/Intro05/hello: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/Intro05/hello -------------------------------------------------------------------------------- /openmp/lab_par_for/Intro05/hello.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | int main(){ 4 | double A[1000]; 5 | omp_set_num_threads(4); 6 | #pragma omp parallel 7 | { 8 | int ID = omp_get_thread_num(); 9 | pooh(ID,A); 10 | } 11 | printf("all down\n"); 12 | return 0; 13 | } -------------------------------------------------------------------------------- /openmp/lab_par_for/Intro05/makefile: -------------------------------------------------------------------------------- 1 | hello: hello.cpp 2 | g++ -fopenmp hello.cpp -o hello 3 | clean: 4 | rm -f hello 5 | -------------------------------------------------------------------------------- /openmp/lab_par_for/PI/makefile: -------------------------------------------------------------------------------- 1 | hello: pi.cpp 2 | g++ -fopenmp pi.cpp -o pi 3 | clean: 4 | rm -f pi 5 | -------------------------------------------------------------------------------- /openmp/lab_par_for/PI/pi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PI/pi -------------------------------------------------------------------------------- /openmp/lab_par_for/PI/pi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #define NUM_THREADS 2 4 | static long num_steps = 100000; 5 | double step; 6 | 7 | int main() { 8 | int i; 9 | double x, pi, sum = 0.0; 10 | step = 1.0 / (double)num_steps; 11 | for (i = 0; i < num_steps; i++) { 12 | x = (i + 0.5) * step; 13 | sum = sum + 4.0 / (1.0 + x * x); 14 | } 15 | pi = step * sum; 16 | printf("%lf", 
pi); 17 | return 0; 18 | } -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv1/makefile: -------------------------------------------------------------------------------- 1 | hello: pi.cpp 2 | g++ -fopenmp pi.cpp -o pi 3 | clean: 4 | rm -f pi 5 | -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv1/pi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv1/pi -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv1/pi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #define NUM_THREADS 2 4 | static long num_steps = 100000; 5 | double step; 6 | 7 | int main(){ 8 | int i,nthreads; 9 | double pi,sum[NUM_THREADS]; 10 | step = 1.0/(double)num_steps; 11 | omp_set_num_threads(NUM_THREADS); 12 | #pragma omp parallel 13 | { 14 | int i,id,nthrds; 15 | double x; 16 | id = omp_get_thread_num(); 17 | nthrds = omp_get_num_threads(); 18 | if(id==0) nthreads = nthrds; 19 | for (i = id,sum[id]=0.0; i < num_steps; i+=nthrds) { 20 | x = (i+0.5)*step; 21 | sum[id] += 4.0/(1.0+x*x); 22 | } 23 | /*for (i = 0,pi=0.0; i < num_steps; i+=nthrds) { 24 | x = (i+0.5)*step; 25 | sum[id] += 4.0 / (1.0+x*x); 26 | }*/ 27 | } 28 | for (i = 0,pi=0.0; i < nthreads; i++) 29 | { 30 | pi += sum[i]*step; 31 | } 32 | printf("%lf",pi); 33 | return 0; 34 | } -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv2/makefile: -------------------------------------------------------------------------------- 1 | hello: pi.cpp 2 | g++ -fopenmp pi.cpp -o pi 3 | clean: 4 | rm -f pi 5 | -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv2/pi: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv2/pi -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv2/pi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #define NUM_THREADS 2 4 | static long num_steps = 1000000; 5 | double step; 6 | 7 | int main() { 8 | int nthreads; 9 | double pi = 0.0; 10 | step = 1.0/(double)num_steps; 11 | omp_set_num_threads(NUM_THREADS); 12 | #pragma omp parallel 13 | { 14 | int i,id,nthrds; 15 | double x,sum; 16 | // This thread's id and the size of the team it runs in. 17 | id = omp_get_thread_num(); 18 | nthrds = omp_get_num_threads(); 19 | if(id==0) nthreads = nthrds; 20 | // Cyclic split of the steps: stride by the thread-local nthrds. The shared nthreads is written only by thread 0 with no barrier, so other threads could read it before it is set. 21 | for (i = id,sum = 0.0; i < num_steps; i+=nthrds) { 22 | x = (i+0.5)*step; 23 | sum += 4.0/(1.0+x*x); 24 | } 25 | // The critical section lets only one thread at a time add its partial sum to pi. 26 | // Compared with the previous version this needs no per-thread storage for the sums, 27 | // at the cost of a lock, which can hurt performance. 28 | #pragma omp critical 29 | { 30 | pi += sum*step; 31 | } 32 | 33 | } 34 | printf("%lf",pi); 35 | return 0; 36 | } -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv3/makefile: -------------------------------------------------------------------------------- 1 | hello: pi.cpp 2 | g++ -fopenmp pi.cpp -o pi 3 | clean: 4 | rm -f pi 5 | -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv3/pi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv3/pi -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv3/pi.cpp: -------------------------------------------------------------------------------- 1 | #include 
2 | #include 3 | #define NUM_THREADS 2 4 | static long num_steps = 100000; 5 | double step; 6 | 7 | int main() { 8 | int nthreads; 9 | double pi = 0.0; 10 | step = 1.0/(double)num_steps; 11 | omp_set_num_threads(NUM_THREADS); 12 | #pragma omp parallel 13 | { 14 | int i,id,nthrds; 15 | double x,sum; 16 | id = omp_get_thread_num(); 17 | nthrds = omp_get_num_threads(); 18 | if(id==0) nthreads = nthrds; 19 | for (i = id,sum = 0.0; i < num_steps; i+=nthrds) { // stride by the thread-local nthrds; the shared nthreads is set by thread 0 without a barrier 20 | x = (i+0.5)*step; 21 | // Taking the critical section inside the loop, once per term, increases lock contention. 22 | #pragma omp critical 23 | pi += 4.0/(1.0+x*x); 24 | } 25 | } 26 | pi*=step; 27 | printf("%lf",pi); 28 | return 0; 29 | } -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv4/build/pi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv4/build/pi -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv4/makefile: -------------------------------------------------------------------------------- 1 | hello: pi.cpp 2 | g++ -fopenmp pi.cpp -o pi 3 | clean: 4 | rm -f pi 5 | -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv4/pi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv4/pi -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv4/pi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #define NUM_THREADS 4 5 | static long num_steps = 10000000; 6 | double step; 7 | 8 | int main() { 9 | int nthreads; 10 | double pi = 0.0; 11 | step = 1.0/(double)num_steps; 12 | 13 | 
double t = omp_get_wtime(); 14 | 15 | omp_set_num_threads(NUM_THREADS); 16 | #pragma omp parallel 17 | { 18 | int i,id,nthrds; 19 | double x,sum; 20 | id = omp_get_thread_num(); 21 | nthrds = omp_get_num_threads(); 22 | if(id==0) nthreads = nthrds; 23 | for (i = id,sum = 0.0; i < num_steps; i+=nthreads) { 24 | x = (i+0.5)*step; 25 | sum += 4.0/(1.0+x*x); 26 | } 27 | sum *= step; 28 | // 这里使用了一个critical,确保每次都只会有一个线程访问该段代码 29 | // 与上一个版本相比,好处在于不需要开辟空间存储了 30 | // 但缺点是引入了锁,会导致性能的下降 31 | #pragma atomic 32 | { 33 | pi += sum; 34 | } 35 | } 36 | //printf("%lf",pi); 37 | 38 | t = omp_get_wtime() - t; 39 | printf("time %f %d\n", t, NUM_THREADS); 40 | 41 | printf("pi %.12f %.12f\n", pi, pi - M_PI); 42 | return 0; 43 | } -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv5/makefile: -------------------------------------------------------------------------------- 1 | hello: pi.cpp 2 | g++ -fopenmp pi.cpp -o pi 3 | clean: 4 | rm -f pi 5 | -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv5/pi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv5/pi -------------------------------------------------------------------------------- /openmp/lab_par_for/PIv5/pi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #define NUM_THREADS 2 4 | static long num_steps = 100000; 5 | double step; 6 | 7 | int main() { 8 | int i; 9 | double pi, sum = 0.0; 10 | step = 1.0 / (double)num_steps; 11 | #pragma omp parallel 12 | { 13 | double x; 14 | #pragma omp for reduction(+ : sum) 15 | for (i = 0; i < num_steps; i++) { 16 | x = (i + 0.5) * step; 17 | sum = sum + 4.0 / (1.0 + x * x); 18 | } 19 | } 20 | 21 | pi = step * sum; 22 | printf("%lf", pi); 23 | return 0; 24 | } 
-------------------------------------------------------------------------------- /openmp/lab_par_for/hello/hello: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/hello/hello -------------------------------------------------------------------------------- /openmp/lab_par_for/hello/hello.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | int main(){ 4 | #pragma omp parallel 5 | { 6 | int ID = omp_get_thread_num(); 7 | printf("Hello(%d)",ID); 8 | printf(" world(%d)",ID); 9 | } 10 | return 0; 11 | } -------------------------------------------------------------------------------- /openmp/lab_par_for/hello/makefile: -------------------------------------------------------------------------------- 1 | hello: hello.cpp 2 | g++ -fopenmp hello.cpp -o hello 3 | clean: 4 | rm -f hello 5 | -------------------------------------------------------------------------------- /openmp/lab_par_for/lab_par_for.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/lab_par_for.tgz -------------------------------------------------------------------------------- /openmp/lab_par_for/par_for/makefile: -------------------------------------------------------------------------------- 1 | hello: par_for.cpp 2 | g++ -fopenmp par_for.cpp -o par_for 3 | clean: 4 | rm -f par_for 5 | -------------------------------------------------------------------------------- /openmp/lab_par_for/par_for/par_for: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/par_for/par_for 
--------------------------------------------------------------------------------
/openmp/lab_par_for/par_for/par_for.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | // NOTE(review): the header names of the six #include lines above are missing from this listing. The code below calls printf and omp_get_thread_num, so presumably <cstdio> and <omp.h> are among them; confirm against the repository.
8 | #define SIZE 12
9 | using namespace std;
10 | // Records, for each of the SIZE loop iterations, the id of the thread that executed it,
11 | // then prints the recorded ids in index order followed by SIZE.
12 | int main() {
13 |     int test[SIZE];  // test[i] receives the id of the thread that ran iteration i
14 |     #pragma omp parallel for schedule(dynamic, 4)  // dynamic schedule with a chunk size of 4 iterations
15 |     for (int i = 0; i < SIZE; i++) {
16 |         test[i] = omp_get_thread_num();
17 |     }
18 |     for (int i = 0; i < SIZE; i++) {  // sequential: print in index order
19 |         printf(" %d", test[i]);
20 |     }
21 |     printf(" %d\n", SIZE);
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/openmp/lab_par_for/readme.txt:
--------------------------------------------------------------------------------
1 | - Watch lectures: https://icloud.qd.sdu.edu.cn:7777/link/2D2A742C095E0CFD13FEB87F405E2FEB
2 | - Expiration date: 2023-06-18
3 | - Implement your "Hello, World!" program "hello.cpp" of Unit 1
4 | 代码见hello文件夹
5 | - Compile and run program "par_for.cpp" multiple times
6 | 
7 | - Describe your observations
8 | 将SIZE设置为12,多次运行结果为
9 | 58 58 58 58 61 61 61 61 57 57 57 57 12
10 | 43 43 43 43 46 46 46 46 44 44 44 44 12
11 | 24 24 24 24 53 53 53 53 39 39 39 39 12
12 | 62 62 62 62 48 48 48 48 20 20 20 20 12
13 | #pragma omp parallel for schedule(dynamic, k)
14 | 每次输出时都会有多组数字,组数为(SIZE/k)或(SIZE/k + 1)每组数字中有k个相同的数字,k为我们指定的第二个参数
15 | 
16 | - Explain why this happens
17 | 
18 | openmp底层会为每个线程分配k次计算,
19 | 这里选用的调度方式为dynamic,运行时为随机选择可用的线程,每次运行的结果几乎都是不一样的,
20 | 一般无法做到预测使用哪些线程。
21 | 若调度方式为static,则每次运行结果都是固定的。
22 | 
23 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/build/build/tp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_integral/build/build/tp

-------------------------------------------------------------------------------- /openmp/lab_pi_integral/build/pi_integral_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_integral/build/pi_integral_0 -------------------------------------------------------------------------------- /openmp/lab_pi_integral/build/pi_integral_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_integral/build/pi_integral_1 -------------------------------------------------------------------------------- /openmp/lab_pi_integral/build/pi_integral_1_1.cpp: -------------------------------------------------------------------------------- 1 | // student name: 2 | // id number: 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // add your codes begin 10 | #define SIZE 10 11 | static long num_step = 10000000; 12 | // add your codes end 13 | 14 | int main() { 15 | double pi; 16 | 17 | double t = omp_get_wtime(); 18 | // add your codes begin 19 | int i; 20 | double sum = 0.0; 21 | double step = 1.0 / (double)num_step; 22 | omp_set_num_threads(SIZE); 23 | #pragma omp parallel 24 | { 25 | double x; 26 | #pragma omp for 27 | for (i = 0; i < num_step; i++) { 28 | #pragma omp critical(cr) 29 | { 30 | x = (i + 0.5) * step; 31 | sum = sum + 4.0 / (1.0 + x * x); 32 | } 33 | } 34 | } 35 | pi = step * sum; 36 | // add your codes end 37 | t = omp_get_wtime() - t; 38 | printf("time %f %d\n", t, SIZE); 39 | 40 | printf("pi %.12f %.12f\n", pi, pi - M_PI); 41 | } 42 | -------------------------------------------------------------------------------- /openmp/lab_pi_integral/build/pi_integral_2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_integral/build/pi_integral_2 -------------------------------------------------------------------------------- /openmp/lab_pi_integral/build/tp.cpp: -------------------------------------------------------------------------------- 1 | // student name: Junhao Xu 2 | // id number: 201900122025 3 | 4 | /*#include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // add your codes begin 10 | #define SIZE 10000000 11 | static long num_steps = SIZE; 12 | double step; 13 | // add your codes end 14 | 15 | int main() 16 | { 17 | double pi=0.0; 18 | 19 | double t = omp_get_wtime(); 20 | // add your codes begin 21 | step = 1.0 / (double)num_steps; 22 | omp_set_num_threads(100); 23 | 24 | 25 | #pragma omp parallel 26 | { 27 | int i; 28 | double x, sum; 29 | int id = omp_get_thread_num(); 30 | int nthrds = omp_get_num_threads(); 31 | // double temp1 = 0.5*step; 32 | // double temp2 = nthrds*step; 33 | // double end = num_steps*step; 34 | x = (id) * step; 35 | double margin = nthrds*step; 36 | for (i = id + nthrds,sum= 0.0; i < num_steps; i+=nthrds) 37 | { 38 | x += margin; 39 | // x = (i + 0.5) * step; 40 | sum += 4.0 / (1.0 + x * x); 41 | } 42 | // sum = sum*step; 43 | #pragma omp critical 44 | pi+=sum*step; 45 | 46 | // #pragma atomic 47 | // pi+=sum; 48 | } 49 | // pi+=0.000100000001; 50 | // add your codes end 51 | t = omp_get_wtime() - t; 52 | printf("time %f %d\n", t, SIZE); 53 | 54 | printf("pi %.12f %.12f\n", pi, pi - M_PI); 55 | }*/ 56 | 57 | #include 58 | static long num_steps = 100000; 59 | double step; 60 | #define NUM_THREADS 2 61 | void main() { 62 | double pi; 63 | step = 1.0 / (double)num_steps; 64 | omp_set_num_threads(NUM_THREADS); 65 | int nthreads = 0; 66 | #pragma omp parallel 67 | { 68 | int i, id, nthrds; 69 | double x, sum; 70 | id = omp_get_thread_num(); 71 | nthrds = omp_get_num_threads(); 72 | if (id == 0) 73 | nthreads = 
nthrds; 74 | id = omp_get_thread_num(); 75 | nthrds = omp_get_num_threads(); 76 | for (i = id, sum = 0.0; i < num_steps; i = i + nthreads) { 77 | x = (i + 0.5) * step; 78 | sum += 4.0 / (1.0 + x * x); 79 | } 80 | sum = sum * step; 81 | #pragma atomic 82 | -------------------------------------------------------------------------------- /openmp/lab_pi_integral/lab_pi_integral.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_integral/lab_pi_integral.tgz -------------------------------------------------------------------------------- /openmp/lab_pi_integral/pi_integral_0.cpp: -------------------------------------------------------------------------------- 1 | // student 宋家庆: 2 | // id 202000130061: 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | 10 | // add your codes begin 11 | // #define SIZE 10000000 12 | static long num_steps = SIZE; 13 | double step; 14 | // add your codes end 15 | 16 | int main() { 17 | double t = omp_get_wtime(); 18 | // add your codes begin 19 | int i; 20 | double x, pi, sum = 0.0; 21 | //omp_set_num_threads(SIZE); 22 | step = 1.0 / (double)num_steps; 23 | for (int i = 0; i < num_steps; i++) { 24 | x = (i + 0.5) * step; 25 | sum = sum + 4.0 / (1.0 + x * x); 26 | } 27 | pi = step * sum; 28 | 29 | // add your codes end 30 | t = omp_get_wtime() - t; 31 | printf("time %f %d\n", t, SIZE); 32 | 33 | printf("pi %.12f %.12f\n", pi, pi - M_PI); 34 | } 35 | -------------------------------------------------------------------------------- /openmp/lab_pi_integral/pi_integral_1.cpp: -------------------------------------------------------------------------------- 1 | // student 宋家庆: 2 | // id 202000130061: 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // add your codes begin 10 | //#define SIZE 10000000 11 | static long num_step = SIZE; 12 | // 
add your codes end 13 | 14 | int main() { 15 | double pi; 16 | 17 | double t = omp_get_wtime(); 18 | // add your codes begin 19 | int i; 20 | double sum = 0.0; 21 | double step = 1.0 / (double)num_step; 22 | omp_set_num_threads(10); 23 | #pragma omp parallel 24 | { 25 | double x; 26 | #pragma omp for reduction(+ : sum) 27 | for (i = 0; i < num_step; i++) { 28 | x = (i + 0.5) * step; 29 | sum = sum + 4.0 / (1.0 + x * x); 30 | } 31 | } 32 | pi = step * sum; 33 | // add your codes end 34 | t = omp_get_wtime() - t; 35 | printf("time %f %d\n", t, SIZE); 36 | 37 | printf("pi %.12f %.12f\n", pi, pi - M_PI); 38 | } 39 | -------------------------------------------------------------------------------- /openmp/lab_pi_integral/pi_integral_2.cpp: -------------------------------------------------------------------------------- 1 | // student 宋家庆: 2 | // id 202000130061: 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // add your codes begin 10 | //#define SIZE 25 11 | 12 | static long num_step = SIZE; 13 | // add your codes end 14 | 15 | int main() { 16 | double pi; 17 | 18 | double t = omp_get_wtime(); 19 | // add your codes begin 20 | double step = 1 / (double)num_step; 21 | 22 | omp_set_num_threads(10); 23 | int real_num = 0; 24 | int nthreads; 25 | 26 | #pragma omp parallel 27 | { 28 | int id,i,nthrds; 29 | id = omp_get_thread_num(); 30 | // i为每个线程私有,每个线程只需执行自己需要执行的次数即可 31 | // 这里的x和sum也不再共享,而是每个线程私有一份 32 | double x = 0.0, sum = 0.0; 33 | nthrds = omp_get_num_threads(); 34 | //x = (id)*step; 35 | //double real_step = nthrds*step; 36 | // 每个线程只执行循环中自己需要执行的部分 37 | for (i = id; i < num_step; i += nthrds) { 38 | x = (i + 0.5) * step; 39 | //x = x + real_step; 40 | sum += 4.0 / (1.0 + x * x); 41 | } 42 | // #pragma atomic 43 | // pi += sum* step; 44 | //printf("%.12f ",sum); 45 | // 如果使用atomic要注意,atomic中只支持简单的操作 46 | // 使用atomic需要将乘法在外面做,然后内部只有加法 47 | #pragma critical (critical1) 48 | pi += sum * step; 49 | } 50 | // add your codes end 51 | t = 
omp_get_wtime() - t; 52 | printf("time %f %d\n", t, SIZE); 53 | 54 | printf("pi %.12f %.12f\n", pi, pi - M_PI); 55 | } 56 | -------------------------------------------------------------------------------- /openmp/lab_pi_integral/readme.txt: -------------------------------------------------------------------------------- 1 | - Student name: 宋家庆 2 | - ID number: 202000130061 3 | 4 | - Watch lectures: https://icloud.qd.sdu.edu.cn:7777/link/2D2A742C095E0CFD13FEB87F405E2FEB 5 | - Expiration date: 2023-06-18 6 | - Implement your Pi program "pi_integral_N.cpp" of Unit 2 7 | - Version 0: serial program 8 | - Version 1: parallel program using parallel-for and reduction 9 | - Version 2: parallel program without using parallel-for 10 | 11 | 代码见目录 12 | - Compile and run your programs multiple times 13 | 三个代码统一设置线程数量为4,步数为10000000,v2版本中尝试使用了atomic和ctrtical 14 | v0运行三次结果分别为 15 | time 0.027952 4 16 | pi 3.141592653590 -0.000000000000 17 | 18 | time 0.028419 4 19 | pi 3.141592653590 -0.000000000000 20 | 21 | time 0.027284 4 22 | pi 3.141592653590 -0.000000000000 23 | 24 | v1运行三次结果分别为 25 | time 0.009849 4 26 | pi 3.141592653590 -0.000000000000 27 | 28 | time 0.023802 4 29 | pi 3.141592653590 -0.000000000000 30 | 31 | time 0.012471 4 32 | pi 3.141592653590 -0.000000000000 33 | 34 | v2运行三次结果分别为 35 | critical版本 36 | time 0.011807 4 37 | pi 3.141592653590 -0.000000000000 38 | 39 | time 0.013312 4 40 | pi 3.141592653590 -0.000000000000 41 | 42 | time 0.010704 4 43 | pi 3.141592653590 -0.000000000000 44 | 45 | atomic版本 46 | time 0.010627 4 47 | pi 3.141592653590 -0.000000000000 48 | 49 | time 0.010760 4 50 | pi 3.141592653590 -0.000000000000 51 | 52 | time 0.011253 4 53 | pi 3.141592653590 -0.000000000000 54 | 55 | - Describe your observations 56 | 串行版本的耗时最长,使用reduction和v2版本效率差距不大。 57 | 在步数固定的情况下,当线程数量增加时,v1和v2版本的性能会先增加, 58 | 但线程数量翻倍运行效率并不会达到对应的倍数。当线程数量到达一定值时,效率不再增加。 59 | 之后再增加线程数量,效率反而会下降。 60 | 61 | 在实现v2版本时,初始时对于for循环中每一次都进行了加锁操作,这时效率极差, 62 | 
将部分变量改为线程私有,每个线程独自计算自己需要计算的部分,最后将结果原子求和,效率得到了极大提升。 63 | 64 | 65 | - Explain why this happens 66 | 初期随着线程数量的增加,运行效率会随之增加,这是因为多个线程每个线程只执行循环的一部分,并行执行, 67 | 效率增加。 68 | 效率并不会随着线程数量翻倍,这是因为线程的创建和销毁都会占用时间 69 | 当线程数量过大时,效率下降:这说明此时该程序的执行时间用在线程创建销毁的时间已经超过的具体计算的时间, 70 | 成为了该程序的计算瓶颈。这提醒我们线程并不是越多越好。 71 | 72 | -------------------------------------------------------------------------------- /openmp/lab_pi_integral/run.sh: -------------------------------------------------------------------------------- 1 | version=2 2 | size=1000000 3 | #rm ./build/pi_integral_$version 4 | g++ -o ./build/pi_integral_$version -fopenmp -DSIZE=$size pi_integral_$version.cpp && timeout 60s time ./build/pi_integral_$version 5 | 6 | -------------------------------------------------------------------------------- /openmp/lab_pi_integral/test.cpp: -------------------------------------------------------------------------------- 1 | // student 宋家庆: 2 | // id 202000130061: 3 | 4 | /*#include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // add your codes begin 10 | //#define SIZE 25 11 | 12 | static long num_step = 10000000; 13 | // add your codes end 14 | 15 | int main() { 16 | double pi; 17 | 18 | double t = omp_get_wtime(); 19 | // add your codes begin 20 | double step = 1 / (double)num_step; 21 | 22 | omp_set_num_threads(25); 23 | int real_num = 0; 24 | int nthreads; 25 | 26 | #pragma omp parallel 27 | { 28 | int id,i,nthrds; 29 | id = omp_get_thread_num(); 30 | // i为每个线程私有,每个线程只需执行自己需要执行的次数即可 31 | // 这里的x和sum也不再共享,而是每个线程私有一份 32 | double x, sum = 0.0; 33 | nthrds = omp_get_num_threads(); 34 | if (id == 0) { 35 | nthreads = omp_get_num_threads(); 36 | //printf("-----%d",real_num); 37 | } 38 | //id = omp_get_thread_num(); 39 | //nthrds = omp_get_num_threads(); 40 | //printf("%d ",id); 41 | x = (id)*step; 42 | double margin = nthrds*step; 43 | // 每个线程只执行循环中自己需要执行的部分 44 | for (i = id; i < num_step; i += nthreads) { 45 | //x = (i + 0.5) * step; 46 | x = x + margin; 47 | sum += 4.0 / (1.0 + x * x); 
48 | } 49 | #pragma atomic 50 | pi += sum* step; 51 | //#pragma critical (critical1) 52 | // pi += sum; 53 | } 54 | // add your codes end 55 | t = omp_get_wtime() - t; 56 | printf("time %f %d\n", t, SIZE); 57 | 58 | printf("pi %.12f %.12f\n", pi, pi - M_PI); 59 | } 60 | */ 61 | 62 | // student name: Junhao Xu 63 | // id number: 201900122025 64 | 65 | #include 66 | #include 67 | #include 68 | using namespace std; 69 | 70 | // add your codes begin 71 | 72 | static long num_steps = 100; 73 | double step; 74 | // add your codes end 75 | 76 | int main() 77 | { 78 | double pi=0.0; 79 | 80 | double t = omp_get_wtime(); 81 | // add your codes begin 82 | step = 1.0 / (double)num_steps; 83 | omp_set_num_threads(100); 84 | 85 | int i = 0; 86 | #pragma omp parallel 87 | { 88 | i++; 89 | int id = omp_get_thread_num(); 90 | printf("%d\n",i); 91 | } 92 | // pi+=0.000100000001; 93 | // add your codes end 94 | t = omp_get_wtime() - t; 95 | printf("time %f %d\n", t, 25); 96 | 97 | printf("pi %.12f %.12f\n", pi, pi - M_PI); 98 | } 99 | -------------------------------------------------------------------------------- /openmp/lab_pi_rnd/build/pi_rnd_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_rnd/build/pi_rnd_1 -------------------------------------------------------------------------------- /openmp/lab_pi_rnd/build/pi_rnd_2 copy.cpp: -------------------------------------------------------------------------------- 1 | // student name: 宋家庆 2 | // id number: 202000130061 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // add your codes begin 10 | // #define SIZE 10000 11 | static long MULTIPLIER1 = 1366; 12 | static long ADDEND1 = 150889; 13 | static long PMOD1 = 714025; 14 | 15 | static long MULTIPLIER2 = 1277; 16 | static long ADDEND2 = 524849; 17 | static long PMOD2 = 981293; 18 | 19 | const int 
thrd_num = 30; 20 | unsigned long long pseed[10000]; 21 | 22 | inline long LCG(long random_last) { 23 | return (MULTIPLIER2 * random_last + ADDEND2) % PMOD2; 24 | } 25 | // add your codes end 26 | 27 | int main() { 28 | double pi; 29 | 30 | double t = omp_get_wtime(); 31 | // add your codes begin 32 | int nthreads = 0; 33 | double r = 1.0; 34 | int num_in = 0; 35 | #pragma omp parallel 36 | { 37 | #pragma omp single 38 | { 39 | nthreads = omp_get_num_threads(); 40 | unsigned long long iseed = PMOD1 / MULTIPLIER1; // just pick a seed 41 | pseed[0] = iseed; 42 | for (int i = 1; i < nthreads; ++i) { 43 | iseed = (unsigned long long)((MULTIPLIER1 * iseed) % PMOD1); 44 | pseed[i] = iseed; 45 | } 46 | } 47 | int id = omp_get_thread_num(); 48 | int tp_num = 0; 49 | long long random_last = (unsigned long long)pseed[id]; 50 | for (int i = 0; i < SIZE; i += nthreads) { 51 | random_last = LCG(random_last); 52 | double x = (double)random_last / (double)PMOD2; 53 | random_last = LCG(random_last); 54 | double y = (double)random_last / (double)PMOD2; 55 | if (x * x + y * y <= r) { 56 | tp_num++; 57 | } 58 | } 59 | #pragma omp critical 60 | num_in += tp_num; 61 | } 62 | 63 | pi = 4.0 * (double)(num_in) / (double)(SIZE); 64 | // add your codes end 65 | t = omp_get_wtime() - t; 66 | printf("time %f %d\n", t, SIZE); 67 | 68 | printf("pi %.12f %.12f\n", pi, pi - M_PI); 69 | } 70 | -------------------------------------------------------------------------------- /openmp/lab_pi_rnd/build/pi_rnd_2.cpp: -------------------------------------------------------------------------------- 1 | // student name: 宋家庆 2 | // id number: 202000130061 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // add your codes begin 10 | // #define SIZE 10000 11 | static long MULTIPLIER = 1366; 12 | static long ADDEND = 150889; 13 | static long PMOD = 714025; 14 | long random_last = 0; 15 | 16 | inline long LCG() { 17 | long random_next; 18 | random_next = (MULTIPLIER * random_last + 
ADDEND) % PMOD; 19 | random_last = random_next; 20 | return random_next; 21 | } 22 | // add your codes end 23 | 24 | double rnd_arr[2 * SIZE]; 25 | 26 | int main() { 27 | double pi; 28 | 29 | double t = omp_get_wtime(); 30 | // add your codes begin 31 | double x, y; 32 | double r = 1.0; 33 | int num_in = 0; 34 | for (int i = 0; i < 2 * SIZE; i++) { 35 | rnd_arr[i] = (double)LCG() / double(PMOD); 36 | } 37 | #pragma omp parallel for reduction(+ : num_in) 38 | for (int i = 0; i < SIZE; i++) { 39 | double x = rnd_arr[2 * i]; 40 | double y = rnd_arr[2 * i + 1]; 41 | if (x * x + y * y <= r) { 42 | num_in++; 43 | } 44 | } 45 | pi = 4.0 * (double)(num_in) / (double)(SIZE); 46 | // add your codes end 47 | t = omp_get_wtime() - t; 48 | printf("time %f %d\n", t, SIZE); 49 | 50 | printf("pi %.12f %.12f\n", pi, pi - M_PI); 51 | } 52 | -------------------------------------------------------------------------------- /openmp/lab_pi_rnd/lab_pi_rnd.2023-04-12.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_rnd/lab_pi_rnd.2023-04-12.tgz -------------------------------------------------------------------------------- /openmp/lab_pi_rnd/pi_rnd_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_rnd/pi_rnd_0 -------------------------------------------------------------------------------- /openmp/lab_pi_rnd/pi_rnd_0.cpp: -------------------------------------------------------------------------------- 1 | // student name: 宋家庆 2 | // id number: 202000130061 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // add your codes begin 10 | static long MULTIPLIER = 1366; 11 | static long ADDEND = 150889; 12 | static long PMOD = 714025; 13 | long random_last = 0; 
14 | 15 | long LCG() { 16 | long random_next; 17 | random_next = (MULTIPLIER * random_last + ADDEND) % PMOD; 18 | random_last = random_next; 19 | return random_next; 20 | } 21 | // add your codes end 22 | 23 | int main() { 24 | double pi; 25 | 26 | double t = omp_get_wtime(); 27 | // add your codes begin 28 | double x, y; 29 | double r = 1.0; 30 | int num_in = 0; 31 | int tot_num = SIZE; 32 | for (int i = 0; i < SIZE; i++) { 33 | x = (double)LCG() / (double)PMOD; 34 | y = (double)LCG() / (double)PMOD; 35 | if (x * x + y * y <= r) { 36 | num_in++; 37 | } 38 | } 39 | pi = 4.0 * (double)(num_in) / (double)(SIZE); 40 | t = omp_get_wtime() - t; 41 | // add your codes end 42 | printf("time %f %d\n", t, SIZE); 43 | 44 | printf("pi %.12f %.12f\n", pi, pi - M_PI); 45 | } 46 | -------------------------------------------------------------------------------- /openmp/lab_pi_rnd/pi_rnd_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_rnd/pi_rnd_1 -------------------------------------------------------------------------------- /openmp/lab_pi_rnd/pi_rnd_1.cpp: -------------------------------------------------------------------------------- 1 | // student name: 宋家庆 2 | // id number: 202000130061 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // add your codes begin 10 | // #define SIZE 10000 11 | static long MULTIPLIER = 1366; 12 | static long ADDEND = 150889; 13 | static long PMOD = 714025; 14 | 15 | // inline long LCG(long random_last) { 16 | // return (MULTIPLIER * random_last + ADDEND) % PMOD; 17 | // } 18 | long random_last = 0; 19 | #pragma omp threadprivate(random_last) 20 | double LCG() { 21 | long random_next; 22 | random_next = (MULTIPLIER * random_last + ADDEND) % PMOD; 23 | random_last = random_next; 24 | return ((double)random_next / (double)PMOD); 25 | } 26 | // add your codes end 27 | 
28 | // Monte Carlo estimate of pi: draws SIZE points (x, y) from LCG() and
29 | // counts those with x*x + y*y <= r; the loop is split across threads with
30 | // a reduction on the hit counter.
31 | // Prints the elapsed time with SIZE, then pi and its error vs. M_PI.
32 | int main() {
33 |     double pi;
34 | 
35 |     double t = omp_get_wtime();
36 |     // add your codes begin
37 |     double r = 1.0;
38 |     int num_in = 0;
39 |     // A local "long random_last" used to be declared here. It was never
40 |     // read and only shadowed the threadprivate state that LCG() uses, so
41 |     // it has been removed together with the unused outer x and y.
42 | #pragma omp parallel for reduction(+ : num_in)
43 |     for (int i = 0; i < SIZE; i++) {
44 |         double x = LCG();
45 |         double y = LCG();
46 |         if (x * x + y * y <= r) {
47 |             num_in++;
48 |         }
49 |     }
50 |     pi = 4.0 * (double)(num_in) / (double)(SIZE);
51 | 
52 |     // add your codes end
53 |     t = omp_get_wtime() - t;
54 |     printf("time %f %d\n", t, SIZE);
55 | 
56 |     printf("pi %.12f %.12f\n", pi, pi - M_PI);
57 | }
58 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/pi_rnd_2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_rnd/pi_rnd_2


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/pi_rnd_2.cpp:
--------------------------------------------------------------------------------
1 | // student name: 宋家庆
2 | // id number: 202000130061
3 | 
4 | #include <stdio.h>
5 | #include <stdlib.h>  // rand()
6 | #include <math.h>
7 | #include <omp.h>
8 | using namespace std;
9 | 
10 | // add your codes begin
11 | // #define SIZE 10000
12 | static long MULTIPLIER1 = 1366;
13 | static long ADDEND1 = 150889;
14 | static long PMOD1 = 714025;
15 | 
16 | // static long MULTIPLIER2 = 1277;
17 | // static long ADDEND2 = 524849;
18 | // static long PMOD2 = 981293;
19 | static long MULTIPLIER2 = 1366;
20 | static long ADDEND2 = 150889;
21 | static long PMOD2 = 714025;
22 | 
23 | // Number of threads requested for the parallel region in main.
24 | const int thrd_num = 30;
25 | // Per-thread starting seeds, indexed by thread id.
26 | unsigned long long pseed[10000];
27 | 
28 | // One step of the linear congruential generator: returns the value that
29 | // follows random_last, in the range [0, PMOD2).
30 | inline long LCG(long random_last) {
31 |     return (MULTIPLIER2 * random_last + ADDEND2) % PMOD2;
32 | }
33 | // add your codes end
34 | 
35 | // Monte Carlo estimate of pi. One thread fills pseed with a seed per
36 | // thread; every thread then runs its own LCG stream from its seed and
37 | // counts the points with x*x + y*y <= r. The counts are combined by the
38 | // reduction on num_in.
39 | // Prints the elapsed time with SIZE, then pi and its error vs. M_PI.
40 | int main() {
41 |     double pi;
42 | 
43 |     double t = omp_get_wtime();
44 |     // add your codes begin
45 |     int nthreads = 0;
46 |     double r = 1.0;
47 |     int num_in = 0;
48 | #pragma omp parallel num_threads(thrd_num) reduction(+ : num_in)
49 |     {
50 |         // Generate one seed per thread with a first random-number source.
51 |         // The implicit barrier at the end of "single" ensures nthreads and
52 |         // pseed are written before any thread reads them below.
53 | #pragma omp single
54 |         {
55 |             nthreads = omp_get_num_threads();
56 |             unsigned long long iseed = PMOD1 / MULTIPLIER1;  // just pick a seed
57 |             pseed[0] = iseed;
58 |             for (int i = 1; i < nthreads; ++i) {
59 |                 // Earlier variant: (MULTIPLIER1 * iseed) % PMOD1
60 |                 iseed = rand() % PMOD1;
61 |                 pseed[i] = iseed;
62 |             }
63 |         }
64 |         // Each thread starts from its own seed and generates its points
65 |         // with the second generator.
66 |         int id = omp_get_thread_num();
67 |         long long random_last = (unsigned long long)pseed[id];
68 |         // Start at id, not 0, so the team draws exactly SIZE points in
69 |         // total. Starting every thread at 0 drew
70 |         // nthreads * ceil(SIZE / nthreads) points while the estimate below
71 |         // still divides by SIZE.
72 |         for (int i = id; i < SIZE; i += nthreads) {
73 |             random_last = LCG(random_last);
74 |             double x = (double)random_last / (double)PMOD2;
75 |             random_last = LCG(random_last);
76 |             double y = (double)random_last / (double)PMOD2;
77 |             if (x * x + y * y <= r) {
78 |                 num_in++;
79 |             }
80 |         }
81 |     }
82 | 
83 |     pi = 4.0 * (double)(num_in) / (double)(SIZE);
84 |     // add your codes end
85 |     t = omp_get_wtime() - t;
86 |     printf("time %f %d\n", t, SIZE);
87 | 
88 |     printf("pi %.12f %.12f\n", pi, pi - M_PI);
89 | }
90 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/readme.txt:
--------------------------------------------------------------------------------
1 | - Student name: 宋家庆
2 | - ID number: 202000130061
3 | 
4 | - Implement your Pi program "pi_rnd_N.cpp"
5 | - Version 0: serial program using LCG PRNG
6 | - Version 1: parallel program using thread-safe PRNG
7 | - Version 2: parallel program using leap-frog PRNG
8 | - Compile and run your programs multiple times
9 | - Describe your observations
10 | 当SIZE大小为100000000时
11 | version 0
12 | 所需时间为4.167296,计算得到的pi大小为3.141593560000,误差为0.000000906410
13 | version 1
14 | 所需时间为0.127502,计算得到的pi大小为3.141757440000,误差为0.000164786410
15 | version 2
16 | 所需时间为0.175884,计算得到的pi大小为3.141585920000,误差为-0.000000533590
17 | 
串行版本的准确度最高,thread-safe版本速度较快,但准确度较低,leap-frog可以在高效的情况下保证较高的准确度。 18 | - Explain why this happens 19 | thread-safe版本,每个线程都拷贝了一份初始的random_seed,但由于使用了相同的随机数种子和随机数算法,使得生成的 20 | 随机数并没有那么的随机,导致蒙特卡洛算法结果不理想。 21 | leap-frog版本先使用了一种随机数生成算法为每个线程生成了一个随机数种子,又使用了另一种随机数生成算法在每个线程内部 22 | 使用不同的随机数种子生成随机数。较之thread-safe版本生成的随机数更加随机。能够逼近单线程生成的随机数效果。 23 | 24 | -------------------------------------------------------------------------------- /openmp/lab_pi_rnd/run.sh: -------------------------------------------------------------------------------- 1 | version=2 2 | size=100000000 3 | 4 | g++ -o pi_rnd_$version -fopenmp -DSIZE=$size pi_rnd_$version.cpp && timeout 60s time ./pi_rnd_$version -------------------------------------------------------------------------------- /openmp/lab_scan_frag/lab_scan_frag.2023-04-19.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_frag/lab_scan_frag.2023-04-19.tgz -------------------------------------------------------------------------------- /openmp/lab_scan_frag/readme.txt: -------------------------------------------------------------------------------- 1 | - Student name: 宋家庆 2 | - ID number: 202000130061 3 | 4 | - Implement your fragment scan algorithm 5 | - Compile and run your programs multiple times 6 | - Describe how you incrementally implement and test your program 7 | 算法运行结果如下 8 | time 0.090030 100000000 9 | 6.98user 1.91system 0:03.85elapsed 230%CPU (0avgtext+0avgdata 2344884maxresident)k 10 | 0inputs+0outputs (0major+586522minor)pagefaults 0swaps 11 | 平均执行时间在0.3-0.4之间,较之提供的标准答案效率提高了一倍。 12 | 13 | 首先直接复现一下之前在scan中实现的recursion算法,将其中的加法操作改为定义好的操作。 14 | 此时算法执行时间约为1秒,将递归操作中的引用传递改为值传递,时间降低到0.9s 15 | 考虑到std中定义很多额外的操作,可能会比较耗时,这里将所有的vector都转换为int*数组 16 | 此时时间降低到0.09。 17 | -------------------------------------------------------------------------------- /openmp/lab_scan_frag/run.sh: 
-------------------------------------------------------------------------------- 1 | version=0 2 | size=100000000 3 | 4 | g++ -o scan_frag_$version -fopenmp -DSIZE=$size scan_frag_$version.cpp && timeout 60s time ./scan_frag_$version 5 | 6 | -------------------------------------------------------------------------------- /openmp/lab_scan_frag/scan_frag: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_frag/scan_frag -------------------------------------------------------------------------------- /openmp/lab_scan_frag/scan_frag_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_frag/scan_frag_0 -------------------------------------------------------------------------------- /openmp/lab_scan_frag/scan_frag_0.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | // add your codes begin 10 | int thrd_num = 50; // thread count used by both parallel loops in rscan 11 | void rscan(int *data, int *flag, int size) { // in-place recursive segmented inclusive scan; flag[i] == 1 starts a new segment at i 12 | if (size <= 1) return; // nothing to combine; <= also stops the recursion when size is 0 13 | // Scratch arrays for the half-length problem, released at the end of this call 14 | int *tpdata = new int[size / 2]; 15 | // tpflag[i] holds the combined flag of the pair (2i, 2i + 1) 16 | int *tpflag = new int[size / 2]; 17 | #pragma omp parallel for num_threads(thrd_num) 18 | for (int i = 0; i < size / 2; i++) { // up-sweep: fold each pair (2i, 2i + 1) into one element 19 | if (flag[2 * i + 1] == 1) { 20 | tpdata[i] = data[2 * i + 1]; 21 | tpflag[i] = flag[2 * i + 1]; 22 | } else { 23 | tpdata[i] = data[i * 2] + data[2 * i + 1]; 24 | tpflag[i] = flag[2 * i] | flag[2 * i + 1]; 25 | } 26 | } 27 | rscan(&tpdata[0], &tpflag[0], size / 2); 28 | #pragma omp parallel for num_threads(thrd_num) 29 | for (int i = 1; i < size; i += 2) { // down-sweep: odd slots take the scanned pair value, even slots add the prefix before them 30 | data[i] = tpdata[i / 2]; 31 | if (i + 1 < size) { 32 | 
if (flag[i + 1] != 1) { 33 | data[i + 1] = tpdata[(i + 1) / 2 - 1] + data[i + 1]; 34 | flag[i + 1] = flag[i + 1] | tpflag[(i + 1) / 2 - 1]; 35 | } 36 | } 37 | } 38 | delete[] tpdata; delete[] tpflag; // release this level's scratch arrays 39 | } 40 | // add your codes end 41 | 42 | int main() { 43 | vector data(SIZE, 1); 44 | vector flag(SIZE, 0); 45 | vector test(SIZE); 46 | 47 | srand(SIZE); 48 | data[0] = 0; 49 | flag[0] = 1; 50 | for (int i = 0; i < flag.size() / 12; i++) { 51 | int index = rand() % flag.size(); 52 | data[index] = 0; 53 | flag[index] = 1; 54 | } 55 | for (int i = 0; i < data.size(); i++) 56 | test[i] = (flag[i] != 0) ? data[i] : test[i - 1] + data[i]; 57 | 58 | double t = omp_get_wtime(); 59 | // add your codes begin 60 | //vector flag2 = flag; 61 | rscan(&data[0], &flag[0], SIZE); 62 | // add your codes end 63 | t = omp_get_wtime() - t; 64 | printf("time %f %d\n", t, SIZE); 65 | 66 | for (int i = 0; i < SIZE; i++) assert(data[i] == test[i]); 67 | } 68 | -------------------------------------------------------------------------------- /openmp/lab_scan_link/build/scan_link_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_link/build/scan_link_0 -------------------------------------------------------------------------------- /openmp/lab_scan_link/lab_scan_link.2023-04-26.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_link/lab_scan_link.2023-04-26.tgz -------------------------------------------------------------------------------- /openmp/lab_scan_link/readme.txt: -------------------------------------------------------------------------------- 1 | - Student name: 2 | - ID number: 3 | 4 | - Implement your list ranking algorithm 5 | - Compile and run your programs multiple times 6 | - Describle how you 
incrementally implement and test your program 7 | 8 | 当SIZE大小为100000000时,在无其他人占用的情况下,答案代码为7.964633,我的时间为5.512434 9 | 首先我们将所有的rank都初始化为0,在第i次循环里,所有的prev数组都向前跳一次,此时若某个点的prev数组不为-1,即没有跳到头节点,则将data值更新为前一位置data值+pow(2,i) 10 | 这是因为我们随着循环次数的增加,其每一次跳跃所跨越的距离指数递增。最多需要log(SIZE)次跳跃,即可实现求所有的rank 11 | 由于我们在循环中对data和prev数组既有写又有读,所以我们要做一次读写分离来保证正确性,即对原数组做一次拷贝。 12 | 跳跃部分可以使用并行进行优化(由于读写分离,所以可以保证并行的正确性) 13 | 优化小trick: 14 | 初始时使用vector进行data和prev的备份,使用memcpy进行拷贝,此时时间为10.944141 15 | 之后将拷贝部分更换为多线程并发拷贝,时间降低至7.885708 16 | 考虑到vector作为一个stl容器,内部的维护也需要耗时,将vector更换为指针,此时时间为5.512434 17 | -------------------------------------------------------------------------------- /openmp/lab_scan_link/run.sh: -------------------------------------------------------------------------------- 1 | version=0 2 | size=100000000 3 | 4 | g++ -o scan_link_$version -fopenmp -DSIZE=$size scan_link_$version.cpp && timeout 60s time ./scan_link_$version 5 | 6 | -------------------------------------------------------------------------------- /openmp/lab_scan_link/scan_link: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_link/scan_link -------------------------------------------------------------------------------- /openmp/lab_scan_link/scan_link_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_link/scan_link_0 -------------------------------------------------------------------------------- /openmp/lab_scan_link/scan_link_0.cpp: -------------------------------------------------------------------------------- 1 | // student name: 宋家庆 2 | // id number:202000130061 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | using namespace std; 13 | 14 | // add your codes 
begin 15 | // #define SIZE 1000 16 | // add your codes end 17 | 18 | int main() { 19 | vector data(SIZE, -1); 20 | vector prev(SIZE, -1); 21 | vector next(SIZE, -1); 22 | vector test(SIZE, -1); 23 | 24 | srand(SIZE); 25 | { 26 | int tmp = -1; 27 | for (int i = 0; i < SIZE / 2; i++) { 28 | int idx = rand() % SIZE; 29 | while (data[idx] >= 0) idx = (idx + 1) % SIZE; 30 | if (i > 0) { 31 | data[idx] = 1; 32 | prev[idx] = tmp; 33 | next[tmp] = idx; 34 | } else { 35 | data[idx] = 0; 36 | } 37 | test[idx] = i; 38 | tmp = idx; 39 | } 40 | } 41 | 42 | double t = omp_get_wtime(); 43 | // add your codes begin 44 | // Reset every rank that was initialised to 1 back to 0 (unused slots stay -1) 45 | #pragma omp parallel for 46 | for (int i = 0; i < SIZE; i++) 47 | if (data[i] == 1) data[i] = 0; 48 | 49 | // Snapshots of data/prev, so each round reads the previous round's values while writing new ones 50 | int* cdata = new int[SIZE]; 51 | int* cprev = new int[SIZE]; 52 | // Pointer jumping with integer steps 1, 2, 4, ...: each round doubles the distance covered by prev[j] 53 | for (long long step = 1; step < SIZE; step *= 2) { 54 | #pragma omp parallel for 55 | for (int k = 0; k < SIZE; k++) cdata[k] = data[k]; 56 | #pragma omp parallel for 57 | for (int k = 0; k < SIZE; k++) cprev[k] = prev[k]; 58 | #pragma omp parallel for 59 | for (int j = 0; j < SIZE; j++) { 60 | // Only nodes that still have a predecessor jump; the others keep their current value 61 | if (prev[j] != -1) { 62 | data[j] = cdata[prev[j]] + (int)step; 63 | prev[j] = cprev[prev[j]]; 64 | } 65 | } 66 | } 67 | delete[] cdata; 68 | delete[] cprev; 69 | // add your codes end 70 | t = omp_get_wtime() - t; 71 | printf("time %f %d\n", t, SIZE); 72 | 73 | for (int i = 0; i < SIZE; i++) assert(data[i] == test[i]); 74 | } 75 | -------------------------------------------------------------------------------- /openmp/lab_scan_tree/lab_scan_tree.2023-05-06.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_tree/lab_scan_tree.2023-05-06.tgz 
-------------------------------------------------------------------------------- /openmp/lab_scan_tree/readme.txt: -------------------------------------------------------------------------------- 1 | - Student name: 2 | - ID number: 3 | 4 | - Implement your tree ranking algorithm 5 | - Compile and run your programs multiple times 6 | - Compare the running times of your list ranking and tree ranking algorithms 7 | - Explain your observations 8 | tree的scan算法逻辑和link的逻辑基本是一致的,从较远节点更新某一点的rank的迭代轮数比较近节点所需要的 9 | 迭代轮数更多,所以使用相同的逻辑可以保证正确性。 10 | 对scan_link和scan_tree进行对比,在同样使用50个线程,SIZE大小为100000000时,scan_link所需要的时间 11 | 为5.512434,而scan_tree所需要的时间为1.976432。可以发现scan_tree所需时间更少,这是因为tree在相同 12 | 节点数量的情况下,由于树可以有多个儿子节点,而link只会有一个儿子节点,树的高度较之link比较低,所需要 13 | 的迭代次数较少,所以所需时间更少。 14 | -------------------------------------------------------------------------------- /openmp/lab_scan_tree/run.sh: -------------------------------------------------------------------------------- 1 | version=0 2 | size=100000000 3 | 4 | g++ -o scan_tree_$version -fopenmp -DSIZE=$size scan_tree_$version.cpp && timeout 60s time ./scan_tree_$version 5 | 6 | -------------------------------------------------------------------------------- /openmp/lab_scan_tree/scan_tree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_tree/scan_tree -------------------------------------------------------------------------------- /openmp/lab_scan_tree/scan_tree_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_tree/scan_tree_0 -------------------------------------------------------------------------------- /openmp/lab_scan_tree/scan_tree_0.cpp: -------------------------------------------------------------------------------- 1 | 
//student name: 2 | //id number: 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | using namespace std; 12 | 13 | 14 | // add your codes begin 15 | const int thrd_num = 60; 16 | // add your codes end 17 | 18 | 19 | int main() { 20 | vector data(SIZE, -1); 21 | vector pare(SIZE, -1); 22 | vector test(SIZE, -1); 23 | 24 | srand(SIZE); 25 | { vector tmp; 26 | for (int i = 0; i < SIZE/2; i++) { 27 | // pick a random slot 28 | int idx = rand() % SIZE; 29 | // probe forward to the next slot that is still unused (data < 0) 30 | while (data[idx] >= 0) idx = (idx + 1) % SIZE; 31 | if (i > 0) { 32 | // a non-root node starts with rank 1 33 | data[idx] = 1; 34 | // choose a random parent among the nodes already in the tree 35 | pare[idx] = tmp[rand() % tmp.size()]; 36 | // reference answer: the parent's depth plus one 37 | test[idx] = test[pare[idx]] + data[idx]; 38 | } else { 39 | // the first node is the root, with rank 0 40 | data[idx] = 0; 41 | test[idx] = data[idx]; 42 | } 43 | // the node can now be chosen as a parent 44 | tmp.push_back(idx); 45 | } 46 | } 47 | 48 | double t = omp_get_wtime(); 49 | // add your codes begin 50 | // Snapshots of data/pare, so each round reads the previous round's values 51 | // while the jump loop overwrites data/pare in place 52 | int* cdata = new int[SIZE]; 53 | int* cpare = new int[SIZE]; 54 | 55 | // Pointer jumping towards the root with integer steps 1, 2, 4, ...: 56 | // each round doubles the distance covered by pare[j] 57 | for (long long step = 1; step < SIZE; step *= 2) { 58 | #pragma omp parallel for num_threads(thrd_num) 59 | for (int k = 0; k < SIZE; k++) { 60 | cdata[k] = data[k]; 61 | cpare[k] = pare[k]; 62 | } 63 | 64 | #pragma omp parallel for num_threads(thrd_num) 65 | for (int j = 0; j < SIZE; j++) { 66 | // Only nodes that still have an ancestor to jump to are updated 67 | if (pare[j] != -1) { 68 | data[j] = cdata[pare[j]] + (int)step; 69 | pare[j] = cpare[pare[j]]; 70 | } 71 | } 72 | } 73 | 74 | // Release the snapshot buffers 75 | delete[] cdata; 76 | delete[] cpare; 77 | 78 | // add your codes end 79 | t = omp_get_wtime() - t; 80 | printf("time %f %d\n", t, SIZE); 81 | 82 | for (int i = 0; i < SIZE; i++) assert(data[i] == test[i]); 83 | } 84 | 85 | 
-------------------------------------------------------------------------------- /openmp/lab_scan_vect/build/scan_vect_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/build/scan_vect_1 -------------------------------------------------------------------------------- /openmp/lab_scan_vect/build/scan_vect_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/build/scan_vect_2 -------------------------------------------------------------------------------- /openmp/lab_scan_vect/lab_scan_vect.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/lab_scan_vect.tgz -------------------------------------------------------------------------------- /openmp/lab_scan_vect/readme.txt: -------------------------------------------------------------------------------- 1 | - Student name: 宋家庆 2 | - ID number: 202000130061 3 | 4 | - Implement your scan algorithm 5 | - Version 0: serial program 6 | - Version 1: brute-force program 7 | - Version 2: recursion program 8 | - Compile and run your programs multiple times 9 | - Describe your observations 10 | 在SIZE大小为1000的情况下,v0版本运行时间约为0.00007, 11 | v1版本运行时间约为0.001993,v2版本运行时间约为0.001227 12 | 13 | 在SIZE大小为100000000的情况下 14 | v0版本运行时间为0.052 15 | v1版本速度非常慢,远低于v2版本和v0版本,在60秒内无法得到结果。 16 | v2版本时间约为0.6-0.7之间,略高于串行版本。 17 | 18 | - Explain why this happens 19 | 20 | v1版本需要先做map,再做reduce,每个reduce的时间复杂度为n,计算量为n^2级别,当线程数足够多时,对每个reduce 21 | 新开一个线程,时间为最长的一次reduce所需要的时间,同时reduce过程中也可以并行,理论来说效率应该不会比v0版本低, 22 | 但在真正实现的过程中,由于线程数并没有那么多,线程的创建与回收都会有时间开销,导致brute-force的性能不如线性计算。 23 | 24 | 
v2版本的计算量为[(n+n/2)+(n/2+n/4)+...+1],约有log(n)项,计算复杂度可近似为等比数列求和,计算复杂度小于3n(txt不方便写公式), 25 | 每一次递归求和可以并行计算来优化,但由于递归中含有大量的函数调用,开销可能会比较大。导致并行后算法的时间略高于串行版本。 26 | -------------------------------------------------------------------------------- /openmp/lab_scan_vect/run.sh: -------------------------------------------------------------------------------- 1 | version=0 2 | size=10000000 3 | 4 | g++ -o scan_vect_$version -fopenmp -DSIZE=$size scan_vect_$version.cpp && timeout 60s time ./scan_vect_$version 5 | 6 | -------------------------------------------------------------------------------- /openmp/lab_scan_vect/scan_vect: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/scan_vect -------------------------------------------------------------------------------- /openmp/lab_scan_vect/scan_vect_0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/scan_vect_0 -------------------------------------------------------------------------------- /openmp/lab_scan_vect/scan_vect_0.cpp: -------------------------------------------------------------------------------- 1 | // student name: 2 | // id number: 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | using namespace std; 11 | 12 | // add your codes begin 13 | // add your codes end 14 | 15 | int main() { 16 | vector data(SIZE, 1); 17 | data[0] = 0; 18 | 19 | double t = omp_get_wtime(); 20 | // add your codes begin 21 | int res = 0; 22 | for (int i = 1; i < SIZE; i++) { 23 | res += data[i]; 24 | data[i] = res; 25 | } 26 | 27 | // add your codes end 28 | t = omp_get_wtime() - t; 29 | printf("time %f %d\n", t, SIZE); 30 | 31 | for (int i = 0; i < SIZE; i++) assert(data[i] == i); 32 | } 33 | 
-------------------------------------------------------------------------------- /openmp/lab_scan_vect/scan_vect_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/scan_vect_1 -------------------------------------------------------------------------------- /openmp/lab_scan_vect/scan_vect_1.cpp: -------------------------------------------------------------------------------- 1 | // student name: 2 | // id number: 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | using namespace std; 11 | 12 | // add your codes begin 13 | // #define SIZE 1000 14 | // add your codes end 15 | 16 | int main() { 17 | vector data(SIZE, 1); 18 | data[0] = 0; 19 | 20 | double t = omp_get_wtime(); 21 | omp_set_num_threads(100); 22 | // add your codes begin 23 | vector temp = data; 24 | omp_set_nested(1); 25 | #pragma omp parallel num_threads(30) 26 | { 27 | int id, i, nthrds; 28 | id = omp_get_thread_num(); 29 | nthrds = omp_get_num_threads(); 30 | // Cyclic distribution: thread id computes the prefix sums i = id, id + nthrds, ... 31 | for (i = id; i < SIZE; i += nthrds) { 32 | int sum = 0; 33 | //#pragma omp parallel for num_threads(30) 34 | for (int j = 0; j <= i; j++) { 35 | //#pragma omp atomic 36 | sum += temp[j]; 37 | } 38 | data[i] = sum; 39 | } 40 | } 41 | // #pragma omp parallel for num_threads(30) 42 | // for (int i = 2; i < SIZE; i++) { 43 | // #pragma omp parallel for num_threads(30) 44 | // for (int j = i; j < SIZE; j++) { 45 | // #pragma omp atomic 46 | // data[j] += temp[i]; 47 | // } 48 | // } 49 | // for (int i = 0; i < SIZE; i++) { 50 | // cout< 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | using namespace std; 11 | 12 | // add your codes start 13 | // #define SIZE 1000 14 | const int thrd_num = 30; 15 | 16 | void rscan(vector &data, int size) { 17 | if (size <= 1) return; // <= also stops the recursion when size is 0 18 | vector twoSum(size / 2); 19 | #pragma omp parallel for 
num_threads(thrd_num) 20 | for (int i = 0; i < twoSum.size(); i++) { 21 | twoSum[i] = data[i * 2] + data[2 * i + 1]; 22 | } 23 | rscan(twoSum, size / 2); 24 | #pragma omp parallel for num_threads(thrd_num) 25 | for (int i = 1; i < size; i += 2) { 26 | data[i] = twoSum[i / 2]; 27 | if (i + 1 < size) { 28 | data[i + 1] = twoSum[(i + 1) / 2 - 1] + data[i + 1]; 29 | } 30 | } 31 | } 32 | // add your codes end 33 | 34 | int main() { 35 | vector data(SIZE, 1); 36 | data[0] = 0; 37 | 38 | double t = omp_get_wtime(); 39 | // add your codes start 40 | rscan(data, SIZE); 41 | // add your codes end 42 | t = omp_get_wtime() - t; 43 | printf("time %f %d\n", t, SIZE); 44 | 45 | for (int i = 0; i < SIZE; i++) assert(data[i] == i); 46 | } -------------------------------------------------------------------------------- /openmp/midterm_exam/build/segment_softmax: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/midterm_exam/build/segment_softmax -------------------------------------------------------------------------------- /openmp/midterm_exam/build/series_of_numbers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/midterm_exam/build/series_of_numbers -------------------------------------------------------------------------------- /openmp/midterm_exam/build/series_of_numbers copy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/midterm_exam/build/series_of_numbers copy -------------------------------------------------------------------------------- /openmp/midterm_exam/build/series_of_numbers copy 2.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #define ll long long 9 | using namespace std; 10 | 11 | // add your codes begin 12 | #define SIZE 10000000 13 | const ll mod = 1e8 + 10; 14 | const ll thrd_num = 20; 15 | 16 | typedef vector> matrix; 17 | 18 | matrix initMatrix(ll num) { 19 | matrix mat = vector(4, vector(4, (ll)0)); 20 | mat[0][0] = 1; 21 | mat[0][1] = 2; 22 | mat[0][2] = num; 23 | mat[0][3] = 1; 24 | mat[1][0] = 1; 25 | mat[2][2] = 1; 26 | mat[3][3] = 1; 27 | return mat; 28 | } 29 | 30 | matrix initEMatrix() { 31 | matrix mat = vector(4, vector(4, (ll)0)); 32 | mat[0][0] = 1; 33 | mat[1][1] = 1; 34 | mat[2][2] = 1; 35 | mat[3][3] = 1; 36 | return mat; 37 | } 38 | 39 | matrix operator*(const matrix& a, const matrix& b) { 40 | ll n = a.size(); 41 | ll m = b[0].size(); 42 | ll l = b.size(); 43 | matrix c(n, vector(m)); 44 | for (ll i = 0; i < n; i++) { 45 | for (ll j = 0; j < m; j++) { 46 | for (ll k = 0; k < l; k++) { 47 | c[i][j] += (a[i][k] * b[k][j]) % mod; 48 | c[i][j] %= mod; 49 | } 50 | } 51 | } 52 | return c; 53 | } 54 | 55 | void rscan(vector& data, ll size) { 56 | if (size == 1) return; 57 | vector twoSum(size / 2); 58 | #pragma omp parallel for num_threads(thrd_num) 59 | for (ll i = 0; i < twoSum.size(); i++) { 60 | twoSum[i] = data[2 * i + 1] * data[i * 2]; 61 | } 62 | rscan(twoSum, size / 2); 63 | #pragma omp parallel for num_threads(thrd_num) 64 | for (ll i = 1; i < size; i += 2) { 65 | data[i] = twoSum[i / 2]; 66 | if (i + 1 < size) { 67 | data[i + 1] = data[i + 1]*twoSum[(i + 1) / 2 - 1]; 68 | } 69 | } 70 | } 71 | // add your codes end 72 | 73 | int main() { 74 | vector test(SIZE); 75 | vector data(SIZE); 76 | test[0] = 1; 77 | test[1] = 1; 78 | for (ll i = 2; i < SIZE; i++) { 79 | test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod; 80 | test[i] = (test[i] + (3 * i) % mod + 4) % mod; 81 | } 82 | double t = 
omp_get_wtime(); 83 | // add your codes begin 84 | // vector mats; 85 | // mats.push_back(initMatrix(2)); 86 | 87 | // for (ll i = 3; i < SIZE; i++) { 88 | // matrix c = initMatrix(i); 89 | // mats.push_back(c * mats[i - 3]); 90 | // } 91 | 92 | vector mats(SIZE-2); 93 | #pragma omp parallel for num_threads(thrd_num) 94 | for (ll i = 2; i < SIZE; i++) { 95 | mats[i-2] = initMatrix(i); 96 | } 97 | 98 | data[0] = 1; 99 | data[1] = 1; 100 | // reverse(mats.begin(), mats.end()); 101 | rscan(mats, SIZE - 2); 102 | // reverse(mats.begin(),mats.end()); 103 | #pragma omp parallel for num_threads(thrd_num) 104 | for (ll i = 2; i < SIZE; i++) { 105 | // matrix mt = initMatrix(2); 106 | // for (ll j = 3; j <= i; j++) { 107 | // mt = mt * initMatrix(j); 108 | // } 109 | // matrix mt = initEMatrix(); 110 | // for (ll j = i; j >= 2; j--) { 111 | // mt = mt * initMatrix(j); 112 | // } 113 | 114 | // data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod + 115 | // (4 * mt[0][3]) % mod) % 116 | // mod; 117 | data[i] = 118 | (mats[i - 2][0][0] + mats[i - 2][0][1] + 119 | (3 * mats[i - 2][0][2]) % mod + (4 * mats[i - 2][0][3]) % mod) % 120 | mod; 121 | } 122 | // reverse(data.begin()+2,data.end()); 123 | // add your codes end 124 | t = omp_get_wtime() - t; 125 | printf("time %f %d\n", t, SIZE); 126 | 127 | for (ll i = 0; i < SIZE; i++) { 128 | assert (data[i] == test[i]); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /openmp/midterm_exam/build/series_of_numbers copy.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #define ll long long 9 | using namespace std; 10 | 11 | // add your codes begin 12 | #define SIZE 10000000 13 | const ll mod = 1e8 + 10; 14 | const ll thrd_num = 20; 15 | 16 | typedef vector> matrix; 17 | 18 | matrix initMatrix(ll num) { 19 | matrix mat = vector(4, vector(4, (ll)0)); 20 | 
mat[0][0] = 1; 21 | mat[0][1] = 2; 22 | mat[0][2] = num; 23 | mat[0][3] = 1; 24 | mat[1][0] = 1; 25 | mat[2][2] = 1; 26 | mat[3][3] = 1; 27 | return mat; 28 | } 29 | 30 | matrix initEMatrix() { 31 | matrix mat = vector(4, vector(4, (ll)0)); 32 | mat[0][0] = 1; 33 | mat[1][1] = 1; 34 | mat[2][2] = 1; 35 | mat[3][3] = 1; 36 | return mat; 37 | } 38 | 39 | matrix operator*(const matrix& a, const matrix& b) { 40 | ll n = a.size(); 41 | ll m = b[0].size(); 42 | ll l = b.size(); 43 | matrix c(n, vector(m)); 44 | for (ll i = 0; i < n; i++) { 45 | for (ll j = 0; j < m; j++) { 46 | for (ll k = 0; k < l; k++) { 47 | c[i][j] += (a[i][k] * b[k][j]) % mod; 48 | c[i][j] %= mod; 49 | } 50 | } 51 | } 52 | return c; 53 | } 54 | 55 | void rscan(vector& data, ll size) { 56 | if (size == 1) return; 57 | vector twoSum(size / 2); 58 | #pragma omp parallel for num_threads(thrd_num) 59 | for (ll i = 0; i < twoSum.size(); i++) { 60 | twoSum[i] = data[2 * i + 1] * data[i * 2]; 61 | } 62 | rscan(twoSum, size / 2); 63 | #pragma omp parallel for num_threads(thrd_num) 64 | for (ll i = 1; i < size; i += 2) { 65 | data[i] = twoSum[i / 2]; 66 | if (i + 1 < size) { 67 | data[i + 1] = data[i + 1]*twoSum[(i + 1) / 2 - 1]; 68 | } 69 | } 70 | } 71 | // add your codes end 72 | 73 | int main() { 74 | vector test(SIZE); 75 | vector data(SIZE); 76 | test[0] = 1; 77 | test[1] = 1; 78 | for (ll i = 2; i < SIZE; i++) { 79 | test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod; 80 | test[i] = (test[i] + (3 * i) % mod + 4) % mod; 81 | } 82 | double t = omp_get_wtime(); 83 | // add your codes begin 84 | // vector mats; 85 | // mats.push_back(initMatrix(2)); 86 | 87 | // for (ll i = 3; i < SIZE; i++) { 88 | // matrix c = initMatrix(i); 89 | // mats.push_back(c * mats[i - 3]); 90 | // } 91 | 92 | vector mats(SIZE-2); 93 | #pragma omp parallel for num_threads(thrd_num) 94 | for (ll i = 2; i < SIZE; i++) { 95 | mats[i-2] = initMatrix(i); 96 | } 97 | 98 | data[0] = 1; 99 | data[1] = 1; 100 | // 
reverse(mats.begin(), mats.end()); 101 | rscan(mats, SIZE - 2); 102 | // reverse(mats.begin(),mats.end()); 103 | #pragma omp parallel for num_threads(thrd_num) 104 | for (ll i = 2; i < SIZE; i++) { 105 | // matrix mt = initMatrix(2); 106 | // for (ll j = 3; j <= i; j++) { 107 | // mt = mt * initMatrix(j); 108 | // } 109 | // matrix mt = initEMatrix(); 110 | // for (ll j = i; j >= 2; j--) { 111 | // mt = mt * initMatrix(j); 112 | // } 113 | 114 | // data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod + 115 | // (4 * mt[0][3]) % mod) % 116 | // mod; 117 | data[i] = 118 | (mats[i - 2][0][0] + mats[i - 2][0][1] + 119 | (3 * mats[i - 2][0][2]) % mod + (4 * mats[i - 2][0][3]) % mod) % 120 | mod; 121 | } 122 | // reverse(data.begin()+2,data.end()); 123 | // add your codes end 124 | t = omp_get_wtime() - t; 125 | printf("time %f %d\n", t, SIZE); 126 | 127 | for (ll i = 0; i < SIZE; i++) { 128 | assert (data[i] == test[i]); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /openmp/midterm_exam/build/series_of_numbers_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/midterm_exam/build/series_of_numbers_2 -------------------------------------------------------------------------------- /openmp/midterm_exam/build/series_of_numbers_2 copy 2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #define ll long long 9 | using namespace std; 10 | 11 | // add your codes begin 12 | #define SIZE 10000000 13 | const ll mod = 1e8 + 10; 14 | const ll thrd_num = 100; 15 | 16 | typedef vector> matrix; 17 | 18 | matrix initMatrix(ll num) { 19 | matrix mat = vector(4, vector(4, (ll)0)); 20 | mat[0][0] = 1; 21 | mat[0][1] = 2; 22 | mat[0][2] = num; 23 | mat[0][3] = 1; 24 | 
mat[1][0] = 1; 25 | mat[2][2] = 1; 26 | mat[3][3] = 1; 27 | return mat; 28 | } 29 | 30 | matrix initEMatrix() { 31 | matrix mat = vector(4, vector(4, (ll)0)); 32 | mat[0][0] = 1; 33 | mat[1][1] = 1; 34 | mat[2][2] = 1; 35 | mat[3][3] = 1; 36 | return mat; 37 | } 38 | 39 | matrix operator*(const matrix& a, const matrix& b) { 40 | ll n = a.size(); 41 | ll m = b[0].size(); 42 | ll l = b.size(); 43 | matrix c(n, vector(m)); 44 | for (ll i = 0; i < n; i++) { 45 | for (ll j = 0; j < m; j++) { 46 | for (ll k = 0; k < l; k++) { 47 | c[i][j] += (a[i][k] * b[k][j]) % mod; 48 | c[i][j] %= mod; 49 | } 50 | } 51 | } 52 | return c; 53 | } 54 | 55 | void rscan(vector& data, ll size) { 56 | if (size == 1) return; 57 | vector twoSum(size / 2); 58 | #pragma omp parallel for num_threads(thrd_num) 59 | for (ll i = 0; i < twoSum.size(); i++) { 60 | twoSum[i] = data[2 * i + 1] * data[i * 2]; 61 | } 62 | rscan(twoSum, size / 2); 63 | #pragma omp parallel for num_threads(thrd_num) 64 | for (ll i = 1; i < size; i += 2) { 65 | data[i] = twoSum[i / 2]; 66 | if (i + 1 < size) { 67 | data[i + 1] = data[i + 1]*twoSum[(i + 1) / 2 - 1]; 68 | } 69 | } 70 | } 71 | // add your codes end 72 | 73 | int main() { 74 | vector test(SIZE); 75 | vector data(SIZE); 76 | test[0] = 1; 77 | test[1] = 1; 78 | for (ll i = 2; i < SIZE; i++) { 79 | test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod; 80 | test[i] = (test[i] + (3 * i) % mod + 4) % mod; 81 | } 82 | double t = omp_get_wtime(); 83 | // add your codes begin 84 | // vector mats; 85 | // mats.push_back(initMatrix(2)); 86 | 87 | // for (ll i = 3; i < SIZE; i++) { 88 | // matrix c = initMatrix(i); 89 | // mats.push_back(c * mats[i - 3]); 90 | // } 91 | 92 | vector mats; 93 | for (ll i = 2; i < SIZE; i++) { 94 | mats.push_back(initMatrix(i)); 95 | } 96 | 97 | data[0] = 1; 98 | data[1] = 1; 99 | // reverse(mats.begin(), mats.end()); 100 | rscan(mats, SIZE - 2); 101 | // reverse(mats.begin(),mats.end()); 102 | for (ll i = 2; i < SIZE; i++) { 103 | // 
matrix mt = initMatrix(2); 104 | // for (ll j = 3; j <= i; j++) { 105 | // mt = mt * initMatrix(j); 106 | // } 107 | // matrix mt = initEMatrix(); 108 | // for (ll j = i; j >= 2; j--) { 109 | // mt = mt * initMatrix(j); 110 | // } 111 | 112 | // data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod + 113 | // (4 * mt[0][3]) % mod) % 114 | // mod; 115 | data[i] = 116 | (mats[i - 2][0][0] + mats[i - 2][0][1] + 117 | (3 * mats[i - 2][0][2]) % mod + (4 * mats[i - 2][0][3]) % mod) % 118 | mod; 119 | } 120 | // reverse(data.begin()+2,data.end()); 121 | // add your codes end 122 | t = omp_get_wtime() - t; 123 | printf("time %f %d\n", t, SIZE); 124 | 125 | for (ll i = 0; i < SIZE; i++) { 126 | if (data[i] != test[i]) { 127 | cout << i << " "; 128 | }; 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /openmp/midterm_exam/build/series_of_numbers_2 copy.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #define ll long long 9 | using namespace std; 10 | 11 | // add your codes begin 12 | #define SIZE 1000 13 | const int mod = 1e8 + 10; 14 | const int thrd_num = 20; 15 | 16 | typedef vector> matrix; 17 | 18 | matrix initMatrix(int num) { 19 | matrix mat = vector(4, vector(4, 0)); 20 | mat[0][0] = 1; 21 | mat[0][1] = 2; 22 | mat[0][2] = num; 23 | mat[0][3] = 1; 24 | mat[1][0] = 1; 25 | mat[2][2] = 1; 26 | mat[3][3] = 1; 27 | return mat; 28 | } 29 | 30 | matrix initEMatrix() { 31 | matrix mat = vector(4, vector(4, 0)); 32 | mat[0][0] = 1; 33 | mat[1][1] = 1; 34 | mat[2][2] = 1; 35 | mat[3][3] = 1; 36 | return mat; 37 | } 38 | 39 | matrix operator*(const matrix& a, const matrix& b) { 40 | int n = a.size(); 41 | int m = b[0].size(); 42 | int l = b.size(); 43 | matrix c(n, vector(m)); 44 | for (int i = 0; i < n; i++) { 45 | for (int j = 0; j < m; j++) { 46 | for (int k = 0; k < l; k++) { 47 | c[i][j] += 
(a[i][k] * b[k][j]) % mod; 48 | c[i][j] %= mod; 49 | } 50 | } 51 | } 52 | return c; 53 | } 54 | 55 | void rscan(vector& data, int size) { 56 | if (size == 1) return; 57 | vector twoSum(size / 2); 58 | #pragma omp parallel for num_threads(thrd_num) 59 | for (int i = 0; i < twoSum.size(); i++) { 60 | twoSum[i] = data[i * 2] * data[2 * i + 1]; 61 | } 62 | rscan(twoSum, size / 2); 63 | #pragma omp parallel for num_threads(thrd_num) 64 | for (int i = 1; i < size; i += 2) { 65 | data[i] = twoSum[i / 2]; 66 | if (i + 1 < size) { 67 | data[i + 1] = twoSum[(i + 1) / 2 - 1] * data[i + 1]; 68 | } 69 | } 70 | } 71 | // add your codes end 72 | 73 | int main() { 74 | vector test(SIZE); 75 | vector data(SIZE); 76 | test[0] = 1; 77 | test[1] = 1; 78 | for (int i = 2; i < SIZE; i++) { 79 | test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod; 80 | test[i] = (test[i] + (3 * i) % mod + 4) % mod; 81 | } 82 | double t = omp_get_wtime(); 83 | // add your codes begin 84 | vector mats; 85 | for (int i = 2; i < SIZE; i++) { 86 | mats.push_back(initMatrix(i)); 87 | } 88 | data[0] = 1; 89 | data[1] = 1; 90 | // reverse(mats.begin(),mats.end()); 91 | // rscan(mats, SIZE - 2); 92 | // reverse(mats.begin(),mats.end()); 93 | for (int i = 2; i < SIZE; i++) { 94 | // matrix mt = initMatrix(2); 95 | // for (int j = 3; j <= i; j++) { 96 | // mt = mt * initMatrix(j); 97 | // } 98 | matrix mt = initEMatrix(); 99 | for (int j = i; j >= 2; j--) { 100 | mt = mt * initMatrix(j); 101 | } 102 | 103 | data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod + 104 | (4 * mt[0][3]) % mod) % 105 | mod; 106 | } 107 | // reverse(data.begin()+2,data.end()); 108 | // add your codes end 109 | t = omp_get_wtime() - t; 110 | printf("time %f %d\n", t, SIZE); 111 | 112 | for (int i = 0; i < SIZE; i++) { 113 | if(data[i] != test[i]){ 114 | cout< 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #define ll long long 9 | using namespace std; 10 | 11 | // add your codes begin 12 | #define SIZE 1000 13 
| const int mod = 1e8 + 10; 14 | const int thrd_num = 20; 15 | 16 | typedef vector> matrix; 17 | 18 | matrix initMatrix(int num) { 19 | matrix mat = vector(4, vector(4, 0)); 20 | mat[0][0] = 1; 21 | mat[0][1] = 2; 22 | mat[0][2] = num; 23 | mat[0][3] = 1; 24 | mat[1][0] = 1; 25 | mat[2][2] = 1; 26 | mat[3][3] = 1; 27 | return mat; 28 | } 29 | 30 | matrix initEMatrix() { 31 | matrix mat = vector(4, vector(4, 0)); 32 | mat[0][0] = 1; 33 | mat[1][1] = 1; 34 | mat[2][2] = 1; 35 | mat[3][3] = 1; 36 | return mat; 37 | } 38 | 39 | matrix operator*(const matrix& a, const matrix& b) { 40 | int n = a.size(); 41 | int m = b[0].size(); 42 | int l = b.size(); 43 | matrix c(n, vector(m)); 44 | for (int i = 0; i < n; i++) { 45 | for (int j = 0; j < m; j++) { 46 | for (int k = 0; k < l; k++) { 47 | c[i][j] += (a[i][k] * b[k][j]) % mod; 48 | c[i][j] %= mod; 49 | } 50 | } 51 | } 52 | return c; 53 | } 54 | 55 | void rscan(vector& data, int size) { 56 | if (size == 1) return; 57 | vector twoSum(size / 2); 58 | #pragma omp parallel for num_threads(thrd_num) 59 | for (int i = 0; i < twoSum.size(); i++) { 60 | twoSum[i] = data[i * 2] * data[2 * i + 1]; 61 | } 62 | rscan(twoSum, size / 2); 63 | #pragma omp parallel for num_threads(thrd_num) 64 | for (int i = 1; i < size; i += 2) { 65 | data[i] = twoSum[i / 2]; 66 | if (i + 1 < size) { 67 | data[i + 1] = twoSum[(i + 1) / 2 - 1] * data[i + 1]; 68 | } 69 | } 70 | } 71 | // add your codes end 72 | 73 | int main() { 74 | vector test(SIZE); 75 | vector data(SIZE); 76 | test[0] = 1; 77 | test[1] = 1; 78 | for (int i = 2; i < SIZE; i++) { 79 | test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod; 80 | test[i] = (test[i] + (3 * i) % mod + 4) % mod; 81 | } 82 | double t = omp_get_wtime(); 83 | // add your codes begin 84 | vector mats; 85 | for (int i = 2; i < SIZE; i++) { 86 | mats.push_back(initMatrix(i)); 87 | } 88 | data[0] = 1; 89 | data[1] = 1; 90 | // reverse(mats.begin(),mats.end()); 91 | // rscan(mats, SIZE - 2); 92 | // 
reverse(mats.begin(),mats.end()); 93 | for (int i = 2; i < SIZE; i++) { 94 | // matrix mt = initMatrix(2); 95 | // for (int j = 3; j <= i; j++) { 96 | // mt = mt * initMatrix(j); 97 | // } 98 | matrix mt = initEMatrix(); 99 | for (int j = i; j >= 2; j--) { 100 | mt = mt * initMatrix(j); 101 | } 102 | 103 | data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod + 104 | (4 * mt[0][3]) % mod) % 105 | mod; 106 | } 107 | // reverse(data.begin()+2,data.end()); 108 | // add your codes end 109 | t = omp_get_wtime() - t; 110 | printf("time %f %d\n", t, SIZE); 111 | 112 | for (int i = 0; i < SIZE; i++) { 113 | if(data[i] != test[i]){ 114 | cout< 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #define ll long long 9 | using namespace std; 10 | 11 | // add your codes begin 12 | #define SIZE 10000000 13 | const ll mod = 1e8 + 10; 14 | const ll thrd_num = 60; 15 | 16 | typedef vector> matrix; 17 | 18 | matrix initMatrix(ll num) { 19 | matrix mat = vector(4, vector(4, (ll)0)); 20 | mat[0][0] = 1; 21 | mat[0][1] = 2; 22 | mat[0][2] = num; 23 | mat[0][3] = 1; 24 | mat[1][0] = 1; 25 | mat[2][2] = 1; 26 | mat[3][3] = 1; 27 | return mat; 28 | } 29 | 30 | matrix initEMatrix() { 31 | matrix mat = vector(4, vector(4, (ll)0)); 32 | mat[0][0] = 1; 33 | mat[1][1] = 1; 34 | mat[2][2] = 1; 35 | mat[3][3] = 1; 36 | return mat; 37 | } 38 | 39 | matrix operator*(const matrix& a, const matrix& b) { 40 | ll n = a.size(); 41 | ll m = b[0].size(); 42 | ll l = b.size(); 43 | matrix c(n, vector(m)); 44 | for (ll i = 0; i < n; i++) { 45 | for (ll j = 0; j < m; j++) { 46 | for (ll k = 0; k < l; k++) { 47 | c[i][j] += (a[i][k] * b[k][j]) % mod; 48 | c[i][j] %= mod; 49 | } 50 | } 51 | } 52 | return c; 53 | } 54 | 55 | void rscan(vector& data, ll size) { 56 | if (size == 1) return; 57 | vector twoSum(size / 2); 58 | #pragma omp parallel for num_threads(thrd_num) 59 | for (ll i = 0; i < twoSum.size(); i++) { 60 | twoSum[i] = data[2 * i + 1] * data[i * 2]; 61 | } 62 | rscan(twoSum, 
size / 2); 63 | #pragma omp parallel for num_threads(thrd_num) 64 | for (ll i = 1; i < size; i += 2) { 65 | data[i] = twoSum[i / 2]; 66 | if (i + 1 < size) { 67 | data[i + 1] = data[i + 1] * twoSum[(i + 1) / 2 - 1]; 68 | } 69 | } 70 | } 71 | // add your codes end 72 | 73 | int main() { 74 | vector test(SIZE); 75 | vector data(SIZE); 76 | test[0] = 1; 77 | test[1] = 1; 78 | for (ll i = 2; i < SIZE; i++) { 79 | test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod; 80 | test[i] = (test[i] + (3 * i) % mod + 4) % mod; 81 | } 82 | double t = omp_get_wtime(); 83 | // add your codes begin 84 | vector mats(SIZE - 2); 85 | // 初始化矩阵数组 86 | // 这里需注意要逆序存储 87 | #pragma omp parallel for num_threads(thrd_num) 88 | for (ll i = 2; i < SIZE; i++) { 89 | mats[i - 2] = initMatrix(i); 90 | } 91 | data[0] = 1; 92 | data[1] = 1; 93 | // 并行计算前缀积 94 | rscan(mats, SIZE - 2); 95 | // 取出最终的结果 96 | #pragma omp parallel for num_threads(thrd_num) 97 | for (ll i = 2; i < SIZE; i++) { 98 | data[i] = 99 | (mats[i - 2][0][0] + mats[i - 2][0][1] + 100 | (3 * mats[i - 2][0][2]) % mod + (4 * mats[i - 2][0][3]) % mod) % 101 | mod; 102 | } 103 | // add your codes end 104 | t = omp_get_wtime() - t; 105 | printf("time %f %d\n", t, SIZE); 106 | 107 | for (ll i = 0; i < SIZE; i++) { 108 | assert(data[i] == test[i]); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /openmp/midterm_exam/softmax.sh: -------------------------------------------------------------------------------- 1 | size=10000000 2 | 3 | g++ -o segment_softmax -fopenmp -DSIZE=$size segment_softmax.cpp && timeout 60s time ./segment_softmax 4 | 5 | -------------------------------------------------------------------------------- /openmp/midterm_exam/submit/series_of_numbers.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #define ll long long 9 | using namespace std; 10 | 
11 | // add your codes begin 12 | #define SIZE 10000000 13 | const ll mod = 1e8 + 10; 14 | const ll thrd_num = 20; 15 | 16 | typedef vector> matrix; 17 | 18 | matrix initMatrix(ll num) { 19 | matrix mat = vector(4, vector(4, (ll)0)); 20 | mat[0][0] = 1; 21 | mat[0][1] = 2; 22 | mat[0][2] = num; 23 | mat[0][3] = 1; 24 | mat[1][0] = 1; 25 | mat[2][2] = 1; 26 | mat[3][3] = 1; 27 | return mat; 28 | } 29 | 30 | matrix initEMatrix() { 31 | matrix mat = vector(4, vector(4, (ll)0)); 32 | mat[0][0] = 1; 33 | mat[1][1] = 1; 34 | mat[2][2] = 1; 35 | mat[3][3] = 1; 36 | return mat; 37 | } 38 | 39 | matrix operator*(const matrix& a, const matrix& b) { 40 | ll n = a.size(); 41 | ll m = b[0].size(); 42 | ll l = b.size(); 43 | matrix c(n, vector(m)); 44 | for (ll i = 0; i < n; i++) { 45 | for (ll j = 0; j < m; j++) { 46 | for (ll k = 0; k < l; k++) { 47 | c[i][j] += (a[i][k] * b[k][j]) % mod; 48 | c[i][j] %= mod; 49 | } 50 | } 51 | } 52 | return c; 53 | } 54 | 55 | void rscan(vector& data, ll size) { 56 | if (size == 1) return; 57 | vector twoSum(size / 2); 58 | #pragma omp parallel for num_threads(thrd_num) 59 | for (ll i = 0; i < twoSum.size(); i++) { 60 | twoSum[i] = data[2 * i + 1] * data[i * 2]; 61 | } 62 | rscan(twoSum, size / 2); 63 | #pragma omp parallel for num_threads(thrd_num) 64 | for (ll i = 1; i < size; i += 2) { 65 | data[i] = twoSum[i / 2]; 66 | if (i + 1 < size) { 67 | data[i + 1] = data[i + 1]*twoSum[(i + 1) / 2 - 1]; 68 | } 69 | } 70 | } 71 | // add your codes end 72 | 73 | int main() { 74 | vector test(SIZE); 75 | vector data(SIZE); 76 | test[0] = 1; 77 | test[1] = 1; 78 | for (ll i = 2; i < SIZE; i++) { 79 | test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod; 80 | test[i] = (test[i] + (3 * i) % mod + 4) % mod; 81 | } 82 | double t = omp_get_wtime(); 83 | // add your codes begin 84 | // vector mats; 85 | // mats.push_back(initMatrix(2)); 86 | 87 | // for (ll i = 3; i < SIZE; i++) { 88 | // matrix c = initMatrix(i); 89 | // mats.push_back(c * mats[i - 3]); 
90 | // } 91 | 92 | vector mats(SIZE-2); 93 | #pragma omp parallel for num_threads(thrd_num) 94 | for (ll i = 2; i < SIZE; i++) { 95 | mats[i-2] = initMatrix(i); 96 | } 97 | 98 | data[0] = 1; 99 | data[1] = 1; 100 | // reverse(mats.begin(), mats.end()); 101 | rscan(mats, SIZE - 2); 102 | // reverse(mats.begin(),mats.end()); 103 | #pragma omp parallel for num_threads(thrd_num) 104 | for (ll i = 2; i < SIZE; i++) { 105 | // matrix mt = initMatrix(2); 106 | // for (ll j = 3; j <= i; j++) { 107 | // mt = mt * initMatrix(j); 108 | // } 109 | // matrix mt = initEMatrix(); 110 | // for (ll j = i; j >= 2; j--) { 111 | // mt = mt * initMatrix(j); 112 | // } 113 | 114 | // data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod + 115 | // (4 * mt[0][3]) % mod) % 116 | // mod; 117 | data[i] = 118 | (mats[i - 2][0][0] + mats[i - 2][0][1] + 119 | (3 * mats[i - 2][0][2]) % mod + (4 * mats[i - 2][0][3]) % mod) % 120 | mod; 121 | } 122 | // reverse(data.begin()+2,data.end()); 123 | // add your codes end 124 | t = omp_get_wtime() - t; 125 | printf("time %f %d\n", t, SIZE); 126 | 127 | for (ll i = 0; i < SIZE; i++) { 128 | assert (data[i] == test[i]); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /openmp/notes/API.md: -------------------------------------------------------------------------------- 1 | ## 常用API 2 | 3 | ```c++ 4 | #pragma omp single 5 | #pragma omp task 6 | #pragma omp tasknowait 7 | 8 | #pragma omp parallel for default(shared) private(c,eps) 9 | 10 | ``` 11 | #pragma omp task 12 | 定义一个显式的任务,可能会被遇到的线程马上执行,也可能被延迟给线程组内其他线程来执行。任务的执行,依赖于OpenMP的任务调度。 13 | 14 | #pragma omp single 15 | single指令指定区域的代码只能由一组线程中的一个执行。在处理非线程安全的代码段(如I/O)时可能该指令非常有用 16 | 17 | #pragma omp section 18 | sections将一个任务分成独立的几个section,每个由不同的线程并行处理。 -------------------------------------------------------------------------------- /openmp/notes/wordsharing.md: -------------------------------------------------------------------------------- 1 | ## 
WorkSharing 2 | 可以通过omp的API来实现多线程执行循环 3 | ### for 4 | ```c++ 5 | #pragma omp parallel 6 | #pragma omp for 7 | for(i=0;i