├── README.md
├── openacc
    ├── .vscode
    │   ├── settings.json
    │   └── tasks.json
    ├── assign-floyd
    │   ├── ans
    │   │   ├── assign-floyd.2023-05-31.tgz
    │   │   ├── device.sh
    │   │   ├── floyd.cpp
    │   │   ├── floyd.exe3
    │   │   ├── floyd.log3
    │   │   ├── floyd_multidevice copy 2.cpp
    │   │   ├── floyd_multidevice copy.cpp
    │   │   ├── floyd_multidevice.cpp
    │   │   ├── makefile
    │   │   └── run.sh
    │   ├── assign-floyd.2023-05-31.tgz
    │   ├── device.sh
    │   ├── floyd.cpp
    │   ├── floyd.exe3
    │   ├── floyd.log3
    │   ├── floyd_multidevice.cpp
    │   ├── floyd_optimize.cpp
    │   ├── makefile
    │   └── run.sh
    ├── exam-floyd
    │   ├── ans
    │   │   ├── device.sh
    │   │   ├── exam-floyd.2023-05-31.tgz
    │   │   ├── floyd copy 10.cpp
    │   │   ├── floyd copy 11.cpp
    │   │   ├── floyd copy 12.cpp
    │   │   ├── floyd copy 13.cpp
    │   │   ├── floyd copy 14.cpp
    │   │   ├── floyd copy 15.cpp
    │   │   ├── floyd copy 16.cpp
    │   │   ├── floyd copy 17.cpp
    │   │   ├── floyd copy 2.cpp
    │   │   ├── floyd copy 3.cpp
    │   │   ├── floyd copy 4.cpp
    │   │   ├── floyd copy 5.cpp
    │   │   ├── floyd copy 6.cpp
    │   │   ├── floyd copy 7.cpp
    │   │   ├── floyd copy 8.cpp
    │   │   ├── floyd copy 9.cpp
    │   │   ├── floyd copy.cpp
    │   │   ├── floyd.cpp
    │   │   ├── floyd.exe4
    │   │   ├── floyd.log4
    │   │   ├── makefile
    │   │   └── run.sh
    │   ├── device.sh
    │   ├── exam-floyd.2023-05-31.tgz
    │   ├── floyd copy.log4
    │   ├── floyd.cpp
    │   ├── floyd.exe4
    │   ├── floyd.log4
    │   ├── makefile
    │   ├── output.i
    │   ├── output.o
    │   ├── output.s
    │   └── run.sh
    ├── image
    │   └── clauses.png
    ├── lab-floyd
    │   ├── ans
    │   │   ├── device.sh
    │   │   ├── floyd.cpp
    │   │   ├── floyd.exe0
    │   │   ├── floyd.exe1
    │   │   ├── floyd.exe2
    │   │   ├── floyd.log0
    │   │   ├── floyd.log1
    │   │   ├── floyd.log2
    │   │   ├── lab-floyd.2023-05-24.tgz
    │   │   ├── makefile
    │   │   └── run.sh
    │   ├── device.sh
    │   ├── floyd.exe0
    │   ├── floyd.exe1
    │   ├── floyd.exe2
    │   ├── floyd.log0
    │   ├── floyd.log1
    │   ├── floyd.log2
    │   ├── floyd_managed.cpp
    │   ├── floyd_multicore.cpp
    │   ├── floyd_multidevice.cpp
    │   ├── floyd_optimize.cpp
    │   ├── floyd_serial.cpp
    │   ├── lab-floyd.2023-05-24.tgz
    │   ├── makefile
    │   └── run.sh
    └── note.md
└── openmp
    ├── .vscode
        ├── settings.json
        └── tasks.json
    ├── assign_prime
        ├── build
        │   ├── prime
        │   └── prime_old
        ├── prime
        ├── prime.cpp
        ├── prime_old.cpp
        ├── prime_solution
        ├── res.log
        ├── run.sh
        ├── temp
        │   ├── assign_01_prime
        │   │   ├── build
        │   │   │   ├── prime
        │   │   │   ├── primeSerial
        │   │   │   └── primexyy
        │   │   ├── prime.cpp
        │   │   └── primeSerial.cpp
        │   ├── build
        │   │   └── prime
        │   ├── prime
        │   ├── prime.cpp
        │   ├── prime_solution
        │   ├── run.sh
        │   ├── test.log
        │   └── test.sh
        ├── test.sh
        └── testmy.sh
    ├── assign_sort
        ├── assign_sort.2023-03-29.tgz
        ├── build
        │   ├── build
        │   │   ├── tp
        │   │   ├── tp2
        │   │   └── tp3
        │   ├── sort_radix
        │   ├── sort_radix.cpp
        │   ├── sort_sample
        │   ├── sort_sample copy 2.cpp
        │   ├── sort_sample copy 3.cpp
        │   ├── sort_sample copy.cpp
        │   ├── sort_sample.cpp
        │   ├── tp.cpp
        │   ├── tp2.cpp
        │   └── tp3.cpp
        ├── readme.txt
        ├── run.sh
        ├── sort_radix
        ├── sort_radix.cpp
        ├── sort_radix_solution
        ├── sort_sample
        ├── sort_sample.cpp
        └── sort_sample_solution
    ├── exam_knn
        ├── ans.txt
        ├── build
        │   ├── build
        │   │   ├── knn copy 2
        │   │   └── knn copy 4
        │   ├── knn
        │   ├── knn copy 2.cpp
        │   ├── knn copy 3.cpp
        │   ├── knn copy 4.cpp
        │   ├── knn copy 5.cpp
        │   ├── knn copy 6.cpp
        │   ├── knn copy.cpp
        │   ├── knn.cpp
        │   └── test
        ├── exam_knn.2023-05-10.tgz
        ├── knn
        ├── knn.cpp
        ├── knn_0
        ├── out.txt
        ├── readme.txt
        ├── run.sh
        └── test.cpp
    ├── final_exam
        ├── build
        │   ├── circle
        │   ├── circle copy 2.cpp
        │   ├── circle copy 3.cpp
        │   └── circle copy.cpp
        ├── circle
        ├── circle.cpp
        └── run.sh
    ├── lab_knn
        ├── build
        │   ├── build
        │   │   └── knn
        │   ├── knn
        │   └── knn.cpp
        ├── knn
        ├── knn.cpp
        ├── knn_apx1
        ├── knn_apx2
        ├── knn_apx3
        ├── knn_apx4
        ├── lab_knn.2023-05-17.tgz
        ├── readme.txt
        └── run.sh
    ├── lab_par_for
        ├── DataSharing
        │   ├── Firstprivate
        │   │   ├── fp
        │   │   ├── fp.cpp
        │   │   └── makefile
        │   └── Lastprivate
        │   │   ├── fp
        │   │   ├── fp.cpp
        │   │   └── makefile
        ├── Intro05
        │   ├── hello
        │   ├── hello.cpp
        │   └── makefile
        ├── PI
        │   ├── makefile
        │   ├── pi
        │   └── pi.cpp
        ├── PIv1
        │   ├── makefile
        │   ├── pi
        │   └── pi.cpp
        ├── PIv2
        │   ├── makefile
        │   ├── pi
        │   └── pi.cpp
        ├── PIv3
        │   ├── makefile
        │   ├── pi
        │   └── pi.cpp
        ├── PIv4
        │   ├── build
        │   │   └── pi
        │   ├── makefile
        │   ├── pi
        │   └── pi.cpp
        ├── PIv5
        │   ├── makefile
        │   ├── pi
        │   └── pi.cpp
        ├── hello
        │   ├── hello
        │   ├── hello.cpp
        │   └── makefile
        ├── lab_par_for.tgz
        ├── par_for
        │   ├── makefile
        │   ├── par_for
        │   └── par_for.cpp
        └── readme.txt
    ├── lab_pi_integral
        ├── build
        │   ├── build
        │   │   └── tp
        │   ├── pi_integral_0
        │   ├── pi_integral_1
        │   ├── pi_integral_1_1.cpp
        │   ├── pi_integral_2
        │   └── tp.cpp
        ├── lab_pi_integral.tgz
        ├── pi_integral_0.cpp
        ├── pi_integral_1.cpp
        ├── pi_integral_2.cpp
        ├── readme.txt
        ├── run.sh
        └── test.cpp
    ├── lab_pi_rnd
        ├── build
        │   ├── pi_rnd_1
        │   ├── pi_rnd_2 copy 2.cpp
        │   ├── pi_rnd_2 copy 3.cpp
        │   ├── pi_rnd_2 copy.cpp
        │   └── pi_rnd_2.cpp
        ├── lab_pi_rnd.2023-04-12.tgz
        ├── pi_rnd_0
        ├── pi_rnd_0.cpp
        ├── pi_rnd_1
        ├── pi_rnd_1.cpp
        ├── pi_rnd_2
        ├── pi_rnd_2.cpp
        ├── readme.txt
        └── run.sh
    ├── lab_scan_frag
        ├── lab_scan_frag.2023-04-19.tgz
        ├── readme.txt
        ├── run.sh
        ├── scan_frag
        ├── scan_frag_0
        └── scan_frag_0.cpp
    ├── lab_scan_link
        ├── build
        │   └── scan_link_0
        ├── lab_scan_link.2023-04-26.tgz
        ├── readme.txt
        ├── run.sh
        ├── scan_link
        ├── scan_link_0
        └── scan_link_0.cpp
    ├── lab_scan_tree
        ├── lab_scan_tree.2023-05-06.tgz
        ├── readme.txt
        ├── run.sh
        ├── scan_tree
        ├── scan_tree_0
        └── scan_tree_0.cpp
    ├── lab_scan_vect
        ├── build
        │   ├── scan_vect_1
        │   ├── scan_vect_2
        │   └── scan_vect_2.cpp
        ├── lab_scan_vect.tgz
        ├── readme.txt
        ├── run.sh
        ├── scan_vect
        ├── scan_vect_0
        ├── scan_vect_0.cpp
        ├── scan_vect_1
        ├── scan_vect_1.cpp
        ├── scan_vect_2
        └── scan_vect_2.cpp
    ├── midterm_exam
        ├── build
        │   ├── segment_softmax
        │   ├── segment_softmax copy.cpp
        │   ├── segment_softmax.cpp
        │   ├── series_of_numbers
        │   ├── series_of_numbers copy
        │   ├── series_of_numbers copy 2.cpp
        │   ├── series_of_numbers copy.cpp
        │   ├── series_of_numbers.cpp
        │   ├── series_of_numbers_2
        │   ├── series_of_numbers_2 copy 2.cpp
        │   ├── series_of_numbers_2 copy.cpp
        │   └── series_of_numbers_2.cpp
        ├── segment_softmax
        ├── segment_softmax.cpp
        ├── series.sh
        ├── series_of_numbers
        ├── series_of_numbers.cpp
        ├── softmax.sh
        └── submit
        │   ├── segment_softmax.cpp
        │   └── series_of_numbers.cpp
    └── notes
        ├── API.md
        ├── wordsharing.md
        └── 同步结构.md


/README.md:
--------------------------------------------------------------------------------
1 | # ParallelComputingCourse
2 | 山东大学计算机学院 并行计算课程实验
3 | 
4 | 所有代码仅供参考，并不保证正确性。
5 | 
6 | 如果对您有用，希望能得到您的star。
7 | 


--------------------------------------------------------------------------------
/openacc/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "files.associations": {
3 |         "array": "cpp",
4 |         "string": "cpp",
5 |         "string_view": "cpp",
6 |         "vector": "cpp"
7 |     }
8 | }


--------------------------------------------------------------------------------
/openacc/.vscode/tasks.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "tasks": [
 3 |         {
 4 |             "type": "cppbuild",
 5 |             "label": "C/C++: g++ 生成活动文件",
 6 |             "command": "/usr/bin/g++",
 7 |             "args": [
 8 |                 "-fdiagnostics-color=always",
 9 |                 "-g",
10 |                 "${file}",
11 |                 "-o",
12 |                 "${fileDirname}/${fileBasenameNoExtension}"
13 |             ],
14 |             "options": {
15 |                 "cwd": "${fileDirname}"
16 |             },
17 |             "problemMatcher": [
18 |                 "$gcc"
19 |             ],
20 |             "group": {
21 |                 "kind": "build",
22 |                 "isDefault": true
23 |             },
24 |             "detail": "调试器生成的任务。"
25 |         }
26 |     ],
27 |     "version": "2.0.0"
28 | }


--------------------------------------------------------------------------------
/openacc/assign-floyd/ans/assign-floyd.2023-05-31.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/assign-floyd/ans/assign-floyd.2023-05-31.tgz


--------------------------------------------------------------------------------
/openacc/assign-floyd/ans/device.sh:
--------------------------------------------------------------------------------
 1 | devreq=$1
 2 | if [ "$devreq" == "" ] ; then
 3 |   devreq=2
 4 | fi
 5 | 
 6 | unset CUDA_VISIBLE_DEVICES
 7 | numdev=`nvaccelinfo | grep -e "^Device Number:" | wc -l`
 8 | if [ $devreq -ge $numdev ] ; then
 9 |   echo -e "unset CUDA_VISIBLE_DEVICES"
10 | else
11 |   let devnum=$$%$numdev
12 |   devlst=$devnum
13 |   let devreq=$devreq-1
14 |   while [ $devreq -gt 0 ] ; do
15 |     let devnum=($devnum+1)%$numdev
16 |     devlst=$devlst,$devnum
17 |     let devreq=$devreq-1
18 |   done
19 |   export CUDA_VISIBLE_DEVICES=$devlst
20 |   echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
21 | fi
22 | 
23 | 


--------------------------------------------------------------------------------
/openacc/assign-floyd/ans/floyd.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <omp.h>
 4 | #include <openacc.h>
 5 | #include <math.h>
 6 | #include <stdio.h>
 7 | #include <stdlib.h>
 8 | #include <assert.h>
 9 | #include <algorithm>
10 | using namespace std;
11 | 
12 | 
13 | // Flat row-major offset of element (i, j) in the SIZE x SIZE matrix.
14 | inline int index(const int i, const int j) {
15 |   const int row_start = i * SIZE;
16 |   return row_start + j;
17 | }
18 | 
19 | // add your codes begin
20 | // add your codes end
21 | 
22 | 
23 | // Exercise skeleton: builds a random sparse weight matrix, times the (still
24 | // empty) solver section, then prints 20 randomly chosen off-diagonal entries.
25 | int main() {
26 |   const int size2 = SIZE * SIZE;
27 |   float* data = new float[size2];
28 |   // -INF marks "no edge"
29 |   for (float* cell = data; cell != data + size2; ++cell) *cell = -INF;
30 | 
31 |   srand(SIZE);
32 |   int placed = 0;
33 |   while (placed < SIZE * 20) {
34 |     const int prev = rand() % SIZE;
35 |     const int next = rand() % SIZE;
36 |     // diagonal entries and already-weighted edges are rejected and redrawn
37 |     if (prev == next) continue;
38 |     if (data[index(prev, next)] > -INF) continue;
39 |     data[index(prev, next)] = log((rand() % 99 + 1.0) / 100);
40 |     ++placed;
41 |   }
42 | 
43 |   const double start = omp_get_wtime();
44 |   // add your codes begin
45 |   // add your codes end
46 |   const double elapsed = omp_get_wtime() - start;
47 |   printf("time %f %d\n", elapsed, SIZE);
48 | 
49 |   int shown = 0;
50 |   while (shown < 20) {
51 |     const int prev = rand() % SIZE;
52 |     const int next = rand() % SIZE;
53 |     if (prev == next) continue;  // only off-diagonal entries are reported
54 |     printf("test %d %d %f\n", prev, next, data[index(prev, next)]);
55 |     ++shown;
56 |   }
57 | }
58 | 
59 | 


--------------------------------------------------------------------------------
/openacc/assign-floyd/ans/floyd.exe3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/assign-floyd/ans/floyd.exe3


--------------------------------------------------------------------------------
/openacc/assign-floyd/ans/floyd.log3:
--------------------------------------------------------------------------------
 1 | index(int, int):
 2 |      13, Generating implicit acc routine seq
 3 |          Generating NVIDIA GPU code
 4 | main:
 5 |       5, include "math.h"
 6 |           15, include "math.h"
 7 |                36, include "cmath"
 8 |                     15, include "cmath"
 9 |                        1935, include "specfun.h"
10 |                               45, include "stl_algobase.h"
11 |                                    45, #omp parallel
12 |                                        46, Begin single region
13 |                                            End single region
14 |                                            Barrier
15 |                                        73, Barrier
16 |                                        84, Barrier
17 |      24, Memory set idiom, loop replaced by call to __c_mset4
18 |      56, Generating enter data create(copy[:23040000],dkj[:4800],dik[:4800])
19 |          Generating enter data copyin(data[:23040000])
20 |      59, Generating present(d0[:],d1[:])
21 |          Generating implicit firstprivate(begin,end)
22 |          Generating NVIDIA GPU code
23 |          61, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
24 |      59, Generating implicit copyout(dik[begin:end-begin]) [if not already present]
25 |      61, Generating implicit firstprivate(k)
26 |      65, Generating update self(dik[begin:step])
27 |          Generating present(d1[:],d0[:])
28 |          Generating NVIDIA GPU code
29 |          67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
30 |      65, Generating implicit copyout(dkj[:4800]) [if not already present]
31 |      67, Generating implicit firstprivate(k)
32 |      71, Generating update self(dkj[:4800])
33 |          Generating update device(dkj[:4800],dik[:4800])
34 |          Generating present(d1[:],d0[:])
35 |          Generating implicit firstprivate(begin,end)
36 |          Generating NVIDIA GPU code
37 |          78, #pragma acc loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */
38 |          79,   /* blockIdx.x threadIdx.x tiled */
39 |      71, Generating implicit copyin(dkj[:4800],dik[begin:end-begin]) [if not already present]
40 |      79, Generating implicit private(_T25_5541,_T22_5541)
41 |      99, Generating update self(data[begin*4800:step*4800])
42 |          Generating exit data delete(dkj[:4800],dik[:4800],data[:23040000],copy[:23040000])
43 | float const& std::max<float>(float const&, float const&):
44 |       5, include "math.h"
45 |           15, include "math.h"
46 |                36, include "cmath"
47 |                     15, include "cmath"
48 |                        1935, include "specfun.h"
49 |                               45, include "stl_algobase.h"
50 |                                   255, Generating implicit acc routine seq
51 |                                        Generating NVIDIA GPU code
52 | #num_dev: 1
53 | #num_thread: 1
54 | #device_id: 0 0 4800 4800
55 | time 2.797260 4800
56 | test 1683 3274 -0.706498
57 | test 632 2448 -0.447264
58 | test 842 583 -0.503465
59 | test 807 4278 -0.626579
60 | test 803 353 -0.561426
61 | test 4022 1321 -0.534807
62 | test 2934 2255 -0.506989
63 | test 3334 4036 -0.535419
64 | test 3344 3528 -0.635352
65 | test 4058 965 -0.721051
66 | test 3765 3241 -0.468802
67 | test 2756 3304 -0.441437
68 | test 4583 1289 -0.648130
69 | test 1697 2723 -0.456125
70 | test 4475 3795 -0.479226
71 | test 1303 1358 -0.472469
72 | test 2269 4688 -0.527612
73 | test 1759 1063 -0.515894
74 | test 471 518 -0.486703
75 | test 542 1274 -0.587901
76 | 2.65user 0.24system 0:02.89elapsed 99%CPU (0avgtext+0avgdata 263092maxresident)k
77 | 0inputs+0outputs (3major+73084minor)pagefaults 0swaps
78 | 


--------------------------------------------------------------------------------
/openacc/assign-floyd/ans/floyd_multidevice copy 2.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <openacc.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | 
10 | #include <algorithm>
11 | using namespace std;
12 | // #define SIZE 1000
13 | inline int index(const int i, const int j) { return i * SIZE + j; }  // row-major offset of (i, j)
14 | 
15 | // add your codes begin
16 | 
17 | // add your codes end
18 | 
19 | // Fills a random sparse weight matrix, runs the max-plus Floyd-Warshall update
20 | // across all visible NVIDIA devices, then prints the elapsed time and 20 samples.
21 | int main() {
22 |     const int size2 = SIZE * SIZE;
23 |     float* data = new float[size2];
24 |     for (int i = 0; i < size2; i++) data[i] = -INF;  // -INF marks "no edge"
25 | 
26 |     srand(SIZE);
27 |     for (int i = 0; i < SIZE * 20; i++) {
28 |         int prev = rand() % SIZE;  // first (row) coordinate
29 |         int next = rand() % SIZE;  // second (column) coordinate
30 |         // draw again for a diagonal entry or an edge that already has a weight
31 |         if ((prev == next) || (data[index(prev, next)] > -INF)) {
32 |             i--;
33 |             continue;
34 |         }
35 |         data[index(prev, next)] = log((rand() % 99 + 1.0) / 100);  // edge weight
36 |     }
37 |     double t = omp_get_wtime();
38 | 
39 | // add your codes begin
40 |     // Step k needs the matrix left by step k-1, so the k loop cannot be dealt out
41 |     // to different devices. Every device instead runs all k steps on its own band
42 |     // of rows and is sent pivot row k (through host memory) before each step.
43 |     const int num_dev = max(1, acc_get_num_devices(acc_device_nvidia));
44 | #pragma omp parallel num_threads(num_dev)
45 |     {
46 |         const int id = omp_get_thread_num();
47 |         const int nthrds = omp_get_num_threads();
48 |         acc_set_device_num(id, acc_device_nvidia);
49 |         // rows [begin, end) are the band owned by this thread's device
50 |         const int rows = (SIZE + nthrds - 1) / nthrds;
51 |         const int begin = min(id * rows, SIZE);
52 |         const int end = min(begin + rows, SIZE);
53 |         // copyin only: each device writes back just its own band at the end
54 | #pragma acc data copyin(data[0 : size2])
55 |         {
56 |             for (int k = 0; k < SIZE; k++) {
57 |                 const bool owner = (begin <= k) && (k < end);
58 |                 if (owner) {
59 | #pragma acc update self(data[k * SIZE : SIZE])
60 |                 }
61 |                 // pivot row must be in host memory before the others fetch it
62 | #pragma omp barrier
63 |                 if (!owner) {
64 | #pragma acc update device(data[k * SIZE : SIZE])
65 |                 }
66 | #pragma acc parallel loop gang worker num_workers(4) vector_length(256) present(data[0 : size2])
67 |                 for (int i = begin; i < end; i++) {
68 | #pragma acc loop vector
69 |                     for (int j = 0; j < SIZE; j++) {
70 |                         const float temp = data[index(i, k)] + data[index(k, j)];
71 |                         if (data[index(i, j)] < temp) {
72 |                             data[index(i, j)] = temp;
73 |                         }
74 |                     }
75 |                 }
76 |             }
77 |             // all pivot-row reads from host memory must finish before write-back
78 | #pragma omp barrier
79 |             if (begin < end) {
80 | #pragma acc update self(data[begin * SIZE : (end - begin) * SIZE])
81 |             }
82 |         }
83 |     }
84 |     // add your codes end
85 |     t = omp_get_wtime() - t;
86 |     printf("time %f %d\n", t, SIZE);
87 | 
88 |     for (int i = 0; i < 20; i++) {
89 |         int prev = rand() % SIZE;
90 |         int next = rand() % SIZE;
91 |         if (prev == next) {
92 |             i--;
93 |             continue;
94 |         }
95 |         printf("test %d %d %f\n", prev, next, data[index(prev, next)]);
96 |     }
97 |     delete[] data;
98 | }
99 | 


--------------------------------------------------------------------------------
/openacc/assign-floyd/ans/floyd_multidevice copy.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <openacc.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | 
10 | #include <algorithm>
11 | using namespace std;
12 | // #define SIZE 1000
13 | inline int index(const int i, const int j) { return i * SIZE + j; }  // row-major offset of (i, j)
14 | 
15 | // add your codes begin
16 | 
17 | // add your codes end
18 | 
19 | // Fills a random sparse weight matrix, runs the max-plus Floyd-Warshall update
20 | // across all visible NVIDIA devices, then prints the elapsed time and 20 samples.
21 | int main() {
22 |     const int size2 = SIZE * SIZE;
23 |     float* data = new float[size2];
24 |     for (int i = 0; i < size2; i++) data[i] = -INF;  // -INF marks "no edge"
25 | 
26 |     srand(SIZE);
27 |     for (int i = 0; i < SIZE * 20; i++) {
28 |         int prev = rand() % SIZE;  // first (row) coordinate
29 |         int next = rand() % SIZE;  // second (column) coordinate
30 |         // draw again for a diagonal entry or an edge that already has a weight
31 |         if ((prev == next) || (data[index(prev, next)] > -INF)) {
32 |             i--;
33 |             continue;
34 |         }
35 |         data[index(prev, next)] = log((rand() % 99 + 1.0) / 100);  // edge weight
36 |     }
37 |     double t = omp_get_wtime();
38 | 
39 | // add your codes begin
40 |     // Step k needs the matrix left by step k-1, so the k loop cannot be dealt out
41 |     // to different devices. Every device instead runs all k steps on its own band
42 |     // of rows and is sent pivot row k (through host memory) before each step.
43 |     const int num_dev = max(1, acc_get_num_devices(acc_device_nvidia));
44 | #pragma omp parallel num_threads(num_dev)
45 |     {
46 |         const int id = omp_get_thread_num();
47 |         const int nthrds = omp_get_num_threads();
48 |         acc_set_device_num(id, acc_device_nvidia);
49 |         // rows [begin, end) are the band owned by this thread's device
50 |         const int rows = (SIZE + nthrds - 1) / nthrds;
51 |         const int begin = min(id * rows, SIZE);
52 |         const int end = min(begin + rows, SIZE);
53 |         // copyin only: each device writes back just its own band at the end
54 | #pragma acc data copyin(data[0 : size2])
55 |         {
56 |             for (int k = 0; k < SIZE; k++) {
57 |                 const bool owner = (begin <= k) && (k < end);
58 |                 if (owner) {
59 | #pragma acc update self(data[k * SIZE : SIZE])
60 |                 }
61 |                 // pivot row must be in host memory before the others fetch it
62 | #pragma omp barrier
63 |                 if (!owner) {
64 | #pragma acc update device(data[k * SIZE : SIZE])
65 |                 }
66 | #pragma acc parallel loop gang worker num_workers(4) vector_length(256) present(data[0 : size2])
67 |                 for (int i = begin; i < end; i++) {
68 | #pragma acc loop vector
69 |                     for (int j = 0; j < SIZE; j++) {
70 |                         const float temp = data[index(i, k)] + data[index(k, j)];
71 |                         if (data[index(i, j)] < temp) {
72 |                             data[index(i, j)] = temp;
73 |                         }
74 |                     }
75 |                 }
76 |             }
77 |             // all pivot-row reads from host memory must finish before write-back
78 | #pragma omp barrier
79 |             if (begin < end) {
80 | #pragma acc update self(data[begin * SIZE : (end - begin) * SIZE])
81 |             }
82 |         }
83 |     }
84 |     // add your codes end
85 |     t = omp_get_wtime() - t;
86 |     printf("time %f %d\n", t, SIZE);
87 | 
88 |     for (int i = 0; i < 20; i++) {
89 |         int prev = rand() % SIZE;
90 |         int next = rand() % SIZE;
91 |         if (prev == next) {
92 |             i--;
93 |             continue;
94 |         }
95 |         printf("test %d %d %f\n", prev, next, data[index(prev, next)]);
96 |     }
97 |     delete[] data;
98 | }
99 | 


--------------------------------------------------------------------------------
/openacc/assign-floyd/ans/floyd_multidevice.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <openacc.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | 
10 | #include <algorithm>
11 | using namespace std;
12 | // #define SIZE 1000
13 | inline int index(const int i, const int j) { return i * SIZE + j; }  // row-major offset of (i, j)
14 | 
15 | // add your codes begin
16 | 
17 | // add your codes end
18 | 
19 | // Fills a random sparse weight matrix, runs the max-plus Floyd-Warshall update
20 | // across all visible NVIDIA devices, then prints the elapsed time and 20 samples.
21 | int main() {
22 |     const int size2 = SIZE * SIZE;
23 |     float* data = new float[size2];
24 |     for (int i = 0; i < size2; i++) data[i] = -INF;  // -INF marks "no edge"
25 | 
26 |     srand(SIZE);
27 |     for (int i = 0; i < SIZE * 20; i++) {
28 |         int prev = rand() % SIZE;  // first (row) coordinate
29 |         int next = rand() % SIZE;  // second (column) coordinate
30 |         // draw again for a diagonal entry or an edge that already has a weight
31 |         if ((prev == next) || (data[index(prev, next)] > -INF)) {
32 |             i--;
33 |             continue;
34 |         }
35 |         data[index(prev, next)] = log((rand() % 99 + 1.0) / 100);  // edge weight
36 |     }
37 |     double t = omp_get_wtime();
38 | 
39 | // add your codes begin
40 |     // Step k needs the matrix left by step k-1, so the k loop cannot be dealt out
41 |     // to different devices. Every device instead runs all k steps on its own band
42 |     // of rows and is sent pivot row k (through host memory) before each step.
43 |     const int num_dev = max(1, acc_get_num_devices(acc_device_nvidia));
44 | #pragma omp parallel num_threads(num_dev)
45 |     {
46 |         const int id = omp_get_thread_num();
47 |         const int nthrds = omp_get_num_threads();
48 |         acc_set_device_num(id, acc_device_nvidia);
49 |         // rows [begin, end) are the band owned by this thread's device
50 |         const int rows = (SIZE + nthrds - 1) / nthrds;
51 |         const int begin = min(id * rows, SIZE);
52 |         const int end = min(begin + rows, SIZE);
53 |         // copyin only: each device writes back just its own band at the end
54 | #pragma acc data copyin(data[0 : size2])
55 |         {
56 |             for (int k = 0; k < SIZE; k++) {
57 |                 const bool owner = (begin <= k) && (k < end);
58 |                 if (owner) {
59 | #pragma acc update self(data[k * SIZE : SIZE])
60 |                 }
61 |                 // pivot row must be in host memory before the others fetch it
62 | #pragma omp barrier
63 |                 if (!owner) {
64 | #pragma acc update device(data[k * SIZE : SIZE])
65 |                 }
66 | #pragma acc parallel loop gang worker num_workers(4) vector_length(128) present(data[0 : size2])
67 |                 for (int i = begin; i < end; i++) {
68 | #pragma acc loop vector
69 |                     for (int j = 0; j < SIZE; j++) {
70 |                         // declared inside the vector loop so every lane has its own copy
71 |                         const float temp = data[index(i, k)] + data[index(k, j)];
72 |                         if (data[index(i, j)] < temp) {
73 |                             data[index(i, j)] = temp;
74 |                         }
75 |                     }
76 |                 }
77 |             }
78 |             // all pivot-row reads from host memory must finish before write-back
79 | #pragma omp barrier
80 |             if (begin < end) {
81 | #pragma acc update self(data[begin * SIZE : (end - begin) * SIZE])
82 |             }
83 |         }
84 |     }
85 |     // add your codes end
86 |     t = omp_get_wtime() - t;
87 |     printf("time %f %d\n", t, SIZE);
88 | 
89 |     for (int i = 0; i < 20; i++) {
90 |         int prev = rand() % SIZE;
91 |         int next = rand() % SIZE;
92 |         if (prev == next) {
93 |             i--;
94 |             continue;
95 |         }
96 |         printf("test %d %d %f\n", prev, next, data[index(prev, next)]);
97 |     }
98 |     delete[] data;
99 | }


--------------------------------------------------------------------------------
/openacc/assign-floyd/ans/makefile:
--------------------------------------------------------------------------------
 1 | # Build-and-run targets for the Floyd exercise. Each stage compiles floyd.cpp with
 2 | # nvc++ at its own -DSIZE, sends compiler feedback to floyd.logN, then appends the
 3 | # output of a timed run (aborted after one minute) to the same log.
 4 | 
 5 | # No target names a file it creates, so declare them phony; otherwise a stray
 6 | # file called e.g. "clean" or "serial" would silently turn the target into a no-op.
 7 | .PHONY: device serial multicore managed optimize multidevice all clean
 8 | 
 9 | # Run nvaccelinfo (accelerator information from the NVIDIA HPC SDK).
10 | device:
11 | 	nvaccelinfo
12 | 
13 | # Stage 0: no -mp / -acc flags.
14 | serial:
15 | 	nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all -Mneginfo=all floyd.cpp >floyd.log0 2>&1
16 | 	#nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1
17 | 	timeout 1m time ./floyd.exe0 >>floyd.log0 2>&1
18 | 
19 | # Stage 1: OpenMP and OpenACC both targeting the host cores.
20 | multicore:
21 | 	nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd.cpp >floyd.log1 2>&1
22 | 	#nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1
23 | 	timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1
24 | 
25 | # Stage 2: OpenACC on the GPU with managed memory (-gpu=managed).
26 | managed:
27 | 	nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd.cpp >floyd.log2 2>&1
28 | 	#nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 2>&1
29 | 	timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1
30 | 
31 | # Stage 3: OpenACC on the GPU without -gpu=managed.
32 | optimize:
33 | 	nvc++ -o floyd.exe3 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log3 2>&1
34 | 	#nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1
35 | 	timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1
36 | 
37 | # Stage 4: same compiler flags as stage 3, larger SIZE.
38 | multidevice:
39 | 	nvc++ -o floyd.exe4 -DSIZE=6000 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log4 2>&1
40 | 	#nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1
41 | 	timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1
42 | 
43 | # Wipe old artefacts, then build and run every stage in order.
44 | all: clean serial multicore managed optimize multidevice
45 | 
46 | # Remove executables, profiler reports and logs from earlier runs.
47 | clean:
48 | 	rm -f floyd.exe* floyd.prof* floyd.log*
49 | 
50 | 


--------------------------------------------------------------------------------
/openacc/assign-floyd/ans/run.sh:
--------------------------------------------------------------------------------
 1 | # Driver for the exercise: puts the NVIDIA HPC SDK 23.3 compilers on PATH,
 2 | # selects the GPU(s) to use, removes old artefacts, then builds and runs one stage.
 3 | hpc_sdk_bin=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin
 4 | PATH=$hpc_sdk_bin:$PATH
 5 | export PATH
 6 | 
 7 | # Argument = number of devices to request; the 8-device call is kept for reference.
 8 | source device.sh 1
 9 | #source device.sh 8
10 | make clean
11 | 
12 | # Exactly one stage is enabled at a time; each is capped at one minute.
13 | #timeout 1m time make serial
14 | #timeout 1m time make multicore
15 | #timeout 1m time make managed
16 | timeout 1m time make optimize
17 | #timeout 1m time make multidevice
18 | 
19 | 


--------------------------------------------------------------------------------
/openacc/assign-floyd/assign-floyd.2023-05-31.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/assign-floyd/assign-floyd.2023-05-31.tgz


--------------------------------------------------------------------------------
/openacc/assign-floyd/device.sh:
--------------------------------------------------------------------------------
 1 | devreq=$1
 2 | if [ "$devreq" == "" ] ; then
 3 |   devreq=2
 4 | fi
 5 | 
 6 | unset CUDA_VISIBLE_DEVICES
 7 | numdev=`nvaccelinfo | grep -e "^Device Number:" | wc -l`
 8 | if [ $devreq -ge $numdev ] ; then
 9 |   echo -e "unset CUDA_VISIBLE_DEVICES"
10 | else
11 |   let devnum=$$%$numdev
12 |   devlst=$devnum
13 |   let devreq=$devreq-1
14 |   while [ $devreq -gt 0 ] ; do
15 |     let devnum=($devnum+1)%$numdev
16 |     devlst=$devlst,$devnum
17 |     let devreq=$devreq-1
18 |   done
19 |   export CUDA_VISIBLE_DEVICES=$devlst
20 |   echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
21 | fi
22 | 
23 | 


--------------------------------------------------------------------------------
/openacc/assign-floyd/floyd.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <openacc.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | 
10 | #include <algorithm>
11 | using namespace std;
12 | // #define SIZE 1000
13 | inline int index(const int i, const int j) { const int row_start = i * SIZE; return row_start + j; }  // row-major offset of (i, j)
14 | 
15 | // add your codes begin
16 | 
17 | // add your codes end
18 | 
19 | // Single-device version: fills a random sparse weight matrix, runs the max-plus
20 | // Floyd-Warshall update inside one OpenACC data region, then prints the elapsed
21 | // time and 20 randomly chosen off-diagonal entries.
22 | int main() {
23 |     const int size2 = SIZE * SIZE;
24 |     float* data = new float[size2];
25 |     // -INF marks "no edge"
26 |     for (float* cell = data; cell != data + size2; ++cell) *cell = -INF;
27 | 
28 |     srand(SIZE);
29 |     int placed = 0;
30 |     while (placed < SIZE * 20) {
31 |         const int prev = rand() % SIZE;  // first (row) coordinate
32 |         const int next = rand() % SIZE;  // second (column) coordinate
33 |         // diagonal entries and already-weighted edges are rejected and redrawn
34 |         if (prev == next) continue;
35 |         if (data[index(prev, next)] > -INF) continue;
36 |         data[index(prev, next)] = log((rand() % 99 + 1.0) / 100);
37 |         ++placed;
38 |     }
39 |     const double start = omp_get_wtime();
40 | 
41 | // add your codes begin
42 |     // one data region around the whole k loop: the matrix is copied in before
43 |     // the first pivot step and copied out after the last one
44 | #pragma acc data copy(data[0 : size2])
45 |     {
46 |         for (int k = 0; k < SIZE; ++k) {
47 | #pragma acc parallel loop gang worker num_workers(4) vector_length(128)
48 |             for (int i = 0; i < SIZE; ++i) {
49 | #pragma acc loop vector
50 |                 for (int j = 0; j < SIZE; ++j) {
51 |                     const float via_k = data[i * SIZE + k] + data[k * SIZE + j];
52 |                     // overwrite unless the stored value is already >= the path through k
53 |                     if (!(data[i * SIZE + j] >= via_k)) {
54 |                         data[i * SIZE + j] = via_k;
55 |                     }
56 |                 }
57 |             }
58 |         }
59 |     }
60 | 
61 |     // add your codes end
62 |     const double elapsed = omp_get_wtime() - start;
63 |     printf("time %f %d\n", elapsed, SIZE);
64 | 
65 |     int shown = 0;
66 |     while (shown < 20) {
67 |         const int prev = rand() % SIZE;
68 |         const int next = rand() % SIZE;
69 |         if (prev == next) continue;  // only off-diagonal entries are reported
70 |         printf("test %d %d %f\n", prev, next, data[index(prev, next)]);
71 |         ++shown;
72 |     }
73 | }
74 | 


--------------------------------------------------------------------------------
/openacc/assign-floyd/floyd.exe3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/assign-floyd/floyd.exe3


--------------------------------------------------------------------------------
/openacc/assign-floyd/floyd.log3:
--------------------------------------------------------------------------------
 1 | main:
 2 |      38, Generating copy(data[:23040000]) [if not already present]
 3 |      42, Generating NVIDIA GPU code
 4 |          44, #pragma acc loop gang, worker(4) /* blockIdx.x threadIdx.y */
 5 |          46, #pragma acc loop vector(128) /* threadIdx.x */
 6 |      46, Loop is parallelizable
 7 |          Generating implicit firstprivate(k)
 8 | time 1.254653 4800
 9 | test 1683 3274 -0.706498
10 | test 632 2448 -0.447264
11 | test 842 583 -0.503465
12 | test 807 4278 -0.626579
13 | test 803 353 -0.561426
14 | test 4022 1321 -0.534807
15 | test 2934 2255 -0.506989
16 | test 3334 4036 -0.535419
17 | test 3344 3528 -0.635352
18 | test 4058 965 -0.721051
19 | test 3765 3241 -0.468802
20 | test 2756 3304 -0.441437
21 | test 4583 1289 -0.648130
22 | test 1697 2723 -0.456125
23 | test 4475 3795 -0.479226
24 | test 1303 1358 -0.472469
25 | test 2269 4688 -0.527612
26 | test 1759 1063 -0.515894
27 | test 471 518 -0.486703
28 | test 542 1274 -0.587901
29 | 1.13user 0.26system 0:01.40elapsed 99%CPU (0avgtext+0avgdata 263100maxresident)k
30 | 0inputs+8outputs (0major+50299minor)pagefaults 0swaps
31 | 


--------------------------------------------------------------------------------
/openacc/assign-floyd/floyd_multidevice.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <openacc.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | 
10 | #include <algorithm>
11 | #include <iostream>
12 | using namespace std;
13 | // #define SIZE 1000
14 | #define DEVICE_NUM 2
15 | inline int index(const int i, const int j) { const int row_base = i * SIZE; return row_base + j; }  // flat offset of element (i, j)
16 | 
17 | // add your codes begin
18 | 
19 | // add your codes end
20 | 
21 | // Builds a random SIZE x SIZE weight matrix, relaxes it (keeping the larger of
22 | // data[i][j] and data[i][k] + data[k][j] for every k) with the rows split
23 | // across GPUs, then prints the elapsed time and 20 sampled entries.
24 | int main() {
25 |     const int size2 = SIZE * SIZE;
26 |     float* data = new float[size2];
27 |     for (int i = 0; i < size2; i++) data[i] = -INF;
28 | 
29 |     srand(SIZE);
30 |     for (int i = 0; i < SIZE * 20; i++) {
31 |         // first coordinate
32 |         int prev = rand() % SIZE;
33 |         // second coordinate
34 |         int next = rand() % SIZE;
35 |         // retry on the diagonal or when this entry already has a weight
36 |         if ((prev == next) || (data[index(prev, next)] > -INF)) {
37 |             i--;
38 |             continue;
39 |         }
40 |         // assign the weight
41 |         data[index(prev, next)] = log((rand() % 99 + 1.0) / 100);
42 |     }
43 |     double t = omp_get_wtime();
44 | 
45 |     // add your codes begin
46 |     // One host thread per GPU (thread id == device number), at most DEVICE_NUM.
47 |     const int ndev = acc_get_num_devices(acc_device_nvidia);
48 |     const int team = max(1, min(ndev, DEVICE_NUM));
49 | #pragma omp parallel num_threads(team)
50 |     {
51 |         const int id = omp_get_thread_num();
52 |         const int nthrds = omp_get_num_threads();
53 |         // Rows per thread, rounded up so no trailing rows are dropped.
54 |         const int rows = (SIZE + nthrds - 1) / nthrds;
55 |         const int st = min(id * rows, SIZE);
56 |         const int ed = min(st + rows, SIZE);
57 |         acc_set_device_num(id, acc_device_nvidia);
58 |         // Upload the matrix once per device instead of once per k step.
59 | #pragma acc data copyin(data[0 : size2])
60 |         {
61 |             // All uploads must finish before any thread writes host rows.
62 | #pragma omp barrier
63 |             for (int k = 0; k < SIZE; k++) {
64 |                 // Only the owner's device holds the current row k: it publishes
65 |                 // the row to the host and the other devices pull it from there.
66 |                 const int owner = k / rows;
67 | #pragma acc update self(data[k * SIZE : SIZE]) if (id == owner)
68 | #pragma omp barrier
69 | #pragma acc update device(data[k * SIZE : SIZE]) if (id != owner)
70 | #pragma acc parallel loop gang worker num_workers(4) vector_length(128) present(data[0 : size2])
71 |                 for (int i = st; i < ed; i++) {
72 | #pragma acc loop vector
73 |                     for (int j = 0; j < SIZE; j++) {
74 |                         const float via = data[index(i, k)] + data[index(k, j)];
75 |                         if (data[index(i, j)] < via) data[index(i, j)] = via;
76 |                     }
77 |                 }
78 |             }
79 |             // Nobody may still be reading host rows when the blocks come back.
80 | #pragma omp barrier
81 | #pragma acc update self(data[st * SIZE : (ed - st) * SIZE]) if (ed > st)
82 |         }
83 |     }
84 | 
85 |     // add your codes end
86 |     t = omp_get_wtime() - t;
87 |     printf("time %f %d\n", t, SIZE);
88 | 
89 |     for (int i = 0; i < 20; i++) {
90 |         int prev = rand() % SIZE;
91 |         int next = rand() % SIZE;
92 |         if (prev == next) {
93 |             i--;
94 |             continue;
95 |         }
96 |         printf("test %d %d %f\n", prev, next, data[index(prev, next)]);
97 |     }
98 |     delete[] data;
99 | }


--------------------------------------------------------------------------------
/openacc/assign-floyd/floyd_optimize.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <openacc.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | 
10 | #include <algorithm>
11 | using namespace std;
12 | // #define SIZE 1000
13 | inline int index(const int i, const int j) { const int row_base = i * SIZE; return row_base + j; }  // flat offset of element (i, j)
14 | 
15 | // add your codes begin
16 | 
17 | // add your codes end
18 | 
19 | // Builds a random weight matrix, relaxes it under OpenACC directives, prints samples.
20 | int main() {
21 |     // Flat SIZE x SIZE matrix; every entry starts at -INF.
22 |     const int size2 = SIZE * SIZE;
23 |     float* data = new float[size2];
24 |     for (int cell = 0; cell < size2; cell++) data[cell] = -INF;
25 | 
26 |     // Give SIZE * 20 distinct off-diagonal entries a random weight.
27 |     srand(SIZE);
28 |     int placed = 0;
29 |     while (placed < SIZE * 20) {
30 |         const int src = rand() % SIZE;
31 |         const int dst = rand() % SIZE;
32 |         // Skip the diagonal and entries that already hold a weight.
33 |         if (src == dst) continue;
34 |         if (data[index(src, dst)] > -INF) continue;
35 |         data[index(src, dst)] = log((rand() % 99 + 1.0) / 100);
36 |         placed++;
37 |     }
38 |     const double started = omp_get_wtime();
39 |     
40 |     // add your codes begin
41 |     #pragma acc data copy(data[0:size2])
42 |     for (int mid = 0; mid < SIZE; mid++) {
43 |         #pragma acc parallel loop gang worker num_workers(4) vector_length(128)
44 |         for (int row = 0; row < SIZE; row++) {
45 |             #pragma acc loop vector
46 |             for (int col = 0; col < SIZE; col++) {
47 |                 // Keep the larger of the current entry and the sum via mid.
48 |                 const float via = data[index(row,mid)] + data[index(mid,col)];
49 |                 if (data[index(row,col)] < via) data[index(row,col)] = via;
50 |             }
51 |         }
52 |     }
53 | 
54 |     // add your codes end
55 |     const double elapsed = omp_get_wtime() - started;
56 |     printf("time %f %d\n", elapsed, SIZE);
57 | 
58 |     // Print 20 randomly chosen off-diagonal entries.
59 |     int shown = 0;
60 |     while (shown < 20) {
61 |         const int src = rand() % SIZE;
62 |         const int dst = rand() % SIZE;
63 |         if (src == dst) continue;
64 |         printf("test %d %d %f\n", src, dst, data[index(src, dst)]);
65 |         shown++;
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/openacc/assign-floyd/makefile:
--------------------------------------------------------------------------------
 1 | # Each build target compiles with nvc++ (compiler output goes to floyd.logN),
 2 | # then runs the binary under a 1-minute timeout, appending to the same log.
 3 | # None of the targets creates a file of its own name, so all are phony.
 4 | .PHONY: device serial multicore managed optimize multidevice all clean
 5 | 
 6 | device:
 7 | 	nvaccelinfo
 8 | 
 9 | serial:
10 | 	nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all -Mneginfo=all floyd.cpp >floyd.log0 2>&1
11 | 	#nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1
12 | 	timeout 1m time ./floyd.exe0 >>floyd.log0 2>&1
13 | 
14 | multicore:
15 | 	nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd.cpp >floyd.log1 2>&1
16 | 	#nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1
17 | 	timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1
18 | 
19 | managed:
20 | 	nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd.cpp >floyd.log2 2>&1
21 | 	#nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 2>&1
22 | 	timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1
23 | 
24 | optimize:
25 | 	nvc++ -o floyd.exe3 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log3 2>&1
26 | 	#nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1
27 | 	timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1
28 | 
29 | multidevice:
30 | 	nvc++ -o floyd.exe4 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd_multidevice.cpp >floyd.log4 2>&1
31 | 	#nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1
32 | 	timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1
33 | 
34 | all: clean serial multicore managed optimize multidevice
35 | 
36 | clean:
37 | 	rm -f floyd.exe* floyd.prof* floyd.log*


--------------------------------------------------------------------------------
/openacc/assign-floyd/run.sh:
--------------------------------------------------------------------------------
 1 | export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin:$PATH  # put the HPC SDK 23.3 compilers first on PATH
 2 | # Source device.sh (argument presumably = number of devices requested), then start from a clean tree.
 3 | source device.sh 1
 4 | #source device.sh 8
 5 | make clean
 6 | # Only the uncommented make target below is run, capped by timeout; the rest are disabled variants.
 7 | #timeout 1m time make serial
 8 | #timeout 1m time make multicore
 9 | #timeout 1m time make managed
10 | timeout 1m time make optimize
11 | # timeout 10s time make multidevice
12 | 
13 | 


--------------------------------------------------------------------------------
/openacc/exam-floyd/ans/device.sh:
--------------------------------------------------------------------------------
 1 | # Usage: source device.sh [N]
 2 | # Restricts CUDA_VISIBLE_DEVICES to N consecutive device numbers (default 2,
 3 | # wrapping around at the device count). CUDA_VISIBLE_DEVICES is left unset
 4 | # when N is at least the number of devices nvaccelinfo reports.
 5 | devreq=$1
 6 | if [ -z "$devreq" ] ; then
 7 |   devreq=2
 8 | fi
 9 | 
10 | unset CUDA_VISIBLE_DEVICES
11 | numdev=$(nvaccelinfo | grep -e "^Device Number:" | wc -l)
12 | if [ "$devreq" -ge "$numdev" ] ; then
13 |   echo -e "unset CUDA_VISIBLE_DEVICES"
14 | else
15 |   # $$ (the shell's PID) picks the first device number.
16 |   devnum=$(( $$ % numdev ))
17 |   devlst=$devnum
18 |   devreq=$(( devreq - 1 ))
19 |   while [ "$devreq" -gt 0 ] ; do
20 |     # Arithmetic expansion keeps the parentheses inside the expression; an
21 |     # unquoted `let devnum=(...)` would hand them to the shell parser.
22 |     devnum=$(( (devnum + 1) % numdev ))
23 |     devlst=$devlst,$devnum
24 |     devreq=$(( devreq - 1 ))
25 |   done
26 |   export CUDA_VISIBLE_DEVICES=$devlst
27 |   echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
28 | fi


--------------------------------------------------------------------------------
/openacc/exam-floyd/ans/exam-floyd.2023-05-31.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/exam-floyd/ans/exam-floyd.2023-05-31.tgz


--------------------------------------------------------------------------------
/openacc/exam-floyd/ans/floyd.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <omp.h>
 4 | #include <openacc.h>
 5 | #include <math.h>
 6 | #include <stdio.h>
 7 | #include <stdlib.h>
 8 | #include <assert.h>
 9 | #include <algorithm>
10 | using namespace std;
11 | 
12 | 
13 | // Flat offset of element (i, j) in a matrix with SIZE entries per row.
14 | inline int index(const int i, const int j) { const int row_base = i * SIZE; return row_base + j; }
15 | 
16 | 
17 | // add your codes begin
18 | // add your codes end
19 | 
20 | 
21 | int main() {
22 |   // Flat SIZE x SIZE matrix; every entry starts at -INF.
23 |   const int size2 = SIZE * SIZE;
24 |   float* data = new float[size2];
25 |   for (int cell = 0; cell < size2; cell++) data[cell] = -INF;
26 | 
27 |   // Give SIZE*20 distinct off-diagonal entries a random weight.
28 |   srand(SIZE);
29 |   int placed = 0;
30 |   while (placed < SIZE*20) {
31 |     const int src = rand() % SIZE;
32 |     const int dst = rand() % SIZE;
33 |     if (src == dst) continue;
34 |     if (data[index(src, dst)] > -INF) continue;
35 |     data[index(src, dst)] = log((rand() % 99 + 1.0) / 100);
36 |     placed++;
37 |   }
38 | 
39 |   double t = omp_get_wtime();
40 |   // add your codes begin
41 |   // add your codes end
42 |   t = omp_get_wtime() - t;
43 |   printf("time %f %d\n", t, SIZE);
44 | 
45 |   // Print 20 randomly chosen off-diagonal entries.
46 |   int shown = 0;
47 |   while (shown < 20) {
48 |     const int src = rand() % SIZE;
49 |     const int dst = rand() % SIZE;
50 |     if (src == dst) continue;
51 |     printf("test %d %d %f\n", src, dst, data[index(src, dst)]);
52 |     shown++;
53 |   }
54 | }


--------------------------------------------------------------------------------
/openacc/exam-floyd/ans/floyd.exe4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/exam-floyd/ans/floyd.exe4


--------------------------------------------------------------------------------
/openacc/exam-floyd/ans/floyd.log4:
--------------------------------------------------------------------------------
 1 | index(int, int):
 2 |      13, Generating implicit acc routine seq
 3 |          Generating NVIDIA GPU code
 4 | main:
 5 |       5, include "math.h"
 6 |           15, include "math.h"
 7 |                36, include "cmath"
 8 |                     15, include "cmath"
 9 |                        1935, include "specfun.h"
10 |                               45, include "stl_algobase.h"
11 |                                    45, #omp parallel
12 |                                        46, Begin single region
13 |                                            End single region
14 |                                            Barrier
15 |                                        73, Barrier
16 |                                        84, Barrier
17 |      24, Memory set idiom, loop replaced by call to __c_mset4
18 |      56, Generating enter data create(copy[:36000000],dkj[:6000],dik[:6000])
19 |          Generating enter data copyin(data[:36000000])
20 |      59, Generating present(d0[:],d1[:])
21 |          Generating implicit firstprivate(begin,end)
22 |          Generating NVIDIA GPU code
23 |          61, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
24 |      59, Generating implicit copyout(dik[begin:end-begin]) [if not already present]
25 |      61, Generating implicit  (k)
26 |      65, Generating update self(dik[begin:step])
27 |          Generating present(d1[:],d0[:])
28 |          Generating NVIDIA GPU code
29 |          67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
30 |      65, Generating implicit copyout(dkj[:6000]) [if not already present]
31 |      67, Generating implicit firstprivate(k)
32 |      71, Generating update self(dkj[:6000])
33 |          Generating update device(dkj[:6000],dik[:6000])
34 |          Generating present(d1[:],d0[:])
35 |          Generating implicit firstprivate(begin,end)
36 |          Generating NVIDIA GPU code
37 |          78, #pragma acc loop gang, vector   /* blockIdx.x threadIdx.x */
38 |          79,   /* blockIdx.x threadIdx.x tiled */
39 |      71, Generating implicit copyin(dkj[:6000],dik[begin:end-begin]) [if not already present]
40 |      79, Generating implicit private(_T25_5541,_T22_5541)
41 |      99, Generating update self(data[begin*6000:step*6000])
42 |          Generating exit data delete(dkj[:6000],dik[:6000],data[:36000000],copy[:36000000])
43 | float const& std::max<float>(float const&, float const&):
44 |       5, include "math.h"
45 |           15, include "math.h"
46 |                36, include "cmath"
47 |                     15, include "cmath"
48 |                        1935, include "specfun.h"
49 |                               45, include "stl_algobase.h"
50 |                                   255, Generating implicit acc routine seq
51 |                                        Generating NVIDIA GPU code
52 | #num_dev: 8
53 | #num_thread: 8
54 | #device_id: 7 5250 6000 750
55 | #device_id: 4 3000 3750 750
56 | #device_id: 1 750 1500 750
57 | #device_id: 0 0 750 750 
58 | #device_id: 5 3750 4500 750
59 | #device_id: 6 4500 5250 750
60 | #device_id: 3 2250 3000 750
61 | #device_id: 2 1500 2250 750
62 | time 3.402403 6000
63 | test 2161 4093 -0.683869
64 | test 4560 849 -0.384304
65 | test 4615 5729 -0.371038
66 | test 5055 5059 -0.487377
67 | test 1882 5483 -0.559907
68 | test 1312 3253 -0.648813
69 | test 4565 5567 -0.638792
70 | test 4094 4590 -0.523406
71 | test 4765 4454 -0.760118
72 | test 4230 4312 -0.455890
73 | test 4446 5673 -0.539349
74 | test 5203 3022 -0.435516
75 | test 3117 548 -0.571684
76 | test 1839 1493 -0.294103
77 | test 71 2247 -0.472646
78 | test 2630 2585 -0.511463
79 | test 692 1190 -0.315346
80 | test 3434 5308 -0.457022
81 | test 1271 2841 -0.614242
82 | test 4367 3153 -0.563043
83 | 13.72user 3.36system 0:04.19elapsed 407%CPU (0avgtext+0avgdata 1543936maxresident)k
84 | 0inputs+8outputs (1major+256099minor)pagefaults 0swaps
85 | 


--------------------------------------------------------------------------------
/openacc/exam-floyd/ans/makefile:
--------------------------------------------------------------------------------
 1 | # Each build target compiles with nvc++ (compiler output goes to floyd.logN),
 2 | # then runs the binary under a 1-minute timeout, appending to the same log.
 3 | # None of the targets creates a file of its own name, so all are phony.
 4 | .PHONY: device serial multicore managed optimize multidevice all clean
 5 | 
 6 | device:
 7 | 	nvaccelinfo
 8 | 
 9 | serial:
10 | 	nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all -Mneginfo=all floyd.cpp >floyd.log0 2>&1
11 | 	#nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1
12 | 	timeout 1m time ./floyd.exe0 >>floyd.log0 2>&1
13 | 
14 | multicore:
15 | 	nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd.cpp >floyd.log1 2>&1
16 | 	#nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1
17 | 	timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1
18 | 
19 | managed:
20 | 	nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd.cpp >floyd.log2 2>&1
21 | 	#nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 2>&1
22 | 	timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1
23 | 
24 | optimize:
25 | 	nvc++ -o floyd.exe3 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log3 2>&1
26 | 	#nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1
27 | 	timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1
28 | 
29 | multidevice:
30 | 	nvc++ -o floyd.exe4 -DSIZE=6000 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log4 2>&1
31 | 	#nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1
32 | 	timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1
33 | 
34 | all: clean serial multicore managed optimize multidevice
35 | 
36 | clean:
37 | 	rm -f floyd.exe* floyd.prof* floyd.log*


--------------------------------------------------------------------------------
/openacc/exam-floyd/ans/run.sh:
--------------------------------------------------------------------------------
 1 | export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin:$PATH  # put the HPC SDK 23.3 compilers first on PATH
 2 | # Start from a clean tree.
 3 | make clean
 4 | # Single-device variants, currently disabled.
 5 | #source device.sh 1
 6 | #timeout 1m time make serial
 7 | #timeout 1m time make multicore
 8 | #timeout 1m time make managed
 9 | #timeout 1m time make optimize
10 | # Multi-device run: ask device.sh for 8 devices, then build and run under a 1-minute timeout.
11 | source device.sh 8
12 | timeout 1m time make multidevice
13 | 
14 | 


--------------------------------------------------------------------------------
/openacc/exam-floyd/device.sh:
--------------------------------------------------------------------------------
 1 | # Usage: source device.sh [N]
 2 | # Restricts CUDA_VISIBLE_DEVICES to N consecutive device numbers (default 2,
 3 | # wrapping around at the device count). CUDA_VISIBLE_DEVICES is left unset
 4 | # when N is at least the number of devices nvaccelinfo reports.
 5 | devreq=$1
 6 | if [ -z "$devreq" ] ; then
 7 |   devreq=2
 8 | fi
 9 | 
10 | unset CUDA_VISIBLE_DEVICES
11 | numdev=$(nvaccelinfo | grep -e "^Device Number:" | wc -l)
12 | if [ "$devreq" -ge "$numdev" ] ; then
13 |   echo -e "unset CUDA_VISIBLE_DEVICES"
14 | else
15 |   # $$ (the shell's PID) picks the first device number.
16 |   devnum=$(( $$ % numdev ))
17 |   devlst=$devnum
18 |   devreq=$(( devreq - 1 ))
19 |   while [ "$devreq" -gt 0 ] ; do
20 |     # Arithmetic expansion keeps the parentheses inside the expression; an
21 |     # unquoted `let devnum=(...)` would hand them to the shell parser.
22 |     devnum=$(( (devnum + 1) % numdev ))
23 |     devlst=$devlst,$devnum
24 |     devreq=$(( devreq - 1 ))
25 |   done
26 |   # Export the list computed above so the requested device count is honoured.
27 |   export CUDA_VISIBLE_DEVICES=$devlst
28 |   echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
29 | fi


--------------------------------------------------------------------------------
/openacc/exam-floyd/exam-floyd.2023-05-31.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/exam-floyd/exam-floyd.2023-05-31.tgz


--------------------------------------------------------------------------------
/openacc/exam-floyd/floyd copy.log4:
--------------------------------------------------------------------------------
 1 | main:
 2 |      23, Memory set idiom, loop replaced by call to __c_mset4
 3 |      27, Loop not vectorized/parallelized: contains call
 4 |      30, index(int, int) inlined, size=2 (inline) file floyd.cpp (13)
 5 |      34, index(int, int) inlined, size=2 (inline) file floyd.cpp (13)
 6 |      45, #omp parallel
 7 |          Loop not vectorized: data dependency
 8 |          Loop unrolled 2 times
 9 |      50, #omp parallel
10 |          85, Barrier
11 |      57, Generating copyin(data[begin*6000:blocksize*6000]) [if not already present]
12 |          Generating create(dkj[:6000],dik[:6000]) [if not already present]
13 |      58, Generating update device(dkj[:6000],dik[:6000])
14 |          Generating implicit firstprivate(begin,end)
15 |          Generating NVIDIA GPU code
16 |          63, #pragma acc loop gang /* blockIdx.x */
17 |          67, #pragma acc loop vector(1024) /* threadIdx.x */
18 |      58, Loop not vectorized/parallelized: contains call
19 |      67, Loop is parallelizable
20 |      73, Generating implicit firstprivate(begin,end)
21 |          Generating NVIDIA GPU code
22 |          75, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
23 |      75, Generating implicit firstprivate(k)
24 |      79, Generating update self(dik[begin:blocksize])
25 |          Generating NVIDIA GPU code
26 |          81, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
27 |      81, Generating implicit firstprivate(k)
28 |      84, Generating update self(dkj[:6000])
29 |      88, Generating update self(data[begin*6000:blocksize*6000])
30 |      95, Loop not vectorized/parallelized: contains call
31 |     102, index(int, int) inlined, size=2 (inline) file floyd.cpp (13)
32 | time 1.593988 6000
33 | test 2161 4093 -0.683869
34 | test 4560 849 -0.384304
35 | test 4615 5729 -0.371038
36 | test 5055 5059 -0.487377
37 | test 1882 5483 -0.559907
38 | test 1312 3253 -0.648813
39 | test 4565 5567 -0.638792
40 | test 4094 4590 -0.523406
41 | test 4765 4454 -0.760118
42 | test 4230 4312 -0.455890
43 | test 4446 5673 -0.539349
44 | test 5203 3022 -0.435516
45 | test 3117 548 -0.571684
46 | test 1839 1493 -0.294103
47 | test 71 2247 -0.472646
48 | test 2630 2585 -0.511463
49 | test 692 1190 -0.315346
50 | test 3434 5308 -0.457022
51 | test 1271 2841 -0.614242
52 | test 4367 3153 -0.563043
53 | 3.79user 0.90system 0:01.85elapsed 253%CPU (0avgtext+0avgdata 627496maxresident)k
54 | 0inputs+0outputs (0major+150079minor)pagefaults 0swaps
55 | 


--------------------------------------------------------------------------------
/openacc/exam-floyd/floyd.exe4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/exam-floyd/floyd.exe4


--------------------------------------------------------------------------------
/openacc/exam-floyd/floyd.log4:
--------------------------------------------------------------------------------
 1 | main:
 2 |      23, Memory set idiom, loop replaced by call to __c_mset4
 3 |      45, #omp parallel
 4 |      50, #omp parallel
 5 |          85, Barrier
 6 |      57, Generating copyin(data[begin*6000:blocksize*6000]) [if not already present]
 7 |          Generating create(dkj[:6000],dik[:6000]) [if not already present]
 8 |      58, Generating update device(dik[:6000],dkj[:6000])
 9 |          Generating implicit firstprivate(end,begin)
10 |          Generating NVIDIA GPU code
11 |          63, #pragma acc loop gang /* blockIdx.x */
12 |          67, #pragma acc loop vector(1024) /* threadIdx.x */
13 |      67, Loop is parallelizable
14 |      73, Generating implicit firstprivate(begin,end)
15 |          Generating NVIDIA GPU code
16 |          75, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
17 |      75, Generating implicit firstprivate(k)
18 |      79, Generating update self(dik[begin:blocksize])
19 |          Generating NVIDIA GPU code
20 |          81, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
21 |      81, Generating implicit firstprivate(k)
22 |      84, Generating update self(dkj[:6000])
23 |      88, Generating update self(data[begin*6000:blocksize*6000])
24 | time 1.702572 6000
25 | test 2161 4093 -0.683869
26 | test 4560 849 -0.384304
27 | test 4615 5729 -0.371038
28 | test 5055 5059 -0.487377
29 | test 1882 5483 -0.559907
30 | test 1312 3253 -0.648813
31 | test 4565 5567 -0.638792
32 | test 4094 4590 -0.523406
33 | test 4765 4454 -0.760118
34 | test 4230 4312 -0.455890
35 | test 4446 5673 -0.539349
36 | test 5203 3022 -0.435516
37 | test 3117 548 -0.571684
38 | test 1839 1493 -0.294103
39 | test 71 2247 -0.472646
40 | test 2630 2585 -0.511463
41 | test 692 1190 -0.315346
42 | test 3434 5308 -0.457022
43 | test 1271 2841 -0.614242
44 | test 4367 3153 -0.563043
45 | 3.86user 1.03system 0:01.98elapsed 247%CPU (0avgtext+0avgdata 627488maxresident)k
46 | 0inputs+8outputs (1major+150118minor)pagefaults 0swaps
47 | 


--------------------------------------------------------------------------------
/openacc/exam-floyd/makefile:
--------------------------------------------------------------------------------
 1 | # Each build target compiles with nvc++ (compiler output goes to floyd.logN),
 2 | # then runs the binary under a 1-minute timeout, appending to the same log.
 3 | # None of the targets creates a file of its own name, so all are phony.
 4 | .PHONY: device serial multicore managed optimize multidevice test all clean
 5 | 
 6 | device:
 7 | 	nvaccelinfo
 8 | 
 9 | serial:
10 | 	nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all -Mneginfo=all floyd.cpp >floyd.log0 2>&1
11 | 	#nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1
12 | 	timeout 1m time ./floyd.exe0 >>floyd.log0 2>&1
13 | 
14 | multicore:
15 | 	nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd.cpp >floyd.log1 2>&1
16 | 	#nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1
17 | 	timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1
18 | 
19 | managed:
20 | 	nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd.cpp >floyd.log2 2>&1
21 | 	#nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 2>&1
22 | 	timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1
23 | 
24 | optimize:
25 | 	nvc++ -o floyd.exe3 -O1 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log3 2>&1
26 | 	#nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1
27 | 	timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1
28 | 
29 | multidevice:
30 | 	nvc++ -o floyd.exe4 -DSIZE=6000 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log4 2>&1
31 | 	#nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1
32 | 	timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1
33 | 
34 | # Compile-only check: produces output.o without linking or running.
35 | test:
36 | 	nvc++ -c floyd.cpp -o output.o -DSIZE=6000 
37 | 
38 | all: clean serial multicore managed optimize multidevice
39 | 
40 | clean:
41 | 	rm -f floyd.exe* floyd.prof* floyd.log*


--------------------------------------------------------------------------------
/openacc/exam-floyd/output.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/exam-floyd/output.o


--------------------------------------------------------------------------------
/openacc/exam-floyd/run.sh:
--------------------------------------------------------------------------------
 1 | export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin:$PATH  # put the HPC SDK 23.3 compilers first on PATH
 2 | # Start from a clean tree.
 3 | make clean
 4 | # Single-device variants, currently disabled.
 5 | #source device.sh 1
 6 | #timeout 1m time make serial
 7 | #timeout 1m time make multicore
 8 | #timeout 1m time make managed
 9 | #timeout 1m time make optimize
10 | # Multi-device run: ask device.sh for 3 devices, then build and run under a 20-second timeout.
11 | source device.sh 3
12 | timeout 20s time make multidevice
13 | 
14 | 


--------------------------------------------------------------------------------
/openacc/image/clauses.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/image/clauses.png


--------------------------------------------------------------------------------
/openacc/lab-floyd/ans/device.sh:
--------------------------------------------------------------------------------
 1 | # Usage: source device.sh [N]
 2 | # Restricts CUDA_VISIBLE_DEVICES to N consecutive device numbers (default 2,
 3 | # wrapping around at the device count). CUDA_VISIBLE_DEVICES is left unset
 4 | # when N is at least the number of devices nvaccelinfo reports.
 5 | devreq=$1
 6 | if [ -z "$devreq" ] ; then
 7 |   devreq=2
 8 | fi
 9 | 
10 | unset CUDA_VISIBLE_DEVICES
11 | numdev=$(nvaccelinfo | grep -e "^Device Number:" | wc -l)
12 | if [ "$devreq" -ge "$numdev" ] ; then
13 |   echo -e "unset CUDA_VISIBLE_DEVICES"
14 | else
15 |   # $$ (the shell's PID) picks the first device number.
16 |   devnum=$(( $$ % numdev ))
17 |   devlst=$devnum
18 |   devreq=$(( devreq - 1 ))
19 |   while [ "$devreq" -gt 0 ] ; do
20 |     # Arithmetic expansion keeps the parentheses inside the expression; an
21 |     # unquoted `let devnum=(...)` would hand them to the shell parser.
22 |     devnum=$(( (devnum + 1) % numdev ))
23 |     devlst=$devlst,$devnum
24 |     devreq=$(( devreq - 1 ))
25 |   done
26 |   export CUDA_VISIBLE_DEVICES=$devlst
27 |   echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
28 | fi


--------------------------------------------------------------------------------
/openacc/lab-floyd/ans/floyd.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <omp.h>
 4 | #include <openacc.h>
 5 | #include <math.h>
 6 | #include <stdio.h>
 7 | #include <stdlib.h>
 8 | #include <assert.h>
 9 | #include <algorithm>
10 | using namespace std;
11 | 
12 | 
13 | // Flat offset of element (i, j) in a matrix with SIZE entries per row.
14 | inline int index(const int i, const int j) { const int row_base = i * SIZE; return row_base + j; }
15 | 
16 | 
17 | // add your codes begin
18 | // add your codes end
19 | 
20 | 
21 | int main() {
22 |   // Flat SIZE x SIZE matrix; every entry starts at -INF.
23 |   const int size2 = SIZE * SIZE;
24 |   float* data = new float[size2];
25 |   for (int cell = 0; cell < size2; cell++) data[cell] = -INF;
26 | 
27 |   // Give SIZE*20 distinct off-diagonal entries a random weight.
28 |   srand(SIZE);
29 |   int placed = 0;
30 |   while (placed < SIZE*20) {
31 |     const int src = rand() % SIZE;
32 |     const int dst = rand() % SIZE;
33 |     if (src == dst) continue;
34 |     if (data[index(src, dst)] > -INF) continue;
35 |     data[index(src, dst)] = log((rand() % 99 + 1.0) / 100);
36 |     placed++;
37 |   }
38 | 
39 |   double t = omp_get_wtime();
40 |   // add your codes begin
41 |   // add your codes end
42 |   t = omp_get_wtime() - t;
43 |   printf("time %f %d\n", t, SIZE);
44 | 
45 |   // Print 20 randomly chosen off-diagonal entries.
46 |   int shown = 0;
47 |   while (shown < 20) {
48 |     const int src = rand() % SIZE;
49 |     const int dst = rand() % SIZE;
50 |     if (src == dst) continue;
51 |     printf("test %d %d %f\n", src, dst, data[index(src, dst)]);
52 |     shown++;
53 |   }
54 | }


--------------------------------------------------------------------------------
/openacc/lab-floyd/ans/floyd.exe0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/ans/floyd.exe0


--------------------------------------------------------------------------------
/openacc/lab-floyd/ans/floyd.exe1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/ans/floyd.exe1


--------------------------------------------------------------------------------
/openacc/lab-floyd/ans/floyd.exe2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/ans/floyd.exe2


--------------------------------------------------------------------------------
/openacc/lab-floyd/ans/floyd.log0:
--------------------------------------------------------------------------------
 1 | #num_dev: 0
 2 | #num_thread: 1
 3 | #device_id: 0 0 1200 1200
 4 | time 9.751188 1200
 5 | test 65 506 -0.544105
 6 | test 272 605 -0.519495
 7 | test 391 617 -0.342672
 8 | test 384 274 -0.396835
 9 | test 1136 489 -0.337305
10 | test 657 761 -0.485585
11 | test 893 278 -0.457243
12 | test 163 278 -0.431883
13 | test 366 221 -0.400291
14 | test 794 598 -0.520162
15 | test 712 749 -0.404731
16 | test 390 1090 -0.518321
17 | test 547 715 -0.355524
18 | test 366 102 -0.467554
19 | test 627 1029 -0.434617
20 | test 808 1044 -0.606123
21 | test 687 232 -0.404254
22 | test 449 230 -0.481887
23 | test 2 1186 -0.455573
24 | test 504 1138 -0.456215
25 | 9.76user 0.00system 0:09.76elapsed 99%CPU (0avgtext+0avgdata 16776maxresident)k
26 | 0inputs+8outputs (0major+3099minor)pagefaults 0swaps
27 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/ans/floyd.log1:
--------------------------------------------------------------------------------
  1 | NVC++-W-0155-OpenACC multicore code disabled inside OpenMP parallel construct  (floyd.cpp: 59)
  2 | NVC++-W-0155-OpenACC multicore code disabled inside OpenMP parallel construct  (floyd.cpp: 65)
  3 | NVC++-W-0155-OpenACC multicore code disabled inside OpenMP parallel construct  (floyd.cpp: 71)
  4 | main:
  5 |       5, include "math.h"
  6 |           15, include "math.h"
  7 |                36, include "cmath"
  8 |                     15, include "cmath"
  9 |                        1935, include "specfun.h"
 10 |                               45, include "stl_algobase.h"
 11 |                                    45, #omp parallel
 12 |                                        46, Begin single region
 13 |                                            End single region
 14 |                                            Barrier
 15 |                                        73, Barrier
 16 |                                        84, Barrier
 17 |      24, Memory set idiom, loop replaced by call to __c_mset4
 18 | NVC++/x86-64 Linux 23.3-0: compilation completed with warnings
 19 | #num_dev: 0
 20 | #num_thread: 64
 21 | #device_id: 0 0 38 38
 22 | #device_id: 62 2356 2394 38
 23 | #device_id: 22 836 874 38
 24 | #device_id: 13 494 532 38
 25 | #device_id: 18 684 722 38
 26 | #device_id: 32 1216 1254 38
 27 | #device_id: 50 1900 1938 38
 28 | #device_id: 43 1634 1672 38
 29 | #device_id: 37 1406 1444 38
 30 | #device_id: 58 2204 2242 38
 31 | #device_id: 57 2166 2204 38
 32 | #device_id: 12 456 494 38
 33 | #device_id: 59 2242 2280 38
 34 | #device_id: 19 722 760 38
 35 | #device_id: 42 1596 1634 38
 36 | #device_id: 45 1710 1748 38
 37 | #device_id: 4 152 190 38
 38 | #device_id: 40 1520 1558 38
 39 | #device_id: 51 1938 1976 38
 40 | #device_id: 31 1178 1216 38
 41 | #device_id: 29 1102 1140 38
 42 | #device_id: 1 38 76 38
 43 | #device_id: 53 2014 2052 38
 44 | #device_id: 3 114 152 38
 45 | #device_id: 9 342 380 38
 46 | #device_id: 38 1444 1482 38
 47 | #device_id: 24 912 950 38
 48 | #device_id: 39 1482 1520 38
 49 | #device_id: 14 532 570 38
 50 | #device_id: 8 304 342 38
 51 | #device_id: 25 950 988 38
 52 | #device_id: 30 1140 1178 38
 53 | #device_id: 5 190 228 38
 54 | #device_id: 20 760 798 38
 55 | #device_id: 16 608 646 38
 56 | #device_id: 33 1254 1292 38
 57 | #device_id: 10 380 418 38
 58 | #device_id: 41 1558 1596 38
 59 | #device_id: 17 646 684 38
 60 | #device_id: 21 798 836 38
 61 | #device_id: 49 1862 1900 38
 62 | #device_id: 61 2318 2356 38
 63 | #device_id: 46 1748 1786 38
 64 | #device_id: 60 2280 2318 38
 65 | #device_id: 56 2128 2166 38
 66 | #device_id: 6 228 266 38
 67 | #device_id: 54 2052 2090 38
 68 | #device_id: 48 1824 1862 38
 69 | #device_id: 55 2090 2128 38
 70 | #device_id: 52 1976 2014 38
 71 | #device_id: 47 1786 1824 38
 72 | #device_id: 7 266 304 38
 73 | #device_id: 23 874 912 38
 74 | #device_id: 35 1330 1368 38
 75 | #device_id: 2 76 114 38
 76 | #device_id: 34 1292 1330 38
 77 | #device_id: 36 1368 1406 38
 78 | #device_id: 44 1672 1710 38
 79 | #device_id: 15 570 608 38
 80 | #device_id: 11 418 456 38
 81 | #device_id: 63 2394 2400 6
 82 | #device_id: 27 1026 1064 38
 83 | #device_id: 26 988 1026 38
 84 | #device_id: 28 1064 1102 38
 85 | time 1.008772 2400
 86 | test 1114 2099 -0.351597
 87 | test 1562 1856 -0.818695
 88 | test 1929 924 -0.409053
 89 | test 2200 2122 -0.236780
 90 | test 450 1605 -0.494496
 91 | test 515 1139 -0.526653
 92 | test 2125 2166 -0.461825
 93 | test 515 1372 -0.472302
 94 | test 973 2141 -0.479141
 95 | test 773 1809 -0.472171
 96 | test 1274 457 -0.303956
 97 | test 24 2241 -0.392215
 98 | test 2129 375 -0.467766
 99 | test 1958 1848 -0.472492
100 | test 1023 1572 -0.650282
101 | test 544 2137 -0.443251
102 | test 1624 58 -0.516323
103 | test 1593 1153 -0.588491
104 | test 982 1746 -0.528510
105 | test 875 1432 -0.458209
106 | 60.46user 2.21system 0:01.03elapsed 6066%CPU (0avgtext+0avgdata 51984maxresident)k
107 | 0inputs+0outputs (113major+11828minor)pagefaults 0swaps
108 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/ans/floyd.log2:
--------------------------------------------------------------------------------
 1 | index(int, int):
 2 |      13, Generating implicit acc routine seq
 3 |          Generating NVIDIA GPU code
 4 | main:
 5 |       5, include "math.h"
 6 |           15, include "math.h"
 7 |                36, include "cmath"
 8 |                     15, include "cmath"
 9 |                        1935, include "specfun.h"
10 |                               45, include "stl_algobase.h"
11 |                                    45, #omp parallel
12 |                                        46, Begin single region
13 |                                            End single region
14 |                                            Barrier
15 |                                        73, Barrier
16 |                                        84, Barrier
17 |      24, Memory set idiom, loop replaced by call to __c_mset4
18 |      56, Generating enter data create(copy[:12960000],dkj[:3600],dik[:3600])
19 |          Generating enter data copyin(data[:12960000])
20 |      59, Generating present(d0[:],d1[:])
21 |          Generating implicit firstprivate(begin,end)
22 |          Generating NVIDIA GPU code
23 |          61, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
24 |      59, Generating implicit copyout(dik[begin:end-begin]) [if not already present]
25 |      61, Generating implicit firstprivate(k)
26 |      65, Generating update self(dik[begin:step])
27 |          Generating present(d1[:],d0[:])
28 |          Generating NVIDIA GPU code
29 |          67, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
30 |      65, Generating implicit copyout(dkj[:3600]) [if not already present]
31 |      67, Generating implicit firstprivate(k)
32 |      71, Generating update self(dkj[:3600])
33 |          Generating update device(dkj[:3600],dik[:3600])
34 |          Generating present(d1[:],d0[:])
35 |          Generating implicit firstprivate(begin,end)
36 |          Generating NVIDIA GPU code
37 |          78, #pragma acc loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */
38 |          79,   /* blockIdx.x threadIdx.x tiled */
39 |      71, Generating implicit copyin(dkj[:3600],dik[begin:end-begin]) [if not already present]
40 |      79, Generating implicit private(_T25_5543,_T22_5543)
41 |      99, Generating update self(data[begin*3600:step*3600])
42 |          Generating exit data delete(dkj[:3600],dik[:3600],data[:12960000],copy[:12960000])
43 | float const& std::max<float>(float const&, float const&):
44 |       5, include "math.h"
45 |           15, include "math.h"
46 |                36, include "cmath"
47 |                     15, include "cmath"
48 |                        1935, include "specfun.h"
49 |                               45, include "stl_algobase.h"
50 |                                   255, Generating implicit acc routine seq
51 |                                        Generating NVIDIA GPU code
52 | #num_dev: 1
53 | #num_thread: 1
54 | #device_id: 0 0 3600 3600
55 | time 1.106637 3600
56 | test 1591 172 -0.763397
57 | test 802 369 -0.453210
58 | test 2535 1922 -0.288441
59 | test 2509 360 -0.270466
60 | test 3035 2689 -0.397025
61 | test 3529 2138 -0.559413
62 | test 3380 3190 -0.562856
63 | test 1650 2709 -0.484510
64 | test 2813 3211 -0.411624
65 | test 3283 3454 -0.454825
66 | test 2055 1613 -0.629674
67 | test 3196 677 -0.532822
68 | test 697 3166 -0.443936
69 | test 2548 3111 -0.529714
70 | test 334 1171 -0.424709
71 | test 1307 1078 -0.537585
72 | test 1343 1261 -0.535192
73 | test 599 278 -0.486341
74 | test 3183 3108 -0.501162
75 | test 3390 1771 -0.395187
76 | 1.13user 0.19system 0:01.33elapsed 99%CPU (0avgtext+0avgdata 155400maxresident)k
77 | 0inputs+0outputs (20major+11576minor)pagefaults 0swaps
78 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/ans/lab-floyd.2023-05-24.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/ans/lab-floyd.2023-05-24.tgz


--------------------------------------------------------------------------------
/openacc/lab-floyd/ans/makefile:
--------------------------------------------------------------------------------
 1 | # Each variant target compiles floyd.cpp with its own -DSIZE/flags into floyd.exeN, logs to floyd.logN, then runs it under time.
 2 | .PHONY: device serial multicore managed optimize multidevice all clean
 3 | device:
 4 | 	nvaccelinfo
 5 | 
 6 | serial:
 7 | 	nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all -Mneginfo=all floyd.cpp >floyd.log0 2>&1
 8 | 	#nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1
 9 | 	timeout 1m time ./floyd.exe0 >>floyd.log0 2>&1
10 | 
11 | multicore:
12 | 	nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd.cpp >floyd.log1 2>&1
13 | 	#nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1
14 | 	timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1
15 | 
16 | managed:
17 | 	nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd.cpp >floyd.log2 2>&1
18 | 	#nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 2>&1
19 | 	timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1
20 | 
21 | optimize:
22 | 	nvc++ -o floyd.exe3 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log3 2>&1
23 | 	#nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1
24 | 	timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1
25 | 
26 | multidevice:
27 | 	nvc++ -o floyd.exe4 -DSIZE=6000 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd.cpp >floyd.log4 2>&1
28 | 	#nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1
29 | 	timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1
30 | 
31 | all: clean serial multicore managed optimize multidevice
32 | 
33 | clean:
34 | 	rm -f floyd.exe* floyd.prof* floyd.log*


--------------------------------------------------------------------------------
/openacc/lab-floyd/ans/run.sh:
--------------------------------------------------------------------------------
 1 | export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin:$PATH  # put the NVIDIA HPC SDK 23.3 compilers first on PATH
 2 | # device.sh is sourced (not executed) so that its CUDA_VISIBLE_DEVICES export reaches the make runs below
 3 | source device.sh 1
 4 | #source device.sh 8
 5 | make clean
 6 | # every enabled build-and-run step is timed and capped at one minute
 7 | timeout 1m time make serial
 8 | timeout 1m time make multicore
 9 | timeout 1m time make managed
10 | #timeout 1m time make optimize
11 | #timeout 1m time make multidevice
12 | 
13 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/device.sh:
--------------------------------------------------------------------------------
 1 | devreq=$1
 2 | if [ "$devreq" == "" ] ; then
 3 |   devreq=2
 4 | fi
 5 | 
 6 | unset CUDA_VISIBLE_DEVICES
 7 | numdev=`nvaccelinfo | grep -e "^Device Number:" | wc -l`
 8 | if [ $devreq -ge $numdev ] ; then
 9 |   echo -e "unset CUDA_VISIBLE_DEVICES"
10 | else
11 |   let devnum=$$%$numdev
12 |   devlst=$devnum
13 |   let devreq=$devreq-1
14 |   while [ $devreq -gt 0 ] ; do
15 |     let devnum=($devnum+1)%$numdev
16 |     devlst=$devlst,$devnum
17 |     let devreq=$devreq-1
18 |   done
19 |   export CUDA_VISIBLE_DEVICES=$devlst
20 |   echo -e "set CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
21 | fi
22 | 
23 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/floyd.exe0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/floyd.exe0


--------------------------------------------------------------------------------
/openacc/lab-floyd/floyd.exe1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/floyd.exe1


--------------------------------------------------------------------------------
/openacc/lab-floyd/floyd.exe2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/floyd.exe2


--------------------------------------------------------------------------------
/openacc/lab-floyd/floyd.log0:
--------------------------------------------------------------------------------
 1 | time 8.989724 1200
 2 | test 65 506 -0.544105
 3 | test 272 605 -0.519495
 4 | test 391 617 -0.342672
 5 | test 384 274 -0.396835
 6 | test 1136 489 -0.337305
 7 | test 657 761 -0.485585
 8 | test 893 278 -0.457243
 9 | test 163 278 -0.431883
10 | test 366 221 -0.400291
11 | test 794 598 -0.520162
12 | test 712 749 -0.404731
13 | test 390 1090 -0.518321
14 | test 547 715 -0.355524
15 | test 366 102 -0.467554
16 | test 627 1029 -0.434617
17 | test 808 1044 -0.606123
18 | test 687 232 -0.404254
19 | test 449 230 -0.481887
20 | test 2 1186 -0.455573
21 | test 504 1138 -0.456215
22 | 8.99user 0.00system 0:09.00elapsed 99%CPU (0avgtext+0avgdata 11108maxresident)k
23 | 0inputs+8outputs (0major+1684minor)pagefaults 0swaps
24 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/floyd.log1:
--------------------------------------------------------------------------------
 1 | main:
 2 |      22, Memory set idiom, loop replaced by call to __c_mset4
 3 |      43, #omp parallel
 4 | time 1.063020 2400
 5 | test 1114 2099 -0.351597
 6 | test 1562 1856 -0.818695
 7 | test 1929 924 -0.409053
 8 | test 2200 2122 -0.236780
 9 | test 450 1605 -0.494496
10 | test 515 1139 -0.526653
11 | test 2125 2166 -0.461825
12 | test 515 1372 -0.472302
13 | test 973 2141 -0.479141
14 | test 773 1809 -0.472171
15 | test 1274 457 -0.303956
16 | test 24 2241 -0.392215
17 | test 2129 375 -0.467766
18 | test 1958 1848 -0.472492
19 | test 1023 1572 -0.650282
20 | test 544 2137 -0.443251
21 | test 1624 58 -0.516323
22 | test 1593 1153 -0.588491
23 | test 982 1746 -0.528510
24 | test 875 1432 -0.458209
25 | 62.96user 0.49system 0:01.08elapsed 5854%CPU (0avgtext+0avgdata 29232maxresident)k
26 | 0inputs+0outputs (162major+6249minor)pagefaults 0swaps
27 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/floyd.log2:
--------------------------------------------------------------------------------
 1 | index(int, int):
 2 |      13, Generating implicit acc routine seq
 3 |          Generating NVIDIA GPU code
 4 | main:
 5 |      22, Memory set idiom, loop replaced by call to __c_mset4
 6 |      43, Generating NVIDIA GPU code
 7 |          46, #pragma acc loop gang, worker(4) /* blockIdx.x threadIdx.y */
 8 |          49, #pragma acc loop vector(32) /* threadIdx.x */
 9 |      43, Generating implicit copy(data[:]) [if not already present]
10 |      49, Loop is parallelizable
11 |          Generating implicit firstprivate(k)
12 | time 0.811786 3600
13 | test 1591 172 -0.763397
14 | test 802 369 -0.453210
15 | test 2535 1922 -0.288441
16 | test 2509 360 -0.270466
17 | test 3035 2689 -0.397025
18 | test 3529 2138 -0.559413
19 | test 3380 3190 -0.562856
20 | test 1650 2709 -0.484510
21 | test 2813 3211 -0.411624
22 | test 3283 3454 -0.454825
23 | test 2055 1613 -0.629674
24 | test 3196 677 -0.532822
25 | test 697 3166 -0.443936
26 | test 2548 3111 -0.529714
27 | test 334 1171 -0.424709
28 | test 1307 1078 -0.537585
29 | test 1343 1261 -0.535192
30 | test 599 278 -0.486341
31 | test 3183 3108 -0.501162
32 | test 3390 1771 -0.395187
33 | 0.85user 0.17system 0:01.04elapsed 98%CPU (0avgtext+0avgdata 156144maxresident)k
34 | 0inputs+0outputs (20major+11584minor)pagefaults 0swaps
35 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/floyd_managed.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <openacc.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | 
10 | #include <algorithm>
11 | using namespace std;
12 | // #define SIZE 1000  -- left disabled: the makefile supplies SIZE on the compiler command line (-DSIZE=<n>)
13 | inline int index(const int i, const int j) { return i * SIZE + j; }  // flat offset of cell (i, j): row i, column j
14 | 
15 | // add your codes begin
16 | 
17 | // add your codes end
18 | 
19 | int main() {  // Builds a random graph, relaxes it over every intermediate vertex k, then prints the time and 20 samples.
20 |     const int size2 = SIZE * SIZE;
21 |     float* data = new float[size2];  // SIZE x SIZE weight matrix stored flat; cells are addressed through index()
22 |     for (int i = 0; i < size2; i++) data[i] = -INF;  // -INF means "no edge set"
23 | 
24 |     srand(SIZE);  // the seed depends only on SIZE
25 |     for (int i = 0; i < SIZE * 20; i++) {  // fill SIZE * 20 distinct off-diagonal cells
26 |         // first coordinate (row)
27 |         int prev = rand() % SIZE;
28 |         // second coordinate (column)
29 |         int next = rand() % SIZE;
30 |         // retry if the cell is on the diagonal or its edge weight has already been set
31 |         if ((prev == next) || (data[index(prev, next)] > -INF)) {
32 |             i--;  // cancels the loop's i++ so the rejected draw does not count
33 |             continue;
34 |         }
35 |         // assign the weight: log of a value in [0.01, 0.99], so it is always negative
36 |         data[index(prev, next)] = log((rand() % 99 + 1.0) / 100);
37 |     }
38 |     double t = omp_get_wtime();  // timing starts here
39 |     // No data clauses are used below; the makefile compiles this file with -gpu=managed.
40 | // add your codes begin
41 | // (disabled) #pragma acc parallel loop
42 | // (disabled) #pragma acc data copy(data[0:size2])
43 |     for (int k = 0; k < SIZE; k++) {  // k runs on the host; the parallel region below is entered once per k
44 |         #pragma acc parallel loop gang worker
45 |         //#pragma acc data copyin(data) copyout(data)
46 |         for (int i = 0; i < SIZE; i++) {
47 |             //#pragma acc parallel loop collapse(2)
48 |             #pragma acc loop vector
49 |             for (int j = 0; j < SIZE; j++) {
50 |                 if (data[index(i, j)] < data[index(i, k)] + data[index(k, j)]) {  // route i -> k -> j beats the current i -> j weight
51 |                     data[index(i, j)] = data[index(i, k)] + data[index(k, j)];  // keep the larger value
52 |                 }
53 |             }
54 |         }
55 |     }
56 | 
57 |     // add your codes end
58 |     t = omp_get_wtime() - t;  // elapsed wall-clock time
59 |     printf("time %f %d\n", t, SIZE);
60 | 
61 |     for (int i = 0; i < 20; i++) {  // print 20 off-diagonal cells chosen by further rand() draws
62 |         int prev = rand() % SIZE;
63 |         int next = rand() % SIZE;
64 |         if (prev == next) {
65 |             i--;  // diagonal draw: retry without counting it
66 |             continue;
67 |         }
68 |         printf("test %d %d %f\n", prev, next, data[index(prev, next)]);
69 |     }
70 | }  // NOTE(review): data is never delete[]d; the process ends right here
71 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/floyd_multicore.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <openacc.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | 
10 | #include <algorithm>
11 | using namespace std;
12 | // #define SIZE 1000  -- left disabled: the makefile supplies SIZE on the compiler command line (-DSIZE=<n>)
13 | inline int index(const int i, const int j) { return i * SIZE + j; }  // flat offset of cell (i, j): row i, column j
14 | 
15 | // add your codes begin
16 | 
17 | // add your codes end
18 | 
19 | int main() {  // Builds a random graph, relaxes it over every intermediate vertex k, then prints the time and 20 samples.
20 |     const int size2 = SIZE * SIZE;
21 |     float* data = new float[size2];  // SIZE x SIZE weight matrix stored flat; cells are addressed through index()
22 |     for (int i = 0; i < size2; i++) data[i] = -INF;  // -INF means "no edge set"
23 | 
24 |     srand(SIZE);  // the seed depends only on SIZE
25 |     for (int i = 0; i < SIZE * 20; i++) {  // fill SIZE * 20 distinct off-diagonal cells
26 |         // first coordinate (row)
27 |         int prev = rand() % SIZE;
28 |         // second coordinate (column)
29 |         int next = rand() % SIZE;
30 |         // retry if the cell is on the diagonal or its edge weight has already been set
31 |         if ((prev == next) || (data[index(prev, next)] > -INF)) {
32 |             i--;  // cancels the loop's i++ so the rejected draw does not count
33 |             continue;
34 |         }
35 |         // assign the weight: log of a value in [0.01, 0.99], so it is always negative
36 |         data[index(prev, next)] = log((rand() % 99 + 1.0) / 100);
37 |     }
38 |     double t = omp_get_wtime();  // timing starts here
39 |     // For each k, the i rows are handed out to 60 OpenMP threads with dynamic scheduling.
40 |     // add your codes begin
41 |     for (int k = 0; k < SIZE; k++) {  // k itself stays sequential
42 | #pragma omp parallel for num_threads(60) schedule(dynamic)
43 |         for (int i = 0; i < SIZE; i++) {
44 |             for (int j = 0; j < SIZE; j++) {
45 |                 if (data[index(i, k)] == -INF || data[index(k, j)] == -INF)  // one leg through k is still unset
46 |                     continue;
47 |                 if (data[index(i, j)] < data[index(i, k)] + data[index(k, j)]) {  // route i -> k -> j beats the current i -> j weight
48 |                     data[index(i, j)] = data[index(i, k)] + data[index(k, j)];  // keep the larger value
49 |                 }
50 |             }
51 |         }
52 |     }
53 | 
54 |     // add your codes end
55 |     t = omp_get_wtime() - t;  // elapsed wall-clock time
56 |     printf("time %f %d\n", t, SIZE);
57 | 
58 |     for (int i = 0; i < 20; i++) {  // print 20 off-diagonal cells chosen by further rand() draws
59 |         int prev = rand() % SIZE;
60 |         int next = rand() % SIZE;
61 |         if (prev == next) {
62 |             i--;  // diagonal draw: retry without counting it
63 |             continue;
64 |         }
65 |         printf("test %d %d %f\n", prev, next, data[index(prev, next)]);
66 |     }
67 | }  // NOTE(review): data is never delete[]d; the process ends right here
68 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/floyd_multidevice.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <openacc.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | 
10 | #include <algorithm>
11 | using namespace std;
12 | // #define SIZE 1000  -- left disabled: the makefile supplies SIZE on the compiler command line (-DSIZE=<n>)
13 | inline int index(const int i, const int j) { return i * SIZE + j; }  // flat offset of cell (i, j): row i, column j
14 | 
15 | // add your codes begin
16 | 
17 | // add your codes end
18 | 
19 | int main() {
20 |     // Allocate the SIZE x SIZE weight matrix and mark every cell as "no edge".
21 |     const int size2 = SIZE * SIZE;
22 |     float* data = new float[size2];
23 |     std::fill(data, data + size2, -INF);
24 | 
25 |     // Insert SIZE * 20 random edges; the seed depends only on SIZE.
26 |     srand(SIZE);
27 |     int placed = 0;
28 |     while (placed < SIZE * 20) {
29 |         const int prev = rand() % SIZE;  // row
30 |         const int next = rand() % SIZE;  // column
31 |         // Diagonal cells and cells that already hold a weight are drawn again.
32 |         const bool usable = (prev != next) && !(data[index(prev, next)] > -INF);
33 |         if (usable) {
34 |             data[index(prev, next)] = log((rand() % 99 + 1.0) / 100);
35 |             ++placed;
36 |         }
37 |     }
38 |     double t = omp_get_wtime();
39 | 
40 |     // add your codes begin
41 |     for (int k = 0; k < SIZE; ++k) {
42 |         for (int i = 0; i < SIZE; ++i) {
43 |             for (int j = 0; j < SIZE; ++j) {
44 |                 // Keep whichever is larger: the current weight or the route through k.
45 |                 float& cell = data[index(i, j)];
46 |                 const float viaK = data[index(i, k)] + data[index(k, j)];
47 |                 if (cell < viaK) cell = viaK;
48 |             }
49 |         }
50 |     }
51 |     // add your codes end
52 |     t = omp_get_wtime() - t;
53 |     printf("time %f %d\n", t, SIZE);
54 | 
55 |     // Print 20 off-diagonal samples, continuing the same rand() sequence.
56 |     int shown = 0;
57 |     while (shown < 20) {
58 |         const int prev = rand() % SIZE;
59 |         const int next = rand() % SIZE;
60 |         if (prev == next) continue;
61 |         printf("test %d %d %f\n", prev, next, data[index(prev, next)]);
62 |         ++shown;
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/floyd_optimize.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <openacc.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | 
10 | #include <algorithm>
11 | using namespace std;
12 | // #define SIZE 1000  -- left disabled: the makefile supplies SIZE on the compiler command line (-DSIZE=<n>)
13 | inline int index(const int i, const int j) { return i * SIZE + j; }  // flat offset of cell (i, j): row i, column j
14 | 
15 | // add your codes begin
16 | 
17 | // add your codes end
18 | 
19 | int main() {  // Builds a random graph, relaxes it over every intermediate vertex k, then prints the time and 20 samples.
20 |     const int size2 = SIZE * SIZE;
21 |     float* data = new float[size2];  // SIZE x SIZE weight matrix stored flat; cells are addressed through index()
22 |     for (int i = 0; i < size2; i++) data[i] = -INF;  // -INF means "no edge set"
23 | 
24 |     srand(SIZE);  // the seed depends only on SIZE
25 |     for (int i = 0; i < SIZE * 20; i++) {  // fill SIZE * 20 distinct off-diagonal cells
26 |         // first coordinate (row)
27 |         int prev = rand() % SIZE;
28 |         // second coordinate (column)
29 |         int next = rand() % SIZE;
30 |         // retry if the cell is on the diagonal or its edge weight has already been set
31 |         if ((prev == next) || (data[index(prev, next)] > -INF)) {
32 |             i--;  // cancels the loop's i++ so the rejected draw does not count
33 |             continue;
34 |         }
35 |         // assign the weight: log of a value in [0.01, 0.99], so it is always negative
36 |         data[index(prev, next)] = log((rand() % 99 + 1.0) / 100);
37 |     }
38 |     double t = omp_get_wtime();  // timing starts here
39 |     // The acc data region wraps the whole k loop, with copy(data[0:size2]) naming the full matrix.
40 |     // add your codes begin
41 |     #pragma acc data copy(data[0:size2])
42 |     for (int k = 0; k < SIZE; k++) {  // k runs on the host; the parallel region below is entered once per k
43 |         #pragma acc parallel loop gang worker num_workers(4) vector_length(128)
44 |         for (int i = 0; i < SIZE; i++) {
45 |             #pragma acc loop vector
46 |             for (int j = 0; j < SIZE; j++) {
47 |                 if (data[index(i,j)] < data[index(i,k)] + data[index(k,j)]) {  // route i -> k -> j beats the current i -> j weight
48 |                     data[index(i,j)] = data[index(i,k)] + data[index(k,j)];  // keep the larger value
49 |                 }
50 |             }
51 |         }
52 |     }
53 | 
54 |     // add your codes end
55 |     t = omp_get_wtime() - t;  // elapsed wall-clock time
56 |     printf("time %f %d\n", t, SIZE);
57 | 
58 |     for (int i = 0; i < 20; i++) {  // print 20 off-diagonal cells chosen by further rand() draws
59 |         int prev = rand() % SIZE;
60 |         int next = rand() % SIZE;
61 |         if (prev == next) {
62 |             i--;  // diagonal draw: retry without counting it
63 |             continue;
64 |         }
65 |         printf("test %d %d %f\n", prev, next, data[index(prev, next)]);
66 |     }
67 | }  // NOTE(review): data is never delete[]d; the process ends right here
68 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/floyd_serial.cpp:
--------------------------------------------------------------------------------
 1 | #define INF 1e7
 2 | 
 3 | #include <assert.h>
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <openacc.h>
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | 
10 | #include <algorithm>
11 | using namespace std;
12 | // #define SIZE 1000  -- left disabled: the makefile supplies SIZE on the compiler command line (-DSIZE=<n>)
13 | inline int index(const int i, const int j) { return i * SIZE + j; }  // flat offset of cell (i, j): row i, column j
14 | 
15 | // add your codes begin
16 | 
17 | // add your codes end
18 | 
19 | int main() {
20 |     // Allocate the SIZE x SIZE weight matrix and mark every cell as "no edge".
21 |     const int size2 = SIZE * SIZE;
22 |     float* data = new float[size2];
23 |     std::fill(data, data + size2, -INF);
24 | 
25 |     // Insert SIZE * 20 random edges; the seed depends only on SIZE.
26 |     srand(SIZE);
27 |     int placed = 0;
28 |     while (placed < SIZE * 20) {
29 |         const int prev = rand() % SIZE;  // row
30 |         const int next = rand() % SIZE;  // column
31 |         // Diagonal cells and cells that already hold a weight are drawn again.
32 |         const bool usable = (prev != next) && !(data[index(prev, next)] > -INF);
33 |         if (usable) {
34 |             data[index(prev, next)] = log((rand() % 99 + 1.0) / 100);
35 |             ++placed;
36 |         }
37 |     }
38 |     double t = omp_get_wtime();
39 | 
40 |     // add your codes begin
41 |     for (int k = 0; k < SIZE; ++k) {
42 |         for (int i = 0; i < SIZE; ++i) {
43 |             for (int j = 0; j < SIZE; ++j) {
44 |                 // Keep whichever is larger: the current weight or the route through k.
45 |                 float& cell = data[index(i, j)];
46 |                 const float viaK = data[index(i, k)] + data[index(k, j)];
47 |                 if (cell < viaK) cell = viaK;
48 |             }
49 |         }
50 |     }
51 |     // add your codes end
52 |     t = omp_get_wtime() - t;
53 |     printf("time %f %d\n", t, SIZE);
54 | 
55 |     // Print 20 off-diagonal samples, continuing the same rand() sequence.
56 |     int shown = 0;
57 |     while (shown < 20) {
58 |         const int prev = rand() % SIZE;
59 |         const int next = rand() % SIZE;
60 |         if (prev == next) continue;
61 |         printf("test %d %d %f\n", prev, next, data[index(prev, next)]);
62 |         ++shown;
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/openacc/lab-floyd/lab-floyd.2023-05-24.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openacc/lab-floyd/lab-floyd.2023-05-24.tgz


--------------------------------------------------------------------------------
/openacc/lab-floyd/makefile:
--------------------------------------------------------------------------------
 1 | # Each variant target compiles its own floyd_*.cpp with its own -DSIZE/flags into floyd.exeN, logs to floyd.logN, then runs it under time.
 2 | .PHONY: device serial multicore managed optimize multidevice all clean
 3 | device:
 4 | 	nvaccelinfo
 5 | 
 6 | serial:
 7 | 	nvc++ -o floyd.exe0 -DSIZE=1200 -Minfo=all -Mneginfo=all floyd_serial.cpp >floyd.log0 2>&1
 8 | 	#nsys profile -o floyd.prof0.nsys-rep -t openmp,openacc,cuda ./floyd.exe0 >>floyd.log0 2>&1
 9 | 	timeout 1m time ./floyd.exe0 >>floyd.log0 2>&1
10 | 
11 | multicore:
12 | 	nvc++ -o floyd.exe1 -DSIZE=2400 -mp=multicore -acc=multicore -Minfo=all -Mneginfo=all floyd_multicore.cpp >floyd.log1 2>&1
13 | 	#nsys profile -o floyd.prof1.nsys-rep -t openmp,openacc,cuda ./floyd.exe1 >>floyd.log1 2>&1
14 | 	timeout 1m time ./floyd.exe1 >>floyd.log1 2>&1
15 | 
16 | managed:
17 | 	nvc++ -o floyd.exe2 -DSIZE=3600 -mp=multicore -acc=gpu -gpu=managed -Minfo=all -Mneginfo=all floyd_managed.cpp >floyd.log2 2>&1
18 | 	#nsys profile -o floyd.prof2.nsys-rep -t openmp,openacc,cuda ./floyd.exe2 >>floyd.log2 2>&1
19 | 	timeout 1m time ./floyd.exe2 >>floyd.log2 2>&1
20 | 
21 | optimize:
22 | 	nvc++ -o floyd.exe3 -DSIZE=4800 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd_optimize.cpp >floyd.log3 2>&1
23 | 	#nsys profile -o floyd.prof3.nsys-rep -t openmp,openacc,cuda ./floyd.exe3 >>floyd.log3 2>&1
24 | 	timeout 1m time ./floyd.exe3 >>floyd.log3 2>&1
25 | 
26 | multidevice:
27 | 	nvc++ -o floyd.exe4 -DSIZE=6000 -mp=multicore -acc=gpu -Minfo=all -Mneginfo=all floyd_multidevice.cpp >floyd.log4 2>&1
28 | 	#nsys profile -o floyd.prof4.nsys-rep -t openmp,openacc,cuda ./floyd.exe4 >>floyd.log4 2>&1
29 | 	timeout 1m time ./floyd.exe4 >>floyd.log4 2>&1
30 | 
31 | all: clean serial multicore managed optimize multidevice
32 | 
33 | clean:
34 | 	rm -f floyd.exe* floyd.prof* floyd.log*


--------------------------------------------------------------------------------
/openacc/lab-floyd/run.sh:
--------------------------------------------------------------------------------
 1 | export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.3/compilers/bin:$PATH  # put the NVIDIA HPC SDK 23.3 compilers first on PATH
 2 | # device.sh is sourced (not executed) so that its CUDA_VISIBLE_DEVICES export reaches the make runs below
 3 | source device.sh 1
 4 | #source device.sh 8
 5 | make clean
 6 | # every enabled build-and-run step is timed and capped at one minute
 7 | timeout 1m time make serial
 8 | timeout 1m time make multicore
 9 | timeout 1m time make managed
10 | # timeout 1m time make optimize
11 | # timeout 1m time make multidevice
12 | 
13 | 


--------------------------------------------------------------------------------
/openacc/note.md:
--------------------------------------------------------------------------------
 1 | # OpenACC
 2 | 语法
 3 | 
 4 | ```c++
 5 | #pragma acc parallel
 6 | 会产生一个或者多个gang并行执行代码
 7 | #pragma acc loop
 8 | 提示编译器：紧随其后的循环需要并行地执行
 9 | 
10 | 编译选项 -ta=tesla:managed：启用 CUDA 统一内存（managed memory），由运行时自动在 CPU 与 GPU 之间迁移数据
11 | 
12 | kernels 与 parallel 的区别：使用 kernels 时由编译器分析并保证并行的正确性；使用 parallel 时则由程序员自己保证
13 | #pragma acc kernels
14 | {
15 |     for(int i=0; i<N; i++)
16 |     {
17 |         x[i] = 1.0;
18 |         y[i] = 2.0;
19 |     }
20 |     for(int i=0; i<N; i++)
21 |     {
22 |         y[i] = a*x[i] + y[i];
23 |     }
24 | }
25 | ```
26 | 
27 | ![](./image/clauses.png)
28 | 对于通过指针访问的数组，必须在数据子句中指定范围，格式为 数组名[起始下标:元素个数]
29 | ```c++
30 | #pragma acc data copyin(a[0:nelem]) copyout(b[s/4:3*s/4])
31 | 
32 | #pragma acc data
33 | ```
34 | data指令定义了一个代码区域，其中GPU数组保留在GPU上，并在该区域的所有内核之间共享。
35 | 
36 | 在 CPU 上修改向量之后，需要把它同步更新到 GPU 上。
37 | ```c++
38 | #pragma acc update device(v.coefs[:v.n])
39 | ```
40 | 可以在 loop 后面增加 gang、worker、vector 子句；它们的数量则通过 parallel 指令上的下列子句来指定
41 | ```c++
42 | num_gangs(n), num_workers(n), vector_length(n)
43 | ```
44 | collapse可用于加速嵌套循环
45 | ```c++
46 | #pragma acc parallel loop collapse(2)
47 | ```


--------------------------------------------------------------------------------
/openmp/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "files.associations": {
 3 |         "any": "cpp",
 4 |         "array": "cpp",
 5 |         "atomic": "cpp",
 6 |         "barrier": "cpp",
 7 |         "bit": "cpp",
 8 |         "*.tcc": "cpp",
 9 |         "bitset": "cpp",
10 |         "cctype": "cpp",
11 |         "cfenv": "cpp",
12 |         "charconv": "cpp",
13 |         "chrono": "cpp",
14 |         "cinttypes": "cpp",
15 |         "clocale": "cpp",
16 |         "cmath": "cpp",
17 |         "codecvt": "cpp",
18 |         "compare": "cpp",
19 |         "complex": "cpp",
20 |         "concepts": "cpp",
21 |         "condition_variable": "cpp",
22 |         "coroutine": "cpp",
23 |         "csetjmp": "cpp",
24 |         "csignal": "cpp",
25 |         "cstdarg": "cpp",
26 |         "cstddef": "cpp",
27 |         "cstdint": "cpp",
28 |         "cstdio": "cpp",
29 |         "cstdlib": "cpp",
30 |         "cstring": "cpp",
31 |         "ctime": "cpp",
32 |         "cuchar": "cpp",
33 |         "cwchar": "cpp",
34 |         "cwctype": "cpp",
35 |         "deque": "cpp",
36 |         "forward_list": "cpp",
37 |         "list": "cpp",
38 |         "map": "cpp",
39 |         "set": "cpp",
40 |         "string": "cpp",
41 |         "unordered_map": "cpp",
42 |         "unordered_set": "cpp",
43 |         "vector": "cpp",
44 |         "exception": "cpp",
45 |         "algorithm": "cpp",
46 |         "functional": "cpp",
47 |         "iterator": "cpp",
48 |         "memory": "cpp",
49 |         "memory_resource": "cpp",
50 |         "numeric": "cpp",
51 |         "optional": "cpp",
52 |         "random": "cpp",
53 |         "ratio": "cpp",
54 |         "regex": "cpp",
55 |         "source_location": "cpp",
56 |         "string_view": "cpp",
57 |         "system_error": "cpp",
58 |         "tuple": "cpp",
59 |         "type_traits": "cpp",
60 |         "utility": "cpp",
61 |         "fstream": "cpp",
62 |         "future": "cpp",
63 |         "initializer_list": "cpp",
64 |         "iomanip": "cpp",
65 |         "iosfwd": "cpp",
66 |         "iostream": "cpp",
67 |         "istream": "cpp",
68 |         "latch": "cpp",
69 |         "limits": "cpp",
70 |         "mutex": "cpp",
71 |         "new": "cpp",
72 |         "numbers": "cpp",
73 |         "ostream": "cpp",
74 |         "ranges": "cpp",
75 |         "scoped_allocator": "cpp",
76 |         "semaphore": "cpp",
77 |         "shared_mutex": "cpp",
78 |         "span": "cpp",
79 |         "sstream": "cpp",
80 |         "stdexcept": "cpp",
81 |         "stop_token": "cpp",
82 |         "streambuf": "cpp",
83 |         "syncstream": "cpp",
84 |         "thread": "cpp",
85 |         "typeindex": "cpp",
86 |         "typeinfo": "cpp",
87 |         "valarray": "cpp",
88 |         "variant": "cpp"
89 |     }
90 | }


--------------------------------------------------------------------------------
/openmp/.vscode/tasks.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "tasks": [
 3 |         {
 4 |             "type": "cppbuild",
 5 |             "label": "C/C++: g++ 生成活动文件",
 6 |             "command": "/usr/bin/g++",
 7 |             "args": [
 8 |                 "-fopenmp",
 9 |                 "-fdiagnostics-color=always",
10 |                 "-g",
11 |                 "${file}",
12 |                 "-o",
13 |                 "${fileDirname}/build/${fileBasenameNoExtension}"
14 |             ],
15 |             "options": {
16 |                 "cwd": "${fileDirname}"
17 |             },
18 |             "problemMatcher": [
19 |                 "$gcc"
20 |             ],
21 |             "group": {
22 |                 "kind": "build",
23 |                 "isDefault": true
24 |             },
25 |             "detail": "调试器生成的任务。"
26 |         }
27 |     ],
28 |     "version": "2.0.0"
29 | }


--------------------------------------------------------------------------------
/openmp/assign_prime/build/prime:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/build/prime


--------------------------------------------------------------------------------
/openmp/assign_prime/build/prime_old:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/build/prime_old


--------------------------------------------------------------------------------
/openmp/assign_prime/prime:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/prime


--------------------------------------------------------------------------------
/openmp/assign_prime/prime.cpp:
--------------------------------------------------------------------------------
 1 | // student name: 宋家庆
 2 | // id number: 202000130061
 3 | 
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <stdlib.h>
 7 | 
 8 | #include <algorithm>
 9 | #include <iostream>
10 | #include <vector>
11 | using namespace std;
12 | 
13 | // add your codes begin
14 | 
15 | // #define SIZE 10000000
16 | 
17 | int cnt;
18 | bool st[SIZE];
19 | bool is_prime[SIZE];
20 | 
21 | // add your codes end
22 | 
23 | int main() {
24 |     vector<long> prime;
25 | 
26 |     double t = omp_get_wtime();
27 |     omp_set_num_threads(30);
28 |     // 根据N的不同大小需要调整
29 |     int sz = SIZE;
30 |     if (SIZE >= 100000000) {
31 |         sz /= 15;
32 |     }
33 |     prime.resize(sz);
34 |     // 埃式筛法+数理推导
35 |     for (int i = 2; i <= (int)sqrt(SIZE); i++) {
36 |         if (!st[i]) {
37 |             // prime.push_back(i);
38 | #pragma omp parallel for
39 |             for (int j = i + i; j <= SIZE; j += i) {
40 |                 if (!st[j]) st[j] = true;
41 |             }
42 |         }
43 |     }
44 |     st[0] = st[1] = 1;
45 |     int cnt = 0;
46 | #pragma omp parallel
47 |     {
48 |         int id, i, nthrds;
49 |         id = omp_get_thread_num();
50 |         nthrds = omp_get_num_threads();
51 |         vector<int> res;
52 |         int num = SIZE;
53 |         if (id != 2 && id % 2 == 0 && nthrds % 2 == 0) {
54 |         } else {
55 |             for (i = id; i <= SIZE; i += nthrds) {
56 |                 if (st[i] == 0) {
57 |                     res.push_back(i);
58 |                 }
59 |             }
60 |         }
61 | 
62 | #pragma omp critical
63 |         for (int j = 0; j < res.size(); j++) {
64 |             prime[cnt++] = res[j];
65 |         }
66 |     }
67 |     prime.resize(cnt);
68 | 
69 |     // add your codes end
70 |     t = omp_get_wtime() - t;
71 |     printf("time %f %ld\n", t, long(SIZE));
72 | 
73 |     printf("prime");
74 |     sort(prime.begin(), prime.end());
75 |     for (long i = 0; i < prime.size(); i++) printf(" %ld", prime[i]);
76 |     printf("\nsize %ld\n", prime.size());
77 | }
78 | 


--------------------------------------------------------------------------------
/openmp/assign_prime/prime_solution:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/prime_solution


--------------------------------------------------------------------------------
/openmp/assign_prime/res.log:
--------------------------------------------------------------------------------
 1 | 1c1
 2 | < time 9.164585 100000000
 3 | ---
 4 | > time 0.485323 100000000
 5 | 4c4
 6 | < time 7.479427 100000000
 7 | ---
 8 | > time 0.256331 100000000
 9 | 7c7
10 | < time 7.607437 100000000
11 | ---
12 | > time 0.533640 100000000
13 | 10c10
14 | < time 8.163960 100000000
15 | ---
16 | > time 0.668524 100000000
17 | 13c13
18 | < time 13.144602 100000000
19 | ---
20 | > time 0.261871 100000000
21 | 


--------------------------------------------------------------------------------
/openmp/assign_prime/run.sh:
--------------------------------------------------------------------------------
# Build prime.cpp with OpenMP for the given problem size, then run the binary
# under `time` with a 60-second limit. The run is skipped if the build fails.
problem_size=100000000

g++ -fopenmp -DSIZE="$problem_size" -o prime prime.cpp \
    && timeout 60s time ./prime #> testmy.log
4 | 
5 | 


--------------------------------------------------------------------------------
/openmp/assign_prime/temp/assign_01_prime/build/prime:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/assign_01_prime/build/prime


--------------------------------------------------------------------------------
/openmp/assign_prime/temp/assign_01_prime/build/primeSerial:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/assign_01_prime/build/primeSerial


--------------------------------------------------------------------------------
/openmp/assign_prime/temp/assign_01_prime/build/primexyy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/assign_01_prime/build/primexyy


--------------------------------------------------------------------------------
/openmp/assign_prime/temp/assign_01_prime/prime.cpp:
--------------------------------------------------------------------------------
  1 | // student 宋家庆:
  2 | // id 202000130061:
  3 | 
  4 | #include <math.h>
  5 | #include <omp.h>
  6 | #include <stdio.h>
  7 | 
  8 | #include <vector>
  9 | using namespace std;
 10 | 
 11 | // add your codes begin
 12 | #define SIZE 10
 13 | #define N 100000000
 14 | 
 15 | int primes[N], cnt;
 16 | bool st[N];
 17 | 
 18 | bool is_prime[N];
 19 | 
 20 | // add your codes end
 21 | bool check1(int x) {
 22 |     if (x < 2) return false;
 23 |     if (x > 10 && x % 10 == 5) return false;
 24 |     for (int i = 2; i <= x / i; ++i) {
 25 |         if (x % i == 0) {
 26 |             return false;
 27 |         }
 28 |     }
 29 |     return true;
 30 | }
 31 | 
 32 | bool check2(int x) {
 33 |     if (x < 2) return false;
 34 |     int nthread = SIZE;
 35 |     int flag = 0;
 36 |     omp_set_num_threads(SIZE);
 37 | #pragma omp parallel
 38 |     {
 39 |         int id = omp_get_thread_num();
 40 |         if (id == 0) nthread = omp_get_num_threads();
 41 |         int fg = 0;
 42 |         for (int i = 2; i <= x / i; i += nthread) {
 43 |             if (x % i == 0) {
 44 |                 fg++;
 45 |                 break;
 46 |             }
 47 |         }
 48 | #pragma omp atomic
 49 |         flag += fg;
 50 |     }
 51 |     if (flag) {
 52 |         return false;
 53 |     }
 54 | 
 55 |     return true;
 56 | }
 57 | 
 58 | bool check3(int x) {
 59 |     if (x < 2) return false;
 60 |     if (x > 10 && x % 10 == 5) return false;
 61 |     for (size_t i = 0; i < x; i++) {
 62 |         /* code */
 63 |     }
 64 |     return false;
 65 | }
 66 | 
 67 | int main() {
 68 |     double pi;
 69 | 
 70 |     double t = omp_get_wtime();
 71 |     // add your codes begin
 72 | 
 73 |     omp_set_num_threads(SIZE);
 74 | 
 75 |     int count = 0;
 76 |     int nthread = SIZE;
 77 |     // 普通筛法
 78 |     //     for (int i = 2; i <= N; i++) {
 79 |     //         if (!st[i]) primes[cnt++] = i;  // 把素数存起来
 80 |     // #pragma omp for
 81 |     //         for (int j = i; j <= N;
 82 |     //              j += i) {  // 不管是合数还是质数，都用来筛掉后面它的倍数
 83 |     //             if (!st[j]) st[j] = true;
 84 |     //         }
 85 |     //     }
 86 |     
 87 |     for (long i = 2; i * i <= N; i++) {
 88 | #pragma omp parallel for
 89 |         for (int j = i * i; j <= N; j = j + i) {
 90 |             if (st[j] == 0) st[j] = 1;
 91 |         }
 92 |     }
 93 | 
 94 |     // 埃式筛法
 95 |     //     for (int i = 2; i <= N; i++) {
 96 |     //         if (!st[i]) {
 97 |     //             primes[cnt++] = i;
 98 |     // #pragma omp for
 99 |     //             for (int j = i; j <= N; j += i)
100 |     //                 if (!st[j]) st[j] = true;  //
101 |     //                 可以用质数就把所有的合数都筛掉；
102 |     //         }
103 |     //     }
104 |     // 线性筛法
105 |     // for (int i = 2; i <= N; i++) {
106 |     //     if (!st[i]) primes[cnt++] = i;
107 |     //     for (int j = 0; primes[j] <= N / i; j++) {
108 |     //         st[primes[j] * i] = true;
109 |     //         if (i % primes[j] == 0) break;
110 |     //     }
111 |     // }
112 |     // #pragma omp parallel
113 |     //     {
114 |     //         int id = omp_get_thread_num();
115 |     //         if (id == 0) nthread = omp_get_num_threads();
116 |     // 		int cnt = 0;
117 | 
118 |     //         for (int i = id; i < N; i += nthread) {
119 |     //             if (i % 2 == 0) continue;
120 |     //             if (check1(i)){
121 |     // 				cnt++;
122 |     // 			}
123 |     //         }
124 |     // 		#pragma omp atomic
125 |     //             count+=cnt;
126 |     //     }
127 |     for (int i = 2; i <= N; i++) {
128 |         if (!st[i]) {
129 |             cnt++;
130 |         }
131 |     }
132 |     t = omp_get_wtime() - t;
133 |     printf("time %f %d\n", t, SIZE);
134 | 
135 |     printf("%d\n", cnt);
136 | }
137 | 


--------------------------------------------------------------------------------
/openmp/assign_prime/temp/assign_01_prime/primeSerial.cpp:
--------------------------------------------------------------------------------
 1 | // student 宋家庆:
 2 | // id 202000130061:
 3 | 
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | #define SIZE 25
11 | #define N 100000000
12 | // add your codes end
// Serial trial-division primality test: returns true when x is prime.
bool check(int x) {
    if (x < 2) return false;

    // A composite x has a divisor d with d <= x / d, so stop there.
    int d = 2;
    while (d <= x / d) {
        if (x % d == 0) return false;
        ++d;
    }
    return true;
}
22 | 
23 | int main() {
24 |     double pi;
25 | 
26 |     double t = omp_get_wtime();
27 |     // add your codes begin
28 | 
29 |     omp_set_num_threads(SIZE);
30 |     int count = 0;
31 |     for (int i = 0; i < N; i++) {
32 |         if (check(i)) count++;
33 |     }
34 |     t = omp_get_wtime() - t;
35 |     printf("time %f %d\n", t, SIZE);
36 | 
37 |     printf("%d\n", count);
38 | }
39 | 


--------------------------------------------------------------------------------
/openmp/assign_prime/temp/build/prime:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/build/prime


--------------------------------------------------------------------------------
/openmp/assign_prime/temp/prime:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/prime


--------------------------------------------------------------------------------
/openmp/assign_prime/temp/prime.cpp:
--------------------------------------------------------------------------------
  1 | // // student name:
  2 | // // id number:
  3 | 
  4 | // #include <math.h>
  5 | // #include <omp.h>
  6 | // #include <stdlib.h>
  7 | 
  8 | // #include <algorithm>
  9 | // #include <iostream>
 10 | // #include <vector>
 11 | // using namespace std;
 12 | 
 13 | // // add your codes begin
 14 | 
 15 | // // #define SIZE 100000000
 16 | 
 17 | // int cnt;
 18 | // bool st[SIZE];
 19 | // bool is_prime[SIZE];
 20 | 
 21 | // // add your codes end
 22 | 
 23 | // int main() {
 24 | //     vector<long> prime;
 25 | 
 26 | //     double t = omp_get_wtime();
 27 | //     omp_set_num_threads(25);
 28 | //     // add your codes begin
 29 | //     // 普通筛法
 30 | //     // for (int i = 2; i < SIZE; i++) {
 31 | //     //     if (!st[i]) prime.push_back(i);
 32 | 
 33 | //     //     for (int j = i; j < SIZE; j += i) {
 34 | //     //         if (!st[j]) st[j] = true;
 35 | //     //     }
 36 | //     // }
 37 | 
 38 | //     // 数理推导
 39 | // //     for (int i = 2; i * i <= SIZE; i++) {
 40 | // // // if (!st[i]) prime.push_back(i);
 41 | // // #pragma omp paraller for
 42 | // //         for (int j = i + i; j <= SIZE; j += i) {
 43 | // //             if (st[j] == 0) st[j] = 1;
 44 | // //         }
 45 | // //     }
 46 | 
 47 | //     for (long i = 2; i * i <= SIZE; i++) {
 48 | // #pragma omp parallel for
 49 | //         for (int j = i * i; j <= SIZE; j = j + i) {
 50 | //             if (st[j] == 0) st[j] = 1;
 51 | //         }
 52 | //     }
 53 | 
 54 | //     // 埃式筛法
 55 | //     // for (int i = 2; i <= SIZE; i++) {
 56 | //     //     if (!st[i]) {
 57 | //     //         // prime.push_back(i);
 58 | //     //         #pragma omp parallel for
 59 | //     //         for (int j = i; j < SIZE; j += i) {
 60 | //     //             if (!st[j]) st[j] = true;
 61 | //     //         }
 62 | //     //     }
 63 | //     // }
 64 | //     // 线性筛法
 65 | //     // for (int i = 2; i < SIZE; i++) {
 66 | //     //     if (!st[i]) {
 67 | //     //         prime.push_back(i);
 68 | //     //     }
 69 | //     //     for (int j = 0; prime[j] * i < SIZE && j < prime.size(); j++) {
 70 | //     //         st[prime[j] * i] = true;
 71 | //     //         if (i % prime[j] == 0) break;
 72 | //     //     }
 73 | //     // }
 74 | 
 75 | //     // add your codes end
 76 | //     t = omp_get_wtime() - t;
 77 | //     printf("time %f %ld\n", t, long(SIZE));
 78 | 
 79 | //     printf("prime");
 80 | //     sort(prime.begin(), prime.end());
 81 | //     // for (long i = 0; i < prime.size(); i++) printf(" %ld", prime[i]);
 82 | //     printf("\nsize %ld\n", prime.size());
 83 | // }
 84 | // student name:颜恺楠
 85 | // id number:202000130203
 86 | 
 87 | #include <math.h>
 88 | #include <omp.h>
 89 | #include <stdlib.h>
 90 | 
 91 | #include <algorithm>
 92 | #include <iostream>
 93 | #include <vector>
 94 | using namespace std;
 95 | 
 96 | // add your codes begin
 97 | // add your codes end
 98 | bool check[SIZE + 1];
 99 | 
100 | int main() {
101 |     vector<long> prime;
102 | 
103 |     double t = omp_get_wtime();
104 | 
105 |     // add your codes begin
106 |     // vector<int> check(SIZE + 1, 0);
107 |     for (long i = 2; i * i <= SIZE; i++) {
108 |         // if (check[i] != 0 || i&1==0 || (i%6!=1 && i%6!=5))
109 |         /* if (check[i] != 0)
110 |           continue; */
111 |         if (!check[i])  //! check[i]==0表示是素数
112 |         {
113 | // prime.push_back(i);
114 | #pragma parallel omp for
115 |             for (long j = i + i; j <= SIZE; j += i)
116 |                 if (!check[j]) check[j] = 1;
117 |         }
118 |     }
119 |     // #pragma parallel omp for
120 |     for (long i = 2; i <= SIZE; i++)
121 |         if (!check[i]) prime.push_back(i);
122 |     // add your codes end
123 | 
124 |     t = omp_get_wtime() - t;
125 |     printf("time %f %ld\n", t, long(SIZE));
126 | 
127 |     printf("prime");
128 |     sort(prime.begin(), prime.end());
129 |     // for (long i = 0; i < prime.size(); i++)
130 |     //   printf(" %ld", prime[i]);
131 |     printf("\nsize %ld\n", prime.size());
132 | }
133 | 


--------------------------------------------------------------------------------
/openmp/assign_prime/temp/prime_solution:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_prime/temp/prime_solution


--------------------------------------------------------------------------------
/openmp/assign_prime/temp/run.sh:
--------------------------------------------------------------------------------
# Build prime.cpp with OpenMP for the given problem size, then run the binary
# under `time` with a 60-second limit. The run is skipped if the build fails.
problem_size=100000000

g++ -fopenmp -DSIZE="$problem_size" -o prime prime.cpp \
    && timeout 60s time ./prime
4 | 
5 | 


--------------------------------------------------------------------------------
/openmp/assign_prime/temp/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # if [ -f test.log ] ; then
 4 | #     cat test.log | grep time
 5 | #     exit
 6 | # fi
 7 | 
 8 | for i in 0 1 2 3 4 ; do
 9 |     ./prime_solution
10 | done | tee test.log | grep time
11 | 
12 | 


--------------------------------------------------------------------------------
/openmp/assign_prime/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ -f test.log ] ; then
 4 |     cat test.log | grep time
 5 |     exit
 6 | fi
 7 | 
 8 | for i in 0 1 2 3 4 ; do
 9 |     ./prime_solution
10 | done | tee test.log | grep time
11 | 
12 | 


--------------------------------------------------------------------------------
/openmp/assign_prime/testmy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [ -f testmy.log ] ; then
 4 |     cat testmy.log | grep time
 5 |     exit
 6 | fi
 7 | 
 8 | for i in 0 1 2 3 4 ; do
 9 |     ./prime
10 | done | tee testmy.log | grep time
11 | 
12 | 


--------------------------------------------------------------------------------
/openmp/assign_sort/assign_sort.2023-03-29.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/assign_sort.2023-03-29.tgz


--------------------------------------------------------------------------------
/openmp/assign_sort/build/build/tp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/build/build/tp


--------------------------------------------------------------------------------
/openmp/assign_sort/build/build/tp2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/build/build/tp2


--------------------------------------------------------------------------------
/openmp/assign_sort/build/build/tp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/build/build/tp3


--------------------------------------------------------------------------------
/openmp/assign_sort/build/sort_radix:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/build/sort_radix


--------------------------------------------------------------------------------
/openmp/assign_sort/build/sort_radix.cpp:
--------------------------------------------------------------------------------
 1 | // student name: 宋家庆
 2 | // id number: 202000130061
 3 | 
 4 | #define CUTOFF 1024
 5 | 
 6 | #include <assert.h>
 7 | #include <math.h>
 8 | #include <omp.h>
 9 | #include <stdlib.h>
10 | #include <string.h>
11 | 
12 | #include <algorithm>
13 | #include <iostream>
14 | #include <vector>
15 | using namespace std;
16 | 
17 | // add your codes begin
#define maxbit 256
int thrd_num = 10;  // thread count requested for the parallel scan loops
int bitnum = 256;   // radix: number of buckets per pass (one byte)
// #define SIZE 1000
// Bit offset of the digit handled by each pass.
int bits[6] = {0, 8, 16, 24, 32, 40};
// int bits[12] = {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40};

// Returns the radix digit of x selected by pass index `bit`: x is shifted
// right by bits[bit] and masked with bitnum - 1.
inline int get_bit(int x, int bit) {
    const int shifted = x >> bits[bit];
    const int mask = bitnum - 1;
    return shifted & mask;
}
26 | 
27 | void rscan(int* data, int size) {
28 |     if (size == 1) return;
29 |     int twoSum[size / 2];
30 | #pragma omp parallel for num_threads(thrd_num)
31 |     for (int i = 0; i < size / 2; i++) {
32 |         twoSum[i] = data[i * 2] + data[2 * i + 1];
33 |     }
34 |     rscan(twoSum, size / 2);
35 | #pragma omp parallel for num_threads(thrd_num)
36 |     for (int i = 1; i < size; i += 2) {
37 |         data[i] = twoSum[i / 2];
38 |         if (i + 1 < size) {
39 |             data[i + 1] = twoSum[(i + 1) / 2 - 1] + data[i + 1];
40 |         }
41 |     }
42 | }
43 | 
44 | void bit_sort(vector<int>& data) {
45 |     int cut = int(log2(10 * SIZE)) / 8 + 1;
46 |     int cnt[bitnum];
47 |     // 开一个数组用于备份原数组
48 |     int* bucket = (int*)malloc(sizeof(int) * SIZE);
49 |     for (int i = 0; i < cut; i++) {
50 |         memset(cnt, 0, sizeof cnt);
51 |         // 向每个桶里放数据
52 |         // 不需要真的把数据放进桶里
53 |         // 记录一下每个桶里数据的数量
54 |         for (int j = 0; j < data.size(); j++) {
55 |             // bucket[get_bit(data[j], i)].push_back(data[j]);
56 |             cnt[get_bit(data[j], i)]++;
57 |         }
58 |         // 求前缀和，得知每个数字应该放在哪个位置
59 |         rscan(cnt, bitnum);
60 |         // 可能存在false sharing，这部分不再并行
61 |         // TODO: 一个compact 可以尝试一下并行
62 |         for (int j = SIZE - 1; j >= 0; j--) {
63 |             int bit = get_bit(data[j], i);
64 |             bucket[cnt[bit] - 1] = data[j];
65 |             cnt[bit]--;
66 |         }
67 |         memcpy(&data[0], bucket, sizeof(int) * SIZE);
68 |     }
69 | }
70 | 
71 | // add your codes end
72 | 
73 | int main() {
74 |     vector<int> data(SIZE);
75 | 
76 |     srand(SIZE);
77 |     for (int i = 0; i < SIZE; i++) data[i] = rand() % (SIZE * 10);
78 | 
79 |     double t = omp_get_wtime();
80 |     // add your codes begin
81 |     bit_sort(data);
82 |     // add your codes end
83 |     t = omp_get_wtime() - t;
84 |     printf("time %f %d\n", t, SIZE);
85 | 
86 |     for (int i = 0; i < SIZE - 1; i++) assert(data[i] <= data[i + 1]);
87 | }
88 | 


--------------------------------------------------------------------------------
/openmp/assign_sort/build/sort_sample:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/build/sort_sample


--------------------------------------------------------------------------------
/openmp/assign_sort/build/tp.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <math.h>
 3 | #include <omp.h>
 4 | #include <openacc.h>
 5 | #include <stdlib.h>
 6 | #include <string.h>
 7 | 
 8 | #include <algorithm>
 9 | #include <iostream>
10 | #include <vector>
11 | using namespace std;
12 | 
13 | // add your codes begin
#define CUTOFF 256
#define SIZE 100000000
// Bit offset of the digit handled by each radix pass.
static int p[6] = {0, 8, 16, 24, 32, 40};

// Returns digit number i of n: n is shifted right by p[i] and masked with
// CUTOFF - 1.
inline int get_digit(int n, int i) {
    const int shifted = n >> p[i];
    const int mask = CUTOFF - 1;
    return shifted & mask;
}
19 | 
20 | void SCAN(int* arr, int steps, int start_index) {
21 |     if (256 < start_index) {
22 |         return;
23 |     } else {
24 | #pragma acc data copy(arr [0:CUTOFF])
25 |         {
26 | #pragma acc parallel loop
27 |             for (int i = start_index + (steps / 2); i < CUTOFF; i += steps) {
28 |                 arr[i] += arr[i - steps / 2];
29 |             }
30 |         }
31 |         SCAN(arr, steps * 2, start_index + steps / 2);
32 | #pragma acc data copy(arr [0:CUTOFF])
33 |         {
34 | #pragma acc parallel loop
35 |             for (int i = start_index + steps; i < CUTOFF; i += steps) {
36 |                 arr[i] += arr[i - steps / 2];
37 |             }
38 |         }
39 |     }
40 | }
41 | 
42 | void quciksort(int* arr) {
43 |     // 开了一个大小为4*SIZE的桶
44 |     int* bucket = (int*)malloc(sizeof(int) * SIZE);
45 |     // 开一个256大小的数组
46 |     int count[CUTOFF];
47 |     // 计算可能用到的最大bit数
48 |     int cut = int(log2(10 * SIZE)) / 8 + 1;
49 |     // cut 大小应该是5
50 |     for (int i = 0; i < cut; i++) {
51 |         memset(count, 0, sizeof(int) * CUTOFF);
52 |         int sub_count[CUTOFF] = {0};
53 |         // 获得每一位的位数数量
54 |         // 准确来说应该是8位
55 |         // 统计每一批 的8位位数
56 |         for (int j = 0; j < SIZE; j += 1) {
57 |             sub_count[get_digit(arr[j], i)]++;
58 |         }
59 |         for (int k = 0; k < CUTOFF; k++) {
60 |             count[k] += sub_count[k];
61 |         }
62 |         SCAN(count, 2, 0);
63 |         for (int j = SIZE - 1; j >= 0; --j) {
64 |             int k = get_digit(arr[j], i);
65 |             bucket[count[k] - 1] = arr[j];
66 |             count[k]--;
67 |         }
68 |         memcpy(arr, bucket, sizeof(int) * SIZE);
69 |     }
70 |     free(bucket);
71 | }
72 | // add your codes end
73 | 
74 | int main() {
75 |     vector<int> data(SIZE);
76 | 
77 |     srand(SIZE);
78 |     for (int i = 0; i < SIZE; i++) data[i] = rand() % (SIZE * 10);
79 | 
80 |     double t = omp_get_wtime();
81 |     // add your codes begin
82 |     quciksort(&data[0]);
83 |     // add your codes end
84 |     t = omp_get_wtime() - t;
85 |     printf("time %f %d\n", t, SIZE);
86 | 
87 |     for (int i = 0; i < SIZE - 1; i++) assert(data[i] <= data[i + 1]);
88 | }


--------------------------------------------------------------------------------
/openmp/assign_sort/build/tp2.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <math.h>
 3 | #include <omp.h>
 4 | #include <openacc.h>
 5 | #include <stdlib.h>
 6 | #include <string.h>
 7 | 
 8 | #include <algorithm>
 9 | #include <iostream>
10 | #include <vector>
11 | 
12 | #define SIZE 1000000
13 | using namespace std;
14 | 
15 | 
// Sample sort: sorts arr[0..n) in ascending order using p buckets.
//
// arr: array to sort in place.
// n:   number of elements; n <= 1 leaves the array untouched.
// p:   number of buckets (and of independently sorted blocks); p <= 1 falls
//      back to a plain std::sort.
//
// Side effect: reseeds the C random number generator with the current time.
void sampleSort(int *arr, int n, int p) {
    if (arr == NULL || n <= 1) return;
    if (p <= 1) {
        std::sort(arr, arr + n);
        return;
    }

    // Step 1: draw 10 * p samples per bucket. This stays serial because
    // rand() keeps hidden global state and is not safe to call from several
    // threads at once; the sample is tiny compared with n.
    const int s = 10 * p;
    const int sample_count = s * p;
    std::vector<int> sample(sample_count);
    srand(time(NULL));
    for (int i = 0; i < sample_count; i++) {
        sample[i] = arr[rand() % n];
    }
    std::sort(sample.begin(), sample.end());

    // Step 2: every s-th sample becomes a pivot (p - 1 pivots, ascending).
    std::vector<int> pivots(p - 1);
    for (int i = 0; i < p - 1; i++) {
        pivots[i] = sample[(i + 1) * s];
    }

    // Step 3a: count the elements of each bucket. An element belongs to the
    // first bucket whose pivot is strictly greater than it, or to the last
    // bucket when no pivot is.
    std::vector<int> counts(p, 0);
    int *count_of = &counts[0];
#pragma omp parallel for
    for (int i = 0; i < n; i++) {
        const int rank = static_cast<int>(
            std::upper_bound(pivots.begin(), pivots.end(), arr[i]) -
            pivots.begin());
#pragma omp atomic
        count_of[rank]++;
    }

    // Step 3b: bucket start positions (exclusive prefix sum of the counts).
    // They are kept unchanged; the scatter advances a separate copy, because
    // the sort below needs the original starts.
    std::vector<int> starts(p, 0);
    for (int i = 1; i < p; i++) {
        starts[i] = starts[i - 1] + counts[i - 1];
    }
    std::vector<int> cursor(starts);
    int *next_slot = &cursor[0];

    // Step 3c: scatter into the buckets. The slot is reserved with an atomic
    // capture so that two threads never receive the same index.
    std::vector<int> tmp(n);
    int *out = &tmp[0];
#pragma omp parallel for
    for (int i = 0; i < n; i++) {
        const int rank = static_cast<int>(
            std::upper_bound(pivots.begin(), pivots.end(), arr[i]) -
            pivots.begin());
        int idx;
#pragma omp atomic capture
        idx = next_slot[rank]++;
        out[idx] = arr[i];
    }

    // Step 4: sort every bucket independently.
#pragma omp parallel for
    for (int b = 0; b < p; b++) {
        std::sort(out + starts[b], out + starts[b] + counts[b]);
    }

    // Step 5: the buckets are contiguous and already ordered relative to each
    // other, so their concatenation is the sorted array.
    std::copy(tmp.begin(), tmp.end(), arr);
}
84 | 
85 | // Benchmark driver: fills SIZE pseudo-random ints, times sampleSort, and
86 | // asserts that the result is in non-decreasing order.
87 | int main() {
88 |     vector<int> data(SIZE);
89 |     srand(SIZE);  // the seed is SIZE itself, so a given SIZE is repeatable
90 |     for (int k = 0; k < SIZE; k++) data[k] = rand() % (SIZE * 10);
91 | 
92 |     const double started = omp_get_wtime();
93 |     // add your codes begin
94 |     sampleSort(&data[0], SIZE, 10);
95 |     // add your codes end
96 |     const double t = omp_get_wtime() - started;
97 |     printf("time %f %d\n", t, SIZE);
98 |     for (int k = 1; k < SIZE; k++) assert(data[k - 1] <= data[k]);
99 | }


--------------------------------------------------------------------------------
/openmp/assign_sort/build/tp3.cpp:
--------------------------------------------------------------------------------
 1 | #include <omp.h>
 2 | 
 3 | #include <algorithm>
 4 | #include <cstdlib>
 5 | #include <ctime>
 6 | #include <iostream>
 7 | #include <vector>
 8 | 
 9 | // Sorts arr[0..n) ascending with a bucket-based parallel sample sort.
10 | //   arr: array sorted in place; n: number of elements
11 | //   p:   number of buckets (p - 1 pivots are picked from the samples)
12 | //   s:   oversampling factor; p * s random elements are sampled
13 | // Degenerate inputs (n <= 1, p < 2 or s < 1) fall back to std::sort.
14 | void sampleSort(double *arr, int n, int p, int s) {
15 |     if (arr == NULL || n <= 1) return;
16 |     if (p < 2 || s < 1) {
17 |         std::sort(arr, arr + n);
18 |         return;
19 |     }
20 | 
21 |     // Step 1: Choose samples. Done serially: rand() keeps hidden global
22 |     // state and is not required to be safe to call from several threads.
23 |     std::vector<double> sample(p * s);
24 |     srand(time(NULL));
25 |     for (int i = 0; i < p * s; i++) sample[i] = arr[rand() % n];
26 |     std::sort(sample.begin(), sample.end());
27 | 
28 |     // Step 2: Choose pivots (every s-th sample, so pivots are sorted)
29 |     std::vector<double> pivots(p - 1);
30 |     for (int i = 0; i < p - 1; i++) {
31 |         pivots[i] = sample[(i + 1) * s];
32 |     }
33 | 
34 |     // Step 3: Partition data. The bucket of every element is computed once
35 |     // and cached in ranks[] so the scatter pass does not repeat the search.
36 |     std::vector<int> ranks(n);
37 |     std::vector<int> counts(p, 0);
38 |     int *count = &counts[0];
39 | #pragma omp parallel for
40 |     for (int i = 0; i < n; i++) {
41 |         // Index of the first pivot greater than arr[i], or p - 1 if none.
42 |         int rank = std::upper_bound(pivots.begin(), pivots.end(), arr[i]) -
43 |                    pivots.begin();
44 |         ranks[i] = rank;
45 | #pragma omp atomic
46 |         count[rank]++;
47 |     }
48 |     // offsets[b] is the first slot of bucket b; offsets[p] == n.
49 |     std::vector<int> offsets(p + 1, 0);
50 |     for (int b = 0; b < p; b++) {
51 |         offsets[b + 1] = offsets[b] + counts[b];
52 |     }
53 |     // Separate write cursors keep offsets[] intact for the later steps.
54 |     std::vector<int> cursors(offsets.begin(), offsets.begin() + p);
55 |     int *cursor = &cursors[0];
56 |     std::vector<double> tmp(n);
57 | #pragma omp parallel for
58 |     for (int i = 0; i < n; i++) {
59 |         int idx;
60 |         // Claim a slot atomically; a plain ++ here is a data race.
61 | #pragma omp atomic capture
62 |         idx = cursor[ranks[i]]++;
63 |         tmp[idx] = arr[i];
64 |     }
65 | 
66 | // Step 4: Sort each block (buckets occupy disjoint ranges of tmp)
67 | #pragma omp parallel for schedule(dynamic)
68 |     for (int b = 0; b < p; b++) {
69 |         std::sort(tmp.begin() + offsets[b], tmp.begin() + offsets[b + 1]);
70 |     }
71 | 
72 |     // Step 5: Buckets are already in pivot order, so copying tmp back
73 |     // yields the sorted array; no final full sort is needed.
74 |     std::copy(tmp.begin(), tmp.end(), arr);
75 | }
76 | 
77 | // Times sampleSort on n random values in [0, 1000); exits with status 1
78 | // if the result is not in non-decreasing order.
79 | int main() {
80 |     const int n = 100000;  // Array size
81 |     const int p = 4;       // Number of buckets
82 |     std::vector<double> arr(n);
83 |     srand(time(NULL));
84 |     for (int i = 0; i < n; i++) arr[i] = rand() % 1000;
85 |     const double start = omp_get_wtime();
86 |     sampleSort(&arr[0], n, p, n / (4 * p));
87 |     const double end = omp_get_wtime();
88 |     std::cout << "Time: " << end - start << std::endl;
89 |     for (int i = 1; i < n; i++) {
90 |         if (arr[i - 1] > arr[i]) return 1;  // ordering violated
91 |     }
92 |     return 0;
93 | }
94 | 


--------------------------------------------------------------------------------
/openmp/assign_sort/readme.txt:
--------------------------------------------------------------------------------
 1 | - Student name: 宋家庆
 2 | - ID number: 202000130061
 3 | 
 4 | - Implement your sort algorithm
 5 |   - Version 1: radix sort program
 6 |   - Version 2: sample sort program
 7 | - Compile and run your programs multiple times
 8 | - Describe your observations
 9 | sort_radix：
10 | 多次运行该算法，设置了不同的参数尝试，以每一位bit一组从低到高排序，耗时约为50s
11 | 以每四位bit为一组从低到高排序，耗时约为15s
12 | 以每八位bit为一组从低到高排序，耗时约为8-10s
13 | sort_sample:
14 | 可以调节的参数有采样的数量和线程数量，在尝试过程中找到了几个效率较高的参数组合
15 | 线程数为30，采样数为30*35或30*500时，耗时在6.4-7s之间
16 | 线程数为50，采样数为50*500时，耗时在4.6-5s之间
17 | 
18 | - Explain why this happens
19 | 针对sort_radix:
20 | 每一组中bit位数越多，最外层循环的次数就越少，且桶的数量会越多，可以使用更多的线程并行，效率越高。
21 | 针对sort_sample：
22 | 一般来说线程数量越多，并行度越高，效率会越高，到达某一临界值时，提升线程数量对效率的提升会变小甚至无提升，
23 | 要协调好线程开销与并发效率的关系。
24 | 该方法里有一个可调节的参数采样数量，该值越大，越能逼近均匀分割，但同时会提高该部分排序耗时。
25 | 可以多次尝试找到一个合适取值，运气好的话，较小的采样量也能达到不错的效果。
26 | 


--------------------------------------------------------------------------------
/openmp/assign_sort/run.sh:
--------------------------------------------------------------------------------
1 | # Build and time one sort variant (sort_<version>.cpp); SIZE = element count.
2 | version=radix
3 | size=100000000
4 | 
5 | g++ -o "sort_$version" -fopenmp -DSIZE="$size" "sort_$version.cpp" && timeout 60s time "./sort_$version"
6 | 
7 | 


--------------------------------------------------------------------------------
/openmp/assign_sort/sort_radix:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/sort_radix


--------------------------------------------------------------------------------
/openmp/assign_sort/sort_radix.cpp:
--------------------------------------------------------------------------------
  1 | // student name: 宋家庆
  2 | // id number: 202000130061
  3 | 
  4 | #define CUTOFF 1024
  5 | 
  6 | #include <assert.h>
  7 | #include <math.h>
  8 | #include <omp.h>
  9 | #include <stdlib.h>
 10 | #include <string.h>
 11 | 
 12 | #include <algorithm>
 13 | #include <iostream>
 14 | #include <vector>
 15 | using namespace std;
 16 | 
 17 | // add your codes begin
 18 | #define maxbit 256  // not referenced anywhere else in this file
 19 | int thrd_num = 10;  // thread count passed to num_threads() in rscan
 20 | int bitnum = 256;   // buckets per pass; get_bit masks with bitnum - 1
 21 | // #define SIZE 1000
 22 | int bits[6] = {0, 8, 16, 24, 32, 40};  // right-shift used by get_bit for pass 0..5
 23 | // int bits[12] = {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40};
 24 | // Digit of x for pass `bit`: x shifted right by bits[bit], masked with bitnum - 1.
 25 | inline int get_bit(int x, int bit) { const int mask = bitnum - 1; return (x >> bits[bit]) & mask; }
 26 | 
 27 | // In-place inclusive prefix sum of data[0..size) by recursive pair reduction.
 28 | // Arrays with fewer than two elements are returned unchanged.
 29 | void rscan(int* data, int size) {
 30 |     if (size <= 1) return;
 31 |     const int half = size / 2;
 32 |     vector<int> twoSum(half);  // twoSum[k] = data[2k] + data[2k + 1]
 33 | #pragma omp parallel for num_threads(thrd_num)
 34 |     for (int k = 0; k < half; k++) twoSum[k] = data[2 * k] + data[2 * k + 1];
 35 |     rscan(&twoSum[0], half);  // now twoSum[k] = sum of data[0..2k + 1]
 36 | #pragma omp parallel for num_threads(thrd_num)
 37 |     for (int i = 1; i < size; i += 2) {
 38 |         data[i] = twoSum[i / 2];  // odd slots take the pair totals
 39 |         // the even slot after it adds its own value to that running total
 40 |         if (i + 1 < size) data[i + 1] += twoSum[i / 2];
 41 |     }
 42 | }
 43 | 
 44 | // LSD radix sort of `data`, 8 bits per pass (digits come from get_bit).
 45 | // Each pass counts the digits, prefix-sums the counts with rscan, then
 46 | // scatters the elements backwards into a scratch buffer so that elements
 47 | // with equal digits keep their relative order.
 48 | //   data: vector sorted in place, ascending. Values are assumed to lie
 49 | //         in [0, 10 * SIZE), which is what main() generates.
 50 | void bit_sort(vector<int>& data) {
 51 |     const int n = (int)data.size();
 52 |     if (n == 0) return;  // nothing to sort; keeps &data[0] below valid
 53 | 
 54 |     // Number of 8-bit passes needed to cover values below 10 * SIZE.
 55 |     int cut = int(log2(10 * SIZE)) / 8 + 1;
 56 |     // Per-digit counters, held in a vector rather than a variable-length
 57 |     // array (VLAs are a compiler extension in C++).
 58 |     vector<int> cnt(bitnum);
 59 |     // Scratch array that receives the reordered elements of every pass;
 60 |     // owned by a vector so the memory is released on return.
 61 |     vector<int> bucket(n);
 62 | 
 63 |     for (int i = 0; i < cut; i++) {
 64 |         std::fill(cnt.begin(), cnt.end(), 0);
 65 |         // Elements are not physically placed into buckets here; only the
 66 |         // number of elements per bucket is recorded.
 67 |         for (int j = 0; j < n; j++) {
 68 |             cnt[get_bit(data[j], i)]++;
 69 |         }
 70 | 
 71 |         // Prefix sums tell where each digit's run ends in the output:
 72 |         // afterwards cnt[d] is one past the last slot for digit d.
 73 |         rscan(&cnt[0], bitnum);
 74 | 
 75 |         // The scatter below stays serial. False sharing is possible, and
 76 |         // a parallel compact was tried (one thread per bucket, each
 77 |         // collecting its elements into a private vector and copying them
 78 |         // back inside a critical section) but it was slower because of
 79 |         // the extra array copies.
 80 |         // TODO: a parallel compact could be retried; it looked infeasible
 81 |         // because of the number of threads involved.
 82 | 
 83 |         // Walk backwards so elements with equal digits keep their order.
 84 |         for (int j = n - 1; j >= 0; j--) {
 85 |             int bit = get_bit(data[j], i);
 86 |             bucket[--cnt[bit]] = data[j];
 87 |         }
 88 |         // Copy the pass result back so the next pass reads it from data.
 89 |         memcpy(&data[0], &bucket[0], sizeof(int) * n);
 90 |     }
 91 | }
 92 | 
 93 | // add your codes end
 94 | 
 95 | // Benchmark driver: fills SIZE pseudo-random ints, times bit_sort, and
 96 | // asserts that the result is in non-decreasing order.
 97 | int main() {
 98 |     vector<int> data(SIZE);
 99 |     srand(SIZE);  // the seed is SIZE itself, so a given SIZE is repeatable
100 |     for (int k = 0; k < SIZE; k++) data[k] = rand() % (SIZE * 10);
101 | 
102 |     const double started = omp_get_wtime();
103 |     // add your codes begin
104 |     bit_sort(data);
105 |     // add your codes end
106 |     const double t = omp_get_wtime() - started;
107 |     printf("time %f %d\n", t, SIZE);
108 |     for (int k = 1; k < SIZE; k++) assert(data[k - 1] <= data[k]);
109 | }
110 | 


--------------------------------------------------------------------------------
/openmp/assign_sort/sort_radix_solution:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/sort_radix_solution


--------------------------------------------------------------------------------
/openmp/assign_sort/sort_sample:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/sort_sample


--------------------------------------------------------------------------------
/openmp/assign_sort/sort_sample_solution:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/assign_sort/sort_sample_solution


--------------------------------------------------------------------------------
/openmp/exam_knn/ans.txt:
--------------------------------------------------------------------------------
1 | time 5.126451 20000
2 | checksum 8.77751328e+04 8.78574766e+04 8.77312266e+04 8.78405156e+04 8.77315234e+04 8.76717109e+04 8.78353750e+04 8.77646250e+04 8.78045000e+04 8.78303125e+04 8.76919609e+04
3 | 


--------------------------------------------------------------------------------
/openmp/exam_knn/build/build/knn copy 2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/build/build/knn copy 2


--------------------------------------------------------------------------------
/openmp/exam_knn/build/build/knn copy 4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/build/build/knn copy 4


--------------------------------------------------------------------------------
/openmp/exam_knn/build/knn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/build/knn


--------------------------------------------------------------------------------
/openmp/exam_knn/build/test:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/build/test


--------------------------------------------------------------------------------
/openmp/exam_knn/exam_knn.2023-05-10.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/exam_knn.2023-05-10.tgz


--------------------------------------------------------------------------------
/openmp/exam_knn/knn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/knn


--------------------------------------------------------------------------------
/openmp/exam_knn/knn_0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/exam_knn/knn_0


--------------------------------------------------------------------------------
/openmp/exam_knn/out.txt:
--------------------------------------------------------------------------------
1 | time 1.518604 20000
2 | checksum 8.77751328e+04 8.78574766e+04 8.77312266e+04 8.78405156e+04 8.77315234e+04 8.76717109e+04 8.78353750e+04 8.77646250e+04 8.78045000e+04 8.78303125e+04 8.76919609e+04
3 | 


--------------------------------------------------------------------------------
/openmp/exam_knn/readme.txt:
--------------------------------------------------------------------------------
1 | - Student name: 宋家庆
2 | - ID number: 202000130061
3 | 
4 | - Design a parallel algorithm to find the K-nearest neighbors (e.g., K=10) of each element given the coordinates of N points (e.g., N=20000) in D-dimensional space (e.g., D=64).
5 | - Implement the algorithm with OpenMP and submit (only) your program. Please see the provided source codes as a starting point.
6 | - Your mark depends on both the correctness and the running time of your program.
7 | 
8 | 


--------------------------------------------------------------------------------
/openmp/exam_knn/run.sh:
--------------------------------------------------------------------------------
1 | size=20000
2 | # Build knn.cpp with SIZE=$size, then run it under a 60 s time limit.
3 | g++ -o knn_0 -fopenmp -DSIZE="$size" knn.cpp && timeout 60s time ./knn_0
4 | 
5 | 


--------------------------------------------------------------------------------
/openmp/exam_knn/test.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | #define _mBeginASM 	__asm__ __volatile__ (
 4 | #define _mEndASM	);
 5 | 
 6 | int main(int argc, char *argv[]) {
 7 |     int a = 44, b = 33, c;
 8 |     // Adds b to a with one x86 addl instruction and prints the result.
 9 |     _mBeginASM "addl		%%ebx,%%eax"
10 |         : "=a"(c) /* output: after the asm body runs, the value in eax is stored into c */
11 |         : "b"(b),
12 |           "a"(a) /* inputs: before the asm body runs, a is loaded into eax and b into
13 |                   ebx */
14 |           _mEndASM
15 | 
16 |               printf("%d\n", c);
17 |     return 0;
18 | }


--------------------------------------------------------------------------------
/openmp/final_exam/build/circle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/final_exam/build/circle


--------------------------------------------------------------------------------
/openmp/final_exam/circle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/final_exam/circle


--------------------------------------------------------------------------------
/openmp/final_exam/run.sh:
--------------------------------------------------------------------------------
1 | size=1000
2 | # Build circle.cpp with SIZE=$size, then run it under a 60 s time limit.
3 | g++ -o circle -fopenmp -DSIZE="$size" circle.cpp && timeout 60s time ./circle
4 | 
5 | 


--------------------------------------------------------------------------------
/openmp/lab_knn/build/build/knn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/build/build/knn


--------------------------------------------------------------------------------
/openmp/lab_knn/build/knn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/build/knn


--------------------------------------------------------------------------------
/openmp/lab_knn/build/knn.cpp:
--------------------------------------------------------------------------------
  1 | #define DIM 64
  2 | #define KNN 10
  3 | #define NBITS 6
  4 | 
  5 | #include <assert.h>
  6 | #include <math.h>
  7 | #include <omp.h>
  8 | #include <stdlib.h>
  9 | 
 10 | #include <algorithm>
 11 | #include <iostream>
 12 | #include <map>
 13 | #include <set>
 14 | #include <vector>
 15 | using namespace std;
 16 | 
 17 | // add your codes begin
 18 | #define SIZE 20000  // number of points; NOTE(review): unguarded, so it collides with a -DSIZE=... build flag
 19 | #define SEED 1      // srand() seed; NOTE(review): likewise unguarded against -DSEED=...
 20 | // Squared Euclidean distance over DIM coordinates (no square root is taken).
 21 | float dis(float *vec1, float *vec2) {
 22 |     float total = 0;
 23 |     for (int d = 0; d != DIM; ++d) {
 24 |         const float delta = vec1[d] - vec2[d];
 25 |         total += delta * delta;
 26 |     }
 27 |     return total;
 28 | }
 29 | 
 30 | // Comparator: true when p1's float component is strictly greater than p2's,
 31 | // i.e. it orders pairs by descending .second.
 32 | bool cmp(pair<int, float> p1, pair<int, float> p2) { return p2.second < p1.second; }
 33 | // add your codes end
 34 | 
 35 | // Approximate KNN with random-hyperplane hashing: SIZE random points are
 36 | // hashed into buckets, the KNN + 1 smallest distances inside each point's
 37 | // own bucket are stored, and the elapsed time and a checksum are printed.
 38 | int main() {
 39 |     srand(SEED);
 40 |     // SIZE points, each with DIM coordinates in [-1, 1]
 41 |     vector<vector<float>> coord(SIZE);
 42 |     vector<vector<float>> knn(SIZE);
 43 |     for (int i = 0; i < SIZE; i++) {
 44 |         vector<float> c(DIM);
 45 |         for (int j = 0; j < DIM; j++)
 46 |             c[j] = float(rand()) / float(RAND_MAX) * 2 - 1;
 47 |         coord[i] = c;
 48 |     }
 49 | 
 50 |     srand(SEED);
 51 |     // Initialise NBITS random hyperplanes (the generator is reseeded, so
 52 |     // they are drawn from the start of the same rand() sequence)
 53 |     vector<vector<float>> rnd(NBITS);
 54 |     for (int i = 0; i < NBITS; i++) {
 55 |         vector<float> r(DIM);
 56 |         for (int j = 0; j < DIM; j++)
 57 |             r[j] = float(rand()) / float(RAND_MAX) * 2 - 1;
 58 |         rnd[i] = r;
 59 |     }
 60 | 
 61 |     // Only the section between the two markers below is timed.
 62 |     double t = omp_get_wtime();
 63 |     // add your codes begin
 64 |     // Hash every point: character k of its key is '1' when the dot product
 65 |     // with hyperplane k is positive and '0' otherwise. This reduces the
 66 |     // N x DIM input to N keys of NBITS characters each.
 67 |     vector<string> sbit(SIZE);
 68 |     map<string, vector<int>> buckets;
 69 | #pragma omp parallel for
 70 |     for (int i = 0; i < SIZE; i++) {
 71 |         string key(NBITS, '0');
 72 |         for (int k = 0; k < NBITS; k++) {
 73 |             float res = 0;
 74 |             for (int j = 0; j < DIM; j++) {
 75 |                 res += (coord[i][j] * rnd[k][j]);
 76 |             }
 77 |             if (res > 0) key[k] = '1';
 78 |         }
 79 |         sbit[i] = key;
 80 |     }
 81 | 
 82 |     // Group point indices by key. Kept serial: inserting into a shared
 83 |     // std::map from several threads would be a data race.
 84 |     for (int i = 0; i < SIZE; i++) {
 85 |         buckets[sbit[i]].push_back(i);
 86 |     }
 87 | 
 88 |     // Search phase. Only the point's own bucket is examined; multi-probe
 89 |     // (also scanning buckets whose keys are at a small Hamming distance)
 90 |     // is not performed.
 91 | #pragma omp parallel for
 92 |     for (int i = 0; i < SIZE; i++) {
 93 |         // find() is a read-only lookup and therefore safe to call from
 94 |         // many threads at once; map::operator[] is not, since it may
 95 |         // insert. Every key in sbit[] was inserted by the loop above.
 96 |         const vector<int> &bucket = buckets.find(sbit[i])->second;
 97 | 
 98 |         // Squared distances from point i to every point in its bucket.
 99 |         const int num = bucket.size();
100 |         vector<float> dist(num);  // released at the end of each iteration
101 |         for (int k = 0; k < num; k++) {
102 |             dist[k] = dis(&coord[i][0], &coord[bucket[k]][0]);
103 |         }
104 | 
105 |         // Keep the KNN + 1 smallest values (point i itself is in the
106 |         // bucket at distance 0), or all of them if the bucket is smaller,
107 |         // so partial_sort never receives a middle iterator past the end.
108 |         const int keep = min(num, KNN + 1);
109 |         partial_sort(dist.begin(), dist.begin() + keep, dist.end());
110 |         knn[i].reserve(keep);
111 |         for (int j = 0; j < keep; j++) {
112 |             knn[i].push_back(sqrt(dist[j]));
113 |         }
114 |     }
115 | 
116 |     // add your codes end
117 |     t = omp_get_wtime() - t;
118 |     printf("time %f %d\n", t, SIZE);
119 | 
120 |     // Checksum: the stored distances of point i are accumulated into
121 |     // slot i % size, and the slots are printed on one line.
122 |     const int size = 11;
123 |     float chksum[size];
124 |     for (int i = 0; i < size; i++) chksum[i] = 0.0;
125 |     for (int i = 0; i < SIZE; i++) {
126 |         for (int j = 0; j < knn[i].size(); j++) {
127 |             chksum[i % size] += knn[i][j];
128 |         }
129 |     }
130 |     printf("checksum");
131 |     for (int i = 0; i < size; i++) printf(" %.8e", chksum[i]);
132 |     printf("\n");
133 | }
134 | 
135 | 


--------------------------------------------------------------------------------
/openmp/lab_knn/knn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/knn


--------------------------------------------------------------------------------
/openmp/lab_knn/knn_apx1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/knn_apx1


--------------------------------------------------------------------------------
/openmp/lab_knn/knn_apx2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/knn_apx2


--------------------------------------------------------------------------------
/openmp/lab_knn/knn_apx3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/knn_apx3


--------------------------------------------------------------------------------
/openmp/lab_knn/knn_apx4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/knn_apx4


--------------------------------------------------------------------------------
/openmp/lab_knn/lab_knn.2023-05-17.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_knn/lab_knn.2023-05-17.tgz


--------------------------------------------------------------------------------
/openmp/lab_knn/readme.txt:
--------------------------------------------------------------------------------
1 | - Student name: 宋家庆
2 | - ID number: 202000130061
3 | 
4 | - Design a parallel algorithm to find the **approximate** K-nearest neighbors (e.g., K=10) of each element given the coordinates of N points (e.g., N=20000) in D-dimensional space (e.g., D=64).
5 | - Implement the algorithm with OpenMP and submit (only) your program. Please see the provided source codes as a starting point.
6 | - Reference: https://www.pinecone.io/learn/locality-sensitive-hashing-random-projection/
7 | 
8 | 


--------------------------------------------------------------------------------
/openmp/lab_knn/run.sh:
--------------------------------------------------------------------------------
1 | size=20000
2 | seed=1
3 | # Build knn.cpp with SIZE=$size and SEED=$seed; run under a 60 s limit.
4 | g++ -o knn -fopenmp -DSIZE="$size" -DSEED="$seed" knn.cpp && timeout 60s time ./knn
5 | 
6 | 


--------------------------------------------------------------------------------
/openmp/lab_par_for/DataSharing/Firstprivate/fp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/DataSharing/Firstprivate/fp


--------------------------------------------------------------------------------
/openmp/lab_par_for/DataSharing/Firstprivate/fp.cpp:
--------------------------------------------------------------------------------
 1 | #include <omp.h>
 2 | #include <stdio.h>
 3 | #define NUM_THREADS 2
 4 | static long num_steps = 100000;
 5 | double step;
 6 | const int MAX = 10;
 7 | int A[MAX];
 8 | 
 9 | // firstprivate demo: each thread starts from its own copy of incr (0) and
10 | // bumps it on the even indices it executes, so A depends on the schedule.
11 | int main() {
12 |   int incr = 0;
13 |   #pragma omp parallel for firstprivate(incr)
14 |   for (int i = 0; i < MAX; i++) {  // A has MAX elements: stop before MAX
15 |     if ((i % 2) == 0) incr++;
16 |     A[i] = incr;
17 |   }
18 |   for (int i = 0; i < MAX; i++) printf("%d", A[i]);
19 | 
20 |   return 0;
21 | }


--------------------------------------------------------------------------------
/openmp/lab_par_for/DataSharing/Firstprivate/makefile:
--------------------------------------------------------------------------------
1 | .PHONY: hello clean
2 | hello: fp  # alias kept for existing callers; the real target is the binary
3 | fp: fp.cpp
4 | 	g++ -fopenmp fp.cpp -o fp
5 | clean: ; rm -f fp


--------------------------------------------------------------------------------
/openmp/lab_par_for/DataSharing/Lastprivate/fp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/DataSharing/Lastprivate/fp


--------------------------------------------------------------------------------
/openmp/lab_par_for/DataSharing/Lastprivate/fp.cpp:
--------------------------------------------------------------------------------
 1 | #include <omp.h>
 2 | #include <stdio.h>
 3 | #define NUM_THREADS 2
 4 | static long num_steps = 100000;
 5 | double step;
 6 | const int MAX = 10;
 7 | int A[MAX];
 8 | 
 9 | // Each thread starts from its own copy of incr (firstprivate).
10 | // NOTE(review): directory is named Lastprivate but no lastprivate clause is used - confirm intent.
11 | int main() {
12 |   int incr = 0;
13 |   #pragma omp parallel for firstprivate(incr)
14 |   for (int i = 0; i < MAX; i++) {  // A has MAX elements: stop before MAX
15 |     if ((i % 2) == 0) incr++;
16 |     A[i] = incr;
17 |   }
18 |   for (int i = 0; i < MAX; i++) printf("%d", A[i]);
19 | 
20 |   return 0;
21 | }


--------------------------------------------------------------------------------
/openmp/lab_par_for/DataSharing/Lastprivate/makefile:
--------------------------------------------------------------------------------
1 | .PHONY: hello clean
2 | hello: fp  # alias kept for existing callers; the real target is the binary
3 | fp: fp.cpp
4 | 	g++ -fopenmp fp.cpp -o fp
5 | clean: ; rm -f fp


--------------------------------------------------------------------------------
/openmp/lab_par_for/Intro05/hello:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/Intro05/hello


--------------------------------------------------------------------------------
/openmp/lab_par_for/Intro05/hello.cpp:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | // Placeholder per-thread work (the call below had no definition): stores the id in A[ID].
 4 | static void pooh(int ID, double *A) { A[ID] = ID; }
 5 | // Runs pooh once on each thread of a parallel region (4 threads requested).
 6 | int main(){
 7 |     double A[1000];
 8 |     omp_set_num_threads(4);
 9 |     #pragma omp parallel
10 |     { pooh(omp_get_thread_num(), A); }
11 |     printf("all down\n");
12 |     return 0;
13 | }


--------------------------------------------------------------------------------
/openmp/lab_par_for/Intro05/makefile:
--------------------------------------------------------------------------------
1 | .PHONY: clean
2 | hello: hello.cpp
3 | 	g++ -fopenmp hello.cpp -o hello
4 | clean:
5 | 	rm -f hello


--------------------------------------------------------------------------------
/openmp/lab_par_for/PI/makefile:
--------------------------------------------------------------------------------
1 | .PHONY: hello clean
2 | hello: pi  # alias kept for existing callers; the real target is the binary
3 | pi: pi.cpp
4 | 	g++ -fopenmp pi.cpp -o pi
5 | clean: ; rm -f pi


--------------------------------------------------------------------------------
/openmp/lab_par_for/PI/pi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PI/pi


--------------------------------------------------------------------------------
/openmp/lab_par_for/PI/pi.cpp:
--------------------------------------------------------------------------------
 1 | #include <omp.h>
 2 | #include <stdio.h>
 3 | #define NUM_THREADS 2
 4 | static long num_steps = 100000;
 5 | double step;
 6 | 
 7 | // Serial baseline: midpoint-rule integration of 4 / (1 + x^2) over [0, 1],
 8 | // which approximates pi; the result is printed with "%lf".
 9 | int main() {
10 |   step = 1.0 / (double)num_steps;
11 |   double sum = 0.0;
12 |   for (int k = 0; k < num_steps; ++k) {
13 |     const double x = (k + 0.5) * step;  // midpoint of the k-th interval
14 |     sum += 4.0 / (1.0 + x * x);
15 |   }
16 |   printf("%lf", step * sum);
17 |   return 0;
18 | }


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv1/makefile:
--------------------------------------------------------------------------------
1 | .PHONY: hello clean
2 | hello: pi  # alias kept for existing callers; the real target is the binary
3 | pi: pi.cpp
4 | 	g++ -fopenmp pi.cpp -o pi
5 | clean: ; rm -f pi


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv1/pi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv1/pi


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv1/pi.cpp:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | #define NUM_THREADS 2
 4 | static long num_steps = 100000;
 5 | double step;
 6 | 
 7 | // SPMD pi, version 1: each thread accumulates a partial sum in its own
 8 | // slot of sum[]; the slots are combined serially after the parallel region.
 9 | int main(){
10 |     int nthreads;
11 |     double sum[NUM_THREADS];
12 |     step = 1.0/(double)num_steps;
13 |     omp_set_num_threads(NUM_THREADS);
14 |     #pragma omp parallel
15 |     {
16 |         const int id = omp_get_thread_num();
17 |         const int nthrds = omp_get_num_threads();
18 |         // The runtime may grant fewer threads than requested, so thread 0
19 |         // publishes the actual team size for the reduction below.
20 |         if(id==0) nthreads = nthrds;
21 |         sum[id] = 0.0;
22 |         // Cyclic distribution: thread id handles steps id, id+nthrds, ...
23 |         for (int k = id; k < num_steps; k += nthrds) {
24 |             const double x = (k+0.5)*step;
25 |             sum[id] += 4.0/(1.0+x*x);
26 |         }
27 |     }
28 |     double pi = 0.0;
29 |     for (int t = 0; t < nthreads; t++) {
30 |         pi += sum[t]*step;
31 |     }
32 |     printf("%lf",pi);
33 |     return 0;
34 | }


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv2/makefile:
--------------------------------------------------------------------------------
1 | .PHONY: hello clean
2 | hello: pi  # alias kept for existing callers; the real target is the binary
3 | pi: pi.cpp
4 | 	g++ -fopenmp pi.cpp -o pi
5 | clean: ; rm -f pi


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv2/pi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv2/pi


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv2/pi.cpp:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | #define NUM_THREADS 2
 4 | static long num_steps = 1000000;
 5 | double step;
 6 | 
 7 | // SPMD pi, version 2: each thread keeps a private partial sum and adds it
 8 | // to the shared result once, inside a critical section.
 9 | int main() {
10 |     double pi = 0.0;
11 |     step = 1.0/(double)num_steps;
12 |     omp_set_num_threads(NUM_THREADS);
13 |     #pragma omp parallel
14 |     {
15 |         int i,id,nthrds;
16 |         double x,sum;
17 |         id = omp_get_thread_num();       // id of this thread in the team
18 |         nthrds = omp_get_num_threads();  // threads actually in the team
19 |         // Stride by the thread-local nthrds. A shared counter assigned
20 |         // only by thread 0 could be read by other threads before it is
21 |         // written, giving them an indeterminate stride (a data race).
22 |         for (i = id,sum = 0.0; i < num_steps; i+=nthrds) {
23 |             x = (i+0.5)*step;
24 |             sum += 4.0/(1.0+x*x);
25 |         }
26 |         // The critical section lets only one thread at a time update pi.
27 |         // Compared with the previous version no per-thread array is
28 |         // needed, at the price of a lock, taken once per thread.
29 |         #pragma omp critical
30 |         {
31 |             pi += sum*step;
32 |         }
33 |     }
34 |     printf("%lf",pi);
35 |     return 0;
36 | }


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv3/makefile:
--------------------------------------------------------------------------------
1 | hello: pi.cpp
2 | 	g++ -fopenmp pi.cpp -o pi
3 | clean:
4 | 	rm -f pi
5 | 


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv3/pi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv3/pi


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv3/pi.cpp:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | #define NUM_THREADS 2
 4 | static long num_steps = 100000;
 5 | double step;
 6 | 
 7 | // Midpoint-rule estimate of pi; every term is added under a critical section.
 8 | int main() {
 9 |     double pi = 0.0;
10 |     step = 1.0/(double)num_steps;
11 |     omp_set_num_threads(NUM_THREADS);
12 |     #pragma omp parallel
13 |     {
14 |         int i,id,nthrds;
15 |         double x;
16 |         id = omp_get_thread_num();
17 |         nthrds = omp_get_num_threads();
18 |         // Stride by the local nthrds: a shared copy set by thread 0 can be read too early.
19 |         for (i = id; i < num_steps; i+=nthrds) {
20 |             x = (i+0.5)*step;
21 |             // Taking the lock inside the loop increases lock contention.
22 |             #pragma omp critical
23 |                 pi += 4.0/(1.0+x*x);
24 |         }
25 |     }
26 |     pi*=step;
27 |     printf("%lf",pi);
28 |     return 0;
29 | }


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv4/build/pi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv4/build/pi


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv4/makefile:
--------------------------------------------------------------------------------
1 | hello: pi.cpp
2 | 	g++ -fopenmp pi.cpp -o pi
3 | clean:
4 | 	rm -f pi
5 | 


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv4/pi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv4/pi


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv4/pi.cpp:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | #include<math.h>
 4 | #define NUM_THREADS 4
 5 | static long num_steps = 10000000;
 6 | double step;
 7 | 
 8 | int main() {
 9 |     int nthreads;
10 |     double pi = 0.0;
11 |     step = 1.0/(double)num_steps;
12 | 
13 |     double t = omp_get_wtime();
14 | 
15 |     omp_set_num_threads(NUM_THREADS);
16 |     #pragma omp parallel
17 |     {
18 |         int i,id,nthrds;
19 |         double x,sum;
20 |         id = omp_get_thread_num();
21 |         nthrds = omp_get_num_threads();
22 |         if(id==0) nthreads = nthrds;
23 |         for (i = id,sum = 0.0; i < num_steps; i+=nthreads) {
24 |             x = (i+0.5)*step;
25 |             sum += 4.0/(1.0+x*x);
26 |         }
27 |         sum *= step;
28 |         // 这里使用了一个critical，确保每次都只会有一个线程访问该段代码
29 |         // 与上一个版本相比，好处在于不需要开辟空间存储了
30 |         // 但缺点是引入了锁，会导致性能的下降
31 |         #pragma atomic 
32 |         {
33 |             pi += sum;
34 |         }
35 |     }
36 |     //printf("%lf",pi);
37 | 
38 |     t = omp_get_wtime() - t;
39 |     printf("time %f %d\n", t, NUM_THREADS);
40 | 
41 |     printf("pi %.12f %.12f\n", pi, pi - M_PI);
42 |     return 0;
43 | }


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv5/makefile:
--------------------------------------------------------------------------------
1 | hello: pi.cpp
2 | 	g++ -fopenmp pi.cpp -o pi
3 | clean:
4 | 	rm -f pi
5 | 


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv5/pi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/PIv5/pi


--------------------------------------------------------------------------------
/openmp/lab_par_for/PIv5/pi.cpp:
--------------------------------------------------------------------------------
 1 | #include <omp.h>
 2 | #include <stdio.h>
 3 | #define NUM_THREADS 2
 4 | static long num_steps = 100000;
 5 | double step;
 6 | 
 7 | // Midpoint-rule estimate of pi: integrates 4/(1+x*x) over [0,1] with a
 8 | // work-shared loop and a "+" reduction on the running sum.
 9 | int main() {
10 |   double total = 0.0;
11 |   step = 1.0 / (double)num_steps;
12 | 
13 |   // Each thread accumulates a private partial sum; OpenMP combines them.
14 | #pragma omp parallel for reduction(+ : total)
15 |   for (int k = 0; k < num_steps; k++) {
16 |     const double mid = (k + 0.5) * step;  // midpoint of the k-th slice
17 |     total += 4.0 / (1.0 + mid * mid);
18 |   }
19 | 
20 |   // Scale the summed heights by the slice width.
21 |   const double pi = step * total;
22 |   printf("%lf", pi);
23 |   return 0;
24 | }


--------------------------------------------------------------------------------
/openmp/lab_par_for/hello/hello:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/hello/hello


--------------------------------------------------------------------------------
/openmp/lab_par_for/hello/hello.cpp:
--------------------------------------------------------------------------------
 1 | #include<omp.h>
 2 | #include<stdio.h>
 3 | // Each thread in the parallel team prints a two-part greeting with its id.
 4 | int main(){
 5 |     #pragma omp parallel
 6 |     {
 7 |         const int tid = omp_get_thread_num();
 8 |         printf("Hello(%d)", tid);
 9 |         printf(" world(%d)", tid);
10 |     }
11 | }


--------------------------------------------------------------------------------
/openmp/lab_par_for/hello/makefile:
--------------------------------------------------------------------------------
1 | hello: hello.cpp
2 | 	g++ -fopenmp hello.cpp -o hello
3 | clean:
4 | 	rm -f hello
5 | 


--------------------------------------------------------------------------------
/openmp/lab_par_for/lab_par_for.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/lab_par_for.tgz


--------------------------------------------------------------------------------
/openmp/lab_par_for/par_for/makefile:
--------------------------------------------------------------------------------
1 | hello: par_for.cpp
2 | 	g++ -fopenmp par_for.cpp -o par_for
3 | clean:
4 | 	rm -f par_for
5 | 


--------------------------------------------------------------------------------
/openmp/lab_par_for/par_for/par_for:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_par_for/par_for/par_for


--------------------------------------------------------------------------------
/openmp/lab_par_for/par_for/par_for.cpp:
--------------------------------------------------------------------------------
 1 | #include <omp.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include <vector>
 5 | #include <iostream>
 6 | #include <algorithm>
 7 | 
 8 | #define SIZE 12
 9 | using namespace std;
10 | 
11 | 
12 | // Records which thread ran each iteration (dynamic chunks of 4) and prints it.
13 | int main() {
14 |   int owner[SIZE];
15 |   #pragma omp parallel for schedule(dynamic, 4)
16 |   for (int idx = 0; idx < SIZE; ++idx)
17 |     owner[idx] = omp_get_thread_num();
18 | 
19 |   for (int idx = 0; idx != SIZE; ++idx)
20 |     printf(" %d", owner[idx]);
21 |   printf(" %d\n", SIZE);
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/openmp/lab_par_for/readme.txt:
--------------------------------------------------------------------------------
 1 | - Watch lectures: https://icloud.qd.sdu.edu.cn:7777/link/2D2A742C095E0CFD13FEB87F405E2FEB
 2 | - Expiration date: 2023-06-18
 3 | - Implement your "Hello, World!" program "hello.cpp" of Unit 1
 4 | 代码见hello文件夹
 5 | - Compile and run program "par_for.cpp" multiple times
 6 | 
 7 | - Describe your observations
 8 | 将SIZE设置为12，多次运行结果为
 9 | 58 58 58 58 61 61 61 61 57 57 57 57 12
10 | 43 43 43 43 46 46 46 46 44 44 44 44 12
11 | 24 24 24 24 53 53 53 53 39 39 39 39 12
12 | 62 62 62 62 48 48 48 48 20 20 20 20 12
13 | #pragma omp parallel for schedule(dynamic, k)
14 | 每次输出时都会有多组数字，组数为（SIZE/k）或（SIZE/k + 1）；除最后一组可能不足k个外，每组数字中有k个相同的数字，k为schedule子句中指定的第二个参数（块大小）。
15 | 
16 | - Explain why this happens
17 | 
18 | openmp底层会为每个线程分配k次计算，
19 | 这里选用的调度方式为dynamic，运行时为随机选择可用的线程，每次运行的结果几乎都是不一样的，
20 | 一般无法做到预测使用哪些线程。
21 | 若调度方式为static，则每次运行结果都是固定的。
22 | 
23 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/build/build/tp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_integral/build/build/tp


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/build/pi_integral_0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_integral/build/pi_integral_0


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/build/pi_integral_1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_integral/build/pi_integral_1


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/build/pi_integral_1_1.cpp:
--------------------------------------------------------------------------------
 1 | // student name:
 2 | // id number:
 3 | 
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | #define SIZE 10
11 | static long num_step = 10000000;
12 | // add your codes end
13 | 
14 | // Midpoint-rule estimate of pi where every loop iteration updates the shared
15 | // sum inside the named critical section "cr"; prints time, SIZE and error.
16 | int main() {
17 |   double pi;
18 | 
19 |   double t = omp_get_wtime();
20 |   // add your codes begin
21 |   double total = 0.0;
22 |   const double step = 1.0 / (double)num_step;
23 |   // SIZE doubles as the requested thread count here.
24 |   omp_set_num_threads(SIZE);
25 | 
26 | #pragma omp parallel for
27 |   for (int k = 0; k < num_step; k++) {
28 |     // The whole iteration body runs under the lock, one thread at a time.
29 | #pragma omp critical(cr)
30 |     {
31 |       const double mid = (k + 0.5) * step;
32 |       total += 4.0 / (1.0 + mid * mid);
33 |     }
34 |   }
35 |   pi = step * total;
36 |   // add your codes end
37 |   t = omp_get_wtime() - t;
38 |   printf("time %f %d\n", t, SIZE);
39 | 
40 |   printf("pi %.12f %.12f\n", pi, pi - M_PI);
41 | }
42 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/build/pi_integral_2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_integral/build/pi_integral_2


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/build/tp.cpp:
--------------------------------------------------------------------------------
 1 | // student name: Junhao Xu
 2 | // id number: 201900122025
 3 | 
 4 | /*#include <omp.h>
 5 | #include <math.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | #define SIZE 10000000
11 | static long num_steps = SIZE;
12 | double step;
13 | // add your codes end
14 | 
15 | int main()
16 | {
17 |   double pi=0.0;
18 | 
19 |   double t = omp_get_wtime();
20 |   // add your codes begin
21 |   step = 1.0 / (double)num_steps;
22 |   omp_set_num_threads(100);
23 | 
24 | 
25 |   #pragma omp parallel
26 |   {
27 |     int i;
28 |     double x, sum;
29 |     int id = omp_get_thread_num();
30 |     int nthrds = omp_get_num_threads();
31 |     // double temp1 = 0.5*step;
32 |     // double temp2 = nthrds*step;
33 |     // double end = num_steps*step;
34 |     x = (id) * step;
35 |     double margin = nthrds*step;
36 |     for (i = id + nthrds,sum= 0.0; i < num_steps; i+=nthrds)
37 |     {
38 |       x += margin;
39 |       // x = (i + 0.5) * step;
40 |       sum += 4.0 / (1.0 + x * x);
41 |     }
42 |     // sum = sum*step;
43 |     #pragma omp critical
44 |       pi+=sum*step;
45 | 
46 |     // #pragma atomic
47 |     //   pi+=sum;
48 |   }
49 |   // pi+=0.000100000001;
50 |   // add your codes end
51 |   t = omp_get_wtime() - t;
52 |   printf("time %f %d\n", t, SIZE);
53 | 
54 |   printf("pi %.12f %.12f\n", pi, pi - M_PI);
55 | }*/
56 | 
57 | #include <omp.h>
58 | static long num_steps = 100000;
59 | double step;
60 | #define NUM_THREADS 2
61 | void main() {
62 |   double pi;
63 |   step = 1.0 / (double)num_steps;
64 |   omp_set_num_threads(NUM_THREADS);
65 |   int nthreads = 0;
66 | #pragma omp parallel
67 |   {
68 |     int i, id, nthrds;
69 |     double x, sum;
70 |     id = omp_get_thread_num();
71 |     nthrds = omp_get_num_threads();
72 |     if (id == 0)
73 |       nthreads = nthrds;
74 |     id = omp_get_thread_num();
75 |     nthrds = omp_get_num_threads();
76 |     for (i = id, sum = 0.0; i < num_steps; i = i + nthreads) {
77 |       x = (i + 0.5) * step;
78 |       sum += 4.0 / (1.0 + x * x);
79 |     }
80 |     sum = sum * step;
81 | #pragma atomic
82 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/lab_pi_integral.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_integral/lab_pi_integral.tgz


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/pi_integral_0.cpp:
--------------------------------------------------------------------------------
 1 | // student 宋家庆:
 2 | // id 202000130061:
 3 | 
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | 
10 | // add your codes begin
11 | // #define SIZE 10000000
12 | static long num_steps = SIZE;
13 | double step;
14 | // add your codes end
15 | 
16 | // Serial baseline: midpoint-rule estimate of pi over SIZE steps, with timing.
17 | int main() {
18 |   double t = omp_get_wtime();
19 |   // add your codes begin
20 |   //omp_set_num_threads(SIZE);
21 |   step = 1.0 / (double)num_steps;
22 |   double acc = 0.0;
23 |   for (int k = 0; k < num_steps; ++k) {
24 |     const double mid = (k + 0.5) * step;  // midpoint of slice k
25 |     acc += 4.0 / (1.0 + mid * mid);
26 |   }
27 |   const double pi = step * acc;
28 | 
29 |   // add your codes end
30 |   t = omp_get_wtime() - t;
31 |   printf("time %f %d\n", t, SIZE);
32 | 
33 |   printf("pi %.12f %.12f\n", pi, pi - M_PI);
34 | }
35 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/pi_integral_1.cpp:
--------------------------------------------------------------------------------
 1 | // student 宋家庆:
 2 | // id 202000130061:
 3 | 
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | //#define SIZE 10000000
11 | static long num_step = SIZE;
12 | // add your codes end
13 | 
14 | // Parallel midpoint-rule estimate of pi: a work-shared loop whose per-thread
15 | // partial sums are combined by an OpenMP "+" reduction.
16 | int main() {
17 |   double pi;
18 | 
19 |   double t = omp_get_wtime();
20 |   // add your codes begin
21 |   double total = 0.0;
22 |   const double step = 1.0 / (double)num_step;
23 |   // Request 10 threads for the parallel loop below.
24 |   omp_set_num_threads(10);
25 | 
26 | #pragma omp parallel for reduction(+ : total)
27 |   for (int k = 0; k < num_step; k++) {
28 |     const double mid = (k + 0.5) * step;  // midpoint of slice k
29 |     total += 4.0 / (1.0 + mid * mid);
30 |   }
31 | 
32 |   pi = step * total;
33 |   // add your codes end
34 |   t = omp_get_wtime() - t;
35 |   printf("time %f %d\n", t, SIZE);
36 | 
37 |   printf("pi %.12f %.12f\n", pi, pi - M_PI);
38 | }
39 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/pi_integral_2.cpp:
--------------------------------------------------------------------------------
 1 | // student 宋家庆:
 2 | // id 202000130061:
 3 | 
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | //#define SIZE 25
11 | 
12 | static long num_step = SIZE;
13 | // add your codes end
14 | 
15 | // Parallel midpoint-rule estimate of pi without a work-sharing loop: each
16 | // thread sums a strided share of the steps and adds it to pi once.
17 | int main() {
18 |   // Must start at zero: the threads accumulate into it with "+=".
19 |   double pi = 0.0;
20 | 
21 |   double t = omp_get_wtime();
22 |   // add your codes begin
23 |   double step = 1 / (double)num_step;
24 | 
25 |   omp_set_num_threads(10);
26 | 
27 | #pragma omp parallel
28 |   {
29 |     int id,i,nthrds;
30 |     id = omp_get_thread_num();
31 |     // i, x and sum are private to each thread, so every thread only walks
32 |     // the iterations it owns and keeps its own partial sum.
33 |     double x = 0.0, sum = 0.0;
34 |     nthrds = omp_get_num_threads();
35 | 
36 |     // Each thread handles steps id, id + nthrds, id + 2 * nthrds, ...
37 |     for (i = id; i < num_step; i += nthrds) {
38 |       x = (i + 0.5) * step;
39 |       sum += 4.0 / (1.0 + x * x);
40 |     }
41 | 
42 |     // Add this thread's share to the shared result under a lock. The
43 |     // directive must carry the "omp" prefix: a bare "#pragma critical" is
44 |     // not an OpenMP directive, so the update below would run unprotected
45 |     // and concurrent threads could lose each other's contributions.
46 |     // The multiply by step is done once per thread, not once per term.
47 |     #pragma omp critical (critical1)
48 |         pi += sum * step;
49 |   }
50 |   // add your codes end
51 |   t = omp_get_wtime() - t;
52 |   printf("time %f %d\n", t, SIZE);
53 | 
54 |   printf("pi %.12f %.12f\n", pi, pi - M_PI);
55 | }
56 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/readme.txt:
--------------------------------------------------------------------------------
 1 | - Student name: 宋家庆
 2 | - ID number: 202000130061
 3 | 
 4 | - Watch lectures: https://icloud.qd.sdu.edu.cn:7777/link/2D2A742C095E0CFD13FEB87F405E2FEB
 5 | - Expiration date: 2023-06-18
 6 | - Implement your Pi program "pi_integral_N.cpp" of Unit 2
 7 |   - Version 0: serial program
 8 |   - Version 1: parallel program using parallel-for and reduction
 9 |   - Version 2: parallel program without using parallel-for 
10 | 
11 | 代码见目录
12 | - Compile and run your programs multiple times
13 | 三个代码统一设置线程数量为4，步数为10000000，v2版本中尝试使用了atomic和critical
14 | v0运行三次结果分别为
15 | time 0.027952 4
16 | pi 3.141592653590 -0.000000000000
17 | 
18 | time 0.028419 4
19 | pi 3.141592653590 -0.000000000000
20 | 
21 | time 0.027284 4
22 | pi 3.141592653590 -0.000000000000
23 | 
24 | v1运行三次结果分别为
25 | time 0.009849 4
26 | pi 3.141592653590 -0.000000000000
27 | 
28 | time 0.023802 4
29 | pi 3.141592653590 -0.000000000000
30 | 
31 | time 0.012471 4
32 | pi 3.141592653590 -0.000000000000
33 | 
34 | v2运行三次结果分别为
35 | critical版本
36 | time 0.011807 4
37 | pi 3.141592653590 -0.000000000000
38 | 
39 | time 0.013312 4
40 | pi 3.141592653590 -0.000000000000
41 | 
42 | time 0.010704 4
43 | pi 3.141592653590 -0.000000000000
44 | 
45 | atomic版本
46 | time 0.010627 4
47 | pi 3.141592653590 -0.000000000000
48 | 
49 | time 0.010760 4
50 | pi 3.141592653590 -0.000000000000
51 | 
52 | time 0.011253 4
53 | pi 3.141592653590 -0.000000000000
54 | 
55 | - Describe your observations
56 | 串行版本的耗时最长，使用reduction和v2版本效率差距不大。
57 | 在步数固定的情况下，当线程数量增加时，v1和v2版本的性能会先增加，
58 | 但线程数量翻倍运行效率并不会达到对应的倍数。当线程数量到达一定值时，效率不再增加。
59 | 之后再增加线程数量，效率反而会下降。
60 | 
61 | 在实现v2版本时，初始时对于for循环中每一次都进行了加锁操作，这时效率极差，
62 | 将部分变量改为线程私有，每个线程独自计算自己需要计算的部分，最后将结果原子求和，效率得到了极大提升。
63 | 
64 | 
65 | - Explain why this happens
66 | 初期随着线程数量的增加，运行效率会随之增加，这是因为多个线程每个线程只执行循环的一部分，并行执行，
67 | 效率增加。
68 | 效率并不会随着线程数量翻倍，这是因为线程的创建和销毁都会占用时间
69 | 当线程数量过大时，效率下降：这说明此时该程序用在线程创建和销毁上的时间已经超过了具体计算的时间，
70 | 成为了该程序的计算瓶颈。这提醒我们线程并不是越多越好。
71 | 
72 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/run.sh:
--------------------------------------------------------------------------------
1 | version=2  # selects pi_integral_$version.cpp as the source file to build
2 | size=1000000  # handed to the compiler as the SIZE macro via -DSIZE
3 | #rm ./build/pi_integral_$version
4 | g++ -o ./build/pi_integral_$version -fopenmp -DSIZE=$size pi_integral_$version.cpp && timeout 60s time ./build/pi_integral_$version  # build; on success run the binary under time with a 60 s limit
5 | 
6 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_integral/test.cpp:
--------------------------------------------------------------------------------
 1 | // student 宋家庆:
 2 | // id 202000130061:
 3 | 
 4 | /*#include <math.h>
 5 | #include <omp.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | //#define SIZE 25
11 | 
12 | static long num_step = 10000000;
13 | // add your codes end
14 | 
15 | int main() {
16 |   double pi;
17 | 
18 |   double t = omp_get_wtime();
19 |   // add your codes begin
20 |   double step = 1 / (double)num_step;
21 | 
22 |   omp_set_num_threads(25);
23 |   int real_num = 0;
24 |   int nthreads;
25 | 
26 | #pragma omp parallel
27 |   {
28 |     int id,i,nthrds;
29 |     id = omp_get_thread_num();
30 |     // i为每个线程私有，每个线程只需执行自己需要执行的次数即可
31 |     // 这里的x和sum也不再共享，而是每个线程私有一份
32 |     double x, sum = 0.0;
33 |     nthrds = omp_get_num_threads();
34 |     if (id == 0) {
35 |         nthreads = omp_get_num_threads();
36 |         //printf("-----%d",real_num);
37 |     }
38 |     //id = omp_get_thread_num();
39 |     //nthrds = omp_get_num_threads();
40 |     //printf("%d ",id);
41 |     x = (id)*step;
42 |     double margin = nthrds*step;
43 |     // 每个线程只执行循环中自己需要执行的部分
44 |     for (i = id; i < num_step; i += nthreads) {
45 |       //x = (i + 0.5) * step;
46 |       x = x + margin;
47 |       sum += 4.0 / (1.0 + x * x);
48 |     }
49 |     #pragma atomic
50 |         pi += sum* step; 
51 |     //#pragma critical (critical1) 
52 |     //    pi += sum; 
53 |   }
54 |   // add your codes end
55 |   t = omp_get_wtime() - t;
56 |   printf("time %f %d\n", t, SIZE);
57 | 
58 |   printf("pi %.12f %.12f\n", pi, pi - M_PI);
59 | }
60 | */
61 | 
62 | // student name: Junhao Xu
63 | // id number: 201900122025
64 | 
65 | #include <omp.h>
66 | #include <math.h>
67 | #include <stdio.h>
68 | using namespace std;
69 | 
70 | // add your codes begin
71 | 
72 | static long num_steps = 100;
73 | double step;
74 | // add your codes end
75 | 
76 | // Scratch test: every thread bumps a shared counter and prints the value it got.
77 | int main()
78 | {
79 |   double pi=0.0;
80 |   double t = omp_get_wtime();
81 |   // add your codes begin
82 |   step = 1.0 / (double)num_steps;
83 |   omp_set_num_threads(100);
84 |   int i = 0;  // shared by all threads in the region below
85 |   #pragma omp parallel
86 |   {
87 |     int seen;
88 |     // Increment and read back in one atomic step; a plain i++ here is a data race.
89 |     #pragma omp atomic capture
90 |     seen = ++i;
91 |     printf("%d\n", seen);
92 |   }
93 |   // add your codes end
94 |   t = omp_get_wtime() - t;
95 |   printf("time %f %d\n", t, 25);
96 | 
97 |   printf("pi %.12f %.12f\n", pi, pi - M_PI);
98 | }
99 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/build/pi_rnd_1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_rnd/build/pi_rnd_1


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/build/pi_rnd_2 copy.cpp:
--------------------------------------------------------------------------------
 1 | // student name: 宋家庆
 2 | // id number: 202000130061
 3 | 
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | // #define SIZE 10000
11 | static long MULTIPLIER1 = 1366;
12 | static long ADDEND1 = 150889;
13 | static long PMOD1 = 714025;
14 | 
15 | static long MULTIPLIER2 = 1277;
16 | static long ADDEND2 = 524849;
17 | static long PMOD2 = 981293;
18 | 
19 | const int thrd_num = 30;
20 | unsigned long long pseed[10000];
21 | 
22 | inline long LCG(long random_last) {  // advance the generator by one step
23 |     return (ADDEND2 + random_last * MULTIPLIER2) % PMOD2;
24 | }
25 | // add your codes end
26 | 
27 | // Monte Carlo estimate of pi: each thread draws points from its own LCG stream.
28 | int main() {
29 |     double pi;
30 |     double t = omp_get_wtime();
31 |     // add your codes begin
32 |     int nthreads = 0;
33 |     double r = 1.0;
34 |     int num_in = 0;
35 | #pragma omp parallel
36 |     {
37 | #pragma omp single
38 |         {
39 |             nthreads = omp_get_num_threads();
40 |             unsigned long long iseed = PMOD1 / MULTIPLIER1;  // just pick a seed
41 |             pseed[0] = iseed;
42 |             for (int i = 1; i < nthreads; ++i) {
43 |                 iseed = (unsigned long long)((MULTIPLIER1 * iseed) % PMOD1);
44 |                 pseed[i] = iseed;
45 |             }
46 |         }  // implicit barrier: seeds and nthreads are set before any thread goes on
47 |         int id = omp_get_thread_num();
48 |         int tp_num = 0;
49 |         long long random_last = (unsigned long long)pseed[id];
50 |         // Start at id so the team draws exactly SIZE points in total, matching the divisor.
51 |         for (int i = id; i < SIZE; i += nthreads) {
52 |             random_last = LCG(random_last);
53 |             double x = (double)random_last / (double)PMOD2;
54 |             random_last = LCG(random_last);
55 |             double y = (double)random_last / (double)PMOD2;
56 |             if (x * x + y * y <= r) {
57 |                 tp_num++;
58 |             }
59 |         }
60 | #pragma omp critical
61 |         num_in += tp_num;
62 |     }
63 |     pi = 4.0 * (double)(num_in) / (double)(SIZE);
64 |     // add your codes end
65 |     t = omp_get_wtime() - t;
66 |     printf("time %f %d\n", t, SIZE);
67 | 
68 |     printf("pi %.12f %.12f\n", pi, pi - M_PI);
69 | }
70 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/build/pi_rnd_2.cpp:
--------------------------------------------------------------------------------
 1 | // student name: 宋家庆
 2 | // id number: 202000130061
 3 | 
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | // #define SIZE 10000
11 | static long MULTIPLIER = 1366;
12 | static long ADDEND = 150889;
13 | static long PMOD = 714025;
14 | long random_last = 0;
15 | 
16 | // Advances the global generator state random_last by one LCG step and
17 | // returns the new state.
18 | inline long LCG() {
19 |     random_last = (ADDEND + random_last * MULTIPLIER) % PMOD;
20 |     return random_last;
21 | }
22 | // add your codes end
23 | 
24 | double rnd_arr[2 * SIZE];
25 | 
26 | // Monte Carlo estimate of pi: fill rnd_arr serially from the global LCG, then
27 | // count in parallel how many (x, y) pairs fall inside the circle of radius r.
28 | int main() {
29 |     double pi;
30 | 
31 |     double t = omp_get_wtime();
32 |     // add your codes begin
33 |     const double r = 1.0;
34 |     int hits = 0;
35 |     // The generator has global state, so the draws are produced by one thread.
36 |     for (int k = 0; k < 2 * SIZE; ++k)
37 |         rnd_arr[k] = (double)LCG() / double(PMOD);
38 | 
39 | #pragma omp parallel for reduction(+ : hits)
40 |     for (int k = 0; k < SIZE; ++k) {
41 |         const double px = rnd_arr[2 * k];
42 |         const double py = rnd_arr[2 * k + 1];
43 |         hits += (px * px + py * py <= r) ? 1 : 0;
44 |     }
45 |     pi = 4.0 * (double)(hits) / (double)(SIZE);
46 |     // add your codes end
47 |     t = omp_get_wtime() - t;
48 |     printf("time %f %d\n", t, SIZE);
49 | 
50 |     printf("pi %.12f %.12f\n", pi, pi - M_PI);
51 | }
52 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/lab_pi_rnd.2023-04-12.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_rnd/lab_pi_rnd.2023-04-12.tgz


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/pi_rnd_0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_rnd/pi_rnd_0


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/pi_rnd_0.cpp:
--------------------------------------------------------------------------------
 1 | // student name: 宋家庆
 2 | // id number: 202000130061
 3 | 
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | static long MULTIPLIER = 1366;
11 | static long ADDEND = 150889;
12 | static long PMOD = 714025;
13 | long random_last = 0;
14 | 
15 | // Advances the global generator state random_last by one LCG step and
16 | // returns the new state.
17 | long LCG() {
18 |     random_last = (ADDEND + random_last * MULTIPLIER) % PMOD;
19 |     return random_last;
20 | }
21 | // add your codes end
22 | 
23 | // Serial Monte Carlo estimate of pi: draw SIZE points with both coordinates
24 | // scaled by 1/PMOD and count those within distance r of the origin.
25 | int main() {
26 |     double pi;
27 | 
28 |     double t = omp_get_wtime();
29 |     // add your codes begin
30 |     const double r = 1.0;
31 |     int hits = 0;
32 |     for (int k = 0; k < SIZE; ++k) {
33 |         // x is drawn before y, so the generator sequence is consumed in order.
34 |         const double px = (double)LCG() / (double)PMOD;
35 |         const double py = (double)LCG() / (double)PMOD;
36 |         hits += (px * px + py * py <= r) ? 1 : 0;
37 |     }
38 | 
39 |     pi = 4.0 * (double)(hits) / (double)(SIZE);
40 |     t = omp_get_wtime() - t;
41 |     // add your codes end
42 |     printf("time %f %d\n", t, SIZE);
43 | 
44 |     printf("pi %.12f %.12f\n", pi, pi - M_PI);
45 | }
46 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/pi_rnd_1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_rnd/pi_rnd_1


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/pi_rnd_1.cpp:
--------------------------------------------------------------------------------
 1 | // student name: 宋家庆
 2 | // id number: 202000130061
 3 | 
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | // #define SIZE 10000
11 | static long MULTIPLIER = 1366;
12 | static long ADDEND = 150889;
13 | static long PMOD = 714025;
14 | 
15 | // inline long LCG(long random_last) {
16 | //     return (MULTIPLIER * random_last + ADDEND) % PMOD;
17 | // }
18 | long random_last = 0;
19 | #pragma omp threadprivate(random_last)
20 | // Advances the calling thread's generator state (random_last is declared
21 | // threadprivate) by one LCG step and returns the new state divided by PMOD.
22 | double LCG() {
23 |     random_last = (ADDEND + random_last * MULTIPLIER) % PMOD;
24 |     return (double)random_last / (double)PMOD;
25 | }
26 | // add your codes end
27 | 
28 | int main() {  // parallel Monte Carlo estimate of pi using the threadprivate LCG
29 |     double pi;
30 |     double t = omp_get_wtime();
31 |     // add your codes begin
32 |     double r = 1.0;
33 |     int num_in = 0;
34 | #pragma omp parallel reduction(+ : num_in)
35 |     {
36 |         // Give each thread its own starting state; left at the common initial
37 |         // value 0, every thread would replay the identical random sequence.
38 |         random_last = omp_get_thread_num();
39 | #pragma omp for
40 |         for (int i = 0; i < SIZE; i++) {
41 |             double x = LCG();
42 |             double y = LCG();
43 |             if (x * x + y * y <= r) num_in++;
44 |         }
45 |     }
46 |     pi = 4.0 * (double)(num_in) / (double)(SIZE);
47 |     // add your codes end
48 |     t = omp_get_wtime() - t;
49 |     printf("time %f %d\n", t, SIZE);
50 | 
51 |     printf("pi %.12f %.12f\n", pi, pi - M_PI);
52 | }
53 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/pi_rnd_2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_pi_rnd/pi_rnd_2


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/pi_rnd_2.cpp:
--------------------------------------------------------------------------------
 1 | // student name: 宋家庆
 2 | // id number: 202000130061
 3 | 
 4 | #include <math.h>
 5 | #include <omp.h>
 6 | #include <stdio.h>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | // #define SIZE 10000
11 | static long MULTIPLIER1 = 1366;
12 | static long ADDEND1 = 150889;
13 | static long PMOD1 = 714025;
14 | 
15 | // static long MULTIPLIER2 = 1277;
16 | // static long ADDEND2 = 524849;
17 | // static long PMOD2 = 981293;
18 | static long MULTIPLIER2 = 1366;
19 | static long ADDEND2 = 150889;
20 | static long PMOD2 = 714025;
21 | 
22 | const int thrd_num = 30;
23 | unsigned long long pseed[10000];
24 | 
25 | inline long LCG(long random_last) {
26 |     return (MULTIPLIER2 * random_last + ADDEND2) % PMOD2;
27 | }
28 | // add your codes end
29 | 
30 | int main() {
31 |     double pi;
32 | 
33 |     double t = omp_get_wtime();
34 |     // add your codes begin
35 |     int nthreads = 0;
36 |     const double r = 1.0;
37 |     int num_in = 0;
38 | #pragma omp parallel num_threads(thrd_num) reduction(+ : num_in)
39 |     {
40 |         // One thread records the team size and draws a starting seed for
41 |         // every thread into pseed[].
42 | #pragma omp single
43 |         {
44 |             nthreads = omp_get_num_threads();
45 |             unsigned long long iseed = PMOD1 / MULTIPLIER1;  // just pick a seed
46 |             pseed[0] = iseed;
47 |             for (int i = 1; i < nthreads; ++i) {
48 |                 iseed = rand() % PMOD1;
49 |                 pseed[i] = iseed;
50 |             }
51 |         }
52 |         // The implicit barrier closing the single construct publishes nthreads
53 |         // and pseed[]; each thread then iterates the LCG from its own seed.
54 |         int id = omp_get_thread_num();
55 |         long long random_last = (unsigned long long)pseed[id];
56 |         // Start at id rather than 0 so the team draws exactly SIZE points in
57 |         // total, matching the division by SIZE below.
58 |         for (int i = id; i < SIZE; i += nthreads) {
59 |             random_last = LCG(random_last);
60 |             double x = (double)random_last / (double)PMOD2;
61 |             random_last = LCG(random_last);
62 |             double y = (double)random_last / (double)PMOD2;
63 |             if (x * x + y * y <= r) {
64 |                 num_in++;
65 |             }
66 |         }
67 |     }
68 | 
69 |     pi = 4.0 * (double)(num_in) / (double)(SIZE);
70 |     // add your codes end
71 |     t = omp_get_wtime() - t;
72 |     printf("time %f %d\n", t, SIZE);
73 | 
74 |     printf("pi %.12f %.12f\n", pi, pi - M_PI);
75 | }
76 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/readme.txt:
--------------------------------------------------------------------------------
 1 | - Student name: 宋家庆
 2 | - ID number: 202000130061
 3 | 
 4 | - Implement your Pi program "pi_rnd_N.cpp"
 5 |   - Version 0: serial program using LCG PRNG
 6 |   - Version 1: parallel program using thread-safe PRNG
 7 |   - Version 2: parallel program using leap-frog PRNG
 8 | - Compile and run your programs multiple times
 9 | - Describe your observations
10 | 当SIZE大小为100000000时
11 | version 0
12 | 所需时间为4.167296，计算得到的pi大小为3.141593560000，误差为0.000000906410
13 | version 1
14 | 所需时间为0.127502，计算得到的pi大小为3.141757440000，误差为0.000164786410
15 | version 2
16 | 所需时间为0.175884，计算得到的pi大小为3.141585920000，误差为-0.000000533590
17 | 串行版本的准确度最高，thread-safe版本速度较快，但准确度较低，leap-frog可以在高效的情况下保证较高的准确度。
18 | - Explain why this happens
19 | thread_safe版本，每个线程都拷贝了一份初始的random_seed，但由于使用了相同的随机数种子和随机数算法，使得生成的
20 | 随机数并没有那么的随机，导致蒙特卡洛算法结果不理想。
21 | leap-frog版本先使用了一种随机数生成算法为每个线程生成了一个随机数种子，又使用了另一种随机数生成算法在每个线程内部
22 | 使用不同的随机数种子生成随机数。较之thread-safe版本生成的随机数更加随机。能够逼近单线程生成的随机数效果。
23 | 
24 | 


--------------------------------------------------------------------------------
/openmp/lab_pi_rnd/run.sh:
--------------------------------------------------------------------------------
1 | version=2
2 | size=100000000
3 | 
4 | g++ -o pi_rnd_$version -fopenmp -DSIZE=$size pi_rnd_$version.cpp && timeout 60s time ./pi_rnd_$version


--------------------------------------------------------------------------------
/openmp/lab_scan_frag/lab_scan_frag.2023-04-19.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_frag/lab_scan_frag.2023-04-19.tgz


--------------------------------------------------------------------------------
/openmp/lab_scan_frag/readme.txt:
--------------------------------------------------------------------------------
 1 | - Student name: 宋家庆
 2 | - ID number: 202000130061
 3 | 
 4 | - Implement your fragment scan algorithm
 5 | - Compile and run your programs multiple times
 6 | - Describe how you incrementally implement and test your program
 7 | 算法运行结果如下
 8 | time 0.090030 100000000
 9 | 6.98user 1.91system 0:03.85elapsed 230%CPU (0avgtext+0avgdata 2344884maxresident)k
10 | 0inputs+0outputs (0major+586522minor)pagefaults 0swaps
11 | 平均执行时间在0.3-0.4之间，较之提供的标准答案效率提高了一倍。
12 | 
13 | 首先直接复现一下之前在scan中实现的recursion算法，将其中的加法操作改为定义好的操作。
14 | 此时算法执行时间约为1秒，将递归操作中的引用传递改为值传递，时间降低到0.9s
15 | 考虑到std中定义很多额外的操作，可能会比较耗时，这里将所有的vector都转换为int*数组
16 | 此时时间降低到0.09。
17 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_frag/run.sh:
--------------------------------------------------------------------------------
1 | version=0
2 | size=100000000
3 | 
4 | g++ -o scan_frag_$version -fopenmp -DSIZE=$size scan_frag_$version.cpp && timeout 60s time ./scan_frag_$version
5 | 
6 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_frag/scan_frag:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_frag/scan_frag


--------------------------------------------------------------------------------
/openmp/lab_scan_frag/scan_frag_0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_frag/scan_frag_0


--------------------------------------------------------------------------------
/openmp/lab_scan_frag/scan_frag_0.cpp:
--------------------------------------------------------------------------------
 1 | #include <assert.h>
 2 | #include <omp.h>
 3 | #include <stdlib.h>
 4 | 
 5 | #include <iostream>
 6 | #include <vector>
 7 | using namespace std;
 8 | 
 9 | // add your codes begin
10 | int thrd_num = 50;
11 | // In-place segmented inclusive prefix sum over data[0..size): flag[i] == 1
12 | // starts a new segment. Adjacent pairs are combined, scanned recursively,
13 | // then expanded back. flag[] is modified as a side effect.
14 | void rscan(int *data, int *flag, int size) {
15 |     if (size <= 1) return;  // also stops size == 0 from recursing forever
16 |     int *tpdata = new int[size / 2];
17 |     int *tpflag = new int[size / 2];
18 | #pragma omp parallel for num_threads(thrd_num)
19 |     for (int i = 0; i < size / 2; i++) {
20 |         if (flag[2 * i + 1] == 1) {
21 |             tpdata[i] = data[2 * i + 1];
22 |             tpflag[i] = flag[2 * i + 1];
23 |         } else {
24 |             tpdata[i] = data[i * 2] + data[2 * i + 1];
25 |             tpflag[i] = flag[2 * i] | flag[2 * i + 1];
26 |         }
27 |     }
28 |     rscan(tpdata, tpflag, size / 2);
29 | #pragma omp parallel for num_threads(thrd_num)
30 |     for (int i = 1; i < size; i += 2) {
31 |         data[i] = tpdata[i / 2];
32 |         if (i + 1 < size && flag[i + 1] != 1) {
33 |             data[i + 1] = tpdata[(i + 1) / 2 - 1] + data[i + 1];
34 |             flag[i + 1] = flag[i + 1] | tpflag[(i + 1) / 2 - 1];
35 |         }
36 |     }
37 |     delete[] tpdata;  // release the per-level buffers instead of leaking them
38 |     delete[] tpflag;
39 | }
40 | // add your codes end
41 | 
42 | int main() {
43 |     vector<int> data(SIZE, 1);
44 |     vector<int> flag(SIZE, 0);
45 |     vector<int> test(SIZE);
46 | 
47 |     srand(SIZE);
48 |     data[0] = 0;
49 |     flag[0] = 1;
50 |     for (int i = 0; i < flag.size() / 12; i++) {
51 |         int index = rand() % flag.size();
52 |         data[index] = 0;
53 |         flag[index] = 1;
54 |     }
55 |     for (int i = 0; i < data.size(); i++)
56 |         test[i] = (flag[i] != 0) ? data[i] : test[i - 1] + data[i];
57 | 
58 |     double t = omp_get_wtime();
59 |     // add your codes begin
60 |     //vector<int> flag2 = flag;
61 |     rscan(&data[0], &flag[0], SIZE);
62 |     // add your codes end
63 |     t = omp_get_wtime() - t;
64 |     printf("time %f %d\n", t, SIZE);
65 | 
66 |     for (int i = 0; i < SIZE; i++) assert(data[i] == test[i]);
67 | }
68 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_link/build/scan_link_0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_link/build/scan_link_0


--------------------------------------------------------------------------------
/openmp/lab_scan_link/lab_scan_link.2023-04-26.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_link/lab_scan_link.2023-04-26.tgz


--------------------------------------------------------------------------------
/openmp/lab_scan_link/readme.txt:
--------------------------------------------------------------------------------
 1 | - Student name:
 2 | - ID number:
 3 | 
 4 | - Implement your list ranking algorithm
 5 | - Compile and run your programs multiple times
 6 | - Describe how you incrementally implement and test your program
 7 | 
 8 | 当SIZE大小为100000000时，在无其他人占用的情况下，答案代码为7.964633，我的时间为5.512434
 9 | 首先我们将所有的rank都初始化为0，在第i次循环里，所有的prev数组都向前跳一次，此时若某个点的prev数组不为-1，即没有跳到头节点，则将data值更新为前一位置data值+pow(2,i)
10 | 这是因为我们随着循环次数的增加，其每一次跳跃所跨越的距离指数递增。最多需要log(SIZE)次跳跃，即可实现求所有的rank
11 | 由于我们在循环对data和prev数组既有写又有读，所以我们要做一次读写分离来保证正确性，即对原数组做一次拷贝。
12 | 跳跃部分可以使用并行进行优化（由于读写分离，所以可以保证并行的正确性）
13 | 优化小trick：
14 | 初始时使用vector进行data和prev的备份，使用memcpy进行拷贝，此时时间为10.944141
15 | 之后将拷贝部分更换为多线程并发拷贝，时间降低至7.885708
16 | 考虑到vector作为一个stl容器，内部的维护也需要耗时，将vector更换为指针，此时时间为5.512434
17 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_link/run.sh:
--------------------------------------------------------------------------------
1 | version=0
2 | size=100000000
3 | 
4 | g++ -o scan_link_$version -fopenmp -DSIZE=$size scan_link_$version.cpp && timeout 60s time ./scan_link_$version
5 | 
6 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_link/scan_link:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_link/scan_link


--------------------------------------------------------------------------------
/openmp/lab_scan_link/scan_link_0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_link/scan_link_0


--------------------------------------------------------------------------------
/openmp/lab_scan_link/scan_link_0.cpp:
--------------------------------------------------------------------------------
 1 | // student name: 宋家庆
 2 | // id number:202000130061
 3 | 
 4 | #include <assert.h>
 5 | #include <math.h>
 6 | #include <omp.h>
 7 | #include <stdlib.h>
 8 | #include <string.h>
 9 | 
10 | #include <iostream>
11 | #include <vector>
12 | using namespace std;
13 | 
14 | // add your codes begin
15 | // #define SIZE 1000
16 | // add your codes end
17 | 
18 | int main() {
19 |     vector<int> data(SIZE, -1);
20 |     vector<int> prev(SIZE, -1);
21 |     vector<int> next(SIZE, -1);
22 |     vector<int> test(SIZE, -1);
23 | 
24 |     srand(SIZE);
25 |     {
26 |         int tmp = -1;
27 |         for (int i = 0; i < SIZE / 2; i++) {
28 |             int idx = rand() % SIZE;
29 |             while (data[idx] >= 0) idx = (idx + 1) % SIZE;
30 |             if (i > 0) {
31 |                 data[idx] = 1;
32 |                 prev[idx] = tmp;
33 |                 next[tmp] = idx;
34 |             } else {
35 |                 data[idx] = 0;
36 |             }
37 |             test[idx] = i;
38 |             tmp = idx;
39 |         }
40 |     }
41 | 
42 |     double t = omp_get_wtime();
43 |     // add your codes begin
44 |     // List ranking by pointer jumping. Ranks start at 0 for every node in
45 |     // the list; unused slots keep data == -1 and prev == -1 throughout.
46 | #pragma omp parallel for
47 |     for (int i = 0; i < SIZE; i++)
48 |         if (data[i] == 1) data[i] = 0;
49 | 
50 |     // Snapshots, so each round reads only the previous round's values.
51 |     int* cdata = new int[SIZE];
52 |     int* cprev = new int[SIZE];
53 |     // Integer doubling of the stride replaces the floating-point pow(2, i).
54 |     for (long long step = 1; step < SIZE; step *= 2) {
55 | #pragma omp parallel for
56 |         for (int k = 0; k < SIZE; k++) cdata[k] = data[k];
57 | #pragma omp parallel for
58 |         for (int k = 0; k < SIZE; k++) cprev[k] = prev[k];
59 | #pragma omp parallel for
60 |         for (int j = 0; j < SIZE; j++) {
61 |             if (prev[j] != -1) {
62 |                 data[j] = cdata[prev[j]] + (int)step;
63 |                 prev[j] = cprev[prev[j]];
64 |             }
65 |         }
66 |     }
67 |     delete[] cdata;  // release the snapshot buffers instead of leaking them
68 |     delete[] cprev;
69 |     // add your codes end
70 |     t = omp_get_wtime() - t;
71 |     printf("time %f %d\n", t, SIZE);
72 | 
73 |     for (int i = 0; i < SIZE; i++) assert(data[i] == test[i]);
74 | }
75 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_tree/lab_scan_tree.2023-05-06.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_tree/lab_scan_tree.2023-05-06.tgz


--------------------------------------------------------------------------------
/openmp/lab_scan_tree/readme.txt:
--------------------------------------------------------------------------------
 1 | - Student name:
 2 | - ID number:
 3 | 
 4 | - Implement your tree ranking algorithm
 5 | - Compile and run your programs multiple times
 6 | - Compare the running times of your list ranking and tree ranking algorithms
 7 | - Explain your observations
 8 | tree的scan算法逻辑和link的逻辑基本是一致的，从较远节点更新某一点的rank的迭代轮数比较近节点所需要的
 9 | 迭代轮数更多，所以使用相同的逻辑可以保证正确性。
10 | 对scan_link和scan_tree进行对比，在同样使用50个线程，SIZE大小为100000000时，scan_link所需要的时间
11 | 为5.512434，而scan_tree所需要的时间为1.976432。可以发现scan_tree所需时间更少，这是因为tree在相同
12 | 节点数量的情况下，由于树可以有多个儿子节点，而link只会有一个儿子节点，树的高度较之link比较低，所需要
13 | 的迭代次数较少，所以所需时间更少。
14 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_tree/run.sh:
--------------------------------------------------------------------------------
1 | version=0
2 | size=100000000
3 | 
4 | g++ -o scan_tree_$version -fopenmp -DSIZE=$size scan_tree_$version.cpp && timeout 60s time ./scan_tree_$version
5 | 
6 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_tree/scan_tree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_tree/scan_tree


--------------------------------------------------------------------------------
/openmp/lab_scan_tree/scan_tree_0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_tree/scan_tree_0


--------------------------------------------------------------------------------
/openmp/lab_scan_tree/scan_tree_0.cpp:
--------------------------------------------------------------------------------
 1 | //student name:
 2 | //id number:
 3 | 
 4 | #include <omp.h>
 5 | #include <stdlib.h>
 6 | #include <assert.h>
 7 | #include <vector>
 8 | #include <iostream>
 9 | #include <cmath>
10 | #include <cstring>
11 | using namespace std;
12 | 
13 | 
14 | // add your codes begin
15 | const int thrd_num = 60;
16 | // add your codes end
17 | 
18 | 
19 | int main() {
20 |   vector<int> data(SIZE, -1);
21 |   vector<int> pare(SIZE, -1);
22 |   vector<int> test(SIZE, -1);
23 | 
24 |   srand(SIZE);
25 |   { vector<int> tmp;
26 |     for (int i = 0; i < SIZE/2; i++) {
27 |       // pick a random index
28 |       int idx = rand() % SIZE;
29 |       // probe forward to a slot that has not been used yet
30 |       while (data[idx] >= 0) idx = (idx + 1) % SIZE;
31 |       if (i > 0) {
32 |         // its distance to its parent is 1
33 |         data[idx] = 1;
34 |         // choose its parent among the nodes inserted so far
35 |         pare[idx] = tmp[rand() % tmp.size()];
36 |         // expected answer: the parent's depth plus 1
37 |         test[idx] = test[pare[idx]] + data[idx];
38 |       } else {
39 |         // the first node is the root, rank 0
40 |         data[idx] = 0;
41 |         test[idx] = data[idx];
42 |       }
43 |       // from now on this node may be chosen as a parent
44 |       tmp.push_back(idx);
45 |     }
46 |   }
47 | 
48 |   double t = omp_get_wtime();
49 |   // add your codes begin
50 |     // Rank every node by pointer jumping over the parent links. data[] is
51 |     // not reset first: each non-root node still has pare != -1 in the first
52 |     // round, so its initial 1 is overwritten there, and the root stays 0.
53 |     // Unused slots keep data == -1 and pare == -1 throughout.
54 | 
55 |     // Snapshots of data/pare so a round only reads the previous round.
56 |     int* cdata = new int[SIZE];
57 |     int* cpare = new int[SIZE];
58 |     // Integer doubling of the stride replaces the floating-point pow(2, i);
59 |     // a node stops changing once its pare entry has become -1.
60 |     for (long long step = 1; step < SIZE; step *= 2) {
61 | #pragma omp parallel for num_threads(thrd_num)
62 |         for (int k = 0; k < SIZE; k++) {
63 |             cdata[k] = data[k];
64 |             cpare[k] = pare[k];
65 |         }
66 | 
67 | #pragma omp parallel for num_threads(thrd_num)
68 |         for (int j = 0; j < SIZE; j++) {
69 |             if (pare[j] != -1) {
70 |                 data[j] = cdata[pare[j]] + (int)step;
71 |                 pare[j] = cpare[pare[j]];
72 |             }
73 |         }
74 |     }
75 |     delete[] cdata;  // release the snapshot buffers instead of leaking them
76 |     delete[] cpare;
77 | 
78 |   // add your codes end
79 |   t = omp_get_wtime() - t;
80 |   printf("time %f %d\n", t, SIZE);
81 | 
82 |   for (int i = 0; i < SIZE; i++) assert(data[i] == test[i]);
83 | }
84 | 
85 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/build/scan_vect_1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/build/scan_vect_1


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/build/scan_vect_2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/build/scan_vect_2


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/lab_scan_vect.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/lab_scan_vect.tgz


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/readme.txt:
--------------------------------------------------------------------------------
 1 | - Student name: 宋家庆
 2 | - ID number: 202000130061
 3 | 
 4 | - Implement your scan algorithm
 5 |   - Version 0: serial program
 6 |   - Version 1: brute-force program
 7 |   - Version 2: recursion program
 8 | - Compile and run your programs multiple times
 9 | - Describe your observations
10 | 在SIZE大小为1000的情况下，v0版本运行时间约为0.00007，
11 | v1版本运行时间约为0.001993，v2版本运行时间约为0.001227
12 | 
13 | 在SIZE大小为100000000的情况下
14 | v0版本运行时间为0.052
15 | v1版本速度非常慢，远低于v2版本和v0版本，在60秒内无法得到结果。
16 | v2版本时间约为0.6-0.7之间，略高于串行版本。
17 | 
18 | - Explain why this happens
19 | 
20 | v1版本需要先做map，再做reduce，每个reduce的时间复杂度为n，计算量为n^2级别，当线程数足够多时，对每个reduce
21 | 新开一个线程，时间为最长的一次reduce所需要的时间，同时reduce过程中也可以并行，理论来说效率应该不会比v0版本低，
22 | 但在真正实现的过程中，由于线程数并没有那么多，线程的创建与回收都会有时间开销，导致brute-force的性能不如线性计算。
23 | 
24 | v2版本的计算量为[(n+n/2)+(n/2+n/4)+...+1]，约有log(n)项，计算复杂度可近似为等比数列求和，计算复杂度小于3n(txt不方便写公式)，
25 | 每一次递归求和可以并行计算来优化，但由于递归中含有大量的函数调用，开销可能会比较大。导致并行后算法的时间略高于串行版本。
26 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/run.sh:
--------------------------------------------------------------------------------
1 | version=0
2 | size=10000000
3 | 
4 | g++ -o scan_vect_$version -fopenmp -DSIZE=$size scan_vect_$version.cpp && timeout 60s time ./scan_vect_$version
5 | 
6 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/scan_vect:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/scan_vect


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/scan_vect_0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/scan_vect_0


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/scan_vect_0.cpp:
--------------------------------------------------------------------------------
 1 | // student name:
 2 | // id number:
 3 | 
 4 | #include <assert.h>
 5 | #include <omp.h>
 6 | #include <stdlib.h>
 7 | 
 8 | #include <iostream>
 9 | #include <vector>
10 | using namespace std;
11 | 
12 | // add your codes begin
13 | // add your codes end
14 | 
15 | int main() {
16 |     vector<int> data(SIZE, 1);
17 |     data[0] = 0;
18 | 
19 |     double t = omp_get_wtime();
20 |     // add your codes begin
21 |     // Serial inclusive scan: fold each element into its right neighbour,
22 |     // so data[i] ends up as data[0] + ... + data[i].
23 |     for (int pos = 1; pos < SIZE; ++pos) {
24 |         data[pos] += data[pos - 1];
25 |     }
26 | 
27 |     // add your codes end
28 |     t = omp_get_wtime() - t;
29 |     printf("time %f %d\n", t, SIZE);
30 | 
31 |     for (int i = 0; i < SIZE; i++) assert(data[i] == i);
32 | }
33 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/scan_vect_1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/scan_vect_1


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/scan_vect_1.cpp:
--------------------------------------------------------------------------------
 1 | // student name:
 2 | // id number:
 3 | 
 4 | #include <assert.h>
 5 | #include <omp.h>
 6 | #include <stdlib.h>
 7 | 
 8 | #include <iostream>
 9 | #include <vector>
10 | using namespace std;
11 | 
12 | // add your codes begin
13 | // #define SIZE 1000
14 | // add your codes end
15 | 
16 | int main() {
17 |     vector<int> data(SIZE, 1);
18 |     data[0] = 0;
19 | 
20 |     double t = omp_get_wtime();
21 |     omp_set_num_threads(100);
22 |     // add your codes begin
23 |     // Brute-force inclusive scan: data[i] = temp[0] + ... + temp[i], summed
24 |     // from an unmodified copy so no thread reads an element another wrote.
25 |     vector<int> temp = data;
26 |     // The directive was misspelled "paraller", which is not an OpenMP
27 |     // directive, so this block was never actually parallelised.
28 | #pragma omp parallel num_threads(30)
29 |     {
30 |         int id = omp_get_thread_num();
31 |         int nthrds = omp_get_num_threads();
32 |         // Cyclic distribution: thread id handles i = id, id + nthrds, ...
33 |         for (int i = id; i < SIZE; i += nthrds) {
34 |             int sum = 0;
35 |             for (int j = 0; j <= i; j++) {
36 |                 sum += temp[j];
37 |             }
38 |             data[i] = sum;
39 |         }
40 |     }
41 | 
42 |     // add your codes end
43 |     t = omp_get_wtime() - t;
44 |     printf("time %f %d\n", t, SIZE);
45 | 
46 |     for (int i = 0; i < SIZE; i++) assert(data[i] == i);
47 | }
48 | 


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/scan_vect_2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/lab_scan_vect/scan_vect_2


--------------------------------------------------------------------------------
/openmp/lab_scan_vect/scan_vect_2.cpp:
--------------------------------------------------------------------------------
 1 | // student name:
 2 | // id number:
 3 | 
 4 | #include <assert.h>
 5 | #include <omp.h>
 6 | #include <stdlib.h>
 7 | 
 8 | #include <iostream>
 9 | #include <vector>
10 | using namespace std;
11 | 
12 | // add your codes start
13 | // #define SIZE 1000
14 | const int thrd_num = 30;
15 | // In-place inclusive prefix sum of data[0..size): sum adjacent pairs, scan
16 | // the half-length vector recursively, then expand the result back into data.
17 | void rscan(vector<int> &data, int size) {
18 |     if (size <= 1) return;  // also stops size == 0 from recursing forever
19 |     const int half = size / 2;
20 |     vector<int> twoSum(half);
21 | #pragma omp parallel for num_threads(thrd_num)
22 |     for (int i = 0; i < half; i++) {
23 |         twoSum[i] = data[i * 2] + data[2 * i + 1];
24 |     }
25 |     rscan(twoSum, half);
26 | #pragma omp parallel for num_threads(thrd_num)
27 |     for (int i = 1; i < size; i += 2) {
28 |         data[i] = twoSum[i / 2];
29 |         if (i + 1 < size) data[i + 1] += twoSum[(i + 1) / 2 - 1];
30 |     }
31 | }
32 | // add your codes end
33 | 
34 | int main() {
35 |     vector<int> data(SIZE, 1);
36 |     data[0] = 0;
37 | 
38 |     double t = omp_get_wtime();
39 |     // add your codes start
40 |     rscan(data, SIZE);
41 |     // add your codes end
42 |     t = omp_get_wtime() - t;
43 |     printf("time %f %d\n", t, SIZE);
44 | 
45 |     for (int i = 0; i < SIZE; i++) assert(data[i] == i);
46 | }


--------------------------------------------------------------------------------
/openmp/midterm_exam/build/segment_softmax:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/midterm_exam/build/segment_softmax


--------------------------------------------------------------------------------
/openmp/midterm_exam/build/series_of_numbers:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/midterm_exam/build/series_of_numbers


--------------------------------------------------------------------------------
/openmp/midterm_exam/build/series_of_numbers copy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/midterm_exam/build/series_of_numbers copy


--------------------------------------------------------------------------------
/openmp/midterm_exam/build/series_of_numbers copy 2.cpp:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <omp.h>
  3 | #include <stdlib.h>
  4 | 
  5 | #include <algorithm>
  6 | #include <iostream>
  7 | #include <vector>
  8 | #define ll long long
  9 | using namespace std;
 10 | 
 11 | // add your codes begin
 12 | #define SIZE 10000000
 13 | const ll mod = 1e8 + 10;
 14 | const ll thrd_num = 20;
 15 | 
 16 | typedef vector<vector<long long>> matrix;
 17 | 
 18 | matrix initMatrix(ll num) {
 19 |     matrix mat = vector(4, vector(4, (ll)0));
 20 |     mat[0][0] = 1;
 21 |     mat[0][1] = 2;
 22 |     mat[0][2] = num;
 23 |     mat[0][3] = 1;
 24 |     mat[1][0] = 1;
 25 |     mat[2][2] = 1;
 26 |     mat[3][3] = 1;
 27 |     return mat;
 28 | }
 29 | 
 30 | matrix initEMatrix() {
 31 |     matrix mat = vector(4, vector(4, (ll)0));
 32 |     mat[0][0] = 1;
 33 |     mat[1][1] = 1;
 34 |     mat[2][2] = 1;
 35 |     mat[3][3] = 1;
 36 |     return mat;
 37 | }
 38 | 
 39 | // Modular matrix product: returns a * b with every entry reduced by mod.
 40 | matrix operator*(const matrix& a, const matrix& b) {
 41 |     const ll rows = a.size(), inner = b.size(), cols = b[0].size();
 42 |     matrix out(rows, vector<ll>(cols));
 43 |     for (ll r = 0; r < rows; r++) {
 44 |         for (ll c = 0; c < cols; c++) {
 45 |             ll acc = 0;
 46 |             for (ll k = 0; k < inner; k++) {
 47 |                 acc = (acc + (a[r][k] * b[k][c]) % mod) % mod;
 48 |             }
 49 |             out[r][c] = acc;
 50 |         }
 51 |     }
 52 |     return out;
 53 | }
 54 | 
 55 | // In-place inclusive scan under matrix product: afterwards data[i] holds
 56 | // data[i] * data[i-1] * ... * data[0] (later factors stay on the left).
 57 | void rscan(vector<matrix>& data, ll size) {
 58 |     if (size <= 1) return;  // also stops size == 0 from recursing forever
 59 |     vector<matrix> twoSum(size / 2);
 60 | #pragma omp parallel for num_threads(thrd_num)
 61 |     for (ll i = 0; i < size / 2; i++) {
 62 |         twoSum[i] = data[2 * i + 1] * data[i * 2];
 63 |     }
 64 |     rscan(twoSum, size / 2);
 65 | #pragma omp parallel for num_threads(thrd_num)
 66 |     for (ll i = 1; i < size; i += 2) {
 67 |         data[i] = twoSum[i / 2];
 68 |         if (i + 1 < size) data[i + 1] = data[i + 1] * twoSum[(i + 1) / 2 - 1];
 69 |     }
 70 | }
 71 | // add your codes end
 72 | 
 73 | int main() {
 74 |     vector<ll> test(SIZE);
 75 |     vector<ll> data(SIZE);
 76 |     test[0] = 1;
 77 |     test[1] = 1;
 78 |     for (ll i = 2; i < SIZE; i++) {
 79 |         test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod;
 80 |         test[i] = (test[i] + (3 * i) % mod + 4) % mod;
 81 |     }
 82 |     double t = omp_get_wtime();
 83 |     // add your codes begin
 84 |     // vector<matrix> mats;
 85 |     // mats.push_back(initMatrix(2));
 86 | 
 87 |     // for (ll i = 3; i < SIZE; i++) {
 88 |     //     matrix c = initMatrix(i);
 89 |     //     mats.push_back(c * mats[i - 3]);
 90 |     // }
 91 | 
 92 |     vector<matrix> mats(SIZE-2);
 93 |     #pragma omp parallel for num_threads(thrd_num)
 94 |     for (ll i = 2; i < SIZE; i++) {
 95 |         mats[i-2] = initMatrix(i);
 96 |     }
 97 | 
 98 |     data[0] = 1;
 99 |     data[1] = 1;
100 |     // reverse(mats.begin(), mats.end());
101 |     rscan(mats, SIZE - 2);
102 |     // reverse(mats.begin(),mats.end());
103 |     #pragma omp parallel for num_threads(thrd_num)
104 |     for (ll i = 2; i < SIZE; i++) {
105 |         // matrix mt = initMatrix(2);
106 |         // for (ll j = 3; j <= i; j++) {
107 |         //     mt = mt * initMatrix(j);
108 |         // }
109 |         // matrix mt = initEMatrix();
110 |         // for (ll j = i; j >= 2; j--) {
111 |         //     mt = mt * initMatrix(j);
112 |         // }
113 | 
114 |         // data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod +
115 |         //            (4 * mt[0][3]) % mod) %
116 |         //           mod;
117 |         data[i] =
118 |             (mats[i - 2][0][0] + mats[i - 2][0][1] +
119 |              (3 * mats[i - 2][0][2]) % mod + (4 * mats[i - 2][0][3]) % mod) %
120 |             mod;
121 |     }
122 |     // reverse(data.begin()+2,data.end());
123 |     //  add your codes end
124 |     t = omp_get_wtime() - t;
125 |     printf("time %f %d\n", t, SIZE);
126 | 
127 |     for (ll i = 0; i < SIZE; i++) {
128 |         assert (data[i] == test[i]);
129 |     }
130 | }
131 | 


--------------------------------------------------------------------------------
/openmp/midterm_exam/build/series_of_numbers copy.cpp:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <omp.h>
  3 | #include <stdlib.h>
  4 | 
  5 | #include <algorithm>
  6 | #include <iostream>
  7 | #include <vector>
  8 | #define ll long long
  9 | using namespace std;
 10 | 
 11 | // add your codes begin
 12 | #define SIZE 10000000
 13 | const ll mod = 1e8 + 10;
 14 | const ll thrd_num = 20;
 15 | 
 16 | typedef vector<vector<long long>> matrix;
 17 | 
 18 | matrix initMatrix(ll num) {
 19 |     matrix mat = vector(4, vector(4, (ll)0));
 20 |     mat[0][0] = 1;
 21 |     mat[0][1] = 2;
 22 |     mat[0][2] = num;
 23 |     mat[0][3] = 1;
 24 |     mat[1][0] = 1;
 25 |     mat[2][2] = 1;
 26 |     mat[3][3] = 1;
 27 |     return mat;
 28 | }
 29 | 
 30 | matrix initEMatrix() {
 31 |     matrix mat = vector(4, vector(4, (ll)0));
 32 |     mat[0][0] = 1;
 33 |     mat[1][1] = 1;
 34 |     mat[2][2] = 1;
 35 |     mat[3][3] = 1;
 36 |     return mat;
 37 | }
 38 | 
 39 | matrix operator*(const matrix& a, const matrix& b) {
 40 |     ll n = a.size();
 41 |     ll m = b[0].size();
 42 |     ll l = b.size();
 43 |     matrix c(n, vector<ll>(m));
 44 |     for (ll i = 0; i < n; i++) {
 45 |         for (ll j = 0; j < m; j++) {
 46 |             for (ll k = 0; k < l; k++) {
 47 |                 c[i][j] += (a[i][k] * b[k][j]) % mod;
 48 |                 c[i][j] %= mod;
 49 |             }
 50 |         }
 51 |     }
 52 |     return c;
 53 | }
 54 | 
 55 | void rscan(vector<matrix>& data, ll size) {
 56 |     if (size == 1) return;
 57 |     vector<matrix> twoSum(size / 2);
 58 | #pragma omp parallel for num_threads(thrd_num)
 59 |     for (ll i = 0; i < twoSum.size(); i++) {
 60 |         twoSum[i] = data[2 * i + 1] * data[i * 2];
 61 |     }
 62 |     rscan(twoSum, size / 2);
 63 | #pragma omp parallel for num_threads(thrd_num)
 64 |     for (ll i = 1; i < size; i += 2) {
 65 |         data[i] = twoSum[i / 2];
 66 |         if (i + 1 < size) {
 67 |             data[i + 1] = data[i + 1]*twoSum[(i + 1) / 2 - 1];
 68 |         }
 69 |     }
 70 | }
 71 | // add your codes end
 72 | 
 73 | int main() {
 74 |     vector<ll> test(SIZE);
 75 |     vector<ll> data(SIZE);
 76 |     test[0] = 1;
 77 |     test[1] = 1;
 78 |     for (ll i = 2; i < SIZE; i++) {
 79 |         test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod;
 80 |         test[i] = (test[i] + (3 * i) % mod + 4) % mod;
 81 |     }
 82 |     double t = omp_get_wtime();
 83 |     // add your codes begin
 84 |     // vector<matrix> mats;
 85 |     // mats.push_back(initMatrix(2));
 86 | 
 87 |     // for (ll i = 3; i < SIZE; i++) {
 88 |     //     matrix c = initMatrix(i);
 89 |     //     mats.push_back(c * mats[i - 3]);
 90 |     // }
 91 | 
 92 |     vector<matrix> mats(SIZE-2);
 93 |     #pragma omp parallel for num_threads(thrd_num)
 94 |     for (ll i = 2; i < SIZE; i++) {
 95 |         mats[i-2] = initMatrix(i);
 96 |     }
 97 | 
 98 |     data[0] = 1;
 99 |     data[1] = 1;
100 |     // reverse(mats.begin(), mats.end());
101 |     rscan(mats, SIZE - 2);
102 |     // reverse(mats.begin(),mats.end());
103 |     #pragma omp parallel for num_threads(thrd_num)
104 |     for (ll i = 2; i < SIZE; i++) {
105 |         // matrix mt = initMatrix(2);
106 |         // for (ll j = 3; j <= i; j++) {
107 |         //     mt = mt * initMatrix(j);
108 |         // }
109 |         // matrix mt = initEMatrix();
110 |         // for (ll j = i; j >= 2; j--) {
111 |         //     mt = mt * initMatrix(j);
112 |         // }
113 | 
114 |         // data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod +
115 |         //            (4 * mt[0][3]) % mod) %
116 |         //           mod;
117 |         data[i] =
118 |             (mats[i - 2][0][0] + mats[i - 2][0][1] +
119 |              (3 * mats[i - 2][0][2]) % mod + (4 * mats[i - 2][0][3]) % mod) %
120 |             mod;
121 |     }
122 |     // reverse(data.begin()+2,data.end());
123 |     //  add your codes end
124 |     t = omp_get_wtime() - t;
125 |     printf("time %f %d\n", t, SIZE);
126 | 
127 |     for (ll i = 0; i < SIZE; i++) {
128 |         assert (data[i] == test[i]);
129 |     }
130 | }
131 | 


--------------------------------------------------------------------------------
/openmp/midterm_exam/build/series_of_numbers_2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/midterm_exam/build/series_of_numbers_2


--------------------------------------------------------------------------------
/openmp/midterm_exam/build/series_of_numbers_2 copy 2.cpp:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <omp.h>
  3 | #include <stdlib.h>
  4 | 
  5 | #include <algorithm>
  6 | #include <iostream>
  7 | #include <vector>
  8 | #define ll long long
  9 | using namespace std;
 10 | 
 11 | // add your codes begin
 12 | #define SIZE 10000000
 13 | const ll mod = 1e8 + 10;
 14 | const ll thrd_num = 100;
 15 | 
 16 | typedef vector<vector<long long>> matrix;
 17 | 
 18 | matrix initMatrix(ll num) {
 19 |     matrix mat = vector(4, vector(4, (ll)0));
 20 |     mat[0][0] = 1;
 21 |     mat[0][1] = 2;
 22 |     mat[0][2] = num;
 23 |     mat[0][3] = 1;
 24 |     mat[1][0] = 1;
 25 |     mat[2][2] = 1;
 26 |     mat[3][3] = 1;
 27 |     return mat;
 28 | }
 29 | 
 30 | matrix initEMatrix() {
 31 |     matrix mat = vector(4, vector(4, (ll)0));
 32 |     mat[0][0] = 1;
 33 |     mat[1][1] = 1;
 34 |     mat[2][2] = 1;
 35 |     mat[3][3] = 1;
 36 |     return mat;
 37 | }
 38 | 
 39 | matrix operator*(const matrix& a, const matrix& b) {
 40 |     ll n = a.size();
 41 |     ll m = b[0].size();
 42 |     ll l = b.size();
 43 |     matrix c(n, vector<ll>(m));
 44 |     for (ll i = 0; i < n; i++) {
 45 |         for (ll j = 0; j < m; j++) {
 46 |             for (ll k = 0; k < l; k++) {
 47 |                 c[i][j] += (a[i][k] * b[k][j]) % mod;
 48 |                 c[i][j] %= mod;
 49 |             }
 50 |         }
 51 |     }
 52 |     return c;
 53 | }
 54 | 
 55 | void rscan(vector<matrix>& data, ll size) {
 56 |     if (size == 1) return;
 57 |     vector<matrix> twoSum(size / 2);
 58 | #pragma omp parallel for num_threads(thrd_num)
 59 |     for (ll i = 0; i < twoSum.size(); i++) {
 60 |         twoSum[i] = data[2 * i + 1] * data[i * 2];
 61 |     }
 62 |     rscan(twoSum, size / 2);
 63 | #pragma omp parallel for num_threads(thrd_num)
 64 |     for (ll i = 1; i < size; i += 2) {
 65 |         data[i] = twoSum[i / 2];
 66 |         if (i + 1 < size) {
 67 |             data[i + 1] = data[i + 1]*twoSum[(i + 1) / 2 - 1];
 68 |         }
 69 |     }
 70 | }
 71 | // add your codes end
 72 | 
 73 | int main() {
 74 |     vector<ll> test(SIZE);
 75 |     vector<ll> data(SIZE);
 76 |     test[0] = 1;
 77 |     test[1] = 1;
 78 |     for (ll i = 2; i < SIZE; i++) {
 79 |         test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod;
 80 |         test[i] = (test[i] + (3 * i) % mod + 4) % mod;
 81 |     }
 82 |     double t = omp_get_wtime();
 83 |     // add your codes begin
 84 |     // vector<matrix> mats;
 85 |     // mats.push_back(initMatrix(2));
 86 | 
 87 |     // for (ll i = 3; i < SIZE; i++) {
 88 |     //     matrix c = initMatrix(i);
 89 |     //     mats.push_back(c * mats[i - 3]);
 90 |     // }
 91 | 
 92 |     vector<matrix> mats;
 93 |     for (ll i = 2; i < SIZE; i++) {
 94 |         mats.push_back(initMatrix(i));
 95 |     }
 96 | 
 97 |     data[0] = 1;
 98 |     data[1] = 1;
 99 |     // reverse(mats.begin(), mats.end());
100 |     rscan(mats, SIZE - 2);
101 |     // reverse(mats.begin(),mats.end());
102 |     for (ll i = 2; i < SIZE; i++) {
103 |         // matrix mt = initMatrix(2);
104 |         // for (ll j = 3; j <= i; j++) {
105 |         //     mt = mt * initMatrix(j);
106 |         // }
107 |         // matrix mt = initEMatrix();
108 |         // for (ll j = i; j >= 2; j--) {
109 |         //     mt = mt * initMatrix(j);
110 |         // }
111 | 
112 |         // data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod +
113 |         //            (4 * mt[0][3]) % mod) %
114 |         //           mod;
115 |         data[i] =
116 |             (mats[i - 2][0][0] + mats[i - 2][0][1] +
117 |              (3 * mats[i - 2][0][2]) % mod + (4 * mats[i - 2][0][3]) % mod) %
118 |             mod;
119 |     }
120 |     // reverse(data.begin()+2,data.end());
121 |     //  add your codes end
122 |     t = omp_get_wtime() - t;
123 |     printf("time %f %d\n", t, SIZE);
124 | 
125 |     for (ll i = 0; i < SIZE; i++) {
126 |         if (data[i] != test[i]) {
127 |             cout << i << " ";
128 |         };
129 |     }
130 | }
131 | 


--------------------------------------------------------------------------------
/openmp/midterm_exam/build/series_of_numbers_2 copy.cpp:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <omp.h>
  3 | #include <stdlib.h>
  4 | 
  5 | #include <algorithm>
  6 | #include <iostream>
  7 | #include <vector>
  8 | #define ll long long
  9 | using namespace std;
 10 | 
 11 | // add your codes begin
 12 | #define SIZE 1000
 13 | const int mod = 1e8 + 10;
 14 | const int thrd_num = 20;
 15 | 
 16 | typedef vector<vector<int>> matrix;
 17 | 
 18 | matrix initMatrix(int num) {
 19 |     matrix mat = vector(4, vector(4, 0));
 20 |     mat[0][0] = 1;
 21 |     mat[0][1] = 2;
 22 |     mat[0][2] = num;
 23 |     mat[0][3] = 1;
 24 |     mat[1][0] = 1;
 25 |     mat[2][2] = 1;
 26 |     mat[3][3] = 1;
 27 |     return mat;
 28 | }
 29 | 
 30 | matrix initEMatrix() {
 31 |     matrix mat = vector(4, vector(4, 0));
 32 |     mat[0][0] = 1;
 33 |     mat[1][1] = 1;
 34 |     mat[2][2] = 1;
 35 |     mat[3][3] = 1;
 36 |     return mat;
 37 | }
 38 | 
 39 | matrix operator*(const matrix& a, const matrix& b) {
 40 |     int n = a.size();
 41 |     int m = b[0].size();
 42 |     int l = b.size();
 43 |     matrix c(n, vector<int>(m));
 44 |     for (int i = 0; i < n; i++) {
 45 |         for (int j = 0; j < m; j++) {
 46 |             for (int k = 0; k < l; k++) {
 47 |                 c[i][j] += (a[i][k] * b[k][j]) % mod;
 48 |                 c[i][j] %= mod;
 49 |             }
 50 |         }
 51 |     }
 52 |     return c;
 53 | }
 54 | 
 55 | void rscan(vector<matrix>& data, int size) {
 56 |     if (size == 1) return;
 57 |     vector<matrix> twoSum(size / 2);
 58 | #pragma omp parallel for num_threads(thrd_num)
 59 |     for (int i = 0; i < twoSum.size(); i++) {
 60 |         twoSum[i] = data[i * 2] * data[2 * i + 1];
 61 |     }
 62 |     rscan(twoSum, size / 2);
 63 | #pragma omp parallel for num_threads(thrd_num)
 64 |     for (int i = 1; i < size; i += 2) {
 65 |         data[i] = twoSum[i / 2];
 66 |         if (i + 1 < size) {
 67 |             data[i + 1] = twoSum[(i + 1) / 2 - 1] * data[i + 1];
 68 |         }
 69 |     }
 70 | }
 71 | // add your codes end
 72 | 
 73 | int main() {
 74 |     vector<int> test(SIZE);
 75 |     vector<int> data(SIZE);
 76 |     test[0] = 1;
 77 |     test[1] = 1;
 78 |     for (int i = 2; i < SIZE; i++) {
 79 |         test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod;
 80 |         test[i] = (test[i] + (3 * i) % mod + 4) % mod;
 81 |     }
 82 |     double t = omp_get_wtime();
 83 |     // add your codes begin
 84 |     vector<matrix> mats;
 85 |     for (int i = 2; i < SIZE; i++) {
 86 |         mats.push_back(initMatrix(i));
 87 |     }
 88 |     data[0] = 1;
 89 |     data[1] = 1;
 90 |     // reverse(mats.begin(),mats.end());
 91 |     // rscan(mats, SIZE - 2);
 92 |     // reverse(mats.begin(),mats.end());
 93 |     for (int i = 2; i < SIZE; i++) {
 94 |         // matrix mt = initMatrix(2);
 95 |         // for (int j = 3; j <= i; j++) {
 96 |         //     mt = mt * initMatrix(j);
 97 |         // }
 98 |         matrix mt = initEMatrix();
 99 |         for (int j = i; j >= 2; j--) {
100 |             mt = mt * initMatrix(j);
101 |         }
102 | 
103 |         data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod +
104 |                    (4 * mt[0][3]) % mod) %
105 |                   mod;
106 |     }
107 |     // reverse(data.begin()+2,data.end());
108 |     //  add your codes end
109 |     t = omp_get_wtime() - t;
110 |     printf("time %f %d\n", t, SIZE);
111 | 
112 |     for (int i = 0; i < SIZE; i++) {
113 |         if(data[i] != test[i]){
114 |             cout<<i<<" ";
115 |         };
116 |     }
117 | }
118 | 


--------------------------------------------------------------------------------
/openmp/midterm_exam/build/series_of_numbers_2.cpp:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <omp.h>
  3 | #include <stdlib.h>
  4 | 
  5 | #include <algorithm>
  6 | #include <iostream>
  7 | #include <vector>
  8 | #define ll long long
  9 | using namespace std;
 10 | 
 11 | // add your codes begin
 12 | #define SIZE 1000
 13 | const int mod = 1e8 + 10;
 14 | const int thrd_num = 20;
 15 | 
 16 | typedef vector<vector<int>> matrix;
 17 | 
 18 | matrix initMatrix(int num) {
 19 |     matrix mat = vector(4, vector(4, 0));
 20 |     mat[0][0] = 1;
 21 |     mat[0][1] = 2;
 22 |     mat[0][2] = num;
 23 |     mat[0][3] = 1;
 24 |     mat[1][0] = 1;
 25 |     mat[2][2] = 1;
 26 |     mat[3][3] = 1;
 27 |     return mat;
 28 | }
 29 | 
 30 | matrix initEMatrix() {
 31 |     matrix mat = vector(4, vector(4, 0));
 32 |     mat[0][0] = 1;
 33 |     mat[1][1] = 1;
 34 |     mat[2][2] = 1;
 35 |     mat[3][3] = 1;
 36 |     return mat;
 37 | }
 38 | 
 39 | matrix operator*(const matrix& a, const matrix& b) {
 40 |     int n = a.size();
 41 |     int m = b[0].size();
 42 |     int l = b.size();
 43 |     matrix c(n, vector<int>(m));
 44 |     for (int i = 0; i < n; i++) {
 45 |         for (int j = 0; j < m; j++) {
 46 |             for (int k = 0; k < l; k++) {
 47 |                 c[i][j] += (a[i][k] * b[k][j]) % mod;
 48 |                 c[i][j] %= mod;
 49 |             }
 50 |         }
 51 |     }
 52 |     return c;
 53 | }
 54 | 
 55 | void rscan(vector<matrix>& data, int size) {
 56 |     if (size == 1) return;
 57 |     vector<matrix> twoSum(size / 2);
 58 | #pragma omp parallel for num_threads(thrd_num)
 59 |     for (int i = 0; i < twoSum.size(); i++) {
 60 |         twoSum[i] = data[i * 2] * data[2 * i + 1];
 61 |     }
 62 |     rscan(twoSum, size / 2);
 63 | #pragma omp parallel for num_threads(thrd_num)
 64 |     for (int i = 1; i < size; i += 2) {
 65 |         data[i] = twoSum[i / 2];
 66 |         if (i + 1 < size) {
 67 |             data[i + 1] = twoSum[(i + 1) / 2 - 1] * data[i + 1];
 68 |         }
 69 |     }
 70 | }
 71 | // add your codes end
 72 | 
 73 | int main() {
 74 |     vector<int> test(SIZE);
 75 |     vector<int> data(SIZE);
 76 |     test[0] = 1;
 77 |     test[1] = 1;
 78 |     for (int i = 2; i < SIZE; i++) {
 79 |         test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod;
 80 |         test[i] = (test[i] + (3 * i) % mod + 4) % mod;
 81 |     }
 82 |     double t = omp_get_wtime();
 83 |     // add your codes begin
 84 |     vector<matrix> mats;
 85 |     for (int i = 2; i < SIZE; i++) {
 86 |         mats.push_back(initMatrix(i));
 87 |     }
 88 |     data[0] = 1;
 89 |     data[1] = 1;
 90 |     // reverse(mats.begin(),mats.end());
 91 |     // rscan(mats, SIZE - 2);
 92 |     // reverse(mats.begin(),mats.end());
 93 |     for (int i = 2; i < SIZE; i++) {
 94 |         // matrix mt = initMatrix(2);
 95 |         // for (int j = 3; j <= i; j++) {
 96 |         //     mt = mt * initMatrix(j);
 97 |         // }
 98 |         matrix mt = initEMatrix();
 99 |         for (int j = i; j >= 2; j--) {
100 |             mt = mt * initMatrix(j);
101 |         }
102 | 
103 |         data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod +
104 |                    (4 * mt[0][3]) % mod) %
105 |                   mod;
106 |     }
107 |     // reverse(data.begin()+2,data.end());
108 |     //  add your codes end
109 |     t = omp_get_wtime() - t;
110 |     printf("time %f %d\n", t, SIZE);
111 | 
112 |     for (int i = 0; i < SIZE; i++) {
113 |         if(data[i] != test[i]){
114 |             cout<<i<<" ";
115 |         };
116 |     }
117 | }
118 | 


--------------------------------------------------------------------------------
/openmp/midterm_exam/segment_softmax:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/midterm_exam/segment_softmax


--------------------------------------------------------------------------------
/openmp/midterm_exam/series.sh:
--------------------------------------------------------------------------------
# Problem size handed to the program through the SIZE macro.
size=1000

# Build with OpenMP and, on success, run under `time` with a 60-second limit.
# -std=c++17 is set explicitly because the source relies on class template
# argument deduction (`vector(4, vector(4, ...))`), which older default
# dialects reject.
g++ -std=c++17 -o series_of_numbers -fopenmp -DSIZE="$size" series_of_numbers.cpp && timeout 60s time ./series_of_numbers
4 | 
5 | 


--------------------------------------------------------------------------------
/openmp/midterm_exam/series_of_numbers:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koi2000/ParallelComputingCourse/73809b61b00cb481d248cfb128c6a869ff0237c7/openmp/midterm_exam/series_of_numbers


--------------------------------------------------------------------------------
/openmp/midterm_exam/series_of_numbers.cpp:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <omp.h>
  3 | #include <stdlib.h>
  4 | 
  5 | #include <algorithm>
  6 | #include <iostream>
  7 | #include <vector>
  8 | #define ll long long
  9 | using namespace std;
 10 | 
 11 | // add your codes begin
 12 | #define SIZE 10000000
 13 | const ll mod = 1e8 + 10;
 14 | const ll thrd_num = 60;
 15 | 
 16 | typedef vector<vector<long long>> matrix;
 17 | 
 18 | matrix initMatrix(ll num) {
 19 |     matrix mat = vector(4, vector(4, (ll)0));
 20 |     mat[0][0] = 1;
 21 |     mat[0][1] = 2;
 22 |     mat[0][2] = num;
 23 |     mat[0][3] = 1;
 24 |     mat[1][0] = 1;
 25 |     mat[2][2] = 1;
 26 |     mat[3][3] = 1;
 27 |     return mat;
 28 | }
 29 | 
 30 | matrix initEMatrix() {
 31 |     matrix mat = vector(4, vector(4, (ll)0));
 32 |     mat[0][0] = 1;
 33 |     mat[1][1] = 1;
 34 |     mat[2][2] = 1;
 35 |     mat[3][3] = 1;
 36 |     return mat;
 37 | }
 38 | 
 39 | matrix operator*(const matrix& a, const matrix& b) {
 40 |     ll n = a.size();
 41 |     ll m = b[0].size();
 42 |     ll l = b.size();
 43 |     matrix c(n, vector<ll>(m));
 44 |     for (ll i = 0; i < n; i++) {
 45 |         for (ll j = 0; j < m; j++) {
 46 |             for (ll k = 0; k < l; k++) {
 47 |                 c[i][j] += (a[i][k] * b[k][j]) % mod;
 48 |                 c[i][j] %= mod;
 49 |             }
 50 |         }
 51 |     }
 52 |     return c;
 53 | }
 54 | 
 55 | void rscan(vector<matrix>& data, ll size) {
 56 |     if (size == 1) return;
 57 |     vector<matrix> twoSum(size / 2);
 58 | #pragma omp parallel for num_threads(thrd_num)
 59 |     for (ll i = 0; i < twoSum.size(); i++) {
 60 |         twoSum[i] = data[2 * i + 1] * data[i * 2];
 61 |     }
 62 |     rscan(twoSum, size / 2);
 63 | #pragma omp parallel for num_threads(thrd_num)
 64 |     for (ll i = 1; i < size; i += 2) {
 65 |         data[i] = twoSum[i / 2];
 66 |         if (i + 1 < size) {
 67 |             data[i + 1] = data[i + 1] * twoSum[(i + 1) / 2 - 1];
 68 |         }
 69 |     }
 70 | }
 71 | // add your codes end
 72 | 
 73 | int main() {
 74 |     vector<ll> test(SIZE);
 75 |     vector<ll> data(SIZE);
 76 |     test[0] = 1;
 77 |     test[1] = 1;
 78 |     for (ll i = 2; i < SIZE; i++) {
 79 |         test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod;
 80 |         test[i] = (test[i] + (3 * i) % mod + 4) % mod;
 81 |     }
 82 |     double t = omp_get_wtime();
 83 |     // add your codes begin
 84 |     vector<matrix> mats(SIZE - 2);
 85 |     // 初始化矩阵数组
 86 |     // 这里需注意要逆序存储
 87 | #pragma omp parallel for num_threads(thrd_num)
 88 |     for (ll i = 2; i < SIZE; i++) {
 89 |         mats[i - 2] = initMatrix(i);
 90 |     }
 91 |     data[0] = 1;
 92 |     data[1] = 1;
 93 |     // 并行计算前缀积
 94 |     rscan(mats, SIZE - 2);
 95 |     // 取出最终的结果
 96 | #pragma omp parallel for num_threads(thrd_num)
 97 |     for (ll i = 2; i < SIZE; i++) {
 98 |         data[i] =
 99 |             (mats[i - 2][0][0] + mats[i - 2][0][1] +
100 |              (3 * mats[i - 2][0][2]) % mod + (4 * mats[i - 2][0][3]) % mod) %
101 |             mod;
102 |     }
103 |     //  add your codes end
104 |     t = omp_get_wtime() - t;
105 |     printf("time %f %d\n", t, SIZE);
106 | 
107 |     for (ll i = 0; i < SIZE; i++) {
108 |         assert(data[i] == test[i]);
109 |     }
110 | }
111 | 


--------------------------------------------------------------------------------
/openmp/midterm_exam/softmax.sh:
--------------------------------------------------------------------------------
# Problem size handed to the program through the SIZE macro.
size=10000000

# Compile with OpenMP support; on success run the binary under `time`,
# limited to 60 seconds.
g++ -fopenmp -DSIZE=$size -o segment_softmax segment_softmax.cpp \
    && timeout 60s time ./segment_softmax
4 | 
5 | 


--------------------------------------------------------------------------------
/openmp/midterm_exam/submit/series_of_numbers.cpp:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <omp.h>
  3 | #include <stdlib.h>
  4 | 
  5 | #include <algorithm>
  6 | #include <iostream>
  7 | #include <vector>
  8 | #define ll long long
  9 | using namespace std;
 10 | 
 11 | // add your codes begin
 12 | #define SIZE 10000000
 13 | const ll mod = 1e8 + 10;
 14 | const ll thrd_num = 20;
 15 | 
 16 | typedef vector<vector<long long>> matrix;
 17 | 
 18 | matrix initMatrix(ll num) {
 19 |     matrix mat = vector(4, vector(4, (ll)0));
 20 |     mat[0][0] = 1;
 21 |     mat[0][1] = 2;
 22 |     mat[0][2] = num;
 23 |     mat[0][3] = 1;
 24 |     mat[1][0] = 1;
 25 |     mat[2][2] = 1;
 26 |     mat[3][3] = 1;
 27 |     return mat;
 28 | }
 29 | 
 30 | matrix initEMatrix() {
 31 |     matrix mat = vector(4, vector(4, (ll)0));
 32 |     mat[0][0] = 1;
 33 |     mat[1][1] = 1;
 34 |     mat[2][2] = 1;
 35 |     mat[3][3] = 1;
 36 |     return mat;
 37 | }
 38 | 
 39 | matrix operator*(const matrix& a, const matrix& b) {
 40 |     ll n = a.size();
 41 |     ll m = b[0].size();
 42 |     ll l = b.size();
 43 |     matrix c(n, vector<ll>(m));
 44 |     for (ll i = 0; i < n; i++) {
 45 |         for (ll j = 0; j < m; j++) {
 46 |             for (ll k = 0; k < l; k++) {
 47 |                 c[i][j] += (a[i][k] * b[k][j]) % mod;
 48 |                 c[i][j] %= mod;
 49 |             }
 50 |         }
 51 |     }
 52 |     return c;
 53 | }
 54 | 
 55 | void rscan(vector<matrix>& data, ll size) {
 56 |     if (size == 1) return;
 57 |     vector<matrix> twoSum(size / 2);
 58 | #pragma omp parallel for num_threads(thrd_num)
 59 |     for (ll i = 0; i < twoSum.size(); i++) {
 60 |         twoSum[i] = data[2 * i + 1] * data[i * 2];
 61 |     }
 62 |     rscan(twoSum, size / 2);
 63 | #pragma omp parallel for num_threads(thrd_num)
 64 |     for (ll i = 1; i < size; i += 2) {
 65 |         data[i] = twoSum[i / 2];
 66 |         if (i + 1 < size) {
 67 |             data[i + 1] = data[i + 1]*twoSum[(i + 1) / 2 - 1];
 68 |         }
 69 |     }
 70 | }
 71 | // add your codes end
 72 | 
 73 | int main() {
 74 |     vector<ll> test(SIZE);
 75 |     vector<ll> data(SIZE);
 76 |     test[0] = 1;
 77 |     test[1] = 1;
 78 |     for (ll i = 2; i < SIZE; i++) {
 79 |         test[i] = (test[i - 1] + (2 * test[i - 2]) % mod) % mod;
 80 |         test[i] = (test[i] + (3 * i) % mod + 4) % mod;
 81 |     }
 82 |     double t = omp_get_wtime();
 83 |     // add your codes begin
 84 |     // vector<matrix> mats;
 85 |     // mats.push_back(initMatrix(2));
 86 | 
 87 |     // for (ll i = 3; i < SIZE; i++) {
 88 |     //     matrix c = initMatrix(i);
 89 |     //     mats.push_back(c * mats[i - 3]);
 90 |     // }
 91 | 
 92 |     vector<matrix> mats(SIZE-2);
 93 |     #pragma omp parallel for num_threads(thrd_num)
 94 |     for (ll i = 2; i < SIZE; i++) {
 95 |         mats[i-2] = initMatrix(i);
 96 |     }
 97 | 
 98 |     data[0] = 1;
 99 |     data[1] = 1;
100 |     // reverse(mats.begin(), mats.end());
101 |     rscan(mats, SIZE - 2);
102 |     // reverse(mats.begin(),mats.end());
103 |     #pragma omp parallel for num_threads(thrd_num)
104 |     for (ll i = 2; i < SIZE; i++) {
105 |         // matrix mt = initMatrix(2);
106 |         // for (ll j = 3; j <= i; j++) {
107 |         //     mt = mt * initMatrix(j);
108 |         // }
109 |         // matrix mt = initEMatrix();
110 |         // for (ll j = i; j >= 2; j--) {
111 |         //     mt = mt * initMatrix(j);
112 |         // }
113 | 
114 |         // data[i] = (mt[0][0] + mt[0][1] + (3 * mt[0][2]) % mod +
115 |         //            (4 * mt[0][3]) % mod) %
116 |         //           mod;
117 |         data[i] =
118 |             (mats[i - 2][0][0] + mats[i - 2][0][1] +
119 |              (3 * mats[i - 2][0][2]) % mod + (4 * mats[i - 2][0][3]) % mod) %
120 |             mod;
121 |     }
122 |     // reverse(data.begin()+2,data.end());
123 |     //  add your codes end
124 |     t = omp_get_wtime() - t;
125 |     printf("time %f %d\n", t, SIZE);
126 | 
127 |     for (ll i = 0; i < SIZE; i++) {
128 |         assert (data[i] == test[i]);
129 |     }
130 | }
131 | 


--------------------------------------------------------------------------------
/openmp/notes/API.md:
--------------------------------------------------------------------------------
 1 | ## 常用API
 2 | 
 3 | ```c++
 4 | #pragma omp single
 5 | #pragma omp task
 6 | #pragma omp taskwait
 7 | 
 8 | #pragma omp parallel for default(shared) private(c,eps)
 9 | 
10 | ```
11 | `#pragma omp task`
12 | 定义一个显式的任务，可能会被遇到的线程马上执行，也可能被延迟给线程组内其他线程来执行。任务的执行，依赖于OpenMP的任务调度。
13 | 
14 | `#pragma omp single`
15 | single指令指定区域的代码只能由一组线程中的一个执行。在处理非线程安全的代码段（如I/O）时可能该指令非常有用
16 | 
17 | `#pragma omp sections` / `#pragma omp section`
18 | sections将一个任务分成独立的几个section，每个由不同的线程并行处理。


--------------------------------------------------------------------------------
/openmp/notes/wordsharing.md:
--------------------------------------------------------------------------------
 1 | ## WorkSharing
 2 | 可以通过omp的API来实现多线程执行循环
 3 | ### for
 4 | ```c++
 5 | #pragma omp parallel 
 6 | #pragma omp for 
 7 | for(i=0;i<N;i++) { 
 8 |     a[i] = a[i] + b[i];
 9 | }
10 | ```
11 | ### schedule
12 | schedule子句有多种调度策略
13 | 
14 | `schedule(static [,chunk])`，每次结果都固定，从0开始为每个线程分配chunk次迭代
15 | 
16 | `schedule(dynamic [,chunk])`，每个线程从队列中抓取chunk次迭代，直到所有迭代都处理完毕
17 | 
18 | `schedule(guided [,chunk])`类似于动态调度，但每次分配的循环次数不同，开始比较大，以后逐渐减小。chunk表示每次分配的迭代次数的最小值，由于每次分配的迭代次数会逐渐减少，减少到chunk时，将不再减少。如果不指定chunk的大小，那么默认chunk为1，即一直减少到1。
19 | 
20 | `schedule(runtime)`根据环境变量OMP_SCHEDULE确定上述调度策略中的某一种
21 | 
22 | `schedule(auto)`调度由运行时选择(不一定是上面的任何一个)


--------------------------------------------------------------------------------
/openmp/notes/同步结构.md:
--------------------------------------------------------------------------------
  1 | ## OMP同步操作
  2 | 
  3 | omp同步结构指令包括master,critical,barrier,atomic,flush,ordered等
  4 | ### Master指令
  5 | master指令指定的区域只由主线程执行，团队中其他线程都跳过该区域代码
  6 | ``` c++
  7 | #pragma omp parallel 
  8 | 	{
  9 | 	#pragma omp master
 10 | 		{
 11 | 			printf("in master thread %d\n", omp_get_thread_num());
 12 | 		}
 13 | 
 14 | 		printf("out master thread %d\n", omp_get_thread_num());
 15 | 	}
 16 | ```
 17 | 这段代码中只有主线程会执行相关操作
 18 | 
 19 | ### Critical指令
 20 | critical指令指定的代码区域，一次只能由一个线程执行，如果一个线程正在critical区域内执行，其他线程试图执行时会被阻塞
 21 | ```c++
 22 | #pragma omp parallel sections
 23 | 	{
 24 | 	#pragma omp section
 25 | 		{
 26 | 			#pragma omp critical (critical1)
 27 | 			{
 28 | 				for (int i=0; i < 5; i++)
 29 | 				{
 30 | 					printf("section1 thread %d excute i = %d\n", omp_get_thread_num(), i);
 31 | 					Sleep(200);
 32 | 				}
 33 | 			}
 34 | 
 35 | 		}
 36 | 
 37 | 	#pragma omp section
 38 | 		{
 39 | 			#pragma omp critical (critical2)
 40 | 			{
 41 | 				for (int j=0; j < 5; j++)
 42 | 				{	
 43 | 					printf("section2 thread %d excute j = %d\n", omp_get_thread_num(), j);
 44 | 					Sleep(200);
 45 | 				} 
 46 | 			}
 47 | 		}
 48 | 
 49 | 	}
 50 | ```
 51 | 
 52 | ### barrier指令
 53 | barrier指令同步团队中的所有线程，组内任何线程到达barrier指令时将在该点等待，直到所有其他线程都到达该barrier处为止。
 54 | ```c++
 55 | #pragma omp parallel
 56 | {
 57 |     printf("thread %d excute first print\n", omp_get_thread_num());
 58 |     #pragma omp barrier
 59 |     printf("thread %d excute second print\n", omp_get_thread_num());
 60 | }
 61 | 
 62 | ```
 63 | 
 64 | 
 65 | ### atomic指令
 66 | atomic指令指定必须以原子方式更新某变量的内存，而不是让多个线程都尝试对其进行写入
 67 | ```c++
 68 | int x=0;
 69 | #pragma omp parallel num_threads(6)
 70 | {
 71 |     for(int i=0; i<100000; ++i)
 72 |     #pragma omp atomic
 73 |         x++;
 74 | }
 75 | 
 76 | printf("%d", x); 
 77 | ```
 78 | 
 79 | 
 80 | ### flush指令
 81 | flush指令标识一个同步点，在该点上list中的变量都要被写回内存，而不是暂存在寄存器中，这样保证多线程数据的一致性。
 82 | 
 83 | 由于线程将共享变量更新后，其值可能暂存在寄存器中，并没有写到变量所在内存中，这样会导致其他线程不知道该更新而使用共享变量的旧值进行运算，可能会得到错误的结果。
 84 | 
 85 | 通过使用flush指令，要求相应的变量值刷新到内存中，从而保证线程读取到的共享变量的最新值。
 86 | 以下指令隐含flush操作：barrier、parallel 、critical 、ordered、for 、sections、single。
 87 | 
 88 | ### ordered指令
 89 | ordered指令指定区域的循环迭代将按串行顺序执行，与单个处理器处理结果顺序一致
 90 | ```c++
 91 | #pragma omp parallel
 92 | 	{
 93 | 	#pragma omp for ordered
 94 | 		for (int i = 0; i < 10; ++i)
 95 | 		{
 96 | 		#pragma omp ordered
 97 | 			{
 98 | 				printf("thread %d excute i = %d\n", omp_get_thread_num(), i);
 99 | 			}
100 | 		}
101 | 	} 
102 | 
103 | ```
104 | 
105 | ### nowait
106 | OpenMP中很多地方都有隐式的barrier，例如在parallel结束后，for子句后等等
107 | 在原有的隐式同步的指导命令后加入nowait子句可以取消隐式同步，从而加快程序执行速度。


--------------------------------------------------------------------------------