├── README.md
├── hw1
│   ├── Homework-1.pdf
│   ├── report.pdf
│   ├── src
│   │   ├── add
│   │   │   ├── avx_add
│   │   │   ├── avx_add.c
│   │   │   ├── serial_add
│   │   │   └── serial_add.c
│   │   └── multi-thread
│   │       ├── parallel
│   │       └── parallel.c
│   └── 参考文献
│       ├── 64-ia-32-architectures-software-developer-vol-3a-part-1-manual.pdf
│       ├── AVX-AVX2-Example-Code-master.zip
│       └── Intro_to_Intel_AVX.pdf
├── hw2
│   ├── 18340208-张洪宾-并行分布式计算作业2.pdf
│   ├── Homework-2.pdf
│   └── source code.zip
├── hw3
│   ├── Homework-3.pdf
│   ├── report.pdf
│   └── src
│       ├── Data_Race
│       │   ├── calculate_e
│       │   │   ├── calculate_e
│       │   │   ├── calculate_e.c
│       │   │   ├── complier1.sh
│       │   │   └── tsan1
│       │   └── lock
│       │       ├── complier2.sh
│       │       ├── lock
│       │       ├── lock.c
│       │       └── tsan2
│       ├── Non-reentrant
│       │   ├── Control_experiment
│       │   │   ├── Control_experiment.bc
│       │   │   ├── Control_experiment.c
│       │   │   └── Control_experiment.ll
│       │   ├── I:O
│       │   │   ├── IO.bc
│       │   │   ├── IO.c
│       │   │   └── IO.ll
│       │   ├── global
│       │   │   ├── global.bc
│       │   │   ├── global.c
│       │   │   └── global.ll
│       │   ├── malloc
│       │   │   ├── malloc.bc
│       │   │   ├── malloc.c
│       │   │   └── malloc.ll
│       │   └── static
│       │       ├── static.bc
│       │       ├── static.c
│       │       └── static.ll
│       └── analyse
│           ├── Control_experiment.ll
│           ├── IO.ll
│           ├── analyse
│           ├── analyse.cpp
│           ├── analyse.h
│           ├── global.ll
│           ├── malloc.ll
│           └── static.ll
├── hw4
│   ├── Homework-4.pdf
│   ├── performance analysis
│   │   ├── parallel
│   │   ├── parallel.c
│   │   ├── serial
│   │   └── serial.c
│   ├── readme.md
│   ├── report.pdf
│   └── src
│       ├── parallel
│       ├── parallel.c
│       ├── serial
│       └── serial.c
├── hw5
│   ├── Homework-5.pdf
│   ├── Q1
│   │   └── 1.c
│   ├── Q2
│   │   ├── 2.1
│   │   ├── 2.1.cpp
│   │   ├── 2.2
│   │   └── 2.2.cpp
│   ├── Q3
│   │   └── measure.c
│   └── report.pdf
└── hw6
    ├── Homework-6
    │   ├── Homework-6.pdf
    │   ├── error-test.cu
    │   ├── error_check_1.h
    │   ├── error_checks.h
    │   ├── jacobi.cu
    │   └── jacobi.h
    ├── report.pdf
    └── src
        ├── 2
        │   ├── jacobi.cu
        │   └── jacobi.h
        ├── 1.0
        │   ├── error-test.cu
        │   ├── error_check_1.h
        │   └── error_checks.h
        ├── 1.1
        │   ├── error-test.cu
        │   ├── error_check_1.h
        │   └── error_checks.h
        ├── 1.2
        │   ├── error-test.cu
        │   ├── error_check_1.h
        │   └── error_checks.h
        └── 1.3
            ├── error-test.cu
            ├── error_check_1.h
            └── error_checks.h
/README.md:
--------------------------------------------------------------------------------
1 | # parallel-and-distributed-computing-homework
2 | Homework for the 2020 Parallel and Distributed Computing course at Sun Yat-sen University (中山大学)
3 |
--------------------------------------------------------------------------------
/hw1/Homework-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/Homework-1.pdf
--------------------------------------------------------------------------------
/hw1/report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/report.pdf
--------------------------------------------------------------------------------
/hw1/src/add/avx_add:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/src/add/avx_add
--------------------------------------------------------------------------------
/hw1/src/add/avx_add.c:
--------------------------------------------------------------------------------
1 | /**
2 |  * Author: TripleZ
3 |  * Date: 2018-08-17
4 |  */
5 |
6 | #include <immintrin.h>   /* _mm256_* intrinsics */
7 | #include <stdio.h>       /* printf */
8 | #include <sys/time.h>    /* gettimeofday */
9 | #define N 1000000
10 | __m256i vec1[N/8 + 1];
11 | __m256i vec2[N/8 + 1];
12 | __m256i res[N/8 + 1];
13 |
14 | int main(int argc, char const *argv[]) {
15 |     for(int i = 0;i < N/8;i++){
16 |         vec1[i] = _mm256_set1_epi32(0);
17 |         vec2[i] = _mm256_set1_epi32(0);
18 |     }
19 |     struct timeval begin,end;
20 |     gettimeofday(&begin,NULL);
21 |     for(int i = 0;i < N/8;i++){
22 |         res[i] = _mm256_add_epi32(vec1[i],vec2[i]);
23 |     }
24 |     gettimeofday(&end,NULL);
25 |     printf("Avx_time is:%ldμs\n",end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec);
26 |     return 0;
27 | }
28 |
--------------------------------------------------------------------------------
/hw1/src/add/serial_add:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/src/add/serial_add
--------------------------------------------------------------------------------
/hw1/src/add/serial_add.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>      /* printf */
2 | #include <sys/time.h>   /* gettimeofday */
3 | #define N 1000000
4 |
5 | int a[N],b[N],c[N];
6 | int main(int argc, char const *argv[]) {
7 |     for(int i = 0;i < N;i++){
8 |         a[i] = 0;
9 |         b[i] = 0;
10 |     }
11 |     struct timeval begin,end;
12 |     gettimeofday(&begin,NULL);
13 |     for(int i = 0;i < N;i++){
14 |         c[i] = a[i] + b[i];
15 |     }
16 |     gettimeofday(&end,NULL);
17 |     printf("Serial_time is:%ldμs\n",end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec);
18 |     return 0;
19 | }
--------------------------------------------------------------------------------
/hw1/src/multi-thread/parallel:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/src/multi-thread/parallel
--------------------------------------------------------------------------------
/hw1/src/multi-thread/parallel.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>      /* printf, scanf */
2 | #include <omp.h>        /* omp_set_num_threads */
3 | #include <sys/time.h>   /* gettimeofday */
4 |
5 |
6 | #define N 1000000
7 |
8 | int a[N],b[N],c[N];
9 |
10 | int main(int argc,char* argv[]) {
11 |     printf("Input the number of the threads:");
12 |     int n;
13 |     scanf("%d",&n);
14 |     omp_set_num_threads(n);
15 |     struct timeval begin,end;
16 |     gettimeofday(&begin,NULL);
17 |     #pragma omp parallel for
18 |     for(int i = 0;i < N;i++){
19 |         a[i] = b[i] + c[i];
20 |     }
21 |     gettimeofday(&end,NULL);
22 |     printf("Run time with %d threads is %ld μs\n",n,end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec);
23 | }
--------------------------------------------------------------------------------
/hw1/参考文献/64-ia-32-architectures-software-developer-vol-3a-part-1-manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/参考文献/64-ia-32-architectures-software-developer-vol-3a-part-1-manual.pdf
--------------------------------------------------------------------------------
/hw1/参考文献/AVX-AVX2-Example-Code-master.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/参考文献/AVX-AVX2-Example-Code-master.zip
--------------------------------------------------------------------------------
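All three hw1 benchmarks above build their timings out of gettimeofday() second/microsecond arithmetic. For the OpenMP version, a simpler alternative is omp_get_wtime(), a standard OpenMP routine returning wall-clock seconds as a double. A minimal sketch (not part of the homework) reusing the same array names; build with gcc -fopenmp or clang -fopenmp:

    #include <omp.h>
    #include <stdio.h>
    #define N 1000000

    int a[N], b[N], c[N];

    int main(void) {
        double t0 = omp_get_wtime();       /* wall-clock time in seconds */
        #pragma omp parallel for
        for (int i = 0; i < N; i++)
            a[i] = b[i] + c[i];
        double t1 = omp_get_wtime();
        printf("Run time is %.0f μs\n", (t1 - t0) * 1e6);
        return 0;
    }

This removes the manual tv_sec/tv_usec bookkeeping and with it the format-specifier pitfalls of printing a long with %d.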
/hw1/参考文献/Intro_to_Intel_AVX.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/参考文献/Intro_to_Intel_AVX.pdf
--------------------------------------------------------------------------------
/hw2/18340208-张洪宾-并行分布式计算作业2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw2/18340208-张洪宾-并行分布式计算作业2.pdf
--------------------------------------------------------------------------------
/hw2/Homework-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw2/Homework-2.pdf
--------------------------------------------------------------------------------
/hw2/source code.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw2/source code.zip
--------------------------------------------------------------------------------
/hw3/Homework-3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/Homework-3.pdf
--------------------------------------------------------------------------------
/hw3/report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/report.pdf
--------------------------------------------------------------------------------
/hw3/src/Data_Race/calculate_e/calculate_e:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Data_Race/calculate_e/calculate_e
--------------------------------------------------------------------------------
/hw3/src/Data_Race/calculate_e/calculate_e.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>     /* printf */
2 | #include <stdlib.h>    /* atoi */
3 | #include <pthread.h>
4 |
5 |
6 | double res = 1;
7 | int count = 8; // number of threads
8 | int accuracy = 1000; // precision: number of series terms to sum
9 |
10 | void *Add(void *ptr) {
11 |     long thread_order = (long)ptr; // index of this thread, 1..count
12 |     long n = thread_order; // index of the current series term
13 |
14 |     long loop_num = accuracy / count;
15 |     if(accuracy % count)loop_num++;
16 |     for(long j = 0;j < loop_num;j++){
17 |         n = thread_order + j * count; // term index for this thread: n % count == thread_order % count
18 |         long long denominator = 1;
19 |         for(int i = 2;i <= n;i++){
20 |             denominator *= i; // n! overflows past n = 20; the guard below then skips the (negligible) term
21 |         }
22 |         double term = 0;
23 |         if(denominator > 0)term = 1.0/denominator;
24 |         res += term; // deliberately unsynchronized: this is the data race the assignment studies
25 |     }
26 |     return NULL;
27 | }
28 | int main(int argc,char **argv) {
29 |     if(argc == 3){
30 |         count = atoi(argv[1]);
31 |         accuracy = atoi(argv[2]);
32 |     }
33 |     pthread_t handle[count];
34 |     long i;
35 |     for(i = 1;i <= count;i++){
36 |
37 |         pthread_create(&handle[i - 1],NULL,Add,(void*)i);
38 |     }
39 |     for(int i = 1;i <= count;i++){
40 |         pthread_join(handle[i - 1],NULL);
41 |     }
42 |     printf("e = %f\n",res);
43 | }
44 |
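The unsynchronized res += term above is the race under study; the lock/ variant below serializes it with a mutex. A race can also be removed with no lock at all by giving each thread private state. A sketch (not part of the assignment): every thread returns its partial sum through pthread_join, and 1/n! is built by repeated division, so it never overflows the way the long long factorial does:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int count = 8;       /* number of threads */
    static int accuracy = 1000; /* number of series terms */

    static void *partial(void *arg) {
        long first = (long)arg;              /* this thread's first term index, 1..count */
        double *sum = malloc(sizeof *sum);   /* the partial sum travels back via the return value */
        *sum = 0.0;
        for (long n = first; n <= accuracy; n += count) {
            double term = 1.0;
            for (long i = 2; i <= n; i++)
                term /= i;                   /* term = 1/n!, computed without overflow */
            *sum += term;
        }
        return sum;
    }

    int main(void) {
        pthread_t tid[count];
        double e = 1.0;                      /* the n = 0 term */
        for (long i = 1; i <= count; i++)
            pthread_create(&tid[i - 1], NULL, partial, (void *)i);
        for (int i = 0; i < count; i++) {
            void *ret;
            pthread_join(tid[i], &ret);
            e += *(double *)ret;             /* combined in the joining thread: no shared writes */
            free(ret);
        }
        printf("e = %.15f\n", e);
        return 0;
    }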
--------------------------------------------------------------------------------
/hw3/src/Data_Race/calculate_e/complier1.sh:
--------------------------------------------------------------------------------
1 | clang calculate_e.c -o calculate_e -lpthread
2 | ./calculate_e
3 | ./calculate_e
4 | ./calculate_e
5 | ./calculate_e
6 | ./calculate_e
7 | ./calculate_e
8 | ./calculate_e
9 | ./calculate_e
10 | ./calculate_e
11 | ./calculate_e
12 | ./calculate_e
13 | ./calculate_e
14 | ./calculate_e
15 | ./calculate_e
16 | ./calculate_e
17 | ./calculate_e
18 | ./calculate_e
19 | ./calculate_e
20 | ./calculate_e
--------------------------------------------------------------------------------
/hw3/src/Data_Race/calculate_e/tsan1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Data_Race/calculate_e/tsan1
--------------------------------------------------------------------------------
/hw3/src/Data_Race/lock/complier2.sh:
--------------------------------------------------------------------------------
1 | clang lock.c -o lock -lpthread
2 | ./lock
3 | ./lock
4 | ./lock
5 | ./lock
6 | ./lock
7 | ./lock
8 | ./lock
9 | ./lock
10 | ./lock
11 | ./lock
12 | ./lock
13 | ./lock
14 | ./lock
15 | ./lock
16 | ./lock
17 | ./lock
18 | ./lock
19 | ./lock
20 | ./lock
--------------------------------------------------------------------------------
/hw3/src/Data_Race/lock/lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Data_Race/lock/lock
--------------------------------------------------------------------------------
/hw3/src/Data_Race/lock/lock.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>     /* printf */
2 | #include <stdlib.h>    /* atoi */
3 | #include <pthread.h>
4 |
5 |
6 | double res = 1;
7 | int count = 8; // number of threads
8 | int accuracy = 1000; // precision: number of series terms to sum
9 | pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
10 |
11 | void *Add(void *ptr) {
12 |     long thread_order = (long)ptr; // index of this thread, 1..count (cast through long to keep pointer width)
13 |     long n = thread_order; // index of the current series term
14 |
15 |     long loop_num = accuracy / count;
16 |     if(accuracy % count)loop_num++;
17 |     for(long j = 0;j < loop_num;j++){
18 |         n = thread_order + j * count; // term index for this thread: n % count == thread_order % count
19 |         long long denominator = 1;
20 |         for(int i = 2;i <= n;i++){
21 |             denominator *= i;
22 |         }
23 |         double term = 0;
24 |         if(denominator > 0)term = 1.0/denominator;
25 |
26 |         pthread_mutex_lock(&lock);
27 |         res += term; // the shared update is now protected by the mutex
28 |         pthread_mutex_unlock(&lock);
29 |     }
30 |     return NULL;
31 | }
32 | int main(int argc,char **argv) {
33 |     if(argc == 3){
34 |         count = atoi(argv[1]);
35 |         accuracy = atoi(argv[2]);
36 |     }
37 |     pthread_t handle[count];
38 |     long i;
39 |     for(i = 1;i <= count;i++){
40 |
41 |         pthread_create(&handle[i - 1],NULL,Add,(void*)i);
42 |     }
43 |     for(int i = 1;i <= count;i++){
44 |         pthread_join(handle[i - 1],NULL);
45 |     }
46 |     printf("%f\n",res);
47 | }
--------------------------------------------------------------------------------
/hw3/src/Data_Race/lock/tsan2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Data_Race/lock/tsan2
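lock.c depends on the mutex being initialised before the first lock; the static PTHREAD_MUTEX_INITIALIZER above does that at load time. When static initialisation is not possible (for instance, a mutex inside malloc'd memory, or one that needs attributes), the run-time form is the portable alternative; a small sketch:

    #include <pthread.h>

    pthread_mutex_t lock;

    void setup(void)    { pthread_mutex_init(&lock, NULL); }  /* before any thread is created */
    void teardown(void) { pthread_mutex_destroy(&lock); }     /* after every thread has joined */

The tsan1/tsan2 binaries shipped next to the compile scripts suggest a second build with -fsanitize=thread (ThreadSanitizer), although the scripts themselves only show the plain clang build.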
-------------------------------------------------------------------------------- /hw3/src/Non-reentrant/Control_experiment/Control_experiment.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Non-reentrant/Control_experiment/Control_experiment.bc -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/Control_experiment/Control_experiment.c: -------------------------------------------------------------------------------- 1 | int add(int a,int b){ 2 | int c = a + b; 3 | return c; 4 | } 5 | int main(){ 6 | return 0; 7 | } -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/Control_experiment/Control_experiment.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'Control_experiment.bc' 2 | source_filename = "Control_experiment.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | ; Function Attrs: noinline nounwind optnone ssp uwtable 7 | define i32 @add(i32, i32) #0 { 8 | %3 = alloca i32, align 4 9 | %4 = alloca i32, align 4 10 | %5 = alloca i32, align 4 11 | store i32 %0, i32* %3, align 4 12 | store i32 %1, i32* %4, align 4 13 | %6 = load i32, i32* %3, align 4 14 | %7 = load i32, i32* %4, align 4 15 | %8 = add nsw i32 %6, %7 16 | store i32 %8, i32* %5, align 4 17 | %9 = load i32, i32* %5, align 4 18 | ret i32 %9 19 | } 20 | 21 | ; Function Attrs: noinline nounwind optnone ssp uwtable 22 | define i32 @main() #0 { 23 | %1 = alloca i32, align 4 24 | store i32 0, i32* %1, align 4 25 | ret i32 0 26 | } 27 | 28 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 29 | 30 | !llvm.module.flags = !{!0, !1, !2} 31 | !llvm.ident = !{!3} 32 | 33 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 34 | !1 = !{i32 1, !"wchar_size", i32 4} 35 | !2 = !{i32 7, !"PIC Level", i32 2} 36 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 37 | -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/I:O/IO.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Non-reentrant/I:O/IO.bc -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/I:O/IO.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | 6 | int Global; 7 | void *fun(void *x) { 8 | Global = 42; 9 | printf("%d\n",Global); 10 | return x; 11 | } 12 | int main() { 13 | pthread_t t; 14 | pthread_create(&t, NULL, fun, NULL); 15 | Global = 43; 16 | pthread_join(t, NULL); 17 | return 
Global; 18 | } 19 | -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/I:O/IO.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'IO.bc' 2 | source_filename = "IO.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @Global = common global i32 0, align 4 11 | @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 12 | 13 | ; Function Attrs: noinline nounwind optnone ssp uwtable 14 | define i8* @fun(i8*) #0 { 15 | %2 = alloca i8*, align 8 16 | store i8* %0, i8** %2, align 8 17 | store i32 42, i32* @Global, align 4 18 | %3 = load i32, i32* @Global, align 4 19 | %4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %3) 20 | %5 = load i8*, i8** %2, align 8 21 | ret i8* %5 22 | } 23 | 24 | declare i32 @printf(i8*, ...) #1 25 | 26 | ; Function Attrs: noinline nounwind optnone ssp uwtable 27 | define i32 @main() #0 { 28 | %1 = alloca i32, align 4 29 | %2 = alloca %struct._opaque_pthread_t*, align 8 30 | store i32 0, i32* %1, align 4 31 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* null) 32 | store i32 43, i32* @Global, align 4 33 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 34 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 35 | %6 = load i32, i32* @Global, align 4 36 | ret i32 %6 37 | } 38 | 39 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 40 | 41 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 42 | 43 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 44 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 45 | 46 | !llvm.module.flags = !{!0, !1, !2} 47 | !llvm.ident = !{!3} 48 | 49 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 50 | !1 = !{i32 1, !"wchar_size", i32 4} 51 | !2 = !{i32 7, !"PIC Level", i32 2} 52 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 53 | !4 = !{!5} 54 | !5 = !{i64 2, i64 3, i1 false} 55 | 
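fun() in IO.c is flagged non-reentrant on two counts: it writes the global Global and it calls printf. A sketch (not from the homework) of a reentrant counterpart, in which all state arrives through parameters and the output goes to a caller-supplied buffer, so concurrent calls cannot interfere:

    #include <stdio.h>

    /* formats value into buf; touches no globals and no shared stream */
    int fun_r(int value, char *buf, size_t len) {
        return snprintf(buf, len, "%d\n", value);
    }

snprintf writes only to caller memory; whether it counts as fully reentrant still depends on the C library's locale handling, so this is an illustration of the pattern rather than a guarantee.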
-------------------------------------------------------------------------------- /hw3/src/Non-reentrant/global/global.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Non-reentrant/global/global.bc -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/global/global.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int global = 10; 4 | 5 | void* fun(void* argv){ 6 | global = 20; 7 | return NULL; 8 | } 9 | int main() { 10 | pthread_t t; 11 | pthread_create(&t, NULL, fun, NULL); 12 | global = 43; 13 | pthread_join(t, NULL); 14 | return global; 15 | } -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/global/global.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'global.bc' 2 | source_filename = "global.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @global = global i32 10, align 4 11 | 12 | ; Function Attrs: noinline nounwind optnone ssp uwtable 13 | define i8* @fun(i8*) #0 { 14 | %2 = alloca i8*, align 8 15 | store i8* %0, i8** %2, align 8 16 | store i32 20, i32* @global, align 4 17 | ret i8* null 18 | } 19 | 20 | ; Function Attrs: noinline nounwind optnone ssp uwtable 21 | define i32 @main() #0 { 22 | %1 = alloca i32, align 4 23 | %2 = alloca %struct._opaque_pthread_t*, align 8 24 | store i32 0, i32* %1, align 4 25 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* null) 26 | store i32 43, i32* @global, align 4 27 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 28 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 29 | %6 = load i32, i32* @global, align 4 30 | ret i32 %6 31 | } 32 | 33 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 34 | 35 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 36 | 37 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 38 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" 
"target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 39 | 40 | !llvm.module.flags = !{!0, !1, !2} 41 | !llvm.ident = !{!3} 42 | 43 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 44 | !1 = !{i32 1, !"wchar_size", i32 4} 45 | !2 = !{i32 7, !"PIC Level", i32 2} 46 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 47 | !4 = !{!5} 48 | !5 = !{i64 2, i64 3, i1 false} 49 | -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/malloc/malloc.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Non-reentrant/malloc/malloc.bc -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/malloc/malloc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | void* init(void*argv){ 4 | int* array = malloc(sizeof(int) * 10); 5 | for(int i = 0;i < 10;i++)array[i] = 0; 6 | free(array); 7 | return argv; 8 | } 9 | 10 | int main(){ 11 | pthread_t id; 12 | pthread_create(&id, NULL, init, NULL); 13 | pthread_join(id, NULL); 14 | return 0; 15 | } -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/malloc/malloc.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'malloc.bc' 2 | source_filename = "malloc.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | ; Function Attrs: noinline nounwind optnone ssp uwtable 11 | define i8* @init(i8*) #0 { 12 | %2 = alloca i8*, align 8 13 | %3 = alloca i32*, align 8 14 | %4 = alloca i32, align 4 15 | store i8* %0, i8** %2, align 8 16 | %5 = call i8* @malloc(i64 40) #3 17 | %6 = bitcast i8* %5 to i32* 18 | store i32* %6, i32** %3, align 8 19 | store i32 0, i32* %4, align 4 20 | br label %7 21 | 22 | 7: ; preds = %15, %1 23 | %8 = load i32, i32* %4, align 4 24 | %9 = icmp slt i32 %8, 10 25 | br i1 %9, label %10, label %18 26 | 27 | 10: ; preds = %7 28 | %11 = load i32*, i32** %3, align 8 29 | %12 = load i32, i32* %4, align 4 30 | %13 = sext i32 %12 to i64 31 | %14 = getelementptr inbounds i32, i32* %11, i64 %13 32 | store i32 0, i32* %14, align 4 33 | br label %15 34 | 35 | 15: ; preds = %10 36 | %16 = load i32, i32* %4, align 4 37 | %17 = add nsw i32 %16, 1 38 | store i32 %17, i32* %4, align 4 39 | br label %7 40 | 41 | 18: ; preds = %7 42 | %19 = load i32*, i32** %3, align 8 43 | %20 = bitcast i32* %19 to i8* 44 | call void @free(i8* %20) 45 | %21 = load i8*, i8** %2, align 8 46 | ret i8* %21 47 | } 48 | 49 | ; Function Attrs: allocsize(0) 50 | declare i8* @malloc(i64) #1 51 | 52 | declare void @free(i8*) #2 53 | 54 | ; Function Attrs: noinline nounwind optnone ssp uwtable 55 | define i32 @main() #0 { 56 | %1 = alloca i32, align 4 57 | %2 = alloca %struct._opaque_pthread_t*, align 8 58 | store i32 0, i32* %1, align 4 59 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, 
%struct._opaque_pthread_attr_t* null, i8* (i8*)* @init, i8* null) 60 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 61 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 62 | ret i32 0 63 | } 64 | 65 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #2 66 | 67 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #2 68 | 69 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 70 | attributes #1 = { allocsize(0) "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 71 | attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 72 | attributes #3 = { allocsize(0) } 73 | 74 | !llvm.module.flags = !{!0, !1, !2} 75 | !llvm.ident = !{!3} 76 | 77 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 78 | !1 = !{i32 1, !"wchar_size", i32 4} 79 | !2 = !{i32 7, !"PIC Level", i32 2} 80 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 81 | !4 = !{!5} 82 | !5 = !{i64 2, i64 3, i1 false} 83 | -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/static/static.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Non-reentrant/static/static.bc -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/static/static.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | 5 | void* fun(void* argv){ 6 | int* res = (int*)argv; 7 | static int x; 8 | x = *res; 9 | return NULL; 10 | } 11 | int main() { 12 | pthread_t t1,t2; 13 | pthread_create(&t1, NULL, fun, &t1); 14 | pthread_create(&t2, NULL, fun, &t2); 15 | 16 | pthread_join(t1, NULL); 17 | pthread_join(t2, NULL); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- 
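static.c above demonstrates the static-variable source of non-reentrancy: both threads write the single fun.x instance. A minimal sketch of the usual fix, giving each thread its own copy with C11 _Thread_local (gcc and clang also accept the older __thread spelling):

    #include <pthread.h>

    void *fun(void *argv) {
        static _Thread_local int x;   /* one x per thread instead of one per program */
        x = *(int *)argv;
        return NULL;
    }

With thread storage duration the two stores no longer touch the same object, so the write is race-free without any locking.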
/hw3/src/Non-reentrant/static/static.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'static.bc' 2 | source_filename = "static.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @fun.x = internal global i32 0, align 4 11 | 12 | ; Function Attrs: noinline nounwind optnone ssp uwtable 13 | define i8* @fun(i8*) #0 { 14 | %2 = alloca i8*, align 8 15 | %3 = alloca i32*, align 8 16 | store i8* %0, i8** %2, align 8 17 | %4 = load i8*, i8** %2, align 8 18 | %5 = bitcast i8* %4 to i32* 19 | store i32* %5, i32** %3, align 8 20 | %6 = load i32*, i32** %3, align 8 21 | %7 = load i32, i32* %6, align 4 22 | store i32 %7, i32* @fun.x, align 4 23 | ret i8* null 24 | } 25 | 26 | ; Function Attrs: noinline nounwind optnone ssp uwtable 27 | define i32 @main() #0 { 28 | %1 = alloca i32, align 4 29 | %2 = alloca %struct._opaque_pthread_t*, align 8 30 | %3 = alloca %struct._opaque_pthread_t*, align 8 31 | store i32 0, i32* %1, align 4 32 | %4 = bitcast %struct._opaque_pthread_t** %2 to i8* 33 | %5 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* %4) 34 | %6 = bitcast %struct._opaque_pthread_t** %3 to i8* 35 | %7 = call i32 @pthread_create(%struct._opaque_pthread_t** %3, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* %6) 36 | %8 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 37 | %9 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %8, i8** null) 38 | %10 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %3, align 8 39 | %11 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %10, i8** null) 40 | ret i32 0 41 | } 42 | 43 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 44 | 45 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 46 | 47 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 48 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 49 | 50 | !llvm.module.flags = !{!0, !1, !2} 51 | !llvm.ident = !{!3} 52 | 53 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 54 | !1 = 
!{i32 1, !"wchar_size", i32 4} 55 | !2 = !{i32 7, !"PIC Level", i32 2} 56 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 57 | !4 = !{!5} 58 | !5 = !{i64 2, i64 3, i1 false} 59 | -------------------------------------------------------------------------------- /hw3/src/analyse/Control_experiment.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'Control_experiment.bc' 2 | source_filename = "Control_experiment.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | ; Function Attrs: noinline nounwind optnone ssp uwtable 7 | define i32 @add(i32, i32) #0 { 8 | %3 = alloca i32, align 4 9 | %4 = alloca i32, align 4 10 | %5 = alloca i32, align 4 11 | store i32 %0, i32* %3, align 4 12 | store i32 %1, i32* %4, align 4 13 | %6 = load i32, i32* %3, align 4 14 | %7 = load i32, i32* %4, align 4 15 | %8 = add nsw i32 %6, %7 16 | store i32 %8, i32* %5, align 4 17 | %9 = load i32, i32* %5, align 4 18 | ret i32 %9 19 | } 20 | 21 | ; Function Attrs: noinline nounwind optnone ssp uwtable 22 | define i32 @main() #0 { 23 | %1 = alloca i32, align 4 24 | store i32 0, i32* %1, align 4 25 | ret i32 0 26 | } 27 | 28 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 29 | 30 | !llvm.module.flags = !{!0, !1, !2} 31 | !llvm.ident = !{!3} 32 | 33 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 34 | !1 = !{i32 1, !"wchar_size", i32 4} 35 | !2 = !{i32 7, !"PIC Level", i32 2} 36 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 37 | -------------------------------------------------------------------------------- /hw3/src/analyse/IO.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'IO.bc' 2 | source_filename = "IO.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @Global = common global i32 0, align 4 11 | @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 12 | 13 | ; Function Attrs: noinline nounwind optnone ssp uwtable 14 | define i8* @fun(i8*) #0 { 15 | %2 = alloca i8*, align 8 16 | store i8* %0, i8** %2, align 8 17 | store i32 42, i32* @Global, align 4 18 | %3 = load i32, i32* @Global, align 4 19 | %4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %3) 20 | %5 = load i8*, i8** %2, align 8 21 | ret i8* %5 22 | } 23 | 24 | declare i32 @printf(i8*, ...) 
#1 25 | 26 | ; Function Attrs: noinline nounwind optnone ssp uwtable 27 | define i32 @main() #0 { 28 | %1 = alloca i32, align 4 29 | %2 = alloca %struct._opaque_pthread_t*, align 8 30 | store i32 0, i32* %1, align 4 31 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* null) 32 | store i32 43, i32* @Global, align 4 33 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 34 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 35 | %6 = load i32, i32* @Global, align 4 36 | ret i32 %6 37 | } 38 | 39 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 40 | 41 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 42 | 43 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 44 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 45 | 46 | !llvm.module.flags = !{!0, !1, !2} 47 | !llvm.ident = !{!3} 48 | 49 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 50 | !1 = !{i32 1, !"wchar_size", i32 4} 51 | !2 = !{i32 7, !"PIC Level", i32 2} 52 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 53 | !4 = !{!5} 54 | !5 = !{i64 2, i64 3, i1 false} 55 | -------------------------------------------------------------------------------- /hw3/src/analyse/analyse: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/analyse/analyse -------------------------------------------------------------------------------- /hw3/src/analyse/analyse.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by zhb on 2020/5/1. 3 | // Copyright © 2020 zhb. All rights reserved. 4 | // 5 | #include "analyse.h" 6 | 7 | int main(int argc, const char * argv[]) { 8 | if(argc != 2){ 9 | cout << "Please input the LLVM IR file name!" 
<< endl;
10 |         abort();
11 |     }
12 |     string file = read_file(argv[1]);
13 |     divide(file);
14 |     Non_reentrant_fun.push_back("printf");
15 |     Non_reentrant_fun.push_back("scanf");
16 |     Non_reentrant_fun.push_back("malloc");
17 |     Non_reentrant_fun.push_back("free");
18 |
19 |     bool res = false;
20 |     for(int i = 0;i < all_function.size();i++){
21 |         module test(all_function[i]);
22 |         bool ret = test.detect_and_print();
23 |         if(ret)res = true;
24 |     }
25 |     if(!res){
26 |         cout << "No non-reentrant function detected." << endl;
27 |     }
28 | }
29 |
30 |
--------------------------------------------------------------------------------
/hw3/src/analyse/analyse.h:
--------------------------------------------------------------------------------
1 | //
2 | //  Created by zhb on 2020/5/1.
3 | //  Copyright © 2020 zhb. All rights reserved.
4 | //
5 |
6 | #include <iostream>
7 | #include <fstream>
8 | #include <sstream>
9 | #include <string>
10 | #include <vector>
11 | #include <algorithm>
12 | #include <cstdlib>
13 |
14 | using namespace std;
15 |
16 | vector<string> all_function;
17 | vector<string> Non_reentrant_fun;
18 |
19 |
20 |
21 |
22 | inline void deduplication(vector<string>& c)
23 | {
24 |     sort(c.begin(), c.end());
25 |     auto new_end = unique(c.begin(), c.end()); // unique() shifts adjacent duplicates past new_end
26 |     c.erase(new_end, c.end()); // erase() then actually removes them
27 | }
28 |
29 |
30 | string read_file(const char*file_name){
31 |     fstream file(file_name);
32 |     stringstream ss;
33 |     ss << file.rdbuf();
34 |     string str = ss.str();
35 |     return str;
36 | }
37 |
38 | void divide(string s){
39 |     vector<int> pos;
40 |     int index = 0;
41 |     string sub = "define";
42 |     while ((index = s.find(sub, index)) < s.length())
43 |     {
44 |         pos.push_back(index);
45 |         string temp;
46 |         int flag = 0;
47 |         int target = 0;
48 |         for(int i = index;i < s.size();i++){
49 |             if(s[i] == '}'){
50 |                 target = i;
51 |                 flag = 1;
52 |                 break;
53 |             }
54 |             if(flag)break;
55 |         }
56 |         temp = s.substr(index,target - index + 1);
57 |         all_function.push_back(temp);
58 |         index++;
59 |
60 |     }
61 | }
62 |
63 |
64 | class module{
65 |     string fun_name; // function name
66 |     string argument; // argument list
67 |     string ret_type; // return type
68 |     string content; // function body
69 |     vector<int> symbol; // positions of '@' markers in the body
70 |     vector<string> _global; // global variables used in the function
71 |     vector<string> _static; // static variables used in the function
72 |     vector<string> _non_reentrant_fun; // non-reentrant functions called by the function
73 |
74 |     // collect the position of every '@' marker
75 |     void get_all_symbol();
76 |     // check the symbol vector for global variables; record them in _global
77 |     void detect_global();
78 |     // check the symbol vector for static variables; record them in _static
79 |     void detect_static();
80 |     // check the symbol vector for calls to non-reentrant functions; record them in _non_reentrant_fun
81 |     void detect_non_reentrant_fun();
82 |     // wrap the four steps above; return true if the function is non-reentrant
83 |     bool detect();
84 | public:
85 |     // initialize the module from the string holding one function of the IR code
86 |     module(string fun);
87 |     // wrap detect() and print the findings
88 |     bool detect_and_print();
89 | };
90 | module::module(string fun){
91 |     stringstream s;
92 |     s << fun;
93 |     string define;
94 |     s >> define;
95 |     s >> ret_type;
96 |     s >> fun_name;
97 |     int content_begin;
98 |     for(int i = 0;i < fun.size();i++){
99 |         if(fun[i] == '{'){
100 |             content_begin = i;
101 |             break;
102 |         }
103 |     }
104 |     content = fun.substr(content_begin);
105 |     int argument_pos = 0;
106 |     for(int i = 0;i < fun_name.size();i++){
107 |         if(fun_name[i] == '('){
108 |             argument_pos = i;
109 |             break;
110 |         }
111 |     }
112 |     argument = fun_name.substr(argument_pos + 1);
113 |     argument.pop_back();
114 |     fun_name.erase(fun_name.begin() + argument_pos,fun_name.end());
115 |     fun_name.erase(fun_name.begin());
116 | }
117 | void module::get_all_symbol(){
118 |     for(int i = 0;i < content.size();i++){
119 |         if(content[i] == '@'){
120 |             symbol.push_back(i);
121 |         }
122 |     }
123 |
124 | }
125 | void
module::detect_global(){ 126 | for(int i = 0;i < symbol.size();i++){ 127 | stringstream ss; 128 | string sub = content.substr(symbol[i]); 129 | ss << sub; 130 | string value; 131 | ss >> value; 132 | int flag = 0; 133 | for(int j = 0;j < value.size();j++){ 134 | if(value[j] == '(' || value[j] == '.'){ 135 | flag = 1; 136 | break; 137 | } 138 | } 139 | if(flag)continue; 140 | while(value[value.size() - 1] == ',')value.pop_back(); 141 | value.erase(value.begin()); 142 | _global.push_back(value); 143 | } 144 | } 145 | void module::detect_static(){ 146 | for(int i = 0;i < symbol.size();i++){ 147 | stringstream ss; 148 | string sub = content.substr(symbol[i]); 149 | ss << sub; 150 | string value; 151 | ss >> value; 152 | int flag = 0; 153 | int point = 1; 154 | for(int j = 0;j < value.size();j++){ 155 | if(value[j] == '('){ 156 | flag = 1; 157 | break; 158 | } 159 | if(value[j] == '.'){ 160 | point = 0; 161 | } 162 | } 163 | if(flag || point)continue; 164 | while(value[value.size() - 1] == ',')value.pop_back(); 165 | value.erase(value.begin()); 166 | if(value[0] == '.')continue; 167 | _static.push_back(value); 168 | } 169 | } 170 | void module::detect_non_reentrant_fun(){ 171 | for(int i = 0;i < symbol.size();i++){ 172 | stringstream ss; 173 | string sub = content.substr(symbol[i]); 174 | ss << sub; 175 | string value; 176 | ss >> value; 177 | int flag = 0; 178 | int pos = -1; 179 | for(int j = 0;j < value.size();j++){ 180 | if(value[j] == '('){ 181 | flag = 1; 182 | pos = j; 183 | break; 184 | } 185 | } 186 | if(!flag)continue; 187 | 188 | value.erase(value.begin() + pos,value.end()); 189 | value.erase(value.begin()); 190 | for(int k = 0;k < Non_reentrant_fun.size();k++){ 191 | if(value == Non_reentrant_fun[k]){ 192 | _non_reentrant_fun.push_back(value); 193 | } 194 | } 195 | } 196 | } 197 | bool module::detect(){ 198 | get_all_symbol(); 199 | detect_non_reentrant_fun(); 200 | detect_global(); 201 | detect_static(); 202 | return _global.size() || _static.size() || _non_reentrant_fun.size(); 203 | } 204 | bool module::detect_and_print(){ 205 | if(fun_name == "main")return false; 206 | bool res = detect(); 207 | if(res == false){ 208 | return false; 209 | } 210 | else{ 211 | Non_reentrant_fun.push_back(fun_name); 212 | cout << "Detect non reentrant function " << fun_name << ", details are as follows: " << endl; 213 | deduplication(_non_reentrant_fun); 214 | deduplication(_global); 215 | deduplication(_static); 216 | if(_non_reentrant_fun.size()){ 217 | cout << "Call non reentrant function:"; 218 | for(int i = 0;i < _non_reentrant_fun.size();i++){ 219 | cout << _non_reentrant_fun[i] << " "; 220 | } 221 | cout << endl; 222 | } 223 | if(_global.size()){ 224 | cout << "Use global valuable:"; 225 | for(int i = 0;i < _global.size();i++){ 226 | cout << _global[i] << " "; 227 | } 228 | cout << endl; 229 | } 230 | if(_static.size()){ 231 | cout << "Use static valuable:"; 232 | for(int i = 0;i < _static.size();i++){ 233 | cout << _static[i] << " "; 234 | } 235 | cout << endl; 236 | } 237 | return true; 238 | } 239 | return true; 240 | } 241 | -------------------------------------------------------------------------------- /hw3/src/analyse/global.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'global.bc' 2 | source_filename = "global.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | 
%struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @global = global i32 10, align 4 11 | 12 | ; Function Attrs: noinline nounwind optnone ssp uwtable 13 | define i8* @fun(i8*) #0 { 14 | %2 = alloca i8*, align 8 15 | store i8* %0, i8** %2, align 8 16 | store i32 20, i32* @global, align 4 17 | ret i8* null 18 | } 19 | 20 | ; Function Attrs: noinline nounwind optnone ssp uwtable 21 | define i32 @main() #0 { 22 | %1 = alloca i32, align 4 23 | %2 = alloca %struct._opaque_pthread_t*, align 8 24 | store i32 0, i32* %1, align 4 25 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* null) 26 | store i32 43, i32* @global, align 4 27 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 28 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 29 | %6 = load i32, i32* @global, align 4 30 | ret i32 %6 31 | } 32 | 33 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 34 | 35 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 36 | 37 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 38 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 39 | 40 | !llvm.module.flags = !{!0, !1, !2} 41 | !llvm.ident = !{!3} 42 | 43 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 44 | !1 = !{i32 1, !"wchar_size", i32 4} 45 | !2 = !{i32 7, !"PIC Level", i32 2} 46 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 47 | !4 = !{!5} 48 | !5 = !{i64 2, i64 3, i1 false} 49 | -------------------------------------------------------------------------------- /hw3/src/analyse/malloc.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'malloc.bc' 2 | source_filename = "malloc.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | ; Function Attrs: noinline nounwind optnone ssp uwtable 11 | define i8* @init(i8*) #0 { 12 | %2 = alloca i8*, align 8 13 | %3 = alloca i32*, align 8 14 | %4 = alloca 
i32, align 4 15 | store i8* %0, i8** %2, align 8 16 | %5 = call i8* @malloc(i64 40) #3 17 | %6 = bitcast i8* %5 to i32* 18 | store i32* %6, i32** %3, align 8 19 | store i32 0, i32* %4, align 4 20 | br label %7 21 | 22 | 7: ; preds = %15, %1 23 | %8 = load i32, i32* %4, align 4 24 | %9 = icmp slt i32 %8, 10 25 | br i1 %9, label %10, label %18 26 | 27 | 10: ; preds = %7 28 | %11 = load i32*, i32** %3, align 8 29 | %12 = load i32, i32* %4, align 4 30 | %13 = sext i32 %12 to i64 31 | %14 = getelementptr inbounds i32, i32* %11, i64 %13 32 | store i32 0, i32* %14, align 4 33 | br label %15 34 | 35 | 15: ; preds = %10 36 | %16 = load i32, i32* %4, align 4 37 | %17 = add nsw i32 %16, 1 38 | store i32 %17, i32* %4, align 4 39 | br label %7 40 | 41 | 18: ; preds = %7 42 | %19 = load i32*, i32** %3, align 8 43 | %20 = bitcast i32* %19 to i8* 44 | call void @free(i8* %20) 45 | %21 = load i8*, i8** %2, align 8 46 | ret i8* %21 47 | } 48 | 49 | ; Function Attrs: allocsize(0) 50 | declare i8* @malloc(i64) #1 51 | 52 | declare void @free(i8*) #2 53 | 54 | ; Function Attrs: noinline nounwind optnone ssp uwtable 55 | define i32 @main() #0 { 56 | %1 = alloca i32, align 4 57 | %2 = alloca %struct._opaque_pthread_t*, align 8 58 | store i32 0, i32* %1, align 4 59 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @init, i8* null) 60 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 61 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 62 | ret i32 0 63 | } 64 | 65 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #2 66 | 67 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #2 68 | 69 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 70 | attributes #1 = { allocsize(0) "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 71 | attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 72 | attributes #3 = { allocsize(0) } 73 | 74 | !llvm.module.flags = !{!0, !1, !2} 75 | !llvm.ident 
= !{!3} 76 | 77 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 78 | !1 = !{i32 1, !"wchar_size", i32 4} 79 | !2 = !{i32 7, !"PIC Level", i32 2} 80 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 81 | !4 = !{!5} 82 | !5 = !{i64 2, i64 3, i1 false} 83 | -------------------------------------------------------------------------------- /hw3/src/analyse/static.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'static.bc' 2 | source_filename = "static.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @fun.x = internal global i32 0, align 4 11 | 12 | ; Function Attrs: noinline nounwind optnone ssp uwtable 13 | define i8* @fun(i8*) #0 { 14 | %2 = alloca i8*, align 8 15 | %3 = alloca i32*, align 8 16 | store i8* %0, i8** %2, align 8 17 | %4 = load i8*, i8** %2, align 8 18 | %5 = bitcast i8* %4 to i32* 19 | store i32* %5, i32** %3, align 8 20 | %6 = load i32*, i32** %3, align 8 21 | %7 = load i32, i32* %6, align 4 22 | store i32 %7, i32* @fun.x, align 4 23 | ret i8* null 24 | } 25 | 26 | ; Function Attrs: noinline nounwind optnone ssp uwtable 27 | define i32 @main() #0 { 28 | %1 = alloca i32, align 4 29 | %2 = alloca %struct._opaque_pthread_t*, align 8 30 | %3 = alloca %struct._opaque_pthread_t*, align 8 31 | store i32 0, i32* %1, align 4 32 | %4 = bitcast %struct._opaque_pthread_t** %2 to i8* 33 | %5 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* %4) 34 | %6 = bitcast %struct._opaque_pthread_t** %3 to i8* 35 | %7 = call i32 @pthread_create(%struct._opaque_pthread_t** %3, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* %6) 36 | %8 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 37 | %9 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %8, i8** null) 38 | %10 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %3, align 8 39 | %11 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %10, i8** null) 40 | ret i32 0 41 | } 42 | 43 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 44 | 45 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 46 | 47 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 48 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" 
"stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 49 | 50 | !llvm.module.flags = !{!0, !1, !2} 51 | !llvm.ident = !{!3} 52 | 53 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 54 | !1 = !{i32 1, !"wchar_size", i32 4} 55 | !2 = !{i32 7, !"PIC Level", i32 2} 56 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 57 | !4 = !{!5} 58 | !5 = !{i64 2, i64 3, i1 false} 59 | -------------------------------------------------------------------------------- /hw4/Homework-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/Homework-4.pdf -------------------------------------------------------------------------------- /hw4/performance analysis/parallel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/performance analysis/parallel -------------------------------------------------------------------------------- /hw4/performance analysis/parallel.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #define row 1000 5 | #define col 1000 6 | 7 | int matrix[row][col]; 8 | int vector[col]; 9 | int res[row]; 10 | 11 | int s_res[col]; 12 | void serial(){ 13 | for(int i = 0;i < row;i++){ 14 | for(int j = 0;j < col;j++){ 15 | s_res[i] += matrix[i][j] * vector[j]; 16 | } 17 | } 18 | } 19 | int main(int argc, char **argv) 20 | { 21 | int my_id = 0; 22 | int temp = 100; 23 | int p; 24 | MPI_Init(&argc, &argv); 25 | MPI_Comm_rank(MPI_COMM_WORLD, &my_id); 26 | MPI_Comm_size(MPI_COMM_WORLD, &p); 27 | MPI_Status status_p; 28 | if(my_id == 0){ 29 | for(int i = 0;i < col;i++){ 30 | vector[i] = rand() % 10; 31 | } 32 | for(int i = 0;i < row;i++){ 33 | for(int j = 0;j < col;j++){ 34 | matrix[i][j] = rand() % 10; 35 | } 36 | } 37 | } 38 | MPI_Barrier(MPI_COMM_WORLD); 39 | double my_start,my_end,my_elapsed,elapsed; 40 | my_start = MPI_Wtime(); 41 | if(my_id == 0){ 42 | for(int i = 1;i < p;i++){ 43 | MPI_Send(matrix[i * row / p],col * row / p,MPI_INT,i,i,MPI_COMM_WORLD); 44 | } 45 | } 46 | else{ 47 | MPI_Recv(matrix[my_id * row / p],col * row / p,MPI_INT,0,my_id,MPI_COMM_WORLD,&status_p); 48 | } 49 | 50 | 51 | MPI_Bcast(vector,col,MPI_INT,0,MPI_COMM_WORLD); 52 | 53 | for(int i = my_id * row / p;i < (my_id + 1) * row / p; i++){ 54 | for(int j = 0;j < col;j++){ 55 | res[i] += matrix[i][j] * vector[j]; 56 | } 57 | } 58 | if(my_id == 0){ 59 | for(int i = 1;i < p;i++){ 60 | MPI_Recv(res + i * row / p,row / p,MPI_INT,i,i,MPI_COMM_WORLD,&status_p); 61 | } 62 | /* serial(); 63 | int flag = 1; 64 | for(int i = 0;i < row;i++){ 65 | if(res[i] != s_res[i]){ 66 | printf("Error!\n"); 67 | flag = 0; 68 | } 69 | } 70 | if(flag){ 71 | printf("Correct calculation!\n"); 72 | }*/ 73 | } 74 | else{ 75 | MPI_Send(res + my_id * row / p,row / p,MPI_INT,0,my_id,MPI_COMM_WORLD); 76 | } 77 | my_end = MPI_Wtime(); 78 | 79 | my_elapsed = my_end - my_start; 80 | 81 | MPI_Reduce(&my_elapsed,&elapsed,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); 82 | if(my_id == 0){ 83 | printf("Parallel time is:%e seconds\n",elapsed); 84 | } 85 | MPI_Finalize(); 86 | 87 | 88 | return 0; 89 | } 90 | 
-------------------------------------------------------------------------------- /hw4/performance analysis/serial: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/performance analysis/serial -------------------------------------------------------------------------------- /hw4/performance analysis/serial.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <sys/time.h> 4 | #define row 1000 5 | #define col 1000 6 | 7 | int matrix[row][col]; 8 | int vector[col]; 9 | 10 | int res[row]; 11 | int main(){ 12 | struct timeval begin,end; 13 | for(int i = 0;i < col;i++){ 14 | vector[i] = rand() % 10;//Initialize Vector 15 | } 16 | for(int i = 0;i < row;i++){ 17 | for(int j = 0;j < col;j++){ 18 | matrix[i][j] = rand() % 10;//Initialize Matrix 19 | } 20 | } 21 | gettimeofday(&begin,NULL); 22 | for(int i = 0;i < row;i++){ 23 | for(int j = 0;j < col;j++){ 24 | res[i] += matrix[i][j] * vector[j];//Multiplication 25 | } 26 | } 27 | gettimeofday(&end,NULL); 28 | printf("Serial_time is:%ldμs\n",end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec); 29 | return 0; 30 | } -------------------------------------------------------------------------------- /hw4/readme.md: -------------------------------------------------------------------------------- 1 | ## Notes 2 | * The programs were developed and run on macOS. 3 | * The code in the src folder does not measure execution time, so it cannot be used for performance analysis; the parallel program there does, however, verify that the result is correct. 4 | * The code in the performance analysis folder measures the running time of both the serial and the parallel program. -------------------------------------------------------------------------------- /hw4/report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/report.pdf -------------------------------------------------------------------------------- /hw4/src/parallel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/src/parallel -------------------------------------------------------------------------------- /hw4/src/parallel.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <mpi.h> 4 | #define row 1000 5 | #define col 1000 6 | 7 | int matrix[row][col]; 8 | int vector[col]; 9 | int res[row]; 10 | 11 | int s_res[row]; 12 | void serial(){ 13 | for(int i = 0;i < row;i++){ 14 | for(int j = 0;j < col;j++){ 15 | s_res[i] += matrix[i][j] * vector[j]; 16 | } 17 | } 18 | } 19 | int main(int argc, char **argv) 20 | { 21 | int my_id = 0; 22 | 23 | int p; 24 | MPI_Init(&argc, &argv); 25 | MPI_Comm_rank(MPI_COMM_WORLD, &my_id); 26 | MPI_Comm_size(MPI_COMM_WORLD, &p); 27 | MPI_Status status_p; 28 | if(my_id == 0){ 29 | for(int i = 0;i < col;i++){ 30 | vector[i] = rand() % 10; 31 | } 32 | for(int i = 0;i < row;i++){ 33 | for(int j = 0;j < col;j++){ 34 | matrix[i][j] = rand() % 10; 35 | } 36 | } 37 | } 38 | if(my_id == 0){ 39 | for(int i = 1;i < p;i++){ 40 | MPI_Send(matrix[i * row / p],col * row / p,MPI_INT,i,i,MPI_COMM_WORLD); 41 | } 42 | } 43 | else{ 44 | MPI_Recv(matrix[my_id * row / p],col * row / p,MPI_INT,0,my_id,MPI_COMM_WORLD,&status_p); 45 | } 46 | 47 | 48 | MPI_Bcast(vector,col,MPI_INT,0,MPI_COMM_WORLD);
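// Block-row decomposition: each rank multiplies its own chunk of row / p consecutive rows; rank 0 then gathers the partial results and, below, validates them against the serial() reference.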
49 | 50 | for(int i = my_id * row / p;i < (my_id + 1) * row / p; i++){ 51 | for(int j = 0;j < col;j++){ 52 | res[i] += matrix[i][j] * vector[j]; 53 | } 54 | } 55 | if(my_id == 0){ 56 | for(int i = 1;i < p;i++){ 57 | MPI_Recv(res + i * row / p,row / p,MPI_INT,i,i,MPI_COMM_WORLD,&status_p); 58 | } 59 | serial(); 60 | int flag = 1; 61 | for(int i = 0;i < row;i++){ 62 | if(res[i] != s_res[i]){ 63 | printf("Error!\n"); 64 | flag = 0; 65 | } 66 | } 67 | if(flag){ 68 | printf("Correct calculation!\n"); 69 | } 70 | } 71 | else{ 72 | MPI_Send(res + my_id * row / p,row / p,MPI_INT,0,my_id,MPI_COMM_WORLD); 73 | } 74 | 75 | MPI_Finalize(); 76 | 77 | 78 | return 0; 79 | } 80 | -------------------------------------------------------------------------------- /hw4/src/serial: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/src/serial -------------------------------------------------------------------------------- /hw4/src/serial.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #define row 1000 4 | #define col 1000 5 | 6 | int matrix[row][col]; 7 | int vector[col]; 8 | 9 | int res[row]; 10 | int main(){ 11 | for(int i = 0;i < col;i++){ 12 | vector[i] = rand() % 10;//Initialize Vector 13 | } 14 | for(int i = 0;i < row;i++){ 15 | for(int j = 0;j < col;j++){ 16 | matrix[i][j] = rand() % 10;//Initialize Matrix 17 | } 18 | } 19 | for(int i = 0;i < row;i++){ 20 | for(int j = 0;j < col;j++){ 21 | res[i] += matrix[i][j] * vector[j];//Multiplication 22 | } 23 | } 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /hw5/Homework-5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw5/Homework-5.pdf -------------------------------------------------------------------------------- /hw5/Q1/1.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <omp.h> 5 | #include <sys/time.h> 6 | struct compressed_matrix 7 | { 8 | int row_size; 9 | int col_size; 10 | int element_size; 11 | int* row; 12 | int* col; 13 | double* element; 14 | }; 15 | // Generate a random double in [min, max) 16 | double getRandData(int min,int max){ 17 | double m1 = (double)(rand() % 101) / 101; 18 | min ++; 19 | double m2 = (double)((rand() % (max - min + 1) + min)); 20 | m2--; 21 | return m1 + m2; 22 | } 23 | double* init_vector(int vector_size){ 24 | double* vector = (double*)malloc(sizeof(double) * vector_size); 25 | for(int i = 0;i < vector_size;i++){ 26 | vector[i] = getRandData(1,10); 27 | } 28 | return vector; 29 | } 30 | 31 | struct compressed_matrix init_matrix(FILE*stream){ 32 | int row_size,col_size,element_size; 33 | fscanf(stream,"%d%d%d",&row_size,&col_size,&element_size); 34 | struct compressed_matrix mat; 35 | mat.col_size = col_size; 36 | mat.row_size = row_size; 37 | mat.element_size = element_size; 38 | mat.row = (int*)malloc(row_size * sizeof(int)); 39 | mat.col = (int*)malloc(element_size * sizeof(int)); 40 | mat.element = (double*)malloc(element_size * sizeof(double)); 41 | for(int i = 0;i < row_size;i++){ 42 | fscanf(stream,"%d",&mat.row[i]); 43 | mat.row[i]--; 44 | } 45 | for(int i = 0;i < element_size;i++){ 46 | fscanf(stream,"%d",&mat.col[i]);
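// index values in the input file are 1-based; convert them to 0-based (the row array above is shifted the same way)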
47 | mat.col[i]--; 48 | } 49 | for(int i = 0;i < element_size;i++){ 50 | fscanf(stream,"%lf",&mat.element[i]); 51 | } 52 | return mat; 53 | } 54 | 55 | double* serial(struct compressed_matrix matrix,double* vector){ 56 | double*res = (double*)malloc(matrix.row_size * sizeof(double)); 57 | memset(res,0,matrix.row_size * sizeof(double)); 58 | for(int i = 0;i < matrix.row_size;i++){ 59 | int begin = matrix.row[i]; 60 | int end; 61 | if(i < matrix.row_size - 1) 62 | end = matrix.row[i + 1] - 1; 63 | else 64 | end = matrix.element_size - 1; 65 | for(int j = begin;j <= end;j++){ 66 | res[i] += vector[matrix.col[j]] * matrix.element[j]; 67 | } 68 | } 69 | return res; 70 | } 71 | double* parallel(struct compressed_matrix matrix,double* vector,int thread_count){ 72 | double*res = (double*)malloc(matrix.row_size * sizeof(double)); 73 | memset(res,0,matrix.row_size * sizeof(double)); 74 | #pragma omp parallel for num_threads(thread_count) 75 | for(int i = 0;i < matrix.row_size;i++){ 76 | int begin = matrix.row[i]; 77 | int end; 78 | if(i < matrix.row_size - 1) 79 | end = matrix.row[i + 1] - 1; 80 | else 81 | end = matrix.element_size - 1; 82 | for(int j = begin;j <= end;j++){ 83 | res[i] += vector[matrix.col[j]] * matrix.element[j]; 84 | } 85 | } 86 | return res; 87 | } 88 | void print_vector(double*res,int len){ 89 | printf("{"); 90 | for(int i = 0;i < len;i++){ 91 | printf("%f",res[i]); 92 | if(i < len - 1)printf(","); 93 | } 94 | printf("}\n"); 95 | } 96 | void free_matrix(struct compressed_matrix matrix){ 97 | free(matrix.col); 98 | free(matrix.row); 99 | free(matrix.element); 100 | } 101 | int main(int argc, const char * argv[]){ 102 | if(argc < 3){ 103 | abort(); 104 | } 105 | int thread_count = atoi(argv[1]); 106 | FILE* matrix_file = fopen(argv[2],"r"); 107 | struct timeval begin,end; 108 | struct compressed_matrix matrix = init_matrix(matrix_file); 109 | double* vector = init_vector(matrix.col_size); 110 | gettimeofday(&begin,NULL); 111 | double* res = serial(matrix,vector); 112 | gettimeofday(&end,NULL); 113 | printf("\n\n"); 114 | printf(" Serial_time is:%ldμs\n",end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec); 115 | gettimeofday(&begin,NULL); 116 | double* parallel_res = parallel(matrix,vector,thread_count); 117 | gettimeofday(&end,NULL); 118 | printf("\n\n"); 119 | printf(" There are %d threads.\n",thread_count); 120 | printf(" Parallel_time is:%ldμs\n",end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec); 121 | //print_vector(res,matrix.row_size); 122 | //print_vector(parallel_res,matrix.row_size); 123 | free_matrix(matrix); 124 | free(vector); 125 | free(res); 126 | fclose(matrix_file); 127 | free(parallel_res); 128 | } -------------------------------------------------------------------------------- /hw5/Q2/2.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw5/Q2/2.1 -------------------------------------------------------------------------------- /hw5/Q2/2.1.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include <queue> 4 | #include <semaphore.h> 5 | #include <omp.h> 6 | 7 | 8 | #define MaxSize 50 9 | using namespace std; 10 | 11 | typedef int resources; 12 | sem_t n, s; 13 | 14 | queue<resources> all_resources; 15 | 16 | resources produce(){ 17 | for(int i = 0;i < 5000;i++); 18 | if(all_resources.empty()){ 19 | return 1; 20 | } 21 |
else{ 22 | return all_resources.back() + 1; 23 | } 24 | } 25 | 26 | void consume(resources i){ 27 | printf("Resource %d is popped.\n",i); 28 | } 29 | 30 | void producer(){ 31 | while(true){ 32 | resources x = produce(); 33 | sem_wait(&s); 34 | all_resources.push(x); 35 | printf("Resource %d is pushed.\n",x); 36 | sem_post(&s); 37 | sem_post(&n); 38 | } 39 | } 40 | void consumer(){ 41 | while(true){ 42 | resources x; 43 | sem_wait(&n); 44 | sem_wait(&s); 45 | x = all_resources.front(); 46 | all_resources.pop(); 47 | sem_post(&s); 48 | consume(x); 49 | } 50 | } 51 | 52 | int main(){ 53 | sem_init(&n,0,0); 54 | sem_init(&s,0,1); 55 | #pragma omp parallel sections 56 | { 57 | #pragma omp section 58 | { 59 | producer(); 60 | } 61 | #pragma omp section 62 | { 63 | consumer(); 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /hw5/Q2/2.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw5/Q2/2.2 -------------------------------------------------------------------------------- /hw5/Q2/2.2.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include <queue> 4 | #include <semaphore.h> 5 | #include <omp.h> 6 | 7 | 8 | #define MaxSize 50 9 | using namespace std; 10 | 11 | typedef int resources; 12 | sem_t n, s; 13 | 14 | queue<resources> all_resources; 15 | 16 | resources produce(){ 17 | for(int i = 0;i < 1000;i++) 18 | for(int j = 0;j < 1000;j++); 19 | if(all_resources.empty()){ 20 | return 1; 21 | } 22 | else{ 23 | return all_resources.back() + 1; 24 | } 25 | } 26 | 27 | void consume(resources i,int id){ 28 | printf("Resource %d is popped by thread %d.\n",i,id); 29 | } 30 | 31 | void producer(int id){ 32 | while(true){ 33 | resources x = produce(); 34 | sem_wait(&s); 35 | all_resources.push(x); 36 | printf("Resource %d is pushed by thread %d.\n",x,id); 37 | sem_post(&s); 38 | sem_post(&n); 39 | if(id == MaxSize)break; 40 | } 41 | } 42 | void consumer(int id){ 43 | while(true){ 44 | resources x; 45 | sem_wait(&n); 46 | sem_wait(&s); 47 | x = all_resources.front(); 48 | all_resources.pop(); 49 | sem_post(&s); 50 | consume(x,id); 51 | } 52 | } 53 | 54 | int main(){ 55 | int p_count = 8; 56 | int c_count = 4; 57 | sem_init(&n,0,0); 58 | sem_init(&s,0,1); 59 | 60 | #pragma omp parallel num_threads(p_count + c_count) 61 | { 62 | int id = omp_get_thread_num(); 63 | #pragma omp parallel sections 64 | { 65 | #pragma omp section 66 | { 67 | if(id < p_count) 68 | producer(id); 69 | } 70 | #pragma omp section 71 | { 72 | if(id >= p_count) 73 | consumer(id); 74 | } 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /hw5/Q3/measure.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <mpi.h> 4 | #define BUFLEN 1000000 5 | 6 | char buf[BUFLEN]; 7 | 8 | int main(int argc, char **argv) 9 | { 10 | int my_id = 0; 11 | int p; 12 | MPI_Init(&argc, &argv); 13 | MPI_Comm_rank(MPI_COMM_WORLD, &my_id); 14 | MPI_Comm_size(MPI_COMM_WORLD, &p); 15 | MPI_Status status_p; 16 | MPI_Barrier(MPI_COMM_WORLD); 17 | double my_start,my_end,my_elapsed,elapsed; 18 | my_start = MPI_Wtime(); 19 | if(my_id == 0){ 20 | for(int i = 1;i < p;i++){ 21 | MPI_Send(buf,BUFLEN,MPI_CHAR,i,i,MPI_COMM_WORLD); 22 | } 23 | } 24 | else{
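// every non-root rank receives BUFLEN bytes from rank 0; the MPI_MAX reduction below takes the slowest rank's elapsed time as the transfer-time estimate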
25 | MPI_Recv(buf,BUFLEN,MPI_CHAR,0,my_id,MPI_COMM_WORLD,&status_p); 26 | } 27 | my_end = MPI_Wtime(); 28 | 29 | my_elapsed = my_end - my_start; 30 | 31 | MPI_Reduce(&my_elapsed,&elapsed,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); 32 | if(my_id == 0){ 33 | printf("Time delay is %f s\n",elapsed); 34 | printf("Bandwidth is %f Mbit/s\n", BUFLEN * 8.0 / (1000000.0 * elapsed)); 35 | } 36 | MPI_Finalize(); 37 | 38 | 39 | return 0; 40 | } 41 | 42 | -------------------------------------------------------------------------------- /hw5/report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw5/report.pdf -------------------------------------------------------------------------------- /hw6/Homework-6/Homework-6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw6/Homework-6/Homework-6.pdf -------------------------------------------------------------------------------- /hw6/Homework-6/error-test.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG 4 | 5 | 6 | __global__ void vector_add(double *C, const double *A, const double *B, int N) 7 | { 8 | // Add the kernel code 9 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | // Do not try to access past the allocated memory 12 | if (idx < N) { 13 | C[idx] = A[idx] + B[idx]; 14 | } 15 | } 16 | 17 | 18 | int main(void) 19 | { 20 | const int N = 20; 21 | const int ThreadsInBlock = 128; 22 | double *dA, *dB, *dC; 23 | double hA[N], hB[N], hC[N]; 24 | 25 | for(int i = 0; i < N; ++i) { 26 | hA[i] = (double) i; 27 | hB[i] = (double) i * i; 28 | } 29 | 30 | /* 31 | Add memory allocations and copies. Wrap your runtime function 32 | calls with CUDA_CHECK( ) macro 33 | */ 34 | CUDA_CHECK( cudaMalloc((void**)&dA, sizeof(double)*N) ); 35 | #error Add the remaining memory allocations and copies 36 | 37 | // Note the maximum size of threads in a block 38 | dim3 grid, threads; 39 | 40 | //// Add the kernel call here 41 | #error Add the CUDA kernel call 42 | 43 | 44 | // Here we add an explicit synchronization so that we catch errors 45 | // as early as possible. Don't do this in production code! 46 | cudaDeviceSynchronize(); 47 | CHECK_ERROR_MSG("vector_add kernel"); 48 | 49 | //// Copy back the results and free the device memory 50 | #error Copy back the results and free the allocated memory 51 | 52 | for (int i = 0; i < N; i++) 53 | printf("%5.1f\n", hC[i]); 54 | 55 | return 0; 56 | } -------------------------------------------------------------------------------- /hw6/Homework-6/error_check_1.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples.
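// Typical usage (see error-test.cu): wrap runtime calls as CUDA_CHECK( cudaMalloc((void**)&dA, sizeof(double)*N) ); and call CHECK_ERROR_MSG("vector_add kernel") right after a kernel launch.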
3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/Homework-6/error_checks.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples. 3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/Homework-6/jacobi.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <sys/time.h> 3 | #include "jacobi.h" 4 | #include "error_checks.h" 5 | 6 | // Change this to 0 if CPU reference result is not needed 7 | #define COMPUTE_CPU_REFERENCE 1 8 | #define MAX_ITERATIONS 3000 9 | 10 | // CPU kernel 11 | void sweepCPU(double* phi, const double *phiPrev, const double *source, 12 | double h2, int N) 13 | { 14 | int i, j; 15 | int index, i1, i2, i3, i4; 16 | 17 | for (j = 1; j < N-1; j++) { 18 | for (i = 1; i < N-1; i++) { 19 | index = i + j*N; 20 | i1 = (i-1) + j * N; 21 | i2 = (i+1) + j * N; 22 | i3 = i + (j-1) * N; 23 | i4 = i + (j+1) * N; 24 | phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + 25 | phiPrev[i3] + phiPrev[i4] - 26 | h2 * source[index]); 27 | } 28 | } 29 | } 30 | 31 | // GPU kernel 32 | __global__ 33 | void sweepGPU(double *phi, const double *phiPrev, const double *source, 34 | double h2, int N) 35 | { 36 | #error Add here the GPU version of the update routine (see sweepCPU above) 37 | } 38 | 39 | 40 | double compareArrays(const double *a, const double *b, int N) 41 | { 42 | double error = 0.0; 43 | int i; 44 | for (i = 0; i < N*N; i++) { 45 | error += fabs(a[i] - b[i]); 46 | } 47 | return error/(N*N); 48 | } 49 | 50 | 51 | double diffCPU(const double *phi, const double *phiPrev, int N) 52 | { 53 | int i; 54 | double sum = 0; 55 | double diffsum = 0; 56 | 57 | for (i = 0; i < N*N; i++) { 58 | diffsum += (phi[i] - phiPrev[i]) * (phi[i] -
phiPrev[i]); 59 | sum += phi[i] * phi[i]; 60 | } 61 | 62 | return sqrt(diffsum/sum); 63 | } 64 | 65 | 66 | int main() 67 | { 68 | timeval t1, t2; // Structs for timing 69 | const int N = 512; 70 | double h = 1.0 / (N - 1); 71 | int iterations; 72 | const double tolerance = 5e-4; // Stopping condition 73 | int i, j, index; 74 | 75 | const int blocksize = 16; 76 | 77 | double *phi = new double[N*N]; 78 | double *phiPrev = new double[N*N]; 79 | double *source = new double[N*N]; 80 | double *phi_cuda = new double[N*N]; 81 | 82 | double *phi_d, *phiPrev_d, *source_d; 83 | // Size of the arrays in bytes 84 | const int size = N*N*sizeof(double); 85 | double diff; 86 | 87 | // Source initialization 88 | for (i = 0; i < N; i++) { 89 | for (j = 0; j < N; j++) { 90 | double x, y; 91 | x = (i - N / 2) * h; 92 | y = (j - N / 2) * h; 93 | index = j + i * N; 94 | if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1) 95 | source[index] = 1e10*h*h; 96 | else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1) 97 | source[index] = -1e10*h*h; 98 | else 99 | source[index] = 0.0; 100 | } 101 | } 102 | 103 | CUDA_CHECK( cudaMalloc( (void**)&source_d, size) ); 104 | CUDA_CHECK( cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice) ); 105 | 106 | // Reset values to zero 107 | for (i = 0; i < N; i++) { 108 | for (j = 0; j < N; j++) { 109 | index = j + i * N; 110 | phi[index] = 0.0; 111 | phiPrev[index] = 0.0; 112 | } 113 | } 114 | 115 | CUDA_CHECK( cudaMalloc( (void**)&phi_d, size) ); 116 | CUDA_CHECK( cudaMalloc( (void**)&phiPrev_d, size) ); 117 | CUDA_CHECK( cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice) ); 118 | CUDA_CHECK( cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice) ); 119 | 120 | // CPU version 121 | if(COMPUTE_CPU_REFERENCE) { 122 | gettimeofday(&t1, NULL); 123 | 124 | // Do sweeps until difference is under the tolerance 125 | diff = tolerance * 2; 126 | iterations = 0; 127 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 128 | sweepCPU(phiPrev, phi, source, h * h, N); 129 | sweepCPU(phi, phiPrev, source, h * h, N); 130 | 131 | iterations += 2; 132 | if (iterations % 100 == 0) { 133 | diff = diffCPU(phi, phiPrev, N); 134 | printf("%d %g\n", iterations, diff); 135 | } 136 | } 137 | gettimeofday(&t2, NULL); 138 | printf("CPU Jacobi: %g seconds, %d iterations\n", 139 | t2.tv_sec - t1.tv_sec + 140 | (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 141 | } 142 | 143 | // GPU version 144 | 145 | dim3 dimBlock(blocksize, blocksize); 146 | dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize); 147 | 148 | //do sweeps until diff under tolerance 149 | diff = tolerance * 2; 150 | iterations = 0; 151 | 152 | gettimeofday(&t1, NULL); 153 | 154 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 155 | // See above how the CPU update kernel is called 156 | // and implement similar calling sequence for the GPU code 157 | 158 | //// Add routines here 159 | #error Add GPU kernel calls here (see CPU version above) 160 | 161 | iterations += 2; 162 | 163 | if (iterations % 100 == 0) { 164 | // diffGPU is defined in the header file; it uses 165 | // the Thrust library for the reduction computation 166 | diff = diffGPU(phiPrev_d, phi_d, N); 167 | CHECK_ERROR_MSG("Difference computation"); 168 | printf("%d %g\n", iterations, diff); 169 | } 170 | } 171 | 172 | //// Add here the routine to copy back the results 173 | #error Copy back the results 174 | 175 | gettimeofday(&t2, NULL); 176 | printf("GPU Jacobi: %g seconds, %d iterations\n", 177 | t2.tv_sec - t1.tv_sec + 178 |
(t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 179 | 180 | //// Add here the clean up code for all allocated CUDA resources 181 | #error Add here the clean up code 182 | 183 | if (COMPUTE_CPU_REFERENCE) { 184 | printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N)); 185 | } 186 | 187 | delete[] phi; 188 | delete[] phi_cuda; 189 | delete[] phiPrev; 190 | delete[] source; 191 | 192 | return EXIT_SUCCESS; 193 | } -------------------------------------------------------------------------------- /hw6/Homework-6/jacobi.h: -------------------------------------------------------------------------------- 1 | #ifndef EX3_H_ 2 | #define EX3_H_ 3 | 4 | #include <cmath> 5 | #include <thrust/device_ptr.h> 6 | #include <thrust/iterator/zip_iterator.h> 7 | #include <thrust/transform_reduce.h> 8 | #include <thrust/functional.h> 9 | // Helper function prototypes 10 | double compareArrays(const double *a, const double *b, int N); 11 | double diffCPU(const double *a, const double *b, int N); 12 | void sweepCPU(double *phi, const double *phiPrev, 13 | const double *source, double h2, int N); 14 | 15 | 16 | /* ------------------------------------------------------------------------- 17 | EXTRACURRICULAR ACTIVITIES 18 | 19 | This part provides the reduction operation (in this case the summation of 20 | the difference of two arrays) using the Thrust library. Thrust mimics the 21 | syntax and design of the C++ standard template library (STL). Thrust is 22 | also part of the CUDA 4 SDK. 23 | More information can be found on the Thrust home page: 24 | http://code.google.com/p/thrust/ 25 | ----------------------------------------------------------------------- */ 26 | 27 | template <typename T> 28 | class square_diff_thr : public thrust::unary_function<thrust::tuple<T, T>, T> 29 | { 30 | public: 31 | __host__ __device__ 32 | T operator()(const thrust::tuple<T, T>& x) const { 33 | return (thrust::get<1>(x) - thrust::get<0>(x)) * 34 | (thrust::get<1>(x) - thrust::get<0>(x)); 35 | } 36 | }; 37 | 38 | template <typename T> 39 | class square_thr : public thrust::unary_function<T, T> 40 | { 41 | public: 42 | __host__ __device__ 43 | T operator()(const T& x) const { 44 | return x*x; 45 | } 46 | }; 47 | 48 | template <typename T> 49 | T diffGPU(T *A_d, T *B_d, int N) 50 | { 51 | typedef thrust::device_ptr<T> FloatIterator; 52 | typedef thrust::tuple<FloatIterator, FloatIterator> IteratorTuple; 53 | typedef thrust::zip_iterator<IteratorTuple> ZipIterator; 54 | 55 | thrust::device_ptr<T> A_ptr(A_d); 56 | thrust::device_ptr<T> B_ptr(B_d); 57 | 58 | ZipIterator first = 59 | thrust::make_zip_iterator(thrust::make_tuple(A_ptr, B_ptr)); 60 | ZipIterator last = 61 | thrust::make_zip_iterator(thrust::make_tuple(A_ptr + N*N, 62 | B_ptr + N*N)); 63 | 64 | T a1 = thrust::transform_reduce(first, last, square_diff_thr<T>(), 65 | static_cast<T>(0), thrust::plus<T>()); 66 | T a2 = thrust::transform_reduce(B_ptr, B_ptr + N*N, 67 | square_thr<T>(), static_cast<T>(0), 68 | thrust::plus<T>()); 69 | 70 | return sqrt(a1/a2); 71 | } 72 | 73 | 74 | #endif // EX3_H_ -------------------------------------------------------------------------------- /hw6/report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw6/report.pdf -------------------------------------------------------------------------------- /hw6/src/1.0/error-test.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG 4 | 5 | 6 | __global__ void vector_add(double *C, const double *A, const double *B, int N) 7 | { 8 | // Add the kernel code 9 | int idx = blockIdx.x *
blockDim.x + threadIdx.x; 10 | 11 | // Do not try to access past the allocated memory 12 | if (idx < N) { 13 | C[idx] = A[idx] + B[idx]; 14 | } 15 | } 16 | 17 | 18 | int main(void) 19 | { 20 | const int N = 20; 21 | const int ThreadsInBlock = 128; 22 | double *dA, *dB, *dC; 23 | double hA[N], hB[N], hC[N]; 24 | 25 | for(int i = 0; i < N; ++i) { 26 | hA[i] = (double) i; 27 | hB[i] = (double) i * i; 28 | } 29 | 30 | /* 31 | Add memory allocations and copies. Wrap your runtime function 32 | calls with CUDA_CHECK( ) macro 33 | */ 34 | CUDA_CHECK(cudaMalloc((void**)&dA, sizeof(double)*N)); 35 | CUDA_CHECK(cudaMalloc((void**)&dB, sizeof(double)*N)); 36 | CUDA_CHECK(cudaMalloc((void**)&dC, sizeof(double)*N)); 37 | 38 | // Note the maximum size of threads in a block 39 | dim3 grid, threads; 40 | 41 | CUDA_CHECK(cudaMemcpy(dA, hA, sizeof(double)*N, cudaMemcpyHostToDevice)); 42 | CUDA_CHECK(cudaMemcpy(dB, hB, sizeof(double)*N, cudaMemcpyHostToDevice)); 43 | 44 | //// Add the kernel call here 45 | vector_add<<<1,ThreadsInBlock>>>(dC, dA, dB, N); 46 | 47 | 48 | // Here we add an explicit synchronization so that we catch errors 49 | // as early as possible. Don't do this in production code! 50 | cudaDeviceSynchronize(); 51 | CHECK_ERROR_MSG("vector_add kernel"); 52 | 53 | //// Copy back the results and free the device memory 54 | CUDA_CHECK(cudaMemcpy(hC, dC, sizeof(double)*N, cudaMemcpyDeviceToHost)); 55 | 56 | CUDA_CHECK(cudaFree(dA)); 57 | CUDA_CHECK(cudaFree(dB)); 58 | CUDA_CHECK(cudaFree(dC)); 59 | for (int i = 0; i < N; i++) 60 | printf("%5.1f\n", hC[i]); 61 | 62 | return 0; 63 | } 64 | -------------------------------------------------------------------------------- /hw6/src/1.0/error_check_1.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples. 3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.0/error_checks.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples.
3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.1/error-test.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG 4 | 5 | 6 | __global__ void vector_add(double *C, const double *A, const double *B, int N) 7 | { 8 | // Add the kernel code 9 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | // Do not try to access past the allocated memory 12 | if (idx < N) { 13 | C[idx] = A[idx] + B[idx]; 14 | } 15 | } 16 | 17 | 18 | int main(void) 19 | { 20 | const int N = 20; 21 | const int ThreadsInBlock = 12800; 22 | double *dA, *dB, *dC; 23 | double hA[N], hB[N], hC[N]; 24 | 25 | for(int i = 0; i < N; ++i) { 26 | hA[i] = (double) i; 27 | hB[i] = (double) i * i; 28 | } 29 | 30 | /* 31 | Add memory allocations and copies. Wrap your runtime function 32 | calls with CUDA_CHECK( ) macro 33 | */ 34 | CUDA_CHECK(cudaMalloc((void**)&dA, sizeof(double)*N)); 35 | CUDA_CHECK(cudaMalloc((void**)&dB, sizeof(double)*N)); 36 | CUDA_CHECK(cudaMalloc((void**)&dC, sizeof(double)*N)); 37 | 38 | // Note the maximum size of threads in a block 39 | dim3 grid, threads; 40 | 41 | CUDA_CHECK(cudaMemcpy(dA, hA, sizeof(double)*N, cudaMemcpyHostToDevice)); 42 | CUDA_CHECK(cudaMemcpy(dB, hB, sizeof(double)*N, cudaMemcpyHostToDevice)); 43 | 44 | //// Add the kernel call here 45 | vector_add<<<1,ThreadsInBlock>>>(dC, dA, dB, N); 46 | 47 | 48 | // Here we add an explicit synchronization so that we catch errors 49 | // as early as possible. Don't do this in production code! 50 | //cudaDeviceSynchronize(); 51 | //CHECK_ERROR_MSG("vector_add kernel"); 52 | 53 | //// Copy back the results and free the device memory 54 | CUDA_CHECK(cudaMemcpy(hC, dC, sizeof(double)*N, cudaMemcpyDeviceToHost)); 55 | 56 | 57 | for (int i = 0; i < N; i++) 58 | printf("%5.1f\n", hC[i]); 59 | 60 | CUDA_CHECK(cudaFree(dA)); 61 | CUDA_CHECK(cudaFree(dB)); 62 | CUDA_CHECK(cudaFree(dC)); 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /hw6/src/1.1/error_check_1.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples.
3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.1/error_checks.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples. 3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.2/error-test.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG 4 | 5 | 6 | __global__ void vector_add(double *C, const double *A, const double *B, int N) 7 | { 8 | // Add the kernel code 9 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | // Do not try to access past the allocated memory 12 | if (idx < N) { 13 | C[idx] = A[idx] + B[idx]; 14 | } 15 | } 16 | 17 | 18 | int main(void) 19 | { 20 | const int N = 20; 21 | const int ThreadsInBlock = 128; 22 | double *dA, *dB, *dC; 23 | double hA[N], hB[N], hC[N]; 24 | 25 | for(int i = 0; i < N; ++i) { 26 | hA[i] = (double) i; 27 | hB[i] = (double) i * i; 28 | } 29 | 30 | /* 31 | Add memory allocations and copies. 
Wrap your runtime function 32 | calls with CUDA_CHECK( ) macro 33 | */ 34 | CUDA_CHECK(cudaMalloc((void**)&dA, sizeof(double)*N)); 35 | CUDA_CHECK(cudaMalloc((void**)&dB, sizeof(double)*N)); 36 | CUDA_CHECK(cudaMalloc((void**)&dC, sizeof(double)*N)); 37 | 38 | // Note the maximum size of threads in a block 39 | dim3 grid, threads; 40 | 41 | CUDA_CHECK(cudaMemcpy(dA, hA, sizeof(double)*N, cudaMemcpyHostToDevice)); 42 | CUDA_CHECK(cudaMemcpy(dB, hB, sizeof(double)*N, cudaMemcpyHostToDevice)); 43 | 44 | //// Add the kernel call here 45 | vector_add<<<1,ThreadsInBlock>>>(dC, dA, dB, N); 46 | 47 | printf("Pointer to device memory: %d",*dA); 48 | 49 | // Here we add an explicit synchronization so that we catch errors 50 | // as early as possible. Don't do this in production code! 51 | cudaDeviceSynchronize(); 52 | CHECK_ERROR_MSG("vector_add kernel"); 53 | 54 | //// Copy back the results and free the device memory 55 | CUDA_CHECK(cudaMemcpy(hC, dC, sizeof(double)*N, cudaMemcpyDeviceToHost)); 56 | 57 | 58 | for (int i = 0; i < N; i++) 59 | printf("%5.1f\n", hC[i]); 60 | 61 | CUDA_CHECK(cudaFree(dA)); 62 | CUDA_CHECK(cudaFree(dB)); 63 | CUDA_CHECK(cudaFree(dC)); 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /hw6/src/1.2/error_check_1.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples. 3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.2/error_checks.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples.
3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.3/error-test.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG 4 | 5 | 6 | __global__ void vector_add(int* x,double *C, const double *A, const double *B, int N) 7 | { 8 | // Add the kernel code 9 | int idx = *x * blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | // Do not try to access past the allocated memory 12 | if (idx < N) { 13 | C[idx] = A[idx] + B[idx]; 14 | } 15 | } 16 | 17 | 18 | int main(void) 19 | { 20 | const int N = 20; 21 | const int ThreadsInBlock = 128; 22 | double *dA, *dB, *dC; 23 | double hA[N], hB[N], hC[N]; 24 | int *x = (int*)malloc(sizeof(int)); 25 | *x = 1; 26 | 27 | for(int i = 0; i < N; ++i) { 28 | hA[i] = (double) i; 29 | hB[i] = (double) i * i; 30 | } 31 | 32 | /* 33 | Add memory allocations and copies. Wrap your runtime function 34 | calls with CUDA_CHECK( ) macro 35 | */ 36 | CUDA_CHECK(cudaMalloc((void**)&dA, sizeof(double)*N)); 37 | CUDA_CHECK(cudaMalloc((void**)&dB, sizeof(double)*N)); 38 | CUDA_CHECK(cudaMalloc((void**)&dC, sizeof(double)*N)); 39 | 40 | // Note the maximum size of threads in a block 41 | dim3 grid, threads; 42 | 43 | CUDA_CHECK(cudaMemcpy(dA, hA, sizeof(double)*N, cudaMemcpyHostToDevice)); 44 | CUDA_CHECK(cudaMemcpy(dB, hB, sizeof(double)*N, cudaMemcpyHostToDevice)); 45 | 46 | //// Add the kernel call here 47 | vector_add<<<1,ThreadsInBlock>>>(x,dC, dA, dB, N); 48 | 49 | //printf("Pointer to device memory: %d",*dA); 50 | 51 | // Here we add an explicit synchronization so that we catch errors 52 | // as early as possible. Don't do this in production code! 53 | cudaDeviceSynchronize(); 54 | CHECK_ERROR_MSG("vector_add kernel"); 55 | 56 | //// Copy back the results and free the device memory 57 | CUDA_CHECK(cudaMemcpy(hC, dC, sizeof(double)*N, cudaMemcpyDeviceToHost)); 58 | 59 | 60 | for (int i = 0; i < N; i++) 61 | printf("%5.1f\n", hC[i]); 62 | 63 | CUDA_CHECK(cudaFree(dA)); 64 | CUDA_CHECK(cudaFree(dB)); 65 | CUDA_CHECK(cudaFree(dC)); 66 | free(x); 67 | return 0; 68 | } 69 | -------------------------------------------------------------------------------- /hw6/src/1.3/error_check_1.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples.
3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.3/error_checks.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples. 3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/2/jacobi.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <sys/time.h> 3 | #include "jacobi.h" 4 | #include "error_checks.h" 5 | 6 | // Change this to 0 if CPU reference result is not needed 7 | #define COMPUTE_CPU_REFERENCE 1 8 | #define MAX_ITERATIONS 3000 9 | 10 | // CPU kernel 11 | void sweepCPU(double* phi, const double *phiPrev, const double *source, 12 | double h2, int N) 13 | { 14 | int i, j; 15 | int index, i1, i2, i3, i4; 16 | 17 | for (j = 1; j < N-1; j++) { 18 | for (i = 1; i < N-1; i++) { 19 | index = i + j*N; 20 | i1 = (i-1) + j * N; 21 | i2 = (i+1) + j * N; 22 | i3 = i + (j-1) * N; 23 | i4 = i + (j+1) * N; 24 | phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + 25 | phiPrev[i3] + phiPrev[i4] - 26 | h2 * source[index]); 27 | } 28 | } 29 | } 30 | 31 | // GPU kernel 32 | __global__ void sweepGPU(double *phi, const double *phiPrev, const double *source, 33 | double h2, int N) 34 | { 35 | //#error Add here the GPU version of the update routine (see sweepCPU above) 36 | int i, j; 37 | int index, i1, i2, i3, i4; 38 | 39 | i = blockIdx.x * blockDim.x + threadIdx.x; 40 | j = blockIdx.y * blockDim.y + threadIdx.y; 41 | 42 | if(i < N - 1 && j < N - 1 && i > 0 && j > 0) { 43 | index = i + j*N; 44 | i1 = (i-1) + j * N; 45 | i2 = (i+1) + j * N; 46 | i3 = i + (j-1) * N; 47 | i4 = i + (j+1) * N; 48 | phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + 49 | phiPrev[i3] + phiPrev[i4] - 50 | h2 * source[index]); 51 | } 52 | } 53 | 54 | 55 | double
compareArrays(const double *a, const double *b, int N) 56 | { 57 | double error = 0.0; 58 | int i; 59 | for (i = 0; i < N*N; i++) { 60 | error += fabs(a[i] - b[i]); 61 | } 62 | return error/(N*N); 63 | } 64 | 65 | 66 | double diffCPU(const double *phi, const double *phiPrev, int N) 67 | { 68 | int i; 69 | double sum = 0; 70 | double diffsum = 0; 71 | 72 | for (i = 0; i < N*N; i++) { 73 | diffsum += (phi[i] - phiPrev[i]) * (phi[i] - phiPrev[i]); 74 | sum += phi[i] * phi[i]; 75 | } 76 | 77 | return sqrt(diffsum/sum); 78 | } 79 | 80 | 81 | int main() 82 | { 83 | timeval t1, t2; // Structs for timing 84 | const int N = 512; 85 | double h = 1.0 / (N - 1); 86 | int iterations; 87 | const double tolerance = 5e-4; // Stopping condition 88 | int i, j, index; 89 | 90 | const int blocksize = 16; 91 | 92 | double *phi = new double[N*N]; 93 | double *phiPrev = new double[N*N]; 94 | double *source = new double[N*N]; 95 | double *phi_cuda = new double[N*N]; 96 | 97 | double *phi_d, *phiPrev_d, *source_d; 98 | // Size of the arrays in bytes 99 | const int size = N*N*sizeof(double); 100 | double diff; 101 | 102 | // Source initialization 103 | for (i = 0; i < N; i++) { 104 | for (j = 0; j < N; j++) { 105 | double x, y; 106 | x = (i - N / 2) * h; 107 | y = (j - N / 2) * h; 108 | index = j + i * N; 109 | if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1) 110 | source[index] = 1e10*h*h; 111 | else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1) 112 | source[index] = -1e10*h*h; 113 | else 114 | source[index] = 0.0; 115 | } 116 | } 117 | 118 | CUDA_CHECK( cudaMalloc( (void**)&source_d, size) ); 119 | CUDA_CHECK( cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice) ); 120 | 121 | // Reset values to zero 122 | for (i = 0; i < N; i++) { 123 | for (j = 0; j < N; j++) { 124 | index = j + i * N; 125 | phi[index] = 0.0; 126 | phiPrev[index] = 0.0; 127 | } 128 | } 129 | 130 | CUDA_CHECK( cudaMalloc( (void**)&phi_d, size) ); 131 | CUDA_CHECK( cudaMalloc( (void**)&phiPrev_d, size) ); 132 | CUDA_CHECK( cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice) ); 133 | CUDA_CHECK( cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice) ); 134 | 135 | // CPU version 136 | if(COMPUTE_CPU_REFERENCE) { 137 | gettimeofday(&t1, NULL); 138 | 139 | // Do sweeps until difference is under the tolerance 140 | diff = tolerance * 2; 141 | iterations = 0; 142 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 143 | sweepCPU(phiPrev, phi, source, h * h, N); 144 | sweepCPU(phi, phiPrev, source, h * h, N); 145 | 146 | iterations += 2; 147 | if (iterations % 100 == 0) { 148 | diff = diffCPU(phi, phiPrev, N); 149 | printf("%d %g\n", iterations, diff); 150 | } 151 | } 152 | gettimeofday(&t2, NULL); 153 | printf("CPU Jacobi: %g seconds, %d iterations\n", 154 | t2.tv_sec - t1.tv_sec + 155 | (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 156 | } 157 | 158 | // GPU version 159 | 160 | dim3 dimBlock(blocksize, blocksize); 161 | dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize); 162 | 163 | //do sweeps until diff under tolerance 164 | diff = tolerance * 2; 165 | iterations = 0; 166 | 167 | gettimeofday(&t1, NULL); 168 | 169 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 170 | // See above how the CPU update kernel is called 171 | // and implement similar calling sequence for the GPU code 172 | 173 | //// Add routines here 174 | //#error Add GPU kernel calls here (see CPU version above) 175 | sweepGPU<<<dimGrid, dimBlock>>>(phiPrev_d, phi_d, source_d, h*h, N); 176 | sweepGPU<<<dimGrid, dimBlock>>>(phi_d, phiPrev_d, source_d,
h*h, N); 177 | 178 | iterations += 2; 179 | 180 | if (iterations % 100 == 0) { 181 | // diffGPU is defined in the header file; it uses 182 | // the Thrust library for the reduction computation 183 | diff = diffGPU(phiPrev_d, phi_d, N); 184 | CHECK_ERROR_MSG("Difference computation"); 185 | printf("%d %g\n", iterations, diff); 186 | } 187 | } 188 | 189 | //// Add here the routine to copy back the results 190 | //#error Copy back the results 191 | 192 | CUDA_CHECK( cudaMemcpy(phi_cuda, phi_d, sizeof(double)*N*N, cudaMemcpyDeviceToHost)); 193 | CUDA_CHECK( cudaMemcpy(phiPrev, phiPrev_d, sizeof(double)*N*N,cudaMemcpyDeviceToHost)); 194 | 195 | gettimeofday(&t2, NULL); 196 | printf("GPU Jacobi: %g seconds, %d iterations\n", 197 | t2.tv_sec - t1.tv_sec + 198 | (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 199 | 200 | //// Add here the clean up code for all allocated CUDA resources 201 | //#error Add here the clean up code 202 | 203 | CUDA_CHECK( cudaFree(phi_d)); 204 | CUDA_CHECK( cudaFree(phiPrev_d)); 205 | CUDA_CHECK( cudaFree(source_d)); 206 | 207 | if (COMPUTE_CPU_REFERENCE) { 208 | printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N)); 209 | } 210 | 211 | delete[] phi; 212 | delete[] phi_cuda; 213 | delete[] phiPrev; 214 | delete[] source; 215 | 216 | return EXIT_SUCCESS; 217 | } 218 | -------------------------------------------------------------------------------- /hw6/src/2/jacobi.h: -------------------------------------------------------------------------------- 1 | #ifndef EX3_H_ 2 | #define EX3_H_ 3 | 4 | #include <cmath> 5 | #include <thrust/device_ptr.h> 6 | #include <thrust/iterator/zip_iterator.h> 7 | #include <thrust/transform_reduce.h> 8 | #include <thrust/functional.h> 9 | // Helper function prototypes 10 | double compareArrays(const double *a, const double *b, int N); 11 | double diffCPU(const double *a, const double *b, int N); 12 | void sweepCPU(double *phi, const double *phiPrev, 13 | const double *source, double h2, int N); 14 | 15 | 16 | /* ------------------------------------------------------------------------- 17 | EXTRACURRICULAR ACTIVITIES 18 | 19 | This part provides the reduction operation (in this case the summation of 20 | the difference of two arrays) using the Thrust library. Thrust mimics the 21 | syntax and design of the C++ standard template library (STL). Thrust is 22 | also part of the CUDA 4 SDK.
23 | More information can be found on the Thrust home page: 24 | http://code.google.com/p/thrust/ 25 | ----------------------------------------------------------------------- */ 26 | 27 | template <typename T> 28 | class square_diff_thr : public thrust::unary_function<thrust::tuple<T, T>, T> 29 | { 30 | public: 31 | __host__ __device__ 32 | T operator()(const thrust::tuple<T, T>& x) const { 33 | return (thrust::get<1>(x) - thrust::get<0>(x)) * 34 | (thrust::get<1>(x) - thrust::get<0>(x)); 35 | } 36 | }; 37 | 38 | template <typename T> 39 | class square_thr : public thrust::unary_function<T, T> 40 | { 41 | public: 42 | __host__ __device__ 43 | T operator()(const T& x) const { 44 | return x*x; 45 | } 46 | }; 47 | 48 | template <typename T> 49 | T diffGPU(T *A_d, T *B_d, int N) 50 | { 51 | typedef thrust::device_ptr<T> FloatIterator; 52 | typedef thrust::tuple<FloatIterator, FloatIterator> IteratorTuple; 53 | typedef thrust::zip_iterator<IteratorTuple> ZipIterator; 54 | 55 | thrust::device_ptr<T> A_ptr(A_d); 56 | thrust::device_ptr<T> B_ptr(B_d); 57 | 58 | ZipIterator first = 59 | thrust::make_zip_iterator(thrust::make_tuple(A_ptr, B_ptr)); 60 | ZipIterator last = 61 | thrust::make_zip_iterator(thrust::make_tuple(A_ptr + N*N, 62 | B_ptr + N*N)); 63 | 64 | T a1 = thrust::transform_reduce(first, last, square_diff_thr<T>(), 65 | static_cast<T>(0), thrust::plus<T>()); 66 | T a2 = thrust::transform_reduce(B_ptr, B_ptr + N*N, 67 | square_thr<T>(), static_cast<T>(0), 68 | thrust::plus<T>()); 69 | 70 | return sqrt(a1/a2); 71 | } 72 | 73 | 74 | #endif // EX3_H_ --------------------------------------------------------------------------------