├── README.md
├── hw1
│   ├── Homework-1.pdf
│   ├── report.pdf
│   ├── src
│   │   ├── add
│   │   │   ├── avx_add
│   │   │   ├── avx_add.c
│   │   │   ├── serial_add
│   │   │   └── serial_add.c
│   │   └── multi-thread
│   │       ├── parallel
│   │       └── parallel.c
│   └── 参考文献
│       ├── 64-ia-32-architectures-software-developer-vol-3a-part-1-manual.pdf
│       ├── AVX-AVX2-Example-Code-master.zip
│       └── Intro_to_Intel_AVX.pdf
├── hw2
│   ├── 18340208-张洪宾-并行分布式计算作业2.pdf
│   ├── Homework-2.pdf
│   └── source code.zip
├── hw3
│   ├── Homework-3.pdf
│   ├── report.pdf
│   └── src
│       ├── Data_Race
│       │   ├── calculate_e
│       │   │   ├── calculate_e
│       │   │   ├── calculate_e.c
│       │   │   ├── complier1.sh
│       │   │   └── tsan1
│       │   └── lock
│       │       ├── complier2.sh
│       │       ├── lock
│       │       ├── lock.c
│       │       └── tsan2
│       ├── Non-reentrant
│       │   ├── Control_experiment
│       │   │   ├── Control_experiment.bc
│       │   │   ├── Control_experiment.c
│       │   │   └── Control_experiment.ll
│       │   ├── I:O
│       │   │   ├── IO.bc
│       │   │   ├── IO.c
│       │   │   └── IO.ll
│       │   ├── global
│       │   │   ├── global.bc
│       │   │   ├── global.c
│       │   │   └── global.ll
│       │   ├── malloc
│       │   │   ├── malloc.bc
│       │   │   ├── malloc.c
│       │   │   └── malloc.ll
│       │   └── static
│       │       ├── static.bc
│       │       ├── static.c
│       │       └── static.ll
│       └── analyse
│           ├── Control_experiment.ll
│           ├── IO.ll
│           ├── analyse
│           ├── analyse.cpp
│           ├── analyse.h
│           ├── global.ll
│           ├── malloc.ll
│           └── static.ll
├── hw4
│   ├── Homework-4.pdf
│   ├── performance analysis
│   │   ├── parallel
│   │   ├── parallel.c
│   │   ├── serial
│   │   └── serial.c
│   ├── readme.md
│   ├── report.pdf
│   └── src
│       ├── parallel
│       ├── parallel.c
│       ├── serial
│       └── serial.c
├── hw5
│   ├── Homework-5.pdf
│   ├── Q1
│   │   └── 1.c
│   ├── Q2
│   │   ├── 2.1
│   │   ├── 2.1.cpp
│   │   ├── 2.2
│   │   └── 2.2.cpp
│   ├── Q3
│   │   └── measure.c
│   └── report.pdf
└── hw6
    ├── Homework-6
    │   ├── Homework-6.pdf
    │   ├── error-test.cu
    │   ├── error_check_1.h
    │   ├── error_checks.h
    │   ├── jacobi.cu
    │   └── jacobi.h
    ├── report.pdf
    └── src
        ├── 2
        │   ├── jacobi.cu
        │   └── jacobi.h
        ├── 1.0
        │   ├── error-test.cu
        │   ├── error_check_1.h
        │   └── error_checks.h
        ├── 1.1
        │   ├── error-test.cu
        │   ├── error_check_1.h
        │   └── error_checks.h
        ├── 1.2
        │   ├── error-test.cu
        │   ├── error_check_1.h
        │   └── error_checks.h
        └── 1.3
            ├── error-test.cu
            ├── error_check_1.h
            └── error_checks.h
/README.md:
--------------------------------------------------------------------------------
1 | # parallel-and-distributed-computing-homework
2 | Homework for the 2020 Parallel and Distributed Computing course at Sun Yat-sen University (中山大学)
3 |
--------------------------------------------------------------------------------
/hw1/Homework-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/Homework-1.pdf
--------------------------------------------------------------------------------
/hw1/report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/report.pdf
--------------------------------------------------------------------------------
/hw1/src/add/avx_add:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/src/add/avx_add
--------------------------------------------------------------------------------
/hw1/src/add/avx_add.c:
--------------------------------------------------------------------------------
1 | /**
2 |  * Author: TripleZ
3 |  * Date: 2018-08-17
4 |  */
5 |
6 | #include <immintrin.h>   /* _mm256_* intrinsics */
7 | #include <stdio.h>       /* printf */
8 | #include <sys/time.h>    /* gettimeofday */
9 | #define N 1000000
10 | __m256i vec1[N/8 + 1];
11 | __m256i vec2[N/8 + 1];
12 | __m256i res[N/8 + 1];
13 |
14 | int main(int argc, char const *argv[]) {
15 |     for(int i = 0;i < N/8;i++){
16 |         vec1[i] = _mm256_set1_epi32(0);
17 |         vec2[i] = _mm256_set1_epi32(0);
18 |     }
19 |     struct timeval begin,end;
20 |     gettimeofday(&begin,NULL);
21 |     for(int i = 0;i < N/8;i++){
22 |         res[i] = _mm256_add_epi32(vec1[i],vec2[i]);
23 |     }
24 |     gettimeofday(&end,NULL);
25 |     printf("Avx_time is:%ldμs\n",end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec);
26 |     return 0;
27 | }
28 |
--------------------------------------------------------------------------------
/hw1/src/add/serial_add:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/src/add/serial_add
--------------------------------------------------------------------------------
/hw1/src/add/serial_add.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>      /* printf */
2 | #include <sys/time.h>   /* gettimeofday */
3 | #define N 1000000
4 |
5 | int a[N],b[N],c[N];
6 | int main(int argc, char const *argv[]) {
7 |     for(int i = 0;i < N;i++){
8 |         a[i] = 0;
9 |         b[i] = 0;
10 |     }
11 |     struct timeval begin,end;
12 |     gettimeofday(&begin,NULL);
13 |     for(int i = 0;i < N;i++){
14 |         c[i] = a[i] + b[i];
15 |     }
16 |     gettimeofday(&end,NULL);
17 |     printf("Serial_time is:%ldμs\n",end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec);
18 |     return 0;
19 | }
--------------------------------------------------------------------------------
/hw1/src/multi-thread/parallel:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/src/multi-thread/parallel
--------------------------------------------------------------------------------
/hw1/src/multi-thread/parallel.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>      /* printf, scanf */
2 | #include <omp.h>        /* omp_set_num_threads */
3 | #include <sys/time.h>   /* gettimeofday */
4 |
5 |
6 | #define N 1000000
7 |
8 | int a[N],b[N],c[N];
9 |
10 | int main(int argc,char* argv[]) {
11 |     printf("Input the number of the threads:");
12 |     int n;
13 |     scanf("%d",&n);
14 |     omp_set_num_threads(n);
15 |     struct timeval begin,end;
16 |     gettimeofday(&begin,NULL);
17 |     #pragma omp parallel for
18 |     for(int i = 0;i < N;i++){
19 |         a[i] = b[i] + c[i];
20 |     }
21 |     gettimeofday(&end,NULL);
22 |     printf("Run time with %d threads is %ld μs\n",n,end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec);
23 | }
--------------------------------------------------------------------------------
/hw1/参考文献/64-ia-32-architectures-software-developer-vol-3a-part-1-manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/参考文献/64-ia-32-architectures-software-developer-vol-3a-part-1-manual.pdf
--------------------------------------------------------------------------------
/hw1/参考文献/AVX-AVX2-Example-Code-master.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/参考文献/AVX-AVX2-Example-Code-master.zip
--------------------------------------------------------------------------------
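All three hw1 benchmarks above build their timings out of gettimeofday() second/microsecond arithmetic. For the OpenMP version, a simpler alternative is omp_get_wtime(), a standard OpenMP routine returning wall-clock seconds as a double. A minimal sketch (not part of the homework) reusing the same array names; build with gcc -fopenmp or clang -fopenmp:

    #include <omp.h>
    #include <stdio.h>
    #define N 1000000

    int a[N], b[N], c[N];

    int main(void) {
        double t0 = omp_get_wtime();       /* wall-clock time in seconds */
        #pragma omp parallel for
        for (int i = 0; i < N; i++)
            a[i] = b[i] + c[i];
        double t1 = omp_get_wtime();
        printf("Run time is %.0f μs\n", (t1 - t0) * 1e6);
        return 0;
    }

This removes the manual tv_sec/tv_usec bookkeeping and with it the format-specifier pitfalls of printing a long with %d.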
/hw1/参考文献/Intro_to_Intel_AVX.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw1/参考文献/Intro_to_Intel_AVX.pdf
--------------------------------------------------------------------------------
/hw2/18340208-张洪宾-并行分布式计算作业2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw2/18340208-张洪宾-并行分布式计算作业2.pdf
--------------------------------------------------------------------------------
/hw2/Homework-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw2/Homework-2.pdf
--------------------------------------------------------------------------------
/hw2/source code.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw2/source code.zip
--------------------------------------------------------------------------------
/hw3/Homework-3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/Homework-3.pdf
--------------------------------------------------------------------------------
/hw3/report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/report.pdf
--------------------------------------------------------------------------------
/hw3/src/Data_Race/calculate_e/calculate_e:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Data_Race/calculate_e/calculate_e
--------------------------------------------------------------------------------
/hw3/src/Data_Race/calculate_e/calculate_e.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>     /* printf */
2 | #include <stdlib.h>    /* atoi */
3 | #include <pthread.h>
4 |
5 |
6 | double res = 1;
7 | int count = 8; // number of threads
8 | int accuracy = 1000; // precision: number of series terms to sum
9 |
10 | void *Add(void *ptr) {
11 |     long thread_order = (long)ptr; // index of this thread, 1..count
12 |     long n = thread_order; // index of the current series term
13 |
14 |     long loop_num = accuracy / count;
15 |     if(accuracy % count)loop_num++;
16 |     for(long j = 0;j < loop_num;j++){
17 |         n = thread_order + j * count; // term index for this thread: n % count == thread_order % count
18 |         long long denominator = 1;
19 |         for(int i = 2;i <= n;i++){
20 |             denominator *= i; // n! overflows past n = 20; the guard below then skips the (negligible) term
21 |         }
22 |         double term = 0;
23 |         if(denominator > 0)term = 1.0/denominator;
24 |         res += term; // deliberately unsynchronized: this is the data race the assignment studies
25 |     }
26 |     return NULL;
27 | }
28 | int main(int argc,char **argv) {
29 |     if(argc == 3){
30 |         count = atoi(argv[1]);
31 |         accuracy = atoi(argv[2]);
32 |     }
33 |     pthread_t handle[count];
34 |     long i;
35 |     for(i = 1;i <= count;i++){
36 |
37 |         pthread_create(&handle[i - 1],NULL,Add,(void*)i);
38 |     }
39 |     for(int i = 1;i <= count;i++){
40 |         pthread_join(handle[i - 1],NULL);
41 |     }
42 |     printf("e = %f\n",res);
43 | }
44 |
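The unsynchronized res += term above is the race under study; the lock/ variant below serializes it with a mutex. A race can also be removed with no lock at all by giving each thread private state. A sketch (not part of the assignment): every thread returns its partial sum through pthread_join, and 1/n! is built by repeated division, so it never overflows the way the long long factorial does:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int count = 8;       /* number of threads */
    static int accuracy = 1000; /* number of series terms */

    static void *partial(void *arg) {
        long first = (long)arg;              /* this thread's first term index, 1..count */
        double *sum = malloc(sizeof *sum);   /* the partial sum travels back via the return value */
        *sum = 0.0;
        for (long n = first; n <= accuracy; n += count) {
            double term = 1.0;
            for (long i = 2; i <= n; i++)
                term /= i;                   /* term = 1/n!, computed without overflow */
            *sum += term;
        }
        return sum;
    }

    int main(void) {
        pthread_t tid[count];
        double e = 1.0;                      /* the n = 0 term */
        for (long i = 1; i <= count; i++)
            pthread_create(&tid[i - 1], NULL, partial, (void *)i);
        for (int i = 0; i < count; i++) {
            void *ret;
            pthread_join(tid[i], &ret);
            e += *(double *)ret;             /* combined in the joining thread: no shared writes */
            free(ret);
        }
        printf("e = %.15f\n", e);
        return 0;
    }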
--------------------------------------------------------------------------------
/hw3/src/Data_Race/calculate_e/complier1.sh:
--------------------------------------------------------------------------------
1 | clang calculate_e.c -o calculate_e -lpthread
2 | ./calculate_e
3 | ./calculate_e
4 | ./calculate_e
5 | ./calculate_e
6 | ./calculate_e
7 | ./calculate_e
8 | ./calculate_e
9 | ./calculate_e
10 | ./calculate_e
11 | ./calculate_e
12 | ./calculate_e
13 | ./calculate_e
14 | ./calculate_e
15 | ./calculate_e
16 | ./calculate_e
17 | ./calculate_e
18 | ./calculate_e
19 | ./calculate_e
20 | ./calculate_e
--------------------------------------------------------------------------------
/hw3/src/Data_Race/calculate_e/tsan1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Data_Race/calculate_e/tsan1
--------------------------------------------------------------------------------
/hw3/src/Data_Race/lock/complier2.sh:
--------------------------------------------------------------------------------
1 | clang lock.c -o lock -lpthread
2 | ./lock
3 | ./lock
4 | ./lock
5 | ./lock
6 | ./lock
7 | ./lock
8 | ./lock
9 | ./lock
10 | ./lock
11 | ./lock
12 | ./lock
13 | ./lock
14 | ./lock
15 | ./lock
16 | ./lock
17 | ./lock
18 | ./lock
19 | ./lock
20 | ./lock
--------------------------------------------------------------------------------
/hw3/src/Data_Race/lock/lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Data_Race/lock/lock
--------------------------------------------------------------------------------
/hw3/src/Data_Race/lock/lock.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>     /* printf */
2 | #include <stdlib.h>    /* atoi */
3 | #include <pthread.h>
4 |
5 |
6 | double res = 1;
7 | int count = 8; // number of threads
8 | int accuracy = 1000; // precision: number of series terms to sum
9 | pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
10 |
11 | void *Add(void *ptr) {
12 |     long thread_order = (long)ptr; // index of this thread, 1..count (cast through long to keep pointer width)
13 |     long n = thread_order; // index of the current series term
14 |
15 |     long loop_num = accuracy / count;
16 |     if(accuracy % count)loop_num++;
17 |     for(long j = 0;j < loop_num;j++){
18 |         n = thread_order + j * count; // term index for this thread: n % count == thread_order % count
19 |         long long denominator = 1;
20 |         for(int i = 2;i <= n;i++){
21 |             denominator *= i;
22 |         }
23 |         double term = 0;
24 |         if(denominator > 0)term = 1.0/denominator;
25 |
26 |         pthread_mutex_lock(&lock);
27 |         res += term; // the shared update is now protected by the mutex
28 |         pthread_mutex_unlock(&lock);
29 |     }
30 |     return NULL;
31 | }
32 | int main(int argc,char **argv) {
33 |     if(argc == 3){
34 |         count = atoi(argv[1]);
35 |         accuracy = atoi(argv[2]);
36 |     }
37 |     pthread_t handle[count];
38 |     long i;
39 |     for(i = 1;i <= count;i++){
40 |
41 |         pthread_create(&handle[i - 1],NULL,Add,(void*)i);
42 |     }
43 |     for(int i = 1;i <= count;i++){
44 |         pthread_join(handle[i - 1],NULL);
45 |     }
46 |     printf("%f\n",res);
47 | }
--------------------------------------------------------------------------------
/hw3/src/Data_Race/lock/tsan2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Data_Race/lock/tsan2
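lock.c depends on the mutex being initialised before the first lock; the static PTHREAD_MUTEX_INITIALIZER above does that at load time. When static initialisation is not possible (for instance, a mutex inside malloc'd memory, or one that needs attributes), the run-time form is the portable alternative; a small sketch:

    #include <pthread.h>

    pthread_mutex_t lock;

    void setup(void)    { pthread_mutex_init(&lock, NULL); }  /* before any thread is created */
    void teardown(void) { pthread_mutex_destroy(&lock); }     /* after every thread has joined */

The tsan1/tsan2 binaries shipped next to the compile scripts suggest a second build with -fsanitize=thread (ThreadSanitizer), although the scripts themselves only show the plain clang build.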
-------------------------------------------------------------------------------- /hw3/src/Non-reentrant/Control_experiment/Control_experiment.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Non-reentrant/Control_experiment/Control_experiment.bc -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/Control_experiment/Control_experiment.c: -------------------------------------------------------------------------------- 1 | int add(int a,int b){ 2 | int c = a + b; 3 | return c; 4 | } 5 | int main(){ 6 | return 0; 7 | } -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/Control_experiment/Control_experiment.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'Control_experiment.bc' 2 | source_filename = "Control_experiment.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | ; Function Attrs: noinline nounwind optnone ssp uwtable 7 | define i32 @add(i32, i32) #0 { 8 | %3 = alloca i32, align 4 9 | %4 = alloca i32, align 4 10 | %5 = alloca i32, align 4 11 | store i32 %0, i32* %3, align 4 12 | store i32 %1, i32* %4, align 4 13 | %6 = load i32, i32* %3, align 4 14 | %7 = load i32, i32* %4, align 4 15 | %8 = add nsw i32 %6, %7 16 | store i32 %8, i32* %5, align 4 17 | %9 = load i32, i32* %5, align 4 18 | ret i32 %9 19 | } 20 | 21 | ; Function Attrs: noinline nounwind optnone ssp uwtable 22 | define i32 @main() #0 { 23 | %1 = alloca i32, align 4 24 | store i32 0, i32* %1, align 4 25 | ret i32 0 26 | } 27 | 28 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 29 | 30 | !llvm.module.flags = !{!0, !1, !2} 31 | !llvm.ident = !{!3} 32 | 33 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 34 | !1 = !{i32 1, !"wchar_size", i32 4} 35 | !2 = !{i32 7, !"PIC Level", i32 2} 36 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 37 | -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/I:O/IO.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Non-reentrant/I:O/IO.bc -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/I:O/IO.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | 6 | int Global; 7 | void *fun(void *x) { 8 | Global = 42; 9 | printf("%d\n",Global); 10 | return x; 11 | } 12 | int main() { 13 | pthread_t t; 14 | pthread_create(&t, NULL, fun, NULL); 15 | Global = 43; 16 | pthread_join(t, NULL); 17 | return 
Global; 18 | } 19 | -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/I:O/IO.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'IO.bc' 2 | source_filename = "IO.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @Global = common global i32 0, align 4 11 | @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 12 | 13 | ; Function Attrs: noinline nounwind optnone ssp uwtable 14 | define i8* @fun(i8*) #0 { 15 | %2 = alloca i8*, align 8 16 | store i8* %0, i8** %2, align 8 17 | store i32 42, i32* @Global, align 4 18 | %3 = load i32, i32* @Global, align 4 19 | %4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %3) 20 | %5 = load i8*, i8** %2, align 8 21 | ret i8* %5 22 | } 23 | 24 | declare i32 @printf(i8*, ...) #1 25 | 26 | ; Function Attrs: noinline nounwind optnone ssp uwtable 27 | define i32 @main() #0 { 28 | %1 = alloca i32, align 4 29 | %2 = alloca %struct._opaque_pthread_t*, align 8 30 | store i32 0, i32* %1, align 4 31 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* null) 32 | store i32 43, i32* @Global, align 4 33 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 34 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 35 | %6 = load i32, i32* @Global, align 4 36 | ret i32 %6 37 | } 38 | 39 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 40 | 41 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 42 | 43 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 44 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 45 | 46 | !llvm.module.flags = !{!0, !1, !2} 47 | !llvm.ident = !{!3} 48 | 49 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 50 | !1 = !{i32 1, !"wchar_size", i32 4} 51 | !2 = !{i32 7, !"PIC Level", i32 2} 52 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 53 | !4 = !{!5} 54 | !5 = !{i64 2, i64 3, i1 false} 55 | 
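fun() in IO.c is flagged non-reentrant on two counts: it writes the global Global and it calls printf. A sketch (not from the homework) of a reentrant counterpart, in which all state arrives through parameters and the output goes to a caller-supplied buffer, so concurrent calls cannot interfere:

    #include <stdio.h>

    /* formats value into buf; touches no globals and no shared stream */
    int fun_r(int value, char *buf, size_t len) {
        return snprintf(buf, len, "%d\n", value);
    }

snprintf writes only to caller memory; whether it counts as fully reentrant still depends on the C library's locale handling, so this is an illustration of the pattern rather than a guarantee.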
-------------------------------------------------------------------------------- /hw3/src/Non-reentrant/global/global.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Non-reentrant/global/global.bc -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/global/global.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int global = 10; 4 | 5 | void* fun(void* argv){ 6 | global = 20; 7 | return NULL; 8 | } 9 | int main() { 10 | pthread_t t; 11 | pthread_create(&t, NULL, fun, NULL); 12 | global = 43; 13 | pthread_join(t, NULL); 14 | return global; 15 | } -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/global/global.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'global.bc' 2 | source_filename = "global.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @global = global i32 10, align 4 11 | 12 | ; Function Attrs: noinline nounwind optnone ssp uwtable 13 | define i8* @fun(i8*) #0 { 14 | %2 = alloca i8*, align 8 15 | store i8* %0, i8** %2, align 8 16 | store i32 20, i32* @global, align 4 17 | ret i8* null 18 | } 19 | 20 | ; Function Attrs: noinline nounwind optnone ssp uwtable 21 | define i32 @main() #0 { 22 | %1 = alloca i32, align 4 23 | %2 = alloca %struct._opaque_pthread_t*, align 8 24 | store i32 0, i32* %1, align 4 25 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* null) 26 | store i32 43, i32* @global, align 4 27 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 28 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 29 | %6 = load i32, i32* @global, align 4 30 | ret i32 %6 31 | } 32 | 33 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 34 | 35 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 36 | 37 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 38 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" 
"target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 39 | 40 | !llvm.module.flags = !{!0, !1, !2} 41 | !llvm.ident = !{!3} 42 | 43 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 44 | !1 = !{i32 1, !"wchar_size", i32 4} 45 | !2 = !{i32 7, !"PIC Level", i32 2} 46 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 47 | !4 = !{!5} 48 | !5 = !{i64 2, i64 3, i1 false} 49 | -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/malloc/malloc.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Non-reentrant/malloc/malloc.bc -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/malloc/malloc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | void* init(void*argv){ 4 | int* array = malloc(sizeof(int) * 10); 5 | for(int i = 0;i < 10;i++)array[i] = 0; 6 | free(array); 7 | return argv; 8 | } 9 | 10 | int main(){ 11 | pthread_t id; 12 | pthread_create(&id, NULL, init, NULL); 13 | pthread_join(id, NULL); 14 | return 0; 15 | } -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/malloc/malloc.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'malloc.bc' 2 | source_filename = "malloc.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | ; Function Attrs: noinline nounwind optnone ssp uwtable 11 | define i8* @init(i8*) #0 { 12 | %2 = alloca i8*, align 8 13 | %3 = alloca i32*, align 8 14 | %4 = alloca i32, align 4 15 | store i8* %0, i8** %2, align 8 16 | %5 = call i8* @malloc(i64 40) #3 17 | %6 = bitcast i8* %5 to i32* 18 | store i32* %6, i32** %3, align 8 19 | store i32 0, i32* %4, align 4 20 | br label %7 21 | 22 | 7: ; preds = %15, %1 23 | %8 = load i32, i32* %4, align 4 24 | %9 = icmp slt i32 %8, 10 25 | br i1 %9, label %10, label %18 26 | 27 | 10: ; preds = %7 28 | %11 = load i32*, i32** %3, align 8 29 | %12 = load i32, i32* %4, align 4 30 | %13 = sext i32 %12 to i64 31 | %14 = getelementptr inbounds i32, i32* %11, i64 %13 32 | store i32 0, i32* %14, align 4 33 | br label %15 34 | 35 | 15: ; preds = %10 36 | %16 = load i32, i32* %4, align 4 37 | %17 = add nsw i32 %16, 1 38 | store i32 %17, i32* %4, align 4 39 | br label %7 40 | 41 | 18: ; preds = %7 42 | %19 = load i32*, i32** %3, align 8 43 | %20 = bitcast i32* %19 to i8* 44 | call void @free(i8* %20) 45 | %21 = load i8*, i8** %2, align 8 46 | ret i8* %21 47 | } 48 | 49 | ; Function Attrs: allocsize(0) 50 | declare i8* @malloc(i64) #1 51 | 52 | declare void @free(i8*) #2 53 | 54 | ; Function Attrs: noinline nounwind optnone ssp uwtable 55 | define i32 @main() #0 { 56 | %1 = alloca i32, align 4 57 | %2 = alloca %struct._opaque_pthread_t*, align 8 58 | store i32 0, i32* %1, align 4 59 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, 
%struct._opaque_pthread_attr_t* null, i8* (i8*)* @init, i8* null) 60 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 61 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 62 | ret i32 0 63 | } 64 | 65 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #2 66 | 67 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #2 68 | 69 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 70 | attributes #1 = { allocsize(0) "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 71 | attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 72 | attributes #3 = { allocsize(0) } 73 | 74 | !llvm.module.flags = !{!0, !1, !2} 75 | !llvm.ident = !{!3} 76 | 77 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 78 | !1 = !{i32 1, !"wchar_size", i32 4} 79 | !2 = !{i32 7, !"PIC Level", i32 2} 80 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 81 | !4 = !{!5} 82 | !5 = !{i64 2, i64 3, i1 false} 83 | -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/static/static.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/Non-reentrant/static/static.bc -------------------------------------------------------------------------------- /hw3/src/Non-reentrant/static/static.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | 5 | void* fun(void* argv){ 6 | int* res = (int*)argv; 7 | static int x; 8 | x = *res; 9 | return NULL; 10 | } 11 | int main() { 12 | pthread_t t1,t2; 13 | pthread_create(&t1, NULL, fun, &t1); 14 | pthread_create(&t2, NULL, fun, &t2); 15 | 16 | pthread_join(t1, NULL); 17 | pthread_join(t2, NULL); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- 
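static.c above demonstrates the static-variable source of non-reentrancy: both threads write the single fun.x instance. A minimal sketch of the usual fix, giving each thread its own copy with C11 _Thread_local (gcc and clang also accept the older __thread spelling):

    #include <pthread.h>

    void *fun(void *argv) {
        static _Thread_local int x;   /* one x per thread instead of one per program */
        x = *(int *)argv;
        return NULL;
    }

With thread storage duration the two stores no longer touch the same object, so the write is race-free without any locking.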
/hw3/src/Non-reentrant/static/static.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'static.bc' 2 | source_filename = "static.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @fun.x = internal global i32 0, align 4 11 | 12 | ; Function Attrs: noinline nounwind optnone ssp uwtable 13 | define i8* @fun(i8*) #0 { 14 | %2 = alloca i8*, align 8 15 | %3 = alloca i32*, align 8 16 | store i8* %0, i8** %2, align 8 17 | %4 = load i8*, i8** %2, align 8 18 | %5 = bitcast i8* %4 to i32* 19 | store i32* %5, i32** %3, align 8 20 | %6 = load i32*, i32** %3, align 8 21 | %7 = load i32, i32* %6, align 4 22 | store i32 %7, i32* @fun.x, align 4 23 | ret i8* null 24 | } 25 | 26 | ; Function Attrs: noinline nounwind optnone ssp uwtable 27 | define i32 @main() #0 { 28 | %1 = alloca i32, align 4 29 | %2 = alloca %struct._opaque_pthread_t*, align 8 30 | %3 = alloca %struct._opaque_pthread_t*, align 8 31 | store i32 0, i32* %1, align 4 32 | %4 = bitcast %struct._opaque_pthread_t** %2 to i8* 33 | %5 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* %4) 34 | %6 = bitcast %struct._opaque_pthread_t** %3 to i8* 35 | %7 = call i32 @pthread_create(%struct._opaque_pthread_t** %3, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* %6) 36 | %8 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 37 | %9 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %8, i8** null) 38 | %10 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %3, align 8 39 | %11 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %10, i8** null) 40 | ret i32 0 41 | } 42 | 43 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 44 | 45 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 46 | 47 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 48 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 49 | 50 | !llvm.module.flags = !{!0, !1, !2} 51 | !llvm.ident = !{!3} 52 | 53 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 54 | !1 = 
!{i32 1, !"wchar_size", i32 4} 55 | !2 = !{i32 7, !"PIC Level", i32 2} 56 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 57 | !4 = !{!5} 58 | !5 = !{i64 2, i64 3, i1 false} 59 | -------------------------------------------------------------------------------- /hw3/src/analyse/Control_experiment.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'Control_experiment.bc' 2 | source_filename = "Control_experiment.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | ; Function Attrs: noinline nounwind optnone ssp uwtable 7 | define i32 @add(i32, i32) #0 { 8 | %3 = alloca i32, align 4 9 | %4 = alloca i32, align 4 10 | %5 = alloca i32, align 4 11 | store i32 %0, i32* %3, align 4 12 | store i32 %1, i32* %4, align 4 13 | %6 = load i32, i32* %3, align 4 14 | %7 = load i32, i32* %4, align 4 15 | %8 = add nsw i32 %6, %7 16 | store i32 %8, i32* %5, align 4 17 | %9 = load i32, i32* %5, align 4 18 | ret i32 %9 19 | } 20 | 21 | ; Function Attrs: noinline nounwind optnone ssp uwtable 22 | define i32 @main() #0 { 23 | %1 = alloca i32, align 4 24 | store i32 0, i32* %1, align 4 25 | ret i32 0 26 | } 27 | 28 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 29 | 30 | !llvm.module.flags = !{!0, !1, !2} 31 | !llvm.ident = !{!3} 32 | 33 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 34 | !1 = !{i32 1, !"wchar_size", i32 4} 35 | !2 = !{i32 7, !"PIC Level", i32 2} 36 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 37 | -------------------------------------------------------------------------------- /hw3/src/analyse/IO.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'IO.bc' 2 | source_filename = "IO.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @Global = common global i32 0, align 4 11 | @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 12 | 13 | ; Function Attrs: noinline nounwind optnone ssp uwtable 14 | define i8* @fun(i8*) #0 { 15 | %2 = alloca i8*, align 8 16 | store i8* %0, i8** %2, align 8 17 | store i32 42, i32* @Global, align 4 18 | %3 = load i32, i32* @Global, align 4 19 | %4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %3) 20 | %5 = load i8*, i8** %2, align 8 21 | ret i8* %5 22 | } 23 | 24 | declare i32 @printf(i8*, ...) 
#1 25 | 26 | ; Function Attrs: noinline nounwind optnone ssp uwtable 27 | define i32 @main() #0 { 28 | %1 = alloca i32, align 4 29 | %2 = alloca %struct._opaque_pthread_t*, align 8 30 | store i32 0, i32* %1, align 4 31 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* null) 32 | store i32 43, i32* @Global, align 4 33 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 34 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 35 | %6 = load i32, i32* @Global, align 4 36 | ret i32 %6 37 | } 38 | 39 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 40 | 41 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 42 | 43 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 44 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 45 | 46 | !llvm.module.flags = !{!0, !1, !2} 47 | !llvm.ident = !{!3} 48 | 49 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 50 | !1 = !{i32 1, !"wchar_size", i32 4} 51 | !2 = !{i32 7, !"PIC Level", i32 2} 52 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 53 | !4 = !{!5} 54 | !5 = !{i64 2, i64 3, i1 false} 55 | -------------------------------------------------------------------------------- /hw3/src/analyse/analyse: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw3/src/analyse/analyse -------------------------------------------------------------------------------- /hw3/src/analyse/analyse.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by zhb on 2020/5/1. 3 | // Copyright © 2020 zhb. All rights reserved. 4 | // 5 | #include "analyse.h" 6 | 7 | int main(int argc, const char * argv[]) { 8 | if(argc != 2){ 9 | cout << "Please input the LLVM IR file name!" 
<< endl;
10 |         abort();
11 |     }
12 |     string file = read_file(argv[1]);
13 |     divide(file);
14 |     Non_reentrant_fun.push_back("printf");
15 |     Non_reentrant_fun.push_back("scanf");
16 |     Non_reentrant_fun.push_back("malloc");
17 |     Non_reentrant_fun.push_back("free");
18 |
19 |     bool res = false;
20 |     for(int i = 0;i < all_function.size();i++){
21 |         module test(all_function[i]);
22 |         bool ret = test.detect_and_print();
23 |         if(ret)res = true;
24 |     }
25 |     if(!res){
26 |         cout << "No non-reentrant function detected." << endl;
27 |     }
28 | }
29 |
30 |
--------------------------------------------------------------------------------
/hw3/src/analyse/analyse.h:
--------------------------------------------------------------------------------
1 | //
2 | //  Created by zhb on 2020/5/1.
3 | //  Copyright © 2020 zhb. All rights reserved.
4 | //
5 |
6 | #include <iostream>
7 | #include <fstream>
8 | #include <sstream>
9 | #include <string>
10 | #include <vector>
11 | #include <algorithm>
12 | #include <cstdlib>
13 |
14 | using namespace std;
15 |
16 | vector<string> all_function;
17 | vector<string> Non_reentrant_fun;
18 |
19 |
20 |
21 |
22 | inline void deduplication(vector<string>& c)
23 | {
24 |     sort(c.begin(), c.end());
25 |     auto new_end = unique(c.begin(), c.end()); // unique() shifts adjacent duplicates past new_end
26 |     c.erase(new_end, c.end()); // erase() then actually removes them
27 | }
28 |
29 |
30 | string read_file(const char*file_name){
31 |     fstream file(file_name);
32 |     stringstream ss;
33 |     ss << file.rdbuf();
34 |     string str = ss.str();
35 |     return str;
36 | }
37 |
38 | void divide(string s){
39 |     vector<int> pos;
40 |     int index = 0;
41 |     string sub = "define";
42 |     while ((index = s.find(sub, index)) < s.length())
43 |     {
44 |         pos.push_back(index);
45 |         string temp;
46 |         int flag = 0;
47 |         int target = 0;
48 |         for(int i = index;i < s.size();i++){
49 |             if(s[i] == '}'){
50 |                 target = i;
51 |                 flag = 1;
52 |                 break;
53 |             }
54 |             if(flag)break;
55 |         }
56 |         temp = s.substr(index,target - index + 1);
57 |         all_function.push_back(temp);
58 |         index++;
59 |
60 |     }
61 | }
62 |
63 |
64 | class module{
65 |     string fun_name; // function name
66 |     string argument; // argument list
67 |     string ret_type; // return type
68 |     string content; // function body
69 |     vector<int> symbol; // positions of '@' markers in the body
70 |     vector<string> _global; // global variables used in the function
71 |     vector<string> _static; // static variables used in the function
72 |     vector<string> _non_reentrant_fun; // non-reentrant functions called by the function
73 |
74 |     // collect the position of every '@' marker
75 |     void get_all_symbol();
76 |     // check the symbol vector for global variables; record them in _global
77 |     void detect_global();
78 |     // check the symbol vector for static variables; record them in _static
79 |     void detect_static();
80 |     // check the symbol vector for calls to non-reentrant functions; record them in _non_reentrant_fun
81 |     void detect_non_reentrant_fun();
82 |     // wrap the four steps above; return true if the function is non-reentrant
83 |     bool detect();
84 | public:
85 |     // initialize the module from the string holding one function of the IR code
86 |     module(string fun);
87 |     // wrap detect() and print the findings
88 |     bool detect_and_print();
89 | };
90 | module::module(string fun){
91 |     stringstream s;
92 |     s << fun;
93 |     string define;
94 |     s >> define;
95 |     s >> ret_type;
96 |     s >> fun_name;
97 |     int content_begin;
98 |     for(int i = 0;i < fun.size();i++){
99 |         if(fun[i] == '{'){
100 |             content_begin = i;
101 |             break;
102 |         }
103 |     }
104 |     content = fun.substr(content_begin);
105 |     int argument_pos = 0;
106 |     for(int i = 0;i < fun_name.size();i++){
107 |         if(fun_name[i] == '('){
108 |             argument_pos = i;
109 |             break;
110 |         }
111 |     }
112 |     argument = fun_name.substr(argument_pos + 1);
113 |     argument.pop_back();
114 |     fun_name.erase(fun_name.begin() + argument_pos,fun_name.end());
115 |     fun_name.erase(fun_name.begin());
116 | }
117 | void module::get_all_symbol(){
118 |     for(int i = 0;i < content.size();i++){
119 |         if(content[i] == '@'){
120 |             symbol.push_back(i);
121 |         }
122 |     }
123 |
124 | }
125 | void
module::detect_global(){ 126 | for(int i = 0;i < symbol.size();i++){ 127 | stringstream ss; 128 | string sub = content.substr(symbol[i]); 129 | ss << sub; 130 | string value; 131 | ss >> value; 132 | int flag = 0; 133 | for(int j = 0;j < value.size();j++){ 134 | if(value[j] == '(' || value[j] == '.'){ 135 | flag = 1; 136 | break; 137 | } 138 | } 139 | if(flag)continue; 140 | while(value[value.size() - 1] == ',')value.pop_back(); 141 | value.erase(value.begin()); 142 | _global.push_back(value); 143 | } 144 | } 145 | void module::detect_static(){ 146 | for(int i = 0;i < symbol.size();i++){ 147 | stringstream ss; 148 | string sub = content.substr(symbol[i]); 149 | ss << sub; 150 | string value; 151 | ss >> value; 152 | int flag = 0; 153 | int point = 1; 154 | for(int j = 0;j < value.size();j++){ 155 | if(value[j] == '('){ 156 | flag = 1; 157 | break; 158 | } 159 | if(value[j] == '.'){ 160 | point = 0; 161 | } 162 | } 163 | if(flag || point)continue; 164 | while(value[value.size() - 1] == ',')value.pop_back(); 165 | value.erase(value.begin()); 166 | if(value[0] == '.')continue; 167 | _static.push_back(value); 168 | } 169 | } 170 | void module::detect_non_reentrant_fun(){ 171 | for(int i = 0;i < symbol.size();i++){ 172 | stringstream ss; 173 | string sub = content.substr(symbol[i]); 174 | ss << sub; 175 | string value; 176 | ss >> value; 177 | int flag = 0; 178 | int pos = -1; 179 | for(int j = 0;j < value.size();j++){ 180 | if(value[j] == '('){ 181 | flag = 1; 182 | pos = j; 183 | break; 184 | } 185 | } 186 | if(!flag)continue; 187 | 188 | value.erase(value.begin() + pos,value.end()); 189 | value.erase(value.begin()); 190 | for(int k = 0;k < Non_reentrant_fun.size();k++){ 191 | if(value == Non_reentrant_fun[k]){ 192 | _non_reentrant_fun.push_back(value); 193 | } 194 | } 195 | } 196 | } 197 | bool module::detect(){ 198 | get_all_symbol(); 199 | detect_non_reentrant_fun(); 200 | detect_global(); 201 | detect_static(); 202 | return _global.size() || _static.size() || _non_reentrant_fun.size(); 203 | } 204 | bool module::detect_and_print(){ 205 | if(fun_name == "main")return false; 206 | bool res = detect(); 207 | if(res == false){ 208 | return false; 209 | } 210 | else{ 211 | Non_reentrant_fun.push_back(fun_name); 212 | cout << "Detect non reentrant function " << fun_name << ", details are as follows: " << endl; 213 | deduplication(_non_reentrant_fun); 214 | deduplication(_global); 215 | deduplication(_static); 216 | if(_non_reentrant_fun.size()){ 217 | cout << "Call non reentrant function:"; 218 | for(int i = 0;i < _non_reentrant_fun.size();i++){ 219 | cout << _non_reentrant_fun[i] << " "; 220 | } 221 | cout << endl; 222 | } 223 | if(_global.size()){ 224 | cout << "Use global valuable:"; 225 | for(int i = 0;i < _global.size();i++){ 226 | cout << _global[i] << " "; 227 | } 228 | cout << endl; 229 | } 230 | if(_static.size()){ 231 | cout << "Use static valuable:"; 232 | for(int i = 0;i < _static.size();i++){ 233 | cout << _static[i] << " "; 234 | } 235 | cout << endl; 236 | } 237 | return true; 238 | } 239 | return true; 240 | } 241 | -------------------------------------------------------------------------------- /hw3/src/analyse/global.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'global.bc' 2 | source_filename = "global.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | 
%struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @global = global i32 10, align 4 11 | 12 | ; Function Attrs: noinline nounwind optnone ssp uwtable 13 | define i8* @fun(i8*) #0 { 14 | %2 = alloca i8*, align 8 15 | store i8* %0, i8** %2, align 8 16 | store i32 20, i32* @global, align 4 17 | ret i8* null 18 | } 19 | 20 | ; Function Attrs: noinline nounwind optnone ssp uwtable 21 | define i32 @main() #0 { 22 | %1 = alloca i32, align 4 23 | %2 = alloca %struct._opaque_pthread_t*, align 8 24 | store i32 0, i32* %1, align 4 25 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* null) 26 | store i32 43, i32* @global, align 4 27 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 28 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 29 | %6 = load i32, i32* @global, align 4 30 | ret i32 %6 31 | } 32 | 33 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 34 | 35 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 36 | 37 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 38 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 39 | 40 | !llvm.module.flags = !{!0, !1, !2} 41 | !llvm.ident = !{!3} 42 | 43 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 44 | !1 = !{i32 1, !"wchar_size", i32 4} 45 | !2 = !{i32 7, !"PIC Level", i32 2} 46 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 47 | !4 = !{!5} 48 | !5 = !{i64 2, i64 3, i1 false} 49 | -------------------------------------------------------------------------------- /hw3/src/analyse/malloc.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'malloc.bc' 2 | source_filename = "malloc.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | ; Function Attrs: noinline nounwind optnone ssp uwtable 11 | define i8* @init(i8*) #0 { 12 | %2 = alloca i8*, align 8 13 | %3 = alloca i32*, align 8 14 | %4 = alloca 
i32, align 4 15 | store i8* %0, i8** %2, align 8 16 | %5 = call i8* @malloc(i64 40) #3 17 | %6 = bitcast i8* %5 to i32* 18 | store i32* %6, i32** %3, align 8 19 | store i32 0, i32* %4, align 4 20 | br label %7 21 | 22 | 7: ; preds = %15, %1 23 | %8 = load i32, i32* %4, align 4 24 | %9 = icmp slt i32 %8, 10 25 | br i1 %9, label %10, label %18 26 | 27 | 10: ; preds = %7 28 | %11 = load i32*, i32** %3, align 8 29 | %12 = load i32, i32* %4, align 4 30 | %13 = sext i32 %12 to i64 31 | %14 = getelementptr inbounds i32, i32* %11, i64 %13 32 | store i32 0, i32* %14, align 4 33 | br label %15 34 | 35 | 15: ; preds = %10 36 | %16 = load i32, i32* %4, align 4 37 | %17 = add nsw i32 %16, 1 38 | store i32 %17, i32* %4, align 4 39 | br label %7 40 | 41 | 18: ; preds = %7 42 | %19 = load i32*, i32** %3, align 8 43 | %20 = bitcast i32* %19 to i8* 44 | call void @free(i8* %20) 45 | %21 = load i8*, i8** %2, align 8 46 | ret i8* %21 47 | } 48 | 49 | ; Function Attrs: allocsize(0) 50 | declare i8* @malloc(i64) #1 51 | 52 | declare void @free(i8*) #2 53 | 54 | ; Function Attrs: noinline nounwind optnone ssp uwtable 55 | define i32 @main() #0 { 56 | %1 = alloca i32, align 4 57 | %2 = alloca %struct._opaque_pthread_t*, align 8 58 | store i32 0, i32* %1, align 4 59 | %3 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @init, i8* null) 60 | %4 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 61 | %5 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %4, i8** null) 62 | ret i32 0 63 | } 64 | 65 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #2 66 | 67 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #2 68 | 69 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 70 | attributes #1 = { allocsize(0) "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 71 | attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 72 | attributes #3 = { allocsize(0) } 73 | 74 | !llvm.module.flags = !{!0, !1, !2} 75 | !llvm.ident 
= !{!3} 76 | 77 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 78 | !1 = !{i32 1, !"wchar_size", i32 4} 79 | !2 = !{i32 7, !"PIC Level", i32 2} 80 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 81 | !4 = !{!5} 82 | !5 = !{i64 2, i64 3, i1 false} 83 | -------------------------------------------------------------------------------- /hw3/src/analyse/static.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'static.bc' 2 | source_filename = "static.c" 3 | target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 4 | target triple = "x86_64-apple-macosx10.15.0" 5 | 6 | %struct._opaque_pthread_t = type { i64, %struct.__darwin_pthread_handler_rec*, [8176 x i8] } 7 | %struct.__darwin_pthread_handler_rec = type { void (i8*)*, i8*, %struct.__darwin_pthread_handler_rec* } 8 | %struct._opaque_pthread_attr_t = type { i64, [56 x i8] } 9 | 10 | @fun.x = internal global i32 0, align 4 11 | 12 | ; Function Attrs: noinline nounwind optnone ssp uwtable 13 | define i8* @fun(i8*) #0 { 14 | %2 = alloca i8*, align 8 15 | %3 = alloca i32*, align 8 16 | store i8* %0, i8** %2, align 8 17 | %4 = load i8*, i8** %2, align 8 18 | %5 = bitcast i8* %4 to i32* 19 | store i32* %5, i32** %3, align 8 20 | %6 = load i32*, i32** %3, align 8 21 | %7 = load i32, i32* %6, align 4 22 | store i32 %7, i32* @fun.x, align 4 23 | ret i8* null 24 | } 25 | 26 | ; Function Attrs: noinline nounwind optnone ssp uwtable 27 | define i32 @main() #0 { 28 | %1 = alloca i32, align 4 29 | %2 = alloca %struct._opaque_pthread_t*, align 8 30 | %3 = alloca %struct._opaque_pthread_t*, align 8 31 | store i32 0, i32* %1, align 4 32 | %4 = bitcast %struct._opaque_pthread_t** %2 to i8* 33 | %5 = call i32 @pthread_create(%struct._opaque_pthread_t** %2, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* %4) 34 | %6 = bitcast %struct._opaque_pthread_t** %3 to i8* 35 | %7 = call i32 @pthread_create(%struct._opaque_pthread_t** %3, %struct._opaque_pthread_attr_t* null, i8* (i8*)* @fun, i8* %6) 36 | %8 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %2, align 8 37 | %9 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %8, i8** null) 38 | %10 = load %struct._opaque_pthread_t*, %struct._opaque_pthread_t** %3, align 8 39 | %11 = call i32 @"\01_pthread_join"(%struct._opaque_pthread_t* %10, i8** null) 40 | ret i32 0 41 | } 42 | 43 | declare !callback !4 i32 @pthread_create(%struct._opaque_pthread_t**, %struct._opaque_pthread_attr_t*, i8* (i8*)*, i8*) #1 44 | 45 | declare i32 @"\01_pthread_join"(%struct._opaque_pthread_t*, i8**) #1 46 | 47 | attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 48 | attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "darwin-stkchk-strong-link" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "probe-stack"="___chkstk_darwin" 
"stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 49 | 50 | !llvm.module.flags = !{!0, !1, !2} 51 | !llvm.ident = !{!3} 52 | 53 | !0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 4]} 54 | !1 = !{i32 1, !"wchar_size", i32 4} 55 | !2 = !{i32 7, !"PIC Level", i32 2} 56 | !3 = !{!"Apple clang version 11.0.3 (clang-1103.0.32.29)"} 57 | !4 = !{!5} 58 | !5 = !{i64 2, i64 3, i1 false} 59 | -------------------------------------------------------------------------------- /hw4/Homework-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/Homework-4.pdf -------------------------------------------------------------------------------- /hw4/performance analysis/parallel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/performance analysis/parallel -------------------------------------------------------------------------------- /hw4/performance analysis/parallel.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #define row 1000 5 | #define col 1000 6 | 7 | int matrix[row][col]; 8 | int vector[col]; 9 | int res[row]; 10 | 11 | int s_res[col]; 12 | void serial(){ 13 | for(int i = 0;i < row;i++){ 14 | for(int j = 0;j < col;j++){ 15 | s_res[i] += matrix[i][j] * vector[j]; 16 | } 17 | } 18 | } 19 | int main(int argc, char **argv) 20 | { 21 | int my_id = 0; 22 | int temp = 100; 23 | int p; 24 | MPI_Init(&argc, &argv); 25 | MPI_Comm_rank(MPI_COMM_WORLD, &my_id); 26 | MPI_Comm_size(MPI_COMM_WORLD, &p); 27 | MPI_Status status_p; 28 | if(my_id == 0){ 29 | for(int i = 0;i < col;i++){ 30 | vector[i] = rand() % 10; 31 | } 32 | for(int i = 0;i < row;i++){ 33 | for(int j = 0;j < col;j++){ 34 | matrix[i][j] = rand() % 10; 35 | } 36 | } 37 | } 38 | MPI_Barrier(MPI_COMM_WORLD); 39 | double my_start,my_end,my_elapsed,elapsed; 40 | my_start = MPI_Wtime(); 41 | if(my_id == 0){ 42 | for(int i = 1;i < p;i++){ 43 | MPI_Send(matrix[i * row / p],col * row / p,MPI_INT,i,i,MPI_COMM_WORLD); 44 | } 45 | } 46 | else{ 47 | MPI_Recv(matrix[my_id * row / p],col * row / p,MPI_INT,0,my_id,MPI_COMM_WORLD,&status_p); 48 | } 49 | 50 | 51 | MPI_Bcast(vector,col,MPI_INT,0,MPI_COMM_WORLD); 52 | 53 | for(int i = my_id * row / p;i < (my_id + 1) * row / p; i++){ 54 | for(int j = 0;j < col;j++){ 55 | res[i] += matrix[i][j] * vector[j]; 56 | } 57 | } 58 | if(my_id == 0){ 59 | for(int i = 1;i < p;i++){ 60 | MPI_Recv(res + i * row / p,row / p,MPI_INT,i,i,MPI_COMM_WORLD,&status_p); 61 | } 62 | /* serial(); 63 | int flag = 1; 64 | for(int i = 0;i < row;i++){ 65 | if(res[i] != s_res[i]){ 66 | printf("Error!\n"); 67 | flag = 0; 68 | } 69 | } 70 | if(flag){ 71 | printf("Correct calculation!\n"); 72 | }*/ 73 | } 74 | else{ 75 | MPI_Send(res + my_id * row / p,row / p,MPI_INT,0,my_id,MPI_COMM_WORLD); 76 | } 77 | my_end = MPI_Wtime(); 78 | 79 | my_elapsed = my_end - my_start; 80 | 81 | MPI_Reduce(&my_elapsed,&elapsed,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); 82 | if(my_id == 0){ 83 | printf("Parallel time is:%e seconds\n",elapsed); 84 | } 85 | MPI_Finalize(); 86 | 87 | 88 | return 0; 89 | } 90 | 
-------------------------------------------------------------------------------- /hw4/performance analysis/serial: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/performance analysis/serial -------------------------------------------------------------------------------- /hw4/performance analysis/serial.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <sys/time.h> 4 | #define row 1000 5 | #define col 1000 6 | 7 | int matrix[row][col]; 8 | int vector[col]; 9 | 10 | int res[row]; 11 | int main(){ 12 | struct timeval begin,end; 13 | for(int i = 0;i < col;i++){ 14 | vector[i] = rand() % 10;//Initialize Vector 15 | } 16 | for(int i = 0;i < row;i++){ 17 | for(int j = 0;j < col;j++){ 18 | matrix[i][j] = rand() % 10;//Initialize Matrix 19 | } 20 | } 21 | gettimeofday(&begin,NULL); 22 | for(int i = 0;i < row;i++){ 23 | for(int j = 0;j < col;j++){ 24 | res[i] += matrix[i][j] * vector[j];//Multiplication 25 | } 26 | } 27 | gettimeofday(&end,NULL); 28 | printf("Serial_time is:%ldμs\n",end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec); 29 | return 0; 30 | } -------------------------------------------------------------------------------- /hw4/readme.md: -------------------------------------------------------------------------------- 1 | ## Notes 2 | * The programs were developed and run on macOS. 3 | * The code in the src folder does not measure execution time, so it cannot be used for performance analysis; the parallel program there does, however, verify that the result is correct. 4 | * The code in the performance analysis folder measures the running time of both the serial and the parallel program. -------------------------------------------------------------------------------- /hw4/report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/report.pdf -------------------------------------------------------------------------------- /hw4/src/parallel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/src/parallel -------------------------------------------------------------------------------- /hw4/src/parallel.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <mpi.h> 4 | #define row 1000 5 | #define col 1000 6 | 7 | int matrix[row][col]; 8 | int vector[col]; 9 | int res[row]; 10 | 11 | int s_res[row]; 12 | void serial(){ 13 | for(int i = 0;i < row;i++){ 14 | for(int j = 0;j < col;j++){ 15 | s_res[i] += matrix[i][j] * vector[j]; 16 | } 17 | } 18 | } 19 | int main(int argc, char **argv) 20 | { 21 | int my_id = 0; 22 | 23 | int p; 24 | MPI_Init(&argc, &argv); 25 | MPI_Comm_rank(MPI_COMM_WORLD, &my_id); 26 | MPI_Comm_size(MPI_COMM_WORLD, &p); 27 | MPI_Status status_p; 28 | if(my_id == 0){ 29 | for(int i = 0;i < col;i++){ 30 | vector[i] = rand() % 10; 31 | } 32 | for(int i = 0;i < row;i++){ 33 | for(int j = 0;j < col;j++){ 34 | matrix[i][j] = rand() % 10; 35 | } 36 | } 37 | } 38 | if(my_id == 0){ 39 | for(int i = 1;i < p;i++){ 40 | MPI_Send(matrix[i * row / p],col * row / p,MPI_INT,i,i,MPI_COMM_WORLD); 41 | } 42 | } 43 | else{ 44 | MPI_Recv(matrix[my_id * row / p],col * row / p,MPI_INT,0,my_id,MPI_COMM_WORLD,&status_p); 45 | } 46 | 47 | 48 | MPI_Bcast(vector,col,MPI_INT,0,MPI_COMM_WORLD);
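// Block-row decomposition: each rank multiplies its own chunk of row / p consecutive rows; rank 0 then gathers the partial results and, below, validates them against the serial() reference.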
49 | 50 | for(int i = my_id * row / p;i < (my_id + 1) * row / p; i++){ 51 | for(int j = 0;j < col;j++){ 52 | res[i] += matrix[i][j] * vector[j]; 53 | } 54 | } 55 | if(my_id == 0){ 56 | for(int i = 1;i < p;i++){ 57 | MPI_Recv(res + i * row / p,row / p,MPI_INT,i,i,MPI_COMM_WORLD,&status_p); 58 | } 59 | serial(); 60 | int flag = 1; 61 | for(int i = 0;i < row;i++){ 62 | if(res[i] != s_res[i]){ 63 | printf("Error!\n"); 64 | flag = 0; 65 | } 66 | } 67 | if(flag){ 68 | printf("Correct calculation!\n"); 69 | } 70 | } 71 | else{ 72 | MPI_Send(res + my_id * row / p,row / p,MPI_INT,0,my_id,MPI_COMM_WORLD); 73 | } 74 | 75 | MPI_Finalize(); 76 | 77 | 78 | return 0; 79 | } 80 | -------------------------------------------------------------------------------- /hw4/src/serial: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw4/src/serial -------------------------------------------------------------------------------- /hw4/src/serial.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #define row 1000 4 | #define col 1000 5 | 6 | int matrix[row][col]; 7 | int vector[col]; 8 | 9 | int res[row]; 10 | int main(){ 11 | for(int i = 0;i < col;i++){ 12 | vector[i] = rand() % 10;//Initialize Vector 13 | } 14 | for(int i = 0;i < row;i++){ 15 | for(int j = 0;j < col;j++){ 16 | matrix[i][j] = rand() % 10;//Initialize Matrix 17 | } 18 | } 19 | for(int i = 0;i < row;i++){ 20 | for(int j = 0;j < col;j++){ 21 | res[i] += matrix[i][j] * vector[j];//Multiplication 22 | } 23 | } 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /hw5/Homework-5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw5/Homework-5.pdf -------------------------------------------------------------------------------- /hw5/Q1/1.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <omp.h> 5 | #include <sys/time.h> 6 | struct compressed_matrix 7 | { 8 | int row_size; 9 | int col_size; 10 | int element_size; 11 | int* row; 12 | int* col; 13 | double* element; 14 | }; 15 | // Generate a random double in [min, max) 16 | double getRandData(int min,int max){ 17 | double m1 = (double)(rand() % 101) / 101; 18 | min ++; 19 | double m2 = (double)((rand() % (max - min + 1) + min)); 20 | m2--; 21 | return m1 + m2; 22 | } 23 | double* init_vector(int vector_size){ 24 | double* vector = (double*)malloc(sizeof(double) * vector_size); 25 | for(int i = 0;i < vector_size;i++){ 26 | vector[i] = getRandData(1,10); 27 | } 28 | return vector; 29 | } 30 | 31 | struct compressed_matrix init_matrix(FILE*stream){ 32 | int row_size,col_size,element_size; 33 | fscanf(stream,"%d%d%d",&row_size,&col_size,&element_size); 34 | struct compressed_matrix mat; 35 | mat.col_size = col_size; 36 | mat.row_size = row_size; 37 | mat.element_size = element_size; 38 | mat.row = (int*)malloc(row_size * sizeof(int)); 39 | mat.col = (int*)malloc(element_size * sizeof(int)); 40 | mat.element = (double*)malloc(element_size * sizeof(double)); 41 | for(int i = 0;i < row_size;i++){ 42 | fscanf(stream,"%d",&mat.row[i]); 43 | mat.row[i]--; 44 | } 45 | for(int i = 0;i < element_size;i++){ 46 | fscanf(stream,"%d",&mat.col[i]);
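// index values in the input file are 1-based; convert them to 0-based (the row array above is shifted the same way)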
47 | mat.col[i]--; 48 | } 49 | for(int i = 0;i < element_size;i++){ 50 | fscanf(stream,"%lf",&mat.element[i]); 51 | } 52 | return mat; 53 | } 54 | 55 | double* serial(struct compressed_matrix matrix,double* vector){ 56 | double*res = (double*)malloc(matrix.row_size * sizeof(double)); 57 | memset(res,0,matrix.row_size * sizeof(double)); 58 | for(int i = 0;i < matrix.row_size;i++){ 59 | int begin = matrix.row[i]; 60 | int end; 61 | if(i < matrix.row_size - 1) 62 | end = matrix.row[i + 1] - 1; 63 | else 64 | end = matrix.element_size - 1; 65 | for(int j = begin;j <= end;j++){ 66 | res[i] += vector[matrix.col[j]] * matrix.element[j]; 67 | } 68 | } 69 | return res; 70 | } 71 | double* parallel(struct compressed_matrix matrix,double* vector,int thread_count){ 72 | double*res = (double*)malloc(matrix.row_size * sizeof(double)); 73 | memset(res,0,matrix.row_size * sizeof(double)); 74 | #pragma omp parallel for num_threads(thread_count) 75 | for(int i = 0;i < matrix.row_size;i++){ 76 | int begin = matrix.row[i]; 77 | int end; 78 | if(i < matrix.row_size - 1) 79 | end = matrix.row[i + 1] - 1; 80 | else 81 | end = matrix.element_size - 1; 82 | for(int j = begin;j <= end;j++){ 83 | res[i] += vector[matrix.col[j]] * matrix.element[j]; 84 | } 85 | } 86 | return res; 87 | } 88 | void print_vector(double*res,int len){ 89 | printf("{"); 90 | for(int i = 0;i < len;i++){ 91 | printf("%f",res[i]); 92 | if(i < len - 1)printf(","); 93 | } 94 | printf("}\n"); 95 | } 96 | void free_matrix(struct compressed_matrix matrix){ 97 | free(matrix.col); 98 | free(matrix.row); 99 | free(matrix.element); 100 | } 101 | int main(int argc, const char * argv[]){ 102 | if(argc < 3){ 103 | abort(); 104 | } 105 | int thread_count = atoi(argv[1]); 106 | FILE* matrix_file = fopen(argv[2],"r"); 107 | struct timeval begin,end; 108 | struct compressed_matrix matrix = init_matrix(matrix_file); 109 | double* vector = init_vector(matrix.col_size); 110 | gettimeofday(&begin,NULL); 111 | double* res = serial(matrix,vector); 112 | gettimeofday(&end,NULL); 113 | printf("\n\n"); 114 | printf(" Serial_time is:%ldμs\n",end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec); 115 | gettimeofday(&begin,NULL); 116 | double* parallel_res = parallel(matrix,vector,thread_count); 117 | gettimeofday(&end,NULL); 118 | printf("\n\n"); 119 | printf(" There are %d threads.\n",thread_count); 120 | printf(" Parallel_time is:%ldμs\n",end.tv_sec*1000000 + end.tv_usec - begin.tv_sec*1000000 - begin.tv_usec); 121 | //print_vector(res,matrix.row_size); 122 | //print_vector(parallel_res,matrix.row_size); 123 | free_matrix(matrix); 124 | free(vector); 125 | free(res); 126 | fclose(matrix_file); 127 | free(parallel_res); 128 | } -------------------------------------------------------------------------------- /hw5/Q2/2.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw5/Q2/2.1 -------------------------------------------------------------------------------- /hw5/Q2/2.1.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include <queue> 4 | #include <semaphore.h> 5 | #include <omp.h> 6 | 7 | 8 | #define MaxSize 50 9 | using namespace std; 10 | 11 | typedef int resources; 12 | sem_t n, s; 13 | 14 | queue<resources> all_resources; 15 | 16 | resources produce(){ 17 | for(int i = 0;i < 5000;i++); 18 | if(all_resources.empty()){ 19 | return 1; 20 | } 21 |
else{ 22 | return all_resources.back() + 1; 23 | } 24 | } 25 | 26 | void consume(resources i){ 27 | printf("Resource %d is popped.\n",i); 28 | } 29 | 30 | void producer(){ 31 | while(true){ 32 | resources x = produce(); 33 | sem_wait(&s); 34 | all_resources.push(x); 35 | printf("Resource %d is pushed.\n",x); 36 | sem_post(&s); 37 | sem_post(&n); 38 | } 39 | } 40 | void consumer(){ 41 | while(true){ 42 | resources x; 43 | sem_wait(&n); 44 | sem_wait(&s); 45 | x = all_resources.front(); 46 | all_resources.pop(); 47 | sem_post(&s); 48 | consume(x); 49 | } 50 | } 51 | 52 | int main(){ 53 | sem_init(&n,0,0); 54 | sem_init(&s,0,1); 55 | #pragma omp parallel sections 56 | { 57 | #pragma omp section 58 | { 59 | producer(); 60 | } 61 | #pragma omp section 62 | { 63 | consumer(); 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /hw5/Q2/2.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw5/Q2/2.2 -------------------------------------------------------------------------------- /hw5/Q2/2.2.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include <queue> 4 | #include <semaphore.h> 5 | #include <omp.h> 6 | 7 | 8 | #define MaxSize 50 9 | using namespace std; 10 | 11 | typedef int resources; 12 | sem_t n, s; 13 | 14 | queue<resources> all_resources; 15 | 16 | resources produce(){ 17 | for(int i = 0;i < 1000;i++) 18 | for(int j = 0;j < 1000;j++); 19 | if(all_resources.empty()){ 20 | return 1; 21 | } 22 | else{ 23 | return all_resources.back() + 1; 24 | } 25 | } 26 | 27 | void consume(resources i,int id){ 28 | printf("Resource %d is popped by thread %d.\n",i,id); 29 | } 30 | 31 | void producer(int id){ 32 | while(true){ 33 | resources x = produce(); 34 | sem_wait(&s); 35 | all_resources.push(x); 36 | printf("Resource %d is pushed by thread %d.\n",x,id); 37 | sem_post(&s); 38 | sem_post(&n); 39 | if(id == MaxSize)break; 40 | } 41 | } 42 | void consumer(int id){ 43 | while(true){ 44 | resources x; 45 | sem_wait(&n); 46 | sem_wait(&s); 47 | x = all_resources.front(); 48 | all_resources.pop(); 49 | sem_post(&s); 50 | consume(x,id); 51 | } 52 | } 53 | 54 | int main(){ 55 | int p_count = 8; 56 | int c_count = 4; 57 | sem_init(&n,0,0); 58 | sem_init(&s,0,1); 59 | 60 | #pragma omp parallel num_threads(p_count + c_count) 61 | { 62 | int id = omp_get_thread_num(); 63 | #pragma omp parallel sections 64 | { 65 | #pragma omp section 66 | { 67 | if(id < p_count) 68 | producer(id); 69 | } 70 | #pragma omp section 71 | { 72 | if(id >= p_count) 73 | consumer(id); 74 | } 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /hw5/Q3/measure.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <mpi.h> 4 | #define BUFLEN 1000000 5 | 6 | char buf[BUFLEN]; 7 | 8 | int main(int argc, char **argv) 9 | { 10 | int my_id = 0; 11 | int p; 12 | MPI_Init(&argc, &argv); 13 | MPI_Comm_rank(MPI_COMM_WORLD, &my_id); 14 | MPI_Comm_size(MPI_COMM_WORLD, &p); 15 | MPI_Status status_p; 16 | MPI_Barrier(MPI_COMM_WORLD); 17 | double my_start,my_end,my_elapsed,elapsed; 18 | my_start = MPI_Wtime(); 19 | if(my_id == 0){ 20 | for(int i = 1;i < p;i++){ 21 | MPI_Send(buf,BUFLEN,MPI_CHAR,i,i,MPI_COMM_WORLD); 22 | } 23 | } 24 | else{
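// every non-root rank receives BUFLEN bytes from rank 0; the MPI_MAX reduction below takes the slowest rank's elapsed time as the transfer-time estimate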
25 | MPI_Recv(buf,BUFLEN,MPI_CHAR,0,my_id,MPI_COMM_WORLD,&status_p); 26 | } 27 | my_end = MPI_Wtime(); 28 | 29 | my_elapsed = my_end - my_start; 30 | 31 | MPI_Reduce(&my_elapsed,&elapsed,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); 32 | if(my_id == 0){ 33 | printf("Time delay is %f s\n",elapsed); 34 | printf("Bandwidth is %f Mbit/s\n", BUFLEN * 8.0 / (1000000.0 * elapsed)); 35 | } 36 | MPI_Finalize(); 37 | 38 | 39 | return 0; 40 | } 41 | 42 | -------------------------------------------------------------------------------- /hw5/report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw5/report.pdf -------------------------------------------------------------------------------- /hw6/Homework-6/Homework-6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw6/Homework-6/Homework-6.pdf -------------------------------------------------------------------------------- /hw6/Homework-6/error-test.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG 4 | 5 | 6 | __global__ void vector_add(double *C, const double *A, const double *B, int N) 7 | { 8 | // Add the kernel code 9 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | // Do not try to access past the allocated memory 12 | if (idx < N) { 13 | C[idx] = A[idx] + B[idx]; 14 | } 15 | } 16 | 17 | 18 | int main(void) 19 | { 20 | const int N = 20; 21 | const int ThreadsInBlock = 128; 22 | double *dA, *dB, *dC; 23 | double hA[N], hB[N], hC[N]; 24 | 25 | for(int i = 0; i < N; ++i) { 26 | hA[i] = (double) i; 27 | hB[i] = (double) i * i; 28 | } 29 | 30 | /* 31 | Add memory allocations and copies. Wrap your runtime function 32 | calls with CUDA_CHECK( ) macro 33 | */ 34 | CUDA_CHECK( cudaMalloc((void**)&dA, sizeof(double)*N) ); 35 | #error Add the remaining memory allocations and copies 36 | 37 | // Note the maximum size of threads in a block 38 | dim3 grid, threads; 39 | 40 | //// Add the kernel call here 41 | #error Add the CUDA kernel call 42 | 43 | 44 | // Here we add an explicit synchronization so that we catch errors 45 | // as early as possible. Don't do this in production code! 46 | cudaDeviceSynchronize(); 47 | CHECK_ERROR_MSG("vector_add kernel"); 48 | 49 | //// Copy back the results and free the device memory 50 | #error Copy back the results and free the allocated memory 51 | 52 | for (int i = 0; i < N; i++) 53 | printf("%5.1f\n", hC[i]); 54 | 55 | return 0; 56 | } -------------------------------------------------------------------------------- /hw6/Homework-6/error_check_1.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples.
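// Typical usage (see error-test.cu): wrap runtime calls as CUDA_CHECK( cudaMalloc((void**)&dA, sizeof(double)*N) ); and call CHECK_ERROR_MSG("vector_add kernel") right after a kernel launch.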
3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/Homework-6/error_checks.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples. 3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/Homework-6/jacobi.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <sys/time.h> 3 | #include "jacobi.h" 4 | #include "error_checks.h" 5 | 6 | // Change this to 0 if CPU reference result is not needed 7 | #define COMPUTE_CPU_REFERENCE 1 8 | #define MAX_ITERATIONS 3000 9 | 10 | // CPU kernel 11 | void sweepCPU(double* phi, const double *phiPrev, const double *source, 12 | double h2, int N) 13 | { 14 | int i, j; 15 | int index, i1, i2, i3, i4; 16 | 17 | for (j = 1; j < N-1; j++) { 18 | for (i = 1; i < N-1; i++) { 19 | index = i + j*N; 20 | i1 = (i-1) + j * N; 21 | i2 = (i+1) + j * N; 22 | i3 = i + (j-1) * N; 23 | i4 = i + (j+1) * N; 24 | phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + 25 | phiPrev[i3] + phiPrev[i4] - 26 | h2 * source[index]); 27 | } 28 | } 29 | } 30 | 31 | // GPU kernel 32 | __global__ 33 | void sweepGPU(double *phi, const double *phiPrev, const double *source, 34 | double h2, int N) 35 | { 36 | #error Add here the GPU version of the update routine (see sweepCPU above) 37 | } 38 | 39 | 40 | double compareArrays(const double *a, const double *b, int N) 41 | { 42 | double error = 0.0; 43 | int i; 44 | for (i = 0; i < N*N; i++) { 45 | error += fabs(a[i] - b[i]); 46 | } 47 | return error/(N*N); 48 | } 49 | 50 | 51 | double diffCPU(const double *phi, const double *phiPrev, int N) 52 | { 53 | int i; 54 | double sum = 0; 55 | double diffsum = 0; 56 | 57 | for (i = 0; i < N*N; i++) { 58 | diffsum += (phi[i] - phiPrev[i]) * (phi[i] -
phiPrev[i]); 59 | sum += phi[i] * phi[i]; 60 | } 61 | 62 | return sqrt(diffsum/sum); 63 | } 64 | 65 | 66 | int main() 67 | { 68 | timeval t1, t2; // Structs for timing 69 | const int N = 512; 70 | double h = 1.0 / (N - 1); 71 | int iterations; 72 | const double tolerance = 5e-4; // Stopping condition 73 | int i, j, index; 74 | 75 | const int blocksize = 16; 76 | 77 | double *phi = new double[N*N]; 78 | double *phiPrev = new double[N*N]; 79 | double *source = new double[N*N]; 80 | double *phi_cuda = new double[N*N]; 81 | 82 | double *phi_d, *phiPrev_d, *source_d; 83 | // Size of the arrays in bytes 84 | const int size = N*N*sizeof(double); 85 | double diff; 86 | 87 | // Source initialization 88 | for (i = 0; i < N; i++) { 89 | for (j = 0; j < N; j++) { 90 | double x, y; 91 | x = (i - N / 2) * h; 92 | y = (j - N / 2) * h; 93 | index = j + i * N; 94 | if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1) 95 | source[index] = 1e10*h*h; 96 | else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1) 97 | source[index] = -1e10*h*h; 98 | else 99 | source[index] = 0.0; 100 | } 101 | } 102 | 103 | CUDA_CHECK( cudaMalloc( (void**)&source_d, size) ); 104 | CUDA_CHECK( cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice) ); 105 | 106 | // Reset values to zero 107 | for (i = 0; i < N; i++) { 108 | for (j = 0; j < N; j++) { 109 | index = j + i * N; 110 | phi[index] = 0.0; 111 | phiPrev[index] = 0.0; 112 | } 113 | } 114 | 115 | CUDA_CHECK( cudaMalloc( (void**)&phi_d, size) ); 116 | CUDA_CHECK( cudaMalloc( (void**)&phiPrev_d, size) ); 117 | CUDA_CHECK( cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice) ); 118 | CUDA_CHECK( cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice) ); 119 | 120 | // CPU version 121 | if(COMPUTE_CPU_REFERENCE) { 122 | gettimeofday(&t1, NULL); 123 | 124 | // Do sweeps until difference is under the tolerance 125 | diff = tolerance * 2; 126 | iterations = 0; 127 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 128 | sweepCPU(phiPrev, phi, source, h * h, N); 129 | sweepCPU(phi, phiPrev, source, h * h, N); 130 | 131 | iterations += 2; 132 | if (iterations % 100 == 0) { 133 | diff = diffCPU(phi, phiPrev, N); 134 | printf("%d %g\n", iterations, diff); 135 | } 136 | } 137 | gettimeofday(&t2, NULL); 138 | printf("CPU Jacobi: %g seconds, %d iterations\n", 139 | t2.tv_sec - t1.tv_sec + 140 | (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 141 | } 142 | 143 | // GPU version 144 | 145 | dim3 dimBlock(blocksize, blocksize); 146 | dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize); 147 | 148 | //do sweeps until diff under tolerance 149 | diff = tolerance * 2; 150 | iterations = 0; 151 | 152 | gettimeofday(&t1, NULL); 153 | 154 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 155 | // See above how the CPU update kernel is called 156 | // and implement similar calling sequence for the GPU code 157 | 158 | //// Add routines here 159 | #error Add GPU kernel calls here (see CPU version above) 160 | 161 | iterations += 2; 162 | 163 | if (iterations % 100 == 0) { 164 | // diffGPU is defined in the header file; it uses 165 | // the Thrust library for the reduction computation 166 | diff = diffGPU(phiPrev_d, phi_d, N); 167 | CHECK_ERROR_MSG("Difference computation"); 168 | printf("%d %g\n", iterations, diff); 169 | } 170 | } 171 | 172 | //// Add here the routine to copy back the results 173 | #error Copy back the results 174 | 175 | gettimeofday(&t2, NULL); 176 | printf("GPU Jacobi: %g seconds, %d iterations\n", 177 | t2.tv_sec - t1.tv_sec + 178 |
(t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 179 | 180 | //// Add here the clean up code for all allocated CUDA resources 181 | #error Add here the clean up code 182 | 183 | if (COMPUTE_CPU_REFERENCE) { 184 | printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N)); 185 | } 186 | 187 | delete[] phi; 188 | delete[] phi_cuda; 189 | delete[] phiPrev; 190 | delete[] source; 191 | 192 | return EXIT_SUCCESS; 193 | } -------------------------------------------------------------------------------- /hw6/Homework-6/jacobi.h: -------------------------------------------------------------------------------- 1 | #ifndef EX3_H_ 2 | #define EX3_H_ 3 | 4 | #include <cmath> 5 | #include <thrust/device_ptr.h> 6 | #include <thrust/iterator/zip_iterator.h> 7 | #include <thrust/transform_reduce.h> 8 | #include <thrust/functional.h> 9 | // Helper function prototypes 10 | double compareArrays(const double *a, const double *b, int N); 11 | double diffCPU(const double *a, const double *b, int N); 12 | void sweepCPU(double *phi, const double *phiPrev, 13 | const double *source, double h2, int N); 14 | 15 | 16 | /* ------------------------------------------------------------------------- 17 | EXTRACURRICULAR ACTIVITIES 18 | 19 | This part provides the reduction operation (in this case the summation of 20 | the difference of two arrays) using the Thrust library. Thrust mimics the 21 | syntax and design of the C++ standard template library (STL). Thrust is 22 | also part of the CUDA 4 SDK. 23 | More information can be found on the Thrust home page: 24 | http://code.google.com/p/thrust/ 25 | ----------------------------------------------------------------------- */ 26 | 27 | template <typename T> 28 | class square_diff_thr : public thrust::unary_function<thrust::tuple<T, T>, T> 29 | { 30 | public: 31 | __host__ __device__ 32 | T operator()(const thrust::tuple<T, T>& x) const { 33 | return (thrust::get<1>(x) - thrust::get<0>(x)) * 34 | (thrust::get<1>(x) - thrust::get<0>(x)); 35 | } 36 | }; 37 | 38 | template <typename T> 39 | class square_thr : public thrust::unary_function<T, T> 40 | { 41 | public: 42 | __host__ __device__ 43 | T operator()(const T& x) const { 44 | return x*x; 45 | } 46 | }; 47 | 48 | template <typename T> 49 | T diffGPU(T *A_d, T *B_d, int N) 50 | { 51 | typedef thrust::device_ptr<T> FloatIterator; 52 | typedef thrust::tuple<FloatIterator, FloatIterator> IteratorTuple; 53 | typedef thrust::zip_iterator<IteratorTuple> ZipIterator; 54 | 55 | thrust::device_ptr<T> A_ptr(A_d); 56 | thrust::device_ptr<T> B_ptr(B_d); 57 | 58 | ZipIterator first = 59 | thrust::make_zip_iterator(thrust::make_tuple(A_ptr, B_ptr)); 60 | ZipIterator last = 61 | thrust::make_zip_iterator(thrust::make_tuple(A_ptr + N*N, 62 | B_ptr + N*N)); 63 | 64 | T a1 = thrust::transform_reduce(first, last, square_diff_thr<T>(), 65 | static_cast<T>(0), thrust::plus<T>()); 66 | T a2 = thrust::transform_reduce(B_ptr, B_ptr + N*N, 67 | square_thr<T>(), static_cast<T>(0), 68 | thrust::plus<T>()); 69 | 70 | return sqrt(a1/a2); 71 | } 72 | 73 | 74 | #endif // EX3_H_ -------------------------------------------------------------------------------- /hw6/report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhanghb55/parallel-and-distributed-computing-homework/c0fce214cc33698c2d67d6de7e2a233697d0980f/hw6/report.pdf -------------------------------------------------------------------------------- /hw6/src/1.0/error-test.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG 4 | 5 | 6 | __global__ void vector_add(double *C, const double *A, const double *B, int N) 7 | { 8 | // Add the kernel code 9 | int idx = blockIdx.x *
blockDim.x + threadIdx.x; 10 | 11 | // Do not try to access past the allocated memory 12 | if (idx < N) { 13 | C[idx] = A[idx] + B[idx]; 14 | } 15 | } 16 | 17 | 18 | int main(void) 19 | { 20 | const int N = 20; 21 | const int ThreadsInBlock = 128; 22 | double *dA, *dB, *dC; 23 | double hA[N], hB[N], hC[N]; 24 | 25 | for(int i = 0; i < N; ++i) { 26 | hA[i] = (double) i; 27 | hB[i] = (double) i * i; 28 | } 29 | 30 | /* 31 | Add memory allocations and copies. Wrap your runtime function 32 | calls with CUDA_CHECK( ) macro 33 | */ 34 | CUDA_CHECK(cudaMalloc((void**)&dA, sizeof(double)*N)); 35 | CUDA_CHECK(cudaMalloc((void**)&dB, sizeof(double)*N)); 36 | CUDA_CHECK(cudaMalloc((void**)&dC, sizeof(double)*N)); 37 | 38 | // Note the maximum size of threads in a block 39 | dim3 grid, threads; 40 | 41 | CUDA_CHECK(cudaMemcpy(dA, hA, sizeof(double)*N, cudaMemcpyHostToDevice)); 42 | CUDA_CHECK(cudaMemcpy(dB, hB, sizeof(double)*N, cudaMemcpyHostToDevice)); 43 | 44 | //// Add the kernel call here 45 | vector_add<<<1,ThreadsInBlock>>>(dC, dA, dB, N); 46 | 47 | 48 | // Here we add an explicit synchronization so that we catch errors 49 | // as early as possible. Don't do this in production code! 50 | cudaDeviceSynchronize(); 51 | CHECK_ERROR_MSG("vector_add kernel"); 52 | 53 | //// Copy back the results and free the device memory 54 | CUDA_CHECK(cudaMemcpy(hC, dC, sizeof(double)*N, cudaMemcpyDeviceToHost)); 55 | 56 | CUDA_CHECK(cudaFree(dA)); 57 | CUDA_CHECK(cudaFree(dB)); 58 | CUDA_CHECK(cudaFree(dC)); 59 | for (int i = 0; i < N; i++) 60 | printf("%5.1f\n", hC[i]); 61 | 62 | return 0; 63 | } 64 | -------------------------------------------------------------------------------- /hw6/src/1.0/error_check_1.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples. 3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.0/error_checks.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples.
3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.1/error-test.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG 4 | 5 | 6 | __global__ void vector_add(double *C, const double *A, const double *B, int N) 7 | { 8 | // Add the kernel code 9 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | // Do not try to access past the allocated memory 12 | if (idx < N) { 13 | C[idx] = A[idx] + B[idx]; 14 | } 15 | } 16 | 17 | 18 | int main(void) 19 | { 20 | const int N = 20; 21 | const int ThreadsInBlock = 12800; 22 | double *dA, *dB, *dC; 23 | double hA[N], hB[N], hC[N]; 24 | 25 | for(int i = 0; i < N; ++i) { 26 | hA[i] = (double) i; 27 | hB[i] = (double) i * i; 28 | } 29 | 30 | /* 31 | Add memory allocations and copies. Wrap your runtime function 32 | calls with CUDA_CHECK( ) macro 33 | */ 34 | CUDA_CHECK(cudaMalloc((void**)&dA, sizeof(double)*N)); 35 | CUDA_CHECK(cudaMalloc((void**)&dB, sizeof(double)*N)); 36 | CUDA_CHECK(cudaMalloc((void**)&dC, sizeof(double)*N)); 37 | 38 | // Note the maximum size of threads in a block 39 | dim3 grid, threads; 40 | 41 | CUDA_CHECK(cudaMemcpy(dA, hA, sizeof(double)*N, cudaMemcpyHostToDevice)); 42 | CUDA_CHECK(cudaMemcpy(dB, hB, sizeof(double)*N, cudaMemcpyHostToDevice)); 43 | 44 | //// Add the kernel call here 45 | vector_add<<<1,ThreadsInBlock>>>(dC, dA, dB, N); 46 | 47 | 48 | // Here we add an explicit synchronization so that we catch errors 49 | // as early as possible. Don't do this in production code! 50 | //cudaDeviceSynchronize(); 51 | //CHECK_ERROR_MSG("vector_add kernel"); 52 | 53 | //// Copy back the results and free the device memory 54 | CUDA_CHECK(cudaMemcpy(hC, dC, sizeof(double)*N, cudaMemcpyDeviceToHost)); 55 | 56 | 57 | for (int i = 0; i < N; i++) 58 | printf("%5.1f\n", hC[i]); 59 | 60 | CUDA_CHECK(cudaFree(dA)); 61 | CUDA_CHECK(cudaFree(dB)); 62 | CUDA_CHECK(cudaFree(dC)); 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /hw6/src/1.1/error_check_1.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples.
3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.1/error_checks.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples. 3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.2/error-test.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG 4 | 5 | 6 | __global__ void vector_add(double *C, const double *A, const double *B, int N) 7 | { 8 | // Add the kernel code 9 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | // Do not try to access past the allocated memory 12 | if (idx < N) { 13 | C[idx] = A[idx] + B[idx]; 14 | } 15 | } 16 | 17 | 18 | int main(void) 19 | { 20 | const int N = 20; 21 | const int ThreadsInBlock = 128; 22 | double *dA, *dB, *dC; 23 | double hA[N], hB[N], hC[N]; 24 | 25 | for(int i = 0; i < N; ++i) { 26 | hA[i] = (double) i; 27 | hB[i] = (double) i * i; 28 | } 29 | 30 | /* 31 | Add memory allocations and copies. 
Wrap your runtime function 32 | calls with CUDA_CHECK( ) macro 33 | */ 34 | CUDA_CHECK(cudaMalloc((void**)&dA, sizeof(double)*N)); 35 | CUDA_CHECK(cudaMalloc((void**)&dB, sizeof(double)*N)); 36 | CUDA_CHECK(cudaMalloc((void**)&dC, sizeof(double)*N)); 37 | 38 | // Note the maximum size of threads in a block 39 | dim3 grid, threads; 40 | 41 | CUDA_CHECK(cudaMemcpy(dA, hA, sizeof(double)*N, cudaMemcpyHostToDevice)); 42 | CUDA_CHECK(cudaMemcpy(dB, hB, sizeof(double)*N, cudaMemcpyHostToDevice)); 43 | 44 | //// Add the kernel call here 45 | vector_add<<<1,ThreadsInBlock>>>(dC, dA, dB, N); 46 | 47 | printf("Pointer to device memory: %d",*dA); 48 | 49 | // Here we add an explicit synchronization so that we catch errors 50 | // as early as possible. Don't do this in production code! 51 | cudaDeviceSynchronize(); 52 | CHECK_ERROR_MSG("vector_add kernel"); 53 | 54 | //// Copy back the results and free the device memory 55 | CUDA_CHECK(cudaMemcpy(hC, dC, sizeof(double)*N, cudaMemcpyDeviceToHost)); 56 | 57 | 58 | for (int i = 0; i < N; i++) 59 | printf("%5.1f\n", hC[i]); 60 | 61 | CUDA_CHECK(cudaFree(dA)); 62 | CUDA_CHECK(cudaFree(dB)); 63 | CUDA_CHECK(cudaFree(dC)); 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /hw6/src/1.2/error_check_1.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples. 3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.2/error_checks.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples.
3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.3/error-test.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | #include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG 4 | 5 | 6 | __global__ void vector_add(int* x,double *C, const double *A, const double *B, int N) 7 | { 8 | // Add the kernel code 9 | int idx = *x * blockIdx.x * blockDim.x + threadIdx.x; 10 | 11 | // Do not try to access past the allocated memory 12 | if (idx < N) { 13 | C[idx] = A[idx] + B[idx]; 14 | } 15 | } 16 | 17 | 18 | int main(void) 19 | { 20 | const int N = 20; 21 | const int ThreadsInBlock = 128; 22 | double *dA, *dB, *dC; 23 | double hA[N], hB[N], hC[N]; 24 | int *x = (int*)malloc(sizeof(int)); 25 | *x = 1; 26 | 27 | for(int i = 0; i < N; ++i) { 28 | hA[i] = (double) i; 29 | hB[i] = (double) i * i; 30 | } 31 | 32 | /* 33 | Add memory allocations and copies. Wrap your runtime function 34 | calls with CUDA_CHECK( ) macro 35 | */ 36 | CUDA_CHECK(cudaMalloc((void**)&dA, sizeof(double)*N)); 37 | CUDA_CHECK(cudaMalloc((void**)&dB, sizeof(double)*N)); 38 | CUDA_CHECK(cudaMalloc((void**)&dC, sizeof(double)*N)); 39 | 40 | // Note the maximum size of threads in a block 41 | dim3 grid, threads; 42 | 43 | CUDA_CHECK(cudaMemcpy(dA, hA, sizeof(double)*N, cudaMemcpyHostToDevice)); 44 | CUDA_CHECK(cudaMemcpy(dB, hB, sizeof(double)*N, cudaMemcpyHostToDevice)); 45 | 46 | //// Add the kernel call here 47 | vector_add<<<1,ThreadsInBlock>>>(x,dC, dA, dB, N); 48 | 49 | //printf("Pointer to device memory: %d",*dA); 50 | 51 | // Here we add an explicit synchronization so that we catch errors 52 | // as early as possible. Don't do this in production code! 53 | cudaDeviceSynchronize(); 54 | CHECK_ERROR_MSG("vector_add kernel"); 55 | 56 | //// Copy back the results and free the device memory 57 | CUDA_CHECK(cudaMemcpy(hC, dC, sizeof(double)*N, cudaMemcpyDeviceToHost)); 58 | 59 | 60 | for (int i = 0; i < N; i++) 61 | printf("%5.1f\n", hC[i]); 62 | 63 | CUDA_CHECK(cudaFree(dA)); 64 | CUDA_CHECK(cudaFree(dB)); 65 | CUDA_CHECK(cudaFree(dC)); 66 | free(x); 67 | return 0; 68 | } 69 | -------------------------------------------------------------------------------- /hw6/src/1.3/error_check_1.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples.
3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/1.3/error_checks.h: -------------------------------------------------------------------------------- 1 | // This header provides two helper macros for error checking 2 | // See the exercise skeletons and answers for usage examples. 3 | 4 | #ifndef COURSE_UTIL_H_ 5 | #define COURSE_UTIL_H_ 6 | 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__) 11 | #define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__) 12 | 13 | inline void __checkErrorFunc(cudaError_t errarg, const char* file, 14 | const int line) 15 | { 16 | if(errarg) { 17 | fprintf(stderr, "Error at %s(%i)\n", file, line); 18 | exit(EXIT_FAILURE); 19 | } 20 | } 21 | 22 | 23 | inline void __checkErrMsgFunc(const char* errstr, const char* file, 24 | const int line) 25 | { 26 | cudaError_t err = cudaGetLastError(); 27 | if(err != cudaSuccess) { 28 | fprintf(stderr, "Error: %s at %s(%i): %s\n", 29 | errstr, file, line, cudaGetErrorString(err)); 30 | exit(EXIT_FAILURE); 31 | } 32 | } 33 | 34 | #endif -------------------------------------------------------------------------------- /hw6/src/2/jacobi.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <sys/time.h> 3 | #include "jacobi.h" 4 | #include "error_checks.h" 5 | 6 | // Change this to 0 if CPU reference result is not needed 7 | #define COMPUTE_CPU_REFERENCE 1 8 | #define MAX_ITERATIONS 3000 9 | 10 | // CPU kernel 11 | void sweepCPU(double* phi, const double *phiPrev, const double *source, 12 | double h2, int N) 13 | { 14 | int i, j; 15 | int index, i1, i2, i3, i4; 16 | 17 | for (j = 1; j < N-1; j++) { 18 | for (i = 1; i < N-1; i++) { 19 | index = i + j*N; 20 | i1 = (i-1) + j * N; 21 | i2 = (i+1) + j * N; 22 | i3 = i + (j-1) * N; 23 | i4 = i + (j+1) * N; 24 | phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + 25 | phiPrev[i3] + phiPrev[i4] - 26 | h2 * source[index]); 27 | } 28 | } 29 | } 30 | 31 | // GPU kernel 32 | __global__ void sweepGPU(double *phi, const double *phiPrev, const double *source, 33 | double h2, int N) 34 | { 35 | //#error Add here the GPU version of the update routine (see sweepCPU above) 36 | int i, j; 37 | int index, i1, i2, i3, i4; 38 | 39 | i = blockIdx.x * blockDim.x + threadIdx.x; 40 | j = blockIdx.y * blockDim.y + threadIdx.y; 41 | 42 | if(i < N - 1 && j < N - 1 && i > 0 && j > 0) { 43 | index = i + j*N; 44 | i1 = (i-1) + j * N; 45 | i2 = (i+1) + j * N; 46 | i3 = i + (j-1) * N; 47 | i4 = i + (j+1) * N; 48 | phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + 49 | phiPrev[i3] + phiPrev[i4] - 50 | h2 * source[index]); 51 | } 52 | } 53 | 54 | 55 | double
compareArrays(const double *a, const double *b, int N) 56 | { 57 | double error = 0.0; 58 | int i; 59 | for (i = 0; i < N*N; i++) { 60 | error += fabs(a[i] - b[i]); 61 | } 62 | return error/(N*N); 63 | } 64 | 65 | 66 | double diffCPU(const double *phi, const double *phiPrev, int N) 67 | { 68 | int i; 69 | double sum = 0; 70 | double diffsum = 0; 71 | 72 | for (i = 0; i < N*N; i++) { 73 | diffsum += (phi[i] - phiPrev[i]) * (phi[i] - phiPrev[i]); 74 | sum += phi[i] * phi[i]; 75 | } 76 | 77 | return sqrt(diffsum/sum); 78 | } 79 | 80 | 81 | int main() 82 | { 83 | timeval t1, t2; // Structs for timing 84 | const int N = 512; 85 | double h = 1.0 / (N - 1); 86 | int iterations; 87 | const double tolerance = 5e-4; // Stopping condition 88 | int i, j, index; 89 | 90 | const int blocksize = 16; 91 | 92 | double *phi = new double[N*N]; 93 | double *phiPrev = new double[N*N]; 94 | double *source = new double[N*N]; 95 | double *phi_cuda = new double[N*N]; 96 | 97 | double *phi_d, *phiPrev_d, *source_d; 98 | // Size of the arrays in bytes 99 | const int size = N*N*sizeof(double); 100 | double diff; 101 | 102 | // Source initialization 103 | for (i = 0; i < N; i++) { 104 | for (j = 0; j < N; j++) { 105 | double x, y; 106 | x = (i - N / 2) * h; 107 | y = (j - N / 2) * h; 108 | index = j + i * N; 109 | if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1) 110 | source[index] = 1e10*h*h; 111 | else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1) 112 | source[index] = -1e10*h*h; 113 | else 114 | source[index] = 0.0; 115 | } 116 | } 117 | 118 | CUDA_CHECK( cudaMalloc( (void**)&source_d, size) ); 119 | CUDA_CHECK( cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice) ); 120 | 121 | // Reset values to zero 122 | for (i = 0; i < N; i++) { 123 | for (j = 0; j < N; j++) { 124 | index = j + i * N; 125 | phi[index] = 0.0; 126 | phiPrev[index] = 0.0; 127 | } 128 | } 129 | 130 | CUDA_CHECK( cudaMalloc( (void**)&phi_d, size) ); 131 | CUDA_CHECK( cudaMalloc( (void**)&phiPrev_d, size) ); 132 | CUDA_CHECK( cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice) ); 133 | CUDA_CHECK( cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice) ); 134 | 135 | // CPU version 136 | if(COMPUTE_CPU_REFERENCE) { 137 | gettimeofday(&t1, NULL); 138 | 139 | // Do sweeps until difference is under the tolerance 140 | diff = tolerance * 2; 141 | iterations = 0; 142 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 143 | sweepCPU(phiPrev, phi, source, h * h, N); 144 | sweepCPU(phi, phiPrev, source, h * h, N); 145 | 146 | iterations += 2; 147 | if (iterations % 100 == 0) { 148 | diff = diffCPU(phi, phiPrev, N); 149 | printf("%d %g\n", iterations, diff); 150 | } 151 | } 152 | gettimeofday(&t2, NULL); 153 | printf("CPU Jacobi: %g seconds, %d iterations\n", 154 | t2.tv_sec - t1.tv_sec + 155 | (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 156 | } 157 | 158 | // GPU version 159 | 160 | dim3 dimBlock(blocksize, blocksize); 161 | dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize); 162 | 163 | //do sweeps until diff under tolerance 164 | diff = tolerance * 2; 165 | iterations = 0; 166 | 167 | gettimeofday(&t1, NULL); 168 | 169 | while (diff > tolerance && iterations < MAX_ITERATIONS) { 170 | // See above how the CPU update kernel is called 171 | // and implement similar calling sequence for the GPU code 172 | 173 | //// Add routines here 174 | //#error Add GPU kernel calls here (see CPU version above) 175 | sweepGPU<<<dimGrid, dimBlock>>>(phiPrev_d, phi_d, source_d, h*h, N); 176 | sweepGPU<<<dimGrid, dimBlock>>>(phi_d, phiPrev_d, source_d,
h*h, N); 177 | 178 | iterations += 2; 179 | 180 | if (iterations % 100 == 0) { 181 | // diffGPU is defined in the header file; it uses 182 | // the Thrust library for the reduction computation 183 | diff = diffGPU(phiPrev_d, phi_d, N); 184 | CHECK_ERROR_MSG("Difference computation"); 185 | printf("%d %g\n", iterations, diff); 186 | } 187 | } 188 | 189 | //// Add here the routine to copy back the results 190 | //#error Copy back the results 191 | 192 | CUDA_CHECK( cudaMemcpy(phi_cuda, phi_d, sizeof(double)*N*N, cudaMemcpyDeviceToHost)); 193 | CUDA_CHECK( cudaMemcpy(phiPrev, phiPrev_d, sizeof(double)*N*N,cudaMemcpyDeviceToHost)); 194 | 195 | gettimeofday(&t2, NULL); 196 | printf("GPU Jacobi: %g seconds, %d iterations\n", 197 | t2.tv_sec - t1.tv_sec + 198 | (t2.tv_usec - t1.tv_usec) / 1.0e6, iterations); 199 | 200 | //// Add here the clean up code for all allocated CUDA resources 201 | //#error Add here the clean up code 202 | 203 | CUDA_CHECK( cudaFree(phi_d)); 204 | CUDA_CHECK( cudaFree(phiPrev_d)); 205 | CUDA_CHECK( cudaFree(source_d)); 206 | 207 | if (COMPUTE_CPU_REFERENCE) { 208 | printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N)); 209 | } 210 | 211 | delete[] phi; 212 | delete[] phi_cuda; 213 | delete[] phiPrev; 214 | delete[] source; 215 | 216 | return EXIT_SUCCESS; 217 | } 218 | -------------------------------------------------------------------------------- /hw6/src/2/jacobi.h: -------------------------------------------------------------------------------- 1 | #ifndef EX3_H_ 2 | #define EX3_H_ 3 | 4 | #include <cmath> 5 | #include <thrust/device_ptr.h> 6 | #include <thrust/iterator/zip_iterator.h> 7 | #include <thrust/transform_reduce.h> 8 | #include <thrust/functional.h> 9 | // Helper function prototypes 10 | double compareArrays(const double *a, const double *b, int N); 11 | double diffCPU(const double *a, const double *b, int N); 12 | void sweepCPU(double *phi, const double *phiPrev, 13 | const double *source, double h2, int N); 14 | 15 | 16 | /* ------------------------------------------------------------------------- 17 | EXTRACURRICULAR ACTIVITIES 18 | 19 | This part provides the reduction operation (in this case the summation of 20 | the difference of two arrays) using the Thrust library. Thrust mimics the 21 | syntax and design of the C++ standard template library (STL). Thrust is 22 | also part of the CUDA 4 SDK.
23 | More information can be found on the Thrust home page: 24 | http://code.google.com/p/thrust/ 25 | ----------------------------------------------------------------------- */ 26 | 27 | template <typename T> 28 | class square_diff_thr : public thrust::unary_function<thrust::tuple<T, T>, T> 29 | { 30 | public: 31 | __host__ __device__ 32 | T operator()(const thrust::tuple<T, T>& x) const { 33 | return (thrust::get<1>(x) - thrust::get<0>(x)) * 34 | (thrust::get<1>(x) - thrust::get<0>(x)); 35 | } 36 | }; 37 | 38 | template <typename T> 39 | class square_thr : public thrust::unary_function<T, T> 40 | { 41 | public: 42 | __host__ __device__ 43 | T operator()(const T& x) const { 44 | return x*x; 45 | } 46 | }; 47 | 48 | template <typename T> 49 | T diffGPU(T *A_d, T *B_d, int N) 50 | { 51 | typedef thrust::device_ptr<T> FloatIterator; 52 | typedef thrust::tuple<FloatIterator, FloatIterator> IteratorTuple; 53 | typedef thrust::zip_iterator<IteratorTuple> ZipIterator; 54 | 55 | thrust::device_ptr<T> A_ptr(A_d); 56 | thrust::device_ptr<T> B_ptr(B_d); 57 | 58 | ZipIterator first = 59 | thrust::make_zip_iterator(thrust::make_tuple(A_ptr, B_ptr)); 60 | ZipIterator last = 61 | thrust::make_zip_iterator(thrust::make_tuple(A_ptr + N*N, 62 | B_ptr + N*N)); 63 | 64 | T a1 = thrust::transform_reduce(first, last, square_diff_thr<T>(), 65 | static_cast<T>(0), thrust::plus<T>()); 66 | T a2 = thrust::transform_reduce(B_ptr, B_ptr + N*N, 67 | square_thr<T>(), static_cast<T>(0), 68 | thrust::plus<T>()); 69 | 70 | return sqrt(a1/a2); 71 | } 72 | 73 | 74 | #endif // EX3_H_ --------------------------------------------------------------------------------